#define MAX(A,B) ((A) > (B) ? (A) : (B))

#ifndef MEM_OPT_VERSION
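// Build per-lane exclusion flag lists on the fly for the 16 i atoms of a
// tile list; the results are cached until a different tile list is
// requested.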
const char * AVXTileLists::buildExclFlyList(const int itileList, const int z,
                                            const __m512i &atomIndex_i,
                                            const int n, void *molIn) {
  if (itileList == _lastFlyListTile) return _exclFlyLists[z];
  // The molecule object is passed as void* to avoid header dependencies.
  Molecule *mol = (Molecule *)molIn;
  if (!_exclFlyListBuffer) {
    _exclFlyListBuffer = new char[mol->numAtoms * 16];
    for (int i = 0; i < 16; i++)
      _exclFlyLists[i] = _exclFlyListBuffer + mol->numAtoms * i;
  }
  if (_lastFlyListTile == -1)
    memset((void*)_exclFlyListBuffer, 0, mol->numAtoms * 16);
  for (int i = 0; i < 16; i++) {
    char *exclFlyList = _exclFlyLists[i];
    const int32 *& fullExcl = _fullExcl[i];
    const int32 *& modExcl = _modExcl[i];
    // Clear flags set for the previously cached tile list.
    if (_lastFlyListTile != -1) {
      int nl = fullExcl[0] + 1;
      for (int l=1; l<nl; ++l) exclFlyList[fullExcl[l]] = 0;
      nl = modExcl[0] + 1;
      for (int l=1; l<nl; ++l) exclFlyList[modExcl[l]] = 0;
    }
    // Set flags for this tile's atoms.
    if (i < n) {
      const int id = *((int*)&(atomIndex_i) + i);
      fullExcl = mol->get_full_exclusions_for_atom(id);
      int nl = fullExcl[0] + 1;
      for (int l=1; l<nl; ++l)
        exclFlyList[fullExcl[l]] = EXCHCK_FULL;
      modExcl = mol->get_mod_exclusions_for_atom(id);
      nl = modExcl[0] + 1;
      for (int l=1; l<nl; ++l)
        exclFlyList[modExcl[l]] = EXCHCK_MOD;
    }
  }
  _lastFlyListTile = itileList;
  return _exclFlyLists[z];
}
#endif
template <bool doEnergy, bool doVirial, bool doSlow, bool doList,
          int interpMode>
void AVXTileLists::nbAVX512Tiles(__m512 &energyVdw, __m512 &energyElec,
                                 __m512 &energySlow, __m512 &fNet_x,
                                 __m512 &fNet_y, __m512 &fNet_z,
                                 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
                                 __m512 &fNetSlow_z) {
#ifndef MEM_OPT_VERSION
  _lastFlyListTile = -1;
#endif
  // Molecule data for exclusion lookups (assumed; the declaration is
  // elided in this excerpt).
  Molecule *mol = Node::Object()->molecule;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forces_i = tiles_p0->forces;
  AVXTiles::AVXTilesForce* __restrict__ forces_j = tiles_p1->forces;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j;
  if (doSlow) {
    forcesSlow_i = tiles_p0->forcesSlow;
    forcesSlow_j = tiles_p1->forcesSlow;
  }
  const float* __restrict__ fastTable;
  const float* __restrict__ energyTable;
  const float * __restrict__ ljTable;
  const float * __restrict__ eps4sigma;
  if (interpMode == 3) {
    fastTable = _paramFastTable;
    energyTable = _paramFastEnergyTable;
  }
  if (interpMode > 1) ljTable = _paramLjTable;
  if (interpMode == 1) eps4sigma = _paramEps4Sigma;
  const float* __restrict__ slowTable = _paramSlowTable;
  const float* __restrict__ slowEnergyTable = _paramSlowEnergyTable;
#ifdef TILE_LIST_STAT_DEBUG
  int num_jtiles = 0;
  int num_jtiles_empty = 0;
  int num_rotates = 0;
  int num_rotates_empty = 0;
  int num_rotates_excl_empty = 0;
#endif

  int numModified, numExcluded;
  if (doList) {
    numModified = 0;
    numExcluded = 0;
  }
  // A "self" interaction has no periodic shift and identical tiles on both
  // sides; the tile diagonal is then handled specially.
  const bool zeroShift = ! (_shx*_shx + _shy*_shy + _shz*_shz > 0) &&
    (tiles_p0 == tiles_p1);

  // Local views of the tile-list data (names assumed; the originals are
  // class members elided in this excerpt).
  const int numTileLists = _numLists;
  auto * __restrict__ lists = _lists;
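  // Outer loop: one iteration per tile list, i.e. per 16-atom i-tile with
  // its range of paired j-tiles.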
  for (int itileList = 0; itileList < numTileLists; itileList++) {
    bool doIpairList;
    unsigned int itileListLen;
    const int atomStart_i = lists[itileList].atomStart_i;
    const int jtileStart = lists[itileList].jtileStart;
    const int jtileEnd = lists[itileList + 1].jtileStart;
    if (doList) itileListLen = 0;
    int atomSize_i, atomFreeSize_i, atomSize_j, atomFreeSize_j;
    __mmask16 freeMask_i, sizeMask_i;
    __m512i atomIndex_i;
    float bbox_x, bbox_y, bbox_z, bbox_wx, bbox_wy, bbox_wz;
#ifdef MEM_OPT_VERSION
    __m512i atomExclIndex_i;
#endif
    __m512i exclIndex_i, exclMaxDiff_i;
    if (doList) {
      atomSize_i = tiles_p0->numAtoms();
      atomFreeSize_i = tiles_p0->numFreeAtoms();
      atomSize_j = tiles_p1->numAtoms();
      atomFreeSize_j = tiles_p1->numFreeAtoms();

      // Masks for the free (unfixed) and valid atoms of this i-tile.
      freeMask_i = 0xFFFF;
      const int nFree_i = MAX(atomFreeSize_i - atomStart_i, 0);
      if (nFree_i < 16)
        freeMask_i >>= 16 - nFree_i;
      sizeMask_i = 0xFFFF;
      const int check_i = atomSize_i - atomStart_i;
      if (check_i < 16)
        sizeMask_i >>= 16 - check_i;
      if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
        if (check_i <= NAMD_AVXTILES_PAIR_THRESHOLD) doIpairList = true;
        else doIpairList = false;
      }
    } else if (NAMD_AVXTILES_PAIR_THRESHOLD > 0)
      doIpairList = false;
    const int tileNum = atomStart_i >> 4;
    bbox_x = tiles_p0->bbox_x[tileNum] + _shx;
    bbox_y = tiles_p0->bbox_y[tileNum] + _shy;
    bbox_z = tiles_p0->bbox_z[tileNum] + _shz;
    bbox_wx = tiles_p0->bbox_wx[tileNum];
    bbox_wy = tiles_p0->bbox_wy[tileNum];
    bbox_wz = tiles_p0->bbox_wz[tileNum];
    atomIndex_i = _mm512_load_epi32(tiles_p0->atomIndex + atomStart_i);
#ifdef MEM_OPT_VERSION
    atomExclIndex_i = _mm512_load_epi32(tiles_p0->atomExclIndex +
                                        atomStart_i);
#endif
    exclIndex_i = _mm512_load_epi32(tiles_p0->exclIndexStart + atomStart_i);
    exclMaxDiff_i = _mm512_load_epi32(tiles_p0->exclIndexMaxDiff +
                                      atomStart_i);
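    // Transpose the i-tile's AoS atom data (x,y,z,q per atom) into SoA
    // vectors x_i/y_i/z_i/q_i, applying the periodic shift.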
    const float *iptr = (float *)(xyzq_i + atomStart_i);
    const __m512 t0 = _mm512_load_ps(iptr);
    const __m512 t1 = _mm512_load_ps(iptr+16);
    const __m512 t2 = _mm512_load_ps(iptr+32);
    const __m512 t3 = _mm512_load_ps(iptr+48);
    const __m512i t4 = _mm512_set_epi32(29,25,21,17,13,9,5,1,
                                        28,24,20,16,12,8,4,0);
    const __m512 t5 = _mm512_permutex2var_ps(t0, t4, t1);
    const __m512i t6 = _mm512_set_epi32(28,24,20,16,12,8,4,0,
                                        29,25,21,17,13,9,5,1);
    const __m512 t7 = _mm512_permutex2var_ps(t2, t6, t3);
    const __mmask16 t9 = 0xFF00;
    const __m512i t5i = _mm512_castps_si512(t5);
    const __m512i t7i = _mm512_castps_si512(t7);
    const __m512 x_i = _mm512_add_ps(_mm512_castsi512_ps(
      _mm512_mask_blend_epi32(t9, t5i, t7i)), _mm512_set1_ps(_shx));
    const __m512 y_i = _mm512_add_ps(_mm512_shuffle_f32x4(t5, t7, 0x4E),
                                     _mm512_set1_ps(_shy));
    const __m512i t12 = _mm512_set_epi32(31,27,23,19,15,11,7,3,
                                         30,26,22,18,14,10,6,2);
    const __m512 t13 = _mm512_permutex2var_ps(t0, t12, t1);
    const __m512i t14 = _mm512_set_epi32(30,26,22,18,14,10,6,2,
                                         31,27,23,19,15,11,7,3);
    const __m512 t15 = _mm512_permutex2var_ps(t2, t14, t3);
    const __m512i t13i = _mm512_castps_si512(t13);
    const __m512i t15i = _mm512_castps_si512(t15);
    const __m512 z_i = _mm512_add_ps(_mm512_castsi512_ps(
      _mm512_mask_blend_epi32(t9, t13i, t15i)), _mm512_set1_ps(_shz));
    const __m512 q_i = _mm512_castsi512_ps(_mm512_shuffle_i32x4(t13i, t15i,
                                                                0x4E));
    const __m512i type_i = _mm512_load_epi32(tiles_p0->vdwTypes+atomStart_i);
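    // For direct interpolation (interpMode 1), gather per-type eps4/sigma
    // pairs with 64-bit gathers; the permutes then separate the interleaved
    // eps4 (even) and sigma (odd) elements.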
    __m512 eps4_i, sigma_i;
    if (interpMode == 1) {
      const __m512 t0 = (__m512)_mm512_i32logather_pd(type_i, eps4sigma,
                                                      _MM_SCALE_8);
      const __m512i type_i2 = _mm512_shuffle_i32x4(type_i, type_i, 238);
      const __m512 t1 = (__m512)_mm512_i32logather_pd(type_i2, eps4sigma,
                                                      _MM_SCALE_8);
      const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                          15,13,11,9,7,5,3,1);
      sigma_i = _mm512_permutex2var_ps(t0, t4, t1);
      const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                          14,12,10,8,6,4,2,0);
      eps4_i = _mm512_permutex2var_ps(t0, t6, t1);
    }
    __m512 forceSlow_i_x, forceSlow_i_y, forceSlow_i_z;
    __m512 force_i_x = _mm512_setzero_ps();
    __m512 force_i_y = _mm512_setzero_ps();
    __m512 force_i_z = _mm512_setzero_ps();
    if (doSlow) {
      forceSlow_i_x = _mm512_setzero_ps();
      forceSlow_i_y = _mm512_setzero_ps();
      forceSlow_i_z = _mm512_setzero_ps();
    }
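    // Inner loop: one iteration per j-tile; j-side forces accumulate in
    // registers and are written back to memory after the rotation loop.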
    for (int jtile = jtileStart; jtile < jtileEnd; jtile++) {
#ifdef TILE_LIST_STAT_DEBUG
      num_jtiles++;
      bool jtile_empty = true;
#endif
      int atomStart_j = jTiles.atomStart[jtile];
      const bool self = zeroShift && (atomStart_i == atomStart_j);
      const int shiftVal = (self) ? 1 : 0;
      // Transpose the j-tile's AoS atom data into SoA vectors.
      const float * jptr = (float *)(xyzq_j + atomStart_j);
      const __m512 t0 = _mm512_load_ps(jptr);
      const __m512 t1 = _mm512_load_ps(jptr+16);
      const __m512 t2 = _mm512_load_ps(jptr+32);
      const __m512 t3 = _mm512_load_ps(jptr+48);
      const __m512i t4 = _mm512_set_epi32(29,25,21,17,13,9,5,1,
                                          28,24,20,16,12,8,4,0);
      const __m512 t5 = _mm512_permutex2var_ps(t0, t4, t1);
      const __m512i t6 = _mm512_set_epi32(28,24,20,16,12,8,4,0,
                                          29,25,21,17,13,9,5,1);
      const __m512 t7 = _mm512_permutex2var_ps(t2, t6, t3);
      const __mmask16 t9 = 0xFF00;
      const __m512i t5i = _mm512_castps_si512(t5);
      const __m512i t7i = _mm512_castps_si512(t7);
      __m512 x_j = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t5i, t7i));
      __m512 y_j = _mm512_shuffle_f32x4(t5, t7, 0x4E);
      const __m512i t12 = _mm512_set_epi32(31,27,23,19,15,11,7,3,
                                           30,26,22,18,14,10,6,2);
      const __m512 t13 = _mm512_permutex2var_ps(t0, t12, t1);
      const __m512i t14 = _mm512_set_epi32(30,26,22,18,14,10,6,2,
                                           31,27,23,19,15,11,7,3);
      const __m512 t15 = _mm512_permutex2var_ps(t2, t14, t3);
      const __m512i t13i = _mm512_castps_si512(t13);
      const __m512i t15i = _mm512_castps_si512(t15);
      __m512 z_j = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t13i,
                                                               t15i));
      __m512 q_j = _mm512_castsi512_ps(_mm512_shuffle_i32x4(t13i, t15i, 0x4E));
      __m512i excl, atomIndex_j;
      __mmask16 freeMask_j, sizeMask_j;
      bool jTileActive = false;

      if (doList) {
        // Distance of each j atom from the i-tile bounding box.
        const __m512 dx_one = _mm512_abs_ps(
          _mm512_sub_ps(_mm512_set1_ps(bbox_x), x_j));
        const __m512 dx_two = _mm512_set1_ps(bbox_wx);
        const __mmask16 lxmask = _mm512_cmplt_ps_mask(dx_two, dx_one);
        const __m512 dx = _mm512_mask_sub_ps(_mm512_setzero_ps(), lxmask,
                                             dx_one, dx_two);
        const __m512 dy_one = _mm512_abs_ps(
          _mm512_sub_ps(_mm512_set1_ps(bbox_y), y_j));
        const __m512 dy_two = _mm512_set1_ps(bbox_wy);
        const __mmask16 lymask = _mm512_cmplt_ps_mask(dy_two, dy_one);
        const __m512 dy = _mm512_mask_sub_ps(_mm512_setzero_ps(), lymask,
                                             dy_one, dy_two);
        const __m512 dz_one = _mm512_abs_ps(
          _mm512_sub_ps(_mm512_set1_ps(bbox_z), z_j));
        const __m512 dz_two = _mm512_set1_ps(bbox_wz);
        const __mmask16 lzmask = _mm512_cmplt_ps_mask(dz_two, dz_one);
        const __m512 dz = _mm512_mask_sub_ps(_mm512_setzero_ps(), lzmask,
                                             dz_one, dz_two);
        const __m512 r2bb = _mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                                            _mm512_mul_ps(dz, dz)));
        // Cull j-tiles with no atom within the pair-list cutoff (assumed:
        // culled tiles keep their cleared status).
        __mmask16 m = _mm512_cmple_ps_mask(r2bb, _mm512_set1_ps(_plcutoff2));
        if (!m) continue;
        atomIndex_j = _mm512_load_epi32(tiles_p1->atomIndex+atomStart_j);
        // New tile pair: exclusion bits accumulate from zero.
        excl = _mm512_setzero_epi32();

        freeMask_j = 0xFFFF;
        int nFree_j = MAX(atomFreeSize_j - atomStart_j, 0);
        if (nFree_j < 16)
          freeMask_j >>= 16 - nFree_j;
        sizeMask_j = 0xFFFF;
        const int check_j = atomSize_j - atomStart_j;
        if (check_j < 16)
          sizeMask_j >>= 16 - check_j;
        if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && doIpairList) {
          // Grow the per-lane pair lists if this tile could overflow them.
          int maxPairs = _numPairs[0];
          for (int z = 1; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++)
            if (_numPairs[z] > maxPairs) maxPairs = _numPairs[z];
          if (maxPairs + 16 > _maxPairs)
            reallocPairLists(0, 1.4 * _maxPairs);
        }
      } else
        // Reuse the exclusion bits stored during the list build.
        excl = _mm512_load_epi32(jTiles.excl + (jtile << 4));
      __m512i type_j = _mm512_load_epi32(tiles_p1->vdwTypes+atomStart_j);
      __m512 eps4_j, sigma_j;
      if (interpMode == 1) {
        const __m512 t0 = (__m512)_mm512_i32logather_pd(type_j, eps4sigma,
                                                        _MM_SCALE_8);
        const __m512i type_j2 = _mm512_shuffle_i32x4(type_j, type_j, 238);
        const __m512 t1 = (__m512)_mm512_i32logather_pd(type_j2, eps4sigma,
                                                        _MM_SCALE_8);
        const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                            15,13,11,9,7,5,3,1);
        sigma_j = _mm512_permutex2var_ps(t0, t4, t1);
        const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                            14,12,10,8,6,4,2,0);
        eps4_j = _mm512_permutex2var_ps(t0, t6, t1);
      }
      __m512 force_j_x = _mm512_setzero_ps();
      __m512 force_j_y = _mm512_setzero_ps();
      __m512 force_j_z = _mm512_setzero_ps();
      __m512 forceSlow_j_x, forceSlow_j_y, forceSlow_j_z;
      if (doSlow) {
        forceSlow_j_x = _mm512_setzero_ps();
        forceSlow_j_y = _mm512_setzero_ps();
        forceSlow_j_z = _mm512_setzero_ps();
      }
      // For self tiles skip the diagonal term: start at rotation 1 with the
      // j-side data pre-rotated by one lane.
      int t = (self) ? 1 : 0;
      __mmask16 mask_j;
      if (doList) mask_j = 0xFFFF;
      if (!doList && !self) excl = _mm512_slli_epi32(excl, 1);
      if (self) {
        x_j = (__m512)_mm512_alignr_epi32((__m512i)x_j, (__m512i)x_j, 1);
        y_j = (__m512)_mm512_alignr_epi32((__m512i)y_j, (__m512i)y_j, 1);
        z_j = (__m512)_mm512_alignr_epi32((__m512i)z_j, (__m512i)z_j, 1);
        q_j = (__m512)_mm512_alignr_epi32((__m512i)q_j, (__m512i)q_j, 1);
        freeMask_j = (freeMask_j>>1) | (freeMask_j<<15);
        sizeMask_j = (sizeMask_j>>1) | (sizeMask_j<<15);
        if (doList) mask_j >>= shiftVal;
        atomIndex_j = _mm512_alignr_epi32(atomIndex_j, atomIndex_j, 1);
        if (interpMode == 1) {
          eps4_j = (__m512)_mm512_alignr_epi32((__m512i)eps4_j,
                                               (__m512i)eps4_j, 1);
          sigma_j = (__m512)_mm512_alignr_epi32((__m512i)sigma_j,
                                                (__m512i)sigma_j, 1);
        }
        type_j = _mm512_alignr_epi32(type_j, type_j, 1);
      }
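      // 16 rotations pair every i lane with every j atom: at rotation t,
      // lane z meets j atom (z + t) & 15.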
      for (; t < 16; t++) {
        // Shift this rotation's exclusion bit into the low position.
        excl = _mm512_srli_epi32(excl, 1);
        __mmask16 freeMask_ij;
        if (doList)
          freeMask_ij = (freeMask_j | freeMask_i) & (sizeMask_j & sizeMask_i);
#ifdef TILE_LIST_STAT_DEBUG
        num_rotates++;
        __mmask16 r2mask_dbg = _mm512_cmpneq_epi32_mask(_mm512_and_epi32(excl,
          _mm512_set1_epi32(1)), _mm512_setzero_epi32());
        if (!r2mask_dbg) { num_rotates_empty++; num_rotates_excl_empty++; }
#endif
        __m512 dx, dy, dz, r2;
        if (!doList || freeMask_ij) {
          dx = _mm512_sub_ps(x_j, x_i);
          dy = _mm512_sub_ps(y_j, y_i);
          dz = _mm512_sub_ps(z_j, z_i);
          r2 = _mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                               _mm512_mul_ps(dz, dz)));
#ifdef TILE_LIST_STAT_DEBUG
          {
            __mmask16 hits = _mm512_cmple_ps_mask(r2,
              _mm512_set1_ps(_plcutoff2)) & r2mask_dbg;
            if (!hits) num_rotates_empty++;
            else jtile_empty = false;
          }
#endif
          __mmask16 r2mask;
          if (doList) {
            r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_plcutoff2)) &
              mask_j;
            r2mask &= freeMask_ij;
          } else {
            r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2));
            r2mask &= _mm512_cmpneq_epi32_mask(_mm512_and_epi32(excl,
              _mm512_set1_epi32(1)), _mm512_setzero_epi32());
          }
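          // Build-time classification: pairs whose global indices fall in
          // the i atom's exclusion range get a scalar exclusion-table check
          // and, when modified or excluded, are queued for the correction
          // kernels.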
          if (doList && r2mask) {
            if (_numModifiedAlloc - numModified < 16) {
              // Extrapolate the needed size from progress so far
              // (itileList+1 avoids dividing by zero on the first list).
              int newNum = static_cast<double>(numModified) *
                numTileLists / (itileList + 1);
              if (newNum < _numModifiedAlloc + 16)
                newNum = _numModifiedAlloc + 16;
              reallocModified(newNum);
            }
            if (_numExcludedAlloc - numExcluded < 16) {
              int newNum = static_cast<double>(numExcluded) *
                numTileLists / (itileList + 1);
              if (newNum < _numExcludedAlloc + 16)
                newNum = _numExcludedAlloc + 16;
              reallocExcluded(newNum);
            }
            __mmask16 excludedMask = _mm512_cmpge_epi32_mask(atomIndex_j,
                                                             exclIndex_i);
            excludedMask &= _mm512_cmple_epi32_mask(atomIndex_j,
                                                    exclMaxDiff_i);
            excludedMask &= r2mask;
            __mmask16 scalarPos = 1;
            for (int z = 0; z < 16; z++) {
              if (scalarPos & excludedMask) {
#ifdef MEM_OPT_VERSION
                const char *exclFlags =
                  mol->get_excl_check_for_idx(*((int*)&(atomExclIndex_i) +
                                              z))->flags;
#else
                const char *exclFlags =
                  mol->get_excl_check_for_atom(*((int*)&(atomIndex_i) +
                                               z))->flags;
#endif
#ifndef MEM_OPT_VERSION
                if (exclFlags)
#endif
                  exclFlags -= *((int*)&(exclIndex_i) + z);
#ifndef MEM_OPT_VERSION
                else
                  exclFlags = buildExclFlyList(itileList, z, atomIndex_i,
                                               atomSize_i - atomStart_i, mol);
#endif
                const int exclType = exclFlags[*((int*)&atomIndex_j + z)];
                if (exclType == 0)
                  excludedMask &= ~scalarPos;
                else if (exclType == 2) {
                  _modified_i[numModified] = atomStart_i + z;
                  int jpos = (z + t) & 15;
                  _modified_j[numModified] = atomStart_j + jpos;
                  if (*((float*)&r2 + z) <= _cutoff2)
                    numModified++;
                } else {
                  _excluded_i[numExcluded] = atomStart_i + z;
                  int jpos = (z + t) & 15;
                  _excluded_j[numExcluded] = atomStart_j + jpos;
                  if (*((float*)&r2 + z) <= _cutoff2)
                    numExcluded++;
                }
              }
              scalarPos <<= 1;
            }
            r2mask &= ~excludedMask;

            // Small i-tiles route surviving pairs to per-lane pair lists
            // (handled later by nbAVX512Pairs); other tiles record them in
            // the tile's exclusion bits. (Reconstructed control flow.)
            if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 &&
                doIpairList && r2mask) {
              for (int z = 0; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++) {
                if (r2mask & (1 << z)) {
                  const int jPos = atomStart_j + ((z+t) & 15);
                  _pairLists[_pairStart[z]+_numPairs[z]] = jPos;
                  _numPairs[z]++;
                }
              }
              r2mask = 0;
            } else {
              excl = _mm512_mask_or_epi32(excl, r2mask, excl,
                                          _mm512_set1_epi32(0x8000));
              if (r2mask) jTileActive = true;
              // Narrow the mask to the force cutoff for the force update.
              r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2)) &
                mask_j & freeMask_ij & ~excludedMask;
            }
          }
          const __m512 kqq = _mm512_mul_ps(q_i, q_j);
          __m512 force, forceSlow;
          if (interpMode == 1)
            forceEnergyInterp1<doEnergy, doSlow>(r2, kqq, force, forceSlow,
              energyVdw, energyElec, energySlow, r2mask, _paramC1,
              _paramC3, _paramSwitchOn2, _cutoff2, _paramMinvCut3,
              _paramCutUnder3, slowTable, slowEnergyTable, eps4_i, eps4_j,
              sigma_i, sigma_j);
          else
            forceEnergyInterp2<doEnergy, doSlow, interpMode>(r2, kqq, type_i,
              type_j, force, forceSlow, energyVdw, energyElec, energySlow,
              r2mask, _paramScale, _paramC1, _paramC3, _paramSwitchOn2,
              _cutoff2, _paramMinvCut3, _paramCutUnder3, fastTable,
              energyTable, slowTable, slowEnergyTable, ljTable,
              _paramLjWidth);
          force_i_x = _mm512_mask_mov_ps(force_i_x, r2mask,
            _mm512_fmadd_ps(dx, force, force_i_x));
          force_i_y = _mm512_mask_mov_ps(force_i_y, r2mask,
            _mm512_fmadd_ps(dy, force, force_i_y));
          force_i_z = _mm512_mask_mov_ps(force_i_z, r2mask,
            _mm512_fmadd_ps(dz, force, force_i_z));
          force_j_x = _mm512_mask_mov_ps(force_j_x, r2mask,
            _mm512_fnmadd_ps(dx, force, force_j_x));
          force_j_y = _mm512_mask_mov_ps(force_j_y, r2mask,
            _mm512_fnmadd_ps(dy, force, force_j_y));
          force_j_z = _mm512_mask_mov_ps(force_j_z, r2mask,
            _mm512_fnmadd_ps(dz, force, force_j_z));
          if (doSlow) {
            forceSlow_i_x = _mm512_mask_mov_ps(forceSlow_i_x, r2mask,
              _mm512_fmadd_ps(dx, forceSlow, forceSlow_i_x));
            forceSlow_i_y = _mm512_mask_mov_ps(forceSlow_i_y, r2mask,
              _mm512_fmadd_ps(dy, forceSlow, forceSlow_i_y));
            forceSlow_i_z = _mm512_mask_mov_ps(forceSlow_i_z, r2mask,
              _mm512_fmadd_ps(dz, forceSlow, forceSlow_i_z));
            forceSlow_j_x = _mm512_mask_mov_ps(forceSlow_j_x, r2mask,
              _mm512_fnmadd_ps(dx, forceSlow, forceSlow_j_x));
            forceSlow_j_y = _mm512_mask_mov_ps(forceSlow_j_y, r2mask,
              _mm512_fnmadd_ps(dy, forceSlow, forceSlow_j_y));
            forceSlow_j_z = _mm512_mask_mov_ps(forceSlow_j_z, r2mask,
              _mm512_fnmadd_ps(dz, forceSlow, forceSlow_j_z));
          }
        } // end if (!doList || freeMask_ij)
        // Rotate all j-side data one lane for the next pairing.
        x_j = (__m512)_mm512_alignr_epi32((__m512i)x_j, (__m512i)x_j, 1);
        y_j = (__m512)_mm512_alignr_epi32((__m512i)y_j, (__m512i)y_j, 1);
        z_j = (__m512)_mm512_alignr_epi32((__m512i)z_j, (__m512i)z_j, 1);
        q_j = (__m512)_mm512_alignr_epi32((__m512i)q_j, (__m512i)q_j, 1);
        freeMask_j = (freeMask_j>>1) | (freeMask_j<<15);
        sizeMask_j = (sizeMask_j>>1) | (sizeMask_j<<15);
        if (doList) mask_j >>= shiftVal;
        atomIndex_j = _mm512_alignr_epi32(atomIndex_j, atomIndex_j, 1);
        if (interpMode == 1) {
          eps4_j = (__m512)_mm512_alignr_epi32((__m512i)eps4_j,
                                               (__m512i)eps4_j, 1);
          sigma_j = (__m512)_mm512_alignr_epi32((__m512i)sigma_j,
                                                (__m512i)sigma_j, 1);
        }
        type_j = _mm512_alignr_epi32(type_j, type_j, 1);
        force_j_x = (__m512)_mm512_alignr_epi32((__m512i)force_j_x,
                                                (__m512i)force_j_x, 1);
        force_j_y = (__m512)_mm512_alignr_epi32((__m512i)force_j_y,
                                                (__m512i)force_j_y, 1);
        force_j_z = (__m512)_mm512_alignr_epi32((__m512i)force_j_z,
                                                (__m512i)force_j_z, 1);
        if (doSlow) {
          forceSlow_j_x = (__m512)_mm512_alignr_epi32((__m512i)forceSlow_j_x,
                                                      (__m512i)forceSlow_j_x,
                                                      1);
          forceSlow_j_y = (__m512)_mm512_alignr_epi32((__m512i)forceSlow_j_y,
                                                      (__m512i)forceSlow_j_y,
                                                      1);
          forceSlow_j_z = (__m512)_mm512_alignr_epi32((__m512i)forceSlow_j_z,
                                                      (__m512i)forceSlow_j_z,
                                                      1);
        }
      } // end rotation loop over t
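      // Transpose the accumulated j forces from SoA back to the AoS force
      // arrays and add them to memory; tp0x..tp3x assemble (x,y,z,pad)
      // rows for four atoms per store.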
      const __m512i tp0x = _mm512_set_epi32(0,19,11,3,0,18,10,2,
                                            0,17,9,1,0,16,8,0);
      const __m512i tp1x = _mm512_set_epi32(0,23,15,7,0,22,14,6,0,21,
                                            13,5,0,20,12,4);
      const __m512i tp2x = _mm512_set_epi32(0,27,11,3,0,26,10,2,
                                            0,25,9,1,0,24,8,0);
      const __m512i tp3x = _mm512_set_epi32(0,31,15,7,0,30,14,6,
                                            0,29,13,5,0,28,12,4);
      {
        float * jptr = (float *)(forces_j + atomStart_j);
        const __m512 v0 = _mm512_load_ps(jptr);
        const __m512 v1 = _mm512_load_ps(jptr + 16);
        const __m512 v2 = _mm512_load_ps(jptr + 32);
        const __m512 v3 = _mm512_load_ps(jptr + 48);
        const __m512 w1 = _mm512_shuffle_f32x4(force_j_x, force_j_y, 0x44);
        const __m512 w2 = _mm512_shuffle_f32x4(force_j_x, force_j_y, 0xEE);
        __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, force_j_z);
        __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, force_j_z);
        __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, force_j_z);
        __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, force_j_z);
        tp0 = _mm512_add_ps(v0, tp0);
        tp1 = _mm512_add_ps(v1, tp1);
        tp2 = _mm512_add_ps(v2, tp2);
        tp3 = _mm512_add_ps(v3, tp3);
        _mm512_store_ps(jptr, tp0);
        _mm512_store_ps(jptr + 16, tp1);
        _mm512_store_ps(jptr + 32, tp2);
        _mm512_store_ps(jptr + 48, tp3);
      }
      if (doSlow) {
        float * jptr = (float *)(forcesSlow_j + atomStart_j);
        const __m512 v0 = _mm512_load_ps(jptr);
        const __m512 v1 = _mm512_load_ps(jptr + 16);
        const __m512 v2 = _mm512_load_ps(jptr + 32);
        const __m512 v3 = _mm512_load_ps(jptr + 48);
        const __m512 w1 = _mm512_shuffle_f32x4(forceSlow_j_x, forceSlow_j_y,
                                               0x44);
        const __m512 w2 = _mm512_shuffle_f32x4(forceSlow_j_x, forceSlow_j_y,
                                               0xEE);
        __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, forceSlow_j_z);
        __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, forceSlow_j_z);
        __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, forceSlow_j_z);
        __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, forceSlow_j_z);
        tp0 = _mm512_add_ps(v0, tp0);
        tp1 = _mm512_add_ps(v1, tp1);
        tp2 = _mm512_add_ps(v2, tp2);
        tp3 = _mm512_add_ps(v3, tp3);
        _mm512_store_ps(jptr, tp0);
        _mm512_store_ps(jptr + 16, tp1);
        _mm512_store_ps(jptr + 32, tp2);
        _mm512_store_ps(jptr + 48, tp3);
      }
      if (doList) {
        // A tile stays in the list only if some rotation found pairs.
        int anyexcl = 0;
        if (_mm512_cmpneq_epi32_mask(excl, _mm512_setzero_epi32()))
          anyexcl = 1;
        jTiles.status[jtile] = anyexcl;
        _mm512_store_epi32((void *)(jTiles.excl + (jtile << 4)), excl);
        itileListLen += anyexcl;
      }
#ifdef TILE_LIST_STAT_DEBUG
      if (jtile_empty) num_jtiles_empty++;
#endif
    } // end loop over j-tiles
    // Add the accumulated i-tile forces back to memory (SoA to AoS).
    const __m512i tp0x = _mm512_set_epi32(0,19,11,3,0,18,10,2,
                                          0,17,9,1,0,16,8,0);
    const __m512i tp1x = _mm512_set_epi32(0,23,15,7,0,22,14,6,0,21,
                                          13,5,0,20,12,4);
    const __m512i tp2x = _mm512_set_epi32(0,27,11,3,0,26,10,2,
                                          0,25,9,1,0,24,8,0);
    const __m512i tp3x = _mm512_set_epi32(0,31,15,7,0,30,14,6,
                                          0,29,13,5,0,28,12,4);
    {
      float * iptr = (float *)(forces_i + atomStart_i);
      const __m512 v0 = _mm512_load_ps(iptr);
      const __m512 v1 = _mm512_load_ps(iptr + 16);
      const __m512 v2 = _mm512_load_ps(iptr + 32);
      const __m512 v3 = _mm512_load_ps(iptr + 48);
      const __m512 w1 = _mm512_shuffle_f32x4(force_i_x, force_i_y, 0x44);
      const __m512 w2 = _mm512_shuffle_f32x4(force_i_x, force_i_y, 0xEE);
      __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, force_i_z);
      __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, force_i_z);
      __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, force_i_z);
      __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, force_i_z);
      tp0 = _mm512_add_ps(v0, tp0);
      tp1 = _mm512_add_ps(v1, tp1);
      tp2 = _mm512_add_ps(v2, tp2);
      tp3 = _mm512_add_ps(v3, tp3);
      _mm512_store_ps(iptr, tp0);
      _mm512_store_ps(iptr + 16, tp1);
      _mm512_store_ps(iptr + 32, tp2);
      _mm512_store_ps(iptr + 48, tp3);
    }
    if (doSlow) {
      float * iptr = (float *)(forcesSlow_i + atomStart_i);
      const __m512 v0 = _mm512_load_ps(iptr);
      const __m512 v1 = _mm512_load_ps(iptr + 16);
      const __m512 v2 = _mm512_load_ps(iptr + 32);
      const __m512 v3 = _mm512_load_ps(iptr + 48);
      const __m512 w1 = _mm512_shuffle_f32x4(forceSlow_i_x, forceSlow_i_y,
                                             0x44);
      const __m512 w2 = _mm512_shuffle_f32x4(forceSlow_i_x, forceSlow_i_y,
                                             0xEE);
      __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, forceSlow_i_z);
      __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, forceSlow_i_z);
      __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, forceSlow_i_z);
      __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, forceSlow_i_z);
      tp0 = _mm512_add_ps(v0, tp0);
      tp1 = _mm512_add_ps(v1, tp1);
      tp2 = _mm512_add_ps(v2, tp2);
      tp3 = _mm512_add_ps(v3, tp3);
      _mm512_store_ps(iptr, tp0);
      _mm512_store_ps(iptr + 16, tp1);
      _mm512_store_ps(iptr + 32, tp2);
      _mm512_store_ps(iptr + 48, tp3);
    }
    if (doVirial) {
      fNet_x = _mm512_add_ps(fNet_x, force_i_x);
      fNet_y = _mm512_add_ps(fNet_y, force_i_y);
      fNet_z = _mm512_add_ps(fNet_z, force_i_z);
      if (doSlow) {
        fNetSlow_x = _mm512_add_ps(fNetSlow_x, forceSlow_i_x);
        fNetSlow_y = _mm512_add_ps(fNetSlow_y, forceSlow_i_y);
        fNetSlow_z = _mm512_add_ps(fNetSlow_z, forceSlow_i_z);
      }
    }
  } // end loop over tile lists
  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && doList) {
    // Count the lanes that collected at least one pair.
    _numPairLists = 0;
    for (int z = 0; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++)
      if (_numPairs[z]) _numPairLists++;
  }
  if (doList) {
    _numModified = numModified;
    _numExcluded = numExcluded;
    _exclusionChecksum = numModified + numExcluded;
  }
#ifdef TILE_LIST_STAT_DEBUG
  printf("TILE_DBG: JTILES %d EMPTY %.2f ROTATES %d EMPTY %.2f EXCL %.2f\n",
         num_jtiles, double(num_jtiles_empty)/num_jtiles, num_rotates,
         double(num_rotates_empty)/num_rotates,
         double(num_rotates_excl_empty)/num_rotates);
#endif
}
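// Kernel for i-tiles with at most NAMD_AVXTILES_PAIR_THRESHOLD atoms: each
// such atom interacts with an explicit list of j atoms instead of whole
// tiles.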
template <bool doEnergy, bool doVirial, bool doSlow, int interpMode>
void AVXTileLists::nbAVX512Pairs(__m512 &energyVdw, __m512 &energyElec,
                                 __m512 &energySlow, __m512 &fNet_x,
                                 __m512 &fNet_y, __m512 &fNet_z,
                                 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
                                 __m512 &fNetSlow_z) {
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forces_i = tiles_p0->forces;
  AVXTiles::AVXTilesForce* __restrict__ forces_j = tiles_p1->forces;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j;
  if (doSlow) {
    forcesSlow_i = tiles_p0->forcesSlow;
    forcesSlow_j = tiles_p1->forcesSlow;
  }

  const float* __restrict__ fastTable;
  const float* __restrict__ energyTable;
  const float * __restrict__ ljTable;
  const float * __restrict__ eps4sigma;
  if (interpMode == 3) {
    fastTable = _paramFastTable;
    energyTable = _paramFastEnergyTable;
  }
  if (interpMode > 1) ljTable = _paramLjTable;
  if (interpMode == 1) eps4sigma = _paramEps4Sigma;
  const float* __restrict__ slowTable = _paramSlowTable;
  const float* __restrict__ slowEnergyTable = _paramSlowEnergyTable;
  for (int z = 0; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++) {
    if (!_numPairs[z]) continue;

    __m512 force_i_x = _mm512_setzero_ps();
    __m512 force_i_y = _mm512_setzero_ps();
    __m512 force_i_z = _mm512_setzero_ps();
    __m512 forceSlow_i_x, forceSlow_i_y, forceSlow_i_z;
    if (doSlow) {
      forceSlow_i_x = _mm512_setzero_ps();
      forceSlow_i_y = _mm512_setzero_ps();
      forceSlow_i_z = _mm512_setzero_ps();
    }

    const int pairI = _pair_i[z];
    const int nPairs = _numPairs[z];
    // Broadcast the i atom's data to all lanes.
    const __m512 x_i = _mm512_set1_ps(xyzq_i[pairI].x + _shx);
    const __m512 y_i = _mm512_set1_ps(xyzq_i[pairI].y + _shy);
    const __m512 z_i = _mm512_set1_ps(xyzq_i[pairI].z + _shz);
    const __m512 q_i = _mm512_set1_ps(xyzq_i[pairI].q);
    const int scalarType_i = tiles_p0->vdwTypes[pairI];
    const __m512i type_i = _mm512_set1_epi32(scalarType_i);
    __m512 eps4_i, sigma_i;
    if (interpMode == 1) {
      eps4_i = _mm512_set1_ps(eps4sigma[scalarType_i*2]);
      sigma_i = _mm512_set1_ps(eps4sigma[scalarType_i*2+1]);
    }
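    // Process the pair list in chunks of 16 j atoms using masked gathers.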
    __mmask16 loopMask = 0xFFFF;
    int listPos = _pairStart[z];
    for (int mv = 0; mv < nPairs; mv += 16) {
      // On the final pass, mask off lanes beyond the end of the list.
      if (nPairs - mv < 16)
        loopMask >>= (16 - (nPairs - mv));
      __m512i j = _mm512_load_epi32(_pairLists + listPos);
      listPos += 16;

      // Gather j coordinates/charges: each 64-bit gather pulls an (x,y)
      // or (z,q) pair for 8 atoms.
      const __m512i jt2 = _mm512_slli_epi32(j, 1);
      const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask, jt2, (float *)xyzq_j, _MM_SCALE_8);
      const __m512 t8 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask, jt2, (float *)(xyzq_j) + 2,
        _MM_SCALE_8);
      const __m512i jt2_2 = _mm512_shuffle_i32x4(jt2, jt2, 238);
      const __mmask16 loopMask2 = loopMask >> 8;
      const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask2, jt2_2, (float *)xyzq_j, _MM_SCALE_8);
      const __m512 t9 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask2, jt2_2, (float *)(xyzq_j) + 2,
        _MM_SCALE_8);
      const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                          15,13,11,9,7,5,3,1);
      const __m512 y_j = _mm512_permutex2var_ps(t0, t4, t1);
      const __m512 q_j = _mm512_permutex2var_ps(t8, t4, t9);
      const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                          14,12,10,8,6,4,2,0);
      const __m512 x_j = _mm512_permutex2var_ps(t0, t6, t1);
      const __m512 z_j = _mm512_permutex2var_ps(t8, t6, t9);
      const __m512 kqq = _mm512_mul_ps(q_i, q_j);

      const __m512 dx = _mm512_sub_ps(x_i, x_j);
      const __m512 dy = _mm512_sub_ps(y_i, y_j);
      const __m512 dz = _mm512_sub_ps(z_i, z_j);
      const __m512 r2(_mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                      _mm512_mul_ps(dz, dz))));
      const __mmask16 r2mask = _mm512_cmple_ps_mask(r2,
        _mm512_set1_ps(_cutoff2)) & loopMask;

      const __m512i type_j = _mm512_mask_i32gather_epi32(
        _mm512_undefined_epi32(), r2mask, j, tiles_p1->vdwTypes, _MM_SCALE_4);
      __m512 eps4_j, sigma_j;
      if (interpMode == 1) {
        const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
          _mm512_undefined_pd(), r2mask, type_j, eps4sigma, _MM_SCALE_8);
        const __m512i type_j2 = _mm512_shuffle_i32x4(type_j, type_j, 238);
        const __mmask16 r2mask2 = r2mask >> 8;
        const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
          _mm512_undefined_pd(), r2mask2, type_j2, eps4sigma, _MM_SCALE_8);
        sigma_j = _mm512_permutex2var_ps(t0, t4, t1);
        eps4_j = _mm512_permutex2var_ps(t0, t6, t1);
      }
      __m512 force, forceSlow;
      if (interpMode == 1)
        forceEnergyInterp1<doEnergy, doSlow>(r2, kqq, force, forceSlow,
          energyVdw, energyElec, energySlow, r2mask, _paramC1, _paramC3,
          _paramSwitchOn2, _cutoff2, _paramMinvCut3, _paramCutUnder3,
          slowTable, slowEnergyTable, eps4_i, eps4_j, sigma_i, sigma_j);
      else
        forceEnergyInterp2<doEnergy, doSlow, interpMode>(r2, kqq, type_i,
          type_j, force, forceSlow, energyVdw, energyElec, energySlow,
          r2mask, _paramScale, _paramC1, _paramC3, _paramSwitchOn2,
          _cutoff2, _paramMinvCut3, _paramCutUnder3, fastTable, energyTable,
          slowTable, slowEnergyTable, ljTable, _paramLjWidth);
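      // Scatter-add the j-side forces: x/y as interleaved 64-bit pairs, z
      // via a separate 32-bit gather/scatter; the i-side contributions
      // accumulate in registers with opposite sign.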
      const __m512 force_x = _mm512_mul_ps(dx, force);
      const __m512 force_y = _mm512_mul_ps(dy, force);
      const __m512 force_z = _mm512_mul_ps(dz, force);

      const __m512i j4 = _mm512_slli_epi32(j, 2);
      __m512 ft0 = (__m512)_mm512_mask_i32logather_pd(_mm512_undefined_pd(),
        r2mask, j4, (float*)forces_j, _MM_SCALE_4);
      const __m512i j4_2 = _mm512_shuffle_i32x4(j4, j4, 238);
      const __mmask16 r2mask2 = r2mask >> 8;
      __m512 ft1 = (__m512)_mm512_mask_i32logather_pd(_mm512_undefined_pd(),
        r2mask2, j4_2, (float *)forces_j, _MM_SCALE_4);
      const __m512i ft4 = _mm512_set_epi32(23,7,22,6,21,5,20,4,
                                           19,3,18,2,17,1,16,0);
      const __m512 ft5 = _mm512_permutex2var_ps(force_x, ft4, force_y);
      ft0 = _mm512_add_ps(ft0, ft5);
      _mm512_mask_i32loscatter_pd((void *)forces_j, r2mask, j4, (__m512d)ft0,
                                  _MM_SCALE_4);
      const __m512i ft2 = _mm512_set_epi32(31,15,30,14,29,13,28,12,
                                           27,11,26,10,25,9,24,8);
      const __m512 ft3 = _mm512_permutex2var_ps(force_x, ft2, force_y);
      ft1 = _mm512_add_ps(ft1, ft3);
      _mm512_mask_i32loscatter_pd((void *)forces_j, r2mask2, j4_2,
                                  (__m512d)ft1, _MM_SCALE_4);
      __m512 mem3 = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), r2mask,
                                             j4, (float *)(forces_j)+2,
                                             _MM_SCALE_4);
      mem3 = _mm512_add_ps(mem3, force_z);
      _mm512_mask_i32scatter_ps((float *)(forces_j)+2, r2mask, j4, mem3,
                                _MM_SCALE_4);
      force_i_x = _mm512_mask_sub_ps(force_i_x, r2mask, force_i_x, force_x);
      force_i_y = _mm512_mask_sub_ps(force_i_y, r2mask, force_i_y, force_y);
      force_i_z = _mm512_mask_sub_ps(force_i_z, r2mask, force_i_z, force_z);
      __m512 forceSlow_x, forceSlow_y, forceSlow_z;
      if (doSlow) {
        forceSlow_x = _mm512_mul_ps(dx, forceSlow);
        forceSlow_y = _mm512_mul_ps(dy, forceSlow);
        forceSlow_z = _mm512_mul_ps(dz, forceSlow);

        __m512 ft10 = (__m512)_mm512_mask_i32logather_pd(_mm512_undefined_pd(),
          r2mask, j4, (float*)forcesSlow_j, _MM_SCALE_4);
        __m512 ft11 = (__m512)_mm512_mask_i32logather_pd(_mm512_undefined_pd(),
          r2mask2, j4_2, (float *)forcesSlow_j, _MM_SCALE_4);
        const __m512 ft15 = _mm512_permutex2var_ps(forceSlow_x, ft4,
                                                   forceSlow_y);
        ft10 = _mm512_add_ps(ft10, ft15);
        _mm512_mask_i32loscatter_pd((void *)forcesSlow_j, r2mask, j4,
                                    (__m512d)ft10, _MM_SCALE_4);
        const __m512 ft13 = _mm512_permutex2var_ps(forceSlow_x, ft2,
                                                   forceSlow_y);
        ft11 = _mm512_add_ps(ft11, ft13);
        _mm512_mask_i32loscatter_pd((void *)forcesSlow_j, r2mask2, j4_2,
                                    (__m512d)ft11, _MM_SCALE_4);
        __m512 mem13 = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), r2mask,
                                                j4, (float *)(forcesSlow_j)+2,
                                                _MM_SCALE_4);
        mem13 = _mm512_add_ps(mem13, forceSlow_z);
        _mm512_mask_i32scatter_ps((float *)(forcesSlow_j)+2, r2mask, j4,
                                  mem13, _MM_SCALE_4);

        forceSlow_i_x = _mm512_mask_sub_ps(forceSlow_i_x, r2mask,
                                           forceSlow_i_x, forceSlow_x);
        forceSlow_i_y = _mm512_mask_sub_ps(forceSlow_i_y, r2mask,
                                           forceSlow_i_y, forceSlow_y);
        forceSlow_i_z = _mm512_mask_sub_ps(forceSlow_i_z, r2mask,
                                           forceSlow_i_z, forceSlow_z);
      }
    } // end loop over pairs in the list
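    // Reduce the 16 lanes and add the net result to atom pairI; the scalar
    // adds go into lane 0 of the net-force (virial) accumulators.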
    float fI = _mm512_reduce_add_ps(force_i_x);
    forces_i[pairI].x += fI;
    if (doVirial) *((float*)&fNet_x) += fI;
    fI = _mm512_reduce_add_ps(force_i_y);
    forces_i[pairI].y += fI;
    if (doVirial) *((float*)&fNet_y) += fI;
    fI = _mm512_reduce_add_ps(force_i_z);
    forces_i[pairI].z += fI;
    if (doVirial) *((float*)&fNet_z) += fI;
    if (doSlow) {
      fI = _mm512_reduce_add_ps(forceSlow_i_x);
      forcesSlow_i[pairI].x += fI;
      if (doVirial) *((float*)&fNetSlow_x) += fI;
      fI = _mm512_reduce_add_ps(forceSlow_i_y);
      forcesSlow_i[pairI].y += fI;
      if (doVirial) *((float*)&fNetSlow_y) += fI;
      fI = _mm512_reduce_add_ps(forceSlow_i_z);
      forcesSlow_i[pairI].z += fI;
      if (doVirial) *((float*)&fNetSlow_z) += fI;
    }
  } // end loop over pair lists
}
template <bool doEnergy, bool doVirial, bool doSlow, int interpMode>
void AVXTileLists::nbAVX512Modified(__m512 &energyVdw, __m512 &energyElec,
                                    __m512 &energySlow, __m512 &fNet_x,
                                    __m512 &fNet_y, __m512 &fNet_z,
                                    __m512 &fNetSlow_x, __m512 &fNetSlow_y,
                                    __m512 &fNetSlow_z) {
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forces_i = tiles_p0->forces;
  AVXTiles::AVXTilesForce* __restrict__ forces_j = tiles_p1->forces;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j;
  if (doSlow) {
    forcesSlow_i = tiles_p0->forcesSlow;
    forcesSlow_j = tiles_p1->forcesSlow;
  }

  const float* __restrict__ fastTable;
  const float* __restrict__ energyTable;
  if (interpMode == 3) {
    fastTable = _paramFastTable;
    energyTable = _paramFastEnergyTable;
  }
  const float * __restrict__ mod_table = _paramModifiedTable;
  const float * __restrict__ mode_table = _paramModifiedEnergyTable;

  // Parameters for modified 1-4 Lennard-Jones interactions.
  const float * __restrict__ ljTable14;
  const float * __restrict__ eps4sigma14;
  if (interpMode > 1)
    ljTable14 = _paramLjTable + 2;
  if (interpMode == 1)
    eps4sigma14 = _paramEps4Sigma14;
  __mmask16 loopMask = 0xFFFF;
  for (int mv = 0; mv < _numModified; mv += 16) {
    if (_numModified - mv < 16)
      loopMask >>= (16 - (_numModified - mv));
    const __m512i i = _mm512_load_epi32(_modified_i + mv);
    const __m512i j = _mm512_load_epi32(_modified_j + mv);
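    // Gather coordinates and charges for the i and j atoms of each modified
    // pair; 64-bit gathers fetch (x,y) and (z,q) pairs.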
    const __m512i it2 = _mm512_slli_epi32(i, 1);
    const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t8 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i it2_2 = _mm512_shuffle_i32x4(it2, it2, 238);
    const __mmask16 loopMask2 = loopMask >> 8;
    const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t9 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                        15,13,11,9,7,5,3,1);
    const __m512 y_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t4, t1),
                                     _mm512_set1_ps(_shy));
    const __m512 q_i = _mm512_permutex2var_ps(t8, t4, t9);
    const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                        14,12,10,8,6,4,2,0);
    const __m512 x_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t6, t1),
                                     _mm512_set1_ps(_shx));
    const __m512 z_i = _mm512_add_ps(_mm512_permutex2var_ps(t8, t6, t9),
                                     _mm512_set1_ps(_shz));
    const __m512i jt2 = _mm512_slli_epi32(j, 1);
    const __m512 t10 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t18 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512i jt2_2 = _mm512_shuffle_i32x4(jt2, jt2, 238);
    const __m512 t11 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t19 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512 y_j = _mm512_permutex2var_ps(t10, t4, t11);
    const __m512 q_j = _mm512_permutex2var_ps(t18, t4, t19);
    const __m512 x_j = _mm512_permutex2var_ps(t10, t6, t11);
    const __m512 z_j = _mm512_permutex2var_ps(t18, t6, t19);
    // Modified 1-4 electrostatics is scaled by _paramScale14.
    const __m512 kqq = _mm512_mul_ps(q_i, _mm512_mul_ps(q_j,
                                     _mm512_set1_ps(_paramScale14)));

    const __m512 dx = _mm512_sub_ps(x_i, x_j);
    const __m512 dy = _mm512_sub_ps(y_i, y_j);
    const __m512 dz = _mm512_sub_ps(z_i, z_j);
    const __m512 r2(_mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                    _mm512_mul_ps(dz, dz))));
    __mmask16 r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2)) &
      loopMask;

    const __m512i type_i = _mm512_mask_i32gather_epi32(
      _mm512_undefined_epi32(), r2mask, i, tiles_p0->vdwTypes, _MM_SCALE_4);
    const __m512i type_j = _mm512_mask_i32gather_epi32(
      _mm512_undefined_epi32(), r2mask, j, tiles_p1->vdwTypes, _MM_SCALE_4);
    __m512 eps4_i14, sigma_i14, eps4_j14, sigma_j14;
    if (interpMode == 1) {
      const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask, type_i, eps4sigma14, _MM_SCALE_8);
      const __m512i type_i2 = _mm512_shuffle_i32x4(type_i, type_i, 238);
      const __mmask16 r2mask2 = r2mask >> 8;
      const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask2, type_i2, eps4sigma14, _MM_SCALE_8);
      const __m512 t10 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask, type_j, eps4sigma14, _MM_SCALE_8);
      const __m512i type_j2 = _mm512_shuffle_i32x4(type_j, type_j, 238);
      const __m512 t11 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask2, type_j2, eps4sigma14, _MM_SCALE_8);
      sigma_i14 = _mm512_permutex2var_ps(t0, t4, t1);
      sigma_j14 = _mm512_permutex2var_ps(t10, t4, t11);
      eps4_i14 = _mm512_permutex2var_ps(t0, t6, t1);
      eps4_j14 = _mm512_permutex2var_ps(t10, t6, t11);
    }
    __m512 force, forceSlow;
    if (interpMode == 1)
      forceEnergyInterp1<doEnergy, doSlow>(r2, kqq, force, forceSlow,
        energyVdw, energyElec, energySlow, r2mask, _paramC1, _paramC3,
        _paramSwitchOn2, _cutoff2, _paramMinvCut3, _paramCutUnder3,
        mod_table, mode_table, eps4_i14, eps4_j14, sigma_i14, sigma_j14);
    else
      forceEnergyInterp2<doEnergy, doSlow, interpMode>(r2, kqq, type_i,
        type_j, force, forceSlow, energyVdw, energyElec, energySlow, r2mask,
        _paramScale, _paramC1, _paramC3, _paramSwitchOn2, _cutoff2,
        _paramMinvCut3, _paramCutUnder3, fastTable, energyTable,
        mod_table, mode_table, ljTable14, _paramLjWidth);
    const __m512 force_i_x = _mm512_mul_ps(dx, force);
    const __m512 force_i_y = _mm512_mul_ps(dy, force);
    const __m512 force_i_z = _mm512_mul_ps(dz, force);
    __m512 forceSlow_i_x, forceSlow_i_y, forceSlow_i_z;
    if (doSlow) {
      forceSlow_i_x = _mm512_mul_ps(dx, forceSlow);
      forceSlow_i_y = _mm512_mul_ps(dy, forceSlow);
      forceSlow_i_z = _mm512_mul_ps(dz, forceSlow);
    }

    if (doVirial) {
      // Modified pairs subtract from the running net force.
      fNet_x = _mm512_mask_sub_ps(fNet_x, r2mask, fNet_x, force_i_x);
      fNet_y = _mm512_mask_sub_ps(fNet_y, r2mask, fNet_y, force_i_y);
      fNet_z = _mm512_mask_sub_ps(fNet_z, r2mask, fNet_z, force_i_z);
      if (doSlow) {
        fNetSlow_x = _mm512_mask_sub_ps(fNetSlow_x, r2mask,
                                        fNetSlow_x, forceSlow_i_x);
        fNetSlow_y = _mm512_mask_sub_ps(fNetSlow_y, r2mask,
                                        fNetSlow_y, forceSlow_i_y);
        fNetSlow_z = _mm512_mask_sub_ps(fNetSlow_z, r2mask,
                                        fNetSlow_z, forceSlow_i_z);
      }
    }
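    // Scalar write-back: peel the mask one lane at a time and apply equal
    // and opposite forces to the i and j atoms.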
    for (int z = 0; z < 16; z++) {
      if (!(r2mask & 1)) {
        r2mask >>= 1;
        continue;
      }
      r2mask >>= 1;
      const int i_z = *((int*)&i + z);
      const int j_z = *((int*)&j + z);
      forces_i[i_z].x -= *((float*)&force_i_x + z);
      forces_i[i_z].y -= *((float*)&force_i_y + z);
      forces_i[i_z].z -= *((float*)&force_i_z + z);
      forces_j[j_z].x += *((float*)&force_i_x + z);
      forces_j[j_z].y += *((float*)&force_i_y + z);
      forces_j[j_z].z += *((float*)&force_i_z + z);
      if (doSlow) {
        forcesSlow_i[i_z].x -= *((float*)&forceSlow_i_x + z);
        forcesSlow_i[i_z].y -= *((float*)&forceSlow_i_y + z);
        forcesSlow_i[i_z].z -= *((float*)&forceSlow_i_z + z);
        forcesSlow_j[j_z].x += *((float*)&forceSlow_i_x + z);
        forcesSlow_j[j_z].y += *((float*)&forceSlow_i_y + z);
        forcesSlow_j[j_z].z += *((float*)&forceSlow_i_z + z);
      }
    }
  } // end loop over modified pairs
}
// Correction kernel for excluded pairs within the cutoff: only the slow
// (long-range) component needs to be removed.
template <bool doEnergy, bool doVirial>
void AVXTileLists::nbAVX512Excluded(__m512 &energySlow, __m512 &fNetSlow_x,
                                    __m512 &fNetSlow_y, __m512 &fNetSlow_z) {
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i = tiles_p0->forcesSlow;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j = tiles_p1->forcesSlow;
  const float * __restrict__ exclTable = _paramExcludedTable;
  const float * __restrict__ exclEtable = _paramExcludedEnergyTable;

  __mmask16 loopMask = 0xFFFF;
  for (int mv = 0; mv < _numExcluded; mv += 16) {
    if (_numExcluded - mv < 16)
      loopMask >>= (16 - (_numExcluded - mv));
    const __m512i i = _mm512_load_epi32(_excluded_i + mv);
    const __m512i j = _mm512_load_epi32(_excluded_j + mv);
    const __m512i it2 = _mm512_slli_epi32(i, 1);
    const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t8 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i it2_2 = _mm512_shuffle_i32x4(it2, it2, 238);
    const __mmask16 loopMask2 = loopMask >> 8;
    const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t9 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                        15,13,11,9,7,5,3,1);
    const __m512 y_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t4, t1),
                                     _mm512_set1_ps(_shy));
    const __m512 q_i = _mm512_permutex2var_ps(t8, t4, t9);
    const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                        14,12,10,8,6,4,2,0);
    const __m512 x_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t6, t1),
                                     _mm512_set1_ps(_shx));
    const __m512 z_i = _mm512_add_ps(_mm512_permutex2var_ps(t8, t6, t9),
                                     _mm512_set1_ps(_shz));
    const __m512i jt2 = _mm512_slli_epi32(j, 1);
    const __m512 t10 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t18 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512i jt2_2 = _mm512_shuffle_i32x4(jt2, jt2, 238);
    const __m512 t11 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t19 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512 y_j = _mm512_permutex2var_ps(t10, t4, t11);
    const __m512 q_j = _mm512_permutex2var_ps(t18, t4, t19);
    const __m512 x_j = _mm512_permutex2var_ps(t10, t6, t11);
    const __m512 z_j = _mm512_permutex2var_ps(t18, t6, t19);
    const __m512 kqq = _mm512_mul_ps(q_i, q_j);

    const __m512 dx = _mm512_sub_ps(x_i, x_j);
    const __m512 dy = _mm512_sub_ps(y_i, y_j);
    const __m512 dz = _mm512_sub_ps(z_i, z_j);
    const __m512 r2(_mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                    _mm512_mul_ps(dz, dz))));
    __mmask16 r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2)) &
      loopMask;

    // Table interpolation for the excluded slow-force correction.
    const __m512 r_1 = _mm512_invsqrt_ps(r2);
    __m512 forceSlow, tableDiff, rTableDiff;
    __m512i table_int;
    getOmpSimdTableI(r_1, table_int, tableDiff, rTableDiff);
    forceEnergySlow512<doEnergy>(r2mask, kqq, exclTable, exclEtable,
                                 table_int, tableDiff, rTableDiff, forceSlow,
                                 energySlow);
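    // Apply the correction with opposite signs to the i and j atoms, and
    // subtract it from the slow net force for the virial.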
    const __m512 forceSlow_i_x = _mm512_mul_ps(dx, forceSlow);
    const __m512 forceSlow_i_y = _mm512_mul_ps(dy, forceSlow);
    const __m512 forceSlow_i_z = _mm512_mul_ps(dz, forceSlow);

    if (doVirial) {
      fNetSlow_x = _mm512_mask_sub_ps(fNetSlow_x, r2mask,
                                      fNetSlow_x, forceSlow_i_x);
      fNetSlow_y = _mm512_mask_sub_ps(fNetSlow_y, r2mask,
                                      fNetSlow_y, forceSlow_i_y);
      fNetSlow_z = _mm512_mask_sub_ps(fNetSlow_z, r2mask,
                                      fNetSlow_z, forceSlow_i_z);
    }
    for (int z = 0; z < 16; z++) {
      if (!(r2mask & 1)) {
        r2mask >>= 1;
        continue;
      }
      r2mask >>= 1;
      const int i_z = *((int*)&i + z);
      const int j_z = *((int*)&j + z);
      forcesSlow_i[i_z].x -= *((float*)&forceSlow_i_x + z);
      forcesSlow_i[i_z].y -= *((float*)&forceSlow_i_y + z);
      forcesSlow_i[i_z].z -= *((float*)&forceSlow_i_z + z);
      forcesSlow_j[j_z].x += *((float*)&forceSlow_i_x + z);
      forcesSlow_j[j_z].y += *((float*)&forceSlow_i_y + z);
      forcesSlow_j[j_z].z += *((float*)&forceSlow_i_z + z);
    }
  } // end loop over excluded pairs
}
template <bool doEnergy, bool doVirial, bool doSlow, bool doList,
          int interpMode>
void AVXTileLists::doAll() {
  if (doList) build();
  __m512 energyVdw, energyElec, energySlow;
  __m512 fNet_x, fNet_y, fNet_z;
  __m512 fNetSlow_x, fNetSlow_y, fNetSlow_z;
  if (doEnergy) {
    energyVdw = _mm512_setzero_ps();
    energyElec = _mm512_setzero_ps();
    if (doSlow) energySlow = _mm512_setzero_ps();
  }
  if (doVirial) {
    fNet_x = _mm512_setzero_ps();
    fNet_y = _mm512_setzero_ps();
    fNet_z = _mm512_setzero_ps();
    if (doSlow) {
      fNetSlow_x = _mm512_setzero_ps();
      fNetSlow_y = _mm512_setzero_ps();
      fNetSlow_z = _mm512_setzero_ps();
    }
  }
  // Tile-based kernel (optionally building the tile lists).
  nbAVX512Tiles<doEnergy, doVirial, doSlow, doList,
                interpMode>(energyVdw, energyElec, energySlow, fNet_x, fNet_y,
                            fNet_z, fNetSlow_x, fNetSlow_y, fNetSlow_z);

  if (doList) delEmptyLists();

  // Kernel for small i-tiles with per-atom pair lists.
  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && _numPairLists)
    nbAVX512Pairs<doEnergy, doVirial, doSlow,
                  interpMode>(energyVdw, energyElec, energySlow, fNet_x,
                              fNet_y, fNet_z, fNetSlow_x, fNetSlow_y,
                              fNetSlow_z);

  // Corrections for modified (1-4) pairs.
  if (_numModified)
    nbAVX512Modified<doEnergy, doVirial, doSlow,
                     interpMode>(energyVdw, energyElec, energySlow, fNet_x,
                                 fNet_y, fNet_z, fNetSlow_x, fNetSlow_y,
                                 fNetSlow_z);

  // Corrections for excluded pairs within the cutoff.
  if (doSlow && _numExcluded)
    nbAVX512Excluded<doEnergy, doVirial>(energySlow, fNetSlow_x,
                                         fNetSlow_y, fNetSlow_z);
  if (doEnergy) {
    _energyVdw = _mm512_reduce_add_ps(energyVdw);
    // LJ energies from the parameter table carry a deferred scale factor.
    if (interpMode > 1) _energyVdw *= _paramScale * 0.5f;
    _energyElec = _mm512_reduce_add_ps(energyElec);
    if (doSlow) _energySlow = _mm512_reduce_add_ps(energySlow);
  }

  if (doVirial) {
    _fNet_x = _mm512_reduce_add_ps(fNet_x);
    _fNet_y = _mm512_reduce_add_ps(fNet_y);
    _fNet_z = _mm512_reduce_add_ps(fNet_z);
    if (doSlow) {
      _fNetSlow_x = _mm512_reduce_add_ps(fNetSlow_x);
      _fNetSlow_y = _mm512_reduce_add_ps(fNetSlow_y);
      _fNetSlow_z = _mm512_reduce_add_ps(fNetSlow_z);
    }
  }
  // If any work was done, mark the tile force data as modified (touch()
  // assumed from the AVXTiles interface; the body is elided in this
  // excerpt).
  if (_numLists || _numModified || _numExcluded ||
      (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && _numPairLists)) {
    tiles_p0->touch();
    tiles_p1->touch();
  }
}
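// Runtime dispatch from integer flags to the fully templated doAll()
// instantiations.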
void AVXTileLists::nbForceAVX512(const int doEnergy, const int doVirial,
                                 const int doSlow, const int doList) {
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, INTERPMODE) \
  doAll<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, INTERPMODE>();
  if (_interpolationMode == 1) {
    if (!doEnergy && !doVirial && !doSlow && !doList) CALL(0, 0, 0, 0, 1);
    if (!doEnergy && !doVirial &&  doSlow && !doList) CALL(0, 0, 1, 0, 1);
    if (!doEnergy &&  doVirial && !doSlow && !doList) CALL(0, 1, 0, 0, 1);
    if (!doEnergy &&  doVirial &&  doSlow && !doList) CALL(0, 1, 1, 0, 1);
    if ( doEnergy && !doVirial && !doSlow && !doList) CALL(1, 0, 0, 0, 1);
    if ( doEnergy && !doVirial &&  doSlow && !doList) CALL(1, 0, 1, 0, 1);
    if ( doEnergy &&  doVirial && !doSlow && !doList) CALL(1, 1, 0, 0, 1);
    if ( doEnergy &&  doVirial &&  doSlow && !doList) CALL(1, 1, 1, 0, 1);

    if (!doEnergy && !doVirial && !doSlow &&  doList) CALL(0, 0, 0, 1, 1);
    if (!doEnergy && !doVirial &&  doSlow &&  doList) CALL(0, 0, 1, 1, 1);
    if (!doEnergy &&  doVirial && !doSlow &&  doList) CALL(0, 1, 0, 1, 1);
    if (!doEnergy &&  doVirial &&  doSlow &&  doList) CALL(0, 1, 1, 1, 1);
    if ( doEnergy && !doVirial && !doSlow &&  doList) CALL(1, 0, 0, 1, 1);
    if ( doEnergy && !doVirial &&  doSlow &&  doList) CALL(1, 0, 1, 1, 1);
    if ( doEnergy &&  doVirial && !doSlow &&  doList) CALL(1, 1, 0, 1, 1);
    if ( doEnergy &&  doVirial &&  doSlow &&  doList) CALL(1, 1, 1, 1, 1);
  } else if (_interpolationMode == 2) {
    if (!doEnergy && !doVirial && !doSlow && !doList) CALL(0, 0, 0, 0, 2);
    if (!doEnergy && !doVirial &&  doSlow && !doList) CALL(0, 0, 1, 0, 2);
    if (!doEnergy &&  doVirial && !doSlow && !doList) CALL(0, 1, 0, 0, 2);
    if (!doEnergy &&  doVirial &&  doSlow && !doList) CALL(0, 1, 1, 0, 2);
    if ( doEnergy && !doVirial && !doSlow && !doList) CALL(1, 0, 0, 0, 2);
    if ( doEnergy && !doVirial &&  doSlow && !doList) CALL(1, 0, 1, 0, 2);
    if ( doEnergy &&  doVirial && !doSlow && !doList) CALL(1, 1, 0, 0, 2);
    if ( doEnergy &&  doVirial &&  doSlow && !doList) CALL(1, 1, 1, 0, 2);

    if (!doEnergy && !doVirial && !doSlow &&  doList) CALL(0, 0, 0, 1, 2);
    if (!doEnergy && !doVirial &&  doSlow &&  doList) CALL(0, 0, 1, 1, 2);
    if (!doEnergy &&  doVirial && !doSlow &&  doList) CALL(0, 1, 0, 1, 2);
    if (!doEnergy &&  doVirial &&  doSlow &&  doList) CALL(0, 1, 1, 1, 2);
    if ( doEnergy && !doVirial && !doSlow &&  doList) CALL(1, 0, 0, 1, 2);
    if ( doEnergy && !doVirial &&  doSlow &&  doList) CALL(1, 0, 1, 1, 2);
    if ( doEnergy &&  doVirial && !doSlow &&  doList) CALL(1, 1, 0, 1, 2);
    if ( doEnergy &&  doVirial &&  doSlow &&  doList) CALL(1, 1, 1, 1, 2);
  } else {
    if (!doEnergy && !doVirial && !doSlow && !doList) CALL(0, 0, 0, 0, 3);
    if (!doEnergy && !doVirial &&  doSlow && !doList) CALL(0, 0, 1, 0, 3);
    if (!doEnergy &&  doVirial && !doSlow && !doList) CALL(0, 1, 0, 0, 3);
    if (!doEnergy &&  doVirial &&  doSlow && !doList) CALL(0, 1, 1, 0, 3);
    if ( doEnergy && !doVirial && !doSlow && !doList) CALL(1, 0, 0, 0, 3);
    if ( doEnergy && !doVirial &&  doSlow && !doList) CALL(1, 0, 1, 0, 3);
    if ( doEnergy &&  doVirial && !doSlow && !doList) CALL(1, 1, 0, 0, 3);
    if ( doEnergy &&  doVirial &&  doSlow && !doList) CALL(1, 1, 1, 0, 3);

    if (!doEnergy && !doVirial && !doSlow &&  doList) CALL(0, 0, 0, 1, 3);
    if (!doEnergy && !doVirial &&  doSlow &&  doList) CALL(0, 0, 1, 1, 3);
    if (!doEnergy &&  doVirial && !doSlow &&  doList) CALL(0, 1, 0, 1, 3);
    if (!doEnergy &&  doVirial &&  doSlow &&  doList) CALL(0, 1, 1, 1, 3);
    if ( doEnergy && !doVirial && !doSlow &&  doList) CALL(1, 0, 0, 1, 3);
    if ( doEnergy && !doVirial &&  doSlow &&  doList) CALL(1, 0, 1, 1, 3);
    if ( doEnergy &&  doVirial && !doSlow &&  doList) CALL(1, 1, 0, 1, 3);
    if ( doEnergy &&  doVirial &&  doSlow &&  doList) CALL(1, 1, 1, 1, 3);
  }
#undef CALL
}
#endif // NAMD_AVXTILES