#include "AVXTileLists.h"

// -- WMB: For exclusions
#include "Node.h"
#include "Molecule.h"
// --

#include "AVXTilesKernel.h"

#ifdef NAMD_AVXTILES

#define MAX(A,B) ((A) > (B) ? (A) : (B))

#ifndef MEM_OPT_VERSION
const char * AVXTileLists::buildExclFlyList(const int itileList, const int z,
                                            const __m512i &atomIndex_i,
                                            const int n, void *molIn) {
  if (itileList == _lastFlyListTile) return _exclFlyLists[z];

  Molecule *mol = (Molecule *)molIn;

  if (!_exclFlyListBuffer) {
    _exclFlyListBuffer = new char[mol->numAtoms * 16];
    for (int i = 0; i < 16; i++)
      _exclFlyLists[i] = _exclFlyListBuffer + mol->numAtoms * i;
  }
  if (_lastFlyListTile == -1)
    memset( (void*) _exclFlyListBuffer, 0, mol->numAtoms * 16);

  for (int i = 0; i < 16; i++) {
    if (i >= n) break;
    char *exclFlyList = _exclFlyLists[i];
    const int32 *& fullExcl = _fullExcl[i];
    const int32 *& modExcl = _modExcl[i];

    if (_lastFlyListTile != -1) {
      int nl;
      nl = fullExcl[0] + 1;
      for (int l=1; l<nl; ++l ) exclFlyList[fullExcl[l]] = 0;
      nl = modExcl[0] + 1;
      for (int l=1; l<nl; ++l ) exclFlyList[modExcl[l]] = 0;
    }

    int nl;
    const int id = *((int*)&(atomIndex_i) + i);
    fullExcl = mol->get_full_exclusions_for_atom(id);
    nl = fullExcl[0] + 1;
    for (int l=1; l<nl; ++l )
      exclFlyList[fullExcl[l]] = EXCHCK_FULL;
    modExcl = mol->get_mod_exclusions_for_atom(id);
    nl = modExcl[0] + 1;
    for (int l=1; l<nl; ++l )
      exclFlyList[modExcl[l]] = EXCHCK_MOD;
  } // for i

  _lastFlyListTile = itileList;
  return _exclFlyLists[z];
}
#endif
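
// Note on the fly-list scheme above (summary comment, not from the original
// sources): _exclFlyListBuffer provides one char flag per molecule atom for
// each of the 16 i-atom lanes, so a lookup is just exclFlyList[jAtomIndex]
// returning 0, EXCHCK_FULL, or EXCHCK_MOD. Flags written for the previous
// tile are zeroed before reuse, keeping the shared buffer consistent without
// a full memset per tile.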

//---------------------------------------------------------------------------
// Calculations for unmodified/unexcluded from tile lists
//---------------------------------------------------------------------------

template <bool doEnergy, bool doVirial, bool doSlow, bool doList,
          int interpMode>
void AVXTileLists::nbAVX512Tiles(__m512 &energyVdw, __m512 &energyElec,
                                 __m512 &energySlow, __m512 &fNet_x,
                                 __m512 &fNet_y, __m512 &fNet_z,
                                 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
                                 __m512 &fNetSlow_z) {

  #ifndef MEM_OPT_VERSION
  _lastFlyListTile = -1;
  #endif

  Molecule* mol = Node::Object()->molecule;

  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forces_i = tiles_p0->forces;
  AVXTiles::AVXTilesForce* __restrict__ forces_j = tiles_p1->forces;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j;
  if (doSlow) {
    forcesSlow_i = tiles_p0->forcesSlow;
    forcesSlow_j = tiles_p1->forcesSlow;
  }

  const float* __restrict__ fastTable;
  const float* __restrict__ energyTable;
  const float * __restrict__ ljTable;
  const float * __restrict__ eps4sigma;
  // Interpolation for long-range splitting function
  if (interpMode == 3) {
    fastTable = _paramFastTable;
    energyTable = _paramFastEnergyTable;
  }
  // LJ mixing not performed within kernel
  if (interpMode > 1) ljTable = _paramLjTable;
  // LJ mixing performed within kernel
  if (interpMode == 1) eps4sigma = _paramEps4Sigma;

  const float* __restrict__ slowTable = _paramSlowTable;
  const float* __restrict__ slowEnergyTable = _paramSlowEnergyTable;

  #ifdef TILE_LIST_STAT_DEBUG
  int num_jtiles = 0;
  int num_jtiles_empty = 0;
  int num_rotates = 0;
  int num_rotates_empty = 0;
  int num_rotates_excl_empty = 0;
  #endif

  int numModified, numExcluded;
  if (doList) {
    numModified = 0;
    // WMB: Only need for slow electrostatics
    numExcluded = 0;
  }

  const bool zeroShift = ! (_shx*_shx + _shy*_shy + _shz*_shz > 0) &&
    (tiles_p0 == tiles_p1);

  const int numTileLists = numLists();
  for (int itileList = 0; itileList < numTileLists; itileList++) {

    bool jTileActive;
    unsigned int itileListLen;

    const int atomStart_i = lists[itileList].atomStart_i;
    const int jtileStart = lists[itileList].jtileStart;
    const int jtileEnd = lists[itileList + 1].jtileStart;

    int atomSize_i, atomFreeSize_i, atomSize_j, atomFreeSize_j;
    int nFree_i;
    __mmask16 freeMask_i, sizeMask_i;
    bool doIpairList;
    float bbox_x, bbox_y, bbox_z, bbox_wx, bbox_wy, bbox_wz;
    __m512i atomIndex_i;
#ifdef MEM_OPT_VERSION
    __m512i atomExclIndex_i;
#endif
    __m512i exclIndex_i, exclMaxDiff_i;
    if (doList) {
      atomSize_i = tiles_p0->numAtoms();
      atomFreeSize_i = tiles_p0->numFreeAtoms();
      atomSize_j = tiles_p1->numAtoms();
      atomFreeSize_j = tiles_p1->numFreeAtoms();

      freeMask_i = 0xFFFF;
      nFree_i = MAX(atomFreeSize_i - atomStart_i, 0);
      if (nFree_i < 16)
        freeMask_i >>= 16 - nFree_i;
      sizeMask_i = 0xFFFF;
      const int check_i = atomSize_i - atomStart_i;
      if (check_i < 16) {
        sizeMask_i >>= 16 - check_i;
        if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
          if (check_i <= NAMD_AVXTILES_PAIR_THRESHOLD) doIpairList = true;
          else doIpairList = false;
        }
      } else if (NAMD_AVXTILES_PAIR_THRESHOLD > 0)
        doIpairList = false;

      const int tileNum = atomStart_i >> 4;
      bbox_x = tiles_p0->bbox_x[tileNum] + _shx;
      bbox_y = tiles_p0->bbox_y[tileNum] + _shy;
      bbox_z = tiles_p0->bbox_z[tileNum] + _shz;
      bbox_wx = tiles_p0->bbox_wx[tileNum];
      bbox_wy = tiles_p0->bbox_wy[tileNum];
      bbox_wz = tiles_p0->bbox_wz[tileNum];

      itileListLen = 0;
      atomIndex_i = _mm512_load_epi32(tiles_p0->atomIndex + atomStart_i);
#ifdef MEM_OPT_VERSION
      atomExclIndex_i = _mm512_load_epi32(tiles_p0->atomExclIndex +
                                          atomStart_i);
#endif
      exclIndex_i = _mm512_load_epi32(tiles_p0->exclIndexStart + atomStart_i);
      exclMaxDiff_i = _mm512_load_epi32(tiles_p0->exclIndexMaxDiff +
                                        atomStart_i);
    }

    // Load i-atom data (and shift coordinates)
    const float *iptr = (float *)(xyzq_i + atomStart_i);
    const __m512 t0 = _mm512_load_ps(iptr);
    const __m512 t1 = _mm512_load_ps(iptr+16);
    const __m512 t2 = _mm512_load_ps(iptr+32);
    const __m512 t3 = _mm512_load_ps(iptr+48);
    const __m512i t4 = _mm512_set_epi32(29,25,21,17,13,9,5,1,
                                        28,24,20,16,12,8,4,0);
    const __m512 t5 = _mm512_permutex2var_ps(t0, t4, t1);
    const __m512i t6 = _mm512_set_epi32(28,24,20,16,12,8,4,0,
                                        29,25,21,17,13,9,5,1);
    const __m512 t7 = _mm512_permutex2var_ps(t2, t6, t3);
    const __mmask16 t9 = 0xFF00;
    const __m512i t5i = _mm512_castps_si512(t5);
    const __m512i t7i = _mm512_castps_si512(t7);
    const __m512 x_i = _mm512_add_ps(_mm512_castsi512_ps(
      _mm512_mask_blend_epi32(t9, t5i, t7i)), _mm512_set1_ps(_shx));
    const __m512 y_i = _mm512_add_ps(_mm512_shuffle_f32x4(t5, t7, 0x4E),
                                     _mm512_set1_ps(_shy));
    const __m512i t12 = _mm512_set_epi32(31,27,23,19,15,11,7,3,
                                         30,26,22,18,14,10,6,2);
    const __m512 t13 = _mm512_permutex2var_ps(t0, t12, t1);
    const __m512i t14 = _mm512_set_epi32(30,26,22,18,14,10,6,2,
                                         31,27,23,19,15,11,7,3);
    const __m512 t15 = _mm512_permutex2var_ps(t2, t14, t3);
    const __m512i t13i = _mm512_castps_si512(t13);
    const __m512i t15i = _mm512_castps_si512(t15);
    const __m512 z_i = _mm512_add_ps(_mm512_castsi512_ps(
      _mm512_mask_blend_epi32(t9, t13i, t15i)), _mm512_set1_ps(_shz));
    const __m512 q_i = _mm512_castsi512_ps(_mm512_shuffle_i32x4(t13i, t15i,
                                                                0x4E));
    const __m512i type_i = _mm512_load_epi32(tiles_p0->vdwTypes+atomStart_i);
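
    // The four aligned loads plus the permute/blend sequence above transpose
    // 16 AoS AVXTilesAtom entries into SoA vectors. Scalar view
    // (illustrative only):
    //   x_i[k] = xyzq_i[atomStart_i + k].x + _shx;  // same pattern for y, z
    //   q_i[k] = xyzq_i[atomStart_i + k].q;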

    // WMB: This can be masked by sizeMask_i; currently only get for doList
    __m512 eps4_i, sigma_i;
    if (interpMode == 1) {
      const __m512 t0 = (__m512)_mm512_i32logather_pd(type_i, eps4sigma,
                                                      _MM_SCALE_8);
      const __m512i type_i2 = _mm512_shuffle_i32x4(type_i, type_i, 238);
      const __m512 t1 = (__m512)_mm512_i32logather_pd(type_i2, eps4sigma,
                                                      _MM_SCALE_8);
      const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                          15,13,11,9,7,5,3,1);
      sigma_i = _mm512_permutex2var_ps(t0, t4, t1);
      const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                          14,12,10,8,6,4,2,0);
      eps4_i = _mm512_permutex2var_ps(t0, t6, t1);
    }
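
    // For interpMode 1 the eps4sigma table stores interleaved {eps4, sigma}
    // pairs indexed by vdW type, so each 64-bit gather fetches both values
    // for a lane at once. Scalar view (illustrative only):
    //   eps4_i[k] = eps4sigma[2*type];  sigma_i[k] = eps4sigma[2*type + 1];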

    // Zero i-tile force vectors
    __m512 forceSlow_i_x, forceSlow_i_y, forceSlow_i_z;
    __m512 force_i_x = _mm512_setzero_ps();
    __m512 force_i_y = _mm512_setzero_ps();
    __m512 force_i_z = _mm512_setzero_ps();
    if (doSlow) {
      forceSlow_i_x = _mm512_setzero_ps();
      forceSlow_i_y = _mm512_setzero_ps();
      forceSlow_i_z = _mm512_setzero_ps();
    }

    for (int jtile = jtileStart; jtile < jtileEnd; jtile++) {

      #ifdef TILE_LIST_STAT_DEBUG
      num_jtiles++;
      bool jtile_empty = true;
      #endif

      // Load j-atom starting index
      int atomStart_j = jTiles.atomStart[jtile];
      const bool self = zeroShift && (atomStart_i == atomStart_j);
      const int shiftVal = (self) ? 1 : 0;

      // Load j-atom positions / charges
      const float * jptr = (float *)(xyzq_j + atomStart_j);
      const __m512 t0 = _mm512_load_ps(jptr);
      const __m512 t1 = _mm512_load_ps(jptr+16);
      const __m512 t2 = _mm512_load_ps(jptr+32);
      const __m512 t3 = _mm512_load_ps(jptr+48);
      const __m512i t4 = _mm512_set_epi32(29,25,21,17,13,9,5,1,
                                          28,24,20,16,12,8,4,0);
      const __m512 t5 = _mm512_permutex2var_ps(t0, t4, t1);
      const __m512i t6 = _mm512_set_epi32(28,24,20,16,12,8,4,0,
                                          29,25,21,17,13,9,5,1);
      const __m512 t7 = _mm512_permutex2var_ps(t2, t6, t3);
      const __mmask16 t9 = 0xFF00;
      const __m512i t5i = _mm512_castps_si512(t5);
      const __m512i t7i = _mm512_castps_si512(t7);
      __m512 x_j = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t5i, t7i));
      __m512 y_j = _mm512_shuffle_f32x4(t5, t7, 0x4E);
      const __m512i t12 = _mm512_set_epi32(31,27,23,19,15,11,7,3,
                                           30,26,22,18,14,10,6,2);
      const __m512 t13 = _mm512_permutex2var_ps(t0, t12, t1);
      const __m512i t14 = _mm512_set_epi32(30,26,22,18,14,10,6,2,
                                           31,27,23,19,15,11,7,3);
      const __m512 t15 = _mm512_permutex2var_ps(t2, t14, t3);
      const __m512i t13i = _mm512_castps_si512(t13);
      const __m512i t15i = _mm512_castps_si512(t15);
      __m512 z_j = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t13i,
                                                               t15i));
      __m512 q_j = _mm512_castsi512_ps(_mm512_shuffle_i32x4(t13i, t15i, 0x4E));

      __m512i excl, atomIndex_j;
      __mmask16 freeMask_j, sizeMask_j;
      if (doList) {
        // Check for early bail from i-tile bounding box

        // dx = max(0.f, abs(bbox_x - x_j) - bbox_wx)
        const __m512 dx_one = _mm512_abs_ps(
          _mm512_sub_ps(_mm512_set1_ps(bbox_x), x_j));
        const __m512 dx_two = _mm512_set1_ps(bbox_wx);
        const __mmask16 lxmask = _mm512_cmplt_ps_mask(dx_two, dx_one);
        const __m512 dx = _mm512_mask_sub_ps(_mm512_setzero_ps(), lxmask,
                                             dx_one, dx_two);
        // dy = max(0.f, abs(bbox_y - y_j) - bbox_wy)
        const __m512 dy_one = _mm512_abs_ps(
          _mm512_sub_ps(_mm512_set1_ps(bbox_y), y_j));
        const __m512 dy_two = _mm512_set1_ps(bbox_wy);
        const __mmask16 lymask = _mm512_cmplt_ps_mask(dy_two, dy_one);
        const __m512 dy = _mm512_mask_sub_ps(_mm512_setzero_ps(), lymask,
                                             dy_one, dy_two);
        // dz = max(0.f, abs(bbox_z - z_j) - bbox_wz)
        const __m512 dz_one = _mm512_abs_ps(
          _mm512_sub_ps(_mm512_set1_ps(bbox_z), z_j));
        const __m512 dz_two = _mm512_set1_ps(bbox_wz);
        const __mmask16 lzmask = _mm512_cmplt_ps_mask(dz_two, dz_one);
        const __m512 dz = _mm512_mask_sub_ps(_mm512_setzero_ps(), lzmask,
                                             dz_one, dz_two);
        // r2bb = dx*dx + dy*dy + dz*dz
        const __m512 r2bb = _mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                                            _mm512_mul_ps(dz, dz)));

        // If no atoms within bounding box, skip this neighbor tile
        __mmask16 m = _mm512_cmple_ps_mask(r2bb, _mm512_set1_ps(_plcutoff2));
        if (!m) continue;
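
        // Box test, scalar view (illustrative only): the masked subtracts
        // clamp dx/dy/dz to zero inside the box, so r2bb is each j atom's
        // minimum possible squared distance to the i-tile bounding box; if
        // no lane is within _plcutoff2, no i/j pair from this tile pair can
        // enter the list.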

        // Load atom indices
        atomIndex_j = _mm512_load_epi32(tiles_p1->atomIndex+atomStart_j);
        // Zero exclusion data
        excl = _mm512_setzero_epi32();

        // Predication for number of j atoms and free atoms in tile
        freeMask_j = 0xFFFF;
        int nFree_j = MAX(atomFreeSize_j - atomStart_j, 0);
        if (nFree_j < 16)
          freeMask_j >>= 16 - nFree_j;
        const int check_j = atomSize_j - atomStart_j;
        sizeMask_j = 0xFFFF;
        if (check_j < 16)
          sizeMask_j >>= 16 - check_j;

        jTileActive = false;

        // If doing a pair list instead of tile list, make sure we have
        // room to store neighbor atom indices
        if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && doIpairList) {
          int maxPairs = _numPairs[0];
          for (int z = 1; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++)
            if (_numPairs[z] > maxPairs) maxPairs = _numPairs[z];
          if (maxPairs + 16 > _maxPairs) {
            reallocPairLists(0, 1.4 * _maxPairs);
          }
        }
      } else
        // Load tile exclusion data
        excl = _mm512_load_epi32(jTiles.excl + (jtile << 4));

      // Load j tile LJ types and epsilon/sigma for interp mode 1
      __m512i type_j = _mm512_load_epi32(tiles_p1->vdwTypes+atomStart_j);

      __m512 eps4_j, sigma_j;
      if (interpMode == 1) {
        const __m512 t0 = (__m512)_mm512_i32logather_pd(type_j, eps4sigma,
                                                        _MM_SCALE_8);
        const __m512i type_j2 = _mm512_shuffle_i32x4(type_j, type_j, 238);
        const __m512 t1 = (__m512)_mm512_i32logather_pd(type_j2, eps4sigma,
                                                        _MM_SCALE_8);
        const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                            15,13,11,9,7,5,3,1);
        sigma_j = _mm512_permutex2var_ps(t0, t4, t1);
        const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                            14,12,10,8,6,4,2,0);
        eps4_j = _mm512_permutex2var_ps(t0, t6, t1);
      }

      // Zero force vectors for j tile
      __m512 force_j_x = _mm512_setzero_ps();
      __m512 force_j_y = _mm512_setzero_ps();
      __m512 force_j_z = _mm512_setzero_ps();

      __m512 forceSlow_j_x, forceSlow_j_y, forceSlow_j_z;
      if (doSlow) {
        forceSlow_j_x = _mm512_setzero_ps();
        forceSlow_j_y = _mm512_setzero_ps();
        forceSlow_j_z = _mm512_setzero_ps();
      }

      int t = (self) ? 1 : 0;

      // WMB: If adding GBIS p 2, need to add check diagonal for excluded

      // Predication for self-interactions to prevent duplicate interactions
      __mmask16 mask_j;
      if (doList) mask_j = 0xFFFF;

      // If doList, excl is 0; if self, skipping first bit
      if (!doList && !self) excl = _mm512_slli_epi32(excl, 1);

      // Skip self interaction
      if (self) {
        x_j = (__m512)_mm512_alignr_epi32((__m512i)x_j, (__m512i)x_j, 1);
        y_j = (__m512)_mm512_alignr_epi32((__m512i)y_j, (__m512i)y_j, 1);
        z_j = (__m512)_mm512_alignr_epi32((__m512i)z_j, (__m512i)z_j, 1);
        q_j = (__m512)_mm512_alignr_epi32((__m512i)q_j, (__m512i)q_j, 1);
        if (doList) {
          freeMask_j = (freeMask_j>>1) | (freeMask_j<<15);
          sizeMask_j = (sizeMask_j>>1) | (sizeMask_j<<15);
          mask_j >>= shiftVal;
          atomIndex_j = _mm512_alignr_epi32(atomIndex_j, atomIndex_j, 1);
        }
        if (interpMode == 1) {
          eps4_j = (__m512)_mm512_alignr_epi32((__m512i)eps4_j,
                                               (__m512i)eps4_j, 1);
          sigma_j = (__m512)_mm512_alignr_epi32((__m512i)sigma_j,
                                                (__m512i)sigma_j, 1);
        } else
          type_j = _mm512_alignr_epi32(type_j, type_j, 1);
        // force is zero, no need to rotate
      }
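
      // Tile sweep (summary comment): on iteration t of the loop below,
      // lane k handles the pair (i = k, j = (k + t) & 15), so 16 alignr
      // rotations cover all 256 lane combinations. A self tile starts at
      // t = 1 to skip the diagonal, and mask_j shrinks by one bit per
      // rotation so duplicate (j, i) pairs are never double counted.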

      for (; t < 16; t++) {
        // Exclusion bit for next pair
        excl = _mm512_srli_epi32(excl, 1);
        __mmask16 freeMask_ij;
        if (doList)
          // Predication for partial tiles and fixed atoms at ends
          freeMask_ij = (freeMask_j | freeMask_i) & (sizeMask_j & sizeMask_i);
        else {
          #ifdef TILE_LIST_STAT_DEBUG
          __mmask16 r2mask = _mm512_cmpneq_epi32_mask(_mm512_and_epi32(excl,
            _mm512_set1_epi32(1)), _mm512_setzero_epi32());
          num_rotates++;
          if (!r2mask) {num_rotates_empty++; num_rotates_excl_empty++;}
          #endif
        }

        // ------------- Distance squared calculation and cutoff check
        __m512 dx, dy, dz, r2;
        __mmask16 r2mask;
        if (!doList || freeMask_ij) {
          // dx = x_j - x_i;
          dx = _mm512_sub_ps(x_j, x_i);
          // dy = y_j - y_i;
          dy = _mm512_sub_ps(y_j, y_i);
          // dz = z_j - z_i;
          dz = _mm512_sub_ps(z_j, z_i);
          // r2 = dx*dx + dy*dy + dz*dz;
          r2 = _mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                               _mm512_mul_ps(dz, dz)));

          #ifdef TILE_LIST_STAT_DEBUG
          if (!doList) {
            __mmask16 t2 = _mm512_cmple_ps_mask(r2,
              _mm512_set1_ps(_plcutoff2));
            t2 &= _mm512_cmpneq_epi32_mask(_mm512_and_epi32(excl,
              _mm512_set1_epi32(1)), _mm512_setzero_epi32());
            if (!t2) num_rotates_empty++; else jtile_empty = false;
          }
          #endif

          if (doList) {
            // Predication for atom pairs within build cutoff
            r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_plcutoff2)) &
                     mask_j;
            r2mask &= freeMask_ij;
          } else {
            // Predication for atom pairs within force cutoff
            r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2));
            r2mask &= _mm512_cmpneq_epi32_mask(_mm512_and_epi32(excl,
              _mm512_set1_epi32(1)), _mm512_setzero_epi32());
          }
        } else
          r2mask = 0;

        // ------------- Exclusions
        if (doList && r2mask) {
          if (_numModifiedAlloc - numModified < 16) {
            int newNum = static_cast<double>(numModified) *
              numTileLists / itileList;
            if (newNum < _numModifiedAlloc + 16)
              newNum = _numModifiedAlloc + 16;
            reallocModified(newNum);
          }
          if (_numExcludedAlloc - numExcluded < 16) {
            int newNum = static_cast<double>(numExcluded) *
              numTileLists / itileList;
            if (newNum < _numExcludedAlloc + 16)
              newNum = _numExcludedAlloc + 16;
            reallocExcluded(newNum);
          }

          // Predication for exclusions and cutoff
          __mmask16 excludedMask = _mm512_cmpge_epi32_mask(atomIndex_j,
                                                           exclIndex_i);
          excludedMask &= _mm512_cmple_epi32_mask(atomIndex_j, exclMaxDiff_i);
          excludedMask &= r2mask;
          __mmask16 scalarPos = 1;

          // Check each pair for exclusions
          for (int z = 0; z < 16; z++) {
            if (scalarPos & excludedMask) {
              #ifdef MEM_OPT_VERSION
              const char *exclFlags =
                mol->get_excl_check_for_idx(*((int*)&(atomExclIndex_i) + z))->
                flags;
              #else
              const char *exclFlags =
                mol->get_excl_check_for_atom(*((int*)&(atomIndex_i) + z))->
                flags;
              #endif
              #ifndef MEM_OPT_VERSION
              if (exclFlags)
              #endif
                exclFlags -= *((int*)&(exclIndex_i) + z);
              #ifndef MEM_OPT_VERSION
              else {
                #pragma noinline
                exclFlags = buildExclFlyList(itileList, z, atomIndex_i,
                                             atomSize_i - atomStart_i, mol);
              }
              #endif
              const int exclType = exclFlags[*((int*)&atomIndex_j + z)];
              if (exclType == 0)
                // Clear exclusion bit
                excludedMask &= ~scalarPos;
              else if (exclType == 2) {
                // Exclude from tiles and add pair to modified list
                _modified_i[numModified] = atomStart_i + z;
                int jpos = (z + t) & 15;
                _modified_j[numModified] = atomStart_j + jpos;
                if (*((float*)&r2 + z) <= _cutoff2)
                  numModified++;
              } else {
                // Exclude from tiles and add pair to fully excluded list
                _excluded_i[numExcluded] = atomStart_i + z;
                int jpos = (z + t) & 15;
                _excluded_j[numExcluded] = atomStart_j + jpos;
                if (*((float*)&r2 + z) <= _cutoff2)
                  numExcluded++;
              }
            }
            // Next pair
            scalarPos <<= 1;
          }

          // Note: For exclusions, use force cutoff and not list cutoff
          r2mask &= ~excludedMask;

          // If building a pair list, store each pair within cutoff
          if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 &&
              doIpairList && r2mask) {
            for (int z = 0; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++) {
              if (r2mask & 1) {
                const int jPos = atomStart_j + ((z+t) & 15);
                _pairLists[_pairStart[z]+_numPairs[z]] = jPos;
                _numPairs[z]++;
              }
              r2mask >>= 1;
            }
            r2mask = 0;
          } else {
            // Mark tile list as active if neighbors within cutoff
            excl = _mm512_mask_or_epi32(excl, r2mask, excl,
                                        _mm512_set1_epi32(0x8000));
            if (r2mask) jTileActive = true;
            // Redo predication for pairs within force cutoff rather than
            // build cutoff
            r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2)) &
              mask_j & freeMask_ij & ~excludedMask;
          }
        } // if doList
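
        // Summary of the exclusion scan above: exclType 0 leaves the pair as
        // a normal tile interaction, exclType 2 (EXCHCK_MOD) routes it to
        // the modified 1-4 list, and a full exclusion routes it to the
        // excluded list, which is only needed for the slow-electrostatics
        // correction.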

        // ------------- Force, Energy, Virial Calculations
        if (r2mask) {
          // kqq = q_i * q_j
          const __m512 kqq = _mm512_mul_ps(q_i, q_j);
          __m512 force, forceSlow;

          // Call force kernel
          if (interpMode == 1)
            forceEnergyInterp1<doEnergy, doSlow>(r2, kqq, force, forceSlow,
              energyVdw, energyElec, energySlow, r2mask, _paramC1,
              _paramC3, _paramSwitchOn2, _cutoff2, _paramMinvCut3,
              _paramCutUnder3, slowTable, slowEnergyTable, eps4_i, eps4_j,
              sigma_i, sigma_j);
          else
            forceEnergyInterp2<doEnergy, doSlow, interpMode>(r2, kqq, type_i,
              type_j, force, forceSlow, energyVdw, energyElec, energySlow,
              r2mask, _paramScale, _paramC1, _paramC3, _paramSwitchOn2,
              _cutoff2, _paramMinvCut3, _paramCutUnder3, fastTable,
              energyTable, slowTable, slowEnergyTable, ljTable,
              _paramLjWidth);

          // force_i_. += d. * force
          force_i_x = _mm512_mask_mov_ps(force_i_x, r2mask,
            _mm512_fmadd_ps(dx, force, force_i_x));
          force_i_y = _mm512_mask_mov_ps(force_i_y, r2mask,
            _mm512_fmadd_ps(dy, force, force_i_y));
          force_i_z = _mm512_mask_mov_ps(force_i_z, r2mask,
            _mm512_fmadd_ps(dz, force, force_i_z));
          // force_j_. -= d. * force
          force_j_x = _mm512_mask_mov_ps(force_j_x, r2mask,
            _mm512_fnmadd_ps(dx, force, force_j_x));
          force_j_y = _mm512_mask_mov_ps(force_j_y, r2mask,
            _mm512_fnmadd_ps(dy, force, force_j_y));
          force_j_z = _mm512_mask_mov_ps(force_j_z, r2mask,
            _mm512_fnmadd_ps(dz, force, force_j_z));
          if (doSlow) {
            // forceSlow_i_. += d. * forceSlow
            forceSlow_i_x = _mm512_mask_mov_ps(forceSlow_i_x, r2mask,
              _mm512_fmadd_ps(dx, forceSlow, forceSlow_i_x));
            forceSlow_i_y = _mm512_mask_mov_ps(forceSlow_i_y, r2mask,
              _mm512_fmadd_ps(dy, forceSlow, forceSlow_i_y));
            forceSlow_i_z = _mm512_mask_mov_ps(forceSlow_i_z, r2mask,
              _mm512_fmadd_ps(dz, forceSlow, forceSlow_i_z));
            // forceSlow_j_. -= d. * forceSlow
            forceSlow_j_x = _mm512_mask_mov_ps(forceSlow_j_x, r2mask,
              _mm512_fnmadd_ps(dx, forceSlow, forceSlow_j_x));
            forceSlow_j_y = _mm512_mask_mov_ps(forceSlow_j_y, r2mask,
              _mm512_fnmadd_ps(dy, forceSlow, forceSlow_j_y));
            forceSlow_j_z = _mm512_mask_mov_ps(forceSlow_j_z, r2mask,
              _mm512_fnmadd_ps(dz, forceSlow, forceSlow_j_z));
          }
        }

        // ------------- Next set of pairs for tile interactions
        x_j = (__m512)_mm512_alignr_epi32((__m512i)x_j, (__m512i)x_j, 1);
        y_j = (__m512)_mm512_alignr_epi32((__m512i)y_j, (__m512i)y_j, 1);
        z_j = (__m512)_mm512_alignr_epi32((__m512i)z_j, (__m512i)z_j, 1);
        q_j = (__m512)_mm512_alignr_epi32((__m512i)q_j, (__m512i)q_j, 1);
        if (doList) {
          freeMask_j = (freeMask_j>>1) | (freeMask_j<<15);
          sizeMask_j = (sizeMask_j>>1) | (sizeMask_j<<15);
          mask_j >>= shiftVal;
          atomIndex_j = _mm512_alignr_epi32(atomIndex_j, atomIndex_j, 1);
        }
        if (interpMode == 1) {
          eps4_j = (__m512)_mm512_alignr_epi32((__m512i)eps4_j,
                                               (__m512i)eps4_j, 1);
          sigma_j = (__m512)_mm512_alignr_epi32((__m512i)sigma_j,
                                                (__m512i)sigma_j, 1);
        } else
          type_j = _mm512_alignr_epi32(type_j, type_j, 1);
        force_j_x = (__m512)_mm512_alignr_epi32((__m512i)force_j_x,
                                                (__m512i)force_j_x, 1);
        force_j_y = (__m512)_mm512_alignr_epi32((__m512i)force_j_y,
                                                (__m512i)force_j_y, 1);
        force_j_z = (__m512)_mm512_alignr_epi32((__m512i)force_j_z,
                                                (__m512i)force_j_z, 1);
        if (doSlow) {
          forceSlow_j_x = (__m512)_mm512_alignr_epi32((__m512i)forceSlow_j_x,
                                                      (__m512i)forceSlow_j_x,
                                                      1);
          forceSlow_j_y = (__m512)_mm512_alignr_epi32((__m512i)forceSlow_j_y,
                                                      (__m512i)forceSlow_j_y,
                                                      1);
          forceSlow_j_z = (__m512)_mm512_alignr_epi32((__m512i)forceSlow_j_z,
                                                      (__m512i)forceSlow_j_z,
                                                      1);
        }
      } // for t

      // ------------- Accumulate j-forces in memory
      const __m512i tp0x = _mm512_set_epi32(0,19,11,3,0,18,10,2,
                                            0,17,9,1,0,16,8,0);
      const __m512i tp1x = _mm512_set_epi32(0,23,15,7,0,22,14,6,
                                            0,21,13,5,0,20,12,4);
      const __m512i tp2x = _mm512_set_epi32(0,27,11,3,0,26,10,2,
                                            0,25,9,1,0,24,8,0);
      const __m512i tp3x = _mm512_set_epi32(0,31,15,7,0,30,14,6,
                                            0,29,13,5,0,28,12,4);
      {
        float * jptr = (float *)(forces_j + atomStart_j);
        const __m512 v0 = _mm512_load_ps(jptr);
        const __m512 v1 = _mm512_load_ps(jptr + 16);
        const __m512 v2 = _mm512_load_ps(jptr + 32);
        const __m512 v3 = _mm512_load_ps(jptr + 48);
        const __m512 w1 = _mm512_shuffle_f32x4(force_j_x, force_j_y,
                                               0b01000100);
        const __m512 w2 = _mm512_shuffle_f32x4(force_j_x, force_j_y,
                                               0b11101110);
        __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, force_j_z);
        __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, force_j_z);
        __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, force_j_z);
        __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, force_j_z);
        tp0 = _mm512_add_ps(v0, tp0);
        tp1 = _mm512_add_ps(v1, tp1);
        tp2 = _mm512_add_ps(v2, tp2);
        tp3 = _mm512_add_ps(v3, tp3);
        _mm512_store_ps(jptr, tp0);
        _mm512_store_ps(jptr + 16, tp1);
        _mm512_store_ps(jptr + 32, tp2);
        _mm512_store_ps(jptr + 48, tp3);
      }
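
      // Store-back layout (assumes the 4th AVXTilesForce component is
      // unused padding): each tpNx index vector re-interleaves four atoms'
      // x/y/z lanes into the AoS force array. Scalar view (illustrative
      // only): forces_j[atomStart_j + k].x += force_j_x[k];  // same for y, z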

      if (doSlow) {
        float * jptr = (float *)(forcesSlow_j + atomStart_j);
        const __m512 v0 = _mm512_load_ps(jptr);
        const __m512 v1 = _mm512_load_ps(jptr + 16);
        const __m512 v2 = _mm512_load_ps(jptr + 32);
        const __m512 v3 = _mm512_load_ps(jptr + 48);
        const __m512 w1 = _mm512_shuffle_f32x4(forceSlow_j_x, forceSlow_j_y,
                                               0b01000100);
        const __m512 w2 = _mm512_shuffle_f32x4(forceSlow_j_x, forceSlow_j_y,
                                               0b11101110);
        __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, forceSlow_j_z);
        __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, forceSlow_j_z);
        __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, forceSlow_j_z);
        __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, forceSlow_j_z);
        tp0 = _mm512_add_ps(v0, tp0);
        tp1 = _mm512_add_ps(v1, tp1);
        tp2 = _mm512_add_ps(v2, tp2);
        tp3 = _mm512_add_ps(v3, tp3);
        _mm512_store_ps(jptr, tp0);
        _mm512_store_ps(jptr + 16, tp1);
        _mm512_store_ps(jptr + 32, tp2);
        _mm512_store_ps(jptr + 48, tp3);
      }

      // ------------- Write exclusions to memory
      if (doList) {
        if (jTileActive) {
          int anyexcl = 65536;
          if (_mm512_cmpneq_epi32_mask(excl, _mm512_setzero_epi32()))
            anyexcl |= 1;
          jTiles.status[jtile] = anyexcl;
          // Store exclusions
          _mm512_store_epi32((void *)(jTiles.excl + (jtile << 4)),
                             excl);
          itileListLen += anyexcl;
        }
      } // if doList
      #ifdef TILE_LIST_STAT_DEBUG
      if (jtile_empty) num_jtiles_empty++;
      #endif
    } // for jtile

    // ------------------------------------------------------

    // ------------- Accumulate i-forces in memory
    const __m512i tp0x = _mm512_set_epi32(0,19,11,3,0,18,10,2,
                                          0,17,9,1,0,16,8,0);
    const __m512i tp1x = _mm512_set_epi32(0,23,15,7,0,22,14,6,
                                          0,21,13,5,0,20,12,4);
    const __m512i tp2x = _mm512_set_epi32(0,27,11,3,0,26,10,2,
                                          0,25,9,1,0,24,8,0);
    const __m512i tp3x = _mm512_set_epi32(0,31,15,7,0,30,14,6,
                                          0,29,13,5,0,28,12,4);
    {
      float * iptr = (float *)(forces_i + atomStart_i);
      const __m512 v0 = _mm512_load_ps(iptr);
      const __m512 v1 = _mm512_load_ps(iptr + 16);
      const __m512 v2 = _mm512_load_ps(iptr + 32);
      const __m512 v3 = _mm512_load_ps(iptr + 48);
      const __m512 w1 = _mm512_shuffle_f32x4(force_i_x, force_i_y,
                                             0b01000100);
      const __m512 w2 = _mm512_shuffle_f32x4(force_i_x, force_i_y,
                                             0b11101110);
      __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, force_i_z);
      __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, force_i_z);
      __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, force_i_z);
      __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, force_i_z);
      tp0 = _mm512_add_ps(v0, tp0);
      tp1 = _mm512_add_ps(v1, tp1);
      tp2 = _mm512_add_ps(v2, tp2);
      tp3 = _mm512_add_ps(v3, tp3);
      _mm512_store_ps(iptr, tp0);
      _mm512_store_ps(iptr + 16, tp1);
      _mm512_store_ps(iptr + 32, tp2);
      _mm512_store_ps(iptr + 48, tp3);
    }

    if (doSlow) {
      float * iptr = (float *)(forcesSlow_i + atomStart_i);
      const __m512 v0 = _mm512_load_ps(iptr);
      const __m512 v1 = _mm512_load_ps(iptr + 16);
      const __m512 v2 = _mm512_load_ps(iptr + 32);
      const __m512 v3 = _mm512_load_ps(iptr + 48);
      const __m512 w1 = _mm512_shuffle_f32x4(forceSlow_i_x, forceSlow_i_y,
                                             0b01000100);
      const __m512 w2 = _mm512_shuffle_f32x4(forceSlow_i_x, forceSlow_i_y,
                                             0b11101110);
      __m512 tp0 = _mm512_permutex2var_ps(w1, tp0x, forceSlow_i_z);
      __m512 tp1 = _mm512_permutex2var_ps(w1, tp1x, forceSlow_i_z);
      __m512 tp2 = _mm512_permutex2var_ps(w2, tp2x, forceSlow_i_z);
      __m512 tp3 = _mm512_permutex2var_ps(w2, tp3x, forceSlow_i_z);
      tp0 = _mm512_add_ps(v0, tp0);
      tp1 = _mm512_add_ps(v1, tp1);
      tp2 = _mm512_add_ps(v2, tp2);
      tp3 = _mm512_add_ps(v3, tp3);
      _mm512_store_ps(iptr, tp0);
      _mm512_store_ps(iptr + 16, tp1);
      _mm512_store_ps(iptr + 32, tp2);
      _mm512_store_ps(iptr + 48, tp3);
    }

    if (doList)
      listDepth[itileList] = itileListLen;

    // Update net forces for virial
    if (doVirial) {
      // fNet_. += force_i_.
      fNet_x = _mm512_add_ps(fNet_x, force_i_x);
      fNet_y = _mm512_add_ps(fNet_y, force_i_y);
      fNet_z = _mm512_add_ps(fNet_z, force_i_z);
      if (doSlow) {
        // fNetSlow_. += forceSlow_i_.
        fNetSlow_x = _mm512_add_ps(fNetSlow_x, forceSlow_i_x);
        fNetSlow_y = _mm512_add_ps(fNetSlow_y, forceSlow_i_y);
        fNetSlow_z = _mm512_add_ps(fNetSlow_z, forceSlow_i_z);
      }
    } // if (doVirial)

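    // Pair-list fallback: i tiles with at most NAMD_AVXTILES_PAIR_THRESHOLD
    // atoms had their neighbors routed into per-i-atom pair lists during the
    // build; those sparse cases run through the gather-based nbAVX512Pairs()
    // kernel instead of full 16x16 tile sweeps.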
    if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && doList) {
      _numPairLists = 0;
      for (int z = 0; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++)
        if (_numPairs[z]) _numPairLists++;
    }
  } // for itileList

  if (doList) {
    _numModified = numModified;
    // WMB: Only needed for slow elec
    _numExcluded = numExcluded;
    _exclusionChecksum = numModified + numExcluded;
  }

  #ifdef TILE_LIST_STAT_DEBUG
  if (!doList)
    printf("TILE_DBG: JTILES %d EMPTY %.2f ROTATES %d EMPTY %.2f EXCL %.2f\n",
           num_jtiles, double(num_jtiles_empty)/num_jtiles, num_rotates,
           double(num_rotates_empty)/num_rotates,
           double(num_rotates_excl_empty)/num_rotates);
  #endif
} // nbAVX512Tiles()

//---------------------------------------------------------------------------
// Calculations for unmodified/unexcluded from pair lists
//---------------------------------------------------------------------------

template <bool doEnergy, bool doVirial, bool doSlow, int interpMode>
void AVXTileLists::nbAVX512Pairs(__m512 &energyVdw, __m512 &energyElec,
                                 __m512 &energySlow, __m512 &fNet_x,
                                 __m512 &fNet_y, __m512 &fNet_z,
                                 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
                                 __m512 &fNetSlow_z) {

  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forces_i = tiles_p0->forces;
  AVXTiles::AVXTilesForce* __restrict__ forces_j = tiles_p1->forces;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j;
  if (doSlow) {
    forcesSlow_i = tiles_p0->forcesSlow;
    forcesSlow_j = tiles_p1->forcesSlow;
  }

  const float* __restrict__ fastTable;
  const float* __restrict__ energyTable;
  const float * __restrict__ ljTable;
  const float * __restrict__ eps4sigma;
  // Interpolation for long-range splitting function
  if (interpMode == 3) {
    fastTable = _paramFastTable;
    energyTable = _paramFastEnergyTable;
  }
  // LJ mixing not performed within kernel
  if (interpMode > 1) ljTable = _paramLjTable;
  // LJ mixing performed within kernel
  if (interpMode == 1) eps4sigma = _paramEps4Sigma;

  const float* __restrict__ slowTable = _paramSlowTable;
  const float* __restrict__ slowEnergyTable = _paramSlowEnergyTable;

  for (int z = 0; z < NAMD_AVXTILES_PAIR_THRESHOLD; z++) {
    if (!_numPairs[z]) continue;

    // Zero i-tile force vectors
    __m512 force_i_x = _mm512_setzero_ps();
    __m512 force_i_y = _mm512_setzero_ps();
    __m512 force_i_z = _mm512_setzero_ps();
    __m512 forceSlow_i_x, forceSlow_i_y, forceSlow_i_z;
    if (doSlow) {
      forceSlow_i_x = _mm512_setzero_ps();
      forceSlow_i_y = _mm512_setzero_ps();
      forceSlow_i_z = _mm512_setzero_ps();
    }

    const int pairI = _pair_i[z];
    const int nPairs = _numPairs[z];
    // Load i-atom data
    const __m512 x_i = _mm512_set1_ps(xyzq_i[pairI].x + _shx);
    const __m512 y_i = _mm512_set1_ps(xyzq_i[pairI].y + _shy);
    const __m512 z_i = _mm512_set1_ps(xyzq_i[pairI].z + _shz);
    const __m512 q_i = _mm512_set1_ps(xyzq_i[pairI].q);
    const int scalarType_i = tiles_p0->vdwTypes[pairI];
    const __m512i type_i = _mm512_set1_epi32(scalarType_i);
    __m512 eps4_i, sigma_i;
    if (interpMode == 1) {
      eps4_i = _mm512_set1_ps(eps4sigma[scalarType_i*2]);
      sigma_i = _mm512_set1_ps(eps4sigma[scalarType_i*2+1]);
    }

    __mmask16 loopMask = 0xFFFF;
    int listPos = _pairStart[z];
    for (int mv = 0; mv < nPairs; mv += 16) {
      // Remainder predication
      if (nPairs - mv < 16)
        loopMask >>= (16 - (nPairs - mv));
      // Load j indices from pair list
      __m512i j = _mm512_load_epi32(_pairLists + listPos);
      listPos += 16;

      // Load j atom data
      const __m512i jt2 = _mm512_slli_epi32(j, 1);
      const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask, jt2, (float *)xyzq_j, _MM_SCALE_8);
      const __m512 t8 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask, jt2, (float *)(xyzq_j) + 2,
        _MM_SCALE_8);
      const __m512i jt2_2 = _mm512_shuffle_i32x4(jt2, jt2, 238);
      const __mmask16 loopMask2 = loopMask >> 8;
      const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask2, jt2_2, (float *)xyzq_j, _MM_SCALE_8);
      const __m512 t9 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), loopMask2, jt2_2, (float *)(xyzq_j) + 2,
        _MM_SCALE_8);
      const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                          15,13,11,9,7,5,3,1);
      const __m512 y_j = _mm512_permutex2var_ps(t0, t4, t1);
      const __m512 q_j = _mm512_permutex2var_ps(t8, t4, t9);
      const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                          14,12,10,8,6,4,2,0);
      const __m512 x_j = _mm512_permutex2var_ps(t0, t6, t1);
      const __m512 z_j = _mm512_permutex2var_ps(t8, t6, t9);
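
      // Gather trick (summary comment): with index 2*j and 8-byte scale,
      // each 64-bit pd gather fetches an {x, y} pair (or {z, q} with the +2
      // float base offset) per j atom, halving the gather count; the
      // permutes then unpack the pairs into separate x/y/z/q vectors.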

      // kqq = q_i * q_j
      const __m512 kqq = _mm512_mul_ps(q_i, q_j);

      // dx = x_i - x_j;
      const __m512 dx = _mm512_sub_ps(x_i, x_j);
      // dy = y_i - y_j;
      const __m512 dy = _mm512_sub_ps(y_i, y_j);
      // dz = z_i - z_j;
      const __m512 dz = _mm512_sub_ps(z_i, z_j);
      // r2 = dx*dx + dy*dy + dz*dz;
      const __m512 r2(_mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                      _mm512_mul_ps(dz, dz))));
      // Atoms within cutoff
      const __mmask16 r2mask = _mm512_cmple_ps_mask(r2,
        _mm512_set1_ps(_cutoff2)) & loopMask;

      // Load LJ types
      const __m512i type_j = _mm512_mask_i32gather_epi32(
        _mm512_undefined_epi32(), r2mask, j, tiles_p1->vdwTypes, _MM_SCALE_4);

      // Load eps and sigma
      __m512 eps4_j, sigma_j;
      if (interpMode == 1) {
        const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
          _mm512_undefined_pd(), r2mask, type_j, eps4sigma, _MM_SCALE_8);
        const __m512i type_j2 = _mm512_shuffle_i32x4(type_j, type_j, 238);
        const __mmask16 r2mask2 = r2mask >> 8;
        const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
          _mm512_undefined_pd(), r2mask2, type_j2, eps4sigma, _MM_SCALE_8);
        sigma_j = _mm512_permutex2var_ps(t0, t4, t1);
        eps4_j = _mm512_permutex2var_ps(t0, t6, t1);
      }

      // Force, Energy, Virial calculation
      __m512 force, forceSlow;
      if (interpMode == 1)
        forceEnergyInterp1<doEnergy, doSlow>(r2, kqq, force, forceSlow,
          energyVdw, energyElec, energySlow, r2mask, _paramC1, _paramC3,
          _paramSwitchOn2, _cutoff2, _paramMinvCut3, _paramCutUnder3,
          slowTable, slowEnergyTable, eps4_i, eps4_j, sigma_i, sigma_j);
      else
        forceEnergyInterp2<doEnergy, doSlow, interpMode>(r2, kqq, type_i,
          type_j, force, forceSlow, energyVdw, energyElec, energySlow,
          r2mask, _paramScale, _paramC1, _paramC3, _paramSwitchOn2,
          _cutoff2, _paramMinvCut3, _paramCutUnder3, fastTable, energyTable,
          slowTable, slowEnergyTable, ljTable, _paramLjWidth);

      // force_. = d. * force
      const __m512 force_x = _mm512_mul_ps(dx, force);
      const __m512 force_y = _mm512_mul_ps(dy, force);
      const __m512 force_z = _mm512_mul_ps(dz, force);
      // Accumulate j forces in memory
      const __m512i j4 = _mm512_slli_epi32(j, 2);
      __m512 ft0 = (__m512)_mm512_mask_i32logather_pd(_mm512_undefined_pd(),
        r2mask, j4, (float*)forces_j, _MM_SCALE_4);
      const __m512i j4_2 = _mm512_shuffle_i32x4(j4, j4, 238);
      const __mmask16 r2mask2 = r2mask >> 8;
      __m512 ft1 = (__m512)_mm512_mask_i32logather_pd(_mm512_undefined_pd(),
        r2mask2, j4_2, (float *)forces_j, _MM_SCALE_4);
      const __m512i ft4 = _mm512_set_epi32(23,7,22,6,21,5,20,4,
                                           19,3,18,2,17,1,16,0);
      const __m512 ft5 = _mm512_permutex2var_ps(force_x, ft4, force_y);
      ft0 = _mm512_add_ps(ft0, ft5);
      _mm512_mask_i32loscatter_pd((void *)forces_j, r2mask, j4, (__m512d)ft0,
                                  _MM_SCALE_4);
      const __m512i ft2 = _mm512_set_epi32(31,15,30,14,29,13,28,12,
                                           27,11,26,10,25,9,24,8);
      const __m512 ft3 = _mm512_permutex2var_ps(force_x, ft2, force_y);
      ft1 = _mm512_add_ps(ft1, ft3);
      _mm512_mask_i32loscatter_pd((void *)forces_j, r2mask2, j4_2,
                                  (__m512d)ft1, _MM_SCALE_4);
      __m512 mem3 = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), r2mask,
                                             j4, (float *)(forces_j)+2,
                                             _MM_SCALE_4);
      mem3 = _mm512_add_ps(mem3, force_z);
      _mm512_mask_i32scatter_ps((float *)(forces_j)+2, r2mask, j4, mem3,
                                _MM_SCALE_4);
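
      // The gather-add-scatter accumulation above has no lane collisions
      // because a single i atom's pair list cannot contain the same j index
      // twice within one 16-wide batch.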

      // force_i_. -= force_.
      force_i_x = _mm512_mask_sub_ps(force_i_x, r2mask, force_i_x, force_x);
      force_i_y = _mm512_mask_sub_ps(force_i_y, r2mask, force_i_y, force_y);
      force_i_z = _mm512_mask_sub_ps(force_i_z, r2mask, force_i_z, force_z);

      __m512 forceSlow_x, forceSlow_y, forceSlow_z;
      if (doSlow) {
        // forceSlow_. = d. * forceSlow
        forceSlow_x = _mm512_mul_ps(dx, forceSlow);
        forceSlow_y = _mm512_mul_ps(dy, forceSlow);
        forceSlow_z = _mm512_mul_ps(dz, forceSlow);
        // Accumulate j slow forces in memory
        // acc3(r2mask, (float*)forcesSlow_j, j, forceSlow_x,
        //      forceSlow_y, forceSlow_z);
        __m512 ft10 = (__m512)_mm512_mask_i32logather_pd(
          _mm512_undefined_pd(), r2mask, j4, (float*)forcesSlow_j,
          _MM_SCALE_4);
        __m512 ft11 = (__m512)_mm512_mask_i32logather_pd(
          _mm512_undefined_pd(), r2mask2, j4_2, (float *)forcesSlow_j,
          _MM_SCALE_4);
        const __m512 ft15 = _mm512_permutex2var_ps(forceSlow_x, ft4,
                                                   forceSlow_y);
        ft10 = _mm512_add_ps(ft10, ft15);
        _mm512_mask_i32loscatter_pd((void *)forcesSlow_j, r2mask, j4,
                                    (__m512d)ft10, _MM_SCALE_4);
        const __m512 ft13 = _mm512_permutex2var_ps(forceSlow_x, ft2,
                                                   forceSlow_y);
        ft11 = _mm512_add_ps(ft11, ft13);
        _mm512_mask_i32loscatter_pd((void *)forcesSlow_j, r2mask2, j4_2,
                                    (__m512d)ft11, _MM_SCALE_4);
        __m512 mem13 = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), r2mask,
                                                j4, (float *)(forcesSlow_j)+2,
                                                _MM_SCALE_4);
        mem13 = _mm512_add_ps(mem13, forceSlow_z);
        _mm512_mask_i32scatter_ps((float *)(forcesSlow_j)+2, r2mask, j4,
                                  mem13, _MM_SCALE_4);

        // forceSlow_i_. -= forceSlow_.
        forceSlow_i_x = _mm512_mask_sub_ps(forceSlow_i_x, r2mask,
                                           forceSlow_i_x, forceSlow_x);
        forceSlow_i_y = _mm512_mask_sub_ps(forceSlow_i_y, r2mask,
                                           forceSlow_i_y, forceSlow_y);
        forceSlow_i_z = _mm512_mask_sub_ps(forceSlow_i_z, r2mask,
                                           forceSlow_i_z, forceSlow_z);
      }
    } // for mv

    // Reduction on i vectors, accumulate in memory, update fNet_. for virial
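    // The virial accumulators are updated through lane 0 only; that is
    // sufficient because fNet_x/y/z are horizontally reduced to scalars at
    // the end of doAll().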
    float fI = _mm512_reduce_add_ps(force_i_x);
    forces_i[pairI].x += fI;
    if (doVirial) *((float*)&fNet_x) += fI;
    fI = _mm512_reduce_add_ps(force_i_y);
    forces_i[pairI].y += fI;
    if (doVirial) *((float*)&fNet_y) += fI;
    fI = _mm512_reduce_add_ps(force_i_z);
    forces_i[pairI].z += fI;
    if (doVirial) *((float*)&fNet_z) += fI;
    if (doSlow) {
      fI = _mm512_reduce_add_ps(forceSlow_i_x);
      forcesSlow_i[pairI].x += fI;
      if (doVirial) *((float*)&fNetSlow_x) += fI;
      fI = _mm512_reduce_add_ps(forceSlow_i_y);
      forcesSlow_i[pairI].y += fI;
      if (doVirial) *((float*)&fNetSlow_y) += fI;
      fI = _mm512_reduce_add_ps(forceSlow_i_z);
      forcesSlow_i[pairI].z += fI;
      if (doVirial) *((float*)&fNetSlow_z) += fI;
    }
  } // for z
} // nbAVX512Pairs()

//---------------------------------------------------------------------------
// Calculations for modified pairs
//---------------------------------------------------------------------------

template <bool doEnergy, bool doVirial, bool doSlow, int interpMode>
void AVXTileLists::nbAVX512Modified(__m512 &energyVdw, __m512 &energyElec,
                                    __m512 &energySlow, __m512 &fNet_x,
                                    __m512 &fNet_y, __m512 &fNet_z,
                                    __m512 &fNetSlow_x, __m512 &fNetSlow_y,
                                    __m512 &fNetSlow_z) {

  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forces_i = tiles_p0->forces;
  AVXTiles::AVXTilesForce* __restrict__ forces_j = tiles_p1->forces;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j;
  if (doSlow) {
    forcesSlow_i = tiles_p0->forcesSlow;
    forcesSlow_j = tiles_p1->forcesSlow;
  }

  const float* __restrict__ fastTable;
  const float* __restrict__ energyTable;
  // Interpolation for long-range splitting function
  if (interpMode == 3) {
    fastTable = _paramFastTable;
    energyTable = _paramFastEnergyTable;
  }

  const float * __restrict__ mod_table = _paramModifiedTable;
  const float * __restrict__ mode_table = _paramModifiedEnergyTable;

  const float * __restrict__ ljTable14;
  const float * __restrict__ eps4sigma14;
  if (interpMode > 1)
    ljTable14 = _paramLjTable + 2;
  else
    eps4sigma14 = _paramEps4Sigma14;

  __mmask16 loopMask = 0xFFFF;
  #pragma novector
  for (int mv = 0; mv < _numModified; mv += 16) {
    // Remainder predication
    if (_numModified - mv < 16)
      loopMask >>= (16 - (_numModified - mv));
    // Load i and j indices for pairs on modified list
    const __m512i i = _mm512_load_epi32(_modified_i + mv);
    const __m512i j = _mm512_load_epi32(_modified_j + mv);

    // Load i atom data and shift coordinates
    const __m512i it2 = _mm512_slli_epi32(i, 1);
    const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t8 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i it2_2 = _mm512_shuffle_i32x4(it2, it2, 238);
    const __mmask16 loopMask2 = loopMask >> 8;
    const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t9 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                        15,13,11,9,7,5,3,1);
    const __m512 y_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t4, t1),
                                     _mm512_set1_ps(_shy));
    const __m512 q_i = _mm512_permutex2var_ps(t8, t4, t9);
    const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                        14,12,10,8,6,4,2,0);
    const __m512 x_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t6, t1),
                                     _mm512_set1_ps(_shx));
    const __m512 z_i = _mm512_add_ps(_mm512_permutex2var_ps(t8, t6, t9),
                                     _mm512_set1_ps(_shz));

    // Load j atom data
    const __m512i jt2 = _mm512_slli_epi32(j, 1);
    const __m512 t10 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t18 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512i jt2_2 = _mm512_shuffle_i32x4(jt2, jt2, 238);
    const __m512 t11 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t19 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512 y_j = _mm512_permutex2var_ps(t10, t4, t11);
    const __m512 q_j = _mm512_permutex2var_ps(t18, t4, t19);
    const __m512 x_j = _mm512_permutex2var_ps(t10, t6, t11);
    const __m512 z_j = _mm512_permutex2var_ps(t18, t6, t19);

    // kqq = q_i * q_j * _paramScale14
    const __m512 kqq = _mm512_mul_ps(q_i, _mm512_mul_ps(q_j,
                                     _mm512_set1_ps(_paramScale14)));
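
    // Modified (1-4) exclusions scale electrostatics by _paramScale14 and
    // use the dedicated 1-4 LJ parameters (eps4sigma14 / ljTable14) in place
    // of the standard tables.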

    // dx = x_i - x_j;
    const __m512 dx = _mm512_sub_ps(x_i, x_j);
    // dy = y_i - y_j;
    const __m512 dy = _mm512_sub_ps(y_i, y_j);
    // dz = z_i - z_j;
    const __m512 dz = _mm512_sub_ps(z_i, z_j);
    // r2 = dx*dx + dy*dy + dz*dz;
    const __m512 r2(_mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                    _mm512_mul_ps(dz, dz))));
    // Atoms within cutoff
    __mmask16 r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2)) &
      loopMask;

    // Load LJ types
    const __m512i type_i = _mm512_mask_i32gather_epi32(
      _mm512_undefined_epi32(), r2mask, i, tiles_p0->vdwTypes, _MM_SCALE_4);
    const __m512i type_j = _mm512_mask_i32gather_epi32(
      _mm512_undefined_epi32(), r2mask, j, tiles_p1->vdwTypes, _MM_SCALE_4);

    // Load eps and sigma
    __m512 eps4_i14, sigma_i14, eps4_j14, sigma_j14;
    if (interpMode == 1) {
      const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask, type_i, eps4sigma14, _MM_SCALE_8);
      const __m512i type_i2 = _mm512_shuffle_i32x4(type_i, type_i, 238);
      const __mmask16 r2mask2 = r2mask >> 8;
      const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask2, type_i2, eps4sigma14, _MM_SCALE_8);
      const __m512 t10 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask, type_j, eps4sigma14, _MM_SCALE_8);
      const __m512i type_j2 = _mm512_shuffle_i32x4(type_j, type_j, 238);
      const __m512 t11 = (__m512)_mm512_mask_i32logather_pd(
        _mm512_undefined_pd(), r2mask2, type_j2, eps4sigma14, _MM_SCALE_8);

      sigma_i14 = _mm512_permutex2var_ps(t0, t4, t1);
      sigma_j14 = _mm512_permutex2var_ps(t10, t4, t11);
      eps4_i14 = _mm512_permutex2var_ps(t0, t6, t1);
      eps4_j14 = _mm512_permutex2var_ps(t10, t6, t11);
    }

    // Force, Energy, Virial calculation
    __m512 force, forceSlow;
    if (interpMode == 1)
      forceEnergyInterp1<doEnergy, doSlow>(r2, kqq, force, forceSlow,
        energyVdw, energyElec, energySlow, r2mask, _paramC1, _paramC3,
        _paramSwitchOn2, _cutoff2, _paramMinvCut3, _paramCutUnder3,
        mod_table, mode_table, eps4_i14, eps4_j14, sigma_i14, sigma_j14);
    else
      forceEnergyInterp2<doEnergy, doSlow, interpMode>(r2, kqq, type_i, type_j,
        force, forceSlow, energyVdw, energyElec, energySlow, r2mask,
        _paramScale, _paramC1, _paramC3, _paramSwitchOn2, _cutoff2,
        _paramMinvCut3, _paramCutUnder3, fastTable, energyTable,
        mod_table, mode_table, ljTable14, _paramLjWidth);

    // force_i_. = d. * force
    const __m512 force_i_x = _mm512_mul_ps(dx, force);
    const __m512 force_i_y = _mm512_mul_ps(dy, force);
    const __m512 force_i_z = _mm512_mul_ps(dz, force);
    __m512 forceSlow_i_x, forceSlow_i_y, forceSlow_i_z;
    if (doSlow) {
      // forceSlow_i_. = d. * forceSlow
      forceSlow_i_x = _mm512_mul_ps(dx, forceSlow);
      forceSlow_i_y = _mm512_mul_ps(dy, forceSlow);
      forceSlow_i_z = _mm512_mul_ps(dz, forceSlow);
    }

    if (doVirial) {
      // fNet_. -= force_i_.
      fNet_x = _mm512_mask_sub_ps(fNet_x, r2mask, fNet_x, force_i_x);
      fNet_y = _mm512_mask_sub_ps(fNet_y, r2mask, fNet_y, force_i_y);
      fNet_z = _mm512_mask_sub_ps(fNet_z, r2mask, fNet_z, force_i_z);
      if (doSlow) {
        // fNetSlow_. -= forceSlow_i_.
        fNetSlow_x = _mm512_mask_sub_ps(fNetSlow_x, r2mask,
                                        fNetSlow_x, forceSlow_i_x);
        fNetSlow_y = _mm512_mask_sub_ps(fNetSlow_y, r2mask,
                                        fNetSlow_y, forceSlow_i_y);
        fNetSlow_z = _mm512_mask_sub_ps(fNetSlow_z, r2mask,
                                        fNetSlow_z, forceSlow_i_z);
      }
    }

    #pragma novector
    for (int z = 0; z < 16; z++) {
      // Skip if outside cutoff or remainder
      if (!(r2mask & 1)) {
        r2mask >>= 1;
        continue;
      }
      r2mask >>= 1;

      // WMB: Might be better to check if next i is same and update in mem once

      // Accumulate i and j forces in memory
      const int i_z = *((int*)&i + z);
      const int j_z = *((int*)&j + z);
      forces_i[i_z].x -= *((float*)&force_i_x + z);
      forces_i[i_z].y -= *((float*)&force_i_y + z);
      forces_i[i_z].z -= *((float*)&force_i_z + z);
      forces_j[j_z].x += *((float*)&force_i_x + z);
      forces_j[j_z].y += *((float*)&force_i_y + z);
      forces_j[j_z].z += *((float*)&force_i_z + z);
      if (doSlow) {
        forcesSlow_i[i_z].x -= *((float*)&forceSlow_i_x + z);
        forcesSlow_i[i_z].y -= *((float*)&forceSlow_i_y + z);
        forcesSlow_i[i_z].z -= *((float*)&forceSlow_i_z + z);
        forcesSlow_j[j_z].x += *((float*)&forceSlow_i_x + z);
        forcesSlow_j[j_z].y += *((float*)&forceSlow_i_y + z);
        forcesSlow_j[j_z].z += *((float*)&forceSlow_i_z + z);
      }
    }
  }
} // nbAVX512Modified()

//---------------------------------------------------------------------------
// Calculations for excluded pairs
//---------------------------------------------------------------------------

template <bool doEnergy, bool doVirial>
void AVXTileLists::nbAVX512Excluded(__m512 &energySlow, __m512 &fNetSlow_x,
                                    __m512 &fNetSlow_y, __m512 &fNetSlow_z) {

  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_i = tiles_p0->atoms;
  const AVXTiles::AVXTilesAtom* __restrict__ xyzq_j = tiles_p1->atoms;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_i = tiles_p0->forcesSlow;
  AVXTiles::AVXTilesForce* __restrict__ forcesSlow_j = tiles_p1->forcesSlow;

  const float * __restrict__ exclTable = _paramExcludedTable;
  const float * __restrict__ exclEtable = _paramExcludedEnergyTable;

  __mmask16 loopMask = 0xFFFF;
  for (int mv = 0; mv < _numExcluded; mv += 16) {
    // Remainder predication
    if (_numExcluded - mv < 16)
      loopMask >>= (16 - (_numExcluded - mv));
    // Load i and j indices for pairs on excluded list
    const __m512i i = _mm512_load_epi32(_excluded_i + mv);
    const __m512i j = _mm512_load_epi32(_excluded_j + mv);

    // Load i atom data and shift coordinates
    const __m512i it2 = _mm512_slli_epi32(i, 1);
    const __m512 t0 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t8 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, it2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i it2_2 = _mm512_shuffle_i32x4(it2, it2, 238);
    const __mmask16 loopMask2 = loopMask >> 8;
    const __m512 t1 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)xyzq_i, _MM_SCALE_8);
    const __m512 t9 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, it2_2, (float *)(xyzq_i) + 2,
      _MM_SCALE_8);
    const __m512i t4 = _mm512_set_epi32(31,29,27,25,23,21,19,17,
                                        15,13,11,9,7,5,3,1);
    const __m512 y_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t4, t1),
                                     _mm512_set1_ps(_shy));
    const __m512 q_i = _mm512_permutex2var_ps(t8, t4, t9);
    const __m512i t6 = _mm512_set_epi32(30,28,26,24,22,20,18,16,
                                        14,12,10,8,6,4,2,0);
    const __m512 x_i = _mm512_add_ps(_mm512_permutex2var_ps(t0, t6, t1),
                                     _mm512_set1_ps(_shx));
    const __m512 z_i = _mm512_add_ps(_mm512_permutex2var_ps(t8, t6, t9),
                                     _mm512_set1_ps(_shz));

    // Load j atom data
    const __m512i jt2 = _mm512_slli_epi32(j, 1);
    const __m512 t10 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t18 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask, jt2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512i jt2_2 = _mm512_shuffle_i32x4(jt2, jt2, 238);
    const __m512 t11 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)xyzq_j, _MM_SCALE_8);
    const __m512 t19 = (__m512)_mm512_mask_i32logather_pd(
      _mm512_undefined_pd(), loopMask2, jt2_2, (float *)(xyzq_j) + 2,
      _MM_SCALE_8);
    const __m512 y_j = _mm512_permutex2var_ps(t10, t4, t11);
    const __m512 q_j = _mm512_permutex2var_ps(t18, t4, t19);
    const __m512 x_j = _mm512_permutex2var_ps(t10, t6, t11);
    const __m512 z_j = _mm512_permutex2var_ps(t18, t6, t19);

    // kqq = q_i * q_j
    const __m512 kqq = _mm512_mul_ps(q_i, q_j);

    // dx = x_i - x_j;
    const __m512 dx = _mm512_sub_ps(x_i, x_j);
    // dy = y_i - y_j;
    const __m512 dy = _mm512_sub_ps(y_i, y_j);
    // dz = z_i - z_j;
    const __m512 dz = _mm512_sub_ps(z_i, z_j);
    // r2 = dx*dx + dy*dy + dz*dz;
    const __m512 r2(_mm512_fmadd_ps(dx,dx,_mm512_fmadd_ps(dy,dy,
                    _mm512_mul_ps(dz, dz))));
    // Atoms within cutoff
    __mmask16 r2mask = _mm512_cmple_ps_mask(r2, _mm512_set1_ps(_cutoff2)) &
      loopMask;

    // Force, Energy, Virial calculation
    const __m512 r_1 = _mm512_invsqrt_ps(r2);
    __m512 forceSlow, tableDiff, rTableDiff;
    __m512i table_int;
    getOmpSimdTableI(r_1, table_int, tableDiff, rTableDiff);
    forceEnergySlow512<doEnergy>(r2mask, kqq, exclTable, exclEtable,
                                 table_int, tableDiff, rTableDiff, forceSlow,
                                 energySlow);
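
    // This pass only corrects slow electrostatics for fully excluded pairs:
    // the excluded-pair tables back out the long-range contribution that the
    // reciprocal-space sum added for these pairs, so no fast force is
    // computed here.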

    // forceSlow_i_. = d. * forceSlow
    const __m512 forceSlow_i_x = _mm512_mul_ps(dx, forceSlow);
    const __m512 forceSlow_i_y = _mm512_mul_ps(dy, forceSlow);
    const __m512 forceSlow_i_z = _mm512_mul_ps(dz, forceSlow);
    // fNetSlow_. -= forceSlow_i_.
    fNetSlow_x = _mm512_mask_sub_ps(fNetSlow_x, r2mask,
                                    fNetSlow_x, forceSlow_i_x);
    fNetSlow_y = _mm512_mask_sub_ps(fNetSlow_y, r2mask,
                                    fNetSlow_y, forceSlow_i_y);
    fNetSlow_z = _mm512_mask_sub_ps(fNetSlow_z, r2mask,
                                    fNetSlow_z, forceSlow_i_z);

    for (int z = 0; z < 16; z++) {
      // Skip if outside cutoff or remainder
      if (!(r2mask & 1)) {
        r2mask >>= 1;
        continue;
      }
      r2mask >>= 1;

      // WMB: Might be better to check if next i is same and update in mem once
      const int i_z = *((int*)&i + z);
      const int j_z = *((int*)&j + z);
      forcesSlow_i[i_z].x -= *((float*)&forceSlow_i_x + z);
      forcesSlow_i[i_z].y -= *((float*)&forceSlow_i_y + z);
      forcesSlow_i[i_z].z -= *((float*)&forceSlow_i_z + z);
      forcesSlow_j[j_z].x += *((float*)&forceSlow_i_x + z);
      forcesSlow_j[j_z].y += *((float*)&forceSlow_i_y + z);
      forcesSlow_j[j_z].z += *((float*)&forceSlow_i_z + z);
1409  } // for z
1410  }
1411 }
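
For reference, the following scalar sketch shows what one active lane of the exclusion kernel above computes once the gathers and permutes have assembled x/y/z/q for an i/j pair. Atom and slowSplit() are hypothetical stand-ins (slowSplit() replaces the exclTable/exclEtable interpolation driven by getOmpSimdTableI), so this is an illustration of the arithmetic, not the kernel's actual code path.

#include <cmath>

// Hypothetical stand-ins: Atom mirrors one AVXTilesAtom and slowSplit()
// replaces the table interpolation; illustrative only.
struct Atom { float x, y, z, q; };
static float slowSplit(float r2) { return 1.0f / (r2 * std::sqrt(r2)); }

// Scalar equivalent of one active vector lane above.
void excludedPairScalar(const Atom &ai, const Atom &aj,
                        float shx, float shy, float shz, float cutoff2,
                        float fi[3], float fj[3], float fNetSlow[3]) {
  const float dx = (ai.x + shx) - aj.x;  // i coordinates carry the
  const float dy = (ai.y + shy) - aj.y;  // patch-pair shift _shx/_shy/_shz
  const float dz = (ai.z + shz) - aj.z;
  const float r2 = dx*dx + dy*dy + dz*dz;
  if (r2 > cutoff2) return;              // lane masked off by r2mask
  const float forceSlow = ai.q * aj.q * slowSplit(r2);  // kqq * split(r2)
  const float fx = dx*forceSlow, fy = dy*forceSlow, fz = dz*forceSlow;
  fi[0] -= fx; fi[1] -= fy; fi[2] -= fz;  // equal and opposite updates,
  fj[0] += fx; fj[1] += fy; fj[2] += fz;  // as in the scatter loop above
  fNetSlow[0] -= fx; fNetSlow[1] -= fy; fNetSlow[2] -= fz;  // virial term
}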
1412 
1413 template <bool doEnergy, bool doVirial, bool doSlow, bool doList,
1414  int interpMode>
1415 void AVXTileLists::doAll() {
1416 
1417  if (doList) build();
1418 
1419  __m512 energyVdw, energyElec, energySlow;
1420  __m512 fNet_x, fNet_y, fNet_z;
1421  __m512 fNetSlow_x, fNetSlow_y, fNetSlow_z;
1422 
1423  // Zero energy and virial vectors used in all routines
1424  if (doEnergy) {
1425  energyVdw = _mm512_setzero_ps();
1426  energyElec = _mm512_setzero_ps();
1427  if (doSlow) energySlow = _mm512_setzero_ps();
1428  }
1429  if (doVirial) {
1430  fNet_x = _mm512_setzero_ps();
1431  fNet_y = _mm512_setzero_ps();
1432  fNet_z = _mm512_setzero_ps();
1433  if (doSlow) {
1434  fNetSlow_x = _mm512_setzero_ps();
1435  fNetSlow_y = _mm512_setzero_ps();
1436  fNetSlow_z = _mm512_setzero_ps();
1437  }
1438  }
1439 
1440  // Calculations from tile lists
1441  nbAVX512Tiles<doEnergy, doVirial, doSlow, doList,
1442  interpMode>(energyVdw, energyElec, energySlow, fNet_x, fNet_y,
1443  fNet_z, fNetSlow_x, fNetSlow_y, fNetSlow_z);
1444 
1445  if (doList) delEmptyLists();
1446 
1447  // Calculations from pair lists
1448  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && _numPairLists)
1449  nbAVX512Pairs<doEnergy, doVirial, doSlow,
1450  interpMode>(energyVdw, energyElec, energySlow, fNet_x,
1451  fNet_y, fNet_z, fNetSlow_x, fNetSlow_y,
1452  fNetSlow_z);
1453 
1454  // Calculations for modified exclusions
1455  if (_numModified)
1456  nbAVX512Modified<doEnergy, doVirial, doSlow,
1457  interpMode>(energyVdw, energyElec, energySlow, fNet_x,
1458  fNet_y, fNet_z, fNetSlow_x, fNetSlow_y,
1459  fNetSlow_z);
1460 
1461  // Calculations for full exclusions
1462  if (doSlow && _numExcluded)
1463  nbAVX512Excluded<doEnergy, doVirial>(energySlow, fNetSlow_x,
1464  fNetSlow_y, fNetSlow_z);
1465 
1466  // Reduce energy vectors into scalars
1467  if (doEnergy) {
1468  _energyVdw = _mm512_reduce_add_ps(energyVdw);
1469  if (interpMode > 1) _energyVdw *= _paramScale * 0.5f;
1470  _energyElec = _mm512_reduce_add_ps(energyElec);
1471  if (doSlow) _energySlow = _mm512_reduce_add_ps(energySlow);
1472  }
1473 
1474  // Reduce virial vectors into scalars
1475  if (doVirial) {
1476  _fNet_x = _mm512_reduce_add_ps(fNet_x);
1477  _fNet_y = _mm512_reduce_add_ps(fNet_y);
1478  _fNet_z = _mm512_reduce_add_ps(fNet_z);
1479  if (doSlow) {
1480  _fNetSlow_x = _mm512_reduce_add_ps(fNetSlow_x);
1481  _fNetSlow_y = _mm512_reduce_add_ps(fNetSlow_y);
1482  _fNetSlow_z = _mm512_reduce_add_ps(fNetSlow_z);
1483  }
1484  }
1485 
1486  // Touch the "tiles" data structures on the patches to indicate that
1487  // they need reduction into native NAMD data and zeroing before reuse
1488  if (doList) {
1489  if (_numLists || _numModified || _numExcluded ||
1490  (NAMD_AVXTILES_PAIR_THRESHOLD > 0 && _numPairLists)) {
1491  tiles_p0->touch();
1492  tiles_p1->touch();
1493  }
1494  } else {
1495  tiles_p0->touch();
1496  tiles_p1->touch();
1497  }
1498 }
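
Throughout doAll() the energies and net forces live in 16-lane vector accumulators that every kernel adds into, and a single horizontal reduction at the end produces the scalars, avoiding one reduction per inner-loop iteration. A minimal sketch of that accumulate-then-reduce pattern, with a hypothetical sumEnergies() helper:

#include <immintrin.h>

// Keep 16 partial sums in a register; reduce to a scalar only once.
float sumEnergies(const float *e, int n) {
  __m512 acc = _mm512_setzero_ps();
  int i = 0;
  for (; i + 16 <= n; i += 16)
    acc = _mm512_add_ps(acc, _mm512_loadu_ps(e + i));
  float total = _mm512_reduce_add_ps(acc);  // same reduction as above
  for (; i < n; i++) total += e[i];         // scalar remainder
  return total;
}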
1499 
1500 //---------------------------------------------------------------------------
1501 
1502 void AVXTileLists::nbForceAVX512(const int doEnergy, const int doVirial,
1503  const int doSlow, const int doList) {
1504 
1505 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, INTERPMODE) \
1506  doAll<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, INTERPMODE>();
1507 
1508  if (_interpolationMode == 1) {
1509  if (!doEnergy && !doVirial && !doSlow && !doList) CALL(0, 0, 0, 0, 1);
1510  if (!doEnergy && !doVirial && doSlow && !doList) CALL(0, 0, 1, 0, 1);
1511  if (!doEnergy && doVirial && !doSlow && !doList) CALL(0, 1, 0, 0, 1);
1512  if (!doEnergy && doVirial && doSlow && !doList) CALL(0, 1, 1, 0, 1);
1513  if ( doEnergy && !doVirial && !doSlow && !doList) CALL(1, 0, 0, 0, 1);
1514  if ( doEnergy && !doVirial && doSlow && !doList) CALL(1, 0, 1, 0, 1);
1515  if ( doEnergy && doVirial && !doSlow && !doList) CALL(1, 1, 0, 0, 1);
1516  if ( doEnergy && doVirial && doSlow && !doList) CALL(1, 1, 1, 0, 1);
1517 
1518  if (!doEnergy && !doVirial && !doSlow && doList) CALL(0, 0, 0, 1, 1);
1519  if (!doEnergy && !doVirial && doSlow && doList) CALL(0, 0, 1, 1, 1);
1520  if (!doEnergy && doVirial && !doSlow && doList) CALL(0, 1, 0, 1, 1);
1521  if (!doEnergy && doVirial && doSlow && doList) CALL(0, 1, 1, 1, 1);
1522  if ( doEnergy && !doVirial && !doSlow && doList) CALL(1, 0, 0, 1, 1);
1523  if ( doEnergy && !doVirial && doSlow && doList) CALL(1, 0, 1, 1, 1);
1524  if ( doEnergy && doVirial && !doSlow && doList) CALL(1, 1, 0, 1, 1);
1525  if ( doEnergy && doVirial && doSlow && doList) CALL(1, 1, 1, 1, 1);
1526  } else if (_interpolationMode == 2) {
1527  if (!doEnergy && !doVirial && !doSlow && !doList) CALL(0, 0, 0, 0, 2);
1528  if (!doEnergy && !doVirial && doSlow && !doList) CALL(0, 0, 1, 0, 2);
1529  if (!doEnergy && doVirial && !doSlow && !doList) CALL(0, 1, 0, 0, 2);
1530  if (!doEnergy && doVirial && doSlow && !doList) CALL(0, 1, 1, 0, 2);
1531  if ( doEnergy && !doVirial && !doSlow && !doList) CALL(1, 0, 0, 0, 2);
1532  if ( doEnergy && !doVirial && doSlow && !doList) CALL(1, 0, 1, 0, 2);
1533  if ( doEnergy && doVirial && !doSlow && !doList) CALL(1, 1, 0, 0, 2);
1534  if ( doEnergy && doVirial && doSlow && !doList) CALL(1, 1, 1, 0, 2);
1535 
1536  if (!doEnergy && !doVirial && !doSlow && doList) CALL(0, 0, 0, 1, 2);
1537  if (!doEnergy && !doVirial && doSlow && doList) CALL(0, 0, 1, 1, 2);
1538  if (!doEnergy && doVirial && !doSlow && doList) CALL(0, 1, 0, 1, 2);
1539  if (!doEnergy && doVirial && doSlow && doList) CALL(0, 1, 1, 1, 2);
1540  if ( doEnergy && !doVirial && !doSlow && doList) CALL(1, 0, 0, 1, 2);
1541  if ( doEnergy && !doVirial && doSlow && doList) CALL(1, 0, 1, 1, 2);
1542  if ( doEnergy && doVirial && !doSlow && doList) CALL(1, 1, 0, 1, 2);
1543  if ( doEnergy && doVirial && doSlow && doList) CALL(1, 1, 1, 1, 2);
1544  } else {
1545  if (!doEnergy && !doVirial && !doSlow && !doList) CALL(0, 0, 0, 0, 3);
1546  if (!doEnergy && !doVirial && doSlow && !doList) CALL(0, 0, 1, 0, 3);
1547  if (!doEnergy && doVirial && !doSlow && !doList) CALL(0, 1, 0, 0, 3);
1548  if (!doEnergy && doVirial && doSlow && !doList) CALL(0, 1, 1, 0, 3);
1549  if ( doEnergy && !doVirial && !doSlow && !doList) CALL(1, 0, 0, 0, 3);
1550  if ( doEnergy && !doVirial && doSlow && !doList) CALL(1, 0, 1, 0, 3);
1551  if ( doEnergy && doVirial && !doSlow && !doList) CALL(1, 1, 0, 0, 3);
1552  if ( doEnergy && doVirial && doSlow && !doList) CALL(1, 1, 1, 0, 3);
1553 
1554  if (!doEnergy && !doVirial && !doSlow && doList) CALL(0, 0, 0, 1, 3);
1555  if (!doEnergy && !doVirial && doSlow && doList) CALL(0, 0, 1, 1, 3);
1556  if (!doEnergy && doVirial && !doSlow && doList) CALL(0, 1, 0, 1, 3);
1557  if (!doEnergy && doVirial && doSlow && doList) CALL(0, 1, 1, 1, 3);
1558  if ( doEnergy && !doVirial && !doSlow && doList) CALL(1, 0, 0, 1, 3);
1559  if ( doEnergy && !doVirial && doSlow && doList) CALL(1, 0, 1, 1, 3);
1560  if ( doEnergy && doVirial && !doSlow && doList) CALL(1, 1, 0, 1, 3);
1561  if ( doEnergy && doVirial && doSlow && doList) CALL(1, 1, 1, 1, 3);
1562  }
1563 
1564 #undef CALL
1565 }
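
The CALL() ladder is the usual runtime-to-compile-time dispatch idiom: the four flags plus the interpolation mode select exactly one of 48 doAll() instantiations, so inside the kernels every doEnergy/doVirial/doSlow/doList test is a compile-time constant and untaken branches are eliminated. A two-flag sketch of the same idiom (kernel() is a hypothetical stand-in for doAll()):

// Template parameters are constants inside kernel(), so the compiler
// drops untaken branches from each of the four instantiations.
template <bool doEnergy, bool doSlow>
void kernel() {
  if (doEnergy) { /* accumulate energies */ }
  if (doSlow)   { /* evaluate slow forces */ }
}

void dispatch(const int doEnergy, const int doSlow) {
  if ( doEnergy &&  doSlow) kernel<true,  true >();
  if ( doEnergy && !doSlow) kernel<true,  false>();
  if (!doEnergy &&  doSlow) kernel<false, true >();
  if (!doEnergy && !doSlow) kernel<false, false>();
}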
1566 
1567 #endif // NAMD_AVXTILES