NAMD
CudaComputeNonbondedKernel.cu
1 #include <cuda.h>
2 #if __CUDACC_VER_MAJOR__ >= 11
3 #include <cub/cub.cuh>
4 #else
5 #include <namd_cub/cub.cuh>
6 #endif
7 #include "CudaComputeNonbondedKernel.h"
8 #include "CudaTileListKernel.h"
9 #include "DeviceCUDA.h"
10 #ifdef WIN32
11 #define __thread __declspec(thread)
12 #endif
13 extern __thread DeviceCUDA *deviceCUDA;
14 
15 #define OVERALLOC 1.2f
16 
17 void NAMD_die(const char *);
18 
19 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
20 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
21 
22 #define NONBONDKERNEL_NUM_WARP 4
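// Each thread block runs NONBONDKERNEL_NUM_WARP warps and each warp processes
// one tile list, which is why the shared-memory staging arrays in the kernel
// below are dimensioned [NONBONDKERNEL_NUM_WARP][WARPSIZE].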
23 
24 template<bool doEnergy, bool doSlow>
25 __device__ __forceinline__
26 void calcForceEnergy(const float r2, const float qi, const float qj,
27  const float dx, const float dy, const float dz,
28  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
29  cudaTextureObject_t vdwCoefTableTex,
30  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
31  float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
32  float& energyVdw, float& energyElec, float& energySlow) {
33 
34  int vdwIndex = vdwtypej + vdwtypei;
35 #if __CUDA_ARCH__ >= 350
36  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
37 #else
38  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
39 #endif
40 
41  float rinv = rsqrtf(r2);
42  float4 ei;
43  float4 fi = tex1D<float4>(forceTableTex, rinv);
44  if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
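 // Each table entry packs four channels per float4, as their use below implies:
 // .x scales the fast electrostatic term (qi*qj), .y the r^-6 van der Waals term
 // (ljab.y), .z the r^-12 term (ljab.x), and .w the slow (long-range correction)
 // electrostatic term. Both tables are fetched with rinv = 1/r as the coordinate.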
45 
46  float fSlow = qi * qj;
47  float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
48 
49  if (doEnergy) {
50  energyVdw += ljab.x * ei.z + ljab.y * ei.y;
51  energyElec += fSlow * ei.x;
52  if (doSlow) energySlow += fSlow * ei.w;
53  }
54  if (doSlow) fSlow *= fi.w;
55 
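 // f plays the role of F(r)/r here, so scaling the displacement components by f
 // gives the Cartesian force on atom i; the same contribution is subtracted from
 // atom j (Newton's third law).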
56  float fx = dx * f;
57  float fy = dy * f;
58  float fz = dz * f;
59  iforce.x += fx;
60  iforce.y += fy;
61  iforce.z += fz;
62  jforce.x -= fx;
63  jforce.y -= fy;
64  jforce.z -= fz;
65 
66  if (doSlow) {
67  float fxSlow = dx * fSlow;
68  float fySlow = dy * fSlow;
69  float fzSlow = dz * fSlow;
70  iforceSlow.x += fxSlow;
71  iforceSlow.y += fySlow;
72  iforceSlow.z += fzSlow;
73  jforceSlow.x -= fxSlow;
74  jforceSlow.y -= fySlow;
75  jforceSlow.z -= fzSlow;
76  }
77 }
78 
79 template<bool doSlow>
80 __device__ __forceinline__
81 void storeForces(const int pos, const float3 force, const float3 forceSlow,
82  float4* __restrict__ devForces, float4* __restrict__ devForcesSlow) {
83  atomicAdd(&devForces[pos].x, force.x);
84  atomicAdd(&devForces[pos].y, force.y);
85  atomicAdd(&devForces[pos].z, force.z);
86  if (doSlow) {
87  atomicAdd(&devForcesSlow[pos].x, forceSlow.x);
88  atomicAdd(&devForcesSlow[pos].y, forceSlow.y);
89  atomicAdd(&devForcesSlow[pos].z, forceSlow.z);
90  }
91 }
92 
93 template<bool doSlow>
94 __device__ __forceinline__
95 void storeForces(const int pos, const float3 force, const float3 forceSlow,
96  float* __restrict__ devForces_x,
97  float* __restrict__ devForces_y,
98  float* __restrict__ devForces_z,
99  float* __restrict__ devForcesSlow_x,
100  float* __restrict__ devForcesSlow_y,
101  float* __restrict__ devForcesSlow_z)
102 {
103  atomicAdd(&devForces_x[pos], force.x);
104  atomicAdd(&devForces_y[pos], force.y);
105  atomicAdd(&devForces_z[pos], force.z);
106  if (doSlow) {
107  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
108  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
109  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
110  }
111 }
112 
113 template<bool doSlow>
114 __device__ __forceinline__
115 void storeForces(const int pos, const float3 force, const float3 forceSlow,
116  float3* __restrict__ forces, float3* __restrict__ forcesSlow) {
117  atomicAdd(&forces[pos].x, force.x);
118  atomicAdd(&forces[pos].y, force.y);
119  atomicAdd(&forces[pos].z, force.z);
120  if (doSlow) {
121  atomicAdd(&forcesSlow[pos].x, forceSlow.x);
122  atomicAdd(&forcesSlow[pos].y, forceSlow.y);
123  atomicAdd(&forcesSlow[pos].z, forceSlow.z);
124  }
125 }
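// Forces are accumulated with atomicAdd in all three storeForces overloads
// because several tile lists, running in different warps and thread blocks,
// can contribute to the same atom concurrently.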
126 
127 template<bool doPairlist>
128 __device__ __forceinline__
129 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, int& jexclMaxdiff, int& jexclIndex) {
130  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
131  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
132  if (doPairlist) {
133  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
134  jexclIndex = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
135  jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
136  }
137 }
138 
139 template<bool doPairlist>
140 __device__ __forceinline__
141 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex) {
142  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
143  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
144  if (doPairlist) {
145  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
146  }
147 }
148 
149 template<bool doSlow>
150 __device__ __forceinline__
151 void shuffleNext(float3& jforce, float3& jforceSlow) {
152  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
153  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
154  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
155  if (doSlow) {
156  jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
157  jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
158  jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
159  }
160 }
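// These ring-shuffle helpers rotate per-lane j-atom data (and accumulated
// j-forces) to the next lane of the warp. The kernel below stages j data in
// shared memory (s_xyzq, s_jforce) instead, so they appear to be kept for
// other code paths.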
161 
162 //#define USE_NEW_EXCL_METHOD
163 
164 //
165 // Returns the lower estimate for the distance between a bounding box and a set of atoms
166 //
167 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
168  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
169  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
170  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
171  float r2 = dx*dx + dy*dy + dz*dz;
172  return r2;
173 }
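// The result is zero when the atom lies inside the box extent along every
// axis; wx, wy, wz are presumably the half-widths of the bounding box.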
174 
175 #define LARGE_FLOAT (float)(1.0e10)
176 
177 //
178 // Nonbonded force kernel
179 //
180 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
181 __global__ void
182 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
183  doPairlist ? (10) : (doEnergy ? (10) : (10) )
184  )
185 nonbondedForceKernel(
186  const int start, const int numTileLists,
187  const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
188  const int* __restrict__ tileJatomStart,
189  const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes,
190  const float3 lata, const float3 latb, const float3 latc,
191  const float4* __restrict__ xyzq, const float cutoff2,
192  cudaTextureObject_t vdwCoefTableTex,
193  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
194  // ----------
195  // doPairlist
196  const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
197  const int* __restrict__ atomIndex,
198  const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
199  unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
200  int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
201  const BoundingBox* __restrict__ boundingBoxes,
202 #ifdef USE_NEW_EXCL_METHOD
203  const int* __restrict__ minmaxExclAtom,
204 #endif
205  // ----------
206  float4* __restrict__ devForces, float4* __restrict__ devForcesSlow,
207  float * __restrict__ devForce_x,
208  float * __restrict__ devForce_y,
209  float * __restrict__ devForce_z,
210  float * __restrict__ devForce_w,
211  float * __restrict__ devForceSlow_x,
212  float * __restrict__ devForceSlow_y,
213  float * __restrict__ devForceSlow_z,
214  float * __restrict__ devForceSlow_w,
215  // ---- USE_STREAMING_FORCES ----
216  const int numPatches,
217  unsigned int* __restrict__ patchNumCount,
218  const CudaPatchRecord* __restrict__ cudaPatches,
219  float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
220  int* __restrict__ mapPatchReadyQueue,
221  int* __restrict__ outputOrder,
222  // ------------------------------
223  TileListVirialEnergy* __restrict__ virialEnergy) {
224 
225  // Single warp takes care of one list of tiles
226  // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
227  int itileList = start + threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;
228  if (itileList < numTileLists)
229  {
230 
231  float3 iforce;
232  float3 iforceSlow;
233  float energyVdw, energyElec, energySlow;
234  int nexcluded;
235  unsigned int itileListLen;
236  int2 patchInd;
237  int2 patchNumList;
238  __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][WARPSIZE];
239  __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][WARPSIZE];
240  __shared__ float3 s_jforce[NONBONDKERNEL_NUM_WARP][WARPSIZE];
241  __shared__ float3 s_jforceSlow[NONBONDKERNEL_NUM_WARP][WARPSIZE];
242  __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][WARPSIZE];
243 
244  // Start computation
245  {
246  // Warp index (0...warpsize-1)
247  const int wid = threadIdx.x % WARPSIZE;
248  const int iwarp = threadIdx.x / WARPSIZE;
249 
250  TileList tmp = tileLists[itileList];
251  int iatomStart = tmp.iatomStart;
252  int jtileStart = tmp.jtileStart;
253  int jtileEnd = tmp.jtileEnd;
254  patchInd = tmp.patchInd;
255  patchNumList = tmp.patchNumList;
256 
257  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
258  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
259  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
260 
261  // DH - set zeroShift flag if magnitude of shift vector is zero
262  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
263 
264  int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
265  if (doPairlist) {
266  PatchPairRecord PPStmp = patchPairs[itileList];
267  iatomSize = PPStmp.iatomSize;
268  iatomFreeSize = PPStmp.iatomFreeSize;
269  jatomSize = PPStmp.jatomSize;
270  jatomFreeSize = PPStmp.jatomFreeSize;
271  }
272 
273  // Write to global memory here to avoid register spilling
274  if (doVirial) {
275  if (wid == 0) {
276  virialEnergy[itileList].shx = shx;
277  virialEnergy[itileList].shy = shy;
278  virialEnergy[itileList].shz = shz;
279  }
280  }
281 
282  // Load i-atom data (and shift coordinates)
283  float4 xyzq_i = xyzq[iatomStart + wid];
284  xyzq_i.x += shx;
285  xyzq_i.y += shy;
286  xyzq_i.z += shz;
287  int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
288 
289  // Load i-atom data (and shift coordinates)
290  BoundingBox boundingBoxI;
291  if (doPairlist) {
292  boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
293  boundingBoxI.x += shx;
294  boundingBoxI.y += shy;
295  boundingBoxI.z += shz;
296  }
297 
298  // Get i-atom global index
299 #ifdef USE_NEW_EXCL_METHOD
300  int iatomIndex, minExclAtom, maxExclAtom;
301 #else
302  int iatomIndex;
303 #endif
304  if (doPairlist) {
305 #ifdef USE_NEW_EXCL_METHOD
306  iatomIndex = atomIndex[iatomStart + wid];
307  int2 tmp = minmaxExclAtom[iatomStart + wid];
308  minExclAtom = tmp.x;
309  maxExclAtom = tmp.y;
310 #else
311  iatomIndex = atomIndex[iatomStart + wid];
312 #endif
313  }
314 
315  // i-forces in registers
316  // float3 iforce;
317  iforce.x = 0.0f;
318  iforce.y = 0.0f;
319  iforce.z = 0.0f;
320 
321  // float3 iforceSlow;
322  if (doSlow) {
323  iforceSlow.x = 0.0f;
324  iforceSlow.y = 0.0f;
325  iforceSlow.z = 0.0f;
326  }
327 
328  // float energyVdw, energyElec, energySlow;
329  if (doEnergy) {
330  energyVdw = 0.0f;
331  energyElec = 0.0f;
332  if (doSlow) energySlow = 0.0f;
333  }
334 
335  // Number of exclusions
336  // NOTE: Lowest bit is used as indicator bit for tile pairs:
337  // bit 0 tile has no atoms within pairlist cutoff
338  // bit 1 tile has atoms within pairlist cutoff
339  // int nexcluded;
340  if (doPairlist) nexcluded = 0;
341 
342  // Number of i loops and free atoms
343  int nfreei;
344  if (doPairlist) {
345  int nloopi = min(iatomSize - iatomStart, WARPSIZE);
346  nfreei = max(iatomFreeSize - iatomStart, 0);
347  if (wid >= nloopi) {
348  xyzq_i.x = -LARGE_FLOAT;
349  xyzq_i.y = -LARGE_FLOAT;
350  xyzq_i.z = -LARGE_FLOAT;
351  }
352  }
353 
354  // tile list stuff
355  // int itileListLen;
356  // int minJatomStart;
357  if (doPairlist) {
358  // minJatomStart = tileJatomStart[jtileStart];
359  itileListLen = 0;
360  }
361 
362  // Exclusion index and maxdiff
363  int iexclIndex, iexclMaxdiff;
364  if (doPairlist) {
365  int2 tmp = exclIndexMaxDiff[iatomStart + wid];
366  iexclIndex = tmp.x;
367  iexclMaxdiff = tmp.y;
368  }
369 
370  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
371 
372  // Load j-atom starting index and exclusion mask
373  int jatomStart = tileJatomStart[jtile];
374 
375  float4 xyzq_j = xyzq[jatomStart + wid];
377 
378  // Check for early bail
379  if (doPairlist) {
380  float r2bb = distsq(boundingBoxI, xyzq_j);
381  if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
382  }
383  unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
384  int vdwtypej = vdwTypes[jatomStart + wid];
385  s_vdwtypej[iwarp][wid] = vdwtypej;
386 
387  // Get i-atom global index
388  if (doPairlist) {
389  s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
390  }
391 
392  // Number of j loops and free atoms
393  int nfreej;
394  if (doPairlist) {
395  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
396  nfreej = max(jatomFreeSize - jatomStart, 0);
397  //if (nfreei == 0 && nfreej == 0) continue;
398  if (wid >= nloopj) {
399  xyzq_j.x = LARGE_FLOAT;
400  xyzq_j.y = LARGE_FLOAT;
401  xyzq_j.z = LARGE_FLOAT;
402  }
403  }
404 
405  s_xyzq[iwarp][wid] = xyzq_j;
406 
407  // DH - self requires that zeroShift is also set
408  const bool self = zeroShift && (iatomStart == jatomStart);
409  const int modval = (self) ? 2*WARPSIZE-1 : WARPSIZE-1;
410 
411  s_jforce[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
412  if (doSlow)
413  s_jforceSlow[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
414  WARP_SYNC(WARP_FULL_MASK);
415 
416 
417  int t = (self) ? 1 : 0;
418 
419  if (doPairlist) {
420  // Build pair list
421  // NOTE: Pairlist update, we must also include the diagonal since this is used
422  // in GBIS phase 2.
423  // Clear the lowest (indicator) bit
424  nexcluded &= (~1);
425 
426  // For self tiles, do the diagonal term (t=0).
427  // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
428  if (self) {
429  int j = (0 + wid) & modval;
430  xyzq_j = s_xyzq[iwarp][j];
431  float dx = xyzq_j.x - xyzq_i.x;
432  float dy = xyzq_j.y - xyzq_i.y;
433  float dz = xyzq_j.z - xyzq_i.z;
434 
435  float r2 = dx*dx + dy*dy + dz*dz;
436 
437  if (j < WARPSIZE && r2 < plcutoff2) {
438  // We have atom pair within the pairlist cutoff => Set indicator bit
439  nexcluded |= 1;
440  }
441  }
442 
443  for (;t < WARPSIZE;t++) {
444  int j = (t + wid) & modval;
445 
446  excl >>= 1;
447  if (j < WARPSIZE ) {
448  xyzq_j = s_xyzq[iwarp][j];
449  float dx = xyzq_j.x - xyzq_i.x;
450  float dy = xyzq_j.y - xyzq_i.y;
451  float dz = xyzq_j.z - xyzq_i.z;
452  float r2 = dx*dx + dy*dy + dz*dz;
453  // We have atom pair within the pairlist cutoff => Set indicator bit
454  if(r2 < plcutoff2){
455  nexcluded |= 1;
456  if (j < nfreej || wid < nfreei) {
457  bool excluded = false;
458  int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
459  if ( abs(indexdiff) <= iexclMaxdiff) {
460  indexdiff += iexclIndex;
461  int indexword = ((unsigned int) indexdiff) >> 5;
462 
463  if ( indexword < MAX_CONST_EXCLUSIONS ) {
464  indexword = constExclusions[indexword];
465  } else {
466  indexword = overflowExclusions[indexword];
467  }
468 
469  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
470  }
471  if (excluded) nexcluded += 2;
472  if (!excluded) excl |= 0x80000000;
473  if (!excluded && r2 < cutoff2) {
474  calcForceEnergy<doEnergy, doSlow>(
475  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
476  vdwtypei, s_vdwtypej[iwarp][j],
477  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
478  iforce, iforceSlow,
479  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
480  energyVdw,
481  energyElec, energySlow);
482  }
483  }
484  }
485  }
486  WARP_SYNC(WARP_FULL_MASK);
487  } // t
488  } else {
489  // Just compute forces
490  if (self) {
491  excl >>= 1;
492  }
493  for (;t < WARPSIZE;t++) {
494  if ((excl & 1)) {
495  xyzq_j = s_xyzq[iwarp][(wid+t) & (WARPSIZE-1)];
496  float dx = xyzq_j.x - xyzq_i.x;
497  float dy = xyzq_j.y - xyzq_i.y;
498  float dz = xyzq_j.z - xyzq_i.z;
499 
500  float r2 = dx*dx + dy*dy + dz*dz;
501 
502  if (r2 < cutoff2) {
503  calcForceEnergy<doEnergy, doSlow>(
504  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
505  vdwtypei, s_vdwtypej[iwarp][(wid+t) & (WARPSIZE-1)], vdwCoefTable,
506  vdwCoefTableTex, forceTableTex, energyTableTex,
507  iforce, iforceSlow,
508  s_jforce[iwarp][(wid+t) & (WARPSIZE-1)],
509  s_jforceSlow[iwarp][(wid+t) & (WARPSIZE-1)],
510  energyVdw, energyElec, energySlow);
511  } // (r2 < cutoff2)
512  } // (excl & 1)
513  excl >>= 1;
514  WARP_SYNC(WARP_FULL_MASK);
515  } // t
516  }
517 
518  // Write j-forces
519  storeForces<doSlow>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
520  devForce_x, devForce_y, devForce_z,
521  devForceSlow_x, devForceSlow_y, devForceSlow_z);
522 
523  // Write exclusions
524  if (doPairlist && WARP_ANY(WARP_FULL_MASK, nexcluded & 1)) {
525  int anyexcl = (65536 | WARP_ANY(WARP_FULL_MASK, excl));
526  // Mark this jtile as non-empty:
527  // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
528  // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
529  if (wid == 0) jtiles[jtile] = anyexcl;
530  // Store exclusions
531  tileExcls[jtile].excl[wid] = excl;
532  // itileListLen:
533  // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
534  // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
535  itileListLen += anyexcl;
536  // NOTE, this minJatomStart is only stored once for the first tile list entry
537  // minJatomStart = min(minJatomStart, jatomStart);
538  }
539 
540  } // jtile
541 
542  // Write i-forces
543  storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow,
544  devForce_x, devForce_y, devForce_z,
545  devForceSlow_x, devForceSlow_y, devForceSlow_z);
546  }
547  // Done with computation
548 
549  // Save pairlist stuff
550  if (doPairlist) {
551 
552  // Warp index (0...warpsize-1)
553  const int wid = threadIdx.x % WARPSIZE;
554 
555  if (wid == 0) {
556  // minJatomStart is in range [0 ... atomStorageSize-1]
557  //int atom0 = (minJatomStart)/WARPSIZE;
558  // int atom0 = 0;
559  // int storageOffset = atomStorageSize/WARPSIZE;
560  // int itileListLen = 0;
561  // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
562  // Store 0 if itileListLen == 0
563  // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
564  tileListDepth[itileList] = itileListLen;
565  tileListOrder[itileList] = itileList;
566  // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
567  if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
568  // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
569  if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
570  // NOTE: always numTileListsGBIS >= numTileLists
571  }
572 
573  typedef cub::WarpReduce<int> WarpReduceInt;
574  __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
575  int warpId = threadIdx.x / WARPSIZE;
576  // Remove indicator bit
577  nexcluded >>= 1;
578  volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
579  if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
580 
581  }
582 
583  if (doVirial) {
584  // Warp index (0...warpsize-1)
585  const int wid = threadIdx.x % WARPSIZE;
586 
587  typedef cub::WarpReduce<float> WarpReduce;
588  __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
589  int warpId = threadIdx.x / WARPSIZE;
590  volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
591  BLOCK_SYNC;
592  volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
593  BLOCK_SYNC;
594  volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
595  BLOCK_SYNC;
596  if (wid == 0) {
597  virialEnergy[itileList].forcex = iforcexSum;
598  virialEnergy[itileList].forcey = iforceySum;
599  virialEnergy[itileList].forcez = iforcezSum;
600  }
601 
602  if (doSlow) {
603  iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
604  BLOCK_SYNC;
605  iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
606  BLOCK_SYNC;
607  iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
608  BLOCK_SYNC;
609  if (wid == 0) {
610  virialEnergy[itileList].forceSlowx = iforcexSum;
611  virialEnergy[itileList].forceSlowy = iforceySum;
612  virialEnergy[itileList].forceSlowz = iforcezSum;
613  }
614  }
615  }
616 
617  // Reduce energy
618  if (doEnergy) {
619  // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
620  // (Why does CUB suck here?)
621 #pragma unroll
622  for (int i=16;i >= 1;i/=2) {
623  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
624  energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, 32);
625  if (doSlow) energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
626  }
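 // After the butterfly (XOR) shuffle reduction every lane holds the full
 // warp-wide sum, so only lane 0 needs to write the result below.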
627 
628  if (threadIdx.x % WARPSIZE == 0) {
629  virialEnergy[itileList].energyVdw = energyVdw;
630  virialEnergy[itileList].energyElec = energyElec;
631  if (doSlow) virialEnergy[itileList].energySlow = energySlow;
632  }
633  }
634 
635  if (doStreaming) {
636  // Make sure devForces and devForcesSlow have been written into device memory
638  __threadfence();
639 
640  int patchDone[2] = {false, false};
641  const int wid = threadIdx.x % WARPSIZE;
642  if (wid == 0) {
643  int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
644  patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
645  if (patchInd.x != patchInd.y) {
646  int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
647  patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
648  }
649  }
650 
651  patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
652  patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
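 // patchNumCount counts finished tile lists per patch; atomicInc wraps back to
 // zero when the last contribution arrives, so patchDone becomes true exactly
 // once per patch. WARP_ANY then broadcasts lane 0's result to the whole warp.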
653 
654  if (patchDone[0]) {
655  // Patch 1 is done, write onto host-mapped memory
656  CudaPatchRecord patch = cudaPatches[patchInd.x];
657  int start = patch.atomStart;
658  int end = start + patch.numAtoms;
659  for (int i=start+wid;i < end;i+=WARPSIZE) {
660  mapForces[i] = make_float4(devForce_x[i],
661  devForce_y[i], devForce_z[i], devForce_w[i]);
662  if (doSlow){
663  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
664  devForceSlow_y[i],
665  devForceSlow_z[i],
666  devForceSlow_w[i]);
667  }
668  }
669  }
670  if (patchDone[1]) {
671  // Patch 2 is done
672  CudaPatchRecord patch = cudaPatches[patchInd.y];
673  int start = patch.atomStart;
674  int end = start + patch.numAtoms;
675  for (int i=start+wid;i < end;i+=WARPSIZE) {
676  mapForces[i] = make_float4(devForce_x[i], devForce_y[i], devForce_z[i], devForce_w[i]);
677  if (doSlow){
678  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
679  devForceSlow_y[i],
680  devForceSlow_z[i],
681  devForceSlow_w[i]);
682  }
683  }
684  }
685 
686  if (patchDone[0] || patchDone[1]) {
687  // Make sure mapForces and mapForcesSlow are up-to-date
689  __threadfence_system();
690  // Add patch into "patchReadyQueue"
691  if (wid == 0) {
692  if (patchDone[0]) {
693  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
694  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
695  mapPatchReadyQueue[ind] = patchInd.x;
696  }
697  if (patchDone[1]) {
698  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
699  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
700  mapPatchReadyQueue[ind] = patchInd.y;
701  }
702  }
703  }
704  }
705 
706  if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
707  int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
708  outputOrder[index] = itileList;
709  }
710  } // if (itileList < numTileLists)
711 }
712 
713 //
714 // Finish up - reduce virials from nonbonded kernel
715 //
716 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
717 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
718  const int atomStorageSize,
719  const float4* __restrict__ xyzq,
720  const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
721  VirialEnergy* __restrict__ virialEnergy) {
722 
723  for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
724  {
725  int i = ibase + threadIdx.x;
726 
727  // Set to zero to avoid nan*0
728  float4 pos;
729  pos.x = 0.0f;
730  pos.y = 0.0f;
731  pos.z = 0.0f;
732  float4 force, forceSlow;
733  force.x = 0.0f;
734  force.y = 0.0f;
735  force.z = 0.0f;
736  forceSlow.x = 0.0f;
737  forceSlow.y = 0.0f;
738  forceSlow.z = 0.0f;
739  if (i < atomStorageSize) {
740  pos = xyzq[i];
741  force = devForces[i];
742  if (doSlow) forceSlow = devForcesSlow[i];
743  }
744  // Reduce across the entire thread block
745  float vxxt = force.x*pos.x;
746  float vxyt = force.x*pos.y;
747  float vxzt = force.x*pos.z;
748  float vyxt = force.y*pos.x;
749  float vyyt = force.y*pos.y;
750  float vyzt = force.y*pos.z;
751  float vzxt = force.z*pos.x;
752  float vzyt = force.z*pos.y;
753  float vzzt = force.z*pos.z;
754  // atomicAdd(&virialEnergy->virial[0], (double)vxx);
755  // atomicAdd(&virialEnergy->virial[1], (double)vxy);
756  // atomicAdd(&virialEnergy->virial[2], (double)vxz);
757  // atomicAdd(&virialEnergy->virial[3], (double)vyx);
758  // atomicAdd(&virialEnergy->virial[4], (double)vyy);
759  // atomicAdd(&virialEnergy->virial[5], (double)vyz);
760  // atomicAdd(&virialEnergy->virial[6], (double)vzx);
761  // atomicAdd(&virialEnergy->virial[7], (double)vzy);
762  // atomicAdd(&virialEnergy->virial[8], (double)vzz);
763 
764  typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
765  __shared__ typename BlockReduce::TempStorage tempStorage;
766  volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
767  volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
768  volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
769  volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
770  volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
771  volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
772  volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
773  volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
774  volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
775  if (threadIdx.x == 0) {
776  atomicAdd(&virialEnergy->virial[0], (double)vxx);
777  atomicAdd(&virialEnergy->virial[1], (double)vxy);
778  atomicAdd(&virialEnergy->virial[2], (double)vxz);
779  atomicAdd(&virialEnergy->virial[3], (double)vyx);
780  atomicAdd(&virialEnergy->virial[4], (double)vyy);
781  atomicAdd(&virialEnergy->virial[5], (double)vyz);
782  atomicAdd(&virialEnergy->virial[6], (double)vzx);
783  atomicAdd(&virialEnergy->virial[7], (double)vzy);
784  atomicAdd(&virialEnergy->virial[8], (double)vzz);
785  }
786 
787  if (doSlow) {
788  // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
789  float vxxSlowt = forceSlow.x*pos.x;
790  float vxySlowt = forceSlow.x*pos.y;
791  float vxzSlowt = forceSlow.x*pos.z;
792  float vyxSlowt = forceSlow.y*pos.x;
793  float vyySlowt = forceSlow.y*pos.y;
794  float vyzSlowt = forceSlow.y*pos.z;
795  float vzxSlowt = forceSlow.z*pos.x;
796  float vzySlowt = forceSlow.z*pos.y;
797  float vzzSlowt = forceSlow.z*pos.z;
798  // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
799  // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
800  // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
801  // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
802  // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
803  // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
804  // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
805  // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
806  // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
807  volatile float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
808  volatile float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
809  volatile float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
810  volatile float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
811  volatile float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
812  volatile float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
813  volatile float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
814  volatile float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
815  volatile float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
816  if (threadIdx.x == 0) {
817  atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
818  atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
819  atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
820  atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
821  atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
822  atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
823  atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
824  atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
825  atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
826  }
827  }
828 
829  }
830 
831 }
832 
833 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
834 __global__ void reduceVirialEnergyKernel(
835  const bool doEnergy, const bool doVirial, const bool doSlow,
836  const int numTileLists,
837  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
838  VirialEnergy* __restrict__ virialEnergy) {
839 
840  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
841  {
842  int itileList = ibase + threadIdx.x;
843  TileListVirialEnergy ve;
844  if (itileList < numTileLists) {
845  ve = tileListVirialEnergy[itileList];
846  } else {
847  // Set to zero to avoid nan*0
848  if (doVirial) {
849  ve.shx = 0.0f;
850  ve.shy = 0.0f;
851  ve.shz = 0.0f;
852  ve.forcex = 0.0f;
853  ve.forcey = 0.0f;
854  ve.forcez = 0.0f;
855  ve.forceSlowx = 0.0f;
856  ve.forceSlowy = 0.0f;
857  ve.forceSlowz = 0.0f;
858  }
859  if (doEnergy) {
860  ve.energyVdw = 0.0;
861  ve.energyElec = 0.0;
862  ve.energySlow = 0.0;
863  // ve.energyGBIS = 0.0;
864  }
865  }
866 
867  if (doVirial) {
868  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
869  __shared__ typename BlockReduce::TempStorage tempStorage;
870  float vxxt = ve.forcex*ve.shx;
871  float vxyt = ve.forcex*ve.shy;
872  float vxzt = ve.forcex*ve.shz;
873  float vyxt = ve.forcey*ve.shx;
874  float vyyt = ve.forcey*ve.shy;
875  float vyzt = ve.forcey*ve.shz;
876  float vzxt = ve.forcez*ve.shx;
877  float vzyt = ve.forcez*ve.shy;
878  float vzzt = ve.forcez*ve.shz;
879  volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
880  volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
881  volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
882  volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
883  volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
884  volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
885  volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
886  volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
887  volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
888  if (threadIdx.x == 0) {
889  atomicAdd(&virialEnergy->virial[0], (double)vxx);
890  atomicAdd(&virialEnergy->virial[1], (double)vxy);
891  atomicAdd(&virialEnergy->virial[2], (double)vxz);
892  atomicAdd(&virialEnergy->virial[3], (double)vyx);
893  atomicAdd(&virialEnergy->virial[4], (double)vyy);
894  atomicAdd(&virialEnergy->virial[5], (double)vyz);
895  atomicAdd(&virialEnergy->virial[6], (double)vzx);
896  atomicAdd(&virialEnergy->virial[7], (double)vzy);
897  atomicAdd(&virialEnergy->virial[8], (double)vzz);
898  }
899 
900  if (doSlow) {
901  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
902  __shared__ typename BlockReduce::TempStorage tempStorage;
903  float vxxt = ve.forceSlowx*ve.shx;
904  float vxyt = ve.forceSlowx*ve.shy;
905  float vxzt = ve.forceSlowx*ve.shz;
906  float vyxt = ve.forceSlowy*ve.shx;
907  float vyyt = ve.forceSlowy*ve.shy;
908  float vyzt = ve.forceSlowy*ve.shz;
909  float vzxt = ve.forceSlowz*ve.shx;
910  float vzyt = ve.forceSlowz*ve.shy;
911  float vzzt = ve.forceSlowz*ve.shz;
912  volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
913  volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
914  volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
915  volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
916  volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
917  volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
918  volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
919  volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
920  volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
921  if (threadIdx.x == 0) {
922  atomicAdd(&virialEnergy->virialSlow[0], (double)vxx);
923  atomicAdd(&virialEnergy->virialSlow[1], (double)vxy);
924  atomicAdd(&virialEnergy->virialSlow[2], (double)vxz);
925  atomicAdd(&virialEnergy->virialSlow[3], (double)vyx);
926  atomicAdd(&virialEnergy->virialSlow[4], (double)vyy);
927  atomicAdd(&virialEnergy->virialSlow[5], (double)vyz);
928  atomicAdd(&virialEnergy->virialSlow[6], (double)vzx);
929  atomicAdd(&virialEnergy->virialSlow[7], (double)vzy);
930  atomicAdd(&virialEnergy->virialSlow[8], (double)vzz);
931  }
932  }
933  }
934 
935  if (doEnergy) {
936  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
937  __shared__ typename BlockReduce::TempStorage tempStorage;
938  volatile double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
939  volatile double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
940  if (threadIdx.x == 0) {
941  atomicAdd(&virialEnergy->energyVdw, (double)energyVdw);
942  atomicAdd(&virialEnergy->energyElec, (double)energyElec);
943  }
944  if (doSlow) {
945  volatile double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
946  if (threadIdx.x == 0) atomicAdd(&virialEnergy->energySlow, (double)energySlow);
947  }
948  // if (doGBIS) {
949  // double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
950  // if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
951  // }
952  }
953 
954  }
955 
956 }
957 
958 #define REDUCEGBISENERGYKERNEL_NUM_WARP 32
959 __global__ void reduceGBISEnergyKernel(const int numTileLists,
960  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
961  VirialEnergy* __restrict__ virialEnergy) {
962 
963  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
964  {
965  int itileList = ibase + threadIdx.x;
966  double energyGBISt = 0.0;
967  if (itileList < numTileLists) {
968  energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
969  }
970 
971  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
972  __shared__ typename BlockReduce::TempStorage tempStorage;
973  volatile double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
974  if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
975  }
976 
977 }
978 
979 // ##############################################################################################
980 // ##############################################################################################
981 // ##############################################################################################
982 
983 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
984  bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
985 
986  cudaCheck(cudaSetDevice(deviceID));
987 
988  overflowExclusions = NULL;
989  overflowExclusionsSize = 0;
990 
991  exclIndexMaxDiff = NULL;
992  exclIndexMaxDiffSize = 0;
993 
994  atomIndex = NULL;
995  atomIndexSize = 0;
996 
997  vdwTypes = NULL;
998  vdwTypesSize = 0;
999 
1000  patchNumCount = NULL;
1001  patchNumCountSize = 0;
1002 
1003  patchReadyQueue = NULL;
1004  patchReadyQueueSize = 0;
1005 
1006  force_x = force_y = force_z = force_w = NULL;
1007  forceSize = 0;
1008  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
1009  forceSlowSize = 0;
1010 }
1011 
1012 void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
1013 {
1014  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
1015  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
1016  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
1017  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
1018  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
1019  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
1020  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
1021  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
1022 }
1023 
1024 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
1025  cudaCheck(cudaSetDevice(deviceID));
1026  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
1027  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
1028  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
1029  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
1030  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
1031  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
1032  if (force_x != NULL) deallocate_device<float>(&force_x);
1033  if (force_y != NULL) deallocate_device<float>(&force_y);
1034  if (force_z != NULL) deallocate_device<float>(&force_z);
1035  if (force_w != NULL) deallocate_device<float>(&force_w);
1036  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
1037  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
1038  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
1039  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
1040 }
1041 
1042 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
1043  const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
1044 
1045  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
1046  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
1047  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
1048 
1049  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
1050  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
1051  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
1052 }
1053 
1054 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
1055  if (!doStreaming) {
1056  NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
1057  }
1058  return patchReadyQueue;
1059 }
1060 
1061 template <int doSlow>
1062 __global__ void transposeForcesKernel(float4 *f, float4 *fSlow,
1063  float *fx, float *fy, float *fz, float *fw,
1064  float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww,
1065  int n)
1066 {
1067  int tid = blockIdx.x*blockDim.x + threadIdx.x;
1068  if (tid < n) {
1069  f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
1070  if (doSlow) {
1071  fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
1072  }
1073  }
1074 }
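// transposeForcesKernel converts the per-component SOA accumulation buffers
// (force_x/y/z/w) into the packed float4 arrays used downstream; it is
// launched with one thread per atom in nonbondedForce() below.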
1075 
1076 
1077 
1078 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
1079  const int atomStorageSize, const bool doPairlist,
1080  const bool doEnergy, const bool doVirial, const bool doSlow,
1081  const float3 lata, const float3 latb, const float3 latc,
1082  const float4* h_xyzq, const float cutoff2,
1083  float4* d_forces, float4* d_forcesSlow,
1084  float4* h_forces, float4* h_forcesSlow,
1085  cudaStream_t stream) {
1086 
1087  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
1088 
1089  // clear_device_array<float4>(d_forces, atomStorageSize, stream);
1090  // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
1091 
1092 
1093  // XXX TODO: Clear all of these
1094  if(1){
1095  // two clears
1096  tlKernel.clearTileListStat(stream);
1097  clear_device_array<float>(force_x, atomStorageSize, stream);
1098  clear_device_array<float>(force_y, atomStorageSize, stream);
1099  clear_device_array<float>(force_z, atomStorageSize, stream);
1100  clear_device_array<float>(force_w, atomStorageSize, stream);
1101  if (doSlow) {
1102  clear_device_array<float>(forceSlow_x, atomStorageSize, stream);
1103  clear_device_array<float>(forceSlow_y, atomStorageSize, stream);
1104  clear_device_array<float>(forceSlow_z, atomStorageSize, stream);
1105  clear_device_array<float>(forceSlow_w, atomStorageSize, stream);
1106  }
1107  }
1108 
1109  // --- streaming ----
1110  float4* m_forces = NULL;
1111  float4* m_forcesSlow = NULL;
1112  int* m_patchReadyQueue = NULL;
1113  int numPatches = 0;
1114  unsigned int* patchNumCountPtr = NULL;
1115  if (doStreaming) {
1116  numPatches = tlKernel.getNumPatches();
1117  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
1118  // If re-allocated, clear array
1119  clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
1120  }
1121  patchNumCountPtr = patchNumCount;
1122  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
1123  if (re) {
1124  // If re-allocated, re-set to "-1"
1125  for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
1126  }
1127  cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
1128  cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
1129  cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
1130  }
1131  // -----------------
1132 
1133  if (doVirial || doEnergy) {
1134  tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
1135  }
1136 
1137  int shMemSize = 0;
1138 
1139  int* outputOrderPtr = tlKernel.getOutputOrder();
1140 
1141  int nwarp = NONBONDKERNEL_NUM_WARP;
1142  int nthread = WARPSIZE*nwarp;
1143  int start = 0;
1144  while (start < tlKernel.getNumTileLists())
1145  {
1146 
1147  int nleft = tlKernel.getNumTileLists() - start;
1148  int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
1149 
1150 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
1151  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
1152  <<< nblock, nthread, shMemSize, stream >>> \
1153  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
1154  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
1155  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
1156  cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
1157  atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
1158  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
1159  tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
1160  force_x, force_y, force_z, force_w, \
1161  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
1162  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
1163  outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
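 // The CALL macro launches one of the 32 nonbondedForceKernel template
 // specializations; the if-chains below select exactly one based on the
 // run-time flags, and `called` guards against the case where none matches.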
1164 
1165  bool called = false;
1166 
1167  if (doStreaming) {
1168  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
1169  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
1170  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
1171  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
1172  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
1173  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
1174  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
1175  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
1176 
1177  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1);
1178  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1);
1179  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1);
1180  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1);
1181  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1);
1182  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1);
1183  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1);
1184  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1);
1185  } else {
1186  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
1187  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
1188  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
1189  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
1190  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
1191  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
1192  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
1193  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
1194 
1195  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0);
1196  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0);
1197  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0);
1198  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0);
1199  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0);
1200  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0);
1201  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0);
1202  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0);
1203  }
1204 
1205  if (!called) {
1206  NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
1207  }
1208 
1209  {
1210  int block = 128;
1211  int grid = (atomStorageSize + block - 1)/block;
1212  if (doSlow)
1213  transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
1214  force_x, force_y, force_z, force_w,
1215  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1216  atomStorageSize);
1217  else
1218  transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
1219  force_x, force_y, force_z, force_w,
1220  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1221  atomStorageSize);
1222  }
1223 
1224 #undef CALL
1225  cudaCheck(cudaGetLastError());
1226 
1227  start += nblock*nwarp;
1228  }
1229 
1230 }
1231 
1232 //
1233 // Perform virial and energy reductions for non-bonded force calculation
1234 //
1235 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
1236  const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
1237  float4* d_forces, float4* d_forcesSlow,
1238  VirialEnergy* d_virialEnergy, cudaStream_t stream) {
1239 
1240  if (doEnergy || doVirial) {
1241  clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
1242  }
1243 
1244  if (doVirial)
1245  {
1246  int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
1247  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1248  reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
1249  (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
1250  cudaCheck(cudaGetLastError());
1251  }
1252 
1253  if (doVirial || doEnergy)
1254  {
1255  int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
1256  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
1257  reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
1258  (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1259  cudaCheck(cudaGetLastError());
1260  }
1261 
1262  if (doGBIS && doEnergy)
1263  {
1264  int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
1265  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
1266  reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
1267  (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1268  cudaCheck(cudaGetLastError());
1269  }
1270 
1271 }
1272 
1273 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
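 // The first MAX_CONST_EXCLUSIONS words of the exclusion bitmask are cached in
 // constant memory; the full array is also kept in global memory
 // (overflowExclusions) so the kernel can fall back to it for larger indices.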
1274  int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
1275  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
1276 
1277  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
1278  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
1279 }
__global__ void reduceGBISEnergyKernel(const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
#define WARP_ALL(MASK, P)
Definition: CudaUtils.h:40
void nonbondedForce(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
__forceinline__ __device__ void storeForces(const T fx, const T fy, const T fz, const int ind, const int stride, T *force)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t energyTableTex
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ patchNumCount
CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ mapForcesSlow
#define WARP_FULL_MASK
Definition: CudaUtils.h:11
void setTileListVirialEnergyLength(int len)
void updateVdwTypesExcl(const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
float x
Definition: PmeSolver.C:3
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ devForce_y
void clearTileListStat(cudaStream_t stream)
static __thread float4 * forces
#define OVERALLOC
float4 *__restrict__ devForces
TileListStat *__restrict__ tileListStat
TileListVirialEnergy *__restrict__ virialEnergy
int itileList
float *__restrict__ devForce_x
cudaTextureObject_t forceTableTex
const int *__restrict__ tileJatomStart
const CudaPatchRecord *__restrict__ cudaPatches
unsigned int *__restrict__ tileListDepth
__global__ void transposeForcesKernel(float4 *f, float4 *fSlow, float *fx, float *fy, float *fz, float *fw, float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww, int n)
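transposeForcesKernel presumably gathers the SOA force components back into the float4 AOS arrays; a minimal sketch under that assumption, reusing the parameter names from the declaration above:
// Sketch (assumption): one thread per atom, pack SOA components into float4 AOS form.
__global__ void transposeForcesKernelSketch(float4 *f, float4 *fSlow,
    float *fx, float *fy, float *fz, float *fw,
    float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    f[i]     = make_float4(fx[i], fy[i], fz[i], fw[i]);
    fSlow[i] = make_float4(fSlowx[i], fSlowy[i], fSlowz[i], fSloww[i]);
  }
}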
int *__restrict__ tileListOrder
__device__ __forceinline__ void shuffleNext(float &w)
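shuffleNext presumably rotates a per-lane value to the next lane of the warp using the shuffle wrapper listed further down; a minimal sketch under that assumption (WARP_FULL_MASK is assumed to be the all-lanes mask from CudaUtils.h):
// Sketch (assumption): hand w to the next lane in the warp, wrapping at the warp boundary.
__device__ __forceinline__ void shuffleNextSketch(float &w) {
  w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x + 1) & (WARPSIZE - 1), WARPSIZE);
}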
#define LARGE_FLOAT
#define NONBONDKERNEL_NUM_WARP
void reallocate_forceSOA(int atomStorageSize)
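reallocate_forceSOA presumably grows the per-device SOA force buffers when atomStorageSize outgrows them, using the OVERALLOC head-room factor; a hypothetical sketch of that pattern (forceSOA and forceSOA_size are illustrative names, not the actual buffers):
// Sketch (assumption): grow a device buffer with head-room to avoid reallocating every step.
static __thread float *forceSOA = NULL;
static __thread int forceSOA_size = 0;

void reallocate_forceSOA_sketch(int atomStorageSize) {
  if (atomStorageSize > forceSOA_size) {
    if (forceSOA != NULL) cudaCheck(cudaFree(forceSOA));
    forceSOA_size = (int)(atomStorageSize * OVERALLOC);
    cudaCheck(cudaMalloc(&forceSOA, forceSOA_size * sizeof(float)));
  }
}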
__thread cudaStream_t stream
cudaTextureObject_t vdwCoefTableTex
__global__ void reduceNonbondedVirialKernel(const bool doSlow, const int atomStorageSize, const float4 *__restrict__ xyzq, const float4 *__restrict__ devForces, const float4 *__restrict__ devForcesSlow, VirialEnergy *__restrict__ virialEnergy)
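The virial reduction accumulates, per atom, the outer product of force and position (virial_ab = sum_i f_i,a * x_i,b) before reducing across threads; a simplified sketch under that assumption (virialOut and its 9-component layout are hypothetical, and the proper block/grid reduction is replaced by atomics for brevity):
// Sketch (assumption): grid-stride accumulation of the nonbonded virial tensor.
__global__ void virialAccumulateSketch(const int atomStorageSize,
    const float4 *__restrict__ xyzq, const float4 *__restrict__ devForces,
    float *__restrict__ virialOut /* 9 components, hypothetical layout */)
{
  float v[9] = {0.f};
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < atomStorageSize;
       i += blockDim.x * gridDim.x) {
    float4 f = devForces[i];
    float4 p = xyzq[i];
    v[0] += f.x*p.x; v[1] += f.x*p.y; v[2] += f.x*p.z;
    v[3] += f.y*p.x; v[4] += f.y*p.y; v[5] += f.y*p.z;
    v[6] += f.z*p.x; v[7] += f.z*p.y; v[8] += f.z*p.z;
  }
  for (int k = 0; k < 9; k++) atomicAdd(&virialOut[k], v[k]);
}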
const TileList *__restrict__ tileLists
const int atomStorageSize
void bindExclusions(int numExclusions, unsigned int *exclusion_bits)
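bindExclusions presumably stages as many exclusion bit masks as fit into the constExclusions constant-memory array, with the remainder read from overflowExclusions in global memory; a sketch under that assumption:
// Sketch (assumption): copy up to MAX_CONST_EXCLUSIONS masks into constant memory.
void bindExclusionsSketch(int numExclusions, unsigned int *exclusion_bits) {
  int nconst = (numExclusions < MAX_CONST_EXCLUSIONS) ? numExclusions : MAX_CONST_EXCLUSIONS;
  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits,
                               nconst * sizeof(unsigned int), 0, cudaMemcpyHostToDevice));
}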
float y
Definition: PmeSolver.C:3
__global__ void reduceVirialEnergyKernel(const bool doEnergy, const bool doVirial, const bool doSlow, const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
const float3 lata
const int *__restrict__ atomIndex
__device__ __forceinline__ float distsq(const BoundingBox a, const float4 b)
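distsq presumably returns the squared distance from an atom position to a patch bounding box (zero if the point lies inside the box); a sketch assuming BoundingBox stores a center (x, y, z) and half-widths (wx, wy, wz):
// Sketch (assumption): clamp the per-axis distance to the box surface at zero, then sum squares.
__device__ __forceinline__ float distsqSketch(const BoundingBox a, const float4 b) {
  float dx = fmaxf(0.0f, fabsf(a.x - b.x) - a.wx);
  float dy = fmaxf(0.0f, fabsf(a.y - b.y) - a.wy);
  float dz = fmaxf(0.0f, fabsf(a.z - b.z) - a.wz);
  return dx*dx + dy*dy + dz*dz;
}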
const int vdwCoefTableWidth
const BoundingBox *__restrict__ boundingBoxes
const int2 *__restrict__ exclIndexMaxDiff
gridSize z
float *__restrict__ devForce_z
const float2 *__restrict__ vdwCoefTable
float4 *__restrict__ devForcesSlow
#define MAX_CONST_EXCLUSIONS
void reduceVirialEnergy(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
#define __ldg
int getMaxNumBlocks()
Definition: DeviceCUDA.C:415
const float plcutoff2
void NAMD_die(const char *err_msg)
Definition: common.C:83
const int *__restrict__ vdwTypes
float *__restrict__ devForceSlow_z
float3 offsetXYZ
float *__restrict__ devForceSlow_x
__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]
float4 *__restrict__ mapForces
const int numTileLists
__shared__ union @43 tempStorage
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP
float *__restrict__ devForce_w
__device__ __forceinline__ void calcForceEnergy(const float r2, const float qi, const float qj, const float dx, const float dy, const float dz, const int vdwtypei, const int vdwtypej, const float2 *__restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, float3 &iforce, float3 &iforceSlow, float3 &jforce, float3 &jforceSlow, float &energyVdw, float &energyElec, float &energySlow)
#define WARP_SYNC(MASK)
Definition: CudaUtils.h:43
const float4 *__restrict__ xyzq
const float3 latc
TileListVirialEnergy * getTileListVirialEnergy()
int *__restrict__ jtiles
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
const unsigned int *__restrict__ overflowExclusions
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:32
gridSize y
int *__restrict__ mapPatchReadyQueue
#define WARPSIZE
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
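cudaCheck is presumably the usual check-and-abort wrapper around CUDA runtime calls; a representative sketch (the exact error handling and message are an assumption):
// Sketch (assumption): abort via NAMD_die if a CUDA runtime call returns an error.
#define cudaCheckSketch(stmt) do {                               \
    cudaError_t err = (stmt);                                    \
    if (err != cudaSuccess) NAMD_die(cudaGetErrorString(err));   \
  } while(0)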
const PatchPairRecord *__restrict__ patchPairs
TileExcl *__restrict__ tileExcls
gridSize x
float *__restrict__ devForceSlow_y
const float cutoff2
const float3 latb
int *__restrict__ outputOrder
float *__restrict__ devForceSlow_w
#define CALL(DOENERGY, DOVIRIAL)
#define WARP_ANY(MASK, P)
Definition: CudaUtils.h:41
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start
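From the launch bounds above, each block runs NONBONDKERNEL_NUM_WARP warps of WARPSIZE threads, with roughly one warp per tile list; a hypothetical host-side launch geometry consistent with that (the template flags and argument list are elided):
// Sketch (assumption): one warp per tile list, NONBONDKERNEL_NUM_WARP warps per block.
int nwarp   = NONBONDKERNEL_NUM_WARP;
int nthread = WARPSIZE * nwarp;
int nblock  = (numTileLists + nwarp - 1) / nwarp;
// nonbondedForceKernel< /* doEnergy, doVirial, ... */ ><<< nblock, nthread, 0, stream >>>(start, /* ... */);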
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP
#define REDUCEGBISENERGYKERNEL_NUM_WARP
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:38
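The warp shuffle wrappers presumably hide the CUDA 9 switch from the legacy __shfl intrinsics to the _sync variants; a sketch of that compatibility pattern:
// Sketch (assumption): map to the sync'd intrinsic on CUDA 9+ and the legacy one before.
#if (CUDART_VERSION >= 9000)
#define WARP_SHUFFLE_SKETCH(MASK, VAR, LANE, SIZE) __shfl_sync(MASK, VAR, LANE, SIZE)
#else
#define WARP_SHUFFLE_SKETCH(MASK, VAR, LANE, SIZE) __shfl(VAR, LANE, SIZE)
#endif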