NAMD
CudaComputeGBISKernel.cu
#include <cuda.h>
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#define GBIS_CUDA
#include "ComputeGBIS.inl"
#include "DeviceCUDA.h"
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;

#define LARGE_FLOAT (float)(1.0e10)

#define GBISKERNEL_NUM_WARP 4
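// Each thread block runs GBISKERNEL_NUM_WARP warps; in GBIS_Kernel below, each
// warp walks one tile list at a time through a grid-stride loop.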
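// Ring shuffle: every lane replaces its value with the one held by the next
// lane (lane k reads from lane k+1, wrapping at WARPSIZE), so after WARPSIZE
// rotations each lane's i-atom has been paired with every j-atom in the warp.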
__device__ __forceinline__
void shuffleNext(float& w) {
  w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
}

// Generic
template <int phase> struct GBISParam {};
template <int phase> struct GBISInput {};
template <int phase> struct GBISResults {};
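
// The GBIS computation is split into three phases, selected by the <phase>
// template parameter; each phase specializes the parameter, input, and
// accumulator structs above. Phase 1 accumulates the pairwise sums (psiSum)
// from which Born radii are computed, phase 2 computes the GB energy, forces,
// and dEdaSum, and phase 3 adds the forces arising from the position
// dependence of the Born radii (via the dHdrPrefix factors).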

// ------------------------------------------------------------------------------------------------
// Phase 1 specialization

template <> struct GBISParam<1> {
  float a_cut;
};

template <> struct GBISInput<1> {
  float qi, qj;
  float intRad0j, intRadSi;
  // inp1 = intRad0
  // inp2 = intRadS
  // (in this phase the qi/qj fields carry intRad0[i] and intRadS[j], not charges)
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi = inp1[i];
    intRadSi = inp2[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qj = inp2[i];
    intRad0j = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<1> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    qj = WARP_SHUFFLE(WARP_FULL_MASK, qj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRad0j = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<1> {
  float psiSum;
  __device__ __forceinline__ void init() {psiSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    psiSum = WARP_SHUFFLE(WARP_FULL_MASK, psiSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

// Phase 1 pair term: accumulates H(i,j) into atom i's psiSum and H(j,i) into atom j's
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<1> param, const GBISInput<1> inp,
  GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    float hij;
    int dij;
    CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
    resI.psiSum += hij;
    float hji;
    int dji;
    CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
    resJ.psiSum += hji;
  }

}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.psiSum);
}

// ------------------------------------------------------------------------------------------------
// Phase 2
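// Computes the screened GB pair energy and force, plus the dE/da sums consumed
// by phase 3; the GB self energy is added for the diagonal (i == j) pair.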

template <> struct GBISParam<2> {
  float kappa;                    // Debye screening constant
  float epsilon_p_i, epsilon_s_i; // inverse protein / solvent dielectrics
  float smoothDist;
  float r_cut2, r_cut_2, r_cut_4; // r_cut^2, 1/r_cut^2, 4/r_cut^4 (set in GBISphase2)
  float scaling;
};

template <> struct GBISInput<2> {
  float qi, qj;
  float bornRadI, bornRadJ;
  // inp1 = bornRad
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadI = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadJ = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<2> param, const float q) {
    qi = -q*param.scaling;
  }
  __device__ __forceinline__ void initQj(const float q) {
    qj = q;
  }
  __device__ __forceinline__ void shuffleNext() {
    qj = WARP_SHUFFLE(WARP_FULL_MASK, qj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<2> {
  float3 force;
  float dEdaSum;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dEdaSum = WARP_SHUFFLE(WARP_FULL_MASK, dEdaSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<2> param, const GBISInput<2> inp,
  GBISResults<2>& resI, GBISResults<2>& resJ, float& energyT) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    //float bornRadJ = sh_jBornRad[j];

    //calculate GB energy
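    // Generalized Born pair energy (charges pre-scaled; the sign is folded
    // into qi by initQi):
    //   fij   = sqrt(r^2 + ai*aj*exp(-r^2/(4*ai*aj)))
    //   Dij   = 1/eps_p - exp(-kappa*fij)/eps_s
    //   gbEij = qi*qj*Dij/fij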
    float qiqj = inp.qi*inp.qj;
    float aiaj = inp.bornRadI*inp.bornRadJ;
    float aiaj4 = 4*aiaj;
    float expr2aiaj4 = exp(-r2/aiaj4);
    float fij = sqrt(r2 + aiaj*expr2aiaj4);
    float f_i = 1/fij;
    float expkappa = exp(-param.kappa*fij);
    float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
    float gbEij = qiqj*Dij*f_i;

    //calculate energy derivatives
    float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
    float ddrf_i = -ddrfij*f_i*f_i;
    float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
    float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);

    //NAMD smoothing function
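    // scale = (r^2/r_cut^2 - 1)^2 tapers the energy smoothly to zero at r_cut;
    // ddrScale = d(scale)/dr = 4*r*(r^2 - r_cut^2)/r_cut^4 (r_cut_4 = 4/r_cut^4)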
    float scale = 1.f;
    float ddrScale = 0.f;
    float forcedEdr;
    if (param.smoothDist > 0.f) {
      scale = r2 * param.r_cut_2 - 1.f;
      scale *= scale;
      ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
      if (doEnergy) energyT += gbEij * scale;
      forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
    } else {
      if (doEnergy) energyT += gbEij;
      forcedEdr = ddrGbEij;
    }

    //add dEda
    if (doSlow) {
      float dEdai = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
      resI.dEdaSum += dEdai;
      float dEdaj = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
      resJ.dEdaSum += dEdaj;
    }

    forcedEdr *= r_i;
    float tmpx = dx*forcedEdr;
    float tmpy = dy*forcedEdr;
    float tmpz = dz*forcedEdr;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

  // GB Self Energy
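  // On diagonal tiles the j-loop includes the i == j pair (r2 < 0.01f). There
  // fij is just the Born radius, and dividing qi by (-param.scaling) undoes
  // the scaling applied in initQi, giving E_self = 0.5 * qi^2 * Dij / fij.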
  if (doEnergy) {
    if (r2 < 0.01f) {
      float fij = inp.bornRadI;//inf
      float expkappa = exp(-param.kappa*fij);//0
      float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
      float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
      energyT += 0.5f*gbEij;
    } //same atom or within cutoff
  }
}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.dEdaSum);
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}

// ------------------------------------------------------------------------------------------------
// Phase 3
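// Phase 3 converts the host-computed dHdrPrefix factors into forces: CalcDH
// gives dH/dr for each pair, and the two prefix-weighted terms form a radial force.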
template <> struct GBISParam<3> {
  float a_cut;
};

template <> struct GBISInput<3> {
  float qi;
  float intRadSJ, intRadJ0, intRadIS;
  float dHdrPrefixI, dHdrPrefixJ;
  // inp1 = intRad0
  // inp2 = intRadS
  // inp3 = dHdrPrefix
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi = inp1[i];
    intRadIS = inp2[i];
    dHdrPrefixI = inp3[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    intRadJ0 = inp1[i];
    intRadSJ = inp2[i];
    dHdrPrefixJ = inp3[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<3> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<3> {
  float3 force;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<3> param, const GBISInput<3> inp,
  GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    float dhij, dhji;
    int dij, dji;
    CalcDH(r, r2, r_i, param.a_cut, inp.qi, inp.intRadSJ, dhij, dij);
    CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);

    float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
    float tmpx = dx * forceAlpha;
    float tmpy = dy * forceAlpha;
    float tmpz = dz * forceAlpha;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res, float* out, float4* forces) {
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}

// ------------------------------------------------------------------------------------------------

//
// GBIS kernel
//
template <bool doEnergy, bool doSlow, int phase>
__global__ void
GBIS_Kernel(const int numTileLists,
  const TileList* __restrict__ tileLists,
  const int* __restrict__ tileJatomStart,
  const PatchPairRecord* __restrict__ patchPairs,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* __restrict__ xyzq,
  const float cutoff2,
  const GBISParam<phase> param,
  const float* __restrict__ inp1,
  const float* __restrict__ inp2,
  const float* __restrict__ inp3,
  float* __restrict__ out,
  float4* __restrict__ forces,
  TileListVirialEnergy* __restrict__ virialEnergy) {

  // Single warp takes care of one list of tiles
  for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x; itileList < numTileLists; itileList += blockDim.x/WARPSIZE*gridDim.x)
  {
    TileList tmp = tileLists[itileList];
    int iatomStart = tmp.iatomStart;
    int jtileStart = tmp.jtileStart;
    int jtileEnd = tmp.jtileEnd;
    PatchPairRecord PPStmp = patchPairs[itileList];
    int iatomSize = PPStmp.iatomSize;
    int jatomSize = PPStmp.jatomSize;

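    // offsetXYZ is the periodic image offset in fractional lattice coordinates;
    // convert it to a Cartesian shift using the lattice vectors lata/latb/latc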
    float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
    float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
    float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;

    // Lane index within the warp (0...WARPSIZE-1)
    const int wid = threadIdx.x % WARPSIZE;

    // Load i-atom data (and shift coordinates)
    float4 xyzq_i = xyzq[iatomStart + wid];
    xyzq_i.x += shx;
    xyzq_i.y += shy;
    xyzq_i.z += shz;

    GBISInput<phase> inp;
    inp.loadI(iatomStart + wid, inp1, inp2, inp3);
    if (phase == 2) inp.initQi(param, xyzq_i.w);

    // Number of i loops
    int nloopi = min(iatomSize - iatomStart, WARPSIZE);

    GBISResults<phase> resI;
    resI.init();
    float energyT;
    if (doEnergy) energyT = 0.0f;

    for (int jtile=jtileStart; jtile <= jtileEnd; jtile++) {

      // Load j-atom starting index
      int jatomStart = tileJatomStart[jtile];

      // Load coordinates and charge
      float4 xyzq_j = xyzq[jatomStart + wid];

      inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
      if (phase == 2) inp.initQj(xyzq_j.w);

      // Number of j loops
      int nloopj = min(jatomSize - jatomStart, WARPSIZE);

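      // Diagonal tiles (iatomStart == jatomStart) pair a warp's atoms with
      // themselves: phases 1 and 3 start at t=1 and pre-shuffle to skip the
      // self pair, while phase 2 keeps t=0 so the self term can contribute
      // the GB self energy.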
      const bool diag_tile = (iatomStart == jatomStart);
      const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
      int t = (phase != 2 && diag_tile) ? 1 : 0;
      if (phase != 2 && diag_tile) {
        inp.shuffleNext();
      }

      GBISResults<phase> resJ;
      resJ.init();

      for (; t < WARPSIZE; t++) {
        int j = (t + wid) & modval;

        float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
        float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
        float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;

        float r2 = dx*dx + dy*dy + dz*dz;

        if (j < nloopj && wid < nloopi) {
          calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
        }

        inp.shuffleNext();
        resJ.shuffleNext();
      } // t

      // Write j
      writeResult(jatomStart + wid, resJ, out, forces);

    } // jtile

    // Write i
    writeResult(iatomStart + wid, resI, out, forces);

    // Reduce energy
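    // Warp-level sum of the per-thread energies in double precision via
    // cub::WarpReduce; lane 0 stores the tile list's total in virialEnergy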
    if (doEnergy) {
      typedef cub::WarpReduce<double> WarpReduceDouble;
      __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
      int warpId = threadIdx.x / WARPSIZE;
      volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
      if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
    }

  }
}

#undef GBIS_CUDA

// ##############################################################################################
// ##############################################################################################
// ##############################################################################################

//
// Class constructor
//
CudaComputeGBISKernel::CudaComputeGBISKernel(int deviceID) : deviceID(deviceID) {
  cudaCheck(cudaSetDevice(deviceID));

  intRad0 = NULL;
  intRad0Size = 0;

  intRadS = NULL;
  intRadSSize = 0;

  psiSum = NULL;
  psiSumSize = 0;

  bornRad = NULL;
  bornRadSize = 0;

  dEdaSum = NULL;
  dEdaSumSize = 0;

  dHdrPrefix = NULL;
  dHdrPrefixSize = 0;

}

//
// Class destructor
//
CudaComputeGBISKernel::~CudaComputeGBISKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  if (intRad0 != NULL) deallocate_device<float>(&intRad0);
  if (intRadS != NULL) deallocate_device<float>(&intRadS);
  if (psiSum != NULL) deallocate_device<float>(&psiSum);
  if (bornRad != NULL) deallocate_device<float>(&bornRad);
  if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
  if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
}

//
// Update (copy Host->Device) intRad0, intRadS
//
void CudaComputeGBISKernel::updateIntRad(const int atomStorageSize, float* intRad0H, float* intRadSH,
  cudaStream_t stream) {

  reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
  reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);

  copy_HtoD<float>(intRad0H, intRad0, atomStorageSize, stream);
  copy_HtoD<float>(intRadSH, intRadS, atomStorageSize, stream);
}

//
// Update (copy Host->Device) bornRad
//
void CudaComputeGBISKernel::updateBornRad(const int atomStorageSize, float* bornRadH, cudaStream_t stream) {
  reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(bornRadH, bornRad, atomStorageSize, stream);
}

//
// Update (copy Host->Device) dHdrPrefix
//
void CudaComputeGBISKernel::update_dHdrPrefix(const int atomStorageSize, float* dHdrPrefixH, cudaStream_t stream) {
  reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(dHdrPrefixH, dHdrPrefix, atomStorageSize, stream);
}

//
// Phase 1
//
void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum,
  cudaStream_t stream) {

  reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(psiSum, atomStorageSize, stream);

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<1> param;
  param.a_cut = a_cut;

  // Phases 1 and 3 interact out to the Born-radius cutoff a_cut padded by
  // FS_MAX (defined in ComputeGBIS.inl)
  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>>
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, NULL, psiSum, NULL, NULL);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize, stream);
}

//
// Phase 2
//
void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const bool doEnergy, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float r_cut, const float scaling, const float kappa, const float smoothDist,
  const float epsilon_p, const float epsilon_s,
  float4* d_forces, float* h_dEdaSum, cudaStream_t stream) {

  reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(dEdaSum, atomStorageSize, stream);

  if (doEnergy) {
    tlKernel.setTileListVirialEnergyGBISLength(tlKernel.getNumTileListsGBIS());
  }

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;

  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<2> param;
  param.r_cut2 = r_cut*r_cut;
  param.r_cut_2 = 1.f / param.r_cut2;
  param.r_cut_4 = 4.f*param.r_cut_2*param.r_cut_2;
  param.epsilon_s_i = 1.f / epsilon_s;
  param.epsilon_p_i = 1.f / epsilon_p;
  param.scaling = scaling;
  param.kappa = kappa;
  param.smoothDist = smoothDist;

#define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \
  <<< nblock, nthread, 0, stream >>> \
  (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
  tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
  param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
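// Phase 2 passes bornRad in the inp1 slot and NULL for the unused inp2/inp3
// slots, and reuses param.r_cut2 as the kernel's cutoff2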

  if (!doEnergy && !doSlow) CALL(false, false);
  if (!doEnergy &&  doSlow) CALL(false, true);
  if ( doEnergy && !doSlow) CALL(true, false);
  if ( doEnergy &&  doSlow) CALL(true, true);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize, stream);
}

//
// Phase 3
//
void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut,
  float4* d_forces, cudaStream_t stream) {

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<3> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>>
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);

  cudaCheck(cudaGetLastError());
}
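
//
// Typical host-side call sequence (illustrative sketch only; "gbisKernel" and
// the host-side buffers are hypothetical names, and the Born-radius math
// between the phases happens elsewhere in NAMD):
//
//   gbisKernel.updateIntRad(numAtoms, intRad0H, intRadSH, stream);
//   gbisKernel.GBISphase1(tlKernel, numAtoms, lata, latb, latc, a_cut, h_psiSum, stream);
//   // ...host turns psiSum into Born radii...
//   gbisKernel.updateBornRad(numAtoms, bornRadH, stream);
//   gbisKernel.GBISphase2(tlKernel, numAtoms, doEnergy, doSlow, lata, latb, latc,
//                         r_cut, scaling, kappa, smoothDist, epsilon_p, epsilon_s,
//                         d_forces, h_dEdaSum, stream);
//   // ...host folds dEdaSum into dHdrPrefix...
//   gbisKernel.update_dHdrPrefix(numAtoms, dHdrPrefixH, stream);
//   gbisKernel.GBISphase3(tlKernel, numAtoms, lata, latb, latc, a_cut, d_forces, stream);
//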