NAMD
ComputeBondedCUDAKernel.cu
1 #ifdef WIN32
2 #define _USE_MATH_DEFINES
3 #define __thread __declspec(thread)
4 #endif
5 #include <math.h>
6 #include <cuda.h>
7 #if __CUDACC_VER_MAJOR__ >= 11
8 #include <cub/cub.cuh>
9 #else
10 #include <namd_cub/cub.cuh>
11 #endif
12 #include "ComputeBondedCUDAKernel.h"
13 
14 #ifdef FUTURE_CUDADEVICE
15 #include "CudaDevice.h"
16 #else
17 #include "DeviceCUDA.h"
18 extern __thread DeviceCUDA *deviceCUDA;
19 #endif
20 
21 __device__ inline long long int lliroundf(float f)
22 {
23  long long int l;
24  asm("cvt.rni.s64.f32 %0, %1;" : "=l"(l) : "f"(f));
25  return l;
26 }
27 
28 __device__ inline unsigned long long int llitoulli(long long int l)
29 {
30  unsigned long long int u;
31  asm("mov.b64 %0, %1;" : "=l"(u) : "l"(l));
32  return u;
33 }
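// Helpers for fixed-point force accumulation: a float force component is
// scaled by float_to_force, rounded to a signed 64-bit integer (lliroundf,
// cvt.rni.s64.f32), and its bit pattern is reinterpreted as unsigned
// (llitoulli) so it can be summed with atomicAdd on unsigned long long.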
34 
35 template <typename T>
36 __forceinline__ __device__
37 void convertForces(const float x, const float y, const float z,
38  T &Tx, T &Ty, T &Tz) {
39 
40  Tx = (T)(x);
41  Ty = (T)(y);
42  Tz = (T)(z);
43 }
44 
45 template <>
46 __forceinline__ __device__
47 void convertForces<long long int>(const float x, const float y, const float z,
48  long long int &Tx, long long int &Ty, long long int &Tz) {
49 
50  Tx = lliroundf(x*float_to_force);
51  Ty = lliroundf(y*float_to_force);
52  Tz = lliroundf(z*float_to_force);
53 }
54 
55 template <typename T>
56 __forceinline__ __device__
57 void calcComponentForces(
58  float fij,
59  const float dx, const float dy, const float dz,
60  T &fxij, T &fyij, T &fzij) {
61 
62  fxij = (T)(fij*dx);
63  fyij = (T)(fij*dy);
64  fzij = (T)(fij*dz);
65 
66 }
67 
68 template <>
69 __forceinline__ __device__
70 void calcComponentForces<long long int>(
71  float fij,
72  const float dx, const float dy, const float dz,
73  long long int &fxij,
74  long long int &fyij,
75  long long int &fzij) {
76 
77  fij *= float_to_force;
78  fxij = lliroundf(fij*dx);
79  fyij = lliroundf(fij*dy);
80  fzij = lliroundf(fij*dz);
81 
82 }
83 
84 template <typename T>
85 __forceinline__ __device__
86 void calcComponentForces(
87  float fij1,
88  const float dx1, const float dy1, const float dz1,
89  float fij2,
90  const float dx2, const float dy2, const float dz2,
91  T &fxij, T &fyij, T &fzij) {
92 
93  fxij = (T)(fij1*dx1 + fij2*dx2);
94  fyij = (T)(fij1*dy1 + fij2*dy2);
95  fzij = (T)(fij1*dz1 + fij2*dz2);
96 }
97 
98 template <>
99 __forceinline__ __device__
100 void calcComponentForces<long long int>(
101  float fij1,
102  const float dx1,
103  const float dy1,
104  const float dz1,
105  float fij2,
106  const float dx2,
107  const float dy2,
108  const float dz2,
109  long long int &fxij,
110  long long int &fyij,
111  long long int &fzij) {
112 
113  fij1 *= float_to_force;
114  fij2 *= float_to_force;
115  fxij = lliroundf(fij1*dx1 + fij2*dx2);
116  fyij = lliroundf(fij1*dy1 + fij2*dy2);
117  fzij = lliroundf(fij1*dz1 + fij2*dz2);
118 }
119 
120 template <typename T>
121 __forceinline__ __device__
122 void storeForces(const T fx, const T fy, const T fz,
123  const int ind, const int stride,
124  T* force) {
125  // The generic version cannot be used; only the specialization below is valid
126 }
127 
128 // Template specialization for 64bit integer = "long long int"
129 template <>
130 __forceinline__ __device__
131 void storeForces <long long int> (const long long int fx,
132  const long long int fy,
133  const long long int fz,
134  const int ind, const int stride,
135  long long int* force) {
136  atomicAdd((unsigned long long int *)&force[ind ], llitoulli(fx));
137  atomicAdd((unsigned long long int *)&force[ind + stride ], llitoulli(fy));
138  atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
139 }
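// Forces are stored in struct-of-arrays layout: x at force[ind], y at
// force[ind + stride], z at force[ind + 2*stride]. Accumulating the
// fixed-point values with 64-bit integer atomics makes the summation
// order-independent and hence deterministic.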
140 
141 //
142 // Calculates bonds
143 //
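// Harmonic bond: E = k*scale*(r - x0)^2, so dE/dr = 2*k*scale*(r - x0).
// Below, fij first holds k*scale*db (used for the energy as fij*db) and is
// then rescaled by -2/r to form the force prefactor applied to (xij,yij,zij).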
144 template <typename T, bool doEnergy, bool doVirial>
145 __device__ void bondForce(
146  const int index,
147  const CudaBond* __restrict__ bondList,
148  const CudaBondValue* __restrict__ bondValues,
149  const float4* __restrict__ xyzq,
150  const int stride,
151  const float3 lata, const float3 latb, const float3 latc,
152  T* __restrict__ force, double &energy,
153 #ifdef WRITE_FULL_VIRIALS
154  ComputeBondedCUDAKernel::BondedVirial<double>& virial
155 #else
156  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
157 #endif
158  ) {
159 
160  CudaBond bl = bondList[index];
161  CudaBondValue bondValue = bondValues[bl.itype];
162  if (bondValue.x0 == 0.0f) return;
163 
164  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
165  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
166  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
167 
168  float4 xyzqi = xyzq[bl.i];
169  float4 xyzqj = xyzq[bl.j];
170 
171  float xij = xyzqi.x + shx - xyzqj.x;
172  float yij = xyzqi.y + shy - xyzqj.y;
173  float zij = xyzqi.z + shz - xyzqj.z;
174 
175  float r = sqrtf(xij*xij + yij*yij + zij*zij);
176 
177  float db = r - bondValue.x0;
178  if (bondValue.x1) {
179  // in this case, the bond represents a harmonic wall potential
180  // where x0 is the lower wall and x1 is the upper
181  db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
182  }
183  float fij = db * bondValue.k * bl.scale;
184 
185  if (doEnergy) {
186  energy += (double)(fij*db);
187  }
188  fij *= -2.0f/r;
189 
190  T T_fxij, T_fyij, T_fzij;
191  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
192 
193  // Store forces
194  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force);
195  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force);
196 
197  // Store virial
198  if (doVirial) {
199 #ifdef WRITE_FULL_VIRIALS
200  float fxij = fij*xij;
201  float fyij = fij*yij;
202  float fzij = fij*zij;
203  virial.xx = (fxij*xij);
204  virial.xy = (fxij*yij);
205  virial.xz = (fxij*zij);
206  virial.yx = (fyij*xij);
207  virial.yy = (fyij*yij);
208  virial.yz = (fyij*zij);
209  virial.zx = (fzij*xij);
210  virial.zy = (fzij*yij);
211  virial.zz = (fzij*zij);
212 #endif
213  }
214 }
215 
216 //
217 // Calculates modified exclusions
218 //
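// Modified 1-4 exclusions: the electrostatic interaction is scaled by
// one_scale14 = 1 - scale14, and the LJ A/B coefficients are read from
// vdwCoefTable (or its texture on older architectures). Force and energy
// factors are fetched from 1D textures indexed by rinv = 1/r; the slow
// (long-range electrostatic) part is only computed when doSlow && doElect.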
219 template <typename T, bool doEnergy, bool doVirial, bool doElect>
220 __device__ void modifiedExclusionForce(
221  const int index,
222  const CudaExclusion* __restrict__ exclusionList,
223  const bool doSlow,
224  const float one_scale14, // 1 - scale14
225  const int vdwCoefTableWidth,
226 #if __CUDA_ARCH__ >= 350
227  const float2* __restrict__ vdwCoefTable,
228 #else
229  cudaTextureObject_t vdwCoefTableTex,
230 #endif
231  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
232  const float4* __restrict__ xyzq,
233  const int stride,
234  const float3 lata, const float3 latb, const float3 latc,
235  const float cutoff2,
236  double &energyVdw,
237  T* __restrict__ forceNbond, double &energyNbond,
238  T* __restrict__ forceSlow, double &energySlow,
239 #ifdef WRITE_FULL_VIRIALS
240  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
241  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
242 #else
243  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
244  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
245 #endif
246  ) {
247 
248  CudaExclusion bl = exclusionList[index];
249 
250  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
251  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
252  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
253 
254  float4 xyzqi = xyzq[bl.i];
255  float4 xyzqj = xyzq[bl.j];
256 
257  float xij = xyzqi.x + shx - xyzqj.x;
258  float yij = xyzqi.y + shy - xyzqj.y;
259  float zij = xyzqi.z + shz - xyzqj.z;
260 
261  float r2 = xij*xij + yij*yij + zij*zij;
262  if (r2 < cutoff2) {
263 
264  float rinv = rsqrtf(r2);
265 
266  float qq;
267  if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
268 
269  int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
270 #if __CUDA_ARCH__ >= 350
271  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
272 #else
273  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
274 #endif
275 
276  float4 fi = tex1D<float4>(forceTableTex, rinv);
277  float4 ei;
278  if (doEnergy) {
279  ei = tex1D<float4>(energyTableTex, rinv);
280  energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
281  if (doElect) {
282  energyNbond += qq * ei.x;
283  if (doSlow) energySlow += qq * ei.w;
284  }
285  }
286 
287  float fNbond;
288  if (doElect) {
289  fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
290  } else {
291  fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
292  }
293  T T_fxij, T_fyij, T_fzij;
294  calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
295  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond);
296  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond);
297 
298  float fSlow;
299  if (doSlow && doElect) {
300  fSlow = -qq * fi.w;
301  T T_fxij, T_fyij, T_fzij;
302  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
303  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
304  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);
305  }
306 
307  // Store virial
308  if (doVirial) {
309 #ifdef WRITE_FULL_VIRIALS
310  float fxij = fNbond*xij;
311  float fyij = fNbond*yij;
312  float fzij = fNbond*zij;
313  virialNbond.xx = (fxij*xij);
314  virialNbond.xy = (fxij*yij);
315  virialNbond.xz = (fxij*zij);
316  virialNbond.yx = (fyij*xij);
317  virialNbond.yy = (fyij*yij);
318  virialNbond.yz = (fyij*zij);
319  virialNbond.zx = (fzij*xij);
320  virialNbond.zy = (fzij*yij);
321  virialNbond.zz = (fzij*zij);
322 #endif
323  }
324 
325  // Store virial
326  if (doVirial && doSlow && doElect) {
327 #ifdef WRITE_FULL_VIRIALS
328  float fxij = fSlow*xij;
329  float fyij = fSlow*yij;
330  float fzij = fSlow*zij;
331  virialSlow.xx = (fxij*xij);
332  virialSlow.xy = (fxij*yij);
333  virialSlow.xz = (fxij*zij);
334  virialSlow.yx = (fyij*xij);
335  virialSlow.yy = (fyij*yij);
336  virialSlow.yz = (fyij*zij);
337  virialSlow.zx = (fzij*xij);
338  virialSlow.zy = (fzij*yij);
339  virialSlow.zz = (fzij*zij);
340 #endif
341  }
342 
343  }
344 }
345 
346 //
347 // Calculates exclusions. Here doSlow = true
348 //
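// Full exclusions compute the slow (reciprocal-space) electrostatic
// correction for excluded pairs. The table index is derived from the
// exponent bits of r2 (r2i.i >> 17), and the tabulated cubic coefficients
// in `slow` are evaluated at diffa = r2 - r2_table_val to obtain the energy
// and force.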
349 template <typename T, bool doEnergy, bool doVirial>
350 __device__ void exclusionForce(
351  const int index,
352  const CudaExclusion* __restrict__ exclusionList,
353  const float r2_delta, const int r2_delta_expc,
354 #if __CUDA_ARCH__ >= 350
355  const float* __restrict__ r2_table,
356  const float4* __restrict__ exclusionTable,
357 #else
358  cudaTextureObject_t r2_table_tex,
359  cudaTextureObject_t exclusionTableTex,
360 #endif
361  const float4* __restrict__ xyzq,
362  const int stride,
363  const float3 lata, const float3 latb, const float3 latc,
364  const float cutoff2,
365  T* __restrict__ forceSlow, double &energySlow,
366 #ifdef WRITE_FULL_VIRIALS
367  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
368 #else
369  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
370 #endif
371  ) {
372 
373  CudaExclusion bl = exclusionList[index];
374 
375  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
376  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
377  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
378 
379  float4 xyzqi = xyzq[bl.i];
380  float4 xyzqj = xyzq[bl.j];
381 
382  float xij = xyzqi.x + shx - xyzqj.x;
383  float yij = xyzqi.y + shy - xyzqj.y;
384  float zij = xyzqi.z + shz - xyzqj.z;
385 
386  float r2 = xij*xij + yij*yij + zij*zij;
387  if (r2 < cutoff2) {
388  r2 += r2_delta;
389 
390  union { float f; int i; } r2i;
391  r2i.f = r2;
392  int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
393 
394 #if __CUDA_ARCH__ >= 350
395  float r2_table_val = __ldg(&r2_table[table_i]);
396 #else
397  float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
398 #endif
399  float diffa = r2 - r2_table_val;
400  float qq = xyzqi.w * xyzqj.w;
401 
402 #if __CUDA_ARCH__ >= 350
403  float4 slow = __ldg(&exclusionTable[table_i]);
404 #else
405  float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
406 #endif
407 
408  if (doEnergy) {
409  energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
410  }
411 
412  float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
413 
414  T T_fxij, T_fyij, T_fzij;
415  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
416  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
417  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);
418 
419  // Store virial
420  if (doVirial) {
421 #ifdef WRITE_FULL_VIRIALS
422  float fxij = fSlow*xij;
423  float fyij = fSlow*yij;
424  float fzij = fSlow*zij;
425  virialSlow.xx = (fxij*xij);
426  virialSlow.xy = (fxij*yij);
427  virialSlow.xz = (fxij*zij);
428  virialSlow.yx = (fyij*xij);
429  virialSlow.yy = (fyij*yij);
430  virialSlow.yz = (fyij*zij);
431  virialSlow.zx = (fzij*xij);
432  virialSlow.zy = (fzij*yij);
433  virialSlow.zz = (fzij*zij);
434 #endif
435  }
436  }
437 }
438 
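//
// Calculates angles
//
// Harmonic angle: E = k*(theta - theta0)^2 (k is pre-scaled by al.scale),
// with an optional Urey-Bradley 1-3 term E_ub = k_ub*(r13 - r_ub)^2 when
// k_ub != 0. When angleValue.normal == 0 the potential is harmonic in
// cos(theta) rather than in theta.
//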
439 template <typename T, bool doEnergy, bool doVirial>
440 __device__ void angleForce(const int index,
441  const CudaAngle* __restrict__ angleList,
442  const CudaAngleValue* __restrict__ angleValues,
443  const float4* __restrict__ xyzq,
444  const int stride,
445  const float3 lata, const float3 latb, const float3 latc,
446  T* __restrict__ force, double &energy,
447 #ifdef WRITE_FULL_VIRIALS
448  ComputeBondedCUDAKernel::BondedVirial<double>& virial
449 #else
450  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
451 #endif
452  ) {
453 
454  CudaAngle al = angleList[index];
455 
456  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
457  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
458  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
459 
460  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
461  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
462  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
463 
464  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
465  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
466  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
467 
468  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
469  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
470  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
471 
472  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
473  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
474 
475  float xijr = xij*rij_inv;
476  float yijr = yij*rij_inv;
477  float zijr = zij*rij_inv;
478  float xkjr = xkj*rkj_inv;
479  float ykjr = ykj*rkj_inv;
480  float zkjr = zkj*rkj_inv;
481  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
482 
483  CudaAngleValue angleValue = angleValues[al.itype];
484  angleValue.k *= al.scale;
485 
486  float diff;
487  if (angleValue.normal == 1) {
488  // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
489  cos_theta = min(0.999f, max(-0.999f, cos_theta));
490  float theta = acosf(cos_theta);
491  diff = theta - angleValue.theta0;
492  } else {
493  diff = cos_theta - angleValue.theta0;
494  }
495 
496  if (doEnergy) {
497  energy += (double)(angleValue.k * diff * diff);
498  }
499  if (angleValue.normal == 1) {
500  float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
501  if (inv_sin_theta > 1.0e6) {
502  diff = (diff < 0.0f) ? 1.0f : -1.0f;
503  } else {
504  diff *= -inv_sin_theta;
505  }
506  }
507  diff *= -2.0f*angleValue.k;
508 
509  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
510  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
511  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
512  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
513  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
514  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
515 
516  T T_dtxi, T_dtyi, T_dtzi;
517  T T_dtxj, T_dtyj, T_dtzj;
518  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
519  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
520  T T_dtxk = -T_dtxi - T_dtxj;
521  T T_dtyk = -T_dtyi - T_dtyj;
522  T T_dtzk = -T_dtzi - T_dtzj;
523  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force);
524 
525  if (angleValue.k_ub) {
526  float xik = xij - xkj;
527  float yik = yij - ykj;
528  float zik = zij - zkj;
529  float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
530  float rik = 1.0f/rik_inv;
531  float diff_ub = rik - angleValue.r_ub;
532  if (doEnergy) {
533  energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
534  }
535  diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
536  T T_dxik, T_dyik, T_dzik;
537  calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
538  T_dtxi += T_dxik;
539  T_dtyi += T_dyik;
540  T_dtzi += T_dzik;
541  T_dtxj -= T_dxik;
542  T_dtyj -= T_dyik;
543  T_dtzj -= T_dzik;
544  }
545 
546  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force);
547  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force);
548 
549  // Store virial
550  if (doVirial) {
551 #ifdef WRITE_FULL_VIRIALS
552  float fxi = ((float)T_dtxi)*force_to_float;
553  float fyi = ((float)T_dtyi)*force_to_float;
554  float fzi = ((float)T_dtzi)*force_to_float;
555  float fxk = ((float)T_dtxj)*force_to_float;
556  float fyk = ((float)T_dtyj)*force_to_float;
557  float fzk = ((float)T_dtzj)*force_to_float;
558  virial.xx = (fxi*xij) + (fxk*xkj);
559  virial.xy = (fxi*yij) + (fxk*ykj);
560  virial.xz = (fxi*zij) + (fxk*zkj);
561  virial.yx = (fyi*xij) + (fyk*xkj);
562  virial.yy = (fyi*yij) + (fyk*ykj);
563  virial.yz = (fyi*zij) + (fyk*zkj);
564  virial.zx = (fzi*xij) + (fzk*xkj);
565  virial.zy = (fzi*yij) + (fzk*ykj);
566  virial.zz = (fzi*zij) + (fzk*zkj);
567 #endif
568  }
569 }
570 
571 //
572 // Dihedral computation
573 //
574 // Out: df, e
575 //
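// Each dihedral type may contain several terms stored consecutively in
// dihedralValues; the low bit of n flags that another term follows. For a
// term with multiplicity n > 0 the energy is k*(1 + cos(n*phi - delta)),
// otherwise the term is harmonic in (phi - delta) wrapped into [-pi, pi].
// df accumulates -dE/dphi.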
576 template <bool doEnergy>
577 __forceinline__ __device__
578 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
579  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
580 
581  df = 0.0f;
582  if (doEnergy) e = 0.0;
583 
584  float phi = atan2f(sin_phi, cos_phi);
585 
586  bool lrep = true;
587  while (lrep) {
588  CudaDihedralValue dihedralValue = dihedralValues[ic];
589  dihedralValue.k *= scale;
590 
591  // The last dihedral term has the low bit of n cleared; a set bit means another term follows
592  lrep = (dihedralValue.n & 1);
593  dihedralValue.n >>= 1;
594  if (dihedralValue.n) {
595  float nf = dihedralValue.n;
596  float x = nf*phi - dihedralValue.delta;
597  if (doEnergy) {
598  float sin_x, cos_x;
599  sincosf(x, &sin_x, &cos_x);
600  e += (double)(dihedralValue.k*(1.0f + cos_x));
601  df += (double)(nf*dihedralValue.k*sin_x);
602  } else {
603  float sin_x = sinf(x);
604  df += (double)(nf*dihedralValue.k*sin_x);
605  }
606  } else {
607  float diff = phi - dihedralValue.delta;
608  if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
609  if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
610  if (doEnergy) {
611  e += (double)(dihedralValue.k*diff*diff);
612  }
613  df -= 2.0f*dihedralValue.k*diff;
614  }
615  ic++;
616  }
617 
618 }
619 
620 
621 template <typename T, bool doEnergy, bool doVirial>
622 __device__ void diheForce(const int index,
623  const CudaDihedral* __restrict__ diheList,
624  const CudaDihedralValue* __restrict__ dihedralValues,
625  const float4* __restrict__ xyzq,
626  const int stride,
627  const float3 lata, const float3 latb, const float3 latc,
628  T* __restrict__ force, double &energy,
629 #ifdef WRITE_FULL_VIRIALS
630  ComputeBondedCUDAKernel::BondedVirial<double>& virial
631 #else
632  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
633 #endif
634  ) {
635 
636  CudaDihedral dl = diheList[index];
637 
638  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
639  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
640  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
641 
642  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
643  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
644  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
645 
646  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
647  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
648  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
649 
650  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
651  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
652  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
653 
654  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
655  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
656  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
657 
658  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
659  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
660  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
661 
662  // A=F^G, B=H^G.
663  float ax = yij*zjk - zij*yjk;
664  float ay = zij*xjk - xij*zjk;
665  float az = xij*yjk - yij*xjk;
666  float bx = ylk*zjk - zlk*yjk;
667  float by = zlk*xjk - xlk*zjk;
668  float bz = xlk*yjk - ylk*xjk;
669 
670  float ra2 = ax*ax + ay*ay + az*az;
671  float rb2 = bx*bx + by*by + bz*bz;
672  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
673 
674  // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
675  // nlinear = nlinear + 1
676  // endif
677 
678  float rgr = 1.0f / rg;
679  float ra2r = 1.0f / ra2;
680  float rb2r = 1.0f / rb2;
681  float rabr = sqrtf(ra2r*rb2r);
682 
683  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
684  //
685  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
686  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
687  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
688  //
689  // Energy and derivative contributions.
690 
691  float df;
692  double e;
693  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
694 
695  if (doEnergy) energy += e;
696 
697  //
698  // Compute derivatives wrt Cartesian coordinates.
699  //
700  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
701  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
702 
703  float fg = xij*xjk + yij*yjk + zij*zjk;
704  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
705  ra2r *= df;
706  rb2r *= df;
707  float fga = fg*ra2r*rgr;
708  float hgb = hg*rb2r*rgr;
709  float gaa = ra2r*rg;
710  float gbb = rb2r*rg;
711  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
712 
713  // Store forces
714  T T_fix, T_fiy, T_fiz;
715  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
716  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force);
717 
718  T dgx, dgy, dgz;
719  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
720  T T_fjx = dgx - T_fix;
721  T T_fjy = dgy - T_fiy;
722  T T_fjz = dgz - T_fiz;
723  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force);
724 
725  T dhx, dhy, dhz;
726  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
727  T T_fkx = -dhx - dgx;
728  T T_fky = -dhy - dgy;
729  T T_fkz = -dhz - dgz;
730  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force);
731  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force);
732 
733  // Store virial
734  if (doVirial) {
735 #ifdef WRITE_FULL_VIRIALS
736  float fix = ((float)T_fix)*force_to_float;
737  float fiy = ((float)T_fiy)*force_to_float;
738  float fiz = ((float)T_fiz)*force_to_float;
739  float fjx = ((float)dgx)*force_to_float;
740  float fjy = ((float)dgy)*force_to_float;
741  float fjz = ((float)dgz)*force_to_float;
742  float flx = ((float)dhx)*force_to_float;
743  float fly = ((float)dhy)*force_to_float;
744  float flz = ((float)dhz)*force_to_float;
745  virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
746  virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
747  virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
748  virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
749  virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
750  virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
751  virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
752  virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
753  virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
754 #endif
755  }
756 
757 }
758 
759 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
760  return make_float3(
761  v1.y*v2.z-v2.y*v1.z,
762  v2.x*v1.z-v1.x*v2.z,
763  v1.x*v2.y-v2.x*v1.y
764  );
765 }
766 
767 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
768  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
769 }
770 
771 //
772 // Calculates crossterms
773 //
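// CMAP crossterm: the correction energy is a bicubic spline in the two
// dihedral angles phi (atoms 1-2-3-4) and psi (atoms 5-6-7-8). Each angle
// is shifted into [0, 2*pi), converted to grid units, and the 16 spline
// coefficients c[] of the enclosing grid cell are evaluated to obtain the
// energy and the derivatives dEdphi and dEdpsi, which are then distributed
// onto the eight atoms as forces.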
774 template <typename T, bool doEnergy, bool doVirial>
775 __device__ void crosstermForce(
776  const int index,
777  const CudaCrossterm* __restrict__ crosstermList,
778  const CudaCrosstermValue* __restrict__ crosstermValues,
779  const float4* __restrict__ xyzq,
780  const int stride,
781  const float3 lata, const float3 latb, const float3 latc,
782  T* __restrict__ force, double &energy,
783 #ifdef WRITE_FULL_VIRIALS
784  ComputeBondedCUDAKernel::BondedVirial<double>& virial
785 #else
786  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
787 #endif
788  ) {
789 
790  CudaCrossterm cl = crosstermList[index];
791 
792  // ----------------------------------------------------------------------------
793  // Dihedral angle between atoms 1 - 2 - 3 - 4
794  //
795  // 1 - 2
796  float3 sh12 = make_float3(
797  cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
798  cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
799  cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
800 
801  float4 xyzq1 = xyzq[cl.i1];
802  float4 xyzq2 = xyzq[cl.i2];
803 
804  float3 r12 = make_float3(
805  xyzq1.x + sh12.x - xyzq2.x,
806  xyzq1.y + sh12.y - xyzq2.y,
807  xyzq1.z + sh12.z - xyzq2.z);
808 
809  // 2 - 3
810  float3 sh23 = make_float3(
811  cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
812  cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
813  cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
814 
815  float4 xyzq3 = xyzq[cl.i3];
816 
817  float3 r23 = make_float3(
818  xyzq2.x + sh23.x - xyzq3.x,
819  xyzq2.y + sh23.y - xyzq3.y,
820  xyzq2.z + sh23.z - xyzq3.z);
821 
822  // 3 - 4
823  float3 sh34 = make_float3(
824  cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
825  cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
826  cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
827 
828  float4 xyzq4 = xyzq[cl.i4];
829 
830  float3 r34 = make_float3(
831  xyzq3.x + sh34.x - xyzq4.x,
832  xyzq3.y + sh34.y - xyzq4.y,
833  xyzq3.z + sh34.z - xyzq4.z);
834 
835  // Calculate the cross products
836  float3 A = cross(r12, r23);
837  float3 B = cross(r23, r34);
838  float3 C = cross(r23, A);
839 
840  // Calculate the inverse distances
841  float inv_rA = rsqrtf(dot(A, A));
842  float inv_rB = rsqrtf(dot(B, B));
843  float inv_rC = rsqrtf(dot(C, C));
844 
845  // Calculate the sin and cos
846  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
847  float sin_phi = dot(C, B)*(inv_rC*inv_rB);
848 
849  float phi = -atan2f(sin_phi,cos_phi);
850 
851  // ----------------------------------------------------------------------------
852  // Dihedral angle between atoms 5 - 6 - 7 - 8
853  //
854 
855  // 5 - 6
856  float3 sh56 = make_float3(
857  cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
858  cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
859  cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
860 
861  float4 xyzq5 = xyzq[cl.i5];
862  float4 xyzq6 = xyzq[cl.i6];
863 
864  float3 r56 = make_float3(
865  xyzq5.x + sh56.x - xyzq6.x,
866  xyzq5.y + sh56.y - xyzq6.y,
867  xyzq5.z + sh56.z - xyzq6.z);
868 
869  // 6 - 7
870  float3 sh67 = make_float3(
871  cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
872  cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
873  cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
874 
875  float4 xyzq7 = xyzq[cl.i7];
876 
877  float3 r67 = make_float3(
878  xyzq6.x + sh67.x - xyzq7.x,
879  xyzq6.y + sh67.y - xyzq7.y,
880  xyzq6.z + sh67.z - xyzq7.z);
881 
882  // 7 - 8
883  float3 sh78 = make_float3(
884  cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
885  cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
886  cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
887 
888  float4 xyzq8 = xyzq[cl.i8];
889 
890  float3 r78 = make_float3(
891  xyzq7.x + sh78.x - xyzq8.x,
892  xyzq7.y + sh78.y - xyzq8.y,
893  xyzq7.z + sh78.z - xyzq8.z);
894 
895  // Calculate the cross products
896  float3 D = cross(r56, r67);
897  float3 E = cross(r67, r78);
898  float3 F = cross(r67, D);
899 
900  // Calculate the inverse distances
901  float inv_rD = rsqrtf(dot(D, D));
902  float inv_rE = rsqrtf(dot(E, E));
903  float inv_rF = rsqrtf(dot(F, F));
904 
905  // Calculate the sin and cos
906  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
907  float sin_psi = dot(F, E)*(inv_rF*inv_rE);
908 
909  float psi = -atan2f(sin_psi,cos_psi);
910 
911  // ----------------------------------------------------------------------------
912  // Calculate the energy
913 
914  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
915 
916  // Shift angles
917  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
918  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
919 
920  // angle measured in grid points from the smallest grid value
921  float phi_h = phi * inv_h;
922  float psi_h = psi * inv_h;
923 
924  // find smallest numbered grid point in stencil
925  int iphi = (int)floor(phi_h);
926  int ipsi = (int)floor(psi_h);
927 
928  const CudaCrosstermValue& cp = crosstermValues[cl.itype];
929 
930  // Load coefficients
931  float4 c[4];
932  c[0] = cp.c[iphi][ipsi][0];
933  c[1] = cp.c[iphi][ipsi][1];
934  c[2] = cp.c[iphi][ipsi][2];
935  c[3] = cp.c[iphi][ipsi][3];
936 
937  float dphi = phi_h - (float)iphi;
938  float dpsi = psi_h - (float)ipsi;
939 
940  if (doEnergy) {
941  float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
942  energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
943  energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
944  energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
945  energy += energyf*cl.scale;
946  }
947 
948  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
949  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
950  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
951  dEdphi *= cl.scale*inv_h;
952 
953  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
954  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
955  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
956  dEdpsi *= cl.scale*inv_h;
957 
958  // float normCross1 = dot(A, A);
959  float square_r23 = dot(r23, r23);
960  float norm_r23 = sqrtf(square_r23);
961  float inv_square_r23 = 1.0f/square_r23;
962  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
963  float ff2 = -dot(r12, r23)*inv_square_r23;
964  float ff3 = -dot(r34, r23)*inv_square_r23;
965  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
966 
967  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
968  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
969  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
970  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
971  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
972 
973  T T_f1x, T_f1y, T_f1z;
974  T T_f2x, T_f2y, T_f2z;
975  T T_f3x, T_f3y, T_f3z;
976  T T_f4x, T_f4y, T_f4z;
977  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
978  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
979  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
980  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
981  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force);
982  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force);
983  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force);
984  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force);
985 
986  float square_r67 = dot(r67, r67);
987  float norm_r67 = sqrtf(square_r67);
988  float inv_square_r67 = 1.0f/(square_r67);
989  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
990  ff2 = -dot(r56, r67)*inv_square_r67;
991  ff3 = -dot(r78, r67)*inv_square_r67;
992  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
993 
994  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
995  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
996  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
997  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
998  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
999 
1000  T T_f5x, T_f5y, T_f5z;
1001  T T_f6x, T_f6y, T_f6z;
1002  T T_f7x, T_f7y, T_f7z;
1003  T T_f8x, T_f8y, T_f8z;
1004  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1005  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1006  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1007  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1008  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force);
1009  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force);
1010  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force);
1011  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force);
1012 
1013  // Store virial
1014  if (doVirial) {
1015 #ifdef WRITE_FULL_VIRIALS
1016  float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1017  float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1018  virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1019  virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1020  virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1021  virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1022  virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1023  virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1024  virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1025  virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1026  virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1027  }
1028 #endif
1029 
1030 }
1031 
1032 #define BONDEDFORCESKERNEL_NUM_WARP 4
1033 //
1034 // Calculates all forces in a single kernel call
1035 //
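// Thread blocks are assigned to tuple types in order: bonds, angles,
// dihedrals, impropers, exclusions, crossterms. Each block handles
// blockDim.x tuples of one type; the per-type block counts (num*TB) are
// used to translate the global block index into a tuple index.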
1036 template <typename T, bool doEnergy, bool doVirial>
1037 __global__ void bondedForcesKernel(
1038  const int start,
1039 
1040  const int numBonds,
1041  const CudaBond* __restrict__ bonds,
1042  const CudaBondValue* __restrict__ bondValues,
1043 
1044  const int numAngles,
1045  const CudaAngle* __restrict__ angles,
1046  const CudaAngleValue* __restrict__ angleValues,
1047 
1048  const int numDihedrals,
1049  const CudaDihedral* __restrict__ dihedrals,
1050  const CudaDihedralValue* __restrict__ dihedralValues,
1051 
1052  const int numImpropers,
1053  const CudaDihedral* __restrict__ impropers,
1054  const CudaDihedralValue* __restrict__ improperValues,
1055 
1056  const int numExclusions,
1057  const CudaExclusion* __restrict__ exclusions,
1058 
1059  const int numCrossterms,
1060  const CudaCrossterm* __restrict__ crossterms,
1061  const CudaCrosstermValue* __restrict__ crosstermValues,
1062 
1063  const float cutoff2,
1064  const float r2_delta, const int r2_delta_expc,
1065 
1066  const float* __restrict__ r2_table,
1067  const float4* __restrict__ exclusionTable,
1068  cudaTextureObject_t r2_table_tex,
1069  cudaTextureObject_t exclusionTableTex,
1070 
1071  const float4* __restrict__ xyzq,
1072  const int stride,
1073  const float3 lata, const float3 latb, const float3 latc,
1074  T* __restrict__ force,
1075  T* __restrict__ forceSlow,
1076  double* __restrict__ energies_virials) {
1077 
1078  // Thread-block index
1079  int indexTB = start + blockIdx.x;
1080 
1081  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1082  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1083  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1084  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
1085  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
1086  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
1087 
1088  // Each thread computes a single bonded interaction.
1089  // Each thread block computes a single bonded type.
1090  double energy;
1091  int energyIndex;
1092 
1093  if (doEnergy) {
1094  energy = 0.0;
1095  energyIndex = -1;
1096  }
1097 
1098 #ifdef WRITE_FULL_VIRIALS
1099  ComputeBondedCUDAKernel::BondedVirial<double> virial;
1100  int virialIndex;
1101  if (doVirial) {
1102  virial.xx = 0.0;
1103  virial.xy = 0.0;
1104  virial.xz = 0.0;
1105  virial.yx = 0.0;
1106  virial.yy = 0.0;
1107  virial.yz = 0.0;
1108  virial.zx = 0.0;
1109  virial.zy = 0.0;
1110  virial.zz = 0.0;
1111  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
1112  }
1113 #endif
1114 
1115  if (indexTB < numBondsTB) {
1116  int i = threadIdx.x + indexTB*blockDim.x;
1117  if (doEnergy) {
1118  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
1119  }
1120  if (i < numBonds) {
1121  bondForce<T, doEnergy, doVirial>
1122  (i, bonds, bondValues, xyzq,
1123  stride, lata, latb, latc,
1124  force, energy, virial);
1125  }
1126  goto reduce;
1127  }
1128  indexTB -= numBondsTB;
1129 
1130  if (indexTB < numAnglesTB) {
1131  int i = threadIdx.x + indexTB*blockDim.x;
1132  if (doEnergy) {
1133  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
1134  }
1135  if (i < numAngles) {
1136  angleForce<T, doEnergy, doVirial>
1137  (i, angles, angleValues, xyzq, stride,
1138  lata, latb, latc,
1139  force, energy, virial);
1140  }
1141  goto reduce;
1142  }
1143  indexTB -= numAnglesTB;
1144 
1145  if (indexTB < numDihedralsTB) {
1146  int i = threadIdx.x + indexTB*blockDim.x;
1147  if (doEnergy) {
1148  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
1149  }
1150  if (doVirial) {
1151  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1152  }
1153  if (i < numDihedrals) {
1154  diheForce<T, doEnergy, doVirial>
1155  (i, dihedrals, dihedralValues, xyzq, stride,
1156  lata, latb, latc,
1157  force, energy, virial);
1158  }
1159  goto reduce;
1160  }
1161  indexTB -= numDihedralsTB;
1162 
1163  if (indexTB < numImpropersTB) {
1164  int i = threadIdx.x + indexTB*blockDim.x;
1165  if (doEnergy) {
1166  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
1167  }
1168  if (i < numImpropers) {
1169  diheForce<T, doEnergy, doVirial>
1170  (i, impropers, improperValues, xyzq, stride,
1171  lata, latb, latc,
1172  force, energy, virial);
1173  }
1174  goto reduce;
1175  }
1176  indexTB -= numImpropersTB;
1177 
1178  if (indexTB < numExclusionsTB) {
1179  int i = threadIdx.x + indexTB*blockDim.x;
1180  if (doEnergy) {
1181  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
1182  }
1183  if (doVirial) {
1184  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
1185  }
1186  if (i < numExclusions) {
1187  exclusionForce<T, doEnergy, doVirial>
1188  (i, exclusions, r2_delta, r2_delta_expc,
1189 #if __CUDA_ARCH__ >= 350
1190  r2_table, exclusionTable,
1191 #else
1192  r2_table_tex, exclusionTableTex,
1193 #endif
1194  xyzq, stride, lata, latb, latc, cutoff2,
1195  forceSlow, energy, virial);
1196  }
1197  goto reduce;
1198  }
1199  indexTB -= numExclusionsTB;
1200 
1201  if (indexTB < numCrosstermsTB) {
1202  int i = threadIdx.x + indexTB*blockDim.x;
1203  if (doEnergy) {
1204  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
1205  }
1206  if (doVirial) {
1207  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1208  }
1209  if (i < numCrossterms) {
1210  crosstermForce<T, doEnergy, doVirial>
1211  (i, crossterms, crosstermValues,
1212  xyzq, stride, lata, latb, latc,
1213  force, energy, virial);
1214  }
1215  goto reduce;
1216  }
1217  // indexTB -= numCrosstermsTB;
1218 
1219  reduce:
1220 
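// Reduction: each warp first reduces its partial sums with XOR shuffles,
// lane 0 of every warp writes its result to shared memory, and warp 0 then
// reduces across the BONDEDFORCESKERNEL_NUM_WARP partial results before a
// single atomicAdd into energies_virials.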
1221  // Write energies to global memory
1222  if (doEnergy) {
1223  // energyIndex is constant within thread-block
1224  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
1225 #pragma unroll
1226  for (int i=16;i >= 1;i/=2) {
1227  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
1228  }
1229  int laneID = (threadIdx.x & (WARPSIZE - 1));
1230  int warpID = threadIdx.x / WARPSIZE;
1231  if (laneID == 0) {
1232  shEnergy[warpID] = energy;
1233  }
1234  BLOCK_SYNC;
1235  if (warpID == 0) {
1236  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
1237 #pragma unroll
1238  for (int i=16;i >= 1;i/=2) {
1239  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
1240  }
1241  if (laneID == 0) {
1242  atomicAdd(&energies_virials[energyIndex], energy);
1243  }
1244  }
1245  }
1246 
1247  // Write virials to global memory
1248 #ifdef WRITE_FULL_VIRIALS
1249  if (doVirial) {
1250 #pragma unroll
1251  for (int i=16;i >= 1;i/=2) {
1252  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
1253  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
1254  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
1255  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
1256  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
1257  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
1258  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
1259  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
1260  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
1261  }
1262  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
1263  int laneID = (threadIdx.x & (WARPSIZE - 1));
1264  int warpID = threadIdx.x / WARPSIZE;
1265  if (laneID == 0) {
1266  shVirial[warpID] = virial;
1267  }
1268  BLOCK_SYNC;
1269 
1270  if (warpID == 0) {
1271  virial.xx = 0.0;
1272  virial.xy = 0.0;
1273  virial.xz = 0.0;
1274  virial.yx = 0.0;
1275  virial.yy = 0.0;
1276  virial.yz = 0.0;
1277  virial.zx = 0.0;
1278  virial.zy = 0.0;
1279  virial.zz = 0.0;
1280  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
1281 #pragma unroll
1282  for (int i=16;i >= 1;i/=2) {
1283  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
1284  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
1285  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
1286  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
1287  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
1288  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
1289  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
1290  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
1291  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
1292  }
1293 
1294  if (laneID == 0) {
1295 #ifdef USE_FP_VIRIAL
1296  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
1297  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
1298  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
1299  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
1300  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
1301  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
1302  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
1303  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
1304  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
1305 #else
1306  atomicAdd(&energies_virials[virialIndex + 0], virial.xx);
1307  atomicAdd(&energies_virials[virialIndex + 1], virial.xy);
1308  atomicAdd(&energies_virials[virialIndex + 2], virial.xz);
1309  atomicAdd(&energies_virials[virialIndex + 3], virial.yx);
1310  atomicAdd(&energies_virials[virialIndex + 4], virial.yy);
1311  atomicAdd(&energies_virials[virialIndex + 5], virial.yz);
1312  atomicAdd(&energies_virials[virialIndex + 6], virial.zx);
1313  atomicAdd(&energies_virials[virialIndex + 7], virial.zy);
1314  atomicAdd(&energies_virials[virialIndex + 8], virial.zz);
1315 #endif
1316  }
1317  }
1318  }
1319 #endif
1320 
1321 }
1322 
1323 template <typename T, bool doEnergy, bool doVirial, bool doElect>
1324 __global__ void modifiedExclusionForcesKernel(
1325  const int start,
1326 
1327  const int numModifiedExclusions,
1328  const CudaExclusion* __restrict__ modifiedExclusions,
1329 
1330  const bool doSlow,
1331  const float one_scale14, // 1 - scale14
1332  const float cutoff2,
1333  const int vdwCoefTableWidth,
1334  const float2* __restrict__ vdwCoefTable,
1335  cudaTextureObject_t vdwCoefTableTex,
1336  cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex,
1337 
1338  const float4* __restrict__ xyzq,
1339  const int stride,
1340  const float3 lata, const float3 latb, const float3 latc,
1341  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
1342  double* __restrict__ energies_virials
1343  ) {
1344 
1345  // index
1346  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
1347 
1348  double energyVdw, energyNbond, energySlow;
1349  if (doEnergy) {
1350  energyVdw = 0.0;
1351  if (doElect) {
1352  energyNbond = 0.0;
1353  energySlow = 0.0;
1354  }
1355  }
1356 
1357 #ifdef WRITE_FULL_VIRIALS
1358  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
1359  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
1360  if (doVirial) {
1361  virialNbond.xx = 0.0;
1362  virialNbond.xy = 0.0;
1363  virialNbond.xz = 0.0;
1364  virialNbond.yx = 0.0;
1365  virialNbond.yy = 0.0;
1366  virialNbond.yz = 0.0;
1367  virialNbond.zx = 0.0;
1368  virialNbond.zy = 0.0;
1369  virialNbond.zz = 0.0;
1370  if (doElect) {
1371  virialSlow.xx = 0.0;
1372  virialSlow.xy = 0.0;
1373  virialSlow.xz = 0.0;
1374  virialSlow.yx = 0.0;
1375  virialSlow.yy = 0.0;
1376  virialSlow.yz = 0.0;
1377  virialSlow.zx = 0.0;
1378  virialSlow.zy = 0.0;
1379  virialSlow.zz = 0.0;
1380  }
1381  }
1382 #endif
1383 
1384  if (i < numModifiedExclusions)
1385  {
1386  modifiedExclusionForce<T, doEnergy, doVirial, doElect>
1387  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
1388 #if __CUDA_ARCH__ >= 350
1389  vdwCoefTable,
1390 #else
1391  vdwCoefTableTex,
1392 #endif
1393  modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
1394  xyzq, stride, lata, latb, latc, cutoff2,
1395  energyVdw, forceNbond, energyNbond,
1396  forceSlow, energySlow, virialNbond, virialSlow);
1397  }
1398 
1399  // Write energies to global memory
1400  if (doEnergy) {
1401  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
1402  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1403  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1404 #pragma unroll
1405  for (int i=16;i >= 1;i/=2) {
1406  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
1407  if (doElect) {
1408  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
1409  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
1410  }
1411  }
1412  int laneID = (threadIdx.x & (WARPSIZE - 1));
1413  int warpID = threadIdx.x / WARPSIZE;
1414  if (laneID == 0) {
1415  shEnergyVdw[warpID] = energyVdw;
1416  if (doElect) {
1417  shEnergyNbond[warpID] = energyNbond;
1418  shEnergySlow[warpID] = energySlow;
1419  }
1420  }
1421  BLOCK_SYNC;
1422  if (warpID == 0) {
1423  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
1424  if (doElect) {
1425  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
1426  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
1427  }
1428 #pragma unroll
1429  for (int i=16;i >= 1;i/=2) {
1430  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
1431  if (doElect) {
1432  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
1433  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
1434  }
1435  }
1436  if (laneID == 0) {
1437  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
1438  if (doElect) {
1439  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
1440  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
1441  }
1442  }
1443  }
1444  }
1445 
1446  // Write virials to global memory
1447 #ifdef WRITE_FULL_VIRIALS
1448  if (doVirial) {
1449 #pragma unroll
1450  for (int i=16;i >= 1;i/=2) {
1451  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
1452  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
1453  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
1454  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
1455  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
1456  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
1457  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
1458  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
1459  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
1460  if (doElect && doSlow) {
1461  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
1462  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
1463  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
1464  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
1465  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
1466  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
1467  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
1468  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
1469  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
1470  }
1471  }
1472  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
1473  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1474  int laneID = (threadIdx.x & (WARPSIZE - 1));
1475  int warpID = threadIdx.x / WARPSIZE;
1476  if (laneID == 0) {
1477  shVirialNbond[warpID] = virialNbond;
1478  shVirialSlow[warpID] = virialSlow;
1479  }
1480  BLOCK_SYNC;
1481 
1482  virialNbond.xx = 0.0;
1483  virialNbond.xy = 0.0;
1484  virialNbond.xz = 0.0;
1485  virialNbond.yx = 0.0;
1486  virialNbond.yy = 0.0;
1487  virialNbond.yz = 0.0;
1488  virialNbond.zx = 0.0;
1489  virialNbond.zy = 0.0;
1490  virialNbond.zz = 0.0;
1491  if (doElect && doSlow) {
1492  virialSlow.xx = 0.0;
1493  virialSlow.xy = 0.0;
1494  virialSlow.xz = 0.0;
1495  virialSlow.yx = 0.0;
1496  virialSlow.yy = 0.0;
1497  virialSlow.yz = 0.0;
1498  virialSlow.zx = 0.0;
1499  virialSlow.zy = 0.0;
1500  virialSlow.zz = 0.0;
1501  }
1502 
1503  if (warpID == 0) {
1504  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
1505  virialNbond = shVirialNbond[laneID];
1506  virialSlow = shVirialSlow[laneID];
1507  }
1508 #pragma unroll
1509  for (int i=16;i >= 1;i/=2) {
1510  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
1511  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
1512  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
1513  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
1514  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
1515  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
1516  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
1517  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
1518  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
1519  if (doElect && doSlow) {
1520  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
1521  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
1522  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
1523  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
1524  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
1525  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
1526  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
1527  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
1528  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
1529  }
1530  }
1531 
1532  if (laneID == 0)
1533  {
1534 #ifdef USE_FP_VIRIAL
1535  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], llitoulli(virialNbond.xx*double_to_virial));
1536  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], llitoulli(virialNbond.xy*double_to_virial));
1537  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], llitoulli(virialNbond.xz*double_to_virial));
1538  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], llitoulli(virialNbond.yx*double_to_virial));
1539  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], llitoulli(virialNbond.yy*double_to_virial));
1540  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], llitoulli(virialNbond.yz*double_to_virial));
1541  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], llitoulli(virialNbond.zx*double_to_virial));
1542  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], llitoulli(virialNbond.zy*double_to_virial));
1543  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], llitoulli(virialNbond.zz*double_to_virial));
1544  if (doElect && doSlow) {
1545  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], llitoulli(virialSlow.xx*double_to_virial));
1546  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], llitoulli(virialSlow.xy*double_to_virial));
1547  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], llitoulli(virialSlow.xz*double_to_virial));
1548  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], llitoulli(virialSlow.yx*double_to_virial));
1549  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], llitoulli(virialSlow.yy*double_to_virial));
1550  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], llitoulli(virialSlow.yz*double_to_virial));
1551  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], llitoulli(virialSlow.zx*double_to_virial));
1552  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], llitoulli(virialSlow.zy*double_to_virial));
1553  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], llitoulli(virialSlow.zz*double_to_virial));
1554  }
1555 #else
1556  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
1557  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
1558  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
1559  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
1560  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
1561  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
1562  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
1563  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
1564  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
1565  if (doElect && doSlow) {
1566  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
1567  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
1568  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
1569  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
1570  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
1571  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
1572  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
1573  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
1574  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
1575  }
1576 #endif
1577  }
1578  }
1579  }
1580 #endif
1581 
1582 }
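// Illustrative sketch (not part of this file): the WRITE_FULL_VIRIALS branch above
// folds each double-precision virial component into a 64-bit fixed-point slot of
// energies_virials so that atomicAdd on unsigned long long sums deterministically.
// The same pattern in isolation, with `scale` standing in for double_to_virial:
__device__ __forceinline__
void accumulateFixedPoint(double v, double scale, double* slot) {
  // Convert the value to fixed point with the chosen scale factor
  long long int fixed = (long long int)(v*scale);
  // Reinterpret the slot as a 64-bit unsigned counter and add the two's-complement bits
  atomicAdd((unsigned long long int *)slot, (unsigned long long int)fixed);
}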
1583 
1584 // ##############################################################################################
1585 // ##############################################################################################
1586 // ##############################################################################################
1587 
1588 //
1589 // Class constructor
1590 //
1591 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
1592 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
1593 
1594  cudaCheck(cudaSetDevice(deviceID));
1595 
1596  tupleData = NULL;
1597  tupleDataSize = 0;
1598 
1599  numBonds = 0;
1600  numAngles = 0;
1601  numDihedrals = 0;
1602  numImpropers = 0;
1603  numModifiedExclusions = 0;
1604  numExclusions = 0;
1605  numCrossterms = 0;
1606 
1607  bondValues = NULL;
1608  angleValues = NULL;
1609  dihedralValues = NULL;
1610  improperValues = NULL;
1611  crosstermValues = NULL;
1612 
1613  xyzq = NULL;
1614  xyzqSize = 0;
1615 
1616  forces = NULL;
1617  forcesSize = 0;
1618 
1619  allocate_device<double>(&energies_virials, energies_virials_SIZE);
1620 }
1621 
1622 //
1623 // Class destructor
1624 //
1625 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
1626  cudaCheck(cudaSetDevice(deviceID));
1627 
1628  deallocate_device<double>(&energies_virials);
1629  // deallocate_device<BondedVirial>(&virial);
1630 
1631  if (tupleData != NULL) deallocate_device<char>(&tupleData);
1632  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
1633  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
1634 
1635  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
1636  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
1637  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
1638  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
1639  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
1640 }
1641 
1642 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
1643  allocate_device<CudaBondValue>(&bondValues, numBondValues);
1644  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
1645 }
1646 
1647 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
1648  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
1649  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
1650 }
1651 
1652 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
1653  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
1654  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
1655 }
1656 
1657 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
1658  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
1659  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
1660 }
1661 
1662 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
1663  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
1664  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
1665 }
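// Usage sketch (assumption, not from this file): the setup*Values() calls above each
// allocate a device-side parameter table once and copy it synchronously from the host,
// so a caller would typically issue them during initialization, e.g. (names hypothetical):
//
//   ComputeBondedCUDAKernel bondedKernel(deviceID, nonbondedTables);
//   bondedKernel.setupBondValues(numBondParams, h_bondValues);
//   bondedKernel.setupAngleValues(numAngleParams, h_angleValues);
//   bondedKernel.setupDihedralValues(numDihedralParams, h_dihedralValues);
//   bondedKernel.setupImproperValues(numImproperParams, h_improperValues);
//   bondedKernel.setupCrosstermValues(numCrosstermParams, h_crosstermValues);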
1666 
1667 //
1668 // Update bonded lists
1669 //
1670 void ComputeBondedCUDAKernel::update(
1671  const int numBondsIn,
1672  const int numAnglesIn,
1673  const int numDihedralsIn,
1674  const int numImpropersIn,
1675  const int numModifiedExclusionsIn,
1676  const int numExclusionsIn,
1677  const int numCrosstermsIn,
1678  const char* h_tupleData,
1679  cudaStream_t stream) {
1680 
1681  numBonds = numBondsIn;
1682  numAngles = numAnglesIn;
1683  numDihedrals = numDihedralsIn;
1684  numImpropers = numImpropersIn;
1685  numModifiedExclusions = numModifiedExclusionsIn;
1686  numExclusions = numExclusionsIn;
1687  numCrossterms = numCrosstermsIn;
1688 
1689  int numBondsWA = warpAlign(numBonds);
1690  int numAnglesWA = warpAlign(numAngles);
1691  int numDihedralsWA = warpAlign(numDihedrals);
1692  int numImpropersWA = warpAlign(numImpropers);
1693  int numModifiedExclusionsWA = warpAlign(numModifiedExclusions);
1694  int numExclusionsWA = warpAlign(numExclusions);
1695  int numCrosstermsWA = warpAlign(numCrossterms);
1696 
1697  int sizeTot = numBondsWA*sizeof(CudaBond) + numAnglesWA*sizeof(CudaAngle) +
1698  numDihedralsWA*sizeof(CudaDihedral) + numImpropersWA*sizeof(CudaDihedral) +
1699  numModifiedExclusionsWA*sizeof(CudaExclusion) + numExclusionsWA*sizeof(CudaExclusion) +
1700  numCrosstermsWA*sizeof(CudaCrossterm);
1701 
1702  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, 1.4f);
1703  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
1704 
1705  // Setup pointers
1706  int pos = 0;
1707  bonds = (CudaBond *)&tupleData[pos];
1708  pos += numBondsWA*sizeof(CudaBond);
1709 
1710  angles = (CudaAngle* )&tupleData[pos];
1711  pos += numAnglesWA*sizeof(CudaAngle);
1712 
1713  dihedrals = (CudaDihedral* )&tupleData[pos];
1714  pos += numDihedralsWA*sizeof(CudaDihedral);
1715 
1716  impropers = (CudaDihedral* )&tupleData[pos];
1717  pos += numImpropersWA*sizeof(CudaDihedral);
1718 
1719  modifiedExclusions = (CudaExclusion* )&tupleData[pos];
1720  pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
1721 
1722  exclusions = (CudaExclusion* )&tupleData[pos];
1723  pos += numExclusionsWA*sizeof(CudaExclusion);
1724 
1725  crossterms = (CudaCrossterm* )&tupleData[pos];
1726  pos += numCrosstermsWA*sizeof(CudaCrossterm);
1727 }
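// Illustrative sketch (assumption, not from this file): the host buffer h_tupleData
// passed to update() must be packed in the same order as the device pointers above,
// [bonds][angles][dihedrals][impropers][modifiedExclusions][exclusions][crossterms],
// with each block padded by warpAlign(), assumed here to round the tuple count up to
// a multiple of WARPSIZE. A host-side helper reproducing the total size:
static size_t tuplePackedSize(int nBonds, int nAngles, int nDihedrals, int nImpropers,
                              int nModExcl, int nExcl, int nCrossterms) {
  auto wa = [](int n) { return ((n + WARPSIZE - 1)/WARPSIZE)*WARPSIZE; };  // assumed warpAlign
  size_t pos = 0;
  pos += (size_t)wa(nBonds)*sizeof(CudaBond);            // bonds start at offset 0
  pos += (size_t)wa(nAngles)*sizeof(CudaAngle);          // angles follow
  pos += (size_t)wa(nDihedrals)*sizeof(CudaDihedral);    // dihedrals
  pos += (size_t)wa(nImpropers)*sizeof(CudaDihedral);    // impropers reuse CudaDihedral
  pos += (size_t)wa(nModExcl)*sizeof(CudaExclusion);     // modified exclusions
  pos += (size_t)wa(nExcl)*sizeof(CudaExclusion);        // full exclusions
  pos += (size_t)wa(nCrossterms)*sizeof(CudaCrossterm);  // crossterms
  return pos;  // equals sizeTot computed in update() above
}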
1728 
1729 //
1730 // Return stride for force array
1731 //
1732 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
1733 #ifdef USE_STRIDED_FORCE
1734  // Align stride to 256 bytes
1735  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
1736 #else
1737  return 1;
1738 #endif
1739 }
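// Worked example (illustrative, assuming FORCE_TYPE is long long int, 8 bytes):
// with USE_STRIDED_FORCE and atomStorageSize = 1000 the stride above evaluates to
//   ((1000*8 - 1)/256 + 1)*256/8 = 32*256/8 = 1024,
// i.e. each component array is padded from 1000 to 1024 elements so the y and z
// blocks of the force array begin on 256-byte boundaries.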
1740 
1741 //
1742 // Return size of single force array
1743 //
1744 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
1745 #ifdef USE_STRIDED_FORCE
1746  return (3*getForceStride(atomStorageSize));
1747 #else
1748  return (3*atomStorageSize);
1749 #endif
1750 }
1751 
1752 //
1753 // Return combined size of all force arrays
1754 //
1755 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
1756 
1757  int forceSize = getForceSize(atomStorageSize);
1758 
1759  if (numModifiedExclusions > 0 || numExclusions > 0) {
1760  if (doSlow) {
1761  // All three force arrays [normal, nbond, slow]
1762  forceSize *= 3;
1763  } else {
1764  // Two force arrays [normal, nbond]
1765  forceSize *= 2;
1766  }
1767  }
1768 
1769  return forceSize;
1770 }
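// Layout note (illustrative): bondedForce() below treats the forces buffer as up to
// three consecutive sub-arrays of getForceSize() elements each, [normal | nbond | slow],
// with startNbond = forceSize and startSlow = 2*forceSize. Inside one sub-array the
// x/y/z components of atom i sit at i, i + stride and i + 2*stride, matching
// storeForces() above, e.g. (hypothetical accessors):
//
//   FORCE_TYPE normalX = forces[i];
//   FORCE_TYPE nbondY  = forces[startNbond + i + stride];
//   FORCE_TYPE slowZ   = forces[startSlow + i + 2*stride];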
1771 
1772 //
1773 // Compute bonded forces
1774 //
1775 void ComputeBondedCUDAKernel::bondedForce(
1776  const double scale14, const int atomStorageSize,
1777  const bool doEnergy, const bool doVirial, const bool doSlow,
1778  const float3 lata, const float3 latb, const float3 latc,
1779  const float cutoff2, const float r2_delta, const int r2_delta_expc,
1780  const float4* h_xyzq, FORCE_TYPE* h_forces,
1781  double *h_energies_virials,
1782  cudaStream_t stream) {
1783 
1784  int forceStorageSize = getAllForceSize(atomStorageSize, true);
1785  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
1786  int forceStride = getForceStride(atomStorageSize);
1787 
1788  int forceSize = getForceSize(atomStorageSize);
1789  int startNbond = forceSize;
1790  int startSlow = 2*forceSize;
1791 
1792  // Re-allocate coordinate and force arrays if necessary
1793  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
1794  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
1795 
1796  // Copy coordinates to device
1797  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1798 
1799  // Clear force array
1800  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
1801  if (doEnergy || doVirial) {
1802  clear_device_array<double>(energies_virials, energies_virials_SIZE, stream);
1803  }
1804 
1805  float one_scale14 = (float)(1.0 - scale14);
1806 
1807  // If doSlow = false, these exclusions are not calculated
1808  int numExclusionsDoSlow = doSlow ? numExclusions : 0;
1809 
1810  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
1811 
1812  int numBondsTB = (numBonds + nthread - 1)/nthread;
1813  int numAnglesTB = (numAngles + nthread - 1)/nthread;
1814  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
1815  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
1816  int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
1817  int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
1818 
1819  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
1820  numExclusionsTB + numCrosstermsTB;
1821  int shmem_size = 0;
1822 
1823  // printf("%d %d %d %d %d %d nblock %d\n",
1824  // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
1825 
1826  int max_nblock = deviceCUDA->getMaxNumBlocks();
1827 
1828  int start = 0;
1829  while (start < nblock)
1830  {
1831  int nleft = nblock - start;
1832  int nblock_use = min(max_nblock, nleft);
1833 
1834 #define CALL(DOENERGY, DOVIRIAL) \
1835  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> \
1836  <<< nblock_use, nthread, shmem_size, stream >>> \
1837  (start, numBonds, bonds, bondValues, \
1838  numAngles, angles, angleValues, \
1839  numDihedrals, dihedrals, dihedralValues, \
1840  numImpropers, impropers, improperValues, \
1841  numExclusionsDoSlow, exclusions, \
1842  numCrossterms, crossterms, crosstermValues, \
1843  cutoff2, \
1844  r2_delta, r2_delta_expc, \
1845  cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
1846  cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
1847  xyzq, forceStride, \
1848  lata, latb, latc, \
1849  forces, &forces[startSlow], energies_virials);
1850 
1851  if (!doEnergy && !doVirial) CALL(0, 0);
1852  if (!doEnergy && doVirial) CALL(0, 1);
1853  if (doEnergy && !doVirial) CALL(1, 0);
1854  if (doEnergy && doVirial) CALL(1, 1);
1855 
1856 #undef CALL
1857  cudaCheck(cudaGetLastError());
1858 
1859  start += nblock_use;
1860  }
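// Note: the loop above caps each launch at deviceCUDA->getMaxNumBlocks() blocks and
// passes `start` so that every chunk resumes indexing the concatenated tuple lists
// (bonds, angles, dihedrals, impropers, exclusions, crossterms) where the previous
// chunk stopped.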
1861 
1863  nblock = (numModifiedExclusions + nthread - 1)/nthread;
1864 
1865  bool doElect = (one_scale14 == 0.0f) ? false : true;
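// Note: one_scale14 = 1 - scale14, so scale14 == 1.0 gives one_scale14 == 0 and leaves
// no 1-4 electrostatic correction to apply; doElect then selects the kernel variant
// that skips the electrostatic part of the modified exclusions.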
1866 
1867  start = 0;
1868  while (start < nblock)
1869  {
1870  int nleft = nblock - start;
1871  int nblock_use = min(max_nblock, nleft);
1872 
1873 #define CALL(DOENERGY, DOVIRIAL, DOELECT) \
1874  modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
1875  <<< nblock_use, nthread, shmem_size, stream >>> \
1876  (start, numModifiedExclusions, modifiedExclusions, \
1877  doSlow, one_scale14, cutoff2, \
1878  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
1879  cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
1880  cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
1881  xyzq, forceStride, lata, latb, latc, \
1882  &forces[startNbond], &forces[startSlow], energies_virials);
1883 
1884  if (!doEnergy && !doVirial && !doElect) CALL(0, 0, 0);
1885  if (!doEnergy && doVirial && !doElect) CALL(0, 1, 0);
1886  if (doEnergy && !doVirial && !doElect) CALL(1, 0, 0);
1887  if (doEnergy && doVirial && !doElect) CALL(1, 1, 0);
1888 
1889  if (!doEnergy && !doVirial && doElect) CALL(0, 0, 1);
1890  if (!doEnergy && doVirial && doElect) CALL(0, 1, 1);
1891  if (doEnergy && !doVirial && doElect) CALL(1, 0, 1);
1892  if (doEnergy && doVirial && doElect) CALL(1, 1, 1);
1893 
1894 #undef CALL
1895  cudaCheck(cudaGetLastError());
1896 
1897  start += nblock_use;
1898  }
1899 
1900  copy_DtoH<FORCE_TYPE>(forces, h_forces, forceCopySize, stream);
1901  if (doEnergy || doVirial) {
1902  copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
1903  }
1904 
1905 }
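// Usage sketch (assumption, not from this file): a plausible per-step host-side call
// sequence, with the h_* buffers, lattice vectors and cutoff parameters supplied by
// the caller (names here are hypothetical):
//
//   bondedKernel.update(nBonds, nAngles, nDihedrals, nImpropers,
//                       nModExcl, nExcl, nCrossterms, h_tupleData, stream);
//   bondedKernel.bondedForce(scale14, atomStorageSize,
//                            doEnergy, doVirial, doSlow,
//                            lata, latb, latc, cutoff2, r2_delta, r2_delta_expc,
//                            h_xyzq, h_forces, h_energies_virials, stream);
//   cudaCheck(cudaStreamSynchronize(stream));   // h_forces / h_energies_virials ready
//
// bondedForce() issues its copies and kernels asynchronously on `stream`; update()
// presumably only needs to be re-issued when the tuple lists change.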