#define _USE_MATH_DEFINES
#define __thread __declspec(thread)
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#ifdef FUTURE_CUDADEVICE
#include "CudaDevice.h"
#else
#include "DeviceCUDA.h"
extern __thread DeviceCUDA* deviceCUDA;
#endif
__device__ inline long long int lliroundf(float f)
{
  long long int l;
  asm("cvt.rni.s64.f32 %0, %1;" : "=l"(l) : "f"(f));
  return l;
}
__device__ inline unsigned long long int llitoulli(long long int l)
{
  unsigned long long int u;
  asm("mov.b64 %0, %1;" : "=l"(u) : "l"(l));
  return u;
}
template <typename T>
__forceinline__ __device__
void convertForces(const float x, const float y, const float z,
  T &Tx, T &Ty, T &Tz) {
  Tx = (T)(x);
  Ty = (T)(y);
  Tz = (T)(z);
}
template <>
__forceinline__ __device__
void convertForces<long long int>(const float x, const float y, const float z,
  long long int &Tx,
  long long int &Ty,
  long long int &Tz) {
  Tx = lliroundf(x*float_to_force);
  Ty = lliroundf(y*float_to_force);
  Tz = lliroundf(z*float_to_force);
}
template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij,
  const float dx,
  const float dy,
  const float dz,
  T &fxij, T &fyij, T &fzij) {
  fxij = (T)(fij*dx);
  fyij = (T)(fij*dy);
  fzij = (T)(fij*dz);
}
template <>
__forceinline__ __device__
void calcComponentForces<long long int>(
  float fij,
  const float dx,
  const float dy,
  const float dz,
  long long int &fxij,
  long long int &fyij,
  long long int &fzij) {
  fij *= float_to_force;
  fxij = lliroundf(fij*dx);
  fyij = lliroundf(fij*dy);
  fzij = lliroundf(fij*dz);
}
template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij1,
  const float dx1,
  const float dy1,
  const float dz1,
  float fij2,
  const float dx2,
  const float dy2,
  const float dz2,
  T &fxij, T &fyij, T &fzij) {
  fxij = (T)(fij1*dx1 + fij2*dx2);
  fyij = (T)(fij1*dy1 + fij2*dy2);
  fzij = (T)(fij1*dz1 + fij2*dz2);
}
template <>
__forceinline__ __device__
void calcComponentForces<long long int>(
  float fij1,
  const float dx1,
  const float dy1,
  const float dz1,
  float fij2,
  const float dx2,
  const float dy2,
  const float dz2,
  long long int &fxij,
  long long int &fyij,
  long long int &fzij) {
  fij1 *= float_to_force;
  fij2 *= float_to_force;
  fxij = lliroundf(fij1*dx1 + fij2*dx2);
  fyij = lliroundf(fij1*dy1 + fij2*dy2);
  fzij = lliroundf(fij1*dz1 + fij2*dz2);
}
template <typename T>
__forceinline__ __device__
void storeForces(const T fx, const T fy, const T fz,
  const int ind,
  const int stride,
  T* force) {
  atomicAdd(&force[ind           ], fx);
  atomicAdd(&force[ind + stride  ], fy);
  atomicAdd(&force[ind + stride*2], fz);
}
template <>
__forceinline__ __device__
void storeForces<long long int>(const long long int fx,
  const long long int fy,
  const long long int fz,
  const int ind,
  const int stride,
  long long int* force) {
  atomicAdd((unsigned long long int *)&force[ind           ], llitoulli(fx));
  atomicAdd((unsigned long long int *)&force[ind + stride  ], llitoulli(fy));
  atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
}
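
// A minimal sketch of the fixed-point round trip these helpers implement.
// float_to_force and force_to_float are the scaling constants declared
// elsewhere in this file; the host-side dequantization in the last line is an
// assumption for illustration, not code from this kernel:
//
//   long long int qf = lliroundf(f * float_to_force);               // quantize
//   atomicAdd((unsigned long long int *)&force[i], llitoulli(qf));  // accumulate
//   ...
//   float f_out = ((float)force_host[i]) * force_to_float;          // dequantize
//
// Integer addition is associative, so the accumulated force is bitwise
// reproducible no matter how the atomics interleave.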
//
// Calculates bonds
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void bondForce(
  const int index,
  const CudaBond* __restrict__ bondList,
  const CudaBondValue* __restrict__ bondValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaBond bl = bondList[index];
  CudaBondValue bondValue = bondValues[bl.itype];
  if (bondValue.x0 == 0.0f) return;

  // Periodic-image shift for atom i, expressed in the lattice basis
  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r = sqrtf(xij*xij + yij*yij + zij*zij);
  float db = r - bondValue.x0;
  if (bondValue.x1) {
    // in this case the bond is a harmonic wall: x0 is the lower wall and x1
    // the upper wall, with zero force between them
    db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
  }
  float fij = db * bondValue.k * bl.scale;

  if (doEnergy) {
    energy += (double)(fij*db);
  }
  fij *= -2.0f/r;
  T T_fxij, T_fyij, T_fzij;
  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);

  // Store forces
  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force);
  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force);
  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxij = fij*xij;
    float fyij = fij*yij;
    float fzij = fij*zij;
    virial.xx = (fxij*xij);
    virial.xy = (fxij*yij);
    virial.xz = (fxij*zij);
    virial.yx = (fyij*xij);
    virial.yy = (fyij*yij);
    virial.yz = (fyij*zij);
    virial.zx = (fzij*xij);
    virial.zy = (fzij*yij);
    virial.zz = (fzij*zij);
#endif
  }
}
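
// Each thread stores the outer product of its pair force f_ij and separation
// r_ij into a private virial; the block-wide reductions at the end of the
// kernels below fold these into energies_virials.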
//
// Calculates modified exclusions
//
template <typename T, bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const bool doSlow,
  const float one_scale14,
  const int vdwCoefTableWidth,
#if __CUDA_ARCH__ >= 350
  const float2* __restrict__ vdwCoefTable,
#else
  cudaTextureObject_t vdwCoefTableTex,
#endif
  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  double &energyVdw,
  T* __restrict__ forceNbond, double &energyNbond,
  T* __restrict__ forceSlow, double &energySlow,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow) {

  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {

    float rinv = rsqrtf(r2);
    float qq;
    if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
    int vdwIndex = bl.vdwtypej*vdwCoefTableWidth + bl.vdwtypei;
#if __CUDA_ARCH__ >= 350
    float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
#else
    float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
#endif

    float4 fi = tex1D<float4>(forceTableTex, rinv);
    float4 ei;
    if (doEnergy) {
      ei = tex1D<float4>(energyTableTex, rinv);
      energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
      if (doElect) {
        energyNbond += qq * ei.x;
        if (doSlow) energySlow += qq * ei.w;
      }
    }

    float fNbond;
    if (doElect) {
      fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
    } else {
      fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
    }
    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond);

    float fSlow = 0.0f;
    if (doSlow && doElect) {
      fSlow = -qq * fi.w;
      T T_fxij, T_fyij, T_fzij;
      calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
      storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
      storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);
    }

    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fNbond*xij;
      float fyij = fNbond*yij;
      float fzij = fNbond*zij;
      virialNbond.xx = (fxij*xij);
      virialNbond.xy = (fxij*yij);
      virialNbond.xz = (fxij*zij);
      virialNbond.yx = (fyij*xij);
      virialNbond.yy = (fyij*yij);
      virialNbond.yz = (fyij*zij);
      virialNbond.zx = (fzij*xij);
      virialNbond.zy = (fzij*yij);
      virialNbond.zz = (fzij*zij);
#endif
    }

    // Store slow virial
    if (doVirial && doSlow && doElect) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }
  }
}
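
// one_scale14 = 1 - scale14 (computed on the host in bondedForce() below):
// the regular nonbonded kernel counts every 1-4 pair at full strength, so
// this kernel subtracts the over-counted fraction of the interaction.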
//
// Calculates exclusions. Here doSlow = true
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void exclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const float r2_delta,
  const int r2_delta_expc,
#if __CUDA_ARCH__ >= 350
  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
#else
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,
#endif
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  T* __restrict__ forceSlow, double &energySlow,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow) {

  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {
    r2 += r2_delta;

    union { float f; int i; } r2i;
    r2i.f = r2;
    int table_i = (r2i.i >> 17) + r2_delta_expc;
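
    // An IEEE float is (sign | 8-bit exponent | 23-bit mantissa); shifting
    // the bit pattern right by 17 keeps the exponent and the top 6 mantissa
    // bits, so table_i indexes a table whose spacing is logarithmic in r2.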
#if __CUDA_ARCH__ >= 350
    float r2_table_val = __ldg(&r2_table[table_i]);
#else
    float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
#endif

    float diffa = r2 - r2_table_val;
    float qq = xyzqi.w * xyzqj.w;
#if __CUDA_ARCH__ >= 350
    float4 slow = __ldg(&exclusionTable[table_i]);
#else
    float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
#endif

    if (doEnergy) {
      energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
    }
    float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
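
    // slow.x..slow.w are per-interval cubic-spline coefficients: the energy
    // is a cubic polynomial in diffa = r2 - r2_table_val, and fSlow is built
    // from its derivative with respect to r2.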
    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);

    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }
  }
}
//
// Calculates angles
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void angleForce(
  const int index,
  const CudaAngle* __restrict__ angleList,
  const CudaAngleValue* __restrict__ angleValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaAngle al = angleList[index];

  // Shift vectors for the i-j and k-j legs, in the lattice basis
  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;

  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;

  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;

  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);

  float xijr = xij*rij_inv;
  float yijr = yij*rij_inv;
  float zijr = zij*rij_inv;
  float xkjr = xkj*rkj_inv;
  float ykjr = ykj*rkj_inv;
  float zkjr = zkj*rkj_inv;
  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;

  CudaAngleValue angleValue = angleValues[al.itype];

  float diff;
  if (angleValue.normal == 1) {
    // Restrict values of cos_theta to the interval [-0.999, 0.999]
    cos_theta = min(0.999f, max(-0.999f, cos_theta));
    float theta = acosf(cos_theta);
    diff = theta - angleValue.theta0;
  } else {
    diff = cos_theta - angleValue.theta0;
  }

  if (doEnergy) {
    energy += (double)(angleValue.k * diff * diff);
  }
  if (angleValue.normal == 1) {
    float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
    if (inv_sin_theta > 1.0e6) {
      diff = (diff < 0.0f) ? 1.0f : -1.0f;
    } else {
      diff *= -inv_sin_theta;
    }
  }
  diff *= -2.0f*angleValue.k;

  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);

  T T_dtxi, T_dtyi, T_dtzi;
  T T_dtxj, T_dtyj, T_dtzj;
  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
  T T_dtxk = -T_dtxi - T_dtxj;
  T T_dtyk = -T_dtyi - T_dtyj;
  T T_dtzk = -T_dtzi - T_dtzj;
  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force);

  // Urey-Bradley term
  if (angleValue.k_ub) {
    float xik = xij - xkj;
    float yik = yij - ykj;
    float zik = zij - zkj;
    float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
    float rik = 1.0f/rik_inv;
    float diff_ub = rik - angleValue.r_ub;
    if (doEnergy) {
      energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
    }
    diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
    T T_dxik, T_dyik, T_dzik;
    calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
    T_dtxi += T_dxik;
    T_dtyi += T_dyik;
    T_dtzi += T_dzik;
    T_dtxj -= T_dxik;
    T_dtyj -= T_dyik;
    T_dtzj -= T_dzik;
  }

  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force);
  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxi = ((float)T_dtxi)*force_to_float;
    float fyi = ((float)T_dtyi)*force_to_float;
    float fzi = ((float)T_dtzi)*force_to_float;
    float fxk = ((float)T_dtxj)*force_to_float;
    float fyk = ((float)T_dtyj)*force_to_float;
    float fzk = ((float)T_dtzj)*force_to_float;
    virial.xx = (fxi*xij) + (fxk*xkj);
    virial.xy = (fxi*yij) + (fxk*ykj);
    virial.xz = (fxi*zij) + (fxk*zkj);
    virial.yx = (fyi*xij) + (fyk*xkj);
    virial.yy = (fyi*yij) + (fyk*ykj);
    virial.yz = (fyi*zij) + (fyk*zkj);
    virial.zx = (fzi*xij) + (fzk*xkj);
    virial.zy = (fzi*yij) + (fzk*ykj);
    virial.zz = (fzi*zij) + (fzk*zkj);
#endif
  }
}
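
// Only the i and k arms enter the virial explicitly: the force on the apex
// atom j is minus their sum, so its contribution is already accounted for.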
//
// Dihedral-energy computation, shared by proper and improper dihedrals
//
template <bool doEnergy>
__forceinline__ __device__
void diheComp(const CudaDihedralValue* dihedralValues, int ic,
  const float sin_phi,
  const float cos_phi,
  const float scale,
  float& df,
  double& e) {

  df = 0.0f;
  if (doEnergy) e = 0.0;

  float phi = atan2f(sin_phi, cos_phi);

  bool lrep = true;
  while (lrep) {
    CudaDihedralValue dihedralValue = dihedralValues[ic];
    dihedralValue.k *= scale;

    // Low bit of n flags whether another multiplicity term follows
    lrep = (dihedralValue.n & 1);
    dihedralValue.n >>= 1;
    if (dihedralValue.n) {
      float nf = dihedralValue.n;
      float x = nf*phi - dihedralValue.delta;
      if (doEnergy) {
        float sin_x, cos_x;
        sincosf(x, &sin_x, &cos_x);
        e += (double)(dihedralValue.k*(1.0f + cos_x));
        df += (double)(nf*dihedralValue.k*sin_x);
      } else {
        float sin_x = sinf(x);
        df += (double)(nf*dihedralValue.k*sin_x);
      }
    } else {
      float diff = phi - dihedralValue.delta;
      if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
      if (diff >  (float)(M_PI)) diff -= (float)(2.0*M_PI);
      if (doEnergy) {
        e += (double)(dihedralValue.k*diff*diff);
      }
      df -= 2.0f*dihedralValue.k*diff;
    }
    ic++;
  }
}
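
// The multiplicity terms for one dihedral type live in consecutive
// CudaDihedralValue entries: bit 0 of n says whether another term follows,
// the remaining bits hold the periodicity, and n == 0 selects the harmonic
// (improper-style) form handled in the else branch above.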
//
// Calculates dihedrals and impropers
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void diheForce(
  const int index,
  const CudaDihedral* __restrict__ diheList,
  const CudaDihedralValue* __restrict__ dihedralValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaDihedral dl = diheList[index];

  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;

  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;

  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;

  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;

  // A = rij x rjk, B = rlk x rjk
  float ax = yij*zjk - zij*yjk;
  float ay = zij*xjk - xij*zjk;
  float az = xij*yjk - yij*xjk;
  float bx = ylk*zjk - zlk*yjk;
  float by = zlk*xjk - xlk*zjk;
  float bz = xlk*yjk - ylk*xjk;

  float ra2 = ax*ax + ay*ay + az*az;
  float rb2 = bx*bx + by*by + bz*bz;
  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);

  float rgr = 1.0f / rg;
  float ra2r = 1.0f / ra2;
  float rb2r = 1.0f / rb2;
  float rabr = sqrtf(ra2r*rb2r);

  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);

  float df;
  double e;
  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);

  if (doEnergy) energy += e;

  float fg = xij*xjk + yij*yjk + zij*zjk;
  float hg = xlk*xjk + ylk*yjk + zlk*zjk;

  float fga = fg*ra2r*rgr;
  float hgb = hg*rb2r*rgr;
  float gaa = ra2r*rg;
  float gbb = rb2r*rg;

  // Fold dE/dphi into the geometric prefactors
  fga *= df;
  hgb *= df;
  gaa *= df;
  gbb *= df;

  T T_fix, T_fiy, T_fiz;
  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force);

  T dgx, dgy, dgz;
  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
  T T_fjx = dgx - T_fix;
  T T_fjy = dgy - T_fiy;
  T T_fjz = dgz - T_fiz;
  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force);

  T dhx, dhy, dhz;
  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
  T T_fkx = -dhx - dgx;
  T T_fky = -dhy - dgy;
  T T_fkz = -dhz - dgz;
  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force);
  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fix = ((float)T_fix)*force_to_float;
    float fiy = ((float)T_fiy)*force_to_float;
    float fiz = ((float)T_fiz)*force_to_float;
    float fjx = ((float)dgx)*force_to_float;
    float fjy = ((float)dgy)*force_to_float;
    float fjz = ((float)dgz)*force_to_float;
    float flx = ((float)dhx)*force_to_float;
    float fly = ((float)dhy)*force_to_float;
    float flz = ((float)dhz)*force_to_float;
    virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
    virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
    virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
    virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
    virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
    virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
    virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
    virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
    virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
#endif
  }
}
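
// This is the standard CHARMM-style decomposition: dgx..dgz carries the force
// transmitted along the central rjk axis, and the four atomic forces sum to
// zero by construction.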
__device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
  return make_float3(
    v1.y*v2.z - v1.z*v2.y,
    v1.z*v2.x - v1.x*v2.z,
    v1.x*v2.y - v1.y*v2.x);
}

__device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
}
//
// Calculates crossterms (CMAP)
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void crosstermForce(
  const int index,
  const CudaCrossterm* __restrict__ crosstermList,
  const CudaCrosstermValue* __restrict__ crosstermValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaCrossterm cl = crosstermList[index];

  // ----------------------------------------------------------------------
  // First dihedral: atoms 1 - 2 - 3 - 4
  // ----------------------------------------------------------------------
  float3 sh12 = make_float3(
    cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
    cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
    cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);

  float4 xyzq1 = xyzq[cl.i1];
  float4 xyzq2 = xyzq[cl.i2];

  float3 r12 = make_float3(
    xyzq1.x + sh12.x - xyzq2.x,
    xyzq1.y + sh12.y - xyzq2.y,
    xyzq1.z + sh12.z - xyzq2.z);

  float3 sh23 = make_float3(
    cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
    cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
    cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);

  float4 xyzq3 = xyzq[cl.i3];

  float3 r23 = make_float3(
    xyzq2.x + sh23.x - xyzq3.x,
    xyzq2.y + sh23.y - xyzq3.y,
    xyzq2.z + sh23.z - xyzq3.z);

  float3 sh34 = make_float3(
    cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
    cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
    cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);

  float4 xyzq4 = xyzq[cl.i4];

  float3 r34 = make_float3(
    xyzq3.x + sh34.x - xyzq4.x,
    xyzq3.y + sh34.y - xyzq4.y,
    xyzq3.z + sh34.z - xyzq4.z);

  // Calculate the cross products
  float3 A = cross(r12, r23);
  float3 B = cross(r23, r34);
  float3 C = cross(r23, A);

  // Calculate the inverse distances
  float inv_rA = rsqrtf(dot(A, A));
  float inv_rB = rsqrtf(dot(B, B));
  float inv_rC = rsqrtf(dot(C, C));

  // Calculate the sin and cos
  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
  float sin_phi = dot(C, B)*(inv_rC*inv_rB);

  float phi = -atan2f(sin_phi, cos_phi);

  // ----------------------------------------------------------------------
  // Second dihedral: atoms 5 - 6 - 7 - 8
  // ----------------------------------------------------------------------
  float3 sh56 = make_float3(
    cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
    cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
    cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);

  float4 xyzq5 = xyzq[cl.i5];
  float4 xyzq6 = xyzq[cl.i6];

  float3 r56 = make_float3(
    xyzq5.x + sh56.x - xyzq6.x,
    xyzq5.y + sh56.y - xyzq6.y,
    xyzq5.z + sh56.z - xyzq6.z);

  float3 sh67 = make_float3(
    cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
    cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
    cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);

  float4 xyzq7 = xyzq[cl.i7];

  float3 r67 = make_float3(
    xyzq6.x + sh67.x - xyzq7.x,
    xyzq6.y + sh67.y - xyzq7.y,
    xyzq6.z + sh67.z - xyzq7.z);

  float3 sh78 = make_float3(
    cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
    cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
    cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);

  float4 xyzq8 = xyzq[cl.i8];

  float3 r78 = make_float3(
    xyzq7.x + sh78.x - xyzq8.x,
    xyzq7.y + sh78.y - xyzq8.y,
    xyzq7.z + sh78.z - xyzq8.z);

  // Calculate the cross products
  float3 D = cross(r56, r67);
  float3 E = cross(r67, r78);
  float3 F = cross(r67, D);

  // Calculate the inverse distances
  float inv_rD = rsqrtf(dot(D, D));
  float inv_rE = rsqrtf(dot(E, E));
  float inv_rF = rsqrtf(dot(F, F));

  // Calculate the sin and cos
  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
  float sin_psi = dot(F, E)*(inv_rF*inv_rE);

  float psi = -atan2f(sin_psi, cos_psi);

  // Shift both angles into [0, 2*pi) and convert to grid units
  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);

  const float inv_h = (float)CudaCrosstermValue::dim/(2.0f*(float)M_PI);
  float phi_h = phi * inv_h;
  float psi_h = psi * inv_h;

  // Grid cell and fractional position within it
  int iphi = (int)floor(phi_h);
  int ipsi = (int)floor(psi_h);

  const CudaCrosstermValue& cp = crosstermValues[cl.itype];

  // Load the bicubic coefficient rows for this cell
  float4 c[4];
  c[0] = cp.c[iphi][ipsi][0];
  c[1] = cp.c[iphi][ipsi][1];
  c[2] = cp.c[iphi][ipsi][2];
  c[3] = cp.c[iphi][ipsi][3];

  float dphi = phi_h - (float)iphi;
  float dpsi = psi_h - (float)ipsi;

  if (doEnergy) {
    float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
    energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
    energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
    energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
    energy += (double)(energyf*cl.scale);
  }

  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) ));
  dEdphi *= cl.scale*inv_h;

  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) ));
  dEdpsi *= cl.scale*inv_h;
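
  // c[k].x..w are the bicubic patch coefficients: the energy is
  // sum over j,k of c_kj * dphi^j * dpsi^k, evaluated by Horner's rule in
  // dphi (inner) and dpsi (outer); dEdphi and dEdpsi above are its analytic
  // partial derivatives, rescaled from grid units by inv_h.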

  // Calculate forces for the first dihedral
  float square_r23 = dot(r23, r23);
  float norm_r23 = sqrtf(square_r23);
  float inv_square_r23 = 1.0f/square_r23;
  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
  float ff2 = -dot(r12, r23)*inv_square_r23;
  float ff3 = -dot(r34, r23)*inv_square_r23;
  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;

  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z );
  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z );

  T T_f1x, T_f1y, T_f1z;
  T T_f2x, T_f2y, T_f2z;
  T T_f3x, T_f3y, T_f3z;
  T T_f4x, T_f4y, T_f4z;
  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force);
  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force);
  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force);
  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force);

  // Calculate forces for the second dihedral
  float square_r67 = dot(r67, r67);
  float norm_r67 = sqrtf(square_r67);
  float inv_square_r67 = 1.0f/(square_r67);
  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
  ff2 = -dot(r56, r67)*inv_square_r67;
  ff3 = -dot(r78, r67)*inv_square_r67;
  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;

  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
  float3 f7 = make_float3( -t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );

  T T_f5x, T_f5y, T_f5z;
  T T_f6x, T_f6y, T_f6z;
  T T_f7x, T_f7y, T_f7z;
  T T_f8x, T_f8y, T_f8z;
  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force);
  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force);
  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force);
  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
    float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
    virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
    virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
    virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
    virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
    virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
    virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
    virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
    virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
    virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
#endif
  }
}

#define BONDEDFORCESKERNEL_NUM_WARP 4

//
// Calculates all bonded forces in a single kernel call
//
template <typename T, bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel(
  const int start,
  const int numBonds,
  const CudaBond* __restrict__ bonds,
  const CudaBondValue* __restrict__ bondValues,
  const int numAngles,
  const CudaAngle* __restrict__ angles,
  const CudaAngleValue* __restrict__ angleValues,
  const int numDihedrals,
  const CudaDihedral* __restrict__ dihedrals,
  const CudaDihedralValue* __restrict__ dihedralValues,
  const int numImpropers,
  const CudaDihedral* __restrict__ impropers,
  const CudaDihedralValue* __restrict__ improperValues,
  const int numExclusions,
  const CudaExclusion* __restrict__ exclusions,
  const int numCrossterms,
  const CudaCrossterm* __restrict__ crossterms,
  const CudaCrosstermValue* __restrict__ crosstermValues,
  const float cutoff2,
  const float r2_delta,
  const int r2_delta_expc,
  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force,
  T* __restrict__ forceSlow,
  double* __restrict__ energies_virials) {

  // Each thread computes a single bonded interaction; each thread block
  // computes a slice of a single tuple type. indexTB picks the slice.
  int indexTB = start + blockIdx.x;

  const int numBondsTB      = (numBonds + blockDim.x - 1)/blockDim.x;
  const int numAnglesTB     = (numAngles + blockDim.x - 1)/blockDim.x;
  const int numDihedralsTB  = (numDihedrals + blockDim.x - 1)/blockDim.x;
  const int numImpropersTB  = (numImpropers + blockDim.x - 1)/blockDim.x;
  const int numExclusionsTB = (numExclusions + blockDim.x - 1)/blockDim.x;
  const int numCrosstermsTB = (numCrossterms + blockDim.x - 1)/blockDim.x;

  // Per-thread accumulators; energyIndex selects the slot in
  // energies_virials for the tuple type this block handles
  double energy = 0.0;
  int energyIndex = -1;
  int virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double> virial;
  virial.xx = 0.0; virial.xy = 0.0; virial.xz = 0.0;
  virial.yx = 0.0; virial.yy = 0.0; virial.yz = 0.0;
  virial.zx = 0.0; virial.zy = 0.0; virial.zz = 0.0;
#endif

  if (indexTB < numBondsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
    if (i < numBonds) {
      bondForce<T, doEnergy, doVirial>
        (i, bonds, bondValues, xyzq,
         stride, lata, latb, latc,
         force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numBondsTB;

  if (indexTB < numAnglesTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
    if (i < numAngles) {
      angleForce<T, doEnergy, doVirial>
        (i, angles, angleValues, xyzq, stride,
         lata, latb, latc,
         force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numAnglesTB;

  if (indexTB < numDihedralsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
    if (i < numDihedrals) {
      diheForce<T, doEnergy, doVirial>
        (i, dihedrals, dihedralValues, xyzq, stride,
         lata, latb, latc,
         force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numDihedralsTB;

  if (indexTB < numImpropersTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
    if (i < numImpropers) {
      diheForce<T, doEnergy, doVirial>
        (i, impropers, improperValues, xyzq, stride,
         lata, latb, latc,
         force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numImpropersTB;

  if (indexTB < numExclusionsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
    if (doVirial) virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
    if (i < numExclusions) {
      exclusionForce<T, doEnergy, doVirial>
        (i, exclusions, r2_delta, r2_delta_expc,
#if __CUDA_ARCH__ >= 350
         r2_table, exclusionTable,
#else
         r2_table_tex, exclusionTableTex,
#endif
         xyzq, stride, lata, latb, latc, cutoff2,
         forceSlow, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numExclusionsTB;

  if (indexTB < numCrosstermsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
    if (i < numCrossterms) {
      crosstermForce<T, doEnergy, doVirial>
        (i, crossterms, crosstermValues,
         xyzq, stride, lata, latb, latc,
         force, energy, virial);
    }
  }

reduce:

  // Reduce energy
  if (doEnergy) {
    __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shEnergy[warpID] = energy;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
      }
      if (laneID == 0) {
        atomicAdd(&energies_virials[energyIndex], energy);
      }
    }
  }
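
  // WARP_SHUFFLE_XOR wraps the CUDA warp shuffle-xor primitive; looping over
  // i = 16, 8, 4, 2, 1 performs a butterfly reduction within each 32-lane
  // warp, and the per-warp partial sums are then combined through shared
  // memory by warp 0.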

  // Reduce virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
      virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
      virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
      virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
      virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
      virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
      virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
      virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
      virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shVirial[warpID] = virial;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      virial.xx = 0.0; virial.xy = 0.0; virial.xz = 0.0;
      virial.yx = 0.0; virial.yy = 0.0; virial.yz = 0.0;
      virial.zx = 0.0; virial.zy = 0.0; virial.zz = 0.0;
      if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
        virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
        virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
        virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
        virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
        virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
        virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
        virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
        virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
      }
      if (laneID == 0) {
#ifdef USE_FP_VIRIAL
        // (fixed-point virial accumulation path omitted in this excerpt)
#else
        atomicAdd(&energies_virials[virialIndex + 0], virial.xx);
        atomicAdd(&energies_virials[virialIndex + 1], virial.xy);
        atomicAdd(&energies_virials[virialIndex + 2], virial.xz);
        atomicAdd(&energies_virials[virialIndex + 3], virial.yx);
        atomicAdd(&energies_virials[virialIndex + 4], virial.yy);
        atomicAdd(&energies_virials[virialIndex + 5], virial.yz);
        atomicAdd(&energies_virials[virialIndex + 6], virial.zx);
        atomicAdd(&energies_virials[virialIndex + 7], virial.zy);
        atomicAdd(&energies_virials[virialIndex + 8], virial.zz);
#endif
      }
    }
#endif
  }
}

//
// Calculates modified exclusions
//
template <typename T, bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel(
  const int start,
  const int numModifiedExclusions,
  const CudaExclusion* __restrict__ modifiedExclusions,
  const bool doSlow,
  const float one_scale14,
  const float cutoff2,
  const int vdwCoefTableWidth,
  const float2* __restrict__ vdwCoefTable,
  cudaTextureObject_t vdwCoefTableTex,
  cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
  double* __restrict__ energies_virials) {

  // Index into the modified-exclusion list
  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;

  double energyVdw, energyNbond, energySlow;
  if (doEnergy) {
    energyVdw = 0.0;
    energyNbond = 0.0;
    energySlow = 0.0;
  }

#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
  if (doVirial) {
    virialNbond.xx = 0.0;
    virialNbond.xy = 0.0;
    virialNbond.xz = 0.0;
    virialNbond.yx = 0.0;
    virialNbond.yy = 0.0;
    virialNbond.yz = 0.0;
    virialNbond.zx = 0.0;
    virialNbond.zy = 0.0;
    virialNbond.zz = 0.0;

    virialSlow.xx = 0.0;
    virialSlow.xy = 0.0;
    virialSlow.xz = 0.0;
    virialSlow.yx = 0.0;
    virialSlow.yy = 0.0;
    virialSlow.yz = 0.0;
    virialSlow.zx = 0.0;
    virialSlow.zy = 0.0;
    virialSlow.zz = 0.0;
  }
#endif

  if (i < numModifiedExclusions)
  {
    modifiedExclusionForce<T, doEnergy, doVirial, doElect>
      (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
#if __CUDA_ARCH__ >= 350
       vdwCoefTable,
#else
       vdwCoefTableTex,
#endif
       modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
       xyzq, stride, lata, latb, latc, cutoff2,
       energyVdw, forceNbond, energyNbond,
       forceSlow, energySlow, virialNbond, virialSlow);
  }

  // Reduce energy
  if (doEnergy) {
    __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ double shEnergyNbond[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ double shEnergySlow[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      energyVdw   += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw,   i, 32);
      energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
      energySlow  += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow,  i, 32);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shEnergyVdw[warpID]   = energyVdw;
      shEnergyNbond[warpID] = energyNbond;
      shEnergySlow[warpID]  = energySlow;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      energyVdw   = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID]   : 0.0;
      energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
      energySlow  = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID]  : 0.0;
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        energyVdw   += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw,   i, 32);
        energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
        energySlow  += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow,  i, 32);
      }
      if (laneID == 0) {
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
      }
    }
  }

  // Reduce virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
      virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
      virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
      virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
      virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
      virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
      virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
      virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
      virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
      if (doElect && doSlow) {
        virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
        virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
        virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
        virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
        virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
        virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
        virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
        virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
        virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
      }
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shVirialNbond[warpID] = virialNbond;
      shVirialSlow[warpID] = virialSlow;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      virialNbond.xx = 0.0;
      virialNbond.xy = 0.0;
      virialNbond.xz = 0.0;
      virialNbond.yx = 0.0;
      virialNbond.yy = 0.0;
      virialNbond.yz = 0.0;
      virialNbond.zx = 0.0;
      virialNbond.zy = 0.0;
      virialNbond.zz = 0.0;
      if (doElect && doSlow) {
        virialSlow.xx = 0.0;
        virialSlow.xy = 0.0;
        virialSlow.xz = 0.0;
        virialSlow.yx = 0.0;
        virialSlow.yy = 0.0;
        virialSlow.yz = 0.0;
        virialSlow.zx = 0.0;
        virialSlow.zy = 0.0;
        virialSlow.zz = 0.0;
      }
      if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
        virialNbond = shVirialNbond[laneID];
        virialSlow = shVirialSlow[laneID];
      }
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
        virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
        virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
        virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
        virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
        virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
        virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
        virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
        virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
        if (doElect && doSlow) {
          virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
          virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
          virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
          virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
          virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
          virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
          virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
          virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
          virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
        }
      }
      if (laneID == 0) {
#ifdef USE_FP_VIRIAL
        // (fixed-point virial accumulation path omitted in this excerpt)
#else
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
        if (doElect && doSlow) {
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
        }
#endif
      }
    }
#endif
  }
}

//
// Class constructor
//
ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID,
  CudaNonbondedTables& cudaNonbondedTables) :
  deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {

  cudaCheck(cudaSetDevice(deviceID));

  tupleData = NULL;
  tupleDataSize = 0;
  xyzq = NULL;
  xyzqSize = 0;
  forces = NULL;
  forcesSize = 0;

  numBonds = 0;
  numAngles = 0;
  numDihedrals = 0;
  numImpropers = 0;
  numModifiedExclusions = 0;
  numExclusions = 0;
  numCrossterms = 0;

  bondValues = NULL;
  angleValues = NULL;
  dihedralValues = NULL;
  improperValues = NULL;
  crosstermValues = NULL;

  allocate_device<double>(&energies_virials, energies_virials_SIZE);
}

//
// Class destructor
//
ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
  cudaCheck(cudaSetDevice(deviceID));

  deallocate_device<double>(&energies_virials);

  if (tupleData != NULL) deallocate_device<char>(&tupleData);
  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);

  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
}

void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
  allocate_device<CudaBondValue>(&bondValues, numBondValues);
  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
}

void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
}

void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
}

void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
}

void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
}

//
// Uploads the tuple data to the device
//
void ComputeBondedCUDAKernel::update(
  const int numBondsIn,
  const int numAnglesIn,
  const int numDihedralsIn,
  const int numImpropersIn,
  const int numModifiedExclusionsIn,
  const int numExclusionsIn,
  const int numCrosstermsIn,
  const char* h_tupleData,
  cudaStream_t stream) {

  numBonds = numBondsIn;
  numAngles = numAnglesIn;
  numDihedrals = numDihedralsIn;
  numImpropers = numImpropersIn;
  numModifiedExclusions = numModifiedExclusionsIn;
  numExclusions = numExclusionsIn;
  numCrossterms = numCrosstermsIn;

  int numBondsWA = warpAlign(numBonds);
  int numAnglesWA = warpAlign(numAngles);
  int numDihedralsWA = warpAlign(numDihedrals);
  int numImpropersWA = warpAlign(numImpropers);
  int numModifiedExclusionsWA = warpAlign(numModifiedExclusions);
  int numExclusionsWA = warpAlign(numExclusions);
  int numCrosstermsWA = warpAlign(numCrossterms);

  int sizeTot = numBondsWA*sizeof(CudaBond) + numAnglesWA*sizeof(CudaAngle) +
    numDihedralsWA*sizeof(CudaDihedral) + numImpropersWA*sizeof(CudaDihedral) +
    numModifiedExclusionsWA*sizeof(CudaExclusion) + numExclusionsWA*sizeof(CudaExclusion) +
    numCrosstermsWA*sizeof(CudaCrossterm);

  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, 1.4f);
  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);

  // Setup pointers to the tuple types within the single packed buffer
  int pos = 0;
  bonds = (CudaBond *)&tupleData[pos];
  pos += numBondsWA*sizeof(CudaBond);

  angles = (CudaAngle *)&tupleData[pos];
  pos += numAnglesWA*sizeof(CudaAngle);

  dihedrals = (CudaDihedral *)&tupleData[pos];
  pos += numDihedralsWA*sizeof(CudaDihedral);

  impropers = (CudaDihedral *)&tupleData[pos];
  pos += numImpropersWA*sizeof(CudaDihedral);

  modifiedExclusions = (CudaExclusion *)&tupleData[pos];
  pos += numModifiedExclusionsWA*sizeof(CudaExclusion);

  exclusions = (CudaExclusion *)&tupleData[pos];
  pos += numExclusionsWA*sizeof(CudaExclusion);

  crossterms = (CudaCrossterm *)&tupleData[pos];
  pos += numCrosstermsWA*sizeof(CudaCrossterm);
}
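
// Tuples of every type travel in one packed host-to-device copy; warpAlign
// pads each count to a multiple of the warp size so a warp never straddles
// two tuple types, at the cost of a little padding between sections.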

int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
#ifdef USE_STRIDED_FORCE
  // Align stride to 256 bytes
  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*(256/sizeof(FORCE_TYPE));
#else
  return 1;
#endif
}

int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
#ifdef USE_STRIDED_FORCE
  return (3*getForceStride(atomStorageSize));
#else
  return (3*atomStorageSize);
#endif
}

int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {

  int forceSize = getForceSize(atomStorageSize);

  if (numModifiedExclusions > 0 || numExclusions > 0) {
    if (doSlow) {
      // Three force buffers: [force, forceNbond, forceSlow]
      forceSize *= 3;
    } else {
      // Two force buffers: [force, forceNbond]
      forceSize *= 2;
    }
  }

  return forceSize;
}

//
// Computes the bonded forces
//
void ComputeBondedCUDAKernel::bondedForce(
  const double scale14, const int atomStorageSize,
  const bool doEnergy, const bool doVirial, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2, const float r2_delta, const int r2_delta_expc,
  const float4* h_xyzq, FORCE_TYPE* h_forces,
  double* h_energies_virials,
  cudaStream_t stream) {

  int forceStorageSize = getAllForceSize(atomStorageSize, true);
  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
  int forceStride = getForceStride(atomStorageSize);

  int forceSize = getForceSize(atomStorageSize);
  int startNbond = forceSize;
  int startSlow = 2*forceSize;

  // Re-allocate coordinate and force arrays if necessary
  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);

  // Copy coordinates to the device
  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);

  // Clear forces and, when needed, the energy/virial accumulators
  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
  if (doEnergy || doVirial) {
    clear_device_array<double>(energies_virials, energies_virials_SIZE, stream);
  }

  float one_scale14 = (float)(1.0 - scale14);

  // If doSlow = false, these exclusions are not calculated
  int numExclusionsDoSlow = doSlow ? numExclusions : 0;

  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;

  int numBondsTB     = (numBonds + nthread - 1)/nthread;
  int numAnglesTB    = (numAngles + nthread - 1)/nthread;
  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
  int numExclusionsTB = (numExclusionsDoSlow + nthread - 1)/nthread;
  int numCrosstermsTB = (numCrossterms + nthread - 1)/nthread;

  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
    numExclusionsTB + numCrosstermsTB;
  int shmem_size = 0;

  int max_nblock = deviceCUDA->getMaxNumBlocks();

  int start = 0;
  while (start < nblock)
  {
    int nleft = nblock - start;
    int nblock_use = min(max_nblock, nleft);

#define CALL(DOENERGY, DOVIRIAL) \
    bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> \
    <<< nblock_use, nthread, shmem_size, stream >>> \
    (start, numBonds, bonds, bondValues, \
    numAngles, angles, angleValues, \
    numDihedrals, dihedrals, dihedralValues, \
    numImpropers, impropers, improperValues, \
    numExclusionsDoSlow, exclusions, \
    numCrossterms, crossterms, crosstermValues, \
    cutoff2, \
    r2_delta, r2_delta_expc, \
    cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
    cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
    xyzq, forceStride, \
    lata, latb, latc, \
    forces, &forces[startSlow], energies_virials);

    if (!doEnergy && !doVirial) CALL(0, 0);
    if (!doEnergy &&  doVirial) CALL(0, 1);
    if ( doEnergy && !doVirial) CALL(1, 0);
    if ( doEnergy &&  doVirial) CALL(1, 1);

#undef CALL
    cudaCheck(cudaGetLastError());

    start += nblock_use;
  }

  nblock = (numModifiedExclusions + nthread - 1)/nthread;

  bool doElect = (one_scale14 == 0.0f) ? false : true;

  start = 0;
  while (start < nblock)
  {
    int nleft = nblock - start;
    int nblock_use = min(max_nblock, nleft);

#define CALL(DOENERGY, DOVIRIAL, DOELECT) \
    modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
    <<< nblock_use, nthread, shmem_size, stream >>> \
    (start, numModifiedExclusions, modifiedExclusions, \
    doSlow, one_scale14, cutoff2, \
    cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
    cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
    cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
    xyzq, forceStride, lata, latb, latc, \
    &forces[startNbond], &forces[startSlow], energies_virials);

    if (!doEnergy && !doVirial && !doElect) CALL(0, 0, 0);
    if (!doEnergy &&  doVirial && !doElect) CALL(0, 1, 0);
    if ( doEnergy && !doVirial && !doElect) CALL(1, 0, 0);
    if ( doEnergy &&  doVirial && !doElect) CALL(1, 1, 0);

    if (!doEnergy && !doVirial &&  doElect) CALL(0, 0, 1);
    if (!doEnergy &&  doVirial &&  doElect) CALL(0, 1, 1);
    if ( doEnergy && !doVirial &&  doElect) CALL(1, 0, 1);
    if ( doEnergy &&  doVirial &&  doElect) CALL(1, 1, 1);

#undef CALL
    cudaCheck(cudaGetLastError());

    start += nblock_use;
  }
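
  // Each CALL instantiates a fully specialized kernel for one
  // (doEnergy, doVirial, doElect) combination, so the per-feature branches
  // inside the device functions are resolved at compile time; the if-chains
  // above merely pick the matching instantiation at run time.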

  // Copy forces to the host
  copy_DtoH<FORCE_TYPE>(forces, h_forces, forceCopySize, stream);

  if (doEnergy || doVirial) {
    copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
  }
}