#define _USE_MATH_DEFINES
#define __thread __declspec(thread)

#if defined(NAMD_HIP) && !defined(NAMD_CUDA)
#include <hipcub/hipcub.hpp>
#endif

#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif

#ifdef FUTURE_CUDADEVICE
#include "CudaDevice.h"
#endif
// Table lookup on a float4 texture with linear interpolation between entries
__device__ __forceinline__
float4 sampleTableTex(cudaTextureObject_t tex, float k) {
  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
  const float x = k * (float)tableSize - 0.5f;
  const float f = floorf(x);
  const float a = x - f;
  const unsigned int i = (unsigned int)f;
  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
  const int i1 = i0 + 1;
  const float4 t0 = tex1Dfetch<float4>(tex, i0);
  const float4 t1 = tex1Dfetch<float4>(tex, i1);
  return make_float4(
    a * (t1.x - t0.x) + t0.x,
    a * (t1.y - t0.y) + t0.y,
    a * (t1.z - t0.z) + t0.z,
    a * (t1.w - t0.w) + t0.w);
}
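
// The lookup above is a linear interpolation between adjacent table entries:
// k in [0,1] is scaled to x = k*tableSize - 0.5, the lower index is floor(x)
// (clamped to the table), and the result is t0 + a*(t1 - t0) with blend
// factor a = x - floor(x). A host-side reference of the same indexing, useful
// for unit tests (a minimal sketch with hypothetical names; a plain array and
// explicit clamping stand in for the texture path):
//
//   float4 sampleTableHost(const float4* tbl, int n, float k) {
//     float x = k * (float)n - 0.5f;
//     float a = x - floorf(x);
//     int i0 = std::min(std::max((int)floorf(x), 0), n - 1);
//     int i1 = std::min(i0 + 1, n - 1);
//     return make_float4(tbl[i0].x + a*(tbl[i1].x - tbl[i0].x),
//                        tbl[i0].y + a*(tbl[i1].y - tbl[i0].y),
//                        tbl[i0].z + a*(tbl[i1].z - tbl[i0].z),
//                        tbl[i0].w + a*(tbl[i1].w - tbl[i0].w));
//   }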
#if defined(USE_FP_FORCE) || defined(USE_FP_VIRIAL)
// Round a float to the nearest 64-bit integer (PTX cvt with round-to-nearest)
__device__ inline long long int lliroundf(float f)
{
  long long int l;
  asm("cvt.rni.s64.f32 %0, %1;" : "=l"(l) : "f"(f));
  return l;
}

// Reinterpret the bits of a signed 64-bit integer as unsigned
__device__ inline unsigned long long int llitoulli(long long int l)
{
  unsigned long long int u;
  asm("mov.b64 %0, %1;" : "=l"(u) : "l"(l));
  return u;
}
#else
__device__ inline long long int lliroundf(float f)
{
  return llrintf(f);
}

__device__ inline unsigned long long int llitoulli(long long int l)
{
  return l;
}
#endif
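
// These helpers back the fixed-point force accumulation path: a float force
// is scaled by float_to_force, rounded to a 64-bit integer, and accumulated
// with integer atomicAdd (via the bit-preserving cast to unsigned), which
// makes the summation order-independent and therefore deterministic. Worked
// example with an assumed scale of 2^30 (illustrative only, not the actual
// constant): a component of 0.25 becomes lliroundf(0.25f * 2^30) = 268435456,
// and the host recovers the force by dividing the accumulated integer by the
// same scale.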
//
// Convert a float force into the accumulation type T
//
template <typename T>
__forceinline__ __device__
void convertForces(const float x, const float y, const float z,
  T &Tx, T &Ty, T &Tz) {
  Tx = (T)(x*float_to_force);
  Ty = (T)(y*float_to_force);
  Tz = (T)(z*float_to_force);
}

#if defined(USE_FP_FORCE)
template <>
__forceinline__ __device__
void convertForces<long long int>(
  const float x, const float y, const float z,
  long long int &Tx, long long int &Ty, long long int &Tz) {
  Tx = lliroundf(x*float_to_force);
  Ty = lliroundf(y*float_to_force);
  Tz = lliroundf(z*float_to_force);
}
#endif
//
// Calculate the force components fij*(dx,dy,dz) in the accumulation type T
//
template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij,
  const float dx, const float dy, const float dz,
  T &fxij, T &fyij, T &fzij) {
  fij *= float_to_force;
  fxij = (T)(fij*dx);
  fyij = (T)(fij*dy);
  fzij = (T)(fij*dz);
}

#if defined(USE_FP_FORCE)
template <>
__forceinline__ __device__
void calcComponentForces<long long int>(
  float fij,
  const float dx, const float dy, const float dz,
  long long int &fxij, long long int &fyij, long long int &fzij) {
  fij *= float_to_force;
  fxij = lliroundf(fij*dx);
  fyij = lliroundf(fij*dy);
  fzij = lliroundf(fij*dz);
}
#endif
//
// Two-vector version: fij1*(dx1,dy1,dz1) + fij2*(dx2,dy2,dz2)
//
template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij1,
  const float dx1, const float dy1, const float dz1,
  float fij2,
  const float dx2, const float dy2, const float dz2,
  T &fxij, T &fyij, T &fzij) {
  fij1 *= float_to_force;
  fij2 *= float_to_force;
  fxij = (T)(fij1*dx1 + fij2*dx2);
  fyij = (T)(fij1*dy1 + fij2*dy2);
  fzij = (T)(fij1*dz1 + fij2*dz2);
}

#if defined(USE_FP_FORCE)
template <>
__forceinline__ __device__
void calcComponentForces<long long int>(
  float fij1,
  const float dx1, const float dy1, const float dz1,
  float fij2,
  const float dx2, const float dy2, const float dz2,
  long long int &fxij, long long int &fyij, long long int &fzij) {
  fij1 *= float_to_force;
  fij2 *= float_to_force;
  fxij = lliroundf(fij1*dx1 + fij2*dx2);
  fyij = lliroundf(fij1*dy1 + fij2*dy2);
  fzij = lliroundf(fij1*dz1 + fij2*dz2);
}
#endif
// Warp-aggregated atomic increment: one atomicAdd per warp instead of one per lane
__forceinline__ __device__
int warpAggregatingAtomicInc(int* counter) {
  WarpMask mask = WARP_BALLOT(WARP_FULL_MASK, 1);
#if WARPSIZE == 64
  int total = __popcll(mask);
  int prefix = __popcll(mask & cub::LaneMaskLt());
  int firstLane = __ffsll(mask) - 1;
#else
  int total = __popc(mask);
  int prefix = __popc(mask & cub::LaneMaskLt());
  int firstLane = __ffs(mask) - 1;
#endif
  int start = 0;
  if (prefix == 0) {
    start = atomicAdd(counter, total);
  }
  start = WARP_SHUFFLE(WARP_FULL_MASK, start, firstLane, WARPSIZE);
  return start + prefix;
}
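
// Worked example of the aggregation above: if five lanes of a warp call this
// while the counter holds 100, each lane computes total = 5 and its prefix
// (the number of active lower lanes); only the first active lane (prefix == 0)
// performs atomicAdd(counter, 5), the returned base 100 is broadcast from
// that lane, and the five lanes receive 100, 101, 102, 103, 104 -- one atomic
// operation instead of five.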
template <typename T>
__forceinline__ __device__
void storeForces(const T fx, const T fy, const T fz,
  const int ind, const int stride,
  T* force,
  T* forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts) {
#if defined(USE_BONDED_FORCE_ATOMIC_STORE)
#if defined(NAMD_HIP)
  // Try to decrease conflicts between lanes when consecutive lanes target
  // the same ind: sum each run of equal indices within the warp, then let
  // only the segment heads issue the atomics
  WarpMask mask = WARP_BALLOT(WARP_FULL_MASK, 1);
  if (mask == ~(WarpMask)0) { // All lanes in the warp are active
    const int laneId = threadIdx.x % WARPSIZE;
    const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
    const bool isHead = laneId == 0 || ind != prevLaneInd;
    if (!WARP_ALL(WARP_FULL_MASK, isHead)) {
      // There are repeating indices: reduce each segment to its head lane
      typedef cub::WarpReduce<T> WarpReduce;
      __shared__ typename WarpReduce::TempStorage temp_storage;
      const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
      const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
      const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
      if (isHead) {
        atomicAdd(&force[ind           ], sumfx);
        atomicAdd(&force[ind + stride  ], sumfy);
        atomicAdd(&force[ind + stride*2], sumfz);
      }
      return;
    }
  }
  // Not all lanes active, or no repeating indices
  atomicAdd(&force[ind           ], fx);
  atomicAdd(&force[ind + stride  ], fy);
  atomicAdd(&force[ind + stride*2], fz);
#else
  atomicAdd(&force[ind           ], fx);
  atomicAdd(&force[ind + stride  ], fy);
  atomicAdd(&force[ind + stride*2], fz);
#endif
#else
  const int newPos = warpAggregatingAtomicInc(forceListCounter);
  forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
  forceList[newPos * 3 + 0] = fx;
  forceList[newPos * 3 + 1] = fy;
  forceList[newPos * 3 + 2] = fz;
#endif
}
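
// In the non-atomic branch above, force contributions form per-atom singly
// linked lists: warpAggregatingAtomicInc hands out a fresh slot newPos,
// atomicExch pushes it as the new list head for atom ind (returning the old
// head, which becomes this node's next pointer), and the x/y/z components
// live at forceList[newPos*3 + 0..2]. gatherBondedForcesKernel later walks
// each list to produce the final per-atom sums, trading extra memory for
// fewer atomic collisions on hot atoms.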
#if defined(USE_FP_FORCE)
template <>
__forceinline__ __device__
void storeForces<long long int>(
  const long long int fx, const long long int fy, const long long int fz,
  const int ind, const int stride,
  long long int* force) {
  atomicAdd((unsigned long long int *)&force[ind           ], llitoulli(fx));
  atomicAdd((unsigned long long int *)&force[ind + stride  ], llitoulli(fy));
  atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
}
#endif
//
// Calculates bonds
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void bondForce(
  const int index,
  const CudaBond* __restrict__ bondList,
  const CudaBondValue* __restrict__ bondValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaBond bl = bondList[index];
  CudaBondValue bondValue = bondValues[bl.itype];
  if (bondValue.x0 == 0.0f) return;

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r = sqrtf(xij*xij + yij*yij + zij*zij);

  float db = r - bondValue.x0;
  if (bondValue.x1) {
    // A harmonic wall: x0 is the lower bound and x1 the upper bound
    db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
  }
  float fij = db * bondValue.k * bl.scale;

  if (doEnergy) {
    energy += (double)(fij*db);
  }
  fij *= -2.0f/r;

  T T_fxij, T_fyij, T_fzij;
  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);

  // Store forces
  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxij = fij*xij;
    float fyij = fij*yij;
    float fzij = fij*zij;
    virial.xx = (fxij*xij);
    virial.xy = (fxij*yij);
    virial.xz = (fxij*zij);
    virial.yx = (fyij*xij);
    virial.yy = (fyij*yij);
    virial.yz = (fyij*zij);
    virial.zx = (fzij*xij);
    virial.zy = (fzij*yij);
    virial.zz = (fzij*zij);
#endif
  }
}
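
// The virial block above is the outer product of the pair force with the
// pair vector: virial_ab = f_a * r_b for a, b in {x, y, z}, hence the nine
// components. Summed over all bonds and combined with the other bonded and
// nonbonded terms, this gives the bonded contribution to the pressure tensor.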
#if defined(NAMD_HIP)
// Generic table lookup with linear interpolation (no texture-hardware path)
template <typename T>
__device__ __forceinline__
T tableLookup(const T* table, const int tableSize, const float k) {
  const float x = k * static_cast<float>(tableSize) - 0.5f;
  const float f = floorf(x);
  const float a = x - f;
  const int i = static_cast<int>(f);
  const int i0 = max(0, min(tableSize - 1, i));
  const int i1 = max(0, min(tableSize - 1, i + 1));
  return (1.0f - a) * __ldg(&table[i0]) + a * __ldg(&table[i1]);
}
#endif
//
// Calculates modified exclusions
//
template <typename T, bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const bool doSlow,
  const float one_scale14,
  const int vdwCoefTableWidth,
#if __CUDA_ARCH__ >= 350
  const float2* __restrict__ vdwCoefTable,
#else
  cudaTextureObject_t vdwCoefTableTex,
#endif
  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  double &energyVdw,
  T* __restrict__ forceNbond, double &energyNbond,
  T* __restrict__ forceSlow, double &energySlow,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStartsNbond, int* forceListStartsSlow,
  int* forceListNexts,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow) {
  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {

    float rinv = rsqrtf(r2);

    float qq = 0.0f;
    if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
    int vdwIndex = bl.vdwtypej*vdwCoefTableWidth + bl.vdwtypei;
#if __CUDA_ARCH__ >= 350
    float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
#else
    float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
#endif

#if defined(NAMD_CUDA)
    float4 fi = tex1D<float4>(forceTableTex, rinv);
#else // NAMD_HIP
    float4 fi = sampleTableTex(forceTableTex, rinv);
#endif
    float4 ei;
    if (doEnergy) {
#if defined NAMD_CUDA
      ei = tex1D<float4>(energyTableTex, rinv);
#else // NAMD_HIP
      ei = sampleTableTex(energyTableTex, rinv);
#endif
      energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
      if (doElect) {
        energyNbond += qq * ei.x;
        if (doSlow) energySlow += qq * ei.w;
      }
    }

    float fNbond;
    if (doElect) {
      fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
    } else {
      fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
    }
    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);

    float fSlow = 0.0f;
    if (doSlow && doElect) {
      fSlow = -qq * fi.w;
      T T_fxij, T_fyij, T_fzij;
      calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
      storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
      storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
    }
    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fNbond*xij;
      float fyij = fNbond*yij;
      float fzij = fNbond*zij;
      virialNbond.xx = (fxij*xij);
      virialNbond.xy = (fxij*yij);
      virialNbond.xz = (fxij*zij);
      virialNbond.yx = (fyij*xij);
      virialNbond.yy = (fyij*yij);
      virialNbond.yz = (fyij*zij);
      virialNbond.zx = (fzij*xij);
      virialNbond.zy = (fzij*yij);
      virialNbond.zz = (fzij*zij);
#endif
    }
    if (doVirial && doSlow && doElect) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }
  }
}
//
// Calculates exclusions. Here doSlow = true
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void exclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const float r2_delta, const int r2_delta_expc,
#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
#else
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,
#endif
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  T* __restrict__ forceSlow, double &energySlow,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStartsSlow, int* forceListNexts,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow) {
  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {
    r2 += r2_delta;

    union { float f; int i; } r2i;
    r2i.f = r2;
    int table_i = (r2i.i >> 17) + r2_delta_expc;
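
    // The union trick above builds a logarithmically spaced table index from
    // the float bits of r2: shifting the 32-bit pattern right by 17 keeps the
    // 8 exponent bits plus the top 6 mantissa bits, so neighboring table
    // slots cover ranges proportional to r2 itself, and r2_delta_expc
    // re-biases the exponent so the first slot corresponds to r2_delta.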
#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
    float r2_table_val = __ldg(&r2_table[table_i]);
#else
    float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
#endif

    float diffa = r2 - r2_table_val;
    float qq = xyzqi.w * xyzqj.w;

#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
    float4 slow = __ldg(&exclusionTable[table_i]);
#else
    float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
#endif

    if (doEnergy) {
      energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
    }
    float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);

    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }
  }
}
//
// Calculates angles
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void angleForce(
  const int index,
  const CudaAngle* __restrict__ angleList,
  const CudaAngleValue* __restrict__ angleValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaAngle al = angleList[index];

  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;

  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;

  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);

  float xijr = xij*rij_inv;
  float yijr = yij*rij_inv;
  float zijr = zij*rij_inv;
  float xkjr = xkj*rkj_inv;
  float ykjr = ykj*rkj_inv;
  float zkjr = zkj*rkj_inv;
  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
  CudaAngleValue angleValue = angleValues[al.itype];
  angleValue.k *= al.scale;

  float diff;
  if (angleValue.normal == 1) {
    // Restrict cos_theta to the interval [-0.999, 0.999]
    cos_theta = fminf(0.999f, fmaxf(-0.999f, cos_theta));
    float theta = acosf(cos_theta);
    diff = theta - angleValue.theta0;
  } else {
    diff = cos_theta - angleValue.theta0;
  }

  if (doEnergy) {
    energy += (double)(angleValue.k * diff * diff);
  }
  if (angleValue.normal == 1) {
    float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
    if (inv_sin_theta > 1.0e6) {
      diff = (diff < 0.0f) ? 1.0f : -1.0f;
    } else {
      diff *= -inv_sin_theta;
    }
  }
  diff *= -2.0f*angleValue.k;
  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);

  T T_dtxi, T_dtyi, T_dtzi;
  T T_dtxj, T_dtyj, T_dtzj;
  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
  T T_dtxk = -T_dtxi - T_dtxj;
  T T_dtyk = -T_dtyi - T_dtyj;
  T T_dtzk = -T_dtzi - T_dtzj;
  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  // Urey-Bradley term
  if (angleValue.k_ub) {
    float xik = xij - xkj;
    float yik = yij - ykj;
    float zik = zij - zkj;
    float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
    float rik = 1.0f/rik_inv;
    float diff_ub = rik - angleValue.r_ub;
    if (doEnergy) {
      energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
    }
    diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
    T T_dxik, T_dyik, T_dzik;
    calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
    T_dtxi += T_dxik;
    T_dtyi += T_dyik;
    T_dtzi += T_dzik;
    T_dtxj -= T_dxik;
    T_dtyj -= T_dyik;
    T_dtzj -= T_dzik;
  }

  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxi = ((float)T_dtxi)*force_to_float;
    float fyi = ((float)T_dtyi)*force_to_float;
    float fzi = ((float)T_dtzi)*force_to_float;
    float fxk = ((float)T_dtxj)*force_to_float;
    float fyk = ((float)T_dtyj)*force_to_float;
    float fzk = ((float)T_dtzj)*force_to_float;
    virial.xx = (fxi*xij) + (fxk*xkj);
    virial.xy = (fxi*yij) + (fxk*ykj);
    virial.xz = (fxi*zij) + (fxk*zkj);
    virial.yx = (fyi*xij) + (fyk*xkj);
    virial.yy = (fyi*yij) + (fyk*ykj);
    virial.yz = (fyi*zij) + (fyk*zkj);
    virial.zx = (fzi*xij) + (fzk*xkj);
    virial.zy = (fzi*yij) + (fzk*ykj);
    virial.zz = (fzi*zij) + (fzk*zkj);
#endif
  }
}
//
// Dihedral energy and force for one (possibly multi-term) dihedral type
//
template <bool doEnergy>
__forceinline__ __device__
void diheComp(const CudaDihedralValue* dihedralValues, int ic,
  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {

  df = 0.0f;
  if (doEnergy) e = 0.0;

  float phi = atan2f(sin_phi, cos_phi);

  bool lrep = true;
  while (lrep) {
    CudaDihedralValue dihedralValue = dihedralValues[ic];
    dihedralValue.k *= scale;

    // Low bit of n set => another term for the same dihedral follows
    lrep = (dihedralValue.n & 1);
    dihedralValue.n >>= 1;
    if (dihedralValue.n) {
      float nf = dihedralValue.n;
      float x = nf*phi - dihedralValue.delta;
      if (doEnergy) {
        float sin_x, cos_x;
        sincosf(x, &sin_x, &cos_x);
        e += (double)(dihedralValue.k*(1.0f + cos_x));
        df += (double)(nf*dihedralValue.k*sin_x);
      } else {
        float sin_x = sinf(x);
        df += (double)(nf*dihedralValue.k*sin_x);
      }
    } else {
      float diff = phi - dihedralValue.delta;
      if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
      if (diff >  (float)(M_PI)) diff -= (float)(2.0*M_PI);
      if (doEnergy) {
        e += (double)(dihedralValue.k*diff*diff);
      }
      df -= 2.0f*dihedralValue.k*diff;
    }
    ic++;
  }
}
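
// diheComp walks a run of consecutive CudaDihedralValue entries for one
// dihedral; the low bit of n flags "another term follows" and the remaining
// bits hold the periodicity. For n > 0 each term contributes the CHARMM
// Fourier form E = k*(1 + cos(n*phi - delta)), with df accumulating
// n*k*sin(n*phi - delta), i.e. -dE/dphi. For n == 0 the term is harmonic,
// E = k*diff^2 with diff = phi - delta wrapped into [-pi, pi], and
// df -= 2*k*diff, again the negative derivative.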
//
// Calculates dihedrals (impropers reuse the same function)
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void diheForce(
  const int index,
  const CudaDihedral* __restrict__ diheList,
  const CudaDihedralValue* __restrict__ dihedralValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaDihedral dl = diheList[index];

  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;

  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;

  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;

  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;

  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
  // A = rij x rjk, B = rlk x rjk
  float ax = yij*zjk - zij*yjk;
  float ay = zij*xjk - xij*zjk;
  float az = xij*yjk - yij*xjk;
  float bx = ylk*zjk - zlk*yjk;
  float by = zlk*xjk - xlk*zjk;
  float bz = xlk*yjk - ylk*xjk;

  float ra2 = ax*ax + ay*ay + az*az;
  float rb2 = bx*bx + by*by + bz*bz;
  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);

  float rgr = 1.0f / rg;
  float ra2r = 1.0f / ra2;
  float rb2r = 1.0f / rb2;
  float rabr = sqrtf(ra2r*rb2r);
  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);

  float df;
  double e;
  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);

  if (doEnergy) energy += e;
  float fg = xij*xjk + yij*yjk + zij*zjk;
  float hg = xlk*xjk + ylk*yjk + zlk*zjk;

  float fga = fg*ra2r*rgr;
  float hgb = hg*rb2r*rgr;
  float gaa = ra2r*rg;
  float gbb = rb2r*rg;
  // Fold the scalar derivative df into the geometric factors
  fga *= df;
  hgb *= df;
  gaa *= df;
  gbb *= df;

  T T_fix, T_fiy, T_fiz;
  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  T dgx, dgy, dgz;
  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
  T T_fjx = dgx - T_fix;
  T T_fjy = dgy - T_fiy;
  T T_fjz = dgz - T_fiz;
  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);

  T dhx, dhy, dhz;
  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
  T T_fkx = -dhx - dgx;
  T T_fky = -dhy - dgy;
  T T_fkz = -dhz - dgz;
  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fix = ((float)T_fix)*force_to_float;
    float fiy = ((float)T_fiy)*force_to_float;
    float fiz = ((float)T_fiz)*force_to_float;
    float fjx = ((float)dgx)*force_to_float;
    float fjy = ((float)dgy)*force_to_float;
    float fjz = ((float)dgz)*force_to_float;
    float flx = ((float)dhx)*force_to_float;
    float fly = ((float)dhy)*force_to_float;
    float flz = ((float)dhz)*force_to_float;
    virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
    virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
    virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
    virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
    virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
    virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
    virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
    virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
    virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
#endif
  }
}
__device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
  return make_float3(
    v1.y*v2.z - v2.y*v1.z,
    v2.x*v1.z - v1.x*v2.z,
    v1.x*v2.y - v2.x*v1.y);
}

__device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
}
//
// Calculates crossterms
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void crosstermForce(
  const int index,
  const CudaCrossterm* __restrict__ crosstermList,
  const CudaCrosstermValue* __restrict__ crosstermValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
  T* __restrict__ forceList, int* forceListCounter,
  int* forceListStarts, int* forceListNexts,
  ComputeBondedCUDAKernel::BondedVirial<double>& virial) {

  CudaCrossterm cl = crosstermList[index];
  // ----------------------------------------------------------------------------
  // Dihedral angle between atoms 1 - 2 - 3 - 4
  //
  // 1 - 2
  float3 sh12 = make_float3(
    cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
    cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
    cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);

  float4 xyzq1 = xyzq[cl.i1];
  float4 xyzq2 = xyzq[cl.i2];

  float3 r12 = make_float3(
    xyzq1.x + sh12.x - xyzq2.x,
    xyzq1.y + sh12.y - xyzq2.y,
    xyzq1.z + sh12.z - xyzq2.z);
  // 2 - 3
  float3 sh23 = make_float3(
    cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
    cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
    cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);

  float4 xyzq3 = xyzq[cl.i3];

  float3 r23 = make_float3(
    xyzq2.x + sh23.x - xyzq3.x,
    xyzq2.y + sh23.y - xyzq3.y,
    xyzq2.z + sh23.z - xyzq3.z);

  // 3 - 4
  float3 sh34 = make_float3(
    cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
    cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
    cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);

  float4 xyzq4 = xyzq[cl.i4];

  float3 r34 = make_float3(
    xyzq3.x + sh34.x - xyzq4.x,
    xyzq3.y + sh34.y - xyzq4.y,
    xyzq3.z + sh34.z - xyzq4.z);
  // Calculate the cross products
  float3 A = cross(r12, r23);
  float3 B = cross(r23, r34);
  float3 C = cross(r23, A);

  // Calculate the inverse distances
  float inv_rA = rsqrtf(dot(A, A));
  float inv_rB = rsqrtf(dot(B, B));
  float inv_rC = rsqrtf(dot(C, C));

  // Calculate the sin and cos
  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
  float sin_phi = dot(C, B)*(inv_rC*inv_rB);

  float phi = -atan2f(sin_phi, cos_phi);
  // ----------------------------------------------------------------------------
  // Dihedral angle between atoms 5 - 6 - 7 - 8
  //
  // 5 - 6
  float3 sh56 = make_float3(
    cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
    cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
    cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);

  float4 xyzq5 = xyzq[cl.i5];
  float4 xyzq6 = xyzq[cl.i6];

  float3 r56 = make_float3(
    xyzq5.x + sh56.x - xyzq6.x,
    xyzq5.y + sh56.y - xyzq6.y,
    xyzq5.z + sh56.z - xyzq6.z);

  // 6 - 7
  float3 sh67 = make_float3(
    cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
    cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
    cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);

  float4 xyzq7 = xyzq[cl.i7];

  float3 r67 = make_float3(
    xyzq6.x + sh67.x - xyzq7.x,
    xyzq6.y + sh67.y - xyzq7.y,
    xyzq6.z + sh67.z - xyzq7.z);

  // 7 - 8
  float3 sh78 = make_float3(
    cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
    cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
    cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);

  float4 xyzq8 = xyzq[cl.i8];

  float3 r78 = make_float3(
    xyzq7.x + sh78.x - xyzq8.x,
    xyzq7.y + sh78.y - xyzq8.y,
    xyzq7.z + sh78.z - xyzq8.z);
  // Calculate the cross products
  float3 D = cross(r56, r67);
  float3 E = cross(r67, r78);
  float3 F = cross(r67, D);

  // Calculate the inverse distances
  float inv_rD = rsqrtf(dot(D, D));
  float inv_rE = rsqrtf(dot(E, E));
  float inv_rF = rsqrtf(dot(F, F));

  // Calculate the sin and cos
  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
  float sin_psi = dot(F, E)*(inv_rF*inv_rE);

  float psi = -atan2f(sin_psi, cos_psi);
  // Map angles to [0, 2*pi) and convert to CMAP grid coordinates;
  // inv_h is the inverse grid spacing (grid points per radian), defined above
  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);

  float phi_h = phi * inv_h;
  float psi_h = psi * inv_h;

  // Lowest-corner grid indices of the patch containing (phi, psi)
  int iphi = (int)floor(phi_h);
  int ipsi = (int)floor(psi_h);
  const CudaCrosstermValue& cp = crosstermValues[cl.itype];

  float4 c[4];
  c[0] = cp.c[iphi][ipsi][0];
  c[1] = cp.c[iphi][ipsi][1];
  c[2] = cp.c[iphi][ipsi][2];
  c[3] = cp.c[iphi][ipsi][3];

  float dphi = phi_h - (float)iphi;
  float dpsi = psi_h - (float)ipsi;
  if (doEnergy) {
    float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
    energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
    energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
    energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
    energy += energyf*cl.scale;
  }
  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) ));
  dEdphi *= cl.scale*inv_h;

  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) ));
  dEdpsi *= cl.scale*inv_h;
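
  // The Horner chains above evaluate one bicubic CMAP patch: with local cell
  // coordinates (dphi, dpsi) in [0,1), the energy is the double polynomial
  // E = sum over m,n = 0..3 of C[n][m] * dphi^m * dpsi^n, where C[n][m] is
  // component m (x,y,z,w) of c[n]. dEdphi and dEdpsi are the analytic partial
  // derivatives of that polynomial, converted from grid units to radians by
  // inv_h and scaled by the crossterm weight cl.scale.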
  float square_r23 = dot(r23, r23);
  float norm_r23 = sqrtf(square_r23);
  float inv_square_r23 = 1.0f/square_r23;
  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
  float ff2 = -dot(r12, r23)*inv_square_r23;
  float ff3 = -dot(r34, r23)*inv_square_r23;
  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;

  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
  float3 f2 = make_float3( t1.x - f1.x,  t1.y - f1.y,  t1.z - f1.z);
  float3 f3 = make_float3(-t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
  T T_f1x, T_f1y, T_f1z;
  T T_f2x, T_f2y, T_f2z;
  T T_f3x, T_f3y, T_f3z;
  T T_f4x, T_f4y, T_f4z;
  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  float square_r67 = dot(r67, r67);
  float norm_r67 = sqrtf(square_r67);
  float inv_square_r67 = 1.0f/(square_r67);
  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
  ff2 = -dot(r56, r67)*inv_square_r67;
  ff3 = -dot(r78, r67)*inv_square_r67;
  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;

  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
  float3 f6 = make_float3( t2.x - f5.x,  t2.y - f5.y,  t2.z - f5.z );
  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
  T T_f5x, T_f5y, T_f5z;
  T T_f6x, T_f6y, T_f6z;
  T T_f7x, T_f7y, T_f7z;
  T T_f8x, T_f8y, T_f8z;
  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
    float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
    virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
    virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
    virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
    virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
    virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
    virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
    virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
    virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
    virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
#endif
  }
}
#define BONDEDFORCESKERNEL_NUM_WARP 4
//
// Calculates all bonded forces in a single kernel call
//
template <typename T, bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel(
  const int start,

  const int numBonds,
  const CudaBond* __restrict__ bonds,
  const CudaBondValue* __restrict__ bondValues,

  const int numAngles,
  const CudaAngle* __restrict__ angles,
  const CudaAngleValue* __restrict__ angleValues,

  const int numDihedrals,
  const CudaDihedral* __restrict__ dihedrals,
  const CudaDihedralValue* __restrict__ dihedralValues,

  const int numImpropers,
  const CudaDihedral* __restrict__ impropers,
  const CudaDihedralValue* __restrict__ improperValues,

  const int numExclusions,
  const CudaExclusion* __restrict__ exclusions,

  const int numCrossterms,
  const CudaCrossterm* __restrict__ crossterms,
  const CudaCrosstermValue* __restrict__ crosstermValues,

  const float cutoff2,
  const float r2_delta, const int r2_delta_expc,

  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,

  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force,
  T* __restrict__ forceSlow,
  T* __restrict__ forceList,
  int* forceListCounter,
  int* forceListStarts,
  int* forceListStartsSlow,
  int* forceListNexts,
  double* __restrict__ energies_virials) {
  // Each thread block computes tuples of a single type; indexTB selects which
  int indexTB = start + blockIdx.x;

  const int numBondsTB     = (numBonds + blockDim.x - 1)/blockDim.x;
  const int numAnglesTB    = (numAngles + blockDim.x - 1)/blockDim.x;
  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
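
  // Example of this 1-D partitioning: with blockDim.x = 128, 1000 bonds and
  // 500 angles give numBondsTB = 8 and numAnglesTB = 4, so blocks 0..7 handle
  // bonds and blocks 8..11 handle angles; the chain of "indexTB -= ..." tests
  // below walks the concatenated ranges to find which tuple type this block
  // owns.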
  // Per-thread energy and virial accumulators
  double energy = 0.0;
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double> virial;
  virial.xx = 0.0; virial.xy = 0.0; virial.xz = 0.0;
  virial.yx = 0.0; virial.yy = 0.0; virial.yz = 0.0;
  virial.zx = 0.0; virial.zy = 0.0; virial.zz = 0.0;
#endif
  if (indexTB < numBondsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (i < numBonds) {
      bondForce<T, doEnergy, doVirial>
      (i, bonds, bondValues, xyzq,
        stride, lata, latb, latc, force, energy,
        forceList, forceListCounter, forceListStarts, forceListNexts,
        virial);
    }
    goto reduce;
  }
  indexTB -= numBondsTB;
  if (indexTB < numAnglesTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (i < numAngles) {
      angleForce<T, doEnergy, doVirial>
      (i, angles, angleValues, xyzq, stride,
        lata, latb, latc, force, energy,
        forceList, forceListCounter, forceListStarts, forceListNexts,
        virial);
    }
    goto reduce;
  }
  indexTB -= numAnglesTB;
  if (indexTB < numDihedralsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (i < numDihedrals) {
      diheForce<T, doEnergy, doVirial>
      (i, dihedrals, dihedralValues, xyzq, stride,
        lata, latb, latc, force, energy,
        forceList, forceListCounter, forceListStarts, forceListNexts,
        virial);
    }
    goto reduce;
  }
  indexTB -= numDihedralsTB;
  if (indexTB < numImpropersTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (i < numImpropers) {
      diheForce<T, doEnergy, doVirial>
      (i, impropers, improperValues, xyzq, stride,
        lata, latb, latc, force, energy,
        forceList, forceListCounter, forceListStarts, forceListNexts,
        virial);
    }
    goto reduce;
  }
  indexTB -= numImpropersTB;
  if (indexTB < numExclusionsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (i < numExclusions) {
      exclusionForce<T, doEnergy, doVirial>
      (i, exclusions, r2_delta, r2_delta_expc,
#if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
        r2_table, exclusionTable,
#else
        r2_table_tex, exclusionTableTex,
#endif
        xyzq, stride, lata, latb, latc, cutoff2,
        forceSlow, energy,
        forceList, forceListCounter, forceListStartsSlow, forceListNexts,
        virial);
    }
    goto reduce;
  }
  indexTB -= numExclusionsTB;
  if (indexTB < numCrosstermsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (i < numCrossterms) {
      crosstermForce<T, doEnergy, doVirial>
      (i, crossterms, crosstermValues,
        xyzq, stride, lata, latb, latc, force, energy,
        forceList, forceListCounter, forceListStarts, forceListNexts,
        virial);
    }
    goto reduce;
  }
  reduce:

  // Reduce the energy across the block and write it to global memory
  if (doEnergy) {
    __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=WARPSIZE/2;i >= 1;i/=2) {
      energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shEnergy[warpID] = energy;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
#pragma unroll
      for (int i=WARPSIZE/2;i >= 1;i/=2) {
        energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
      }
      if (laneID == 0) {
        // ... (the reduced energy is atomically accumulated into
        //      energies_virials at this tuple type's energy slot)
      }
    }
  }
  // Reduce the virial across the block and write it to global memory
#ifdef WRITE_FULL_VIRIALS
  if (doVirial) {
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
    for (int i=WARPSIZE/2;i >= 1;i/=2) {
      virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
      virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
      virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
      virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
      virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
      virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
      virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
      virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
      virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shVirial[warpID] = virial;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
        virial = shVirial[laneID];
      }
      for (int i=WARPSIZE/2;i >= 1;i/=2) {
        // ... (repeat the nine-component shuffle reduction above)
      }
      if (laneID == 0) {
#ifdef USE_FP_VIRIAL
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
#else
        // ... (plain double atomicAdd of the nine components)
#endif
      }
    }
  }
#endif // WRITE_FULL_VIRIALS
}
template <typename T, bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel(
  const int start,
  const int numModifiedExclusions,
  const CudaExclusion* __restrict__ modifiedExclusions,
  const bool doSlow,
  const float one_scale14,
  const float cutoff2,
  const int vdwCoefTableWidth,
  const float2* __restrict__ vdwCoefTable,
  cudaTextureObject_t vdwCoefTableTex,
  cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
  T* __restrict__ forceList,
  int* forceListCounter,
  int* forceListStartsNbond,
  int* forceListStartsSlow,
  int* forceListNexts,
  double* __restrict__ energies_virials) {
  // Each thread computes a single modified exclusion
  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;

  double energyVdw, energyNbond, energySlow;
  if (doEnergy) {
    energyVdw = 0.0;
    energyNbond = 0.0;
    energySlow = 0.0;
  }
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
  if (doVirial) {
    virialNbond.xx = 0.0;
    virialNbond.xy = 0.0;
    virialNbond.xz = 0.0;
    virialNbond.yx = 0.0;
    virialNbond.yy = 0.0;
    virialNbond.yz = 0.0;
    virialNbond.zx = 0.0;
    virialNbond.zy = 0.0;
    virialNbond.zz = 0.0;
    if (doSlow) {
      virialSlow.xx = 0.0;
      virialSlow.xy = 0.0;
      virialSlow.xz = 0.0;
      virialSlow.yx = 0.0;
      virialSlow.yy = 0.0;
      virialSlow.yz = 0.0;
      virialSlow.zx = 0.0;
      virialSlow.zy = 0.0;
      virialSlow.zz = 0.0;
    }
  }
#endif
  if (i < numModifiedExclusions)
  {
    modifiedExclusionForce<T, doEnergy, doVirial, doElect>
    (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
#if __CUDA_ARCH__ >= 350
      vdwCoefTable,
#else
      vdwCoefTableTex,
#endif
      modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
      xyzq, stride, lata, latb, latc, cutoff2,
      energyVdw, forceNbond, energyNbond,
      forceSlow, energySlow,
      forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
      virialNbond, virialSlow);
  }
  // Reduce the energies across the block and write them to global memory
  if (doEnergy) {
    __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ double shEnergyNbond[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ double shEnergySlow[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=WARPSIZE/2;i >= 1;i/=2) {
      energyVdw   += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw,   i, WARPSIZE);
      energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
      energySlow  += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow,  i, WARPSIZE);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shEnergyVdw[warpID] = energyVdw;
      shEnergyNbond[warpID] = energyNbond;
      shEnergySlow[warpID] = energySlow;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      energyVdw   = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
      energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
      energySlow  = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
#pragma unroll
      for (int i=WARPSIZE/2;i >= 1;i/=2) {
        energyVdw   += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw,   i, WARPSIZE);
        energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
        energySlow  += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow,  i, WARPSIZE);
      }
      if (laneID == 0) {
        // ... (the reduced energies are atomically accumulated into
        //      energies_virials here)
      }
    }
  }
  // Reduce the virials across the block and write them to global memory
#ifdef WRITE_FULL_VIRIALS
  if (doVirial) {
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[BONDEDFORCESKERNEL_NUM_WARP];
    for (int i=WARPSIZE/2;i >= 1;i/=2) {
      virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
      // ... (likewise for the remaining eight components)
      if (doElect && doSlow) {
        virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
        // ... (likewise for the remaining eight components)
      }
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shVirialNbond[warpID] = virialNbond;
      if (doElect && doSlow) {
        shVirialSlow[warpID] = virialSlow;
      }
    }
    BLOCK_SYNC;

    virialNbond.xx = 0.0;
    virialNbond.xy = 0.0;
    virialNbond.xz = 0.0;
    virialNbond.yx = 0.0;
    virialNbond.yy = 0.0;
    virialNbond.yz = 0.0;
    virialNbond.zx = 0.0;
    virialNbond.zy = 0.0;
    virialNbond.zz = 0.0;
    if (doElect && doSlow) {
      virialSlow.xx = 0.0;
      virialSlow.xy = 0.0;
      virialSlow.xz = 0.0;
      virialSlow.yx = 0.0;
      virialSlow.yy = 0.0;
      virialSlow.yz = 0.0;
      virialSlow.zx = 0.0;
      virialSlow.zy = 0.0;
      virialSlow.zz = 0.0;
    }
    if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
      virialNbond = shVirialNbond[laneID];
      if (doElect && doSlow) {
        virialSlow = shVirialSlow[laneID];
      }
    }

    if (warpID == 0) {
      for (int i=WARPSIZE/2;i >= 1;i/=2) {
        virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
        // ... (likewise for the remaining eight components)
        if (doElect && doSlow) {
          virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
          // ... (likewise for the remaining eight components)
        }
      }
      if (laneID == 0) {
#ifdef USE_FP_VIRIAL
        // ... (fixed-point accumulation of virialNbond into energies_virials)
        if (doElect && doSlow) {
          // ... (fixed-point accumulation of virialSlow)
        }
#else
        // ... (plain double atomicAdd of virialNbond)
        if (doElect && doSlow) {
          // ... (plain double atomicAdd of virialSlow)
        }
#endif
      }
    }
  }
#endif // WRITE_FULL_VIRIALS
}
//
// Gathers forces from the linked lists into the final force array
//
template <typename T>
__global__ void gatherBondedForcesKernel(
  const int start,
  const int atomStorageSize,
  const int stride,
  const T* __restrict__ forceList,
  const int* __restrict__ forceListStarts,
  const int* __restrict__ forceListNexts,
  T* __restrict__ force) {

  int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;

  if (i < atomStorageSize) {
    T fx = 0;
    T fy = 0;
    T fz = 0;
    int pos = forceListStarts[i];
    while (pos != -1) {
      fx += forceList[pos * 3 + 0];
      fy += forceList[pos * 3 + 1];
      fz += forceList[pos * 3 + 2];
      pos = forceListNexts[pos];
    }
    force[i             ] = fx;
    force[i + stride    ] = fy;
    force[i + stride * 2] = fz;
  }
}
__global__ void reduceBondedBinsKernel(double* energies_virials) {
  const int bin = threadIdx.x;

  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
  __shared__ typename WarpReduce::TempStorage tempStorage;

  double v = WarpReduce(tempStorage).Sum(energies_virials[blockIdx.x*ATOMIC_BINS + bin]);
  if (threadIdx.x == 0) {
    energies_virials[blockIdx.x] = v;
  }
}
//
// Class constructor
//
ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
  deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {

  cudaCheck(cudaSetDevice(deviceID));

  numBonds = 0;
  numAngles = 0;
  numDihedrals = 0;
  numImpropers = 0;
  numModifiedExclusions = 0;
  numExclusions = 0;
  numCrossterms = 0;

  bondValues = NULL;
  angleValues = NULL;
  dihedralValues = NULL;
  improperValues = NULL;
  crosstermValues = NULL;

  tupleData = NULL;
  tupleDataSize = 0;

  xyzq = NULL;
  xyzqSize = 0;

  forces = NULL;
  forcesSize = 0;

  forceList = NULL;
  forceListStarts = NULL;
  forceListNexts = NULL;
  forceListSize = 0;
  forceListStartsSize = 0;
  forceListNextsSize = 0;
  allocate_device<int>(&forceListCounter, 1);

  allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
}
//
// Class destructor
//
ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
  cudaCheck(cudaSetDevice(deviceID));

  deallocate_device<double>(&energies_virials);

  if (tupleData != NULL) deallocate_device<char>(&tupleData);
  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);

  if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
  if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
  if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
  if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);

  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
}
void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
  allocate_device<CudaBondValue>(&bondValues, numBondValues);
  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
}

void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
}

void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
}

void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
}

void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
}
//
// Update the bonded tuple lists
//
void ComputeBondedCUDAKernel::update(
  const int numBondsIn,
  const int numAnglesIn,
  const int numDihedralsIn,
  const int numImpropersIn,
  const int numModifiedExclusionsIn,
  const int numExclusionsIn,
  const int numCrosstermsIn,
  const char* h_tupleData,
  cudaStream_t stream) {

  numBonds = numBondsIn;
  numAngles = numAnglesIn;
  numDihedrals = numDihedralsIn;
  numImpropers = numImpropersIn;
  numModifiedExclusions = numModifiedExclusionsIn;
  numExclusions = numExclusionsIn;
  numCrossterms = numCrosstermsIn;

  int numBondsWA     = warpAlign(numBonds);
  int numAnglesWA    = warpAlign(numAngles);
  int numDihedralsWA = warpAlign(numDihedrals);
  int numImpropersWA = warpAlign(numImpropers);
  int numModifiedExclusionsWA = warpAlign(numModifiedExclusions);
  int numExclusionsWA = warpAlign(numExclusions);
  int numCrosstermsWA = warpAlign(numCrossterms);

  int sizeTot = numBondsWA*sizeof(CudaBond) + numAnglesWA*sizeof(CudaAngle) +
    numDihedralsWA*sizeof(CudaDihedral) + numImpropersWA*sizeof(CudaDihedral) +
    numModifiedExclusionsWA*sizeof(CudaExclusion) + numExclusionsWA*sizeof(CudaExclusion) +
    numCrosstermsWA*sizeof(CudaCrossterm);

  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, 1.4f);
  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);

  // Setup pointers into the tuple data
  int pos = 0;
  bonds = (CudaBond *)&tupleData[pos];
  pos += numBondsWA*sizeof(CudaBond);
  // ... (the remaining tuple pointers are set up the same way)
}
int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
#ifdef USE_STRIDED_FORCE
  // Align stride to 256 bytes
  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
#else
  return 1;
#endif
}

int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
#ifdef USE_STRIDED_FORCE
  return (3*getForceStride(atomStorageSize));
#else
  return (3*atomStorageSize);
#endif
}
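
// Force buffers use a planar layout: component c of atom i lives at
// force[i + c*stride], so one buffer holds three stride-separated x/y/z
// planes. getForceSize returns the length of one such 3-plane block;
// getAllForceSize (below) multiplies it by 2 or 3 when separate "nbond" and
// "slow" blocks are appended for the exclusion kernels.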
int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
  int forceSize = getForceSize(atomStorageSize);
  if (numModifiedExclusions > 0 || numExclusions > 0) {
    forceSize *= (doSlow) ? 3 : 2;
  }
  return forceSize;
}
//
// Computes the bonded forces
//
void ComputeBondedCUDAKernel::bondedForce(
  const double scale14, const int atomStorageSize,
  const bool doEnergy, const bool doVirial, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2, const float r2_delta, const int r2_delta_expc,
  const float4* h_xyzq, FORCE_TYPE* h_forces,
  double *h_energies_virials,
  cudaStream_t stream) {

  int forceStorageSize = getAllForceSize(atomStorageSize, true);
  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
  int forceStride = getForceStride(atomStorageSize);

  int forceSize = getForceSize(atomStorageSize);
  int startNbond = forceSize;
  int startSlow = 2*forceSize;

  // Re-allocate coordinate and force arrays if necessary
  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
#if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
  // Maximum number of force-list entries: each tuple stores one contribution
  // per participating atom (bonds 2, angles 3, dihedrals and impropers 4,
  // exclusions 2, crossterms 8, modified exclusions 4)
  int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4);
  reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
  reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
  reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
  int* forceListStartsNbond = forceListStarts + atomStorageSize;
  int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;

  clear_device_array<int>(forceListCounter, 1, stream);
  cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
#else
  int* forceListStartsNbond = NULL;
  int* forceListStartsSlow = NULL;
#endif
  // Copy coordinates to device
  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);

  // Clear force and accumulator arrays
#if defined(USE_BONDED_FORCE_ATOMIC_STORE)
  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
#endif
  if (doEnergy || doVirial) {
    clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
  }
  float one_scale14 = (float)(1.0 - scale14);

  // If doSlow = false, these exclusions are not calculated
  int numExclusionsDoSlow = doSlow ? numExclusions : 0;
  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
  int numBondsTB = (numBonds + nthread - 1)/nthread;
  int numAnglesTB = (numAngles + nthread - 1)/nthread;
  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
  int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
  int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;

  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
    numExclusionsTB + numCrosstermsTB;
  int shmem_size = 0;
  int max_nblock = deviceCUDA->getMaxNumBlocks();

  int start = 0;
  while (start < nblock)
  {
    int nleft = nblock - start;
    int nblock_use = min(max_nblock, nleft);
#if defined(NAMD_HIP)
#define NONBONDEDTABLES cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable()
#else
#define NONBONDEDTABLES cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
  cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex()
#endif
#if defined(NAMD_HIP)
#define CALL(DOENERGY, DOVIRIAL) \
    bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
    (start, numBonds, bonds, bondValues, \
    numAngles, angles, angleValues, \
    numDihedrals, dihedrals, dihedralValues, \
    numImpropers, impropers, improperValues, \
    numExclusionsDoSlow, exclusions, \
    numCrossterms, crossterms, crosstermValues, \
    cutoff2, \
    r2_delta, r2_delta_expc, \
    cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
    xyzq, forceStride, \
    lata, latb, latc, \
    forces, &forces[startSlow], \
    forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
    energies_virials);
#else
#define CALL(DOENERGY, DOVIRIAL) \
    bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
    (start, numBonds, bonds, bondValues, \
    numAngles, angles, angleValues, \
    numDihedrals, dihedrals, dihedralValues, \
    numImpropers, impropers, improperValues, \
    numExclusionsDoSlow, exclusions, \
    numCrossterms, crossterms, crosstermValues, \
    cutoff2, \
    r2_delta, r2_delta_expc, \
    cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
    cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
    xyzq, forceStride, \
    lata, latb, latc, \
    forces, &forces[startSlow], \
    forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
    energies_virials);
#endif
    if (!doEnergy && !doVirial) CALL(0, 0);
    if (!doEnergy &&  doVirial) CALL(0, 1);
    if ( doEnergy && !doVirial) CALL(1, 0);
    if ( doEnergy &&  doVirial) CALL(1, 1);
#undef CALL
    cudaCheck(cudaGetLastError());

    start += nblock_use;
  }
  // Modified exclusions
  nblock = (numModifiedExclusions + nthread - 1)/nthread;

  bool doElect = (one_scale14 == 0.0f) ? false : true;
  start = 0;
  while (start < nblock)
  {
    int nleft = nblock - start;
    int nblock_use = min(max_nblock, nleft);
#define CALL(DOENERGY, DOVIRIAL, DOELECT) \
    modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
    <<< nblock_use, nthread, shmem_size, stream >>> (\
    start, numModifiedExclusions, modifiedExclusions, \
    doSlow, one_scale14, cutoff2, \
    cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
    cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
    cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
    xyzq, forceStride, lata, latb, latc, \
    &forces[startNbond], &forces[startSlow], \
    forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
    energies_virials);
    if (!doEnergy && !doVirial && !doElect) CALL(0, 0, 0);
    if (!doEnergy &&  doVirial && !doElect) CALL(0, 1, 0);
    if ( doEnergy && !doVirial && !doElect) CALL(1, 0, 0);
    if ( doEnergy &&  doVirial && !doElect) CALL(1, 1, 0);

    if (!doEnergy && !doVirial &&  doElect) CALL(0, 0, 1);
    if (!doEnergy &&  doVirial &&  doElect) CALL(0, 1, 1);
    if ( doEnergy && !doVirial &&  doElect) CALL(1, 0, 1);
    if ( doEnergy &&  doVirial &&  doElect) CALL(1, 1, 1);
#undef CALL
    cudaCheck(cudaGetLastError());

    start += nblock_use;
  }
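
// The CALL macro instantiates a fully specialized kernel for every
// (doEnergy, doVirial, doElect) combination, so the template booleans are
// compile-time constants inside the kernel: the energy and virial code is
// compiled out of the force-only instantiations instead of being branched on
// per thread at run time.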
#if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
  // Gather the forces from the linked lists into the final force arrays
  nblock = (atomStorageSize + nthread - 1)/nthread;
  start = 0;
  while (start < nblock)
  {
    int nleft = nblock - start;
    int nblock_use = min(max_nblock, nleft);

    gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
      start, atomStorageSize, forceStride,
      forceList, forceListStarts, forceListNexts,
      forces);
    gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
      start, atomStorageSize, forceStride,
      forceList, forceListStartsNbond, forceListNexts,
      &forces[startNbond]);
    if (doSlow) {
      gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
        start, atomStorageSize, forceStride,
        forceList, forceListStartsSlow, forceListNexts,
        &forces[startSlow]);
    }
    cudaCheck(cudaGetLastError());

    start += nblock_use;
  }
#endif
  // Copy forces to host
  copy_DtoH<FORCE_TYPE>(forces, h_forces, forceCopySize, stream);

  if (doEnergy || doVirial) {
    if (ATOMIC_BINS > 1) {
      // Reduce the per-bin partial sums down to a single value per entry
      reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
    }
    copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
  }
}