#if __CUDACC_VER_MAJOR__ >= 11
#include <namd_cub/cub.cuh>
#define __thread __declspec(thread)
#define OVERALLOC 1.2f
#define MAX_CONST_EXCLUSIONS 2048  // cache size is 8k
__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
#define NONBONDKERNEL_NUM_WARP 4
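// Each thread block runs NONBONDKERNEL_NUM_WARP warps; a warp processes one
// tile list at a time, so threadIdx.x is split below into a lane index (wid)
// and a warp index within the block (iwarp).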
template <bool doEnergy, bool doSlow>
__device__ __forceinline__
void calcForceEnergy(
  const float r2, const float qi, const float qj,
  const float dx, const float dy, const float dz,
  const int vdwtypei, const int vdwtypej,
  const float2* __restrict__ vdwCoefTable,
  cudaTextureObject_t vdwCoefTableTex,
  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
  float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
  float& energyVdw, float& energyElec, float& energySlow) {
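  // ljab holds the Lennard-Jones (A,B) coefficient pair for this type pair;
  // vdwtypei is assumed to be pre-scaled by vdwCoefTableWidth, so the sum
  // below indexes vdwCoefTable directly. fi/ei are force/energy table texels
  // (fetches elided here) whose x/y/z/w lanes are taken to hold the
  // electrostatic, VdW r^-6, VdW r^-12, and slow (long-range correction)
  // interpolants, respectively.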
  int vdwIndex = vdwtypej + vdwtypei;
#if __CUDA_ARCH__ >= 350
  float rinv = rsqrtf(r2);
  float fSlow = qi * qj;
  float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
  energyVdw += ljab.x * ei.z + ljab.y * ei.y;
  energyElec += fSlow * ei.x;
  if (doSlow) energySlow += fSlow * ei.w;
  if (doSlow) fSlow *= fi.w;
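  // When doSlow is set, fi.w rescales the bare charge product into the slow
  // (long-range electrostatic) force component, which is accumulated into the
  // separate iforceSlow/jforceSlow buffers below.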
    float fxSlow = dx * fSlow;
    float fySlow = dy * fSlow;
    float fzSlow = dz * fSlow;
    iforceSlow.x += fxSlow;
    iforceSlow.y += fySlow;
    iforceSlow.z += fzSlow;
    jforceSlow.x -= fxSlow;
    jforceSlow.y -= fySlow;
    jforceSlow.z -= fzSlow;
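
// The storeForces overloads below accumulate per-atom forces into, in turn,
// an AOS float4 array, SOA float arrays, and a float3 array; all of them use
// atomicAdd because several warps may contribute to the same atom.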
template <bool doSlow>
__device__ __forceinline__
void storeForces(const int pos, const float3 force, const float3 forceSlow,
  float4* __restrict__ devForces, float4* __restrict__ devForcesSlow) {
  atomicAdd(&devForces[pos].x, force.x);
  atomicAdd(&devForces[pos].y, force.y);
  atomicAdd(&devForces[pos].z, force.z);
  if (doSlow) {
    atomicAdd(&devForcesSlow[pos].x, forceSlow.x);
    atomicAdd(&devForcesSlow[pos].y, forceSlow.y);
    atomicAdd(&devForcesSlow[pos].z, forceSlow.z);
  }
}

template <bool doSlow>
__device__ __forceinline__
void storeForces(const int pos, const float3 force, const float3 forceSlow,
  float* __restrict__ devForces_x,
  float* __restrict__ devForces_y,
  float* __restrict__ devForces_z,
  float* __restrict__ devForcesSlow_x,
  float* __restrict__ devForcesSlow_y,
  float* __restrict__ devForcesSlow_z) {
  atomicAdd(&devForces_x[pos], force.x);
  atomicAdd(&devForces_y[pos], force.y);
  atomicAdd(&devForces_z[pos], force.z);
  if (doSlow) {
    atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
    atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
    atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
  }
}

template <bool doSlow>
__device__ __forceinline__
void storeForces(const int pos, const float3 force, const float3 forceSlow,
  float3* __restrict__ forces, float3* __restrict__ forcesSlow) {
  atomicAdd(&forces[pos].x, force.x);
  atomicAdd(&forces[pos].y, force.y);
  atomicAdd(&forces[pos].z, force.z);
  if (doSlow) {
    atomicAdd(&forcesSlow[pos].x, forceSlow.x);
    atomicAdd(&forcesSlow[pos].y, forceSlow.y);
    atomicAdd(&forcesSlow[pos].z, forceSlow.z);
  }
}
template <bool doPairlist>
__device__ __forceinline__
void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex,
  int& jexclMaxdiff, int& jexclIndex) {

template <bool doPairlist>
__device__ __forceinline__
void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex) {

template <bool doSlow>
__device__ __forceinline__
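// Distance-squared from bounding box a (center x/y/z, half-widths wx/wy/wz)
// to point b: each axis contribution is clamped at zero inside the box, so
// the result lower-bounds every atom-pair distance against this tile.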
__device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
  float r2 = dx*dx + dy*dy + dz*dz;
  return r2;
}
#define LARGE_FLOAT (float)(1.0e10)

template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
__global__ void
__launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
  doPairlist ? (10) : (doEnergy ? (10) : (10))
)
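// Note: all three branches of the occupancy hint currently request the same
// minimum of 10 blocks per multiprocessor; the ternary is kept so the bound
// can be tuned per specialization.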
nonbondedForceKernel(
  const float3 lata, const float3 latb, const float3 latc,
  const float4* __restrict__ xyzq, const float cutoff2,
#ifdef USE_NEW_EXCL_METHOD
  const int2* __restrict__ minmaxExclAtom,
  if (itileList < numTileLists)
  {
    float energyVdw, energyElec, energySlow;
    unsigned int itileListLen;
    const int wid = threadIdx.x % WARPSIZE;
    const int iwarp = threadIdx.x / WARPSIZE;
    bool zeroShift = !(shx*shx + shy*shy + shz*shz > 0);
    int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
    float4 xyzq_i = xyzq[iatomStart + wid];
    boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
    boundingBoxI.x += shx;
    boundingBoxI.y += shy;
    boundingBoxI.z += shz;
#ifdef USE_NEW_EXCL_METHOD
    int iatomIndex, minExclAtom, maxExclAtom;
#endif
#ifdef USE_NEW_EXCL_METHOD
    iatomIndex = atomIndex[iatomStart + wid];
    int2 tmp = minmaxExclAtom[iatomStart + wid];
    minExclAtom = tmp.x; maxExclAtom = tmp.y;
#else
    iatomIndex = atomIndex[iatomStart + wid];
#endif
    if (doSlow) energySlow = 0.0f;
    if (doPairlist) nexcluded = 0;
    int nloopi = min(iatomSize - iatomStart, WARPSIZE);
    nfreei = max(iatomFreeSize - iatomStart, 0);
    int iexclIndex, iexclMaxdiff;
    int2 tmp = exclIndexMaxDiff[iatomStart + wid];
    iexclIndex = tmp.x;
    iexclMaxdiff = tmp.y;
    for (int jtile = jtileStart; jtile <= jtileEnd; jtile++) {
      int jatomStart = tileJatomStart[jtile];
      float4 xyzq_j = xyzq[jatomStart + wid];
      float r2bb = distsq(boundingBoxI, xyzq_j);
      unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
      int vdwtypej = vdwTypes[jatomStart + wid];
      s_vdwtypej[iwarp][wid] = vdwtypej;
      s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
      int nloopj = min(jatomSize - jatomStart, WARPSIZE);
      nfreej = max(jatomFreeSize - jatomStart, 0);
      s_xyzq[iwarp][wid] = xyzq_j;
      const bool self = zeroShift && (iatomStart == jatomStart);
      s_jforce[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
      s_jforceSlow[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
      int t = (self) ? 1 : 0;
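      // On self tiles (same tile on both sides of the pair) t starts at 1,
      // so the diagonal j == wid self interaction is never evaluated.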
        int j = (0 + wid) & modval;
        xyzq_j = s_xyzq[iwarp][j];
        float dx = xyzq_j.x - xyzq_i.x;
        float dy = xyzq_j.y - xyzq_i.y;
        float dz = xyzq_j.z - xyzq_i.z;
        float r2 = dx*dx + dy*dy + dz*dz;
        if (j < WARPSIZE && r2 < plcutoff2) {

        int j = (t + wid) & modval;
        xyzq_j = s_xyzq[iwarp][j];
        float dx = xyzq_j.x - xyzq_i.x;
        float dy = xyzq_j.y - xyzq_i.y;
        float dz = xyzq_j.z - xyzq_i.z;
        float r2 = dx*dx + dy*dy + dz*dz;
        if (j < nfreej || wid < nfreei) {
          bool excluded = false;
          int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
          if (abs(indexdiff) <= iexclMaxdiff) {
            indexdiff += iexclIndex;
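            // Exclusion flags are a packed bit array indexed by the shifted
            // atom-index difference: >> 5 selects the 32-bit word, & 31 the
            // bit. Words inside the constant-memory window come from
            // constExclusions; the rest are read from overflowExclusions in
            // global memory.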
            int indexword = ((unsigned int) indexdiff) >> 5;
            if (indexword < MAX_CONST_EXCLUSIONS) {
              indexword = constExclusions[indexword];
            } else {
              indexword = overflowExclusions[indexword];
            }
            excluded = ((indexword & (1 << (indexdiff & 31))) != 0);
          }
          if (excluded) nexcluded += 2;
          if (!excluded) excl |= 0x80000000;
          if (!excluded && r2 < cutoff2) {
            calcForceEnergy<doEnergy, doSlow>(
              r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
              vdwtypei, s_vdwtypej[iwarp][j],
              vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
              iforce, iforceSlow,
              s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
              energyVdw, energyElec, energySlow);
          }

        xyzq_j = s_xyzq[iwarp][(wid+t) & (WARPSIZE-1)];
        float dx = xyzq_j.x - xyzq_i.x;
        float dy = xyzq_j.y - xyzq_i.y;
        float dz = xyzq_j.z - xyzq_i.z;
        float r2 = dx*dx + dy*dy + dz*dz;
        calcForceEnergy<doEnergy, doSlow>(
          r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
          vdwtypei, s_vdwtypej[iwarp][(wid+t) & (WARPSIZE-1)],
          vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
          iforce, iforceSlow,
          s_jforce[iwarp][(wid+t) & (WARPSIZE-1)],
          s_jforceSlow[iwarp][(wid+t) & (WARPSIZE-1)],
          energyVdw, energyElec, energySlow);

      storeForces<doSlow>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
        devForce_x, devForce_y, devForce_z,
        devForceSlow_x, devForceSlow_y, devForceSlow_z);
      if (wid == 0) jtiles[jtile] = anyexcl;
      tileExcls[jtile].excl[wid] = excl;
      itileListLen += anyexcl;
    }

    storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow,
      devForce_x, devForce_y, devForce_z,
      devForceSlow_x, devForceSlow_y, devForceSlow_z);
    const int wid = threadIdx.x % WARPSIZE;
    if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
    if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
    typedef cub::WarpReduce<int> WarpReduceInt;
    int warpId = threadIdx.x / WARPSIZE;
    volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
    if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);

    const int wid = threadIdx.x % WARPSIZE;
    typedef cub::WarpReduce<float> WarpReduce;
    int warpId = threadIdx.x / WARPSIZE;
    volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
    volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
    volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
      virialEnergy[itileList].forcex = iforcexSum;
      virialEnergy[itileList].forcey = iforceySum;
      virialEnergy[itileList].forcez = iforcezSum;
      iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
      iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
      iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
      virialEnergy[itileList].forceSlowx = iforcexSum;
      virialEnergy[itileList].forceSlowy = iforceySum;
      virialEnergy[itileList].forceSlowz = iforcezSum;
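      // Per-lane energies are combined with a shuffle-XOR butterfly: each
      // pass folds lane i with lane i^stride, halving the stride from 16
      // down to 1.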
      for (int i = 16; i >= 1; i /= 2) {
      virialEnergy[itileList].energyVdw = energyVdw;
      virialEnergy[itileList].energyElec = energyElec;
      if (doSlow) virialEnergy[itileList].energySlow = energySlow;
    int patchDone[2] = {false, false};
    const int wid = threadIdx.x % WARPSIZE;
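    // patchNumCount[p] counts warps that have finished work for patch p;
    // atomicInc wraps the counter at patchNumList-1, so the warp that sees
    // old+1 == patchNumList is the last one and takes ownership of flushing
    // that patch's forces to the host.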
    int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
    patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
    if (patchInd.x != patchInd.y) {
      int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
      patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
    }

      for (int i = start+wid; i < end; i += WARPSIZE) {
        mapForces[i] = make_float4(devForce_x[i], devForce_y[i], devForce_z[i], devForce_w[i]);
        mapForcesSlow[i] = make_float4(devForceSlow_x[i], devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
      }

      for (int i = start+wid; i < end; i += WARPSIZE) {
        mapForces[i] = make_float4(devForce_x[i], devForce_y[i], devForce_z[i], devForce_w[i]);
        mapForcesSlow[i] = make_float4(devForceSlow_x[i], devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
      }

    if (patchDone[0] || patchDone[1]) {
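      // System-scope fence: the mapped host-memory force writes above must
      // be visible to the CPU before this patch index is published in the
      // ready queue below.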
      __threadfence_system();
      if (patchDone[0]) {
        int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
        mapPatchReadyQueue[ind] = patchInd.x;
      }
      if (patchDone[1]) {
        int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
        mapPatchReadyQueue[ind] = patchInd.y;
      }

  if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
    int index = atomicAdd(&tileListStat->outputOrderIndex, 1);

#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32

__global__ void reduceNonbondedVirialKernel(const bool doSlow,
  const int atomStorageSize,
  const float4* __restrict__ xyzq,
  const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
  VirialEnergy* __restrict__ virialEnergy) {

  for (int ibase = blockIdx.x*blockDim.x; ibase < atomStorageSize; ibase += blockDim.x*gridDim.x)
  {
    int i = ibase + threadIdx.x;
    float4 force, forceSlow;
    float4 pos;
    if (i < atomStorageSize) {
      pos = xyzq[i];
      force = devForces[i];
      if (doSlow) forceSlow = devForcesSlow[i];
    }
    float vxxt = force.x*pos.x;
    float vxyt = force.x*pos.y;
    float vxzt = force.x*pos.z;
    float vyxt = force.y*pos.x;
    float vyyt = force.y*pos.y;
    float vyzt = force.y*pos.z;
    float vzxt = force.z*pos.x;
    float vzyt = force.z*pos.y;
    float vzzt = force.z*pos.z;
    typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage tempStorage;
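    // tempStorage is reused for all nine component reductions, so a full
    // block synchronization (BLOCK_SYNC) is required between successive
    // BlockReduce::Sum calls.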
    volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
    volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
    volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
    volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
    volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
    volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
    volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
    volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
    volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
    if (threadIdx.x == 0) {
      atomicAdd(&virialEnergy->virial[0], (double)vxx);
      atomicAdd(&virialEnergy->virial[1], (double)vxy);
      atomicAdd(&virialEnergy->virial[2], (double)vxz);
      atomicAdd(&virialEnergy->virial[3], (double)vyx);
      atomicAdd(&virialEnergy->virial[4], (double)vyy);
      atomicAdd(&virialEnergy->virial[5], (double)vyz);
      atomicAdd(&virialEnergy->virial[6], (double)vzx);
      atomicAdd(&virialEnergy->virial[7], (double)vzy);
      atomicAdd(&virialEnergy->virial[8], (double)vzz);
    }

      float vxxSlowt = forceSlow.x*pos.x;
      float vxySlowt = forceSlow.x*pos.y;
      float vxzSlowt = forceSlow.x*pos.z;
      float vyxSlowt = forceSlow.y*pos.x;
      float vyySlowt = forceSlow.y*pos.y;
      float vyzSlowt = forceSlow.y*pos.z;
      float vzxSlowt = forceSlow.z*pos.x;
      float vzySlowt = forceSlow.z*pos.y;
      float vzzSlowt = forceSlow.z*pos.z;
      volatile float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
      volatile float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
      volatile float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
      volatile float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
      volatile float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
      volatile float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
      volatile float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
      volatile float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
      volatile float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
      if (threadIdx.x == 0) {
        atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
        atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
        atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
        atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
        atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
        atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
        atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
        atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
        atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
      }
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32

__global__ void reduceVirialEnergyKernel(const bool doEnergy,
  const bool doVirial,
  const bool doSlow,
  const int numTileLists,
  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
  VirialEnergy* __restrict__ virialEnergy) {

  for (int ibase = blockIdx.x*blockDim.x; ibase < numTileLists; ibase += blockDim.x*gridDim.x)
  {
    int itileList = ibase + threadIdx.x;
    if (itileList < numTileLists) {
      typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
      __shared__ typename BlockReduce::TempStorage tempStorage;
      volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
      volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
      volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
      volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
      volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
      volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
      volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
      volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
      volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
      if (threadIdx.x == 0) {
        atomicAdd(&virialEnergy->virial[0], (double)vxx);
        atomicAdd(&virialEnergy->virial[1], (double)vxy);
        atomicAdd(&virialEnergy->virial[2], (double)vxz);
        atomicAdd(&virialEnergy->virial[3], (double)vyx);
        atomicAdd(&virialEnergy->virial[4], (double)vyy);
        atomicAdd(&virialEnergy->virial[5], (double)vyz);
        atomicAdd(&virialEnergy->virial[6], (double)vzx);
        atomicAdd(&virialEnergy->virial[7], (double)vzy);
        atomicAdd(&virialEnergy->virial[8], (double)vzz);
      }

      typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
      __shared__ typename BlockReduce::TempStorage tempStorage;
      volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
      volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
      volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
      volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
      volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
      volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
      volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
      volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
      volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
      if (threadIdx.x == 0) {
        atomicAdd(&virialEnergy->virialSlow[0], (double)vxx);
        atomicAdd(&virialEnergy->virialSlow[1], (double)vxy);
        atomicAdd(&virialEnergy->virialSlow[2], (double)vxz);
        atomicAdd(&virialEnergy->virialSlow[3], (double)vyx);
        atomicAdd(&virialEnergy->virialSlow[4], (double)vyy);
        atomicAdd(&virialEnergy->virialSlow[5], (double)vyz);
        atomicAdd(&virialEnergy->virialSlow[6], (double)vzx);
        atomicAdd(&virialEnergy->virialSlow[7], (double)vzy);
        atomicAdd(&virialEnergy->virialSlow[8], (double)vzz);
      }

      typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
      __shared__ typename BlockReduce::TempStorage tempStorage;
      if (threadIdx.x == 0) {
        atomicAdd(&virialEnergy->energyVdw, (double)energyVdw);
        atomicAdd(&virialEnergy->energyElec, (double)energyElec);
      }
      if (threadIdx.x == 0) atomicAdd(&virialEnergy->energySlow, (double)energySlow);
#define REDUCEGBISENERGYKERNEL_NUM_WARP 32

__global__ void reduceGBISEnergyKernel(const int numTileLists,
  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
  VirialEnergy* __restrict__ virialEnergy) {

  for (int ibase = blockIdx.x*blockDim.x; ibase < numTileLists; ibase += blockDim.x*gridDim.x)
  {
    int itileList = ibase + threadIdx.x;
    double energyGBISt = 0.0;
    if (itileList < numTileLists) {
      energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
    }

    typedef cub::BlockReduce<double, REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage tempStorage;
    volatile double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
    if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
  }
}
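//
// Host-side interface. A typical per-step sequence (a sketch; the actual
// call sites live in the CudaComputeNonbonded host code) would be:
//
//   kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes,
//       h_exclIndexMaxDiff, h_atomIndex, stream);
//   kernel.nonbondedForce(tlKernel, atomStorageSize, doPairlist,
//       doEnergy, doVirial, doSlow, lata, latb, latc, h_xyzq, cutoff2,
//       d_forces, d_forcesSlow, h_forces, h_forcesSlow, stream);
//   if (doEnergy || doVirial)
//     kernel.reduceVirialEnergy(tlKernel, atomStorageSize,
//         doEnergy, doVirial, doSlow, doGBIS,
//         d_forces, d_forcesSlow, d_virialEnergy, stream);
//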
CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID,
  CudaNonbondedTables& cudaNonbondedTables,
  bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {

  overflowExclusions = NULL;
  overflowExclusionsSize = 0;

  exclIndexMaxDiff = NULL;
  exclIndexMaxDiffSize = 0;

  patchNumCount = NULL;
  patchNumCountSize = 0;

  patchReadyQueue = NULL;
  patchReadyQueueSize = 0;

  force_x = force_y = force_z = force_w = NULL;
  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
}
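
// SOA force buffers are grown with a 1.4x over-allocation factor so that
// small step-to-step increases in atomStorageSize rarely trigger a fresh
// allocation.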
void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
{
  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
}

CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
  if (force_x != NULL) deallocate_device<float>(&force_x);
  if (force_y != NULL) deallocate_device<float>(&force_y);
  if (force_z != NULL) deallocate_device<float>(&force_z);
  if (force_w != NULL) deallocate_device<float>(&force_w);
  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
}
void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize,
  const int* h_vdwTypes, const int2* h_exclIndexMaxDiff,
  const int* h_atomIndex, cudaStream_t stream) {

int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
  if (!doStreaming) {
    NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
  }
  return patchReadyQueue;
}
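
// The nonbonded kernel accumulates forces into the SOA float arrays
// (force_x/y/z/w, forceSlow_*) so that per-component atomicAdds can be used;
// transposeForcesKernel then repacks them into the AOS float4 buffers the
// rest of the code consumes.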
template <int doSlow>
__global__ void transposeForcesKernel(float4* f, float4* fSlow,
  float* fx, float* fy, float* fz, float* fw,
  float* fSlowx, float* fSlowy, float* fSlowz, float* fSloww,
  int n)
{
  int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if (tid < n) {
    f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
    if (doSlow) {
      fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
    }
  }
}

void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
  const int atomStorageSize, const bool doPairlist,
  const bool doEnergy, const bool doVirial, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* h_xyzq, const float cutoff2,
  float4* d_forces, float4* d_forcesSlow,
  float4* h_forces, float4* h_forcesSlow,
  cudaStream_t stream) {
  float4* m_forces = NULL;
  float4* m_forcesSlow = NULL;
  int* m_patchReadyQueue = NULL;
  unsigned int* patchNumCountPtr = NULL;
  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
  patchNumCountPtr = patchNumCount;
  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
  for (int i = 0; i < numPatches; i++) patchReadyQueue[i] = -1;
  cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
  cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
  cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));

  if (doVirial || doEnergy) {
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
  <<< nblock, nthread, shMemSize, stream >>> \
  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
  cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
  atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
  tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
  force_x, force_y, force_z, force_w, \
  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
  outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
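// Every combination of the five boolean template flags maps to its own fully
// specialized kernel; exactly one CALL(...) branch below fires per launch,
// and `called` records that a specialization was actually dispatched.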
  bool called = false;

  if (doStreaming) {
    if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
    if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
    if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
    if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
    if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
    if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
    if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
    if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);

    if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 1);
    if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 1);
    if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 1);
    if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 1);
    if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 1);
    if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 1);
    if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 1);
    if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 1);
  } else {
    if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
    if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
    if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
    if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
    if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
    if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
    if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
    if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);

    if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 0);
    if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 0);
    if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 0);
    if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 0);
    if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 0);
    if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 0);
    if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 0);
    if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 0);
  }
  if (!called) {
    NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
  }

  int grid = (atomStorageSize + block - 1)/block;
  if (doSlow) {
    transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
      force_x, force_y, force_z, force_w,
      forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
      atomStorageSize);
  } else {
    transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
      force_x, force_y, force_z, force_w,
      forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
      atomStorageSize);
  }

  start += nblock*nwarp;
void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
  const int atomStorageSize, const bool doEnergy, const bool doVirial,
  const bool doSlow, const bool doGBIS,
  float4* d_forces, float4* d_forcesSlow,
  VirialEnergy* d_virialEnergy, cudaStream_t stream) {

  if (doEnergy || doVirial) {
    clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
  }
    reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
      (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);

  if (doVirial || doEnergy)
    reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
      (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(),
       tlKernel.getTileListVirialEnergy(), d_virialEnergy);

  if (doGBIS && doEnergy)
    reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
      (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
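
// Exclusion bits are consulted through two paths: words below
// MAX_CONST_EXCLUSIONS come from the constExclusions constant-memory array,
// and the full bit vector is kept in overflowExclusions in global memory for
// words beyond that window.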
void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
}