2 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <namd_cub/cub.cuh>
14 #define __thread __declspec(thread)
18 #define LARGE_FLOAT (float)(1.0e10)
20 #define GBISKERNEL_NUM_WARP 4
22 __device__ __forceinline__
44 __device__ __forceinline__
void loadI(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
48 __device__ __forceinline__
void loadJ(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
53 __device__ __forceinline__
void initQj(
const float q) {}
62 __device__ __forceinline__
void init() {psiSum = 0.0f;}
69 template <
bool doEnergy,
bool doSlow>
70 __device__ __forceinline__
void calcGBISPhase(
const float r2,
const float dx,
const float dy,
const float dz,
74 if (r2 < cutoff2 && r2 > 0.01f) {
75 float r_i = rsqrtf(r2);
90 atomicAdd(&out[i], res.
psiSum);
108 __device__ __forceinline__
void loadI(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
111 __device__ __forceinline__
void loadJ(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
117 __device__ __forceinline__
void initQj(
const float q) {
129 __device__ __forceinline__
void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
138 template <
bool doEnergy,
bool doSlow>
139 __device__ __forceinline__
void calcGBISPhase(
const float r2,
const float dx,
const float dy,
const float dz,
143 if (r2 < cutoff2 && r2 > 0.01f) {
144 float r_i = rsqrtf(r2);
149 float qiqj = inp.
qi*inp.
qj;
151 float aiaj4 = 4*aiaj;
152 float expr2aiaj4 = exp(-r2/aiaj4);
153 float fij = sqrt(r2 + aiaj*expr2aiaj4);
155 float expkappa = exp(-param.
kappa*fij);
157 float gbEij = qiqj*Dij*f_i;
160 float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
161 float ddrf_i = -ddrfij*f_i*f_i;
163 float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
167 float ddrScale = 0.f;
170 scale = r2 * param.
r_cut_2 - 1.f;
173 if (doEnergy) energyT += gbEij * scale;
174 forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
176 if (doEnergy) energyT += gbEij;
177 forcedEdr = ddrGbEij;
182 float dEdai = 0.5f*qiqj*f_i*f_i
184 *(aiaj+0.25f*r2)*expr2aiaj4/inp.
bornRadI*scale;
186 float dEdaj = 0.5f*qiqj*f_i*f_i
188 *(aiaj+0.25f*r2)*expr2aiaj4/inp.
bornRadJ*scale;
193 float tmpx = dx*forcedEdr;
194 float tmpy = dy*forcedEdr;
195 float tmpz = dz*forcedEdr;
196 resI.
force.x += tmpx;
197 resI.
force.y += tmpy;
198 resI.
force.z += tmpz;
199 resJ.
force.x -= tmpx;
200 resJ.
force.y -= tmpy;
201 resJ.
force.z -= tmpz;
208 float expkappa = exp(-param.
kappa*fij);
210 float gbEij = inp.
qi*(inp.
qi / (-param.
scaling) )*Dij/fij;
211 energyT += 0.5f*gbEij;
217 atomicAdd(&out[i], res.
dEdaSum);
218 atomicAdd(&forces[i].
x, res.
force.x);
219 atomicAdd(&forces[i].
y, res.
force.y);
220 atomicAdd(&forces[i].
z, res.
force.z);
236 __device__ __forceinline__
void loadI(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
239 dHdrPrefixI = inp3[i];
241 __device__ __forceinline__
void loadJ(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
244 dHdrPrefixJ = inp3[i];
247 __device__ __forceinline__
void initQj(
const float q) {}
257 __device__ __forceinline__
void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
265 template <
bool doEnergy,
bool doSlow>
266 __device__ __forceinline__
void calcGBISPhase(
const float r2,
const float dx,
const float dy,
const float dz,
270 if (r2 < cutoff2 && r2 > 0.01f) {
271 float r_i = rsqrtf(r2);
279 float tmpx = dx * forceAlpha;
280 float tmpy = dy * forceAlpha;
281 float tmpz = dz * forceAlpha;
282 resI.
force.x += tmpx;
283 resI.
force.y += tmpy;
284 resI.
force.z += tmpz;
285 resJ.
force.x -= tmpx;
286 resJ.
force.y -= tmpy;
287 resJ.
force.z -= tmpz;
293 atomicAdd(&forces[i].
x, res.
force.x);
294 atomicAdd(&forces[i].
y, res.
force.y);
295 atomicAdd(&forces[i].
z, res.
force.z);
303 template <
bool doEnergy,
bool doSlow,
int phase>
310 const float3
lata, const float3
latb, const float3
latc,
311 const float4* __restrict__
xyzq,
314 const
float* __restrict__ inp1,
315 const
float* __restrict__ inp2,
316 const
float* __restrict__ inp3,
317 float* __restrict__ out,
318 float4* __restrict__
forces,
337 const int wid = threadIdx.x %
WARPSIZE;
340 float4 xyzq_i = xyzq[iatomStart + wid];
346 inp.loadI(iatomStart + wid, inp1, inp2, inp3);
347 if (phase == 2) inp.initQi(param, xyzq_i.w);
350 int nloopi = min(iatomSize - iatomStart,
WARPSIZE);
355 if (doEnergy) energyT = 0.0f;
357 for (
int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
360 int jatomStart = tileJatomStart[jtile];
363 float4 xyzq_j = xyzq[jatomStart + wid];
365 inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
366 if (phase == 2) inp.initQj(xyzq_j.w);
369 int nloopj = min(jatomSize - jatomStart,
WARPSIZE);
371 const bool diag_tile = (iatomStart == jatomStart);
373 int t = (phase != 2 && diag_tile) ? 1 : 0;
374 if (phase != 2 && diag_tile) {
382 int j = (t + wid) & modval;
388 float r2 = dx*dx + dy*dy + dz*dz;
390 if (j < nloopj && wid < nloopi) {
391 calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz,
cutoff2, param, inp, resI, resJ, energyT);
408 typedef cub::WarpReduce<double> WarpReduceDouble;
410 int warpId = threadIdx.x /
WARPSIZE;
411 volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((
double)energyT);
412 if (wid == 0) virialEnergy[
itileList].energyGBIS = energyTot;
455 if (intRad0 != NULL) deallocate_device<float>(&intRad0);
456 if (intRadS != NULL) deallocate_device<float>(&intRadS);
457 if (psiSum != NULL) deallocate_device<float>(&psiSum);
458 if (bornRad != NULL) deallocate_device<float>(&bornRad);
459 if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
460 if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
469 reallocate_device<float>(&intRad0, &intRad0Size,
atomStorageSize, 1.2f);
470 reallocate_device<float>(&intRadS, &intRadSSize,
atomStorageSize, 1.2f);
480 reallocate_device<float>(&bornRad, &bornRadSize,
atomStorageSize, 1.2f);
488 reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize,
atomStorageSize, 1.2f);
496 const float3
lata,
const float3
latb,
const float3
latc,
const float a_cut,
float* h_psiSum,
511 GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>>
514 param, intRad0, intRadS, NULL, psiSum, NULL, NULL);
525 const bool doEnergy,
const bool doSlow,
526 const float3
lata,
const float3
latb,
const float3
latc,
527 const float r_cut,
const float scaling,
const float kappa,
const float smoothDist,
528 const float epsilon_p,
const float epsilon_s,
529 float4* d_forces,
float* h_dEdaSum, cudaStream_t
stream) {
531 reallocate_device<float>(&dEdaSum, &dEdaSumSize,
atomStorageSize, 1.2f);
544 param.
r_cut2 = r_cut*r_cut;
553 #define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \
554 <<< nblock, nthread, 0, stream >>> \
555 (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
556 tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
557 param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
559 if (!doEnergy && !doSlow)
CALL(
false,
false);
560 if (!doEnergy && doSlow)
CALL(
false,
true);
561 if ( doEnergy && !doSlow)
CALL(
true,
false);
562 if ( doEnergy && doSlow)
CALL(
true,
true);
573 const float3
lata,
const float3
latb,
const float3
latc,
const float a_cut,
574 float4* d_forces, cudaStream_t
stream) {
585 GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>>
588 param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);
__global__ void GBIS_Kernel(const int numTileLists, const TileList *__restrict__ tileLists, const int *__restrict__ tileJatomStart, const PatchPairRecord *__restrict__ patchPairs, const float3 lata, const float3 latb, const float3 latc, const float4 *__restrict__ xyzq, const float cutoff2, const GBISParam< phase > param, const float *__restrict__ inp1, const float *__restrict__ inp2, const float *__restrict__ inp3, float *__restrict__ out, float4 *__restrict__ forces, TileListVirialEnergy *__restrict__ virialEnergy)
void updateIntRad(const int atomStorageSize, float *intRad0H, float *intRadSH, cudaStream_t stream)
void GBISphase2(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float r_cut, const float scaling, const float kappa, const float smoothDist, const float epsilon_p, const float epsilon_s, float4 *d_forces, float *h_dEdaSum, cudaStream_t stream)
void GBISphase3(CudaTileListKernel &tlKernel, const int atomStorageSize, const float3 lata, const float3 latb, const float3 latc, const float a_cut, float4 *d_forces, cudaStream_t stream)
PatchPairRecord * getPatchPairs()
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 1 > param, const GBISInput< 1 > inp, GBISResults< 1 > &resI, GBISResults< 1 > &resJ, float &energy)
static __thread float4 * forces
static __thread float * bornRadH
void setTileListVirialEnergyGBISLength(int len)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
static __thread float * dHdrPrefixH
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__device__ __forceinline__ void shuffleNext()
if(ComputeNonbondedUtil::goMethod==2)
__device__ __forceinline__ void shuffleNext()
void updateBornRad(const int atomStorageSize, float *bornRadH, cudaStream_t stream)
__device__ __forceinline__ void shuffleNext(float &w)
__device__ __forceinline__ void init()
__device__ __forceinline__ void writeResult(const int i, const GBISResults< 1 > res, float *out, float4 *forces)
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ tileLists
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int atomStorageSize
TileList * getTileListsGBIS()
static __thread float * intRadSH
static void CalcDH(float r, float r2, float ri, float rc, float r0, float rs, float &dh, int &d)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
__device__ __forceinline__ void init()
int getNumTileListsGBIS()
int * getTileJatomStartGBIS()
__global__ void const int numTileLists
__device__ __forceinline__ void init()
CudaComputeGBISKernel(int deviceID)
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
__thread DeviceCUDA * deviceCUDA
static void CalcH(float r, float r2, float ri, float rc, float r0, float rs, float &h, int &d)
void update_dHdrPrefix(const int atomStorageSize, float *dHdrPrefixH, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ patchPairs
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__device__ __forceinline__ void shuffleNext()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
#define CALL(DOENERGY, DOVIRIAL)
#define GBISKERNEL_NUM_WARP
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start
void GBISphase1(CudaTileListKernel &tlKernel, const int atomStorageSize, const float3 lata, const float3 latb, const float3 latc, const float a_cut, float *h_psiSum, cudaStream_t stream)
static __thread float * intRad0H
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)