NAMD
Classes | Macros | Functions | Variables
CudaComputeGBISKernel.cu File Reference
#include <cuda.h>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#include "ComputeGBIS.inl"
#include "DeviceCUDA.h"

Go to the source code of this file.

Classes

struct  GBISParam< phase >
 
struct  GBISInput< phase >
 
struct  GBISResults< phase >
 
struct  GBISParam< 1 >
 
struct  GBISInput< 1 >
 
struct  GBISResults< 1 >
 
struct  GBISParam< 2 >
 
struct  GBISInput< 2 >
 
struct  GBISResults< 2 >
 
struct  GBISParam< 3 >
 
struct  GBISInput< 3 >
 
struct  GBISResults< 3 >
 

Macros

#define GBIS_CUDA
 
#define LARGE_FLOAT   (float)(1.0e10)
 
#define GBISKERNEL_NUM_WARP   4
 
#define CALL(DOENERGY, DOSLOW)
 

Functions

__device__ __forceinline__ void shuffleNext (float &w)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 1 > param, const GBISInput< 1 > inp, GBISResults< 1 > &resI, GBISResults< 1 > &resJ, float &energy)
 
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 1 > res, float *out, float4 *forces)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 2 > param, const GBISInput< 2 > inp, GBISResults< 2 > &resI, GBISResults< 2 > &resJ, float &energyT)
 
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 2 > res, float *out, float4 *forces)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 3 > param, const GBISInput< 3 > inp, GBISResults< 3 > &resI, GBISResults< 3 > &resJ, float &energy)
 
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 3 > res, float *out, float4 *forces)
 
template<bool doEnergy, bool doSlow, int phase>
__global__ void GBIS_Kernel (const int numTileLists, const TileList *__restrict__ tileLists, const int *__restrict__ tileJatomStart, const PatchPairRecord *__restrict__ patchPairs, const float3 lata, const float3 latb, const float3 latc, const float4 *__restrict__ xyzq, const float cutoff2, const GBISParam< phase > param, const float *__restrict__ inp1, const float *__restrict__ inp2, const float *__restrict__ inp3, float *__restrict__ out, float4 *__restrict__ forces, TileListVirialEnergy *__restrict__ virialEnergy)
 

Variables

__thread DeviceCUDA* deviceCUDA
 

Macro Definition Documentation

#define CALL (   DOENERGY,
  DOSLOW 
)
Value:
GBIS_Kernel<DOENERGY, DOSLOW, 2> \
<<< nblock, nthread, 0, stream >>> \
(tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
#define GBIS_CUDA

Definition at line 10 of file CudaComputeGBISKernel.cu.

#define GBISKERNEL_NUM_WARP   4
#define LARGE_FLOAT   (float)(1.0e10)

Definition at line 18 of file CudaComputeGBISKernel.cu.

Function Documentation

template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx,
    const float dy, const float dz, const float cutoff2,
    const GBISParam<1> param, const GBISInput<1> inp,
    GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {
  // GBIS phase 1 (definition at line 70 of CudaComputeGBISKernel.cu):
  // accumulate the pairwise screening terms (psiSum) later used to derive
  // Born radii. No forces or energy are produced in this phase, so dx/dy/dz
  // and energy are unused. The r2 > 0.01f guard skips the self-interaction
  // (zero-distance) lane.
  if (r2 < cutoff2 && r2 > 0.01f) {
    const float rInv = rsqrtf(r2);
    const float r = r2 * rInv;

    // i <- j screening contribution.
    // NOTE(review): in phase 1 inp.qi/inp.qj appear to carry intrinsic radii
    // (cf. the intRad* fields used below) rather than charges — confirm
    // against GBISInput<1>::loadI/loadJ.
    float hIJ;
    int dIJ;
    CalcH(r, r2, rInv, param.a_cut, inp.qi, inp.qj, hIJ, dIJ);
    resI.psiSum += hIJ;

    // j <- i screening contribution, with the radius roles swapped.
    float hJI;
    int dJI;
    CalcH(r, r2, rInv, param.a_cut, inp.intRad0j, inp.intRadSi, hJI, dJI);
    resJ.psiSum += hJI;
  }
}
static void CalcH(float r, float r2, float ri, float rc, float r0, float rs, float &h, int &d)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 2 >  param,
const GBISInput< 2 >  inp,
GBISResults< 2 > &  resI,
GBISResults< 2 > &  resJ,
float &  energyT 
)

Definition at line 139 of file CudaComputeGBISKernel.cu.

References GBISInput< 2 >::bornRadI, GBISInput< 2 >::bornRadJ, GBISResults< 2 >::dEdaSum, GBISParam< 2 >::epsilon_p_i, GBISParam< 2 >::epsilon_s_i, GBISResults< 2 >::force, if(), GBISParam< 2 >::kappa, GBISInput< 2 >::qi, GBISInput< 2 >::qj, GBISParam< 2 >::r_cut2, GBISParam< 2 >::r_cut_2, GBISParam< 2 >::r_cut_4, GBISParam< 2 >::scaling, and GBISParam< 2 >::smoothDist.

// GBIS phase 2: for each atom pair within the cutoff, compute the
// generalized-Born pair energy, its radial force, and (when doSlow) the
// dE/da sums consumed by phase 3. Also adds the GB self-energy term for
// the zero-distance (same-atom) lane when doEnergy is set.
// NOTE(review): exp/sqrt below resolve to CUDA float overloads here;
// expf/sqrtf would make the single-precision intent explicit.
141  {
142 
143  if (r2 < cutoff2 && r2 > 0.01f) {
144  float r_i = rsqrtf(r2);
145  float r = r2 * r_i;
146  //float bornRadJ = sh_jBornRad[j];
147 
148  //calculate GB energy
// f_ij = sqrt(r2 + ai*aj*exp(-r2/(4*ai*aj))) interpolates between the
// plain distance (r >> Born radii) and the Born radii (r -> 0).
149  float qiqj = inp.qi*inp.qj;
150  float aiaj = inp.bornRadI*inp.bornRadJ;
151  float aiaj4 = 4*aiaj;
152  float expr2aiaj4 = exp(-r2/aiaj4);
153  float fij = sqrt(r2 + aiaj*expr2aiaj4);
154  float f_i = 1/fij;
// kappa term applies exponential salt screening to the solvent dielectric.
155  float expkappa = exp(-param.kappa*fij);
156  float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
// Pair energy: E_ij = qi*qj * D_ij / f_ij.
157  float gbEij = qiqj*Dij*f_i;
158 
159  //calculate energy derivatives
// d(f_ij)/dr and chain-rule pieces for dE/dr.
160  float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
161  float ddrf_i = -ddrfij*f_i*f_i;
162  float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
163  float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
164 
165  //NAMD smoothing function
// Optional C1 switching: scale = (r2/rc^2 - 1)^2 tapers energy/force to
// zero at the cutoff; ddrScale is its radial derivative.
166  float scale = 1.f;
167  float ddrScale = 0.f;
168  float forcedEdr;
169  if (param.smoothDist > 0.f) {
170  scale = r2 * param.r_cut_2 - 1.f;
171  scale *= scale;
172  ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
173  if (doEnergy) energyT += gbEij * scale;
// Product rule: d(E*scale)/dr = E'*scale + E*scale'.
174  forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
175  } else {
176  if (doEnergy) energyT += gbEij;
177  forcedEdr = ddrGbEij;
178  }
179 
180  //add dEda
// dE/d(BornRadius) partials, accumulated for the phase-3 force pass.
// The two terms differ only in the 1/bornRadI vs 1/bornRadJ factor.
181  if (doSlow) {
182  float dEdai = 0.5f*qiqj*f_i*f_i
183  *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
184  *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
185  resI.dEdaSum += dEdai;
186  float dEdaj = 0.5f*qiqj*f_i*f_i
187  *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
188  *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
189  resJ.dEdaSum += dEdaj;
190  }
191 
// Convert scalar dE/dr into a Cartesian force pair (Newton's third law:
// equal and opposite on i and j).
192  forcedEdr *= r_i;
193  float tmpx = dx*forcedEdr;
194  float tmpy = dy*forcedEdr;
195  float tmpz = dz*forcedEdr;
196  resI.force.x += tmpx;
197  resI.force.y += tmpy;
198  resI.force.z += tmpz;
199  resJ.force.x -= tmpx;
200  resJ.force.y -= tmpy;
201  resJ.force.z -= tmpz;
202  }
203 
204  // GB Self Energy
// Same-atom lane (r2 < 0.01f): self term uses the Born radius as f_ij.
// The qi/(-scaling) factor recovers qi^2 from the pre-scaled charge —
// NOTE(review): depends on how initQi folds param.scaling into qi; confirm.
205  if (doEnergy) {
206  if (r2 < 0.01f) {
207  float fij = inp.bornRadI;//inf
208  float expkappa = exp(-param.kappa*fij);//0
209  float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
210  float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
211  energyT += 0.5f*gbEij;
212  } //same atom or within cutoff
213  }
214 }
if(ComputeNonbondedUtil::goMethod==2)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx,
    const float dy, const float dz, const float cutoff2,
    const GBISParam<3> param, const GBISInput<3> inp,
    GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {
  // GBIS phase 3 (definition at line 266 of CudaComputeGBISKernel.cu):
  // convert the dE/da prefixes produced by phase 2 into Cartesian forces
  // via the dH/dr kernels. energy is unused here; doEnergy/doSlow do not
  // alter the computation in this phase.
  if (r2 < cutoff2 && r2 > 0.01f) {
    const float rInv = rsqrtf(r2);
    const float r = r2 * rInv;

    // dH/dr for the i->j and j->i radius pairings (integer outputs unused).
    float dhIJ, dhJI;
    int dIJ, dJI;
    CalcDH(r, r2, rInv, param.a_cut, inp.qi, inp.intRadSJ, dhIJ, dIJ);
    CalcDH(r, r2, rInv, param.a_cut, inp.intRadJ0, inp.intRadIS, dhJI, dJI);

    // Scalar force magnitude along r, weighted by each atom's dE/da prefix,
    // then resolved into components; equal and opposite on i and j.
    const float forceAlpha = rInv * (inp.dHdrPrefixI * dhIJ + inp.dHdrPrefixJ * dhJI);
    const float fx = dx * forceAlpha;
    const float fy = dy * forceAlpha;
    const float fz = dz * forceAlpha;
    resI.force.x += fx;
    resI.force.y += fy;
    resI.force.z += fz;
    resJ.force.x -= fx;
    resJ.force.y -= fy;
    resJ.force.z -= fz;
  }
}
static void CalcDH(float r, float r2, float ri, float rc, float r0, float rs, float &dh, int &d)
template<bool doEnergy, bool doSlow, int phase>
__global__ void GBIS_Kernel ( const int  numTileLists,
const TileList *__restrict__  tileLists,
const int *__restrict__  tileJatomStart,
const PatchPairRecord *__restrict__  patchPairs,
const float3  lata,
const float3  latb,
const float3  latc,
const float4 *__restrict__  xyzq,
const float  cutoff2,
const GBISParam< phase >  param,
const float *__restrict__  inp1,
const float *__restrict__  inp2,
const float *__restrict__  inp3,
float *__restrict__  out,
float4 *__restrict__  forces,
TileListVirialEnergy *__restrict__  virialEnergy 
)

Definition at line 306 of file CudaComputeGBISKernel.cu.

References cutoff2, GBISKERNEL_NUM_WARP, PatchPairRecord::iatomSize, TileList::iatomStart, itileList, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, tempStorage, WARP_FULL_MASK, WARP_SHUFFLE, WARPSIZE, and writeResult().

// Tile-based GBIS pair kernel, templated on the GBIS phase (1/2/3).
// Launch layout: each warp processes one tile list; warps grid-stride over
// numTileLists, so any grid size is correct. Requires blockDim.x to be a
// multiple of WARPSIZE with GBISKERNEL_NUM_WARP warps per block (the
// shared tempStorage below is sized by GBISKERNEL_NUM_WARP).
// Within a tile, each lane owns one i-atom and the 32 j-atoms are rotated
// around the warp with WARP_SHUFFLE so every lane visits every j.
319  {
320 
321  // Single warp takes care of one list of tiles
322  for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;itileList < numTileLists;itileList += blockDim.x/WARPSIZE*gridDim.x)
323  {
324  TileList tmp = tileLists[itileList];
325  int iatomStart = tmp.iatomStart;
326  int jtileStart = tmp.jtileStart;
327  int jtileEnd = tmp.jtileEnd;
328  PatchPairRecord PPStmp = patchPairs[itileList];
329  int iatomSize = PPStmp.iatomSize;
330  int jatomSize = PPStmp.jatomSize;
331 
// Periodic-image shift: offsetXYZ (in lattice fractions) times the three
// lattice vectors gives the Cartesian shift applied to the i-atoms.
332  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
333  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
334  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
335 
336  // Warp index (0...warpsize-1)
337  const int wid = threadIdx.x % WARPSIZE;
338 
339  // Load i-atom data (and shift coordinates)
340  float4 xyzq_i = xyzq[iatomStart + wid];
341  xyzq_i.x += shx;
342  xyzq_i.y += shy;
343  xyzq_i.z += shz;
344 
// Phase-specific per-atom inputs; inp1..inp3 are interpreted by
// GBISInput<phase>::loadI/loadJ. Phase 2 additionally folds the charge
// (xyzq.w) into the input via initQi/initQj.
345  GBISInput<phase> inp;
346  inp.loadI(iatomStart + wid, inp1, inp2, inp3);
347  if (phase == 2) inp.initQi(param, xyzq_i.w);
348 
349  // Number of i loops
// Valid i-lanes in this (possibly partial) tile.
350  int nloopi = min(iatomSize - iatomStart, WARPSIZE);
351 
352  GBISResults<phase> resI;
353  resI.init();
354  float energyT;
355  if (doEnergy) energyT = 0.0f;
356 
357  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
358 
359  // Load j-atom starting index and exclusion mask
360  int jatomStart = tileJatomStart[jtile];
361 
362  // Load coordinates and charge
363  float4 xyzq_j = xyzq[jatomStart + wid];
364 
365  inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
366  if (phase == 2) inp.initQj(xyzq_j.w);
367 
368  // Number of j loops
369  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
370 
// Diagonal (self) tile handling: modval = 2*WARPSIZE-1 makes the lane
// rotation skip the duplicate half of the i==j tile. For phases 1 and 3
// the t=0 (self-pair) step is also skipped by pre-rotating once;
// phase 2 keeps t=0 because it computes the GB self-energy there.
371  const bool diag_tile = (iatomStart == jatomStart);
372  const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
373  int t = (phase != 2 && diag_tile) ? 1 : 0;
374  if (phase != 2 && diag_tile) {
375  inp.shuffleNext();
376  }
377 
378  GBISResults<phase> resJ;
379  resJ.init();
380 
// Rotate j-atom data around the warp: at step t, lane wid pairs its
// i-atom with j-atom (t + wid) & modval.
381  for (;t < WARPSIZE;t++) {
382  int j = (t + wid) & modval;
383 
384  float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
385  float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
386  float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
387 
388  float r2 = dx*dx + dy*dy + dz*dz;
389 
// Bounds guard for partial tiles (padding lanes contribute nothing).
390  if (j < nloopj && wid < nloopi) {
391  calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
392  }
393 
// Keep j-side inputs and partial results aligned with the rotation.
394  inp.shuffleNext();
395  resJ.shuffleNext();
396  } // t
397 
398  // Write j
// Atomic accumulation — other warps may touch the same j-atoms.
399  writeResult(jatomStart + wid, resJ, out, forces);
400 
401  } // jtile
402 
403  // Write i
404  writeResult(iatomStart + wid, resI, out, forces);
405 
406  // Reduce energy
// Warp-level sum in double via cub::WarpReduce; lane 0 publishes the
// per-tile-list GBIS energy for the later virial/energy reduction.
407  if (doEnergy) {
408  typedef cub::WarpReduce<double> WarpReduceDouble;
409  __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
410  int warpId = threadIdx.x / WARPSIZE;
411  volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
412  if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
413  }
414 
415  }
416 }
#define WARP_FULL_MASK
Definition: CudaUtils.h:11
static __thread float4 * forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__device__ __forceinline__ void writeResult(const int i, const GBISResults< 1 > res, float *out, float4 *forces)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
float3 offsetXYZ
__global__ void const int numTileLists
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
#define WARPSIZE
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
#define GBISKERNEL_NUM_WARP
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:38
__device__ __forceinline__ void shuffleNext(float& w) {
  // Rotate w one lane around the warp (definition at line 23 of
  // CudaComputeGBISKernel.cu): each lane receives the value held by lane
  // (lane + 1) mod WARPSIZE. Used to walk j-atom data past every i-lane.
  const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
  w = WARP_SHUFFLE(WARP_FULL_MASK, w, srcLane, WARPSIZE);
}
#define WARP_FULL_MASK
Definition: CudaUtils.h:11
#define WARPSIZE
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:38
__device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res,
    float* out, float4* forces) {
  // Phase-1 writeback (definition at line 89 of CudaComputeGBISKernel.cu):
  // atomically fold this lane's partial psi sum into the global accumulator
  // for atom i. Atomic because other warps may update the same atom.
  // forces is unused in phase 1.
  atomicAdd(out + i, res.psiSum);
}
__device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res,
    float* out, float4* forces) {
  // Phase-2 writeback (definition at line 216 of CudaComputeGBISKernel.cu):
  // atomically accumulate the dE/da partial sum and the three force
  // components for atom i (other warps may target the same atom).
  atomicAdd(out + i, res.dEdaSum);
  float4* const f = forces + i;
  atomicAdd(&f->x, res.force.x);
  atomicAdd(&f->y, res.force.y);
  atomicAdd(&f->z, res.force.z);
}
static __thread float4 * forces
gridSize z
gridSize y
gridSize x
__device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res,
    float* out, float4* forces) {
  // Phase-3 writeback (definition at line 292 of CudaComputeGBISKernel.cu):
  // atomically accumulate the force components for atom i; out carries no
  // scalar result in this phase and is unused.
  float4* const f = forces + i;
  atomicAdd(&f->x, res.force.x);
  atomicAdd(&f->y, res.force.y);
  atomicAdd(&f->z, res.force.z);
}
static __thread float4 * forces
gridSize z
gridSize y
gridSize x

Variable Documentation

__thread DeviceCUDA* deviceCUDA

Definition at line 18 of file DeviceCUDA.C.