CudaComputeGBISKernel.cu File Reference

#include <cuda.h>
#include <cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#include "ComputeGBIS.inl"
#include "DeviceCUDA.h"

Go to the source code of this file.

Classes

struct  GBISParam< phase >
struct  GBISInput< phase >
struct  GBISResults< phase >
struct  GBISParam< 1 >
struct  GBISInput< 1 >
struct  GBISResults< 1 >
struct  GBISParam< 2 >
struct  GBISInput< 2 >
struct  GBISResults< 2 >
struct  GBISParam< 3 >
struct  GBISInput< 3 >
struct  GBISResults< 3 >

Defines

#define GBIS_CUDA
#define LARGE_FLOAT   (float)(1.0e10)
#define GBISKERNEL_NUM_WARP   4
#define CALL(DOENERGY, DOSLOW)

Functions

__device__ __forceinline__ void shuffleNext (float &w)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 1 > param, const GBISInput< 1 > inp, GBISResults< 1 > &resI, GBISResults< 1 > &resJ, float &energy)
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 1 > res, float *out, float4 *forces)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 2 > param, const GBISInput< 2 > inp, GBISResults< 2 > &resI, GBISResults< 2 > &resJ, float &energyT)
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 2 > res, float *out, float4 *forces)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 3 > param, const GBISInput< 3 > inp, GBISResults< 3 > &resI, GBISResults< 3 > &resJ, float &energy)
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 3 > res, float *out, float4 *forces)
template<bool doEnergy, bool doSlow, int phase>
__global__ void __launch_bounds__ (32 *GBISKERNEL_NUM_WARP, 6) GBIS_Kernel(const int numTileLists
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ float
*__restrict__ float4
*__restrict__
TileListVirialEnergy
*__restrict__ virialEnergy 
for (int itileList=threadIdx.x/WARPSIZE+blockDim.x/WARPSIZE *blockIdx.x;itileList< numTileLists;itileList+=blockDim.x/WARPSIZE *gridDim.x)

Variables

__thread DeviceCUDAdeviceCUDA
__global__ void const TileList
*__restrict__ 
tileLists
__global__ void const TileList
*__restrict__ const int
*__restrict__ 
tileJatomStart
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__ 
patchPairs
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 
lata
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3 
latb
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 
latc
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ 
xyzq
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float 
cutoff2
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > 
param
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ 
inp1
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ 
inp2
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ 
inp3
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ float
*__restrict__ 
out
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ float
*__restrict__ float4
*__restrict__ 
forces

Define Documentation

#define CALL ( DOENERGY,
DOSLOW   ) 
Value:
GBIS_Kernel<DOENERGY, DOSLOW, 2> \
     <<< nblock, nthread, 0, stream >>> \
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
      tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
      param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
#define GBIS_CUDA

Definition at line 6 of file CudaComputeGBISKernel.cu.

#define GBISKERNEL_NUM_WARP   4
#define LARGE_FLOAT   (float)(1.0e10)

Definition at line 14 of file CudaComputeGBISKernel.cu.

Referenced by if().


Function Documentation

template<bool doEnergy, bool doSlow, int phase>
__global__ void __launch_bounds__ ( 32 *  GBISKERNEL_NUM_WARP,
 
) const [inline]
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 3 >  param,
const GBISInput< 3 >  inp,
GBISResults< 3 > &  resI,
GBISResults< 3 > &  resJ,
float &  energy 
) [inline]

Definition at line 262 of file CudaComputeGBISKernel.cu.

References GBISParam< 3 >::a_cut, CalcDH(), GBISInput< 3 >::dHdrPrefixI, GBISInput< 3 >::dHdrPrefixJ, GBISResults< 3 >::force, GBISInput< 3 >::intRadIS, GBISInput< 3 >::intRadJ0, GBISInput< 3 >::intRadSJ, and GBISInput< 3 >::qi.

00264                                                              {
00265 
00266   if (r2 < cutoff2 && r2 > 0.01f) {
00267     float r_i = rsqrtf(r2);
00268     float r  = r2 * r_i;
00269     float dhij, dhji;
00270     int dij, dji;
00271     CalcDH(r, r2, r_i, param.a_cut, inp.qi,       inp.intRadSJ, dhij, dij);
00272     CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);
00273 
00274     float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
00275     float tmpx = dx * forceAlpha;
00276     float tmpy = dy * forceAlpha;
00277     float tmpz = dz * forceAlpha;
00278     resI.force.x += tmpx;
00279     resI.force.y += tmpy;
00280     resI.force.z += tmpz;
00281     resJ.force.x -= tmpx;
00282     resJ.force.y -= tmpy;
00283     resJ.force.z -= tmpz;
00284   }
00285 
00286 }

template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 2 >  param,
const GBISInput< 2 >  inp,
GBISResults< 2 > &  resI,
GBISResults< 2 > &  resJ,
float &  energyT 
) [inline]

Definition at line 135 of file CudaComputeGBISKernel.cu.

References GBISInput< 2 >::bornRadI, GBISInput< 2 >::bornRadJ, GBISResults< 2 >::dEdaSum, GBISParam< 2 >::epsilon_p_i, GBISParam< 2 >::epsilon_s_i, GBISResults< 2 >::force, if(), GBISParam< 2 >::kappa, GBISInput< 2 >::qi, GBISInput< 2 >::qj, GBISParam< 2 >::r_cut2, GBISParam< 2 >::r_cut_2, GBISParam< 2 >::r_cut_4, GBISParam< 2 >::scaling, and GBISParam< 2 >::smoothDist.

00137                                                               {
00138 
00139   if (r2 < cutoff2 && r2 > 0.01f) {
00140     float r_i = rsqrtf(r2);
00141     float r  = r2 * r_i;
00142     //float bornRadJ = sh_jBornRad[j];
00143 
00144     //calculate GB energy
00145     float qiqj = inp.qi*inp.qj;
00146     float aiaj = inp.bornRadI*inp.bornRadJ;
00147     float aiaj4 = 4*aiaj;
00148     float expr2aiaj4 = exp(-r2/aiaj4);
00149     float fij = sqrt(r2 + aiaj*expr2aiaj4);
00150     float f_i = 1/fij;
00151     float expkappa = exp(-param.kappa*fij);
00152     float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
00153     float gbEij = qiqj*Dij*f_i;
00154 
00155     //calculate energy derivatives
00156     float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
00157     float ddrf_i = -ddrfij*f_i*f_i;
00158     float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
00159     float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
00160 
00161     //NAMD smoothing function
00162     float scale = 1.f;
00163     float ddrScale = 0.f;
00164     float forcedEdr;
00165     if (param.smoothDist > 0.f) {
00166       scale = r2 * param.r_cut_2 - 1.f;
00167       scale *= scale;
00168       ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
00169       if (doEnergy) energyT += gbEij * scale;
00170       forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
00171     } else {
00172       if (doEnergy) energyT += gbEij;
00173       forcedEdr = ddrGbEij;
00174     }
00175 
00176     //add dEda
00177     if (doSlow) {
00178       float dEdai = 0.5f*qiqj*f_i*f_i
00179                 *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
00180                 *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
00181       resI.dEdaSum += dEdai;
00182       float dEdaj = 0.5f*qiqj*f_i*f_i
00183                 *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
00184                 *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
00185       resJ.dEdaSum += dEdaj;
00186     }
00187 
00188     forcedEdr *= r_i;
00189     float tmpx = dx*forcedEdr;
00190     float tmpy = dy*forcedEdr;
00191     float tmpz = dz*forcedEdr;
00192     resI.force.x += tmpx;
00193     resI.force.y += tmpy;
00194     resI.force.z += tmpz;
00195     resJ.force.x -= tmpx;
00196     resJ.force.y -= tmpy;
00197     resJ.force.z -= tmpz;
00198   }
00199 
00200   // GB Self Energy
00201   if (doEnergy) {
00202     if (r2 < 0.01f) {
00203       float fij = inp.bornRadI;//inf
00204       float expkappa = exp(-param.kappa*fij);//0
00205       float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
00206       float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
00207       energyT += 0.5f*gbEij;
00208     } //same atom or within cutoff
00209   }
00210 }

template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 1 >  param,
const GBISInput< 1 >  inp,
GBISResults< 1 > &  resI,
GBISResults< 1 > &  resJ,
float &  energy 
) [inline]

Definition at line 66 of file CudaComputeGBISKernel.cu.

References GBISParam< 1 >::a_cut, CalcH(), GBISInput< 1 >::intRad0j, GBISInput< 1 >::intRadSi, GBISResults< 1 >::psiSum, GBISInput< 1 >::qi, and GBISInput< 1 >::qj.

00068                                                              {
00069 
00070   if (r2 < cutoff2 && r2 > 0.01f) {
00071     float r_i = rsqrtf(r2);
00072     float r  = r2 * r_i;
00073     float hij;
00074     int dij;
00075     CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
00076     resI.psiSum += hij;
00077     float hji;
00078     int dji;
00079     CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
00080     resJ.psiSum += hji;
00081   }
00082 
00083 }

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ float* __restrict__ float4* __restrict__ TileListVirialEnergy* __restrict__ virialEnergy for (  ) 
Type Constraints

Definition at line 318 of file CudaComputeGBISKernel.cu.

References GBISKERNEL_NUM_WARP, PatchPairRecord::iatomSize, TileList::iatomStart, itileList, j, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, tempStorage, WARP_FULL_MASK, WARP_SHUFFLE, WARPSIZE, and writeResult().

00319   {
00320     TileList tmp = tileLists[itileList];
00321     int iatomStart = tmp.iatomStart;
00322     int jtileStart = tmp.jtileStart;
00323     int jtileEnd   = tmp.jtileEnd;
00324     PatchPairRecord PPStmp = patchPairs[itileList];
00325     int iatomSize     = PPStmp.iatomSize;
00326     int jatomSize     = PPStmp.jatomSize;
00327 
00328     float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
00329     float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
00330     float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
00331 
00332     // Warp index (0...warpsize-1)
00333     const int wid = threadIdx.x % WARPSIZE;
00334 
00335     // Load i-atom data (and shift coordinates)
00336     float4 xyzq_i = xyzq[iatomStart + wid];
00337     xyzq_i.x += shx;
00338     xyzq_i.y += shy;
00339     xyzq_i.z += shz;
00340 
00341     GBISInput<phase> inp;
00342     inp.loadI(iatomStart + wid, inp1, inp2, inp3);
00343     if (phase == 2) inp.initQi(param, xyzq_i.w);
00344 
00345     // Number of i loops
00346     int nloopi = min(iatomSize - iatomStart, WARPSIZE);
00347 
00348     GBISResults<phase> resI;
00349     resI.init();
00350     float energyT;
00351     if (doEnergy) energyT = 0.0f;
00352 
00353     for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
00354 
00355       // Load j-atom starting index and exclusion mask
00356       int jatomStart = tileJatomStart[jtile];
00357 
00358       // Load coordinates and charge
00359       float4 xyzq_j  = xyzq[jatomStart + wid];
00360 
00361       inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
00362       if (phase == 2) inp.initQj(xyzq_j.w);
00363 
00364       // Number of j loops
00365       int nloopj = min(jatomSize - jatomStart, WARPSIZE);
00366 
00367       const bool diag_tile = (iatomStart == jatomStart);
00368       const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
00369       int t = (phase != 2 && diag_tile) ? 1 : 0;
00370       if (phase != 2 && diag_tile) {
00371         inp.shuffleNext();
00372       }
00373 
00374       GBISResults<phase> resJ;
00375       resJ.init();
00376 
00377       for (;t < WARPSIZE;t++) {
00378         int j = (t + wid) & modval;
00379 
00380         float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
00381         float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
00382         float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
00383 
00384         float r2 = dx*dx + dy*dy + dz*dz;
00385 
00386         if (j < nloopj && wid < nloopi) {
00387           calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
00388         }
00389 
00390         inp.shuffleNext();
00391         resJ.shuffleNext();
00392       } // t
00393 
00394       // Write j
00395       writeResult(jatomStart + wid, resJ, out, forces);
00396 
00397     } // jtile
00398 
00399     // Write i
00400     writeResult(iatomStart + wid, resI, out, forces);
00401 
00402     // Reduce energy
00403     if (doEnergy) {
00404       typedef cub::WarpReduce<double> WarpReduceDouble;
00405       __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
00406       int warpId = threadIdx.x / WARPSIZE;
00407       volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
00408       if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
00409     }
00410 
00411   }

__device__ __forceinline__ void shuffleNext ( float &  w  ) 

Definition at line 19 of file CudaComputeGBISKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

00019                            {
00020   w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00021 }

__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 3 >  res,
float *  out,
float4 *  forces 
)

Definition at line 288 of file CudaComputeGBISKernel.cu.

References GBISResults< 3 >::force.

00288                                                                                                                {
00289   atomicAdd(&forces[i].x, res.force.x);
00290   atomicAdd(&forces[i].y, res.force.y);
00291   atomicAdd(&forces[i].z, res.force.z);
00292 }

__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 2 >  res,
float *  out,
float4 *  forces 
)

Definition at line 212 of file CudaComputeGBISKernel.cu.

References GBISResults< 2 >::dEdaSum, and GBISResults< 2 >::force.

00212                                                                                                                {
00213   atomicAdd(&out[i], res.dEdaSum);
00214   atomicAdd(&forces[i].x, res.force.x);
00215   atomicAdd(&forces[i].y, res.force.y);
00216   atomicAdd(&forces[i].z, res.force.z);
00217 }

__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 1 >  res,
float *  out,
float4 *  forces 
)

Definition at line 85 of file CudaComputeGBISKernel.cu.

References GBISResults< 1 >::psiSum.

Referenced by for().

00085                                                                                                                {
00086   atomicAdd(&out[i], res.psiSum);
00087 }


Variable Documentation

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cutoff2

Definition at line 303 of file CudaComputeGBISKernel.cu.

Definition at line 18 of file DeviceCUDA.C.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ float* __restrict__ float4* __restrict__ forces
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ inp1

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ inp2

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ inp3

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 lata
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 latb
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 latc
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ float* __restrict__ out

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> param

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ patchPairs

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ tileJatomStart

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ tileLists

Definition at line 303 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ xyzq

Definition at line 303 of file CudaComputeGBISKernel.cu.


Generated on 17 Sep 2019 for NAMD by  doxygen 1.6.1