CudaComputeGBISKernel.cu File Reference

#include <cuda.h>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#include "ComputeGBIS.inl"
#include "DeviceCUDA.h"

Go to the source code of this file.

Classes

struct  GBISParam< phase >
struct  GBISInput< phase >
struct  GBISResults< phase >
struct  GBISParam< 1 >
struct  GBISInput< 1 >
struct  GBISResults< 1 >
struct  GBISParam< 2 >
struct  GBISInput< 2 >
struct  GBISResults< 2 >
struct  GBISParam< 3 >
struct  GBISInput< 3 >
struct  GBISResults< 3 >

Defines

#define GBIS_CUDA
#define LARGE_FLOAT   (float)(1.0e10)
#define GBISKERNEL_NUM_WARP   4
#define CALL(DOENERGY, DOSLOW)

Functions

__device__ __forceinline__ void shuffleNext (float &w)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 1 > param, const GBISInput< 1 > inp, GBISResults< 1 > &resI, GBISResults< 1 > &resJ, float &energy)
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 1 > res, float *out, float4 *forces)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 2 > param, const GBISInput< 2 > inp, GBISResults< 2 > &resI, GBISResults< 2 > &resJ, float &energyT)
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 2 > res, float *out, float4 *forces)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 3 > param, const GBISInput< 3 > inp, GBISResults< 3 > &resI, GBISResults< 3 > &resJ, float &energy)
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 3 > res, float *out, float4 *forces)
template<bool doEnergy, bool doSlow, int phase>
__global__ void __launch_bounds__ (32 *GBISKERNEL_NUM_WARP, 6) GBIS_Kernel(const int numTileLists
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ float
*__restrict__ float4
*__restrict__
TileListVirialEnergy
*__restrict__ virialEnergy 
for (int itileList=threadIdx.x/WARPSIZE+blockDim.x/WARPSIZE *blockIdx.x;itileList< numTileLists;itileList+=blockDim.x/WARPSIZE *gridDim.x)

Variables

__thread DeviceCUDAdeviceCUDA
__global__ void const TileList
*__restrict__ 
tileLists
__global__ void const TileList
*__restrict__ const int
*__restrict__ 
tileJatomStart
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__ 
patchPairs
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 
lata
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3 
latb
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 
latc
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ 
xyzq
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float 
cutoff2
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > 
param
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ 
inp1
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ 
inp2
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ 
inp3
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ float
*__restrict__ 
out
__global__ void const TileList
*__restrict__ const int
*__restrict__ const
PatchPairRecord *__restrict__
const float3 const float3
const float3 const float4
*__restrict__ const float
const GBISParam< phase > const
float *__restrict__ const
float *__restrict__ const
float *__restrict__ float
*__restrict__ float4
*__restrict__ 
forces

Define Documentation

#define CALL ( DOENERGY,
DOSLOW   ) 
Value:
GBIS_Kernel<DOENERGY, DOSLOW, 2> \
     <<< nblock, nthread, 0, stream >>> \
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
      tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
      param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
#define GBIS_CUDA

Definition at line 10 of file CudaComputeGBISKernel.cu.

#define GBISKERNEL_NUM_WARP   4
#define LARGE_FLOAT   (float)(1.0e10)

Definition at line 18 of file CudaComputeGBISKernel.cu.

Referenced by if().


Function Documentation

template<bool doEnergy, bool doSlow, int phase>
__global__ void __launch_bounds__ ( 32 *  GBISKERNEL_NUM_WARP,
 
) const [inline]
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 3 >  param,
const GBISInput< 3 >  inp,
GBISResults< 3 > &  resI,
GBISResults< 3 > &  resJ,
float &  energy 
) [inline]

Definition at line 266 of file CudaComputeGBISKernel.cu.

References GBISParam< 3 >::a_cut, CalcDH(), GBISInput< 3 >::dHdrPrefixI, GBISInput< 3 >::dHdrPrefixJ, GBISResults< 3 >::force, GBISInput< 3 >::intRadIS, GBISInput< 3 >::intRadJ0, GBISInput< 3 >::intRadSJ, and GBISInput< 3 >::qi.

00268                                                              {
00269 
00270   if (r2 < cutoff2 && r2 > 0.01f) {
00271     float r_i = rsqrtf(r2);
00272     float r  = r2 * r_i;
00273     float dhij, dhji;
00274     int dij, dji;
00275     CalcDH(r, r2, r_i, param.a_cut, inp.qi,       inp.intRadSJ, dhij, dij);
00276     CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);
00277 
00278     float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
00279     float tmpx = dx * forceAlpha;
00280     float tmpy = dy * forceAlpha;
00281     float tmpz = dz * forceAlpha;
00282     resI.force.x += tmpx;
00283     resI.force.y += tmpy;
00284     resI.force.z += tmpz;
00285     resJ.force.x -= tmpx;
00286     resJ.force.y -= tmpy;
00287     resJ.force.z -= tmpz;
00288   }
00289 
00290 }

template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 2 >  param,
const GBISInput< 2 >  inp,
GBISResults< 2 > &  resI,
GBISResults< 2 > &  resJ,
float &  energyT 
) [inline]

Definition at line 139 of file CudaComputeGBISKernel.cu.

References GBISInput< 2 >::bornRadI, GBISInput< 2 >::bornRadJ, GBISResults< 2 >::dEdaSum, GBISParam< 2 >::epsilon_p_i, GBISParam< 2 >::epsilon_s_i, GBISResults< 2 >::force, if(), GBISParam< 2 >::kappa, GBISInput< 2 >::qi, GBISInput< 2 >::qj, GBISParam< 2 >::r_cut2, GBISParam< 2 >::r_cut_2, GBISParam< 2 >::r_cut_4, GBISParam< 2 >::scaling, and GBISParam< 2 >::smoothDist.

00141                                                               {
00142 
00143   if (r2 < cutoff2 && r2 > 0.01f) {
00144     float r_i = rsqrtf(r2);
00145     float r  = r2 * r_i;
00146     //float bornRadJ = sh_jBornRad[j];
00147 
00148     //calculate GB energy
00149     float qiqj = inp.qi*inp.qj;
00150     float aiaj = inp.bornRadI*inp.bornRadJ;
00151     float aiaj4 = 4*aiaj;
00152     float expr2aiaj4 = exp(-r2/aiaj4);
00153     float fij = sqrt(r2 + aiaj*expr2aiaj4);
00154     float f_i = 1/fij;
00155     float expkappa = exp(-param.kappa*fij);
00156     float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
00157     float gbEij = qiqj*Dij*f_i;
00158 
00159     //calculate energy derivatives
00160     float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
00161     float ddrf_i = -ddrfij*f_i*f_i;
00162     float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
00163     float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
00164 
00165     //NAMD smoothing function
00166     float scale = 1.f;
00167     float ddrScale = 0.f;
00168     float forcedEdr;
00169     if (param.smoothDist > 0.f) {
00170       scale = r2 * param.r_cut_2 - 1.f;
00171       scale *= scale;
00172       ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
00173       if (doEnergy) energyT += gbEij * scale;
00174       forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
00175     } else {
00176       if (doEnergy) energyT += gbEij;
00177       forcedEdr = ddrGbEij;
00178     }
00179 
00180     //add dEda
00181     if (doSlow) {
00182       float dEdai = 0.5f*qiqj*f_i*f_i
00183                 *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
00184                 *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
00185       resI.dEdaSum += dEdai;
00186       float dEdaj = 0.5f*qiqj*f_i*f_i
00187                 *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
00188                 *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
00189       resJ.dEdaSum += dEdaj;
00190     }
00191 
00192     forcedEdr *= r_i;
00193     float tmpx = dx*forcedEdr;
00194     float tmpy = dy*forcedEdr;
00195     float tmpz = dz*forcedEdr;
00196     resI.force.x += tmpx;
00197     resI.force.y += tmpy;
00198     resI.force.z += tmpz;
00199     resJ.force.x -= tmpx;
00200     resJ.force.y -= tmpy;
00201     resJ.force.z -= tmpz;
00202   }
00203 
00204   // GB Self Energy
00205   if (doEnergy) {
00206     if (r2 < 0.01f) {
00207       float fij = inp.bornRadI;//inf
00208       float expkappa = exp(-param.kappa*fij);//0
00209       float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
00210       float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
00211       energyT += 0.5f*gbEij;
00212     } //same atom or within cutoff
00213   }
00214 }

template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 1 >  param,
const GBISInput< 1 >  inp,
GBISResults< 1 > &  resI,
GBISResults< 1 > &  resJ,
float &  energy 
) [inline]

Definition at line 70 of file CudaComputeGBISKernel.cu.

References GBISParam< 1 >::a_cut, CalcH(), GBISInput< 1 >::intRad0j, GBISInput< 1 >::intRadSi, GBISResults< 1 >::psiSum, GBISInput< 1 >::qi, and GBISInput< 1 >::qj.

00072                                                              {
00073 
00074   if (r2 < cutoff2 && r2 > 0.01f) {
00075     float r_i = rsqrtf(r2);
00076     float r  = r2 * r_i;
00077     float hij;
00078     int dij;
00079     CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
00080     resI.psiSum += hij;
00081     float hji;
00082     int dji;
00083     CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
00084     resJ.psiSum += hji;
00085   }
00086 
00087 }

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ float* __restrict__ float4* __restrict__ TileListVirialEnergy* __restrict__ virialEnergy for (  ) 
Type Constraints

Definition at line 322 of file CudaComputeGBISKernel.cu.

References GBISKERNEL_NUM_WARP, PatchPairRecord::iatomSize, TileList::iatomStart, itileList, j, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, tempStorage, WARP_FULL_MASK, WARP_SHUFFLE, WARPSIZE, and writeResult().

00323   {
00324     TileList tmp = tileLists[itileList];
00325     int iatomStart = tmp.iatomStart;
00326     int jtileStart = tmp.jtileStart;
00327     int jtileEnd   = tmp.jtileEnd;
00328     PatchPairRecord PPStmp = patchPairs[itileList];
00329     int iatomSize     = PPStmp.iatomSize;
00330     int jatomSize     = PPStmp.jatomSize;
00331 
00332     float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
00333     float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
00334     float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
00335 
00336     // Warp index (0...warpsize-1)
00337     const int wid = threadIdx.x % WARPSIZE;
00338 
00339     // Load i-atom data (and shift coordinates)
00340     float4 xyzq_i = xyzq[iatomStart + wid];
00341     xyzq_i.x += shx;
00342     xyzq_i.y += shy;
00343     xyzq_i.z += shz;
00344 
00345     GBISInput<phase> inp;
00346     inp.loadI(iatomStart + wid, inp1, inp2, inp3);
00347     if (phase == 2) inp.initQi(param, xyzq_i.w);
00348 
00349     // Number of i loops
00350     int nloopi = min(iatomSize - iatomStart, WARPSIZE);
00351 
00352     GBISResults<phase> resI;
00353     resI.init();
00354     float energyT;
00355     if (doEnergy) energyT = 0.0f;
00356 
00357     for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
00358 
00359       // Load j-atom starting index and exclusion mask
00360       int jatomStart = tileJatomStart[jtile];
00361 
00362       // Load coordinates and charge
00363       float4 xyzq_j  = xyzq[jatomStart + wid];
00364 
00365       inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
00366       if (phase == 2) inp.initQj(xyzq_j.w);
00367 
00368       // Number of j loops
00369       int nloopj = min(jatomSize - jatomStart, WARPSIZE);
00370 
00371       const bool diag_tile = (iatomStart == jatomStart);
00372       const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
00373       int t = (phase != 2 && diag_tile) ? 1 : 0;
00374       if (phase != 2 && diag_tile) {
00375         inp.shuffleNext();
00376       }
00377 
00378       GBISResults<phase> resJ;
00379       resJ.init();
00380 
00381       for (;t < WARPSIZE;t++) {
00382         int j = (t + wid) & modval;
00383 
00384         float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
00385         float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
00386         float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
00387 
00388         float r2 = dx*dx + dy*dy + dz*dz;
00389 
00390         if (j < nloopj && wid < nloopi) {
00391           calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
00392         }
00393 
00394         inp.shuffleNext();
00395         resJ.shuffleNext();
00396       } // t
00397 
00398       // Write j
00399       writeResult(jatomStart + wid, resJ, out, forces);
00400 
00401     } // jtile
00402 
00403     // Write i
00404     writeResult(iatomStart + wid, resI, out, forces);
00405 
00406     // Reduce energy
00407     if (doEnergy) {
00408       typedef cub::WarpReduce<double> WarpReduceDouble;
00409       __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
00410       int warpId = threadIdx.x / WARPSIZE;
00411       volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
00412       if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
00413     }
00414 
00415   }

__device__ __forceinline__ void shuffleNext ( float &  w  ) 

Definition at line 23 of file CudaComputeGBISKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

00023                            {
00024   w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00025 }

__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 3 >  res,
float *  out,
float4 *  forces 
)

Definition at line 292 of file CudaComputeGBISKernel.cu.

References GBISResults< 3 >::force.

00292                                                                                                                {
00293   atomicAdd(&forces[i].x, res.force.x);
00294   atomicAdd(&forces[i].y, res.force.y);
00295   atomicAdd(&forces[i].z, res.force.z);
00296 }

__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 2 >  res,
float *  out,
float4 *  forces 
)

Definition at line 216 of file CudaComputeGBISKernel.cu.

References GBISResults< 2 >::dEdaSum, and GBISResults< 2 >::force.

00216                                                                                                                {
00217   atomicAdd(&out[i], res.dEdaSum);
00218   atomicAdd(&forces[i].x, res.force.x);
00219   atomicAdd(&forces[i].y, res.force.y);
00220   atomicAdd(&forces[i].z, res.force.z);
00221 }

__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 1 >  res,
float *  out,
float4 *  forces 
)

Definition at line 89 of file CudaComputeGBISKernel.cu.

References GBISResults< 1 >::psiSum.

Referenced by for().

00089                                                                                                                {
00090   atomicAdd(&out[i], res.psiSum);
00091 }


Variable Documentation

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cutoff2

Definition at line 307 of file CudaComputeGBISKernel.cu.

Definition at line 18 of file DeviceCUDA.C.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ float* __restrict__ float4* __restrict__ forces
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ inp1

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ inp2

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ inp3

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 lata
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 latb
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 latc
__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> const float* __restrict__ const float* __restrict__ const float* __restrict__ float* __restrict__ out

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float const GBISParam<phase> param

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ patchPairs

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ tileJatomStart

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ tileLists

Definition at line 307 of file CudaComputeGBISKernel.cu.

__global__ void const TileList* __restrict__ const int* __restrict__ const PatchPairRecord* __restrict__ const float3 const float3 const float3 const float4* __restrict__ xyzq

Definition at line 307 of file CudaComputeGBISKernel.cu.


Generated on 12 Jul 2020 for NAMD by  doxygen 1.6.1