NAMD
CudaComputeNonbondedKernel Class Reference

#include <CudaComputeNonbondedKernel.h>

Public Member Functions

 CudaComputeNonbondedKernel (int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 
 ~CudaComputeNonbondedKernel ()
 
void updateVdwTypesExcl (const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
 
void nonbondedForce (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
 
void reduceVirialEnergy (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
 
void getVirialEnergy (VirialEnergy *h_virialEnergy, cudaStream_t stream)
 
void bindExclusions (int numExclusions, unsigned int *exclusion_bits)
 
int * getPatchReadyQueue ()
 
void reallocate_forceSOA (int atomStorageSize)
 

Detailed Description

Definition at line 8 of file CudaComputeNonbondedKernel.h.

Constructor & Destructor Documentation

CudaComputeNonbondedKernel::CudaComputeNonbondedKernel ( int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 983 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

984  : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
985 
986  cudaCheck(cudaSetDevice(deviceID));
987 
988  overflowExclusions = NULL;
989  overflowExclusionsSize = 0;
990 
991  exclIndexMaxDiff = NULL;
992  exclIndexMaxDiffSize = 0;
993 
994  atomIndex = NULL;
995  atomIndexSize = 0;
996 
997  vdwTypes = NULL;
998  vdwTypesSize = 0;
999 
1000  patchNumCount = NULL;
1001  patchNumCountSize = 0;
1002 
1003  patchReadyQueue = NULL;
1004  patchReadyQueueSize = 0;
1005 
1006  force_x = force_y = force_z = force_w = NULL;
1007  forceSize = 0;
1008  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
1009  forceSlowSize = 0;
1010 }
CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel ( )

Definition at line 1024 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

1024  {
1025  cudaCheck(cudaSetDevice(deviceID));
1026  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
1027  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
1028  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
1029  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
1030  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
1031  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
1032  if (force_x != NULL) deallocate_device<float>(&force_x);
1033  if (force_y != NULL) deallocate_device<float>(&force_y);
1034  if (force_z != NULL) deallocate_device<float>(&force_z);
1035  if (force_w != NULL) deallocate_device<float>(&force_w);
1036  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
1037  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
1038  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
1039  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
1040 }

Member Function Documentation

void CudaComputeNonbondedKernel::bindExclusions ( int  numExclusions,
unsigned int *  exclusion_bits 
)

Definition at line 1273 of file CudaComputeNonbondedKernel.cu.

References constExclusions, cudaCheck, and MAX_CONST_EXCLUSIONS.

1273  {
1274  int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
1275  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
1276 
1277  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
1278  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
1279 }
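bindExclusions splits the exclusion bitmask between two tiers: the first MAX_CONST_EXCLUSIONS words go to the __constant__ array constExclusions via cudaMemcpyToSymbol, while overflowExclusions in global memory always holds the complete table. A device-side lookup can then serve low word indices from the constant cache and fall back to global memory only for the overflow. The sketch below illustrates that pattern under those declarations; it is not the verbatim kernel code.

// Illustrative two-tier exclusion lookup enabled by this layout.
__device__ unsigned int fetchExclusionWord(
    int wordIndex,
    const unsigned int* __restrict__ overflowExclusions) {
  if (wordIndex < MAX_CONST_EXCLUSIONS)
    return constExclusions[wordIndex];   // served from the constant cache
  return overflowExclusions[wordIndex];  // fall back to global memory
}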
int * CudaComputeNonbondedKernel::getPatchReadyQueue ( )

Definition at line 1054 of file CudaComputeNonbondedKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbonded::launchWork().

1054  {
1055  if (!doStreaming) {
1056  NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
1057  }
1058  return patchReadyQueue;
1059 }
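getPatchReadyQueue exposes the page-locked, mapped host array that nonbondedForce allocates with cudaHostAllocMapped. The streaming kernel writes completed patch indices through the device alias of this memory, so the host can observe them without an explicit copy. A hedged sketch of a host-side consumer follows; the loop is hypothetical (the real consumer is CudaComputeNonbonded::launchWork).

// Hypothetical host-side consumer of the mapped queue. Entries are
// initialized to -1; the streaming kernel overwrites each slot with a
// patch index once that patch's forces are complete.
void drainPatchReadyQueue(CudaComputeNonbondedKernel& kernel, int numPatches) {
  volatile int* queue = kernel.getPatchReadyQueue();
  for (int i = 0; i < numPatches; i++) {
    while (queue[i] == -1) { }         // spin on mapped, page-locked memory
    int patchIndex = queue[i];
    // ... process forces for patchIndex ...
    (void)patchIndex;
  }
}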
void CudaComputeNonbondedKernel::getVirialEnergy ( VirialEnergy *  h_virialEnergy,
cudaStream_t  stream 
)
void CudaComputeNonbondedKernel::nonbondedForce ( CudaTileListKernel &  tlKernel,
const int  atomStorageSize,
const bool  doPairlist,
const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const float3  lata,
const float3  latb,
const float3  latc,
const float4 *  h_xyzq,
const float  cutoff2,
float4 *  d_forces,
float4 *  d_forcesSlow,
float4 *  h_forces,
float4 *  h_forcesSlow,
cudaStream_t  stream 
)

Definition at line 1078 of file CudaComputeNonbondedKernel.cu.

References atomStorageSize, CALL, CudaTileListKernel::clearTileListStat(), cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getNumPatches(), CudaTileListKernel::getNumTileLists(), CudaTileListKernel::getOutputOrder(), NAMD_die(), NONBONDKERNEL_NUM_WARP, numPatches, CudaTileListKernel::setTileListVirialEnergyLength(), stream, and WARPSIZE.

1085  {
1086 
1087  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
1088 
1089  // clear_device_array<float4>(d_forces, atomStorageSize, stream);
1090  // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
1091 
1092 
1093  // XXX TODO: Clear all of these
1094  if(1){
1095  // two clears
1096  tlKernel.clearTileListStat(stream);
1097  clear_device_array<float>(force_x, atomStorageSize, stream);
1098  clear_device_array<float>(force_y, atomStorageSize, stream);
1099  clear_device_array<float>(force_z, atomStorageSize, stream);
1100  clear_device_array<float>(force_w, atomStorageSize, stream);
1101  if (doSlow) {
1102  clear_device_array<float>(forceSlow_x, atomStorageSize, stream);
1103  clear_device_array<float>(forceSlow_y, atomStorageSize, stream);
1104  clear_device_array<float>(forceSlow_z, atomStorageSize, stream);
1105  clear_device_array<float>(forceSlow_w, atomStorageSize, stream);
1106  }
1107  }
1108 
1109  // --- streaming ----
1110  float4* m_forces = NULL;
1111  float4* m_forcesSlow = NULL;
1112  int* m_patchReadyQueue = NULL;
1113  int numPatches = 0;
1114  unsigned int* patchNumCountPtr = NULL;
1115  if (doStreaming) {
1116  numPatches = tlKernel.getNumPatches();
1117  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
1118  // If re-allocated, clear array
1119  clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
1120  }
1121  patchNumCountPtr = patchNumCount;
1122  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
1123  if (re) {
1124  // If re-allocated, re-set to "-1"
1125  for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
1126  }
1127  cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
1128  cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
1129  cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
1130  }
1131  // -----------------
1132 
1133  if (doVirial || doEnergy) {
1134  tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
1135  }
1136 
1137  int shMemSize = 0;
1138 
1139  int* outputOrderPtr = tlKernel.getOutputOrder();
1140 
1141  int nwarp = NONBONDKERNEL_NUM_WARP;
1142  int nthread = WARPSIZE*nwarp;
1143  int start = 0;
1144  while (start < tlKernel.getNumTileLists())
1145  {
1146 
1147  int nleft = tlKernel.getNumTileLists() - start;
1148  int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
1149 
1150 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
1151  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
1152  <<< nblock, nthread, shMemSize, stream >>> \
1153  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
1154  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
1155  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
1156  cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
1157  atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
1158  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
1159  tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
1160  force_x, force_y, force_z, force_w, \
1161  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
1162  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
1163  outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
1164 
1165  bool called = false;
1166 
1167  if (doStreaming) {
1168  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
1169  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
1170  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
1171  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
1172  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
1173  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
1174  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
1175  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
1176 
1177  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1);
1178  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1);
1179  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1);
1180  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1);
1181  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1);
1182  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1);
1183  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1);
1184  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1);
1185  } else {
1186  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
1187  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
1188  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
1189  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
1190  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
1191  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
1192  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
1193  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
1194 
1195  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0);
1196  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0);
1197  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0);
1198  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0);
1199  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0);
1200  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0);
1201  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0);
1202  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0);
1203  }
1204 
1205  if (!called) {
1206  NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
1207  }
1208 
1209  {
1210  int block = 128;
1211  int grid = (atomStorageSize + block - 1)/block;
1212  if (doSlow)
1213  transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
1214  force_x, force_y, force_z, force_w,
1215  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1216  atomStorageSize);
1217  else
1218  transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
1219  force_x, force_y, force_z, force_w,
1220  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1221  atomStorageSize);
1222  }
1223 
1224 #undef CALL
1225  cudaCheck(cudaGetLastError());
1226 
1227  start += nblock*nwarp;
1228  }
1229 
1230 }
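Every boolean option of nonbondedForceKernel is a template parameter, so the CALL macro expands to one fully specialized kernel per flag combination and the if-ladder above merely selects the matching instantiation at runtime; inside the kernel the flags cost nothing because dead branches are eliminated at compile time. A toy sketch of the idiom (not the real kernel):

// Toy illustration of compile-time flag dispatch as used by CALL.
template <bool DOENERGY, bool DOVIRIAL>
__global__ void toyForceKernel(float* out, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  float f = in[i];
  if (DOENERGY) f *= 2.0f;   // compiled in only when DOENERGY == true
  if (DOVIRIAL) f += 1.0f;   // likewise for DOVIRIAL
  out[i] = f;
}

// Runtime flags pick an instantiation, mirroring the CALL ladder.
void launchToy(bool doEnergy, bool doVirial, float* out, const float* in,
               int n, cudaStream_t stream) {
  int block = 128, grid = (n + block - 1) / block;
  if ( doEnergy &&  doVirial) toyForceKernel<true,  true ><<<grid, block, 0, stream>>>(out, in, n);
  if ( doEnergy && !doVirial) toyForceKernel<true,  false><<<grid, block, 0, stream>>>(out, in, n);
  if (!doEnergy &&  doVirial) toyForceKernel<false, true ><<<grid, block, 0, stream>>>(out, in, n);
  if (!doEnergy && !doVirial) toyForceKernel<false, false><<<grid, block, 0, stream>>>(out, in, n);
}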
void CudaComputeNonbondedKernel::reallocate_forceSOA ( int  atomStorageSize)

Definition at line 1012 of file CudaComputeNonbondedKernel.cu.

References atomStorageSize.

1013 {
1014  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
1015  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
1016  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
1017  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
1018  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
1019  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
1020  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
1021  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
1022 }
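The force accumulators are kept as structure-of-arrays (separate x, y, z, w float buffers) so the force kernel can update each component with coalesced accesses; nonbondedForce later transposes them back into float4 arrays with transposeForcesKernel. The 1.4f argument over-allocates capacity so that gradual growth in atomStorageSize does not trigger a device reallocation every step. A simplified model of the reallocate_device semantics assumed here (the real helper lives in CudaUtils.h and may differ in detail):

// Simplified model (assumption, not the CudaUtils.h implementation).
// Returns true when it actually reallocated; old contents are discarded.
template <class T>
bool reallocate_device_sketch(T** pp, int* curSize, int newSize, float overalloc) {
  if (newSize <= *curSize) return false;          // capacity still sufficient
  if (*pp != NULL) cudaFree(*pp);
  *curSize = (int)(newSize * overalloc);          // grow past the request
  cudaMalloc((void**)pp, (*curSize) * sizeof(T));
  return true;
}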
void CudaComputeNonbondedKernel::reduceVirialEnergy ( CudaTileListKernel &  tlKernel,
const int  atomStorageSize,
const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const bool  doGBIS,
float4 *  d_forces,
float4 *  d_forcesSlow,
VirialEnergy *  d_virialEnergy,
cudaStream_t  stream 
)

Definition at line 1235 of file CudaComputeNonbondedKernel.cu.

References atomStorageSize, cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getTileListVirialEnergy(), CudaTileListKernel::getTileListVirialEnergyGBISLength(), CudaTileListKernel::getTileListVirialEnergyLength(), REDUCEGBISENERGYKERNEL_NUM_WARP, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP, REDUCEVIRIALENERGYKERNEL_NUM_WARP, stream, and WARPSIZE.

Referenced by CudaComputeNonbonded::launchWork().

1238  {
1239 
1240  if (doEnergy || doVirial) {
1241  clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
1242  }
1243 
1244  if (doVirial)
1245  {
1246  int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
1247  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1248  reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
1249  (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
1250  cudaCheck(cudaGetLastError());
1251  }
1252 
1253  if (doVirial || doEnergy)
1254  {
1255  int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
1256  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
1257  reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
1258  (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1259  cudaCheck(cudaGetLastError());
1260  }
1261 
1262  if (doGBIS && doEnergy)
1263  {
1264  int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
1265  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
1266  reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
1267  (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1268  cudaCheck(cudaGetLastError());
1269  }
1270 
1271 }
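Each reduction launch clamps its grid to min(deviceCUDA->getMaxNumBlocks(), ceil(n/nthread)), so when the element count exceeds what the capped grid covers in one pass, the kernels must walk the input in grid-sized strides. A minimal sketch of that grid-stride pattern (illustrative only, not the actual reduction kernels, which also reduce within each warp first):

// Illustrative grid-stride reduction skeleton.
__global__ void gridStrideSumSketch(const float* __restrict__ in,
                                    float* out, int n) {
  float acc = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    acc += in[i];                // each thread covers a strided slice
  atomicAdd(out, acc);           // final combine into the single result
}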
void CudaComputeNonbondedKernel::updateVdwTypesExcl ( const int  atomStorageSize,
const int *  h_vdwTypes,
const int2 *  h_exclIndexMaxDiff,
const int *  h_atomIndex,
cudaStream_t  stream 
)

Definition at line 1042 of file CudaComputeNonbondedKernel.cu.

References atomStorageSize, OVERALLOC, and stream.

1043  {
1044 
1045  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
1046  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
1047  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
1048 
1049  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
1050  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
1051  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
1052 }
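updateVdwTypesExcl grows the three per-atom device arrays with the OVERALLOC factor and then queues asynchronous host-to-device copies on the given stream, so the host buffers must remain valid until the stream has consumed them. A hedged caller sketch under that assumption (buffer names and pinned allocation are illustrative, not requirements documented here):

// Hypothetical caller: the host arrays must outlive the async copies
// queued on 'stream'; pinned allocation is assumed for true asynchrony.
void uploadAtomData(CudaComputeNonbondedKernel& kernel, int atomStorageSize,
                    cudaStream_t stream) {
  int* h_vdwTypes; int2* h_exclIndexMaxDiff; int* h_atomIndex;
  cudaMallocHost((void**)&h_vdwTypes,         atomStorageSize * sizeof(int));
  cudaMallocHost((void**)&h_exclIndexMaxDiff, atomStorageSize * sizeof(int2));
  cudaMallocHost((void**)&h_atomIndex,        atomStorageSize * sizeof(int));
  // ... fill the arrays with per-atom data ...
  kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes,
                            h_exclIndexMaxDiff, h_atomIndex, stream);
  cudaStreamSynchronize(stream);   // buffers reusable only after this point
  cudaFreeHost(h_vdwTypes); cudaFreeHost(h_exclIndexMaxDiff);
  cudaFreeHost(h_atomIndex);
}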

The documentation for this class was generated from the following files:

CudaComputeNonbondedKernel.h
CudaComputeNonbondedKernel.cu