CudaComputeNonbondedKernel Class Reference

#include <CudaComputeNonbondedKernel.h>


Public Member Functions

 CudaComputeNonbondedKernel (int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbondedKernel ()
void updateVdwTypesExcl (const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
void nonbondedForce (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
void reduceVirialEnergy (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
void getVirialEnergy (VirialEnergy *h_virialEnergy, cudaStream_t stream)
void bindExclusions (int numExclusions, unsigned int *exclusion_bits)
int * getPatchReadyQueue ()
void reallocate_forceSOA (int atomStorageSize)

Detailed Description

Host-side wrapper for the CUDA kernels that evaluate NAMD's short-range nonbonded forces, energies, and virials. It owns the device-side force accumulation buffers (structure-of-arrays layout), the exclusion tables, and, in streaming mode, the host-mapped patch-ready queue.

Definition at line 8 of file CudaComputeNonbondedKernel.h.


Constructor & Destructor Documentation

CudaComputeNonbondedKernel::CudaComputeNonbondedKernel (int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)

Definition at line 983 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

00984                     : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
00985   
00986   cudaCheck(cudaSetDevice(deviceID));
00987 
00988   overflowExclusions = NULL;
00989   overflowExclusionsSize = 0;
00990 
00991   exclIndexMaxDiff = NULL;
00992   exclIndexMaxDiffSize = 0;
00993 
00994   atomIndex = NULL;
00995   atomIndexSize = 0;
00996 
00997   vdwTypes = NULL;
00998   vdwTypesSize = 0;
00999 
01000   patchNumCount = NULL;
01001   patchNumCountSize = 0;
01002 
01003   patchReadyQueue = NULL;
01004   patchReadyQueueSize = 0;
01005 
01006   force_x = force_y = force_z = force_w = NULL;
01007   forceSize = 0;
01008   forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
01009   forceSlowSize = 0;
01010 }

CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel ()

Definition at line 1024 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

01024                                                         {
01025   cudaCheck(cudaSetDevice(deviceID));
01026   if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
01027   if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
01028   if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
01029   if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
01030   if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
01031   if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
01032   if (force_x != NULL) deallocate_device<float>(&force_x);
01033   if (force_y != NULL) deallocate_device<float>(&force_y);
01034   if (force_z != NULL) deallocate_device<float>(&force_z);
01035   if (force_w != NULL) deallocate_device<float>(&force_w);
01036   if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
01037   if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
01038   if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
01039   if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);  
01040 }
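
The constructor only records the device and zeroes every buffer pointer; allocation happens lazily in the member functions below. A minimal lifetime sketch (the host-side setup here is assumed; in NAMD these objects are owned by CudaComputeNonbonded):

  #include "CudaComputeNonbondedKernel.h"

  void exampleSetup(int deviceID, CudaNonbondedTables &tables) {
    // Construct in streaming mode; all device buffers start NULL / size 0.
    CudaComputeNonbondedKernel kernel(deviceID, tables, /*doStreaming=*/true);
    // ... bindExclusions(), updateVdwTypesExcl(), nonbondedForce() per step ...
  } // The destructor frees every device/host buffer that was allocated.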


Member Function Documentation

void CudaComputeNonbondedKernel::bindExclusions (int numExclusions, unsigned int *exclusion_bits)

Definition at line 1273 of file CudaComputeNonbondedKernel.cu.

References constExclusions, cudaCheck, and MAX_CONST_EXCLUSIONS.

01273                                                                                                {
01274         int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
01275         cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
01276 
01277   reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
01278   copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
01279 }
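
The first MAX_CONST_EXCLUSIONS words of the exclusion table are copied into the constant-memory array constExclusions, while the full table is mirrored in the global-memory array overflowExclusions. A hypothetical device-side fetch illustrating that two-tier layout (fetchExclusionBits is not a NAMD function; the real kernel's access pattern may differ):

  __device__ __forceinline__ unsigned int fetchExclusionBits(
      const unsigned int* __restrict__ overflowExclusions, int i) {
    // Low indices hit the fast constant-memory copy; the rest fall back
    // to the complete mirror in global memory.
    return (i < MAX_CONST_EXCLUSIONS) ? constExclusions[i]
                                      : overflowExclusions[i];
  }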

int * CudaComputeNonbondedKernel::getPatchReadyQueue ()

Definition at line 1054 of file CudaComputeNonbondedKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbonded::launchWork().

01054                                                     {
01055   if (!doStreaming) {
01056     NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
01057   }
01058   return patchReadyQueue;
01059 }
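
The queue is allocated with cudaHostAllocMapped (see nonbondedForce() below), so the GPU writes finished patch indices directly into host memory. An illustrative consumer loop (assumed; NAMD's real consumer is CudaComputeNonbonded::launchWork, and numPatches and processPatch are hypothetical):

  int* queue = kernel.getPatchReadyQueue(); // NAMD_die unless doStreaming
  for (int i = 0; i < numPatches; i++) {
    volatile int* slot = &queue[i];
    while (*slot == -1) { /* spin/yield until the GPU publishes a patch */ }
    processPatch(*slot);
  }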

void CudaComputeNonbondedKernel::getVirialEnergy (VirialEnergy *h_virialEnergy, cudaStream_t stream)

void CudaComputeNonbondedKernel::nonbondedForce (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)

Definition at line 1078 of file CudaComputeNonbondedKernel.cu.

References CALL, CudaTileListKernel::clearTileListStat(), cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getNumPatches(), CudaTileListKernel::getNumTileLists(), CudaTileListKernel::getOutputOrder(), NAMD_die(), CudaTileListKernel::setTileListVirialEnergyLength(), and WARPSIZE.

01085                        {
01086 
01087   if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
01088 
01089   // clear_device_array<float4>(d_forces, atomStorageSize, stream);
01090   // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
01091 
01092   
01093   // XXX TODO: Clear all of these
01094   if(1){
01095      // two clears
01096      tlKernel.clearTileListStat(stream);
01097      clear_device_array<float>(force_x, atomStorageSize, stream);
01098      clear_device_array<float>(force_y, atomStorageSize, stream);
01099      clear_device_array<float>(force_z, atomStorageSize, stream);
01100      clear_device_array<float>(force_w, atomStorageSize, stream);
01101      if (doSlow) {
01102        clear_device_array<float>(forceSlow_x, atomStorageSize, stream);
01103        clear_device_array<float>(forceSlow_y, atomStorageSize, stream);
01104        clear_device_array<float>(forceSlow_z, atomStorageSize, stream);
01105        clear_device_array<float>(forceSlow_w, atomStorageSize, stream);
01106      }
01107   }
01108 
01109   // --- streaming ----
01110   float4* m_forces = NULL;
01111   float4* m_forcesSlow = NULL;
01112   int* m_patchReadyQueue = NULL;
01113   int numPatches = 0;
01114   unsigned int* patchNumCountPtr = NULL;
01115   if (doStreaming) {
01116     numPatches = tlKernel.getNumPatches();
01117     if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
01118       // If re-allocated, clear array
01119       clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
01120     }
01121     patchNumCountPtr = patchNumCount;
01122     bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
01123     if (re) {
01124       // If re-allocated, re-set to "-1"
01125       for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
01126     }
01127     cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
01128     cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
01129     cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
01130   }
01131   // -----------------
01132 
01133   if (doVirial || doEnergy) {
01134     tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
01135   }
01136 
01137   int shMemSize = 0;
01138 
01139   int* outputOrderPtr = tlKernel.getOutputOrder();
01140 
01141   int nwarp = NONBONDKERNEL_NUM_WARP;
01142   int nthread = WARPSIZE*nwarp;
01143   int start = 0;
01144   while (start < tlKernel.getNumTileLists())
01145   {
01146 
01147     int nleft = tlKernel.getNumTileLists() - start;
01148     int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
01149 
01150 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
01151     nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
01152   <<< nblock, nthread, shMemSize, stream >>>  \
01153   (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
01154     cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
01155     vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
01156     cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
01157     atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
01158     tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
01159     tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
01160     force_x, force_y, force_z, force_w, \
01161     forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
01162     numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
01163     outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
01164 
01165     bool called = false;
01166 
01167     if (doStreaming) {
01168       if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
01169       if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
01170       if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
01171       if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
01172       if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
01173       if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
01174       if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
01175       if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
01176 
01177       if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 1);
01178       if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 1);
01179       if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 1);
01180       if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 1);
01181       if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 1);
01182       if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 1);
01183       if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 1);
01184       if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 1);
01185     } else {
01186       if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
01187       if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
01188       if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
01189       if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
01190       if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
01191       if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
01192       if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
01193       if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
01194 
01195       if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 0);
01196       if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 0);
01197       if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 0);
01198       if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 0);
01199       if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 0);
01200       if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 0);
01201       if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 0);
01202       if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 0);
01203     }
01204 
01205     if (!called) {
01206       NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
01207     }
01208 
01209     {
01210       int block = 128;
01211       int grid = (atomStorageSize + block - 1)/block;
01212       if (doSlow) 
01213         transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
01214                        force_x, force_y, force_z, force_w,
01215                        forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
01216                        atomStorageSize);
01217       else
01218         transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
01219                        force_x, force_y, force_z, force_w,
01220                        forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
01221                        atomStorageSize);        
01222     }
01223 
01224 #undef CALL
01225     cudaCheck(cudaGetLastError());
01226 
01227     start += nblock*nwarp;
01228   }
01229 
01230 }
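
The CALL macro promotes the five runtime flags to template parameters, so each of the 32 combinations compiles to a specialized kernel with unused branches removed, and the while loop walks the tile lists in chunks of nblock*nwarp (one tile list per warp per launch), keeping every launch under the device's block limit. A simplified sketch of the same compile-time dispatch pattern (demoKernel and launchDemo are illustrative, not NAMD code):

  template <bool DOENERGY, bool DOSLOW>
  __global__ void demoKernel(float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float f = 1.0f;
    if (DOSLOW)   f += 2.0f;  // resolved at compile time, no runtime branch
    if (DOENERGY) out[i] = f; // likewise
  }

  void launchDemo(bool doEnergy, bool doSlow, float* out, int n, cudaStream_t s) {
    int block = 128, grid = (n + block - 1)/block;
    if ( doEnergy &&  doSlow) demoKernel<true,  true ><<<grid, block, 0, s>>>(out, n);
    if ( doEnergy && !doSlow) demoKernel<true,  false><<<grid, block, 0, s>>>(out, n);
    if (!doEnergy &&  doSlow) demoKernel<false, true ><<<grid, block, 0, s>>>(out, n);
    if (!doEnergy && !doSlow) demoKernel<false, false><<<grid, block, 0, s>>>(out, n);
  }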

void CudaComputeNonbondedKernel::reallocate_forceSOA (int atomStorageSize)

Definition at line 1012 of file CudaComputeNonbondedKernel.cu.

01013 {
01014   reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
01015   reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
01016   reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
01017   reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
01018   reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
01019   reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
01020   reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
01021   reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);  
01022 }
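
Forces accumulate in four separate float arrays per set (a structure-of-arrays layout: x/y/z components plus a w slot) and are repacked into float4 by transposeForcesKernel at the end of nonbondedForce(). A minimal sketch of that repack, assuming the simple one-thread-per-atom form (the real transposeForcesKernel<doSlow> also covers the slow-force arrays):

  __global__ void transposeSketch(float4* __restrict__ out,
                                  const float* __restrict__ x,
                                  const float* __restrict__ y,
                                  const float* __restrict__ z,
                                  const float* __restrict__ w, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = make_float4(x[i], y[i], z[i], w[i]);
  }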

void CudaComputeNonbondedKernel::reduceVirialEnergy (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)

Definition at line 1235 of file CudaComputeNonbondedKernel.cu.

References cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getTileListVirialEnergy(), CudaTileListKernel::getTileListVirialEnergyGBISLength(), CudaTileListKernel::getTileListVirialEnergyLength(), REDUCEGBISENERGYKERNEL_NUM_WARP, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP, REDUCEVIRIALENERGYKERNEL_NUM_WARP, and WARPSIZE.

Referenced by CudaComputeNonbonded::launchWork().

01238                                                      {
01239 
01240   if (doEnergy || doVirial) {
01241     clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
01242   }
01243 
01244   if (doVirial)
01245   {
01246     int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
01247     int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
01248     reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
01249     (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
01250     cudaCheck(cudaGetLastError());
01251   }
01252 
01253   if (doVirial || doEnergy)
01254   {
01255     int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
01256     int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
01257     reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
01258     (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
01259     cudaCheck(cudaGetLastError());
01260   }  
01261 
01262   if (doGBIS && doEnergy)
01263   {
01264     int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
01265     int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
01266     reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
01267     (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
01268     cudaCheck(cudaGetLastError());
01269   }
01270 
01271 }
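
Each reduction sizes its grid the same way: ceiling-divide the element count by the block's thread count and cap at the device's block limit, with the kernels assumed to grid-stride over any remainder. The idiom, factored out for clarity (reductionGridSize is illustrative, not a NAMD helper):

  #include <algorithm>

  static inline int reductionGridSize(int n, int nthread, int maxNumBlocks) {
    return std::min(maxNumBlocks, (n - 1) / nthread + 1); // ceil(n/nthread), capped
  }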

void CudaComputeNonbondedKernel::updateVdwTypesExcl (const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)

Definition at line 1042 of file CudaComputeNonbondedKernel.cu.

References OVERALLOC.

01043                                                                                {
01044 
01045   reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
01046   reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
01047   reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
01048 
01049   copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
01050   copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
01051   copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
01052 }
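
Taken together, the members on this page suggest the following per-step host sequence (an assumption pieced together from the signatures above, not NAMD's actual control flow; the h_* and d_* names stand for caller-owned buffers):

  kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes, h_exclIndexMaxDiff,
                            h_atomIndex, stream);
  kernel.reallocate_forceSOA(atomStorageSize);
  kernel.nonbondedForce(tlKernel, atomStorageSize, doPairlist, doEnergy,
                        doVirial, doSlow, lata, latb, latc, h_xyzq, cutoff2,
                        d_forces, d_forcesSlow, h_forces, h_forcesSlow, stream);
  if (doEnergy || doVirial) {
    kernel.reduceVirialEnergy(tlKernel, atomStorageSize, doEnergy, doVirial,
                              doSlow, doGBIS, d_forces, d_forcesSlow,
                              d_virialEnergy, stream);
    kernel.getVirialEnergy(h_virialEnergy, stream);
  }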


The documentation for this class was generated from the following files:

CudaComputeNonbondedKernel.h
CudaComputeNonbondedKernel.cu

Generated on 19 Sep 2020 for NAMD by doxygen 1.6.1