CudaComputeNonbondedKernel Class Reference

#include <CudaComputeNonbondedKernel.h>


Public Member Functions

 CudaComputeNonbondedKernel (int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbondedKernel ()
void updateVdwTypesExcl (const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
void nonbondedForce (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
void reduceVirialEnergy (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
void getVirialEnergy (VirialEnergy *h_virialEnergy, cudaStream_t stream)
void bindExclusions (int numExclusions, unsigned int *exclusion_bits)
int * getPatchReadyQueue ()
void reallocate_forceSOA (int atomStorageSize)

Detailed Description

Definition at line 8 of file CudaComputeNonbondedKernel.h.


Constructor & Destructor Documentation

CudaComputeNonbondedKernel::CudaComputeNonbondedKernel ( int  deviceID,
                                                         CudaNonbondedTables &  cudaNonbondedTables,
                                                         bool  doStreaming
                                                       )

Definition at line 979 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID,
    CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
  deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {

  cudaCheck(cudaSetDevice(deviceID));

  // All buffers start unallocated; they are sized on first use.
  overflowExclusions = NULL;
  overflowExclusionsSize = 0;

  exclIndexMaxDiff = NULL;
  exclIndexMaxDiffSize = 0;

  atomIndex = NULL;
  atomIndexSize = 0;

  vdwTypes = NULL;
  vdwTypesSize = 0;

  patchNumCount = NULL;
  patchNumCountSize = 0;

  patchReadyQueue = NULL;
  patchReadyQueueSize = 0;

  force_x = force_y = force_z = force_w = NULL;
  forceSize = 0;
  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
  forceSlowSize = 0;
}

CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel (  ) 

Definition at line 1020 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
  if (force_x != NULL) deallocate_device<float>(&force_x);
  if (force_y != NULL) deallocate_device<float>(&force_y);
  if (force_z != NULL) deallocate_device<float>(&force_z);
  if (force_w != NULL) deallocate_device<float>(&force_w);
  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
}
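
A minimal lifecycle sketch, assuming NAMD's CudaComputeNonbondedKernel.h is available and a valid CudaNonbondedTables instance has been built for the target device (the exampleLifetime wrapper is hypothetical):

#include "CudaComputeNonbondedKernel.h"

void exampleLifetime(int deviceID, CudaNonbondedTables& tables) {
  // Constructor selects the CUDA device and leaves all buffers unallocated.
  CudaComputeNonbondedKernel kernel(deviceID, tables, /*doStreaming=*/true);
  // ... bindExclusions(), updateVdwTypesExcl(), nonbondedForce(), ...
}  // Destructor frees whichever device/host buffers were allocated.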


Member Function Documentation

void CudaComputeNonbondedKernel::bindExclusions ( int  numExclusions,
                                                  unsigned int *  exclusion_bits
                                                )

Definition at line 1269 of file CudaComputeNonbondedKernel.cu.

References constExclusions, cudaCheck, and MAX_CONST_EXCLUSIONS.

void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
  // Copy as much of the exclusion bitmask as fits into constant memory ...
  int nconst = (numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS);
  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));

  // ... and mirror the full array in global memory for the overflow case.
  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
}
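
bindExclusions() thus splits the exclusion bitmask between constant and global memory: the first MAX_CONST_EXCLUSIONS words go to the constExclusions symbol for fast access, while the complete array remains reachable through overflowExclusions. A hedged usage sketch, where numExclusions and exclusionBits stand in for the host-side exclusion data and kernel is the instance from the lifecycle sketch above:

// One-time upload of the exclusion bitmask before force evaluation.
kernel.bindExclusions(numExclusions, exclusionBits);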

int * CudaComputeNonbondedKernel::getPatchReadyQueue (  ) 

Definition at line 1050 of file CudaComputeNonbondedKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbonded::launchWork().

int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
  if (!doStreaming) {
    NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
  }
  return patchReadyQueue;
}
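
In streaming mode the returned queue is mapped host memory that the force kernel fills with patch indices as their forces complete; entries are initialized to -1 in nonbondedForce(). A hedged polling sketch (processPatch is hypothetical, and a production consumer such as CudaComputeNonbonded::launchWork() would use proper synchronization rather than a raw spin loop):

int* queue = kernel.getPatchReadyQueue();
for (int i = 0; i < numPatches; ) {
  int patchID = ((volatile int*)queue)[i];  // written by the GPU as patches finish
  if (patchID == -1) continue;              // not ready yet; keep polling
  // processPatch(patchID);                 // hypothetical per-patch consumer
  i++;
}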

void CudaComputeNonbondedKernel::getVirialEnergy ( VirialEnergy *  h_virialEnergy,
                                                   cudaStream_t  stream
                                                 )

void CudaComputeNonbondedKernel::nonbondedForce ( CudaTileListKernel &  tlKernel,
                                                  const int  atomStorageSize,
                                                  const bool  doPairlist,
                                                  const bool  doEnergy,
                                                  const bool  doVirial,
                                                  const bool  doSlow,
                                                  const float3  lata,
                                                  const float3  latb,
                                                  const float3  latc,
                                                  const float4 *  h_xyzq,
                                                  const float  cutoff2,
                                                  float4 *  d_forces,
                                                  float4 *  d_forcesSlow,
                                                  float4 *  h_forces,
                                                  float4 *  h_forcesSlow,
                                                  cudaStream_t  stream
                                                )

Definition at line 1074 of file CudaComputeNonbondedKernel.cu.

References CALL, CudaTileListKernel::clearTileListStat(), cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getNumPatches(), CudaTileListKernel::getNumTileLists(), CudaTileListKernel::getOutputOrder(), NAMD_die(), CudaTileListKernel::setTileListVirialEnergyLength(), and WARPSIZE.

void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
    const int atomStorageSize, const bool doPairlist,
    const bool doEnergy, const bool doVirial, const bool doSlow,
    const float3 lata, const float3 latb, const float3 latc,
    const float4* h_xyzq, const float cutoff2,
    float4* d_forces, float4* d_forcesSlow,
    float4* h_forces, float4* h_forcesSlow,
    cudaStream_t stream) {

  // When not rebuilding the pairlist, coordinates must be uploaded here.
  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);

  // Clear the tile-list statistics and the SOA force accumulation buffers
  // (the float4 outputs are rebuilt by the transpose kernel below).
  tlKernel.clearTileListStat(stream);
  clear_device_array<float>(force_x, atomStorageSize, stream);
  clear_device_array<float>(force_y, atomStorageSize, stream);
  clear_device_array<float>(force_z, atomStorageSize, stream);
  clear_device_array<float>(force_w, atomStorageSize, stream);
  if (doSlow) {
    clear_device_array<float>(forceSlow_x, atomStorageSize, stream);
    clear_device_array<float>(forceSlow_y, atomStorageSize, stream);
    clear_device_array<float>(forceSlow_z, atomStorageSize, stream);
    clear_device_array<float>(forceSlow_w, atomStorageSize, stream);
  }

  // --- streaming ----
  float4* m_forces = NULL;
  float4* m_forcesSlow = NULL;
  int* m_patchReadyQueue = NULL;
  int numPatches = 0;
  unsigned int* patchNumCountPtr = NULL;
  if (doStreaming) {
    numPatches = tlKernel.getNumPatches();
    if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
      // If re-allocated, clear array
      clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
    }
    patchNumCountPtr = patchNumCount;
    bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
    if (re) {
      // If re-allocated, re-set to "-1"
      for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
    }
    cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
    cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
    cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
  }
  // -----------------

  if (doVirial || doEnergy) {
    tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
  }

  int shMemSize = 0;

  int* outputOrderPtr = tlKernel.getOutputOrder();

  int nwarp = NONBONDKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int start = 0;
  while (start < tlKernel.getNumTileLists())
  {
    int nleft = tlKernel.getNumTileLists() - start;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);

// Launch the force kernel instantiation for one combination of compile-time flags.
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
    nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
      <<< nblock, nthread, shMemSize, stream >>> \
      (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
       cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
       vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
       cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
       atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
       tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
       tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
       force_x, force_y, force_z, force_w, \
       forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
       numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
       outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true

    bool called = false;

    // Dispatch exactly one template instantiation for the current flag set.
    if (doStreaming) {
      if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
      if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
      if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
      if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
      if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
      if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
      if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
      if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);

      if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 1);
      if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 1);
      if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 1);
      if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 1);
      if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 1);
      if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 1);
      if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 1);
      if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 1);
    } else {
      if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
      if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
      if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
      if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
      if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
      if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
      if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
      if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);

      if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 0);
      if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 0);
      if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 0);
      if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 0);
      if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 0);
      if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 0);
      if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 0);
      if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 0);
    }

    if (!called) {
      NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
    }

    // Transpose the SOA force accumulators back into the float4 output arrays.
    {
      int block = 128;
      int grid = (atomStorageSize + block - 1)/block;
      if (doSlow)
        transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
            force_x, force_y, force_z, force_w,
            forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
            atomStorageSize);
      else
        transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
            force_x, force_y, force_z, force_w,
            forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
            atomStorageSize);
    }

#undef CALL
    cudaCheck(cudaGetLastError());

    start += nblock*nwarp;
  }

}
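
A hedged sketch of a per-step call sequence, assuming the tile lists in tlKernel are current and all host/device buffers are sized for atomStorageSize (variable names mirror the parameter list above):

kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes, h_exclIndexMaxDiff,
                          h_atomIndex, stream);
kernel.reallocate_forceSOA(atomStorageSize);
kernel.nonbondedForce(tlKernel, atomStorageSize,
                      /*doPairlist=*/true, /*doEnergy=*/true,
                      /*doVirial=*/true, /*doSlow=*/true,
                      lata, latb, latc, h_xyzq, cutoff2,
                      d_forces, d_forcesSlow, h_forces, h_forcesSlow, stream);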

void CudaComputeNonbondedKernel::reallocate_forceSOA ( int  atomStorageSize  ) 

Definition at line 1008 of file CudaComputeNonbondedKernel.cu.

void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
{
  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
}
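
All eight SOA buffers grow with a 1.4x overallocation factor, so modest increases in atomStorageSize do not trigger a device reallocation every step. An illustrative sketch of that pattern (not NAMD's actual reallocate_device implementation):

template <typename T>
bool reallocateDeviceSketch(T** pp, int* capacity, int newSize, float fac) {
  if (*pp != NULL && newSize <= *capacity) return false;  // still fits
  if (*pp != NULL) cudaFree(*pp);
  *capacity = (int)(newSize * fac);                       // overallocate
  cudaMalloc((void**)pp, (*capacity) * sizeof(T));
  return true;  // caller may need to re-clear or re-fill the contents
}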

void CudaComputeNonbondedKernel::reduceVirialEnergy ( CudaTileListKernel &  tlKernel,
                                                      const int  atomStorageSize,
                                                      const bool  doEnergy,
                                                      const bool  doVirial,
                                                      const bool  doSlow,
                                                      const bool  doGBIS,
                                                      float4 *  d_forces,
                                                      float4 *  d_forcesSlow,
                                                      VirialEnergy *  d_virialEnergy,
                                                      cudaStream_t  stream
                                                    )

Definition at line 1231 of file CudaComputeNonbondedKernel.cu.

References cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getTileListVirialEnergy(), CudaTileListKernel::getTileListVirialEnergyGBISLength(), CudaTileListKernel::getTileListVirialEnergyLength(), REDUCEGBISENERGYKERNEL_NUM_WARP, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP, REDUCEVIRIALENERGYKERNEL_NUM_WARP, and WARPSIZE.

Referenced by CudaComputeNonbonded::launchWork().

void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
    const int atomStorageSize, const bool doEnergy, const bool doVirial,
    const bool doSlow, const bool doGBIS, float4* d_forces, float4* d_forcesSlow,
    VirialEnergy* d_virialEnergy, cudaStream_t stream) {

  if (doEnergy || doVirial) {
    clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
  }

  if (doVirial)
  {
    int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
    reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
      (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
    cudaCheck(cudaGetLastError());
  }

  if (doVirial || doEnergy)
  {
    int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
    reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
      (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
    cudaCheck(cudaGetLastError());
  }

  if (doGBIS && doEnergy)
  {
    int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
    reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
      (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
    cudaCheck(cudaGetLastError());
  }
}
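
A hedged sketch of the full reduction round trip, assuming d_virialEnergy points to a single device-side VirialEnergy record and that getVirialEnergy() enqueues the device-to-host copy on the given stream: reduce on the device, queue the copy back, then synchronize before reading the host record:

VirialEnergy h_virialEnergy;
kernel.reduceVirialEnergy(tlKernel, atomStorageSize,
                          doEnergy, doVirial, doSlow, doGBIS,
                          d_forces, d_forcesSlow, d_virialEnergy, stream);
kernel.getVirialEnergy(&h_virialEnergy, stream);
cudaCheck(cudaStreamSynchronize(stream));  // h_virialEnergy valid after this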

void CudaComputeNonbondedKernel::updateVdwTypesExcl ( const int  atomStorageSize,
                                                      const int *  h_vdwTypes,
                                                      const int2 *  h_exclIndexMaxDiff,
                                                      const int *  h_atomIndex,
                                                      cudaStream_t  stream
                                                    )

Definition at line 1038 of file CudaComputeNonbondedKernel.cu.

References OVERALLOC.

void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize,
    const int* h_vdwTypes, const int2* h_exclIndexMaxDiff, const int* h_atomIndex,
    cudaStream_t stream) {

  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);

  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
}
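
A hedged sketch of the host-side inputs, one entry per stored atom in the same order as the coordinate array (the std::vector containers and topology fill are illustrative, not NAMD's actual bookkeeping):

#include <vector>
#include <cuda_runtime.h>  // for int2

std::vector<int>  h_vdwTypes(atomStorageSize);          // VdW type per atom
std::vector<int2> h_exclIndexMaxDiff(atomStorageSize);  // exclusion index and max index diff
std::vector<int>  h_atomIndex(atomStorageSize);         // global atom index
// ... fill from the molecule's topology ...
kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes.data(),
                          h_exclIndexMaxDiff.data(), h_atomIndex.data(), stream);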


The documentation for this class was generated from the following files:

CudaComputeNonbondedKernel.h
CudaComputeNonbondedKernel.cu
Generated on 30 May 2020 for NAMD by doxygen 1.6.1