NAMD
Classes | Public Member Functions | List of all members
CudaTileListKernel Class Reference

#include <CudaTileListKernel.h>

Public Member Functions

 CudaTileListKernel (int deviceID, bool doStreaming)
 
 ~CudaTileListKernel ()
 
int getNumEmptyPatches ()
 
int * getEmptyPatches ()
 
int getNumExcluded ()
 
float get_plcutoff2 ()
 
int getNumTileLists ()
 
int getNumTileListsGBIS ()
 
int getNumJtiles ()
 
BoundingBoxgetBoundingBoxes ()
 
int * getJtiles ()
 
float4 * get_xyzq ()
 
TileListStatgetTileListStatDevPtr ()
 
void clearTileListStat (cudaStream_t stream)
 
int * getTileJatomStart ()
 
TileListgetTileLists ()
 
unsigned int * getTileListDepth ()
 
int * getTileListOrder ()
 
TileExclgetTileExcls ()
 
PatchPairRecordgetPatchPairs ()
 
int * getTileJatomStartGBIS ()
 
TileListgetTileListsGBIS ()
 
TileListVirialEnergygetTileListVirialEnergy ()
 
CudaPatchRecordgetCudaPatches ()
 
void prepareTileList (cudaStream_t stream)
 
void finishTileList (cudaStream_t stream)
 
void updateComputes (const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
 
void buildTileLists (const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)
 
void reSortTileLists (const bool doGBIS, cudaStream_t stream)
 
void setTileListVirialEnergyLength (int len)
 
void setTileListVirialEnergyGBISLength (int len)
 
int getTileListVirialEnergyLength ()
 
int getTileListVirialEnergyGBISLength ()
 
int getNumPatches ()
 
int getNumComputes ()
 
int * getOutputOrder ()
 

Detailed Description

Definition at line 87 of file CudaTileListKernel.h.

Constructor & Destructor Documentation

CudaTileListKernel::CudaTileListKernel ( int  deviceID,
bool  doStreaming 
)

Definition at line 744 of file CudaTileListKernel.cu.

References cudaCheck.

744  :
745 deviceID(deviceID), doStreaming(doStreaming) {
746 
747  cudaCheck(cudaSetDevice(deviceID));
748 
749  activeBuffer = 1;
750 
751  numPatches = 0;
752  numComputes = 0;
753 
754  cudaPatches = NULL;
755  cudaPatchesSize = 0;
756 
757  cudaComputes = NULL;
758  cudaComputesSize = 0;
759 
760  patchNumLists = NULL;
761  patchNumListsSize = 0;
762 
763  emptyPatches = NULL;
764  emptyPatchesSize = 0;
765  h_emptyPatches = NULL;
766  h_emptyPatchesSize = 0;
767  numEmptyPatches = 0;
768 
769  sortKeySrc = NULL;
770  sortKeySrcSize = 0;
771  sortKeyDst = NULL;
772  sortKeyDstSize = 0;
773 
774  tileLists1 = NULL;
775  tileLists1Size = 0;
776  tileLists2 = NULL;
777  tileLists2Size = 0;
778 
779  patchPairs1 = NULL;
780  patchPairs1Size = 0;
781  patchPairs2 = NULL;
782  patchPairs2Size = 0;
783 
784  tileJatomStart1 = NULL;
785  tileJatomStart1Size = 0;
786  tileJatomStart2 = NULL;
787  tileJatomStart2Size = 0;
788 
789  boundingBoxes = NULL;
790  boundingBoxesSize = 0;
791 
792  tileListDepth1 = NULL;
793  tileListDepth1Size = 0;
794  tileListDepth2 = NULL;
795  tileListDepth2Size = 0;
796 
797  tileListOrder1 = NULL;
798  tileListOrder1Size = 0;
799  tileListOrder2 = NULL;
800  tileListOrder2Size = 0;
801 
802  tileExcls1 = NULL;
803  tileExcls1Size = 0;
804  tileExcls2 = NULL;
805  tileExcls2Size = 0;
806 
807  xyzq = NULL;
808  xyzqSize = 0;
809 
810  allocate_device<TileListStat>(&d_tileListStat, 1);
811  allocate_host<TileListStat>(&h_tileListStat, 1);
812 
813  tileListPos = NULL;
814  tileListPosSize = 0;
815  tempStorage = NULL;
816  tempStorageSize = 0;
817 
818  jtiles = NULL;
819  jtilesSize = 0;
820 
821  tilePos = NULL;
822  tilePosSize = 0;
823 
824  tileListsGBIS = NULL;
825  tileListsGBISSize = 0;
826 
827  tileJatomStartGBIS = NULL;
828  tileJatomStartGBISSize = 0;
829 
830  tileListVirialEnergy = NULL;
831  tileListVirialEnergySize = 0;
832 
833  atomStorageSize = 0;
834  numTileLists = 0;
835  numTileListsGBIS = 0;
836  numJtiles = 1;
837 
838  outputOrder = NULL;
839  outputOrderSize = 0;
840  doOutputOrder = false;
841 
842  minmaxListLen = NULL;
843  minmaxListLenSize = 0;
844 
845  sortKeys = NULL;
846  sortKeysSize = 0;
847  sortKeys_endbit = 0;
848 
849  cudaCheck(cudaEventCreate(&tileListStatEvent));
850  tileListStatEventRecord = false;
851 }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
__global__ void const int numTileLists
__shared__ union @43 tempStorage
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
CudaTileListKernel::~CudaTileListKernel ( )

Definition at line 853 of file CudaTileListKernel.cu.

References cudaCheck.

853  {
854  cudaCheck(cudaSetDevice(deviceID));
855  deallocate_device<TileListStat>(&d_tileListStat);
856  deallocate_host<TileListStat>(&h_tileListStat);
857  //
858  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
859  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
860  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
861  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
862  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
863  //
864  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
865  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
866  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
867  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
868  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
869  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
870  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
871  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
872  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
873  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
874  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
875  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
876  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
877  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
878  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
879  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
880  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
881  if (jtiles != NULL) deallocate_device<int>(&jtiles);
882  if (tilePos != NULL) deallocate_device<int>(&tilePos);
883 
884  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
885  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
886 
887  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
888 
889  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
890 
891  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
892  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
893 
894  cudaCheck(cudaEventDestroy(tileListStatEvent));
895 }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__shared__ union @43 tempStorage
#define cudaCheck(stmt)
Definition: CudaUtils.h:95

Member Function Documentation

void CudaTileListKernel::buildTileLists ( const int  numTileListsPrev,
const int  numPatchesIn,
const int  atomStorageSizeIn,
const int  maxTileListLenIn,
const float3  lata,
const float3  latb,
const float3  latc,
const CudaPatchRecord h_cudaPatches,
const float4 *  h_xyzq,
const float  plcutoff2In,
const size_t  maxShmemPerBlock,
cudaStream_t  stream 
)

Definition at line 1031 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, buildTileListsBBKernel_shmem_sizePerThread(), clearTileListStat(), cudaCheck, DEFAULTKERNEL_NUM_THREAD, deviceCUDA, DeviceCUDA::getMaxNumBlocks(), lata, latb, latc, NAMD_die(), TileListStat::numTileLists, OVERALLOC, stream, TILELISTKERNELNEW_NUM_WARP, TileListStat::tilesSizeExceeded, UPDATEPATCHESKERNEL_NUM_THREAD, and WARPSIZE.

1036  {
1037 
1038  numPatches = numPatchesIn;
1039  atomStorageSize = atomStorageSizeIn;
1040  maxTileListLen = maxTileListLenIn;
1041  plcutoff2 = plcutoff2In;
1042 
1043  if (doStreaming) {
1044  // Re-allocate patchNumLists
1045  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1046  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1047  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1048  }
1049 
1050  // Re-allocate (tileLists1, patchPairs1
1051  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1052  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1053 
1054  // Copy cudaPatches to device
1055  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1056  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1057 
1058  // Re-allocate temporary storage
1059  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1060  // Calculate tile list positions (tilePos)
1061  {
1062  int nthread = DEFAULTKERNEL_NUM_THREAD;
1063  int nblock = 1;
1064  calcTileListPosKernel<DEFAULTKERNEL_NUM_THREAD> <<< nblock, nthread, 0, stream >>> (numComputes, cudaComputes, cudaPatches, tilePos);
1065  cudaCheck(cudaGetLastError());
1066  }
1067 
1068  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1069  {
1070  int nthread = UPDATEPATCHESKERNEL_NUM_THREAD;
1071  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/WARPSIZE)+1);
1072  updatePatchesKernel<WARPSIZE> <<< nblock, nthread, 0, stream >>> (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1073  cudaCheck(cudaGetLastError());
1074  }
1075 
1076  // ---------------------------------------------------------------------------------------------
1077 
1078 
1079  // NOTE: tileListDepth2 and tileListOrder2 must have at least same size as
1080  // tileListDepth2 and tileListOrder2 since they're used in sorting
1081  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1082  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1083 
1084  // Allocate with +1 to include last term in the exclusive sum
1085  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1086 
1087  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1088 
1089  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1090 
1091  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1092 
1093  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1094  {
1095  int numBoundingBoxes = atomStorageSize/WARPSIZE;
1096  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1097 
1098  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1099  int nthread = WARPSIZE*nwarp;
1100  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1101  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1102  cudaCheck(cudaGetLastError());
1103  }
1104 
1105  {
1106  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1107  int nthread = WARPSIZE*nwarp;
1108  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1109 
1110  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1111  if(shmem_size > maxShmemPerBlock){
1112  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1113  }
1114 
1115  // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1116  // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1117  // re-allocation only happens when the size is exceeded.
1118  h_tileListStat->tilesSizeExceeded = true;
1119  int reallocCount = 0;
1120  while (h_tileListStat->tilesSizeExceeded) {
1121  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1122 
1124  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1125  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>> (
1126  numTileListsPrev, tileLists1, cudaPatches, tilePos,
1127  lata, latb, latc, plcutoff2, maxTileListLen,
1128  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1129  tileListDepth1, tileListOrder1, patchPairs1,
1130  d_tileListStat);
1131 
1132  cudaCheck(cudaGetLastError());
1133 
1134  // get (numATileLists, numJtiles, tilesSizeExceeded)
1135  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1136  cudaCheck(cudaStreamSynchronize(stream));
1137  numJtiles = h_tileListStat->numJtiles;
1138 
1139  if (h_tileListStat->tilesSizeExceeded) {
1140  reallocCount++;
1141  if (reallocCount > 1) {
1142  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1143  }
1144  }
1145 
1146  }
1147 
1148  numTileLists = h_tileListStat->numTileLists;
1149 
1150  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1151  }
1152 
1153  // Re-allocate tileListVirialEnergy.
1154  // NOTE: Since numTileLists here is an upper estimate (since it's based on bounding boxes),
1155  // we're quaranteed to have enough space
1156  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1157 
1158  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1159  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1160  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1161  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1162  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1163 
1164  int numTileListsSrc = numTileListsPrev;
1165  int numJtilesSrc = numJtiles;
1167  int numJtilesDst = numJtiles;
1168 
1169  // Sort tiles
1170  sortTileLists(
1171  false,
1172  0, false,
1173  numTileListsSrc, numJtilesSrc,
1174  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1175  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1176  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1177  numTileListsDst, numJtilesDst,
1178  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1179  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1180  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1181  stream);
1182 
1183  // Set active buffer to 2
1184  setActiveBuffer(2);
1185 
1186  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1187 }
#define OVERALLOC
numTileListsSrc
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
void clearTileListStat(cudaStream_t stream)
#define WARPSIZE
Definition: CudaUtils.h:10
#define UPDATEPATCHESKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
#define DEFAULTKERNEL_NUM_THREAD
const int numTileListsDst
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
int getMaxNumBlocks()
Definition: DeviceCUDA.C:419
void NAMD_die(const char *err_msg)
Definition: common.C:83
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
__global__ void const int numTileLists
#define BOUNDINGBOXKERNEL_NUM_WARP
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
#define TILELISTKERNELNEW_NUM_WARP
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
void CudaTileListKernel::clearTileListStat ( cudaStream_t  stream)

Definition at line 901 of file CudaTileListKernel.cu.

References getNumEmptyPatches(), TileListStat::patchReadyQueueCount, and stream.

Referenced by buildTileLists(), and CudaComputeNonbondedKernel::nonbondedForce().

901  {
902  // clear tileListStat, for patchReadyQueueCount, which is set equal to the number of empty patches
903  memset(h_tileListStat, 0, sizeof(TileListStat));
904  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
905  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
906 }
__thread cudaStream_t stream
void CudaTileListKernel::finishTileList ( cudaStream_t  stream)

Definition at line 908 of file CudaTileListKernel.cu.

References cudaCheck, and stream.

908  {
909  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
910  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
911  tileListStatEventRecord = true;
912 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
float CudaTileListKernel::get_plcutoff2 ( )
inline

Definition at line 277 of file CudaTileListKernel.h.

277 {return plcutoff2;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
float4* CudaTileListKernel::get_xyzq ( )
inline

Definition at line 283 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase3(), CudaComputeNonbondedKernel::nonbondedForce(), and CudaComputeNonbondedKernel::reduceVirialEnergy().

283 {return xyzq;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
BoundingBox* CudaTileListKernel::getBoundingBoxes ( )
inline

Definition at line 281 of file CudaTileListKernel.h.

281 {return boundingBoxes;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
CudaPatchRecord* CudaTileListKernel::getCudaPatches ( )
inline

Definition at line 302 of file CudaTileListKernel.h.

302 {return cudaPatches;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
int* CudaTileListKernel::getEmptyPatches ( )
inline

Definition at line 273 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::launchWork().

273 {return h_emptyPatches;}
int* CudaTileListKernel::getJtiles ( )
inline

Definition at line 282 of file CudaTileListKernel.h.

282 {return jtiles;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
int CudaTileListKernel::getNumComputes ( )
inline

Definition at line 326 of file CudaTileListKernel.h.

326 {return numComputes;}
int CudaTileListKernel::getNumEmptyPatches ( )
inline

Definition at line 272 of file CudaTileListKernel.h.

Referenced by clearTileListStat(), and CudaComputeNonbonded::launchWork().

272 {return numEmptyPatches;}
int CudaTileListKernel::getNumExcluded ( )
inline

Definition at line 275 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::finishReductions().

275 {return numExcluded;}
int CudaTileListKernel::getNumJtiles ( )
inline

Definition at line 280 of file CudaTileListKernel.h.

280 {return numJtiles;}
int CudaTileListKernel::getNumPatches ( )
inline

Definition at line 324 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::launchWork(), and CudaComputeNonbondedKernel::nonbondedForce().

324 {return numPatches;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
int CudaTileListKernel::getNumTileLists ( )
inline

Definition at line 278 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

278 {return numTileLists;}
__global__ void const int numTileLists
int CudaTileListKernel::getNumTileListsGBIS ( )
inline

Definition at line 279 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase2(), and CudaComputeGBISKernel::GBISphase3().

279 {return numTileListsGBIS;}
int* CudaTileListKernel::getOutputOrder ( )
inline

Definition at line 327 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

327  {
328  if (!doStreaming) return NULL;
329  if (doOutputOrder) {
330  return outputOrder;
331  } else {
332  return NULL;
333  }
334  }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
PatchPairRecord* CudaTileListKernel::getPatchPairs ( )
inline

Definition at line 295 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

295 {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
TileExcl* CudaTileListKernel::getTileExcls ( )
inline

Definition at line 294 of file CudaTileListKernel.h.

294 {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
int* CudaTileListKernel::getTileJatomStart ( )
inline

Definition at line 288 of file CudaTileListKernel.h.

288 {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
int* CudaTileListKernel::getTileJatomStartGBIS ( )
inline

Definition at line 297 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

297 {return tileJatomStartGBIS;}
unsigned int* CudaTileListKernel::getTileListDepth ( )
inline

Definition at line 292 of file CudaTileListKernel.h.

292 {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
int* CudaTileListKernel::getTileListOrder ( )
inline

Definition at line 293 of file CudaTileListKernel.h.

293 {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
TileList* CudaTileListKernel::getTileLists ( )
inline

Definition at line 289 of file CudaTileListKernel.h.

289  {
290  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
291  }
TileList* CudaTileListKernel::getTileListsGBIS ( )
inline

Definition at line 298 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

298 {return tileListsGBIS;}
TileListStat* CudaTileListKernel::getTileListStatDevPtr ( )
inline

Definition at line 285 of file CudaTileListKernel.h.

285 {return d_tileListStat;}
TileListVirialEnergy* CudaTileListKernel::getTileListVirialEnergy ( )
inline

Definition at line 300 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

300 {return tileListVirialEnergy;}
int CudaTileListKernel::getTileListVirialEnergyGBISLength ( )
inline

Definition at line 322 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

322 {return tileListVirialEnergyGBISLength;}
int CudaTileListKernel::getTileListVirialEnergyLength ( )
inline

Definition at line 321 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

321 {return tileListVirialEnergyLength;}
void CudaTileListKernel::prepareTileList ( cudaStream_t  stream)

Definition at line 897 of file CudaTileListKernel.cu.

References stream.

897  {
898  clear_device_array<int>(jtiles, numJtiles, stream);
899 }
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
void CudaTileListKernel::reSortTileLists ( const bool  doGBIS,
cudaStream_t  stream 
)

Definition at line 1586 of file CudaTileListKernel.cu.

References cudaCheck, NAMD_die(), TileListStat::numExcluded, TileListStat::numTileLists, TileListStat::numTileListsGBIS, and OVERALLOC.

1586  {
1587  // Store previous number of active lists
1588  int numTileListsPrev = numTileLists;
1589 
1590  // Wait for finishTileList() to stop copying
1591  if (!tileListStatEventRecord)
1592  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1593  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1594 
1595  // Get numTileLists, numTileListsGBIS, and numExcluded
1596  {
1597  numTileLists = h_tileListStat->numTileLists;
1598  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1599  numExcluded = h_tileListStat->numExcluded;
1600  }
1601 
1602  // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1603  // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1604  sortTileLists(true, 0, true,
1605  numTileListsPrev, numJtiles,
1606  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1607  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1608  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1609  numTileLists, numJtiles,
1610  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1611  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1612  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1613  stream);
1614 
1615  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1616  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1617  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1618 
1619  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1620 
1621  // NOTE:
1622  // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
1623  // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1624 
1625  if (doGBIS) {
1626  // GBIS is used => produce a second tile list
1627  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1628  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1629  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1630 
1631  sortTileLists(true, 16, true,
1632  numTileListsPrev, numJtiles,
1633  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1634  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1635  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1636  numTileListsGBIS, numJtiles,
1637  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1638  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1639  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1640  stream);
1641  }
1642 
1643  // Set active buffer to be 1
1644  setActiveBuffer(1);
1645 
1646 }
#define OVERALLOC
__thread cudaStream_t stream
void NAMD_die(const char *err_msg)
Definition: common.C:83
__global__ void const int numTileLists
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaTileListKernel::setTileListVirialEnergyGBISLength ( int  len)

Definition at line 1684 of file CudaTileListKernel.cu.

References NAMD_die().

Referenced by CudaComputeGBISKernel::GBISphase2().

1684  {
1685  if (len > tileListVirialEnergySize) {
1686  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1687  }
1688  tileListVirialEnergyGBISLength = len;
1689 }
void NAMD_die(const char *err_msg)
Definition: common.C:83
void CudaTileListKernel::setTileListVirialEnergyLength ( int  len)

Definition at line 1677 of file CudaTileListKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

1677  {
1678  if (len > tileListVirialEnergySize) {
1679  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1680  }
1681  tileListVirialEnergyLength = len;
1682 }
void NAMD_die(const char *err_msg)
Definition: common.C:83
void CudaTileListKernel::updateComputes ( const int  numComputesIn,
const CudaComputeRecord h_cudaComputes,
cudaStream_t  stream 
)

Definition at line 914 of file CudaTileListKernel.cu.

References stream.

915  {
916 
917  numComputes = numComputesIn;
918 
919  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
920  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
921 
922  if (doStreaming) doOutputOrder = true;
923 }
__thread cudaStream_t stream

The documentation for this class was generated from the following files: