NAMD
Classes | Public Member Functions | List of all members
CudaTileListKernel Class Reference

#include <CudaTileListKernel.h>

Public Member Functions

 CudaTileListKernel (int deviceID, bool doStreaming)
 
 ~CudaTileListKernel ()
 
int getNumEmptyPatches ()
 
int * getEmptyPatches ()
 
int getNumExcluded ()
 
float get_plcutoff2 ()
 
int getNumTileLists ()
 
int getNumTileListsGBIS ()
 
int getNumJtiles ()
 
BoundingBox * getBoundingBoxes ()
 
int * getJtiles ()
 
float4 * get_xyzq ()
 
TileListStat * getTileListStatDevPtr ()
 
void clearTileListStat (cudaStream_t stream)
 
int * getTileJatomStart ()
 
TileList * getTileLists ()
 
unsigned int * getTileListDepth ()
 
int * getTileListOrder ()
 
TileExcl * getTileExcls ()
 
PatchPairRecord * getPatchPairs ()
 
int * getTileJatomStartGBIS ()
 
TileList * getTileListsGBIS ()
 
TileListVirialEnergy * getTileListVirialEnergy ()
 
CudaPatchRecord * getCudaPatches ()
 
void prepareTileList (cudaStream_t stream)
 
void finishTileList (cudaStream_t stream)
 
void updateComputes (const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
 
void buildTileLists (const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)
 
void reSortTileLists (const bool doGBIS, cudaStream_t stream)
 
void setTileListVirialEnergyLength (int len)
 
void setTileListVirialEnergyGBISLength (int len)
 
int getTileListVirialEnergyLength ()
 
int getTileListVirialEnergyGBISLength ()
 
int getNumPatches ()
 
int getNumComputes ()
 
int * getOutputOrder ()
 

Detailed Description

Definition at line 87 of file CudaTileListKernel.h.

Constructor & Destructor Documentation

CudaTileListKernel::CudaTileListKernel ( int  deviceID,
bool  doStreaming 
)

Definition at line 705 of file CudaTileListKernel.cu.

References cudaCheck.

705  :
706 deviceID(deviceID), doStreaming(doStreaming) {
707 
708  cudaCheck(cudaSetDevice(deviceID));
709 
710  activeBuffer = 1;
711 
712  numPatches = 0;
713  numComputes = 0;
714 
715  cudaPatches = NULL;
716  cudaPatchesSize = 0;
717 
718  cudaComputes = NULL;
719  cudaComputesSize = 0;
720 
721  patchNumLists = NULL;
722  patchNumListsSize = 0;
723 
724  emptyPatches = NULL;
725  emptyPatchesSize = 0;
726  h_emptyPatches = NULL;
727  h_emptyPatchesSize = 0;
728  numEmptyPatches = 0;
729 
730  sortKeySrc = NULL;
731  sortKeySrcSize = 0;
732  sortKeyDst = NULL;
733  sortKeyDstSize = 0;
734 
735  tileLists1 = NULL;
736  tileLists1Size = 0;
737  tileLists2 = NULL;
738  tileLists2Size = 0;
739 
740  patchPairs1 = NULL;
741  patchPairs1Size = 0;
742  patchPairs2 = NULL;
743  patchPairs2Size = 0;
744 
745  tileJatomStart1 = NULL;
746  tileJatomStart1Size = 0;
747  tileJatomStart2 = NULL;
748  tileJatomStart2Size = 0;
749 
750  boundingBoxes = NULL;
751  boundingBoxesSize = 0;
752 
753  tileListDepth1 = NULL;
754  tileListDepth1Size = 0;
755  tileListDepth2 = NULL;
756  tileListDepth2Size = 0;
757 
758  tileListOrder1 = NULL;
759  tileListOrder1Size = 0;
760  tileListOrder2 = NULL;
761  tileListOrder2Size = 0;
762 
763  tileExcls1 = NULL;
764  tileExcls1Size = 0;
765  tileExcls2 = NULL;
766  tileExcls2Size = 0;
767 
768  xyzq = NULL;
769  xyzqSize = 0;
770 
771  allocate_device<TileListStat>(&d_tileListStat, 1);
772  allocate_host<TileListStat>(&h_tileListStat, 1);
773 
774  tileListPos = NULL;
775  tileListPosSize = 0;
776  tempStorage = NULL;
777  tempStorageSize = 0;
778 
779  jtiles = NULL;
780  jtilesSize = 0;
781 
782  tilePos = NULL;
783  tilePosSize = 0;
784 
785  tileListsGBIS = NULL;
786  tileListsGBISSize = 0;
787 
788  tileJatomStartGBIS = NULL;
789  tileJatomStartGBISSize = 0;
790 
791  tileListVirialEnergy = NULL;
792  tileListVirialEnergySize = 0;
793 
794  atomStorageSize = 0;
795  numTileLists = 0;
796  numTileListsGBIS = 0;
797  numJtiles = 1;
798 
799  outputOrder = NULL;
800  outputOrderSize = 0;
801  doOutputOrder = false;
802 
803  minmaxListLen = NULL;
804  minmaxListLenSize = 0;
805 
806  sortKeys = NULL;
807  sortKeysSize = 0;
808  sortKeys_endbit = 0;
809 
810  cudaCheck(cudaEventCreate(&tileListStatEvent));
811  tileListStatEventRecord = false;
812 }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int atomStorageSize
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__global__ void const int numTileLists
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
CudaTileListKernel::~CudaTileListKernel ( )

Definition at line 814 of file CudaTileListKernel.cu.

References cudaCheck.

814  {
815  cudaCheck(cudaSetDevice(deviceID));
816  deallocate_device<TileListStat>(&d_tileListStat);
817  deallocate_host<TileListStat>(&h_tileListStat);
818  //
819  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
820  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
821  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
822  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
823  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
824  //
825  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
826  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
827  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
828  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
829  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
830  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
831  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
832  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
833  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
834  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
835  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
836  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
837  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
838  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
839  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
840  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
841  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
842  if (jtiles != NULL) deallocate_device<int>(&jtiles);
843  if (tilePos != NULL) deallocate_device<int>(&tilePos);
844 
845  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
846  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
847 
848  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
849 
850  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
851 
852  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
853  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
854 
855  cudaCheck(cudaEventDestroy(tileListStatEvent));
856 }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
#define cudaCheck(stmt)
Definition: CudaUtils.h:79

Member Function Documentation

void CudaTileListKernel::buildTileLists ( const int  numTileListsPrev,
const int  numPatchesIn,
const int  atomStorageSizeIn,
const int  maxTileListLenIn,
const float3  lata,
const float3  latb,
const float3  latc,
const CudaPatchRecord *  h_cudaPatches,
const float4 *  h_xyzq,
const float  plcutoff2In,
const size_t  maxShmemPerBlock,
cudaStream_t  stream 
)

Definition at line 992 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, buildTileListsBBKernel_shmem_sizePerThread(), clearTileListStat(), cudaCheck, deviceCUDA, DeviceCUDA::getMaxNumBlocks(), lata, latb, latc, NAMD_die(), TileListStat::numTileLists, OVERALLOC, stream, TILELISTKERNELNEW_NUM_WARP, TileListStat::tilesSizeExceeded, and WARPSIZE.

997  {
998 
999  numPatches = numPatchesIn;
1000  atomStorageSize = atomStorageSizeIn;
1001  maxTileListLen = maxTileListLenIn;
1002  plcutoff2 = plcutoff2In;
1003 
1004  if (doStreaming) {
1005  // Re-allocate patchNumLists
1006  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1007  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1008  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1009  }
1010 
1011  // Re-allocate (tileLists1, patchPairs1)
1012  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1013  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1014 
1015  // Copy cudaPatches to device
1016  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1017  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1018 
1019  // Re-allocate temporary storage
1020  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1021  // Calculate tile list positions (tilePos)
1022  {
1023  int nthread = 1024;
1024  int nblock = 1;
1025  calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
1026  (numComputes, cudaComputes, cudaPatches, tilePos);
1027  cudaCheck(cudaGetLastError());
1028  }
1029 
1030  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1031  {
1032  int nthread = 512;
1033  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/32)+1);
1034  updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
1035  (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1036  cudaCheck(cudaGetLastError());
1037  }
1038 
1039  // ---------------------------------------------------------------------------------------------
1040 
1041 
1042  // NOTE: tileListDepth2 and tileListOrder2 must have at least the same size as
1043  // tileListDepth1 and tileListOrder1 since they're used in sorting
1044  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1045  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1046 
1047  // Allocate with +1 to include last term in the exclusive sum
1048  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1049 
1050  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1051 
1052  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1053 
1054  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1055 
1056  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1057  {
1058  int numBoundingBoxes = atomStorageSize/WARPSIZE;
1059  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1060 
1061  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1062  int nthread = WARPSIZE*nwarp;
1063  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1064  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1065  cudaCheck(cudaGetLastError());
1066  }
1067 
1068  {
1069  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1070  int nthread = WARPSIZE*nwarp;
1071  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1072 
1073  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1074  if(shmem_size > maxShmemPerBlock){
1075  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1076  }
1077 
1078  // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1079  // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1080  // re-allocation only happens when the size is exceeded.
1081  h_tileListStat->tilesSizeExceeded = true;
1082  int reallocCount = 0;
1083  while (h_tileListStat->tilesSizeExceeded) {
1084  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1085 
1087  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1088 
1089  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1090  (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1091  lata, latb, latc, plcutoff2, maxTileListLen,
1092  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1093  tileListDepth1, tileListOrder1, patchPairs1,
1094  d_tileListStat);
1095 
1096  cudaCheck(cudaGetLastError());
1097 
1098  // get (numATileLists, numJtiles, tilesSizeExceeded)
1099  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1100  cudaCheck(cudaStreamSynchronize(stream));
1101  numJtiles = h_tileListStat->numJtiles;
1102 
1103  if (h_tileListStat->tilesSizeExceeded) {
1104  reallocCount++;
1105  if (reallocCount > 1) {
1106  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1107  }
1108  }
1109 
1110  }
1111 
1112  numTileLists = h_tileListStat->numTileLists;
1113 
1114  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1115  }
1116 
1117  // Re-allocate tileListVirialEnergy.
1118  // NOTE: Since numTileLists here is an upper estimate (based on bounding boxes),
1119  // we're guaranteed to have enough space
1120  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1121 
1122  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1123  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1124  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1125  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1126  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1127 
1128  int numTileListsSrc = numTileListsPrev;
1129  int numJtilesSrc = numJtiles;
1131  int numJtilesDst = numJtiles;
1132 
1133  // Sort tiles
1134  sortTileLists(
1135  false,
1136  0, false,
1137  numTileListsSrc, numJtilesSrc,
1138  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1139  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1140  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1141  numTileListsDst, numJtilesDst,
1142  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1143  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1144  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1145  stream);
1146 
1147  // Set active buffer to 2
1148  setActiveBuffer(2);
1149 
1150  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1151 }
#define OVERALLOC
numTileListsSrc
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void clearTileListStat(cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int atomStorageSize
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
const int numTileListsDst
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen)
int getMaxNumBlocks()
Definition: DeviceCUDA.C:415
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float plcutoff2
void NAMD_die(const char *err_msg)
Definition: common.C:83
__global__ void const int numTileLists
#define BOUNDINGBOXKERNEL_NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
#define TILELISTKERNELNEW_NUM_WARP
#define WARPSIZE
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
void CudaTileListKernel::clearTileListStat ( cudaStream_t  stream)

Definition at line 862 of file CudaTileListKernel.cu.

References getNumEmptyPatches(), TileListStat::patchReadyQueueCount, and stream.

Referenced by buildTileLists(), and CudaComputeNonbondedKernel::nonbondedForce().

862  {
863  // clear tileListStat, for patchReadyQueueCount, which is set equal to the number of empty patches
864  memset(h_tileListStat, 0, sizeof(TileListStat));
865  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
866  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
867 }
__thread cudaStream_t stream
void CudaTileListKernel::finishTileList ( cudaStream_t  stream)

Definition at line 869 of file CudaTileListKernel.cu.

References cudaCheck, and stream.

869  {
870  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
871  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
872  tileListStatEventRecord = true;
873 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
float CudaTileListKernel::get_plcutoff2 ( )
inline

Definition at line 277 of file CudaTileListKernel.h.

277 {return plcutoff2;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float plcutoff2
float4* CudaTileListKernel::get_xyzq ( )
inline

Definition at line 283 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase3(), CudaComputeNonbondedKernel::nonbondedForce(), and CudaComputeNonbondedKernel::reduceVirialEnergy().

283 {return xyzq;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
BoundingBox* CudaTileListKernel::getBoundingBoxes ( )
inline

Definition at line 281 of file CudaTileListKernel.h.

281 {return boundingBoxes;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
CudaPatchRecord* CudaTileListKernel::getCudaPatches ( )
inline

Definition at line 302 of file CudaTileListKernel.h.

302 {return cudaPatches;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
int* CudaTileListKernel::getEmptyPatches ( )
inline

Definition at line 273 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::launchWork().

273 {return h_emptyPatches;}
int* CudaTileListKernel::getJtiles ( )
inline

Definition at line 282 of file CudaTileListKernel.h.

282 {return jtiles;}
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
int CudaTileListKernel::getNumComputes ( )
inline

Definition at line 326 of file CudaTileListKernel.h.

326 {return numComputes;}
int CudaTileListKernel::getNumEmptyPatches ( )
inline

Definition at line 272 of file CudaTileListKernel.h.

Referenced by clearTileListStat(), and CudaComputeNonbonded::launchWork().

272 {return numEmptyPatches;}
int CudaTileListKernel::getNumExcluded ( )
inline

Definition at line 275 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::finishReductions().

275 {return numExcluded;}
int CudaTileListKernel::getNumJtiles ( )
inline

Definition at line 280 of file CudaTileListKernel.h.

280 {return numJtiles;}
int CudaTileListKernel::getNumPatches ( )
inline

Definition at line 324 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::launchWork(), and CudaComputeNonbondedKernel::nonbondedForce().

324 {return numPatches;}
Kernel parameter cross-reference: numPatches — a const int parameter of the nonbonded tile-list __global__ kernel (Doxygen collapsed the kernel's full parameter list into a run-on line; remainder elided).
int CudaTileListKernel::getNumTileLists ( )
inline

Definition at line 278 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

278 {return numTileLists;}
Kernel parameter cross-reference: numTileLists — a const int parameter of a __global__ kernel (Doxygen extraction artifact; full signature elided).
int CudaTileListKernel::getNumTileListsGBIS ( )
inline

Definition at line 279 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase2(), and CudaComputeGBISKernel::GBISphase3().

279 {return numTileListsGBIS;}
int* CudaTileListKernel::getOutputOrder ( )
inline

Definition at line 327 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

327  {
328  if (!doStreaming) return NULL;
329  if (doOutputOrder) {
330  return outputOrder;
331  } else {
332  return NULL;
333  }
334  }
Kernel parameter cross-reference: outputOrder — an int *__restrict__ parameter of the nonbonded tile-list __global__ kernel (Doxygen collapsed the kernel's full parameter list into a run-on line; remainder elided).
PatchPairRecord* CudaTileListKernel::getPatchPairs ( )
inline

Definition at line 295 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

295 {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
TileExcl* CudaTileListKernel::getTileExcls ( )
inline

Definition at line 294 of file CudaTileListKernel.h.

294 {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
int* CudaTileListKernel::getTileJatomStart ( )
inline

Definition at line 288 of file CudaTileListKernel.h.

288 {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
int* CudaTileListKernel::getTileJatomStartGBIS ( )
inline

Definition at line 297 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

297 {return tileJatomStartGBIS;}
unsigned int* CudaTileListKernel::getTileListDepth ( )
inline

Definition at line 292 of file CudaTileListKernel.h.

292 {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
int* CudaTileListKernel::getTileListOrder ( )
inline

Definition at line 293 of file CudaTileListKernel.h.

293 {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
TileList* CudaTileListKernel::getTileLists ( )
inline

Definition at line 289 of file CudaTileListKernel.h.

289  {
290  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
291  }
TileList* CudaTileListKernel::getTileListsGBIS ( )
inline

Definition at line 298 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

298 {return tileListsGBIS;}
TileListStat* CudaTileListKernel::getTileListStatDevPtr ( )
inline

Definition at line 285 of file CudaTileListKernel.h.

285 {return d_tileListStat;}
TileListVirialEnergy* CudaTileListKernel::getTileListVirialEnergy ( )
inline

Definition at line 300 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

300 {return tileListVirialEnergy;}
int CudaTileListKernel::getTileListVirialEnergyGBISLength ( )
inline

Definition at line 322 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

322 {return tileListVirialEnergyGBISLength;}
int CudaTileListKernel::getTileListVirialEnergyLength ( )
inline

Definition at line 321 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

321 {return tileListVirialEnergyLength;}
void CudaTileListKernel::prepareTileList ( cudaStream_t  stream)

Definition at line 858 of file CudaTileListKernel.cu.

References stream.

858  {
859  clear_device_array<int>(jtiles, numJtiles, stream);
860 }
__thread cudaStream_t stream
Kernel parameter cross-reference: jtiles — an int *__restrict__ parameter of the nonbonded tile-list __global__ kernel (Doxygen collapsed the kernel's full parameter list into a run-on line; remainder elided).
void CudaTileListKernel::reSortTileLists ( const bool  doGBIS,
cudaStream_t  stream 
)

Definition at line 1564 of file CudaTileListKernel.cu.

References cudaCheck, NAMD_die(), TileListStat::numExcluded, TileListStat::numTileLists, TileListStat::numTileListsGBIS, and OVERALLOC.

1564  {
1565  // Store previous number of active lists
1566  int numTileListsPrev = numTileLists;
1567 
1568  // Wait for finishTileList() to stop copying
1569  if (!tileListStatEventRecord)
1570  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1571  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1572 
1573  // Get numTileLists, numTileListsGBIS, and numExcluded
1574  {
1575  numTileLists = h_tileListStat->numTileLists;
1576  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1577  numExcluded = h_tileListStat->numExcluded;
1578  }
1579 
1580  // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1581  // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1582  sortTileLists(true, 0, true,
1583  numTileListsPrev, numJtiles,
1584  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1585  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1586  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1587  numTileLists, numJtiles,
1588  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1589  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1590  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1591  stream);
1592 
1593  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1594  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1595  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1596 
1597  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1598 
1599  // NOTE:
1600  // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
1601  // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1602 
1603  if (doGBIS) {
1604  // GBIS is used => produce a second tile list
1605  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1606  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1607  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1608 
1609  sortTileLists(true, 16, true,
1610  numTileListsPrev, numJtiles,
1611  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1612  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1613  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1614  numTileListsGBIS, numJtiles,
1615  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1616  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1617  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1618  stream);
1619  }
1620 
1621  // Set active buffer to be 1
1622  setActiveBuffer(1);
1623 
1624 }
#define OVERALLOC
__thread cudaStream_t stream
void NAMD_die(const char *err_msg)
Definition: common.C:83
Kernel parameter cross-reference: numTileLists — a const int parameter of a __global__ kernel (Doxygen extraction artifact; full signature elided).
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaTileListKernel::setTileListVirialEnergyGBISLength ( int  len)

Definition at line 1662 of file CudaTileListKernel.cu.

References NAMD_die().

Referenced by CudaComputeGBISKernel::GBISphase2().

1662  {
1663  if (len > tileListVirialEnergySize) {
1664  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1665  }
1666  tileListVirialEnergyGBISLength = len;
1667 }
void NAMD_die(const char *err_msg)
Definition: common.C:83
void CudaTileListKernel::setTileListVirialEnergyLength ( int  len)

Definition at line 1655 of file CudaTileListKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

1655  {
1656  if (len > tileListVirialEnergySize) {
1657  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1658  }
1659  tileListVirialEnergyLength = len;
1660 }
void NAMD_die(const char *err_msg)
Definition: common.C:83
void CudaTileListKernel::updateComputes ( const int  numComputesIn,
const CudaComputeRecord h_cudaComputes,
cudaStream_t  stream 
)

Definition at line 875 of file CudaTileListKernel.cu.

References stream.

876  {
877 
878  numComputes = numComputesIn;
879 
880  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
881  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
882 
883  if (doStreaming) doOutputOrder = true;
884 }
__thread cudaStream_t stream

The documentation for this class was generated from the following files: