#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/cub.cuh>
#else
#include <namd_cub/device/device_radix_sort.cuh>
#include <namd_cub/device/device_scan.cuh>
#include <namd_cub/cub.cuh>
#endif

#define __thread __declspec(thread)

#define OVERALLOC 1.2f

#if __CUDA_ARCH__ < 350
// __ldg is unavailable below sm_35; fall back to a plain dereference
#define __ldg *
#endif
__global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {

  for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < numTileLists; i += blockDim.x*gridDim.x)
  {
    int2 patchInd = tileLists[i].patchInd;
    atomicAdd(&patchNumLists[patchInd.x], 1);
    if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
  }

}
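// Each tile list references one patch pair, so the two atomicAdds above accumulate,
// per patch, the number of tile lists that touch it. Patches whose count stays zero
// are detected as empty in setPatchNumLists_findEmptyPatches below.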
__global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {

  for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < numTileLists; i += blockDim.x*gridDim.x)
  {
    int2 patchInd = tileLists[i].patchInd;
    int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
    tileLists[i].patchNumList = patchNumList;
  }

  for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < numPatches; i += blockDim.x*gridDim.x)
  {
    if (patchNumLists[i] == 0) {
      int ind = atomicAdd(numEmptyPatches, 1);
      emptyPatches[ind] = i;
    }
  }

}
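// The atomicAdd on numEmptyPatches implements a simple parallel compaction: every
// thread that finds an empty patch claims a unique slot in emptyPatches[]. The
// resulting list is unordered, which is acceptable for its consumers.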
__global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
  const int begin_bit, const unsigned int* __restrict__ sortKeys,
  unsigned int* __restrict__ sortKey) {

  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x; itileList < numTileLists; itileList += blockDim.x*gridDim.x)
  {
    int icompute = tileLists[itileList].icompute;
    int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
    int i = icompute*maxTileListLen + (depth - 1);
    sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
  }

}
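// sortKeys is laid out as a numComputes x maxTileListLen table indexed by
// (icompute, depth-1). Zero-depth lists get the maximum key 0x7fffffff so an
// ascending sort pushes them to the end.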
template <int width>
__global__ void localSort(const int n, const int begin_bit, const int num_bit,
  unsigned int* __restrict__ keys, int* __restrict__ vals) {

  // Requires blockDim.x == width

  for (int base = blockDim.x*blockIdx.x; base < n; base += blockDim.x*gridDim.x)
  {
    int i = base + threadIdx.x;
    typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage tempStorage;
    unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
    int val[1] = {(i < n) ? vals[i] : 0};
    // (reconstructed: sort the extracted keys within the block and write back)
    BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
    if (i < n) {
      keys[i] = key[0];
      vals[i] = val[0];
    }
    __syncthreads();
  }

}
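// Each block sorts one width-sized chunk of (key, value) pairs in place, one item
// per thread; lanes past n are padded with zero keys so the tail chunk still forms
// a full block. The write-back stores the down-shifted 16-bit key, which lines up
// with sortTileLists() clearing doShiftDown after the localSort pass.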
__global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
  const int* __restrict__ tileListOrderSrc, const unsigned int* __restrict__ tileListDepthSrc,
  int* __restrict__ tileListOrderDst, unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < numTileListsSrc; i += blockDim.x*gridDim.x)
  {
    int j = outputOrder[numTileListsSrc - i - 1];
    if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
      int k = tileListPos[i];
      tileListDepthDst[k] = tileListDepthSrc[j];
      tileListOrderDst[k] = j;
    }
  }

}
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
  unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < numTileLists; i += blockDim.x*gridDim.x)
  {
    int j = outputOrder[numTileLists - i - 1];
    tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
  }

}
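// Reducing each depth to a 0/1 flag turns the subsequent exclusive prefix sum
// (cub::DeviceScan::ExclusiveSum over tileListDepthDst in sortTileLists) into a
// computation of the output positions of the non-empty tile lists, which
// storeInReverse then scatters into place.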
__global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
  int2* __restrict__ minmaxListLen) {

  // Seed min past the largest possible length and max below the smallest
  int2 val;
  val.x = maxTileListLen+1;
  val.y = 0;
  for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < numComputes; i += blockDim.x*gridDim.x)
  {
    minmaxListLen[i] = val;
  }

}
__global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
  const TileList* __restrict__ tileListsSrc,
  const int* __restrict__ tileListOrderDst,
  const unsigned int* __restrict__ tileListDepthDst,
  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < numTileListsDst; i += blockDim.x*gridDim.x)
  {
    int k = tileListOrderDst[i];
    int icompute = tileListsSrc[k].icompute;
    int depth = tileListDepthDst[i] & 65535;
    int j = icompute*maxTileListLen + (depth-1);
    sortKeys[j] = i;
    // Update the per-compute (min, max) list length; the non-atomic test first
    // avoids issuing atomics when this depth cannot change the stored extrema
    int2 minmax = minmaxListLen[icompute];
    int2 minmaxOrig = minmax;
    if (minmax.x > depth) minmax.x = depth;
    if (minmax.y < depth) minmax.y = depth;
    if (minmax.x != minmaxOrig.x) {
      atomicMin(&minmaxListLen[icompute].x, minmax.x);
    }
    if (minmax.y != minmaxOrig.y) {
      atomicMax(&minmaxListLen[icompute].y, minmax.y);
    }
  }

}
__global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x; i < numComputes; i += blockDim.x/WARPSIZE*gridDim.x) {
    const int wid = threadIdx.x % WARPSIZE;
    int2 minmax = minmaxListLen[i];
    int minlen = minmax.x;
    int maxlen = minmax.y;
    // minmax is still in its initialized state (min > max) when this compute
    // produced no tile lists; fall back to the full [1, maxTileListLen] range
    if ( maxlen < minlen ) {
      minlen = 1;
      maxlen = maxTileListLen;
    }
    unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
    unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
    unsigned int aveKey = (maxKey + minKey)/2;
    for (int j=wid; j < minlen-1; j+=WARPSIZE) {
      sortKeys[i*maxTileListLen + j] = minKey;
    }
    for (int j=maxlen+wid; j < maxTileListLen; j+=WARPSIZE) {
      sortKeys[i*maxTileListLen + j] = maxKey;
    }
    for (int j=wid; j < maxTileListLen; j+=WARPSIZE) {
      if (sortKeys[i*maxTileListLen + j] == 0) {
        sortKeys[i*maxTileListLen + j] = aveKey;
      }
    }
  }

}
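// One warp handles one compute: entries below the observed minimum length are
// clamped to minKey, entries above the maximum to maxKey, and any slot never
// written by buildSortKeys gets the average key, so every possible depth maps to
// a sensible sort key.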
#define BOUNDINGBOXKERNEL_NUM_WARP 8
__global__ void buildBoundingBoxesKernel(const int atomStorageSize,
  const float4* __restrict__ xyzq, BoundingBox* __restrict__ boundingBoxes) {

  const int warpId = threadIdx.x / WARPSIZE;
  const int wid = threadIdx.x % WARPSIZE;

  // One warp per bounding box (WARPSIZE atoms)
  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x; iwarp < atomStorageSize; iwarp += blockDim.x*gridDim.x) {

    const int i = iwarp + wid;
    // Bounding box index
    const int ibb = iwarp/WARPSIZE;

    // Clamp the index so the last, partially filled warp reads a valid address
    float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];

    volatile float3 minxyz, maxxyz;

    typedef cub::WarpReduce<float> WarpReduce;
    __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
    minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
    minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
    minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
    maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
    maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
    maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());

    // Only lane 0 holds the valid WarpReduce result
    if (wid == 0) {
      BoundingBox boundingBox;
      boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
      boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
      boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
      boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
      boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
      boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
      boundingBoxes[ibb] = boundingBox;
    }

  }

}
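// Boxes are stored as a center (x, y, z) plus half-widths (wx, wy, wz); this makes
// the box-box separation test in distsq() below a handful of fabsf/subtract
// operations with no corner comparisons.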
__device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
  float r2 = dx*dx + dy*dy + dz*dz;
  return r2;
}
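// Per axis, fabsf(a.x - b.x) - a.wx - b.wx is the gap between the two box faces;
// clamping at zero makes overlapping boxes contribute nothing, so r2 is the squared
// minimum distance between the boxes (0 if they intersect).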
template <typename T>
__device__ __forceinline__
int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
  const int wid = threadIdx.x % WARPSIZE;
  volatile int blockOffset = 0;
  for (int iblock=0; iblock < n; iblock += WARPSIZE) {
    // Length of this WARPSIZE-wide block
    int blockLen = min(WARPSIZE, n-iblock);
    // Exclusive sum across the warp
    typedef cub::WarpScan<int> WarpScan;
    __shared__ typename WarpScan::TempStorage tempStorage;
    int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
    WarpScan(tempStorage).ExclusiveSum(data, data);
    // Shift this block by the running offset from previous blocks
    data += blockOffset;
    // Last input value, needed to carry the total forward
    int last = (int)sh_in[iblock + blockLen-1];
    if (wid < blockLen) sh_out[iblock + wid] = data;
    blockOffset = sh_out[iblock + blockLen-1] + last;
  }
  return blockOffset;
}
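// Worked example (WARPSIZE = 32, n = 3): sh_in = {2, 5, 1} yields
// sh_out = {0, 2, 7} and a return value of 8 (the total), which a caller can use
// as the offset for the next segment.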
#define TILELISTKERNELNEW_NUM_WARP 4
template<int nthread>
__global__ void calcTileListPosKernel(const int numComputes,
  const CudaComputeRecord* __restrict__ computes,
  const CudaPatchRecord* __restrict__ patches,
  int* __restrict__ tilePos) {

  typedef cub::BlockScan<int, nthread> BlockScan;

  __shared__ typename BlockScan::TempStorage tempStorage;
  __shared__ int shTilePos0;

  if (threadIdx.x == nthread-1) {
    shTilePos0 = 0;
  }

  for (int base=0; base < numComputes; base+=nthread) {
    int k = base + threadIdx.x;

    int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;

    int tilePosVal;
    BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);

    if (k < numComputes) {
      tilePos[k] = shTilePos0 + tilePosVal;
    }

    // Make sure all threads have read shTilePos0 before it is updated
    __syncthreads();
    if (threadIdx.x == nthread-1) {
      shTilePos0 += tilePosVal + numTiles1;
    }
    __syncthreads();
  }

}
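// A single thread block walks all computes in nthread-sized chunks: tilePosVal is
// the exclusive prefix sum within the chunk and shTilePos0 carries the running
// total across chunks, so tilePos[k] ends up as the global index of the first tile
// of compute k. buildTileLists() below instantiates this as calcTileListPosKernel<1024>.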
template<int nthread>
__global__ void updatePatchesKernel(const int numComputes,
  const int* __restrict__ tilePos,
  const CudaComputeRecord* __restrict__ computes,
  const CudaPatchRecord* __restrict__ patches,
  TileList* __restrict__ tileLists) {

  const int tid = threadIdx.x % nthread;

  // nthread threads handle one compute
  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread; k < numComputes; k += blockDim.x*gridDim.x/nthread)
  {
    CudaComputeRecord compute = computes[k];
    float3 offsetXYZ = compute.offsetXYZ;
    int2 patchInd = compute.patchInd;
    int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
    int itileList0 = tilePos[k];
    for (int i=tid; i < numTiles1; i+=nthread) {
      tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
      tileLists[itileList0 + i].patchInd = patchInd;
      tileLists[itileList0 + i].icompute = k;
    }
  }

}
__host__ __device__ __forceinline__
int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
  // Size in bytes: one char flag per possible tile in a list
  return maxTileListLen*sizeof(char);
}
__global__ void buildTileListsBBKernel(const int numTileLists,
  TileList* __restrict__ tileLists,
  const CudaPatchRecord* __restrict__ patches,
  const int* __restrict__ tileListPos,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2, const int maxTileListLen,
  const BoundingBox* __restrict__ boundingBoxes,
  int* __restrict__ tileJatomStart,
  const int tileJatomStartSize,
  unsigned int* __restrict__ tileListDepth,
  int* __restrict__ tileListOrder,
  PatchPairRecord* __restrict__ patchPairs,
  TileListStat* __restrict__ tileListStat) {

  extern __shared__ char sh_buffer[];
  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
  int pos = threadIdx.x*sizePerThread;
  volatile char* sh_tile = (char*)&sh_buffer[pos];

  const int itileList = threadIdx.x + blockDim.x*blockIdx.x;  // (assumed: one list per thread)

  const int wid = threadIdx.x % WARPSIZE;

  float3 offsetXYZ;
  int2 patchInd;
  int icompute;
  int i;
  int itileListLen = 0;
  CudaPatchRecord patch1, patch2;
  int numTiles2 = 0;
  int tileStart1 = 0, tileStart2 = 0;

  if (itileList < numTileLists) {
    offsetXYZ = tileLists[itileList].offsetXYZ;
    patchInd  = tileLists[itileList].patchInd;
    icompute  = tileLists[itileList].icompute;
    // i-column of this tile list within its compute
    i = itileList - tileListPos[icompute];

    float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
    float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
    float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;

    // Set zeroShift flag if the magnitude of the shift is zero
    bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);

    patch1 = patches[patchInd.x];
    patch2 = patches[patchInd.y];
    tileStart1 = patch1.atomStart/WARPSIZE;       // (reconstructed)
    tileStart2 = patch2.atomStart/WARPSIZE;       // (reconstructed)
    numTiles2  = (patch2.numAtoms-1)/WARPSIZE+1;  // (reconstructed)

    // Self-interaction requires that zeroShift is also set
    bool self = zeroShift && (tileStart1 == tileStart2);

    // Load i-tile bounding box and shift it
    BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
    boundingBoxI.x += shx;
    boundingBoxI.y += shy;
    boundingBoxI.z += shz;

    for (int j=0; j < numTiles2; j++) {
      sh_tile[j] = 0;
      if (!self || j >= i) {
        BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
        float r2bb = distsq(boundingBoxI, boundingBoxJ);
        if (r2bb < cutoff2) {
          sh_tile[j] = 1;
          itileListLen++;
        }
      }
    }

    tileListDepth[itileList] = (unsigned int)itileListLen;
  }

  // Warp-wide exclusive sums over the active flags and list lengths
  typedef cub::WarpScan<int> WarpScan;
  __shared__ typename WarpScan::TempStorage tempStorage;
  int active = (itileListLen > 0);
  int activePos;
  WarpScan(tempStorage).ExclusiveSum(active, activePos);
  int itileListPos;
  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);

  int jtileStart, numJtiles;
  // The last lane holds the warp totals; reserve global space once per warp
  if (wid == WARPSIZE-1) {
    atomicAdd(&tileListStat->numTileLists, activePos + active);
    numJtiles = itileListPos + itileListLen;
    jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
  }
  numJtiles  = cub::ShuffleIndex<WARPSIZE>(numJtiles,  WARPSIZE-1, WARP_FULL_MASK);
  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);

  if (jtileStart + numJtiles > tileJatomStartSize) {
    // tileJatomStart out of space => skip and flag for re-allocation
    if (wid == 0) tileListStat->tilesSizeExceeded = true;
  } else {
    int jStart = itileListPos;
    int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
    if (wid == WARPSIZE-1) jEnd = numJtiles;

    if (itileListLen > 0) {
      TileList TLtmp;
      TLtmp.jtileEnd = jtileStart + jEnd - 1;

      int jtile = jtileStart + jStart;
      for (int j=0; j < numTiles2; j++) {
        // (reconstructed sketch: record the j-atom start of every marked tile)
        if (sh_tile[j]) {
          tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
          jtile++;
        }
      }
    }
  }

}
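// The two warp-level exclusive sums serve different purposes: scanning 'active'
// yields each thread's rank among the non-empty tile lists, while scanning
// itileListLen yields each list's offset into the warp's j-tile output. Lane
// WARPSIZE-1 holds both totals and reserves global space with single atomicAdds,
// after which the results are broadcast back to the warp with cub::ShuffleIndex.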
#define REPACKTILELISTSKERNEL_NUM_WARP 32
__global__ void repackTileListsKernel(const int numTileLists, const int begin_bit,
  const int* __restrict__ tileListPos, const int* __restrict__ tileListOrder,
  const int* __restrict__ jtiles,
  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {

  const int wid = threadIdx.x % WARPSIZE;

  // One warp per tile list
  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x; i < numTileLists; i += blockDim.x/WARPSIZE*gridDim.x)
  {
    int j = tileListOrder[i];
    int start = tileListPos[i];
    int end   = tileListPos[i+1]-1;
    if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];

    // Copy the tile list, re-basing its j-tile range to [start, end]
    int startOld   = __ldg(&tileListsSrc[j].jtileStart);
    int endOld     = __ldg(&tileListsSrc[j].jtileEnd);
    int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
    float3 offsetXYZ;
    offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
    offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
    offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
    int2 patchInd = tileListsSrc[j].patchInd;
    int icompute  = __ldg(&tileListsSrc[j].icompute);
    if (wid == 0) {
      TileList tileList;
      tileList.iatomStart = iatomStart;
      tileList.offsetXYZ  = offsetXYZ;
      tileList.jtileStart = start;
      tileList.jtileEnd   = end;
      tileList.patchInd   = patchInd;
      tileList.icompute   = icompute;
      tileListsDst[i] = tileList;
    }

    if (jtiles == NULL) {
      // No jtiles given, a simple copy will do
      int jtile = start;
      for (int jtileOld=startOld; jtileOld <= endOld; jtileOld+=WARPSIZE, jtile+=WARPSIZE) {
        if (jtileOld + wid <= endOld) {
          tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
        }
      }
      if (tileExclsSrc != NULL) {
        int jtile = start;
        for (int jtileOld=startOld; jtileOld <= endOld; jtileOld++, jtile++) {
          tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
        }
      }
    } else {
      int jtile0 = start;
      for (int jtileOld=startOld; jtileOld <= endOld; jtileOld+=WARPSIZE) {
        int t = jtileOld + wid;
        // (reconstructed: extract the 16-bit usage flag for this j-tile)
        int jtile = (t <= endOld) ? ((jtiles[t] >> begin_bit) & 65535) : 0;

        typedef cub::WarpScan<int> WarpScan;
        __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
        int warpId = threadIdx.x / WARPSIZE;
        int jtilePos;
        WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);

        if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);

        if (tileExclsSrc != NULL) {
          unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
          while (b != 0) {
            // k is the lane that holds the next active j-tile
            int k = __ffs(b) - 1;
            tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
            b ^= ((unsigned int)1 << k);
            jtile0++;
          }
        }
      }
    }
  }

}
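// WARP_BALLOT collects a bitmask of the lanes holding an active j-tile; __ffs walks
// that mask from the lowest set bit, so the whole warp copies the exclusion masks
// of the active tiles one by one, in lane order, into consecutive destination slots.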
#define SORTTILELISTSKERNEL_NUM_THREAD 512
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
template <typename keyT, typename valT, bool ascend>
__global__ void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
  const int begin_bit, const int end_bit, const keyT oobKey,
  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {

  typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
    SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
    SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
    SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;

  // The three collectives share one shared-memory allocation
  __shared__ union {
    typename BlockLoad::TempStorage load;
    typename BlockLoadU::TempStorage loadU;
    typename BlockRadixSort::TempStorage sort;
  } tempStorage;

  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];

  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
  __syncthreads();
  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
  __syncthreads();

  if (ascend)
    BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
  else
    BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);

  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst,
    values, numTileListsDst);

}
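// One block of SORTTILELISTSKERNEL_NUM_THREAD (512) threads holding
// SORTTILELISTSKERNEL_ITEMS_PER_THREAD (22) items each can sort up to 11264 tile
// lists without a device-wide pass; the dispatch code in sortTileLists() below
// compares numTileListsSrc against this product and otherwise falls back to
// cub::DeviceRadixSort.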
__global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < numTileLists; i += blockDim.x*gridDim.x)
  {
    int j = tileListOrder[i];
    tileListDepthDst[i] = tileListDepthSrc[j];
  }

}
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
  unsigned int* __restrict__ tileListDepth) {

  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x; itileList < numTileLists; itileList += blockDim.x*gridDim.x)
  {
    unsigned int a = tileListDepth[itileList];
    a >>= begin_bit;
    a &= 65535;
    tileListDepth[itileList] = a;
  }

}
CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
  deviceID(deviceID), doStreaming(doStreaming) {

  cudaComputesSize = 0;

  patchNumLists = NULL;
  patchNumListsSize = 0;

  emptyPatchesSize = 0;
  h_emptyPatches = NULL;
  h_emptyPatchesSize = 0;

  tileJatomStart1 = NULL;
  tileJatomStart1Size = 0;
  tileJatomStart2 = NULL;
  tileJatomStart2Size = 0;

  boundingBoxes = NULL;
  boundingBoxesSize = 0;

  tileListDepth1 = NULL;
  tileListDepth1Size = 0;
  tileListDepth2 = NULL;
  tileListDepth2Size = 0;

  tileListOrder1 = NULL;
  tileListOrder1Size = 0;
  tileListOrder2 = NULL;
  tileListOrder2Size = 0;

  allocate_device<TileListStat>(&d_tileListStat, 1);
  allocate_host<TileListStat>(&h_tileListStat, 1);

  tileListsGBIS = NULL;
  tileListsGBISSize = 0;

  tileJatomStartGBIS = NULL;
  tileJatomStartGBISSize = 0;

  tileListVirialEnergy = NULL;
  tileListVirialEnergySize = 0;

  numTileListsGBIS = 0;

  doOutputOrder = false;

  minmaxListLen = NULL;
  minmaxListLenSize = 0;

  cudaCheck(cudaEventCreate(&tileListStatEvent));
  tileListStatEventRecord = false;
}
CudaTileListKernel::~CudaTileListKernel() {
  deallocate_device<TileListStat>(&d_tileListStat);
  deallocate_host<TileListStat>(&h_tileListStat);

  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);

  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
  if (jtiles != NULL) deallocate_device<int>(&jtiles);
  if (tilePos != NULL) deallocate_device<int>(&tilePos);

  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);

  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);

  if (xyzq != NULL) deallocate_device<float4>(&xyzq);

  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);

  cudaCheck(cudaEventDestroy(tileListStatEvent));
}
void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
  clear_device_array<int>(jtiles, numJtiles, stream);
}

void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
}

void CudaTileListKernel::finishTileList(cudaStream_t stream) {
  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
  tileListStatEventRecord = true;
}
void CudaTileListKernel::updateComputes(const int numComputesIn,
  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {

  numComputes = numComputesIn;

  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);

  if (doStreaming) doOutputOrder = true;
}
void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
  const TileList* d_tileLists, cudaStream_t stream) {

  TileList* h_tileLists = new TileList[numTileLists];
  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE* handle = fopen(filename, "wt");
  for (int itileList=0; itileList < numTileLists; itileList++) {
    TileList tmp = h_tileLists[itileList];
    // (argument list reconstructed to match the format string; field choice assumed)
    fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
      tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd,
      tmp.offsetXYZ.x, tmp.offsetXYZ.y, tmp.offsetXYZ.z,
      tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
  }
  fclose(handle);
  delete [] h_tileLists;
}
void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
  const int* d_tileJatomStart, cudaStream_t stream) {

  int* h_tileJatomStart = new int[numJtiles];
  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE* handle = fopen(filename, "wt");
  for (int i=0; i < numJtiles; i++) {
    fprintf(handle, "%d\n", h_tileJatomStart[i]);
  }
  fclose(handle);
  delete [] h_tileJatomStart;
}
void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
  const float3 lata, const float3 latb, const float3 latc,
  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
  const float plcutoff2In, const size_t maxShmemPerBlock,
  cudaStream_t stream) {

  numPatches = numPatchesIn;
  atomStorageSize = atomStorageSizeIn;
  maxTileListLen = maxTileListLenIn;
  plcutoff2 = plcutoff2In;

  // Re-allocate patchNumLists and empty-patch bookkeeping
  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);

  // Re-allocate (tileLists1, patchPairs1)
  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);

  // Copy cudaPatches to the device
  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);

  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);

  // Calculate tile list positions (tilePos)
  calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
    (numComputes, cudaComputes, cudaPatches, tilePos);
  cudaCheck(cudaGetLastError());

  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
  updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
    (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
  cudaCheck(cudaGetLastError());

  // Re-allocate tileListDepth and tileListOrder
  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);

  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);

  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);

  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);

  // Build bounding boxes, one per WARPSIZE atoms
  int numBoundingBoxes = atomStorageSize/WARPSIZE;
  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);

  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
  cudaCheck(cudaGetLastError());

  // Shared memory requirement of buildTileListsBBKernel (sizing reconstructed)
  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
  if (shmem_size > maxShmemPerBlock) {
    NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
  }
  // Retry once if the j-tile buffer overflows (loop structure reconstructed)
  int reallocCount = 0;
  while (1) {
    reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);

    // Build the tile lists from bounding boxes
    buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
      (numTileListsPrev, tileLists1, cudaPatches, tilePos,
      lata, latb, latc, plcutoff2, maxTileListLen,
      boundingBoxes, tileJatomStart1, tileJatomStart1Size,
      tileListDepth1, tileListOrder1, patchPairs1,
      d_tileListStat);
    cudaCheck(cudaGetLastError());

    // Get (numTileLists, numJtiles)
    copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    numJtiles = h_tileListStat->numJtiles;

    if (h_tileListStat->tilesSizeExceeded) {
      reallocCount++;
      if (reallocCount > 1) {
        NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
      }
    } else {
      break;
    }
  }

  numTileLists = h_tileListStat->numTileLists;

  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);

  // Re-allocate tileListVirialEnergy
  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);

  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);

  int numTileListsSrc = numTileListsPrev;
  int numJtilesSrc    = numJtiles;
  int numTileListsDst = numTileLists;
  int numJtilesDst    = numJtiles;

  // Sort tile lists
  sortTileLists(false, 0, false,   // (flag arguments reconstructed)
    numTileListsSrc, numJtilesSrc,
    PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
    PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
    PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
    numTileListsDst, numJtilesDst,
    PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
    PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
    PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
    stream);

  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);

}
int ilog2(int a) {
  int k = 1;
  while (a >>= 1) k++;
  return k;
}
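// Note that ilog2 returns the bit width, floor(log2(a)) + 1, rather than the plain
// logarithm: e.g. ilog2(16) = 5. Callers use it as an end_bit for radix sorting,
// so covering the highest set bit is what matters.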
void CudaTileListKernel::sortTileLists(
  const bool useJtiles,
  const int begin_bit, const bool highDepthBitsSetIn,
  // Source
  const int numTileListsSrc, const int numJtilesSrc,
  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
  // Destination
  const int numTileListsDst, const int numJtilesDst,
  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
  cudaStream_t stream) {

  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);

  // Check that the allocated sizes are sufficient
  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
    numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
    (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
    (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
    NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");

  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
    numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
    (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
    (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
    NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");

  if (begin_bit != 0 && begin_bit != 16)
    NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");

  // Number of bits needed to cover the longest tile list
  int num_bit = ilog2(maxTileListLen);
  if (num_bit > 16)
    NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
  int end_bit = begin_bit + num_bit;
  if (doOutputOrder && useJtiles) {
    // outputOrder is available: reduce depths to 0/1 flags, in output order
    bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
      (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
    cudaCheck(cudaGetLastError());

    // Compute positions of the non-empty tile lists with an exclusive prefix sum
    {
      size_t size = 0;
      cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
        (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
      // Make sure tempStorage never stays empty
      if (size == 0) size = 128;
      reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
      size = tempStorageSize;
      cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
        (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
    }
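    // CUB device-wide primitives follow a two-phase pattern: a first call with a
    // NULL temp-storage pointer only writes the required byte count into 'size';
    // the buffer is then (re)allocated and the same call is repeated to do the
    // actual work. The DeviceRadixSort::SortPairs calls below use the same pattern.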
    // Store in reverse order from the scanned positions
    storeInReverse <<< nblock, nthread, 0, stream >>>
      (numTileListsSrc, begin_bit, outputOrder, tileListPos,
      tileListOrderSrc.ptr, tileListDepthSrc.ptr,
      tileListOrderDst.ptr, tileListDepthDst.ptr);
    cudaCheck(cudaGetLastError());

    // Build the sortKeys table and its per-compute (min, max) list lengths
    maxTileListLen_sortKeys = maxTileListLen;

    reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
    clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);

    reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);

    initMinMaxListLen <<< nblock, nthread, 0, stream >>>
      (numComputes, maxTileListLen, minmaxListLen);
    cudaCheck(cudaGetLastError());

    buildSortKeys <<< nblock, nthread, 0, stream >>>
      (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
      tileListDepthDst.ptr, minmaxListLen, sortKeys);
    cudaCheck(cudaGetLastError());

    sortKeys_endbit = ilog2(numTileListsDst);

    // Fill in the missing sortKeys entries
    fillSortKeys <<< nblock, nthread, 0, stream >>>
      (numComputes, maxTileListLen, minmaxListLen, sortKeys);
    cudaCheck(cudaGetLastError());

    doOutputOrder = false;

  } else if (doOutputOrder) {
    // Remove zero-depth lists: keys are 0/1 flags built from the depths
    int endbit_tmp = ilog2(numTileListsSrc);

    buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
      (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
    cudaCheck(cudaGetLastError());

    if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
    {
      // Small enough to sort in a single thread block
      // (leading kernel arguments reconstructed; oobKey assumed to be numTileListsSrc)
      sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
        (numTileListsSrc, numTileListsDst, 0, endbit_tmp, (unsigned int)numTileListsSrc,
        sortKeySrc, sortKeyDst,
        tileListOrderSrc.ptr, tileListOrderDst.ptr);
      cudaCheck(cudaGetLastError());
    } else {
      // Device-wide sort
      size_t size = 0;
      cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
        sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
        numTileListsSrc, 0, endbit_tmp, stream));
      if (size == 0) size = 128;
      reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
      size = tempStorageSize;
      cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
        sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
        numTileListsSrc, 0, endbit_tmp, stream));
    }

    // Re-order tileListDepth
    reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
      (numTileListsSrc, tileListOrderDst.ptr,
      tileListDepthSrc.ptr, tileListDepthDst.ptr);
    cudaCheck(cudaGetLastError());
  } else if (doStreaming) {   // (branch condition reconstructed)

    if (sortKeys_endbit <= 0)
      NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");

    // Build sort keys from the pre-computed sortKeys table
    setupSortKey <<< nblock, nthread, 0, stream >>>
      (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr,
      begin_bit, sortKeys, sortKeySrc);
    cudaCheck(cudaGetLastError());

    if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
    {
      // Small enough to sort in a single thread block; out-of-bounds lanes get the
      // largest possible key so they end up last
      unsigned int oobKey = (2 << sortKeys_endbit) - 1;
      sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
        (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey,
        sortKeySrc, sortKeyDst,
        tileListOrderSrc.ptr, tileListOrderDst.ptr);
      cudaCheck(cudaGetLastError());
    } else {
      size_t size = 0;
      cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
        sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
        numTileListsSrc, 0, sortKeys_endbit, stream));
      if (size == 0) size = 128;
      reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
      size = tempStorageSize;
      cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
        sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
        numTileListsSrc, 0, sortKeys_endbit, stream));
    }

    // Re-order tileListDepth
    reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
      (numTileListsSrc, tileListOrderDst.ptr,
      tileListDepthSrc.ptr, tileListDepthDst.ptr);
    cudaCheck(cudaGetLastError());

    // Local sort of the depths within 32-element windows
    localSort<32> <<< nblock, nthread, 0, stream >>>
      (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
    cudaCheck(cudaGetLastError());

    // localSort writes back the down-shifted 16-bit depths
    doShiftDown = false;
  } else {
    // Default path: sort tile lists by depth, descending
    if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
    {
      // Small enough to sort in a single thread block
      // (leading kernel arguments reconstructed; oobKey 0 sorts last in a descending sort)
      sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
        (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0U,
        tileListDepthSrc.ptr, tileListDepthDst.ptr,
        tileListOrderSrc.ptr, tileListOrderDst.ptr);
      cudaCheck(cudaGetLastError());
    } else {
      size_t size = 0;
      cudaCheck(cub::DeviceRadixSort::SortPairsDescending(NULL, size,
        tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
        numTileListsSrc, begin_bit, end_bit, stream));
      if (size == 0) size = 128;
      reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
      size = tempStorageSize;
      cudaCheck(cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
        tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
        numTileListsSrc, begin_bit, end_bit, stream));
    }
  }
  // Shift and mask out the depth bits, if needed
  if (doShiftDown) {
    bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
      (numTileListsDst, begin_bit, tileListDepthDst.ptr);
    cudaCheck(cudaGetLastError());
  }

  // +1 to hold the total produced by the exclusive scan
  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);

  // Calculate tileListPos (exclusive prefix sum of depths)
  {
    size_t size = 0;
    cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
      (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
    if (size == 0) size = 128;
    reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
    size = tempStorageSize;
    cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
      (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
  }

  // Repack (reorder and compact) the tile lists
  repackTileListsKernel <<< nblock, nthread, 0, stream >>>
    (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
    (useJtiles) ? jtiles : NULL,
    tileListsSrc.ptr, tileListsDst.ptr,
    patchPairsSrc.ptr, patchPairsDst.ptr,
    tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
    tileExclsSrc.ptr, tileExclsDst.ptr);
  cudaCheck(cudaGetLastError());
  // If streaming, determine which patches ended up with empty tile lists
  if (doStreaming) {
    clear_device_array<int>(patchNumLists, numPatches, stream);

    // Count the number of tile lists per patch
    calcPatchNumLists <<< nblock, nthread, 0, stream >>>
      (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
    cudaCheck(cudaGetLastError());

    // The last entry of emptyPatches[] serves as the empty-patch counter
    clear_device_array<int>(&emptyPatches[numPatches], 1, stream);

    setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
      (numTileListsDst, tileListsDst.ptr, patchNumLists,
      numPatches, &emptyPatches[numPatches], emptyPatches);
    cudaCheck(cudaGetLastError());

    copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    numEmptyPatches = h_emptyPatches[numPatches];
  }

}
void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
  // Store the previous number of tile lists
  int numTileListsPrev = numTileLists;

  // Wait for finishTileList() to finish copying and updating numTileLists
  if (!tileListStatEventRecord)
    NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
  cudaCheck(cudaEventSynchronize(tileListStatEvent));

  // Sort tile lists
  sortTileLists(true, 0, true,
    numTileListsPrev, numJtiles,
    PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
    PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
    PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
    numTileLists, numJtiles,
    PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
    PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
    PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
    stream);
  if (doGBIS) {
    // Re-allocate the GBIS tile lists and sort them using the high depth bits
    reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
    reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);

    sortTileLists(true, 16, true,
      numTileListsPrev, numJtiles,
      PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
      PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
      PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
      numTileListsGBIS, numJtiles,
      PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
      PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
      PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
      stream);
  }

}
void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
  if (len > tileListVirialEnergySize) {
    NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
  }
  tileListVirialEnergyLength = len;
}

void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
  if (len > tileListVirialEnergySize) {
    NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
  }
  tileListVirialEnergyGBISLength = len;
}