NAMD
CudaTileListKernel.cu
1 // #include <map>
2 // #include <algorithm>
3 #include <cuda.h>
4 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <cub/device/device_radix_sort.cuh>
6 #include <cub/device/device_scan.cuh>
7 #include <cub/cub.cuh>
8 #else
9 #include <namd_cub/device/device_radix_sort.cuh>
10 #include <namd_cub/device/device_scan.cuh>
11 #include <namd_cub/cub.cuh>
12 #endif
13 #include "CudaUtils.h"
14 #include "CudaTileListKernel.h"
15 #include "DeviceCUDA.h"
16 #ifdef WIN32
17 #define __thread __declspec(thread)
18 #endif
19 extern __thread DeviceCUDA *deviceCUDA;
20 
21 #define OVERALLOC 1.2f
22 
23 #if __CUDA_ARCH__ < 350
24 #define __ldg *
25 #endif
26 
27 void NAMD_die(const char *);
28 
29 //
30 // Calculate the number of lists that contribute to each patch
31 //
32 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
33  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
34 
35  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
36  {
37  int2 patchInd = tileLists[i].patchInd;
38  atomicAdd(&patchNumLists[patchInd.x], 1);
39  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
40  }
41 
42 }
43 
44 //
45 // Write patchNumList back to tile list and
46 // Store indices of empty patches in emptyPatches[0 ... numEmptyPatches-1]
47 //
48 __global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
49  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
50  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
51 
52  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
53  {
54  int2 patchInd = tileLists[i].patchInd;
55  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
56  tileLists[i].patchNumList = patchNumList;
57  }
58 
59  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
60  {
61  if (patchNumLists[i] == 0) {
62  int ind = atomicAdd(numEmptyPatches, 1);
63  emptyPatches[ind] = i;
64  }
65  }
66 
67 }
68 
69 //
70 // Builds a sort key that removes zeros but keeps the order otherwise the same
71 //
72 __global__ void buildRemoveZerosSortKey(const int numTileLists,
73  const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
74 
75  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
76  {
77  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
78  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
79  }
80 
81 }
82 
83 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
84  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
85  const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
86 
87  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
88  {
89  int icompute = tileLists[itileList].icompute;
90  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
91  int i = icompute*maxTileListLen + (depth - 1);
92  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
93  }
94 
95 }
96 
97 template <int width>
98 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
99  unsigned int* __restrict__ keys, int* __restrict__ vals) {
100 
101  // NOTE: blockDim.x = width
102 
103  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
104  {
105  int i = base + threadIdx.x;
106  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
107  __shared__ typename BlockRadixSort::TempStorage tempStorage;
108  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
109  int val[1] = {(i < n) ? vals[i] : 0};
110  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
111  if (i < n) {
112  keys[i] = key[0];
113  vals[i] = val[0];
114  }
115  BLOCK_SYNC;
116  }
117 
118 }
119 
120 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
121  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
122  const int* __restrict__ tileListOrderSrc,
123  const unsigned int* __restrict__ tileListDepthSrc,
124  int* __restrict__ tileListOrderDst,
125  unsigned int* __restrict__ tileListDepthDst) {
126 
127  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
128  {
129  int j = outputOrder[numTileListsSrc - i - 1];
130  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
131  int k = tileListPos[i];
132  tileListDepthDst[k] = tileListDepthSrc[j];
133  tileListOrderDst[k] = j; //tileListOrderSrc[j];
134  }
135  }
136 }
137 
138 //
139 // Convert tileListDepth into a 0/1 flag (non-zero depth = 1), reading lists in reverse outputOrder
140 //
141 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
142  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
143  unsigned int* __restrict__ tileListDepthDst) {
144 
145  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
146  {
147  int j = outputOrder[numTileLists - i - 1];
148  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
149  }
150 
151 }
152 
153 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
154  int2* __restrict__ minmaxListLen) {
155 
156  int2 val;
157  val.x = maxTileListLen+1;
158  val.y = 0;
159  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
160  {
161  minmaxListLen[i] = val;
162  }
163 
164 }
165 
166 //
167 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
168 //
169 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
170  const TileList* __restrict__ tileListsSrc,
171  const int* __restrict__ tileListOrderDst,
172  const unsigned int* __restrict__ tileListDepthDst,
173  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
174 
175  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
176  {
177  int k = tileListOrderDst[i];
178  int icompute = tileListsSrc[k].icompute;
179  int depth = tileListDepthDst[i] & 65535;
180  // depth is in range [1 ... maxTileListLen]
181  int j = icompute*maxTileListLen + (depth-1);
182  sortKeys[j] = i;
183  int2 minmax = minmaxListLen[icompute];
184  int2 minmaxOrig = minmax;
185  if (minmax.x > depth) minmax.x = depth;
186  if (minmax.y < depth) minmax.y = depth;
187  if (minmax.x != minmaxOrig.x) {
188  atomicMin(&minmaxListLen[icompute].x, minmax.x);
189  }
190  if (minmax.y != minmaxOrig.y) {
191  atomicMax(&minmaxListLen[icompute].y, minmax.y);
192  }
193  }
194 
195 }
196 
197 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
198  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
199 
200  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
201  const int wid = threadIdx.x % WARPSIZE;
202  int2 minmax = minmaxListLen[i];
203  int minlen = minmax.x;
204  int maxlen = minmax.y;
205  // minlen, maxlen are in range [1 ... maxTileListLen]
206  // as long as compute index i appears as tileListsSrc[].icompute above
207  if ( maxlen < minlen ) {
208  minlen = 1;
209  maxlen = maxTileListLen;
210  }
211  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
212  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
213  unsigned int aveKey = (maxKey + minKey)/2;
214  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
215  sortKeys[i*maxTileListLen + j] = minKey;
216  }
217  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
218  sortKeys[i*maxTileListLen + j] = maxKey;
219  }
220  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
221  if (sortKeys[i*maxTileListLen + j] == 0) {
222  sortKeys[i*maxTileListLen + j] = aveKey;
223  }
224  }
225  }
226 
227 }
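// Illustrative sketch, not part of the original source: a serial host version of the padding
// rule that fillSortKeys() above applies to one compute i. Entries below the shortest observed
// list length get minKey, entries above the longest get maxKey, and any remaining zero entries
// get the average key. The function name is hypothetical.
#if 0
void fillSortKeysHostRef(int i, int maxTileListLen, int2 minmax, unsigned int* sortKeys) {
  int minlen = minmax.x;
  int maxlen = minmax.y;
  if (maxlen < minlen) { minlen = 1; maxlen = maxTileListLen; } // compute had no lists
  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
  unsigned int aveKey = (maxKey + minKey)/2;
  for (int j=0;j < minlen-1;j++) sortKeys[i*maxTileListLen + j] = minKey;
  for (int j=maxlen;j < maxTileListLen;j++) sortKeys[i*maxTileListLen + j] = maxKey;
  for (int j=0;j < maxTileListLen;j++) {
    if (sortKeys[i*maxTileListLen + j] == 0) sortKeys[i*maxTileListLen + j] = aveKey;
  }
}
#endif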
228 
229 //
230 // Calculate bounding boxes for sets of WARPSIZE=32 atoms
231 //
232 #define BOUNDINGBOXKERNEL_NUM_WARP 8
233 __global__ void
234 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
235  BoundingBox* __restrict__ boundingBoxes) {
236 
237  const int warpId = threadIdx.x / WARPSIZE;
238  const int wid = threadIdx.x % WARPSIZE;
239 
240  // Loop with warp-aligned index to avoid warp-divergence
241  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
242  // Full atom index
243  const int i = iwarp + wid;
244  // Bounding box index
245  const int ibb = i/WARPSIZE;
246 
247  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
248 
249  volatile float3 minxyz, maxxyz;
250 
251  typedef cub::WarpReduce<float> WarpReduce;
252  __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
253  minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
254  minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
255  minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
256  maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
257  maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
258  maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
259 
260  if (wid == 0) {
261  BoundingBox boundingBox;
262  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
263  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
264  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
265  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
266  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
267  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
268  boundingBoxes[ibb] = boundingBox;
269  }
270  }
271 
272 }
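// Illustrative sketch, not part of the original source: a serial host version of the warp
// reduction above for one tile ibb (atoms ibb*WARPSIZE ... ibb*WARPSIZE+WARPSIZE-1).
// The box stores the center (x,y,z) and half-widths (wx,wy,wz). The function name is hypothetical.
#if 0
BoundingBox boundingBoxHostRef(const float4* xyzq, int atomStorageSize, int ibb) {
  float3 minxyz = make_float3( FLT_MAX,  FLT_MAX,  FLT_MAX);
  float3 maxxyz = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
  for (int wid=0;wid < WARPSIZE;wid++) {
    // Out-of-range indices are clamped to the last atom, as in the kernel
    float4 a = xyzq[min(atomStorageSize-1, ibb*WARPSIZE + wid)];
    minxyz.x = fminf(minxyz.x, a.x); maxxyz.x = fmaxf(maxxyz.x, a.x);
    minxyz.y = fminf(minxyz.y, a.y); maxxyz.y = fmaxf(maxxyz.y, a.y);
    minxyz.z = fminf(minxyz.z, a.z); maxxyz.z = fmaxf(maxxyz.z, a.z);
  }
  BoundingBox bb;
  bb.x = 0.5f*(minxyz.x + maxxyz.x); bb.wx = 0.5f*(maxxyz.x - minxyz.x);
  bb.y = 0.5f*(minxyz.y + maxxyz.y); bb.wy = 0.5f*(maxxyz.y - minxyz.y);
  bb.z = 0.5f*(minxyz.z + maxxyz.z); bb.wz = 0.5f*(maxxyz.z - minxyz.z);
  return bb;
}
#endif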
273 
274 //
275 // Returns the lower estimate for the distance between two bounding boxes
276 //
277 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
278  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
279  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
280  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
281  float r2 = dx*dx + dy*dy + dz*dz;
282  return r2;
283 }
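// Illustrative note, not part of the original source: distsq() is a lower bound on the squared
// distance between any atom in box a and any atom in box b, so the pairlist test "r2bb < cutoff2"
// can over-include tiles but never misses an interacting pair.
#if 0
// One-dimensional view of the same bound: the gap between the intervals
// [ca-wa, ca+wa] and [cb-wb, cb+wb] (0 if they overlap). The function name is hypothetical.
float gap1D(float ca, float wa, float cb, float wb) {
  return fmaxf(0.0f, fabsf(ca - cb) - wa - wb);
}
#endif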
284 
285 #if 0
286 //
287 // Performs warp-level exclusive sum on a shared memory array:
288 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
289 // sh_in and sh_out can point to same array
290 // Returns the total sum
291 //
292 template <typename T>
293 __device__ __forceinline__
294 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
295  const int wid = threadIdx.x % WARPSIZE;
296  volatile int blockOffset = 0;
297  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
298  // Size of the exclusive sum
299  int blockLen = min(WARPSIZE, n-iblock);
300  // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
301  typedef cub::WarpScan<int> WarpScan;
302  __shared__ typename WarpScan::TempStorage tempStorage;
303  int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
304  WarpScan(tempStorage).ExclusiveSum(data, data);
305  // Shift by block offset
306  data += blockOffset;
307  // Save last value
308  int last = (int)sh_in[iblock + blockLen-1];
309  // Write output
310  if (wid < blockLen) sh_out[iblock + wid] = data;
311  // block offset = last term of the exclusive sum + last value
312  blockOffset = sh_out[iblock + blockLen-1] + last;
313  }
314  return blockOffset;
315 }
316 #endif
317 
318 #define TILELISTKERNELNEW_NUM_WARP 4
319 
320 //
321 // NOTE: Executed on a single thread block
322 //
323 template<int nthread>
324 __global__ void calcTileListPosKernel(const int numComputes,
325  const CudaComputeRecord* __restrict__ computes,
326  const CudaPatchRecord* __restrict__ patches,
327  int* __restrict__ tilePos) {
328 
329  typedef cub::BlockScan<int, nthread> BlockScan;
330 
331  __shared__ typename BlockScan::TempStorage tempStorage;
332  __shared__ int shTilePos0;
333 
334  if (threadIdx.x == nthread-1) {
335  shTilePos0 = 0;
336  }
337 
338  for (int base=0;base < numComputes;base+=nthread) {
339  int k = base + threadIdx.x;
340 
341  int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
342 
343  // Calculate positions in tile list and jtile list
344  int tilePosVal;
345  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
346 
347  // Store into global memory
348  if (k < numComputes) {
349  tilePos[k] = shTilePos0 + tilePosVal;
350  }
351 
352  BLOCK_SYNC;
353  // Store block end position
354  if (threadIdx.x == nthread-1) {
355  shTilePos0 += tilePosVal + numTiles1;
356  }
357  }
358 }
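// Illustrative sketch, not part of the original source: a serial host equivalent of the BlockScan
// loop above; tilePos[k] is the exclusive prefix sum of the number of i-tiles per compute.
// The function name is hypothetical.
#if 0
void calcTileListPosHostRef(const int numComputes, const CudaComputeRecord* computes,
  const CudaPatchRecord* patches, int* tilePos) {
  int pos = 0;
  for (int k=0;k < numComputes;k++) {
    tilePos[k] = pos;
    pos += (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1;
  }
}
#endif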
359 
360 
361 template<int nthread>
362 __global__ void updatePatchesKernel(const int numComputes,
363  const int* __restrict__ tilePos,
364  const CudaComputeRecord* __restrict__ computes,
365  const CudaPatchRecord* __restrict__ patches,
366  TileList* __restrict__ tileLists) {
367 
368  const int tid = threadIdx.x % nthread;
369 
370  // nthread threads take care of one compute
371  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
372  {
373  CudaComputeRecord compute = computes[k];
374  float3 offsetXYZ = compute.offsetXYZ;
375  int2 patchInd = compute.patchInd;
376  int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
377  int itileList0 = tilePos[k];
378  for (int i=tid;i < numTiles1;i+=nthread) {
379  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
380  tileLists[itileList0 + i].patchInd = patchInd;
381  tileLists[itileList0 + i].icompute = k;
382  }
383  }
384 
385 }
386 
387 __host__ __device__ __forceinline__
388 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
389  // Size in bytes
390  int size = (
391  maxTileListLen*sizeof(char)
392  );
393  return size;
394 }
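// Illustrative sketch, not part of the original source: the dynamic shared memory requested when
// launching buildTileListsBBKernel is this per-thread size times the block size, as done in
// CudaTileListKernel::buildTileLists() below; each thread uses its slice as a per-j-tile flag
// array (sh_tile). The function name is hypothetical.
#if 0
size_t buildTileListsBBKernel_shmem_size(const int maxTileListLen, const int nthread) {
  return (size_t)buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
}
#endif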
395 
396 __global__ void
397 buildTileListsBBKernel(const int numTileLists,
398  TileList* __restrict__ tileLists,
399  const CudaPatchRecord* __restrict__ patches,
400  const int* __restrict__ tileListPos,
401  const float3 lata, const float3 latb, const float3 latc,
402  const float cutoff2, const int maxTileListLen,
403  const BoundingBox* __restrict__ boundingBoxes,
404  int* __restrict__ tileJatomStart,
405  const int tileJatomStartSize,
406  unsigned int* __restrict__ tileListDepth,
407  int* __restrict__ tileListOrder,
408  PatchPairRecord* __restrict__ patchPairs,
409  TileListStat* __restrict__ tileListStat) {
410 
411  extern __shared__ char sh_buffer[];
412  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
413  int pos = threadIdx.x*sizePerThread;
414  volatile char* sh_tile = (char*)&sh_buffer[pos];
415 
416  // Loop with warp-aligned index to avoid warp-divergence
417  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
418 
419  // Use one thread per tile list
420  const int wid = threadIdx.x % WARPSIZE;
421  const int itileList = iwarp + wid;
422 
423  int i;
424  int itileListLen = 0;
425  CudaPatchRecord patch1;
426  CudaPatchRecord patch2;
427  float3 offsetXYZ;
428  int2 patchInd;
429  int numTiles2;
430  int icompute;
431 
432  if (itileList < numTileLists) {
433  offsetXYZ = tileLists[itileList].offsetXYZ;
434  patchInd = tileLists[itileList].patchInd;
435  icompute = tileLists[itileList].icompute;
436  // Get i-column
437  i = itileList - tileListPos[icompute];
438 
439  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
440  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
441  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
442 
443  // DH - set zeroShift flag if magnitude of shift vector is zero
444  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
445 
446  // Load patches
447  patch1 = patches[patchInd.x];
448  patch2 = patches[patchInd.y];
449  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
450  numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
451  int tileStart1 = patch1.atomStart/WARPSIZE;
452  int tileStart2 = patch2.atomStart/WARPSIZE;
453 
454  // DH - self requires that zeroShift is also set
455  bool self = zeroShift && (tileStart1 == tileStart2);
456 
457  // Load i-atom data (and shift coordinates)
458  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
459  boundingBoxI.x += shx;
460  boundingBoxI.y += shy;
461  boundingBoxI.z += shz;
462 
463  for (int j=0;j < numTiles2;j++) {
464  sh_tile[j] = 0;
465  if (!self || j >= i) {
466  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
467  float r2bb = distsq(boundingBoxI, boundingBoxJ);
468  if (r2bb < cutoff2) {
469  sh_tile[j] = 1;
470  itileListLen++;
471  }
472  }
473  }
474 
475  tileListDepth[itileList] = (unsigned int)itileListLen;
476  tileListOrder[itileList] = itileList;
477  }
478 
479  typedef cub::WarpScan<int> WarpScan;
480  __shared__ typename WarpScan::TempStorage tempStorage;
481  int active = (itileListLen > 0);
482  int activePos;
483  WarpScan(tempStorage).ExclusiveSum(active, activePos);
484  int itileListPos;
485  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
486 
487  int jtileStart, numJtiles;
488  // Last thread in the warp knows the total number
489  if (wid == WARPSIZE-1) {
490  atomicAdd(&tileListStat->numTileLists, activePos + active);
491  numJtiles = itileListPos + itileListLen;
492  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
493  }
494  numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
495  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
496  if (jtileStart + numJtiles > tileJatomStartSize) {
497  // tileJatomStart out of memory, exit
498  if (wid == 0) tileListStat->tilesSizeExceeded = true;
499  return;
500  }
501 
502  int jStart = itileListPos;
503  int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
504  if (wid == WARPSIZE-1) jEnd = numJtiles;
505 
506  if (itileListLen > 0) {
507  // Setup tileLists[]
508  TileList TLtmp;
509  TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
510  TLtmp.jtileStart = jtileStart + jStart;
511  TLtmp.jtileEnd = jtileStart + jEnd - 1;
512  TLtmp.patchInd = patchInd;
513  TLtmp.offsetXYZ = offsetXYZ;
514  TLtmp.icompute = icompute;
515  // TLtmp.patchNumList.x = 0;
516  // TLtmp.patchNumList.y = 0;
517  tileLists[itileList] = TLtmp;
518  // PatchPair
519  PatchPairRecord patchPair;
520  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
521  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
522  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
523  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
524  patchPairs[itileList] = patchPair;
525 
526  // Write tiles
527  int jtile = jtileStart + jStart;
528  for (int j=0;j < numTiles2;j++) {
529  if (sh_tile[j]) {
530  tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
531  jtile++;
532  }
533  }
534 
535  }
536 
537  }
538 
539 }
540 
541 #define REPACKTILELISTSKERNEL_NUM_WARP 32
542 __global__ void
543 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
544  const int* __restrict__ tileListOrder,
545  const int* __restrict__ jtiles,
546  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
547  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
548  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
549  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
550 
551  const int wid = threadIdx.x % WARPSIZE;
552 
553  // One warp does one tile list
554  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
555  {
556  int j = tileListOrder[i];
557  int start = tileListPos[i];
558  int end = tileListPos[i+1]-1;
559  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
560  // TileList
561  int startOld = __ldg(&tileListsSrc[j].jtileStart);
562  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
563  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
564  float3 offsetXYZ;
565  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
566  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
567  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
568  int2 patchInd = tileListsSrc[j].patchInd;
569  int icompute = __ldg(&tileListsSrc[j].icompute);
570  if (wid == 0) {
571  TileList tileList;
572  tileList.iatomStart = iatomStart;
573  tileList.offsetXYZ = offsetXYZ;
574  tileList.jtileStart = start;
575  tileList.jtileEnd = end;
576  tileList.patchInd = patchInd;
577  tileList.icompute = icompute;
578  tileListsDst[i] = tileList;
579  }
580 
581  if (jtiles == NULL) {
582  // No jtiles, simple copy will do
583  int jtile = start;
584  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
585  if (jtileOld + wid <= endOld) {
586  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
587  }
588  }
589  if (tileExclsSrc != NULL) {
590  int jtile = start;
591  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
592  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
593  }
594  }
595  } else {
596  int jtile0 = start;
597  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
598  int t = jtileOld + wid;
599  int jtile = (t <= endOld) ? jtiles[t] : 0;
600  jtile >>= begin_bit;
601  jtile &= 65535;
602  typedef cub::WarpScan<int> WarpScan;
603  __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
604  int warpId = threadIdx.x / WARPSIZE;
605  int jtilePos;
606  WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
607 
608  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
609 
610  if (tileExclsSrc != NULL) {
611  unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
612  while (b != 0) {
613  // k = index of thread that has data
614  int k = __ffs(b) - 1;
615  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
616  // remove 1 bit and advance jtile0
617  b ^= ((unsigned int)1 << k);
618  jtile0++;
619  }
620  } else {
621  jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
622  }
623  }
624  }
625  }
626 
627 }
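// Illustrative sketch, not part of the original source: a serial host equivalent of the jtiles
// branch above, assuming each masked jtiles entry is a 0/1 flag; surviving j-tile starts are
// compacted to the front (the warp ExclusiveSum provides the destination index in the kernel).
// The function name is hypothetical.
#if 0
int compactJtilesHostRef(int startOld, int endOld, int start, int begin_bit,
  const int* jtiles, const int* tileJatomStartSrc, int* tileJatomStartDst) {
  int dst = start;
  for (int t=startOld;t <= endOld;t++) {
    if ((jtiles[t] >> begin_bit) & 65535) tileJatomStartDst[dst++] = tileJatomStartSrc[t];
  }
  return dst; // one past the last written j-tile
}
#endif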
628 
629 //
630 // NOTE: Executed on a single thread block
631 // oobKey = out-of-bounds key value
632 //
633 #define SORTTILELISTSKERNEL_NUM_THREAD 512
634 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
635 template <typename keyT, typename valT, bool ascend>
636 __launch_bounds__(SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
637 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
638  const int begin_bit, const int end_bit, const keyT oobKey,
639  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
640  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
641 
643  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
644 
645  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
646  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
647 
648  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
649  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;
650 
651  __shared__ union {
652  typename BlockLoad::TempStorage load;
653  typename BlockLoadU::TempStorage loadU;
654  typename BlockRadixSort::TempStorage sort;
655  } tempStorage;
656 
657  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
658  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
659 
660  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
661  BLOCK_SYNC;
662  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
663  BLOCK_SYNC;
664 
665  if (ascend)
666  BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
667  else
668  BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
669 
670  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
671  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
672 }
673 
674 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
675  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
676 
677  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
678  {
679  int j = tileListOrder[i];
680  tileListDepthDst[i] = tileListDepthSrc[j];
681  }
682 
683 }
684 
685 //
686 // Bit shift tileListDepth so that only lower 16 bits are used
687 //
688 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
689  unsigned int* __restrict__ tileListDepth) {
690 
691  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
692  {
693  unsigned int a = tileListDepth[itileList];
694  a >>= begin_bit;
695  a &= 65535;
696  tileListDepth[itileList] = a;
697  }
698 
699 }
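// Illustrative note, not part of the original source: tileListDepth stores the list depth in a
// 16-bit field; begin_bit (0 or 16, see sortTileLists) selects which half of the word the kernels
// above read before masking with 65535 (0xffff). The function name is hypothetical.
#if 0
unsigned int readDepth(const unsigned int packedDepth, const int begin_bit) {
  return (packedDepth >> begin_bit) & 65535; // begin_bit = 0 or 16
}
#endif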
700 
701 // ##############################################################################################
702 // ##############################################################################################
703 // ##############################################################################################
704 
705 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
706 deviceID(deviceID), doStreaming(doStreaming) {
707 
708  cudaCheck(cudaSetDevice(deviceID));
709 
710  activeBuffer = 1;
711 
712  numPatches = 0;
713  numComputes = 0;
714 
715  cudaPatches = NULL;
716  cudaPatchesSize = 0;
717 
718  cudaComputes = NULL;
719  cudaComputesSize = 0;
720 
721  patchNumLists = NULL;
722  patchNumListsSize = 0;
723 
724  emptyPatches = NULL;
725  emptyPatchesSize = 0;
726  h_emptyPatches = NULL;
727  h_emptyPatchesSize = 0;
728  numEmptyPatches = 0;
729 
730  sortKeySrc = NULL;
731  sortKeySrcSize = 0;
732  sortKeyDst = NULL;
733  sortKeyDstSize = 0;
734 
735  tileLists1 = NULL;
736  tileLists1Size = 0;
737  tileLists2 = NULL;
738  tileLists2Size = 0;
739 
740  patchPairs1 = NULL;
741  patchPairs1Size = 0;
742  patchPairs2 = NULL;
743  patchPairs2Size = 0;
744 
745  tileJatomStart1 = NULL;
746  tileJatomStart1Size = 0;
747  tileJatomStart2 = NULL;
748  tileJatomStart2Size = 0;
749 
750  boundingBoxes = NULL;
751  boundingBoxesSize = 0;
752 
753  tileListDepth1 = NULL;
754  tileListDepth1Size = 0;
755  tileListDepth2 = NULL;
756  tileListDepth2Size = 0;
757 
758  tileListOrder1 = NULL;
759  tileListOrder1Size = 0;
760  tileListOrder2 = NULL;
761  tileListOrder2Size = 0;
762 
763  tileExcls1 = NULL;
764  tileExcls1Size = 0;
765  tileExcls2 = NULL;
766  tileExcls2Size = 0;
767 
768  xyzq = NULL;
769  xyzqSize = 0;
770 
771  allocate_device<TileListStat>(&d_tileListStat, 1);
772  allocate_host<TileListStat>(&h_tileListStat, 1);
773 
774  tileListPos = NULL;
775  tileListPosSize = 0;
776  tempStorage = NULL;
777  tempStorageSize = 0;
778 
779  jtiles = NULL;
780  jtilesSize = 0;
781 
782  tilePos = NULL;
783  tilePosSize = 0;
784 
785  tileListsGBIS = NULL;
786  tileListsGBISSize = 0;
787 
788  tileJatomStartGBIS = NULL;
789  tileJatomStartGBISSize = 0;
790 
791  tileListVirialEnergy = NULL;
792  tileListVirialEnergySize = 0;
793 
794  atomStorageSize = 0;
795  numTileLists = 0;
796  numTileListsGBIS = 0;
797  numJtiles = 1;
798 
799  outputOrder = NULL;
800  outputOrderSize = 0;
801  doOutputOrder = false;
802 
803  minmaxListLen = NULL;
804  minmaxListLenSize = 0;
805 
806  sortKeys = NULL;
807  sortKeysSize = 0;
808  sortKeys_endbit = 0;
809 
810  cudaCheck(cudaEventCreate(&tileListStatEvent));
811  tileListStatEventRecord = false;
812 }
813 
814 CudaTileListKernel::~CudaTileListKernel() {
815  cudaCheck(cudaSetDevice(deviceID));
816  deallocate_device<TileListStat>(&d_tileListStat);
817  deallocate_host<TileListStat>(&h_tileListStat);
818  //
819  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
820  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
821  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
822  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
823  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
824  //
825  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
826  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
827  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
828  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
829  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
830  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
831  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
832  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
833  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
834  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
835  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
836  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
837  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
838  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
839  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
840  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
841  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
842  if (jtiles != NULL) deallocate_device<int>(&jtiles);
843  if (tilePos != NULL) deallocate_device<int>(&tilePos);
844 
845  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
846  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
847 
848  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
849 
850  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
851 
852  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
853  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
854 
855  cudaCheck(cudaEventDestroy(tileListStatEvent));
856 }
857 
858 void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
859  clear_device_array<int>(jtiles, numJtiles, stream);
860 }
861 
862 void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
863  // Clear tileListStat; patchReadyQueueCount is set equal to the number of empty patches
864  memset(h_tileListStat, 0, sizeof(TileListStat));
865  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
866  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
867 }
868 
869 void CudaTileListKernel::finishTileList(cudaStream_t stream) {
870  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
871  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
872  tileListStatEventRecord = true;
873 }
874 
875 void CudaTileListKernel::updateComputes(const int numComputesIn,
876  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
877 
878  numComputes = numComputesIn;
879 
880  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
881  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
882 
883  if (doStreaming) doOutputOrder = true;
884 }
885 
886 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
887  const TileList* d_tileLists, cudaStream_t stream) {
888 
889  TileList* h_tileLists = new TileList[numTileLists];
890  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
891  cudaCheck(cudaStreamSynchronize(stream));
892  FILE* handle = fopen(filename,"wt");
893  for (int itileList=0;itileList < numTileLists;itileList++) {
894  TileList tmp = h_tileLists[itileList];
895  fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
896  tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
897  tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
898  }
899  fclose(handle);
900  delete [] h_tileLists;
901 }
902 
903 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
904  const int* d_tileJatomStart, cudaStream_t stream) {
905 
906  int* h_tileJatomStart = new int[numJtiles];
907  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
908  cudaCheck(cudaStreamSynchronize(stream));
909  FILE* handle = fopen(filename,"wt");
910  for (int i=0;i < numJtiles;i++) {
911  fprintf(handle, "%d\n", h_tileJatomStart[i]);
912  }
913  fclose(handle);
914  delete [] h_tileJatomStart;
915 }
916 
917 /*
918 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
919 {
920  return std::pair<int, int>(p.second, p.first);
921 }
922 
923 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
924  const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
925 
926  const int shCacheSize = 10;
927  TileList* h_tileLists = new TileList[numTileLists];
928  int* h_tileJatomStart = new int[numJtiles];
929  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
930  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
931  cudaCheck(cudaStreamSynchronize(stream));
932  int ntotal = 0;
933  int ncache = 0;
934  for (int i=0;i < numTileLists;i+=width) {
935  int jend = min(i + width, numTileLists);
936  std::map<int, int> atomStartMap;
937  std::map<int, int>::iterator it;
938  atomStartMap.clear();
939  for (int j=i;j < jend;j++) {
940  TileList tmp = h_tileLists[j];
941  int iatomStart = tmp.iatomStart;
942  it = atomStartMap.find(iatomStart);
943  if (it == atomStartMap.end()) {
944  // Insert new
945  atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
946  } else {
947  // Increase counter
948  it->second--;
949  }
950  int jtileStart = tmp.jtileStart;
951  int jtileEnd = tmp.jtileEnd;
952  ntotal += (jtileEnd - jtileStart + 1) + 1;
953  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
954  int jatomStart = h_tileJatomStart[jtile];
955  it = atomStartMap.find(jatomStart);
956  if (it == atomStartMap.end()) {
957  // Insert new
958  atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
959  } else {
960  // Increase counter
961  it->second--;
962  }
963  jatomStart |= (65535 << 16);
964  h_tileJatomStart[jtile] = jatomStart;
965  }
966  iatomStart |= (65535 << 16);
967  tmp.iatomStart = iatomStart;
968  h_tileLists[j] = tmp;
969  }
970  ncache += atomStartMap.size();
971  std::multimap<int, int> imap;
972  imap.clear();
973  std::multimap<int, int>::iterator imap_it;
974  std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
975  if (i < 400) {
976  printf("%d %d\n", ntotal, imap.size());
977  for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
978  if (imap_it->first != 0)
979  printf("(%d %d)\n", imap_it->first, imap_it->second);
980  }
981  }
982  }
983  printf("ntotal %d ncache %d\n", ntotal, ncache);
984  copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
985  copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
986  cudaCheck(cudaStreamSynchronize(stream));
987  delete [] h_tileLists;
988  delete [] h_tileJatomStart;
989 }
990 */
991 
992 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
993  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
994  const float3 lata, const float3 latb, const float3 latc,
995  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
996  const float plcutoff2In, const size_t maxShmemPerBlock,
997  cudaStream_t stream) {
998 
999  numPatches = numPatchesIn;
1000  atomStorageSize = atomStorageSizeIn;
1001  maxTileListLen = maxTileListLenIn;
1002  plcutoff2 = plcutoff2In;
1003 
1004  if (doStreaming) {
1005  // Re-allocate patchNumLists
1006  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1007  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1008  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1009  }
1010 
1011  // Re-allocate tileLists1 and patchPairs1
1012  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1013  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1014 
1015  // Copy cudaPatches to device
1016  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1017  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1018 
1019  // Re-allocate temporary storage
1020  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1021  // Calculate tile list positions (tilePos)
1022  {
1023  int nthread = 1024;
1024  int nblock = 1;
1025  calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
1026  (numComputes, cudaComputes, cudaPatches, tilePos);
1027  cudaCheck(cudaGetLastError());
1028  }
1029 
1030  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1031  {
1032  int nthread = 512;
1033  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/32)+1);
1034  updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
1035  (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1036  cudaCheck(cudaGetLastError());
1037  }
1038 
1039  // ---------------------------------------------------------------------------------------------
1040 
1041 
1042  // NOTE: tileListDepth2 and tileListOrder2 must have at least the same size as
1043  // tileListDepth1 and tileListOrder1 since they're used in sorting
1044  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1045  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1046 
1047  // Allocate with +1 to include last term in the exclusive sum
1048  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1049 
1050  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1051 
1052  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1053 
1054  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1055 
1056  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1057  {
1058  int numBoundingBoxes = atomStorageSize/WARPSIZE;
1059  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1060 
1061  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1062  int nthread = WARPSIZE*nwarp;
1063  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1064  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1065  cudaCheck(cudaGetLastError());
1066  }
1067 
1068  {
1069  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1070  int nthread = WARPSIZE*nwarp;
1071  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1072 
1073  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1074  if(shmem_size > maxShmemPerBlock){
1075  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1076  }
1077 
1078  // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1079  // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1080  // re-allocation only happens when the size is exceeded.
1081  h_tileListStat->tilesSizeExceeded = true;
1082  int reallocCount = 0;
1083  while (h_tileListStat->tilesSizeExceeded) {
1084  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1085 
1086  clearTileListStat(stream);
1087  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1088 
1089  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1090  (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1091  lata, latb, latc, plcutoff2, maxTileListLen,
1092  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1093  tileListDepth1, tileListOrder1, patchPairs1,
1094  d_tileListStat);
1095 
1096  cudaCheck(cudaGetLastError());
1097 
1098  // get (numTileLists, numJtiles, tilesSizeExceeded)
1099  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1100  cudaCheck(cudaStreamSynchronize(stream));
1101  numJtiles = h_tileListStat->numJtiles;
1102 
1103  if (h_tileListStat->tilesSizeExceeded) {
1104  reallocCount++;
1105  if (reallocCount > 1) {
1106  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1107  }
1108  }
1109 
1110  }
1111 
1112  numTileLists = h_tileListStat->numTileLists;
1113 
1114  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1115  }
1116 
1117  // Re-allocate tileListVirialEnergy.
1118  // NOTE: Since numTileLists here is an upper estimate (it's based on bounding boxes),
1119  // we're guaranteed to have enough space
1120  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1121 
1122  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1123  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1124  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1125  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1126  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1127 
1128  int numTileListsSrc = numTileListsPrev;
1129  int numJtilesSrc = numJtiles;
1130  int numTileListsDst = numTileLists;
1131  int numJtilesDst = numJtiles;
1132 
1133  // Sort tiles
1134  sortTileLists(
1135  false,
1136  0, false,
1137  numTileListsSrc, numJtilesSrc,
1138  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1139  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1140  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1141  numTileListsDst, numJtilesDst,
1142  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1143  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1144  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1145  stream);
1146 
1147  // Set active buffer to 2
1148  setActiveBuffer(2);
1149 
1150  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1151 }
1152 
1153 //
1154 // Returns the number of bits needed to represent a, i.e. floor(log2(a)) + 1
1155 //
1156 int ilog2(int a) {
1157  // if (a < 0)
1158  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1159  int k = 1;
1160  while (a >>= 1) k++;
1161  return k;
1162 }
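// Illustrative note, not part of the original source: as written, ilog2() returns the number of
// bits needed to represent a (ilog2(1) = 1, ilog2(8) = 4, ilog2(15) = 4), which is what
// sortTileLists() below uses as the end_bit of its radix sorts.
#if 0
int num_bit = ilog2(maxTileListLen);   // bits occupied by a depth in [1 ... maxTileListLen]
int end_bit = begin_bit + num_bit;     // sort only the occupied bit range
#endif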
1163 
1164 //
1165 // Sort tile lists
1166 //
1167 void CudaTileListKernel::sortTileLists(
1168  const bool useJtiles,
1169  const int begin_bit, const bool highDepthBitsSetIn,
1170  // Source
1171  const int numTileListsSrc, const int numJtilesSrc,
1172  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1173  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1174  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1175  // Destination
1176  const int numTileListsDst, const int numJtilesDst,
1177  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1178  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1179  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1180  cudaStream_t stream) {
1181 
1182  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1183 
1184  // if (numTileListsDst == 0)
1185  // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1186 
1187  // Check that the array sizes are adequate
1188  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1189  numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1190  (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1191  (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1192  NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1193 
1194  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1195  numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1196  (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1197  (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1198  NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1199 
1200  if (begin_bit != 0 && begin_bit != 16)
1201  NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1202 
1203  // Number of bits needed in the sort
1204  int num_bit = ilog2(maxTileListLen);
1205  if (num_bit > 16)
1206  NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1207  int end_bit = begin_bit + num_bit;
1208 
1209  if (doStreaming)
1210  {
1211  // ----------------------------------------------------------------------------------------
1212  if (doOutputOrder && useJtiles) {
1213  // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1214  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1215 
1216  // Calculate position from depth
1217  {
1218  // -----------------------------------------------------------------------------
1219  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1220  // -----------------------------------------------------------------------------
1221  if (doShiftDown)
1222  {
1223  int nthread = 1024;
1224  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1225  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1226  (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1227  cudaCheck(cudaGetLastError());
1228  }
1229 
1230  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1231 
1232  // --------------------------------------------------------------------
1233  // Compute itileList positions to store tileLists
1234  // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
1235  // --------------------------------------------------------------------
1236  {
1237  size_t size = 0;
1238  cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1239  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1240  // Make sure tempStorage doesn't remain NULL
1241  if (size == 0) size = 128;
1242  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1243  size = tempStorageSize;
1244  cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1245  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1246  }
1247  }
1248 
1249  // Store in reverse order from outputOrder
1250  {
1251  int nthread = 1024;
1252  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1253  storeInReverse <<< nblock, nthread, 0, stream >>>
1254  (numTileListsSrc, begin_bit, outputOrder, tileListPos,
1255  tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1256  tileListOrderDst.ptr, tileListDepthDst.ptr);
1257  cudaCheck(cudaGetLastError());
1258  }
1259 
1260  // Build sortKeys
1261  {
1262  maxTileListLen_sortKeys = maxTileListLen;
1263 
1264  reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1265  clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1266 
1267  // Re-allocate and initialize minmaxListLen
1268  {
1269  reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1270 
1271  int nthread = 1024;
1272  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1273  initMinMaxListLen <<< nblock, nthread, 0, stream >>>
1274  (numComputes, maxTileListLen, minmaxListLen);
1275  cudaCheck(cudaGetLastError());
1276  }
1277 
1278  // Build sortKeys and calculate minmaxListLen
1279  {
1280  int nthread = 1024;
1281  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1282  buildSortKeys <<< nblock, nthread, 0, stream >>>
1283  (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1284  tileListDepthDst.ptr, minmaxListLen, sortKeys);
1285  cudaCheck(cudaGetLastError());
1286 
1287  // Maximum value in sortKeys[] is numTileListsDst - 1
1288  sortKeys_endbit = ilog2(numTileListsDst);
1289  }
1290 
1291  // Fill in missing sortKeys using minmaxListLen
1292  {
1293  int nthread = 1024;
1294  int nwarp = nthread/WARPSIZE;
1295  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1296  fillSortKeys <<< nblock, nthread, 0, stream >>>
1297  (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1298  cudaCheck(cudaGetLastError());
1299  }
1300 
1301  }
1302 
1303  doOutputOrder = false;
1304 
1305  } else if (doOutputOrder) {
1306  // OutputOrder will be produced in the next pairlist non-bonded kernel.
1307  // This time just remove zero-length lists
1308  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1309 
1310  int endbit_tmp = ilog2(numTileListsSrc);
1311 
1312  // Remove zeros
1313  {
1314  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1315  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1316 
1317  int nthread = 1024;
1318  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1319  buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
1320  (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1321  cudaCheck(cudaGetLastError());
1322  }
1323 
1324  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1325  {
1326  // Short list, sort within a single thread block
1327  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1328  int nblock = 1;
1329 
1330  unsigned int oobKey = numTileListsSrc;
1331  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1332  (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1333  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1334  cudaCheck(cudaGetLastError());
1335  }
1336  else
1337  {
1338  // Long list, sort on multiple thread blocks
1339  size_t size = 0;
1340  cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1341  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1342  numTileListsSrc, 0, endbit_tmp, stream));
1343  // Make sure tempStorage doesn't remain NULL
1344  if (size == 0) size = 128;
1345  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1346  size = tempStorageSize;
1347  cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1348  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1349  numTileListsSrc, 0, endbit_tmp, stream));
1350  }
1351 
1352  // Re-order tileListDepth using tileListOrderDst
1353  {
1354  int nthread = 1024;
1355  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1356  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1357  (numTileListsDst, tileListOrderDst.ptr,
1358  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1359  cudaCheck(cudaGetLastError());
1360  }
1361 
1362  } else {
1363  // This is done during the regular MD cycle
1364 
1365  if (sortKeys_endbit <= 0)
1366  NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1367 
1368  // Setup sort keys
1369  {
1370  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1371  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1372 
1373  int nthread = 1024;
1374  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1375  setupSortKey <<< nblock, nthread, 0, stream >>>
1376  (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1377  cudaCheck(cudaGetLastError());
1378 
1379  }
1380 
1381  // Global sort
1382  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1383  // if (false)
1384  {
1385  // Short list, sort within a single thread block
1386  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1387  int nblock = 1;
1388 
1389  unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1390  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1391  (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1392  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1393  cudaCheck(cudaGetLastError());
1394  }
1395  else
1396  {
1397  // Long list, sort on multiple thread blocks
1398  size_t size = 0;
1399  cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1400  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1401  numTileListsSrc, 0, sortKeys_endbit, stream));
1402  // Make sure tempStorage doesn't remain NULL
1403  if (size == 0) size = 128;
1404  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1405  size = tempStorageSize;
1406  cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1407  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1408  numTileListsSrc, 0, sortKeys_endbit, stream));
1409  }
1410 
1411  // Re-order tileListDepth using tileListOrderDst
1412  {
1413  int nthread = 1024;
1414  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1415  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1416  (numTileListsDst, tileListOrderDst.ptr,
1417  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1418  cudaCheck(cudaGetLastError());
1419  }
1420 
1421  // Local sort
1422  {
1423  int nthread = 32;
1424  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1425  localSort<32> <<< nblock, nthread, 0, stream >>>
1426  (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1427  cudaCheck(cudaGetLastError());
1428 
1429  // No need to shift any more
1430  doShiftDown = false;
1431  }
1432 
1433  }
1434  // ----------------------------------------------------------------------------------------
1435 
1436  } // (doStreaming)
1437  else
1438  {
1439  // --------------------------------------------------------------------
1440  // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1441  // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1442  // --------------------------------------------------------------------
1443  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1444  {
1445  // Short list, sort within a single thread block
1446  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1447  int nblock = 1;
1448 
1449  sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
1450  (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1451  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1452  cudaCheck(cudaGetLastError());
1453 
1454  }
1455  else
1456  {
1457  // Long list, sort on multiple thread blocks
1458  size_t size = 0;
1459  cudaCheck(cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1460  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1461  numTileListsSrc, begin_bit, end_bit, stream));
1462  // Make sure tempStorage doesn't remain NULL
1463  if (size == 0) size = 128;
1464  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1465  size = tempStorageSize;
1466  cudaCheck(cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1467  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1468  numTileListsSrc, begin_bit, end_bit, stream));
1469  }
1470  }
1471 
1472  // -----------------------------------------------------------------------------
1473  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1474  // -----------------------------------------------------------------------------
1475  if (doShiftDown)
1476  {
1477  int nthread = 1024;
1478  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1479  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1480  (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1481  cudaCheck(cudaGetLastError());
1482  }
1483 
1484  // Allocate with +1 to include last term in the exclusive sum
1485  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1486 
1487  // --------------------------------------------------------------------
1488  // Compute itileList positions to store tileLists
1489  // ExclusiveSum(tileListDepthDst[0...numTileListsDst+1])
1490  // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1491  // since this is an exclusive sum. But with this trick,
1492  // tileListPos[numTileListsDst] will contain the total number
1493  // of tile lists
1494  // --------------------------------------------------------------------
1495  {
1496  size_t size = 0;
1497  cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1498  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1499  // Make sure tempStorage doesn't remain NULL
1500  if (size == 0) size = 128;
1501  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1502  size = tempStorageSize;
1503  // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
1504  // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1505  cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1506  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1507  }
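The +1 element allocated above is what lets the last prefix-sum entry double as the grand total, so no separate reduction is needed. A small worked example with hypothetical depths:

  // Worked example (hypothetical values): numTileListsDst = 3, tileListDepthDst = {3, 1, 2, X}.
  // ExclusiveSum over 4 items yields tileListPos = {0, 3, 4, 6}; the padding value X never
  // influences these outputs, and tileListPos[3] == 6 is the total tile count.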
1508 
1509  // --------------------------------------------------------------------
1510  // Re-package to
1511  // tileListsDst[0 ... numTileListsDst-1], tileJatomStartDst[0 ... numJtilesDst-1]
1512  // patchPairDst[]
1513  // tileJatomStartDst[]
1514  // tileExclsDst[0 ... numJtilesDst-1]
1515  // --------------------------------------------------------------------
1516  {
1517  int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1518  int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1519  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1520 
1521  repackTileListsKernel <<< nblock, nthread, 0, stream >>>
1522  (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1523  (useJtiles) ? jtiles : NULL,
1524  tileListsSrc.ptr, tileListsDst.ptr,
1525  patchPairsSrc.ptr, patchPairsDst.ptr,
1526  tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1527  tileExclsSrc.ptr, tileExclsDst.ptr);
1528  cudaCheck(cudaGetLastError());
1529  }
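The repack step gathers each tile list's per-tile data contiguously into the destination arrays in sorted order, with tileListPos supplying the output offsets computed by the exclusive sum above. A hypothetical illustration of how those pieces fit together (values invented for the example, not taken from this file):

  // Illustrative only: suppose after sorting,
  //   tileListOrderDst = {2, 0, 1}   (source list indices in destination order)
  //   tileListDepthDst = {4, 2, 3}   (tile counts in destination order)
  // then tileListPos = {0, 4, 6, 9}, and destination list i stores its tiles
  // (tileJatomStartDst / tileExclsDst entries) at offsets [tileListPos[i], tileListPos[i+1]).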
1530 
1531  // Count the number of tileLists that contribute to each patch
1532  if (doStreaming)
1533  {
1534  clear_device_array<int>(patchNumLists, numPatches, stream);
1535 
1536  // Fill in patchNumLists[0...numPatches-1]
1537  int nthread = 512;
1538  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1539  calcPatchNumLists <<< nblock, nthread, 0, stream >>>
1540  (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1541  cudaCheck(cudaGetLastError());
1542 
1543  // Use emptyPatches[numPatches] as the count variable
1544  clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1545 
1546  // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1547  // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1548  setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
1549  (numTileListsDst, tileListsDst.ptr, patchNumLists,
1550  numPatches, &emptyPatches[numPatches], emptyPatches);
1551  cudaCheck(cudaGetLastError());
1552 
1553  // Copy emptyPatches[0 ... numPatches] to host
1554  copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1555  cudaCheck(cudaStreamSynchronize(stream));
1556  numEmptyPatches = h_emptyPatches[numPatches];
1557  }
1558 
1559 }
1560 
1561 //
1562 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
1563 //
1564 void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
1565  // Store previous number of active lists
1566  int numTileListsPrev = numTileLists;
1567 
1568  // Wait for finishTileList() to stop copying
1569  if (!tileListStatEventRecord)
1570  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1571  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1572 
1573  // Get numTileLists, numTileListsGBIS, and numExcluded
1574  {
1575  numTileLists = h_tileListStat->numTileLists;
1576  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1577  numExcluded = h_tileListStat->numExcluded;
1578  }
1579 
1580  // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1581  // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1582  sortTileLists(true, 0, true,
1583  numTileListsPrev, numJtiles,
1584  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1585  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1586  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1587  numTileLists, numJtiles,
1588  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1589  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1590  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1591  stream);
1592 
1593  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1594  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1595  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1596 
1597  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1598 
1599  // NOTE:
1600  // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
1601  // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1602 
1603  if (doGBIS) {
1604  // GBIS is used => produce a second tile list
1605  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1606  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1607  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1608 
1609  sortTileLists(true, 16, true,
1610  numTileListsPrev, numJtiles,
1611  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1612  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1613  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1614  numTileListsGBIS, numJtiles,
1615  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1616  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1617  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1618  stream);
1619  }
1620 
1621  // Set active buffer to be 1
1622  setActiveBuffer(1);
1623 
1624 }
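Judging by the two sortTileLists calls above (begin_bit 0 for the VdW list, 16 for the GBIS list) and the 16-bit masking applied during sorting, each 32-bit depth word appears to carry two packed counts, so the same source arrays can be sorted twice with different bit windows. A hedged illustration of that presumed packing:

  // Presumed layout of one 32-bit depth word (illustrative, not taken from this file):
  //   bits  0..15 : VdW tile count  -> selected with begin_bit = 0
  //   bits 16..31 : GBIS tile count -> selected with begin_bit = 16
  unsigned int depth = (3u << 16) | 5u;             // hypothetical counts: 5 VdW, 3 GBIS tiles
  unsigned int vdwCount  = (depth >> 0)  & 65535;   // 5
  unsigned int gbisCount = (depth >> 16) & 65535;   // 3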
1625 
1626 /*
1627 //
1628 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1629 //
1630 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1631  return;
1632 
1633  if (!doStreaming || !doOutputOrder)
1634  return;
1635 
1636  // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1637  // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1638  sortTileLists(false, 0, true,
1639  numTileLists, numJtiles,
1640  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1641  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1642  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1643  numTileLists, numJtiles,
1644  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1645  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1646  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1647  stream);
1648 
1649  // Set active buffer to be 2
1650  setActiveBuffer(2);
1651 
1652 }
1653 */
1654 
1655 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
1656  if (len > tileListVirialEnergySize) {
1657  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1658  }
1659  tileListVirialEnergyLength = len;
1660 }
1661 
1662 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
1663  if (len > tileListVirialEnergySize) {
1664  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1665  }
1666  tileListVirialEnergyGBISLength = len;
1667 }