NAMD
CudaTileListKernel.cu
Go to the documentation of this file.
1 // #include <map>
2 // #include <algorithm>
3 #if defined(NAMD_HIP) && !defined(NAMD_CUDA)//NAMD_HIP
4 #include "hip/hip_runtime.h"
5 #include <hipcub/hipcub.hpp>
6 #define cub hipcub
7 #endif
8 #if defined(NAMD_CUDA)
9 #include <cuda.h>
10 #if __CUDACC_VER_MAJOR__ >= 11
11 #include <cub/device/device_radix_sort.cuh>
12 #include <cub/device/device_scan.cuh>
13 #include <cub/cub.cuh>
14 #else
15 #include <namd_cub/device/device_radix_sort.cuh>
16 #include <namd_cub/device/device_scan.cuh>
17 #include <namd_cub/cub.cuh>
18 #endif //CUDACC version
19 #endif //NAMD_CUDA
20 
21 #include "CudaUtils.h"
22 #include "CudaTileListKernel.h"
23 #include "DeviceCUDA.h"
24 #ifdef WIN32
25 #define __thread __declspec(thread)
26 #endif
27 extern __thread DeviceCUDA *deviceCUDA;
28 
29 #define OVERALLOC 1.2f
30 #ifdef NAMD_CUDA
31 #define DEFAULTKERNEL_NUM_THREAD 1024
32 #define UPDATEPATCHESKERNEL_NUM_THREAD 512
33 #define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 512
34 #define BOUNDINGBOXKERNEL_NUM_WARP 8
35 #else // NAMD_HIP
36 #define DEFAULTKERNEL_NUM_THREAD 256
37 #define UPDATEPATCHESKERNEL_NUM_THREAD 256
38 #define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 256
39 #define BOUNDINGBOXKERNEL_NUM_WARP 4
40 #endif
41 #if __CUDA_ARCH__ < 350
42 #define __ldg *
43 #endif
44 
45 void NAMD_die(const char *);
46 
47 //
48 // Calculate the number of lists that contribute to each patch
49 //
50 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
51  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
52 
53  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
54  {
55  int2 patchInd = tileLists[i].patchInd;
56  atomicAdd(&patchNumLists[patchInd.x], 1);
57  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
58  }
59 
60 }
61 
62 //
63 // Write patchNumList back to tile list and
64 // Find empty patches into emptyPatches[0 ... numEmptyPatches-1]
65 //
67  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
68  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
69 
70  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
71  {
72  int2 patchInd = tileLists[i].patchInd;
73  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
74  tileLists[i].patchNumList = patchNumList;
75  }
76 
77  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
78  {
79  if (patchNumLists[i] == 0) {
80  int ind = atomicAdd(numEmptyPatches, 1);
81  emptyPatches[ind] = i;
82  }
83  }
84 
85 }
86 
87 //
88 // Builds a sort key that removes zeros but keeps the order otherwise the same
89 //
90 __global__ void buildRemoveZerosSortKey(const int numTileLists,
91  const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
92 
93  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
94  {
95  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
96  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
97  }
98 
99 }
100 
101 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
102  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
103  const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
104 
105  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
106  {
107  int icompute = tileLists[itileList].icompute;
108  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
109  int i = icompute*maxTileListLen + (depth - 1);
110  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
111  }
112 
113 }
114 
115 template <int width>
116 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
117  unsigned int* __restrict__ keys, int* __restrict__ vals) {
118 
119  // NOTE: blockDim.x = width
120 
121  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
122  {
123  int i = base + threadIdx.x;
124  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
125  __shared__ typename BlockRadixSort::TempStorage tempStorage;
126  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
127  int val[1] = {(i < n) ? vals[i] : 0};
128  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
129  if (i < n) {
130  keys[i] = key[0];
131  vals[i] = val[0];
132  }
133  BLOCK_SYNC;
134  }
135 
136 }
137 
138 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
139  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
140  const int* __restrict__ tileListOrderSrc,
141  const unsigned int* __restrict__ tileListDepthSrc,
142  int* __restrict__ tileListOrderDst,
143  unsigned int* __restrict__ tileListDepthDst) {
144 
145  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
146  {
147  int j = outputOrder[numTileListsSrc - i - 1];
148  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
149  int k = tileListPos[i];
150  tileListDepthDst[k] = tileListDepthSrc[j];
151  tileListOrderDst[k] = j; //tileListOrderSrc[j];
152  }
153  }
154 }
155 
156 //
157 // Bit shift tileListDepth so that only lower 16 bits are used
158 //
159 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
160  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
161  unsigned int* __restrict__ tileListDepthDst) {
162 
163  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
164  {
165  int j = outputOrder[numTileLists - i - 1];
166  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
167  }
168 
169 }
170 
171 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
172  int2* __restrict__ minmaxListLen) {
173 
174  int2 val;
175  val.x = maxTileListLen+1;
176  val.y = 0;
177  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
178  {
179  minmaxListLen[i] = val;
180  }
181 
182 }
183 
184 //
185 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
186 //
187 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
188  const TileList* __restrict__ tileListsSrc,
189  const int* __restrict__ tileListOrderDst,
190  const unsigned int* __restrict__ tileListDepthDst,
191  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
192 
193  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
194  {
195  int k = tileListOrderDst[i];
196  int icompute = tileListsSrc[k].icompute;
197  int depth = tileListDepthDst[i] & 65535;
198  // depth is in range [1 ... maxTileListLen]
199  int j = icompute*maxTileListLen + (depth-1);
200  sortKeys[j] = i;
201  int2 minmax = minmaxListLen[icompute];
202  int2 minmaxOrig = minmax;
203  if (minmax.x > depth) minmax.x = depth;
204  if (minmax.y < depth) minmax.y = depth;
205  if (minmax.x != minmaxOrig.x) {
206  atomicMin(&minmaxListLen[icompute].x, minmax.x);
207  }
208  if (minmax.y != minmaxOrig.y) {
209  atomicMax(&minmaxListLen[icompute].y, minmax.y);
210  }
211  }
212 
213 }
214 
215 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
216  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
217 
218  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
219  const int wid = threadIdx.x % WARPSIZE;
220  int2 minmax = minmaxListLen[i];
221  int minlen = minmax.x;
222  int maxlen = minmax.y;
223  // minlen, maxlen are in range [1 ... maxTileListLen]
224  // as long as i is in tileListsSrc[].icompute above
225  if ( maxlen < minlen ) {
226  minlen = 1;
227  maxlen = maxTileListLen;
228  }
229  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
230  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
231  unsigned int aveKey = (maxKey + minKey)/2;
232  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
233  sortKeys[i*maxTileListLen + j] = minKey;
234  }
235  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
236  sortKeys[i*maxTileListLen + j] = maxKey;
237  }
238  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
239  if (sortKeys[i*maxTileListLen + j] == 0) {
240  sortKeys[i*maxTileListLen + j] = aveKey;
241  }
242  }
243  }
244 
245 }
246 
247 //
248 // Calculate bounding boxes for sets of WARPSIZE=32 atoms
249 //
250 
251 __global__ void
252 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
253  BoundingBox* __restrict__ boundingBoxes) {
254 
255  const int warpId = threadIdx.x / WARPSIZE;
256  const int wid = threadIdx.x % WARPSIZE;
257 
258  // Loop with warp-aligned index to avoid warp-divergence
259  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
260  // Full atom index
261  const int i = iwarp + wid;
262  // Bounding box index
263  const int ibb = i/WARPSIZE;
264 
265  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
266 
267  volatile float3 minxyz, maxxyz;
268 
269  typedef cub::WarpReduce<float> WarpReduce;
270  __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
271  minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
272  minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
273  minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
274  maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
275  maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
276  maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
277 
278  if (wid == 0) {
279  BoundingBox boundingBox;
280  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
281  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
282  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
283  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
284  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
285  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
286  boundingBoxes[ibb] = boundingBox;
287  }
288  }
289 
290 }
291 
292 //
293 // Returns the lower estimate for the distance between two bounding boxes
294 //
295 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
296  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
297  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
298  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
299  float r2 = dx*dx + dy*dy + dz*dz;
300  return r2;
301 }
302 
303 #if 0
304 //
305 // Performs warp-level exclusive sum on a shared memory array:
306 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
307 // sh_in and sh_out can point to same array
308 // Returns the total sum
309 //
310 template <typename T>
311 __device__ __forceinline__
312 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
313  const int wid = threadIdx.x % WARPSIZE;
314  volatile int blockOffset = 0;
315  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
316  // Size of the exclusive sum
317  int blockLen = min(WARPSIZE, n-iblock);
318  // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
319  typedef cub::WarpScan<int> WarpScan;
320  __shared__ typename WarpScan::TempStorage tempStorage;
321  int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
322  WarpScan(tempStorage).ExclusiveSum(data, data);
323  // Shift by block offset
324  data += blockOffset;
325  // Save last value
326  int last = (int)sh_in[iblock + blockLen-1];
327  // Write output
328  if (wid < blockLen) sh_out[iblock + wid] = data;
329  // block offset = last term of the exclusive sum + last value
330  blockOffset = sh_out[iblock + blockLen-1] + last;
331  }
332  return blockOffset;
333 }
334 #endif
335 
336 #define TILELISTKERNELNEW_NUM_WARP 4
337 
338 //
339 // NOTE: Executed on a single thread block
340 //
341 template<int nthread>
342 __global__ void calcTileListPosKernel(const int numComputes,
343  const CudaComputeRecord* __restrict__ computes,
344  const CudaPatchRecord* __restrict__ patches,
345  int* __restrict__ tilePos) {
346 
347  typedef cub::BlockScan<int, nthread> BlockScan;
348 
349  __shared__ typename BlockScan::TempStorage tempStorage;
350  __shared__ int shTilePos0;
351 
352  if (threadIdx.x == nthread-1) {
353  shTilePos0 = 0;
354  }
355  for (int base=0;base < numComputes;base+=nthread) {
356  int k = base + threadIdx.x;
357 
358  int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
359 
360  // Calculate positions in tile list and jtile list
361  int tilePosVal;
362  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
363 
364  // Store into global memory
365  if (k < numComputes) {
366  tilePos[k] = shTilePos0 + tilePosVal;
367  }
368 
369  BLOCK_SYNC;
370  // Store block end position
371  if (threadIdx.x == nthread-1) {
372  shTilePos0 += tilePosVal + numTiles1;
373  }
374  }
375 }
376 
377 
378 template<int nthread>
379 __global__ void updatePatchesKernel(const int numComputes,
380  const int* __restrict__ tilePos,
381  const CudaComputeRecord* __restrict__ computes,
382  const CudaPatchRecord* __restrict__ patches,
383  TileList* __restrict__ tileLists) {
384 
385  const int tid = threadIdx.x % nthread;
386 
387  // nthread threads takes care of one compute
388  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
389  {
390  CudaComputeRecord compute = computes[k];
391  float3 offsetXYZ = compute.offsetXYZ;
392  int2 patchInd = compute.patchInd;
393  int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
394  int itileList0 = tilePos[k];
395  for (int i=tid;i < numTiles1;i+=nthread) {
396  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
397  tileLists[itileList0 + i].patchInd = patchInd;
398  tileLists[itileList0 + i].icompute = k;
399  }
400  }
401 
402 }
403 
404 __host__ __device__ __forceinline__
405 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
406  // Size in bytes
407  int size = (
408  maxTileListLen*sizeof(char)
409  );
410  return size;
411 }
412 
413 __global__ void
415  TileList* __restrict__ tileLists,
416  const CudaPatchRecord* __restrict__ patches,
417  const int* __restrict__ tileListPos,
418  const float3 lata, const float3 latb, const float3 latc,
419  const float cutoff2, const int maxTileListLen,
420  const BoundingBox* __restrict__ boundingBoxes,
421  int* __restrict__ tileJatomStart,
422  const int tileJatomStartSize,
423  unsigned int* __restrict__ tileListDepth,
424  int* __restrict__ tileListOrder,
425  PatchPairRecord* __restrict__ patchPairs,
426  TileListStat* __restrict__ tileListStat) {
427  #ifdef NAMD_CUDA
428  extern __shared__ char sh_buffer[];
429  #else
430  HIP_DYNAMIC_SHARED( char, sh_buffer)
431  #endif
432  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
433  int pos = threadIdx.x*sizePerThread;
434  volatile char* sh_tile = (char*)&sh_buffer[pos];
435 
436  // Loop with warp-aligned index to avoid warp-divergence
437  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
438 
439  // Use one thread per tile list
440  const int wid = threadIdx.x % WARPSIZE;
441  const int itileList = iwarp + wid;
442 
443  int i;
444  int itileListLen = 0;
445  CudaPatchRecord patch1;
446  CudaPatchRecord patch2;
447  float3 offsetXYZ;
448  int2 patchInd;
449  int numTiles2;
450  int icompute;
451 
452  if (itileList < numTileLists) {
453  offsetXYZ = tileLists[itileList].offsetXYZ;
454  patchInd = tileLists[itileList].patchInd;
455  icompute = tileLists[itileList].icompute;
456  // Get i-column
457  i = itileList - tileListPos[icompute];
458 
459  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
460  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
461  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
462 
463  // DH - set zeroShift flag if magnitude of shift vector is zero
464  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
465 
466  // Load patches
467  patch1 = patches[patchInd.x];
468  patch2 = patches[patchInd.y];
469  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
470  numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
471  int tileStart1 = patch1.atomStart/WARPSIZE;
472  int tileStart2 = patch2.atomStart/WARPSIZE;
473 
474  // DH - self requires that zeroShift is also set
475  bool self = zeroShift && (tileStart1 == tileStart2);
476 
477  // Load i-atom data (and shift coordinates)
478  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
479  boundingBoxI.x += shx;
480  boundingBoxI.y += shy;
481  boundingBoxI.z += shz;
482 
483  for (int j=0;j < numTiles2;j++) {
484  sh_tile[j] = 0;
485  if (!self || j >= i) {
486  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
487  float r2bb = distsq(boundingBoxI, boundingBoxJ);
488  if (r2bb < cutoff2) {
489  sh_tile[j] = 1;
490  itileListLen++;
491  }
492  }
493  }
494 
495  tileListDepth[itileList] = (unsigned int)itileListLen;
496  tileListOrder[itileList] = itileList;
497  }
498 
499  typedef cub::WarpScan<int> WarpScan;
500  __shared__ typename WarpScan::TempStorage tempStorage;
501  int active = (itileListLen > 0);
502  int activePos;
503  WarpScan(tempStorage).ExclusiveSum(active, activePos);
504  int itileListPos;
505  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
506 
507  int jtileStart, numJtiles;
508  // Last thread in the warp knows the total number
509  if (wid == WARPSIZE-1) {
510  atomicAdd(&tileListStat->numTileLists, activePos + active);
511  numJtiles = itileListPos + itileListLen;
512  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
513  }
514 
515  numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
516  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
517 
518  if (jtileStart + numJtiles > tileJatomStartSize) {
519  // tileJatomStart out of memory, exit
520  if (wid == 0) tileListStat->tilesSizeExceeded = true;
521  return;
522  }
523 
524  int jStart = itileListPos;
525  int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
526 
527  if (wid == WARPSIZE-1) jEnd = numJtiles;
528 
529  if (itileListLen > 0) {
530  // Setup tileLists[]
531  //TileList TLtmp;
532  tileLists[itileList].iatomStart = patch1.atomStart + i*WARPSIZE;
533  tileLists[itileList].jtileStart = jtileStart + jStart;
534  tileLists[itileList].jtileEnd = jtileStart + jEnd - 1;
535  tileLists[itileList].patchInd = patchInd;
536  tileLists[itileList].offsetXYZ = offsetXYZ;
537  tileLists[itileList].icompute = icompute;
538  // TLtmp.patchNumList.x = 0;
539  // TLtmp.patchNumList.y = 0;
540  // tileLists[itileList] = TLtmp;
541  // PatchPair
542  PatchPairRecord patchPair;
543  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
544  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
545  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
546  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
547  patchPairs[itileList] = patchPair;
548 
549  // Write tiles
550  int jtile = jtileStart + jStart;
551  for (int j=0;j < numTiles2;j++) {
552  if (sh_tile[j]) {
553  tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
554  jtile++;
555  }
556  }
557 
558  }
559 
560  }
561 
562 }
563 #ifdef NAMD_CUDA
564 #define REPACKTILELISTSKERNEL_NUM_WARP 32
565 #else
566 #define REPACKTILELISTSKERNEL_NUM_WARP 4
567 #endif
568 __global__ void
569 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
570  const int* __restrict__ tileListOrder,
571  const int* __restrict__ jtiles,
572  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
573  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
574  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
575  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
576 
577  const int wid = threadIdx.x % WARPSIZE;
578 
579  // One warp does one tile list
580  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
581  {
582  int j = tileListOrder[i];
583  int start = tileListPos[i];
584  int end = tileListPos[i+1]-1;
585  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
586  // TileList
587  int startOld = __ldg(&tileListsSrc[j].jtileStart);
588  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
589  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
590  float3 offsetXYZ;
591  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
592  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
593  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
594  int2 patchInd = tileListsSrc[j].patchInd;
595  int icompute = __ldg(&tileListsSrc[j].icompute);
596  if (wid == 0) {
597  // TileList tileList;
598  tileListsDst[i].iatomStart = iatomStart;
599  tileListsDst[i].offsetXYZ = offsetXYZ;
600  tileListsDst[i].jtileStart = start;
601  tileListsDst[i].jtileEnd = end;
602  tileListsDst[i].patchInd = patchInd;
603  tileListsDst[i].icompute = icompute;
604  //tileListsDst[i] = tileList;
605  }
606 
607  if (jtiles == NULL) {
608  // No jtiles, simple copy will do
609  int jtile = start;
610  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
611  if (jtileOld + wid <= endOld) {
612  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
613  }
614  }
615  if (tileExclsSrc != NULL) {
616  int jtile = start;
617  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
618  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
619  }
620  }
621  } else {
622  int jtile0 = start;
623  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
624  int t = jtileOld + wid;
625  int jtile = (t <= endOld) ? jtiles[t] : 0;
626  jtile >>= begin_bit;
627  jtile &= 65535;
628  typedef cub::WarpScan<int> WarpScan;
629  __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
630  int warpId = threadIdx.x / WARPSIZE;
631  int jtilePos;
632  WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
633 
634  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
635 
636  WarpMask b = WARP_BALLOT(WARP_FULL_MASK, jtile);
637  if (tileExclsSrc != NULL) {
638  while (b != 0) {
639  // k = index of thread that has data
640 #if WARPSIZE == 64
641  int k = __ffsll(b) - 1;
642 #else
643  int k = __ffs(b) - 1;
644 #endif
645  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
646  // remove 1 bit and advance jtile0
647  b ^= ((WarpMask)1 << k);
648  jtile0++;
649  }
650  } else {
651 #if WARPSIZE == 64
652  jtile0 += __popcll(b);
653 #else
654  jtile0 += __popc(b);
655 #endif
656  }
657  }
658  }
659  }
660 
661 }
662 
663 //
664 // NOTE: Executed on a single thread block
665 // oobKey = out-of-bounds key value
666 //
667 #ifdef NAMD_CUDA
668 #define SORTTILELISTSKERNEL_NUM_THREAD 512
669 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
670 #else
671 #define SORTTILELISTSKERNEL_NUM_THREAD 256
672 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 15
673 #endif
674 template <typename keyT, typename valT, bool ascend>
676 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
677  const int begin_bit, const int end_bit, const keyT oobKey,
678  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
679  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
680 
682  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
683 
685  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
686 
689 
690  __shared__ union {
691  typename BlockLoad::TempStorage load;
692  typename BlockLoadU::TempStorage loadU;
693  typename BlockRadixSort::TempStorage sort;
694  } tempStorage;
695 
698 
699  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
701  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
702  BLOCK_SYNC;
703 
704  if (ascend)
705  BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
706  else
707  BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
708 
709  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
710  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
711 }
712 
713 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
714  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
715 
716  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
717  {
718  int j = tileListOrder[i];
719  tileListDepthDst[i] = tileListDepthSrc[j];
720  }
721 
722 }
723 
724 //
725 // Bit shift tileListDepth so that only lower 16 bits are used
726 //
727 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
728  unsigned int* __restrict__ tileListDepth) {
729 
730  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
731  {
732  unsigned int a = tileListDepth[itileList];
733  a >>= begin_bit;
734  a &= 65535;
735  tileListDepth[itileList] = a;
736  }
737 
738 }
739 
740 // ##############################################################################################
741 // ##############################################################################################
742 // ##############################################################################################
743 
744 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
745 deviceID(deviceID), doStreaming(doStreaming) {
746 
747  cudaCheck(cudaSetDevice(deviceID));
748 
749  activeBuffer = 1;
750 
751  numPatches = 0;
752  numComputes = 0;
753 
754  cudaPatches = NULL;
755  cudaPatchesSize = 0;
756 
757  cudaComputes = NULL;
758  cudaComputesSize = 0;
759 
760  patchNumLists = NULL;
761  patchNumListsSize = 0;
762 
763  emptyPatches = NULL;
764  emptyPatchesSize = 0;
765  h_emptyPatches = NULL;
766  h_emptyPatchesSize = 0;
767  numEmptyPatches = 0;
768 
769  sortKeySrc = NULL;
770  sortKeySrcSize = 0;
771  sortKeyDst = NULL;
772  sortKeyDstSize = 0;
773 
774  tileLists1 = NULL;
775  tileLists1Size = 0;
776  tileLists2 = NULL;
777  tileLists2Size = 0;
778 
779  patchPairs1 = NULL;
780  patchPairs1Size = 0;
781  patchPairs2 = NULL;
782  patchPairs2Size = 0;
783 
784  tileJatomStart1 = NULL;
785  tileJatomStart1Size = 0;
786  tileJatomStart2 = NULL;
787  tileJatomStart2Size = 0;
788 
789  boundingBoxes = NULL;
790  boundingBoxesSize = 0;
791 
792  tileListDepth1 = NULL;
793  tileListDepth1Size = 0;
794  tileListDepth2 = NULL;
795  tileListDepth2Size = 0;
796 
797  tileListOrder1 = NULL;
798  tileListOrder1Size = 0;
799  tileListOrder2 = NULL;
800  tileListOrder2Size = 0;
801 
802  tileExcls1 = NULL;
803  tileExcls1Size = 0;
804  tileExcls2 = NULL;
805  tileExcls2Size = 0;
806 
807  xyzq = NULL;
808  xyzqSize = 0;
809 
810  allocate_device<TileListStat>(&d_tileListStat, 1);
811  allocate_host<TileListStat>(&h_tileListStat, 1);
812 
813  tileListPos = NULL;
814  tileListPosSize = 0;
815  tempStorage = NULL;
816  tempStorageSize = 0;
817 
818  jtiles = NULL;
819  jtilesSize = 0;
820 
821  tilePos = NULL;
822  tilePosSize = 0;
823 
824  tileListsGBIS = NULL;
825  tileListsGBISSize = 0;
826 
827  tileJatomStartGBIS = NULL;
828  tileJatomStartGBISSize = 0;
829 
830  tileListVirialEnergy = NULL;
831  tileListVirialEnergySize = 0;
832 
833  atomStorageSize = 0;
834  numTileLists = 0;
835  numTileListsGBIS = 0;
836  numJtiles = 1;
837 
838  outputOrder = NULL;
839  outputOrderSize = 0;
840  doOutputOrder = false;
841 
842  minmaxListLen = NULL;
843  minmaxListLenSize = 0;
844 
845  sortKeys = NULL;
846  sortKeysSize = 0;
847  sortKeys_endbit = 0;
848 
849  cudaCheck(cudaEventCreate(&tileListStatEvent));
850  tileListStatEventRecord = false;
851 }
852 
854  cudaCheck(cudaSetDevice(deviceID));
855  deallocate_device<TileListStat>(&d_tileListStat);
856  deallocate_host<TileListStat>(&h_tileListStat);
857  //
858  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
859  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
860  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
861  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
862  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
863  //
864  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
865  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
866  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
867  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
868  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
869  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
870  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
871  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
872  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
873  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
874  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
875  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
876  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
877  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
878  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
879  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
880  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
881  if (jtiles != NULL) deallocate_device<int>(&jtiles);
882  if (tilePos != NULL) deallocate_device<int>(&tilePos);
883 
884  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
885  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
886 
887  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
888 
889  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
890 
891  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
892  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
893 
894  cudaCheck(cudaEventDestroy(tileListStatEvent));
895 }
896 
898  clear_device_array<int>(jtiles, numJtiles, stream);
899 }
900 
902  // clear tileListStat, for patchReadyQueueCount, which is set equal to the number of empty patches
903  memset(h_tileListStat, 0, sizeof(TileListStat));
904  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
905  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
906 }
907 
909  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
910  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
911  tileListStatEventRecord = true;
912 }
913 
914 void CudaTileListKernel::updateComputes(const int numComputesIn,
915  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
916 
917  numComputes = numComputesIn;
918 
919  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
920  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
921 
922  if (doStreaming) doOutputOrder = true;
923 }
924 
925 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
926  const TileList* d_tileLists, cudaStream_t stream) {
927 
928  TileList* h_tileLists = new TileList[numTileLists];
929  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
930  cudaCheck(cudaStreamSynchronize(stream));
931  FILE* handle = fopen(filename,"wt");
932  for (int itileList=0;itileList < numTileLists;itileList++) {
933  TileList tmp = h_tileLists[itileList];
934  fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
935  tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
936  tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
937  }
938  fclose(handle);
939  delete [] h_tileLists;
940 }
941 
942 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
943  const int* d_tileJatomStart, cudaStream_t stream) {
944 
945  int* h_tileJatomStart = new int[numJtiles];
946  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
947  cudaCheck(cudaStreamSynchronize(stream));
948  FILE* handle = fopen(filename,"wt");
949  for (int i=0;i < numJtiles;i++) {
950  fprintf(handle, "%d\n", h_tileJatomStart[i]);
951  }
952  fclose(handle);
953  delete [] h_tileJatomStart;
954 }
955 
956 /*
957 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
958 {
959  return std::pair<int, int>(p.second, p.first);
960 }
961 
962 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
963  const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
964 
965  const int shCacheSize = 10;
966  TileList* h_tileLists = new TileList[numTileLists];
967  int* h_tileJatomStart = new int[numJtiles];
968  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
969  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
970  cudaCheck(cudaStreamSynchronize(stream));
971  int ntotal = 0;
972  int ncache = 0;
973  for (int i=0;i < numTileLists;i+=width) {
974  int jend = min(i + width, numTileLists);
975  std::map<int, int> atomStartMap;
976  std::map<int, int>::iterator it;
977  atomStartMap.clear();
978  for (int j=i;j < jend;j++) {
979  TileList tmp = h_tileLists[j];
980  int iatomStart = tmp.iatomStart;
981  it = atomStartMap.find(iatomStart);
982  if (it == atomStartMap.end()) {
983  // Insert new
984  atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
985  } else {
986  // Increase counter
987  it->second--;
988  }
989  int jtileStart = tmp.jtileStart;
990  int jtileEnd = tmp.jtileEnd;
991  ntotal += (jtileEnd - jtileStart + 1) + 1;
992  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
993  int jatomStart = h_tileJatomStart[jtile];
994  it = atomStartMap.find(jatomStart);
995  if (it == atomStartMap.end()) {
996  // Insert new
997  atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
998  } else {
999  // Increase counter
1000  it->second--;
1001  }
1002  jatomStart |= (65535 << 16);
1003  h_tileJatomStart[jtile] = jatomStart;
1004  }
1005  iatomStart |= (65535 << 16);
1006  tmp.iatomStart = iatomStart;
1007  h_tileLists[j] = tmp;
1008  }
1009  ncache += atomStartMap.size();
1010  std::multimap<int, int> imap;
1011  imap.clear();
1012  std::multimap<int, int>::iterator imap_it;
1013  std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
1014  if (i < 400) {
1015  printf("%d %d\n", ntotal, imap.size());
1016  for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
1017  if (imap_it->first != 0)
1018  printf("(%d %d)\n", imap_it->first, imap_it->second);
1019  }
1020  }
1021  }
1022  printf("ntotal %d ncache %d\n", ntotal, ncache);
1023  copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
1024  copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
1025  cudaCheck(cudaStreamSynchronize(stream));
1026  delete [] h_tileLists;
1027  delete [] h_tileJatomStart;
1028 }
1029 */
1030 
1031 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
1032  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
1033  const float3 lata, const float3 latb, const float3 latc,
1034  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
1035  const float plcutoff2In, const size_t maxShmemPerBlock,
1036  cudaStream_t stream) {
1037 
1038  numPatches = numPatchesIn;
1039  atomStorageSize = atomStorageSizeIn;
1040  maxTileListLen = maxTileListLenIn;
1041  plcutoff2 = plcutoff2In;
1042 
1043  if (doStreaming) {
1044  // Re-allocate patchNumLists
1045  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1046  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1047  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1048  }
1049 
1050  // Re-allocate (tileLists1, patchPairs1
1051  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1052  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1053 
1054  // Copy cudaPatches to device
1055  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1056  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1057 
1058  // Re-allocate temporary storage
1059  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1060  // Calculate tile list positions (tilePos)
1061  {
1062  int nthread = DEFAULTKERNEL_NUM_THREAD;
1063  int nblock = 1;
1064  calcTileListPosKernel<DEFAULTKERNEL_NUM_THREAD> <<< nblock, nthread, 0, stream >>> (numComputes, cudaComputes, cudaPatches, tilePos);
1065  cudaCheck(cudaGetLastError());
1066  }
1067 
1068  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1069  {
1070  int nthread = UPDATEPATCHESKERNEL_NUM_THREAD;
1071  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/WARPSIZE)+1);
1072  updatePatchesKernel<WARPSIZE> <<< nblock, nthread, 0, stream >>> (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1073  cudaCheck(cudaGetLastError());
1074  }
1075 
1076  // ---------------------------------------------------------------------------------------------
1077 
1078 
1079  // NOTE: tileListDepth2 and tileListOrder2 must have at least same size as
1080  // tileListDepth2 and tileListOrder2 since they're used in sorting
1081  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1082  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1083 
1084  // Allocate with +1 to include last term in the exclusive sum
1085  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1086 
1087  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1088 
1089  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1090 
1091  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1092 
1093  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1094  {
1095  int numBoundingBoxes = atomStorageSize/WARPSIZE;
1096  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1097 
1098  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1099  int nthread = WARPSIZE*nwarp;
1100  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1101  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1102  cudaCheck(cudaGetLastError());
1103  }
1104 
1105  {
1106  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1107  int nthread = WARPSIZE*nwarp;
1108  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1109 
1110  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1111  if(shmem_size > maxShmemPerBlock){
1112  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1113  }
1114 
1115  // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1116  // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1117  // re-allocation only happens when the size is exceeded.
1118  h_tileListStat->tilesSizeExceeded = true;
1119  int reallocCount = 0;
1120  while (h_tileListStat->tilesSizeExceeded) {
1121  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1122 
1123  clearTileListStat(stream);
1124  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1125  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>> (
1126  numTileListsPrev, tileLists1, cudaPatches, tilePos,
1127  lata, latb, latc, plcutoff2, maxTileListLen,
1128  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1129  tileListDepth1, tileListOrder1, patchPairs1,
1130  d_tileListStat);
1131 
1132  cudaCheck(cudaGetLastError());
1133 
1134  // get (numATileLists, numJtiles, tilesSizeExceeded)
1135  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1136  cudaCheck(cudaStreamSynchronize(stream));
1137  numJtiles = h_tileListStat->numJtiles;
1138 
1139  if (h_tileListStat->tilesSizeExceeded) {
1140  reallocCount++;
1141  if (reallocCount > 1) {
1142  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1143  }
1144  }
1145 
1146  }
1147 
1148  numTileLists = h_tileListStat->numTileLists;
1149 
1150  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1151  }
1152 
1153  // Re-allocate tileListVirialEnergy.
1154  // NOTE: Since numTileLists here is an upper estimate (since it's based on bounding boxes),
1155  // we're quaranteed to have enough space
1156  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1157 
1158  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1159  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1160  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1161  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1162  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1163 
1164  int numTileListsSrc = numTileListsPrev;
1165  int numJtilesSrc = numJtiles;
1166  int numTileListsDst = numTileLists;
1167  int numJtilesDst = numJtiles;
1168 
1169  // Sort tiles
1170  sortTileLists(
1171  false,
1172  0, false,
1173  numTileListsSrc, numJtilesSrc,
1174  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1175  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1176  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1177  numTileListsDst, numJtilesDst,
1178  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1179  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1180  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1181  stream);
1182 
1183  // Set active buffer to 2
1184  setActiveBuffer(2);
1185 
1186  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1187 }
1188 
1189 //
1190 // Returns integer log2(a) rounded up
1191 //
1192 int ilog2(int a) {
1193  // if (a < 0)
1194  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1195  int k = 1;
1196  while (a >>= 1) k++;
1197  return k;
1198 }
1199 
1200 //
1201 // Sort tile lists
1202 //
1203 void CudaTileListKernel::sortTileLists(
1204  const bool useJtiles,
1205  const int begin_bit, const bool highDepthBitsSetIn,
1206  // Source
1207  const int numTileListsSrc, const int numJtilesSrc,
1208  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1209  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1210  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1211  // Destination
1212  const int numTileListsDst, const int numJtilesDst,
1213  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1214  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1215  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1216  cudaStream_t stream) {
1217 
1218  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1219 
1220  // if (numTileListsDst == 0)
1221  // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1222 
1223  // Check that the array sizes are adequate
1224  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1225  numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1226  (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1227  (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1228  NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1229 
1230  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1231  numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1232  (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1233  (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1234  NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1235 
1236  if (begin_bit != 0 && begin_bit != 16)
1237  NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1238 
1239  // Number of bits needed in the sort
1240  int num_bit = ilog2(maxTileListLen);
1241  if (num_bit > 16)
1242  NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1243  int end_bit = begin_bit + num_bit;
1244 
1245  if (doStreaming)
1246  {
1247  // ----------------------------------------------------------------------------------------
1248  if (doOutputOrder && useJtiles) {
1249  // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1250  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1251 
1252  // Calculate position from depth
1253  {
1254  // -----------------------------------------------------------------------------
1255  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1256  // -----------------------------------------------------------------------------
1257  if (doShiftDown)
1258  {
1259  int nthread = DEFAULTKERNEL_NUM_THREAD;
1260  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1261  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>> (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1262  cudaCheck(cudaGetLastError());
1263  }
1264 
1265  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1266 
1267  // --------------------------------------------------------------------
1268  // Compute itileList positions to store tileLists
1269  // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
1270  // --------------------------------------------------------------------
1271  {
1272  size_t size = 0;
1273  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1274  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1275  // Make sure tempStorage doesn't remain NULL
1276  if (size == 0) size = 128;
1277  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1278  size = tempStorageSize;
1279  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1280  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1281  }
1282  }
1283 
1284  // Store in reverse order from outputOrder
1285  {
1286  int nthread = DEFAULTKERNEL_NUM_THREAD;
1287  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1288  storeInReverse <<< nblock, nthread, 0, stream >>> (
1289  numTileListsSrc, begin_bit, outputOrder, tileListPos,
1290  tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1291  tileListOrderDst.ptr, tileListDepthDst.ptr);
1292  cudaCheck(cudaGetLastError());
1293  }
1294 
1295  // Build sortKeys
1296  {
1297  maxTileListLen_sortKeys = maxTileListLen;
1298 
1299  reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1300  clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1301 
1302  // Re-allocate and initialize minmaxListLen
1303  {
1304  reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1305 
1306  int nthread = DEFAULTKERNEL_NUM_THREAD;
1307  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1308  initMinMaxListLen <<< nblock, nthread, 0, stream >>> (numComputes, maxTileListLen, minmaxListLen);
1309  cudaCheck(cudaGetLastError());
1310  }
1311 
1312  // Build sortKeys and calculate minmaxListLen
1313  {
1314  int nthread = DEFAULTKERNEL_NUM_THREAD;
1315  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1316  buildSortKeys <<< nblock, nthread, 0, stream >>> (
1317  numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1318  tileListDepthDst.ptr, minmaxListLen, sortKeys);
1319  cudaCheck(cudaGetLastError());
1320 
1321  // Maximum value in sortKeys[] is numTileListsDst - 1
1322  sortKeys_endbit = ilog2(numTileListsDst);
1323  }
1324 
1325  // Fill in missing sortKeys using minmaxListLen
1326  {
1327  int nthread = DEFAULTKERNEL_NUM_THREAD;
1328  int nwarp = nthread/WARPSIZE;
1329  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1330  fillSortKeys <<< nblock, nthread, 0, stream >>> (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1331  cudaCheck(cudaGetLastError());
1332  }
1333 
1334  }
1335 
1336  doOutputOrder = false;
1337 
1338  } else if (doOutputOrder) {
1339  // OutputOrder will be produced in next pairlist non-bond kernel.
1340  // This time just remove zero length lists
1341  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1342 
1343  int endbit_tmp = ilog2(numTileListsSrc);
1344 
1345  // Remove zeros
1346  {
1347  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1348  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1349 
1350  int nthread = DEFAULTKERNEL_NUM_THREAD;
1351  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1352  buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>> (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1353  cudaCheck(cudaGetLastError());
1354  }
1355 
1356  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1357  {
1358  // Short list, sort withing a single thread block
1359  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1360  int nblock = 1;
1361 
1362  unsigned int oobKey = numTileListsSrc;
1363  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>> (
1364  numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1365  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1366  cudaCheck(cudaGetLastError());
1367  }
1368  else
1369  {
1370  // Long list, sort on multiple thread blocks
1371  size_t size = 0;
1372  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1373  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1374  numTileListsSrc, 0, endbit_tmp, stream));
1375  // Make sure tempStorage doesn't remain NULL
1376  if (size == 0) size = 128;
1377  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1378  size = tempStorageSize;
1379  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1380  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1381  numTileListsSrc, 0, endbit_tmp, stream));
1382  }
1383 
1384  // Re-order tileListDepth using tileListOrderDst
1385  {
1386  int nthread = DEFAULTKERNEL_NUM_THREAD;
1387  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1388  reOrderTileListDepth <<< nblock, nthread, 0, stream >>> (numTileListsDst, tileListOrderDst.ptr,
1389  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1390  cudaCheck(cudaGetLastError());
1391  }
1392 
1393  } else {
1394  // This is done during regular MD cycle
1395 
1396  if (sortKeys_endbit <= 0)
1397  NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1398 
1399  // Setup sort keys
1400  {
1401  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1402  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1403 
1404  int nthread = DEFAULTKERNEL_NUM_THREAD;
1405  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1406  setupSortKey <<< nblock, nthread, 0, stream >>> (numTileListsSrc, maxTileListLen_sortKeys,
1407  tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1408  cudaCheck(cudaGetLastError());
1409 
1410  }
1411 
1412  // Global sort
1413  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1414  // if (false)
1415  {
1416  // Short list, sort withing a single thread block
1417  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1418  int nblock = 1;
1419 
1420  unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1421  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>> (
1422  numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1423  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1424  cudaCheck(cudaGetLastError());
1425  }
1426  else
1427  {
1428  // Long list, sort on multiple thread blocks
1429  size_t size = 0;
1430  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1431  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1432  numTileListsSrc, 0, sortKeys_endbit, stream));
1433  // Make sure tempStorage doesn't remain NULL
1434  if (size == 0) size = 128;
1435  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1436  size = tempStorageSize;
1437  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1438  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1439  numTileListsSrc, 0, sortKeys_endbit, stream));
1440  }
1441 
1442  // Re-order tileListDepth using tileListOrderDst
1443  {
1444  int nthread = DEFAULTKERNEL_NUM_THREAD;
1445  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1446  reOrderTileListDepth <<< nblock, nthread, 0, stream >>> (numTileListsDst, tileListOrderDst.ptr, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1447  cudaCheck(cudaGetLastError());
1448  }
1449 
1450  // Local sort
1451  {
1452  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/WARPSIZE+1);
1453  localSort<WARPSIZE> <<< nblock, WARPSIZE, 0, stream >>> (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1454  cudaCheck(cudaGetLastError());
1455 
1456  // No need to shift any more
1457  doShiftDown = false;
1458  }
1459 
1460  }
1461  // ----------------------------------------------------------------------------------------
1462 
1463  } // (doStreaming)
1464  else
1465  {
1466  // --------------------------------------------------------------------
1467  // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1468  // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1469  // --------------------------------------------------------------------
1470  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1471  {
1472  // Short list, sort withing a single thread block
1473  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1474  int nblock = 1;
1475  sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>> (
1476  numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1477  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1478  cudaCheck(cudaGetLastError());
1479 
1480  }
1481  else
1482  {
1483  // Long list, sort on multiple thread blocks
1484  size_t size = 0;
1485  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1486  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1487  numTileListsSrc, begin_bit, end_bit, stream));
1488  // Make sure tempStorage doesn't remain NULL
1489  if (size == 0) size = 128;
1490  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1491  size = tempStorageSize;
1492  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1493  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1494  numTileListsSrc, begin_bit, end_bit, stream));
1495  }
1496  }
1497 
1498  // -----------------------------------------------------------------------------
1499  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1500  // -----------------------------------------------------------------------------
1501  if (doShiftDown)
1502  {
1503  int nthread = DEFAULTKERNEL_NUM_THREAD;
1504  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1505  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>> (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1506  cudaCheck(cudaGetLastError());
1507  }
1508 
1509  // Allocate with +1 to include last term in the exclusive sum
1510  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1511 
1512  // --------------------------------------------------------------------
1513  // Compute itileList positions to store tileLists
1514  // ExclusiveSum(tileListDepthDst[0...numTileListsDst+1])
1515  // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1516  // since this is an exclusive sum. But with this trick,
1517  // tileListPos[numTileListsDst] will contain the total number
1518  // of tile lists
1519  // --------------------------------------------------------------------
1520  {
1521  size_t size = 0;
1522  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1523  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1524  // Make sure tempStorage doesn't remain NULL
1525  if (size == 0) size = 128;
1526  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1527  size = tempStorageSize;
1528  // NOTE: Bug in CUB 1.4.1, stalls here with Geforce GTC Titan X.
1529  // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1530  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1531  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1532  }
1533 
1534  // --------------------------------------------------------------------
1535  // Re-package to
1536  // tileListsDst[0 ... numTileListsDst-1], tileJatomStartDst[0 ... numJtilesDst-1]
1537  // patchPairDst[]
1538  // tileJatomStartDst[]
1539  // tileExclsDst[0 ... numJtilesDst-1]
1540  // --------------------------------------------------------------------
1541  {
1543  int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1544  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1545  repackTileListsKernel <<< nblock, nthread, 0, stream >>> (
1546  numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1547  (useJtiles) ? jtiles : NULL,
1548  tileListsSrc.ptr, tileListsDst.ptr,
1549  patchPairsSrc.ptr, patchPairsDst.ptr,
1550  tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1551  tileExclsSrc.ptr, tileExclsDst.ptr);
1552  cudaCheck(cudaGetLastError());
1553  }
1554 
1555  // Count the number of tileLists that contribute to each patch
1556  if (doStreaming)
1557  {
1558  clear_device_array<int>(patchNumLists, numPatches, stream);
1559 
1560  // Fill in patchNumLists[0...numPatches-1]
1561  int nthread = CALCPATCHNUMLISTSKERNEL_NUM_THREAD;
1562  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1563  calcPatchNumLists <<< nblock, nthread, 0, stream >>> (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1564  cudaCheck(cudaGetLastError());
1565 
1566  // Use emptyPatches[numPatches] as the count variable
1567  clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1568 
1569  // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1570  // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1571  setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>> (numTileListsDst, tileListsDst.ptr, patchNumLists,
1572  numPatches, &emptyPatches[numPatches], emptyPatches);
1573  cudaCheck(cudaGetLastError());
1574 
1575  // // Copy emptyPatches[0 ... numPatches] to host
1576  copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1577  cudaCheck(cudaStreamSynchronize(stream));
1578  numEmptyPatches = h_emptyPatches[numPatches];
1579  }
1580 
1581 }
1582 
1583 //
1584 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
1585 //
1586 void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
1587  // Store previous number of active lists
1588  int numTileListsPrev = numTileLists;
1589 
1590  // Wait for finishTileList() to stop copying
1591  if (!tileListStatEventRecord)
1592  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1593  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1594 
1595  // Get numTileLists, numTileListsGBIS, and numExcluded
1596  {
1597  numTileLists = h_tileListStat->numTileLists;
1598  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1599  numExcluded = h_tileListStat->numExcluded;
1600  }
1601 
1602  // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1603  // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1604  sortTileLists(true, 0, true,
1605  numTileListsPrev, numJtiles,
1606  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1607  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1608  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1609  numTileLists, numJtiles,
1610  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1611  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1612  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1613  stream);
1614 
1615  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1616  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1617  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1618 
1619  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1620 
1621  // NOTE:
1622  // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
1623  // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1624 
1625  if (doGBIS) {
1626  // GBIS is used => produce a second tile list
1627  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1628  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1629  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1630 
1631  sortTileLists(true, 16, true,
1632  numTileListsPrev, numJtiles,
1633  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1634  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1635  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1636  numTileListsGBIS, numJtiles,
1637  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1638  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1639  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1640  stream);
1641  }
1642 
1643  // Set active buffer to be 1
1644  setActiveBuffer(1);
1645 
1646 }
1647 
1648 /*
1649 //
1650 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1651 //
1652 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1653  return;
1654 
1655  if (!doStreaming || !doOutputOrder)
1656  return;
1657 
1658  // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1659  // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1660  sortTileLists(false, 0, true,
1661  numTileLists, numJtiles,
1662  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1663  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1664  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls2Size),
1665  numTileLists, numJtiles,
1666  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1667  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1668  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1669  stream);
1670 
1671  // Set active buffer to be 2
1672  setActiveBuffer(2);
1673 
1674 }
1675 */
1676 
1678  if (len > tileListVirialEnergySize) {
1679  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1680  }
1681  tileListVirialEnergyLength = len;
1682 }
1683 
1685  if (len > tileListVirialEnergySize) {
1686  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1687  }
1688  tileListVirialEnergyGBISLength = len;
1689 }
__global__ void reOrderTileListDepth(const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
BlockLoad::TempStorage load
__global__ void repackTileListsKernel(const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
__global__ void storeInReverse(const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
#define OVERALLOC
__global__ void setupSortKey(const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
void setTileListVirialEnergyLength(int len)
numTileListsSrc
const int const int begin_bit
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ patchPairs
#define CALCPATCHNUMLISTSKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
void clearTileListStat(cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ tileListStat
__global__ void buildTileListsBBKernel(const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
void setTileListVirialEnergyGBISLength(int len)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__global__ void setPatchNumLists_findEmptyPatches(const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
#define REPACKTILELISTSKERNEL_NUM_WARP
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ tileListOrderSrc
#define WARPSIZE
Definition: CudaUtils.h:10
cub::BlockRadixSort< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > BlockRadixSort
#define UPDATEPATCHESKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__thread cudaStream_t stream
__global__ void buildRemoveZerosSortKey(const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
__global__ void calcPatchNumLists(const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
#define SORTTILELISTSKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__global__ void const int const TileList *__restrict__ tileLists
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
unsigned int WarpMask
Definition: CudaUtils.h:11
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ tileListOrder
__global__ void updatePatchesKernel(const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoad
keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
__device__ __forceinline__ float distsq(const BoundingBox a, const float4 b)
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD
const int const int const int const keyT oobKey
__global__ void calcTileListPosKernel(const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
#define DEFAULTKERNEL_NUM_THREAD
#define WARP_BALLOT(MASK, P)
Definition: CudaUtils.h:58
const int numTileListsDst
#define __ldg
void finishTileList(cudaStream_t stream)
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
int getMaxNumBlocks()
Definition: DeviceCUDA.C:419
void NAMD_die(const char *err_msg)
Definition: common.C:83
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
BlockLoadU::TempStorage loadU
float3 offsetXYZ
const int const int const int end_bit
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__global__ void buildBoundingBoxesKernel(const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
__global__ void const int numTileLists
#define BOUNDINGBOXKERNEL_NUM_WARP
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ tileListDepthDst
__shared__ union @43 tempStorage
BlockRadixSort::TempStorage sort
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ tileListDepth
int ilog2(int a)
__global__ void localSort(const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ valT *__restrict__ tileListOrderDst typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU
#define TILELISTKERNELNEW_NUM_WARP
gridSize y
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
gridSize x
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)
const int const int const int const keyT keyT *__restrict__ tileListDepthSrc
void reSortTileLists(const bool doGBIS, cudaStream_t stream)
__global__ void fillSortKeys(const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start