NAMD
CudaTileListKernel.cu File Reference
#include <cuda.h>
#include <namd_cub/device/device_radix_sort.cuh>
#include <namd_cub/device/device_scan.cuh>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"


Macros

#define OVERALLOC   1.2f
 
#define __ldg   *
 
#define BOUNDINGBOXKERNEL_NUM_WARP   8
 
#define TILELISTKERNELNEW_NUM_WARP   4
 
#define REPACKTILELISTSKERNEL_NUM_WARP   32
 
#define SORTTILELISTSKERNEL_NUM_THREAD   512
 
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22
 

Typedefs

typedef cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoad
 
typedef cub::BlockRadixSort< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > BlockRadixSort
 

Functions

void NAMD_die (const char *)
 
__global__ void calcPatchNumLists (const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
 
__global__ void setPatchNumLists_findEmptyPatches (const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
 
__global__ void buildRemoveZerosSortKey (const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
 
__global__ void setupSortKey (const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
 
template<int width>
__global__ void localSort (const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
 
__global__ void storeInReverse (const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
 
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
 
__global__ void initMinMaxListLen (const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
 
__global__ void buildSortKeys (const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
 
__global__ void fillSortKeys (const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
 
__global__ void buildBoundingBoxesKernel (const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
 
__device__ __forceinline__ float distsq (const BoundingBox a, const BoundingBox b)
 
template<int nthread>
__global__ void calcTileListPosKernel (const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
 
template<int nthread>
__global__ void updatePatchesKernel (const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
 
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread (const int maxTileListLen)
 
__global__ void buildTileListsBBKernel (const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
 
__global__ void repackTileListsKernel (const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
 
template<typename keyT , typename valT , bool ascend>
 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel (const int numTileListsSrc, const int numTileListsDst, const int begin_bit, const int end_bit, const keyT oobKey, keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst, valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst)
 
__global__ void reOrderTileListDepth (const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
 
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, unsigned int *__restrict__ tileListDepth)
 
int ilog2 (int a)
 

Variables

valT *__restrict__ tileListOrderDst
 
typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU
 
__thread DeviceCUDA * deviceCUDA
 
const int numTileListsSrc
 
const int numTileListsDst
 
const int begin_bit
 
const int end_bit
 
const keyT oobKey
 
keyT *__restrict__ tileListDepthSrc
 
keyT *__restrict__ tileListDepthDst
 
valT *__restrict__ tileListOrderSrc
 
union {
   BlockLoad::TempStorage   load
 
   BlockLoadU::TempStorage   loadU
 
   BlockRadixSort::TempStorage   sort
 
} tempStorage
 
keyT keys [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
 
valT values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
 
BLOCK_SYNC
 

Macro Definition Documentation

#define __ldg   *
#define BOUNDINGBOXKERNEL_NUM_WARP   8
#define OVERALLOC   1.2f
#define REPACKTILELISTSKERNEL_NUM_WARP   32

Definition at line 541 of file CudaTileListKernel.cu.

Referenced by repackTileListsKernel().

#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Definition at line 634 of file CudaTileListKernel.cu.

#define SORTTILELISTSKERNEL_NUM_THREAD   512

Definition at line 633 of file CudaTileListKernel.cu.

#define TILELISTKERNELNEW_NUM_WARP   4

Definition at line 318 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().
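
The *_NUM_WARP macros size thread blocks in units of warps. A minimal sketch of the implied launch dimensions, assuming 32-thread warps and the WARPSIZE macro from CudaUtils.h (the host-side variable names are illustrative):

// Sketch: block sizes implied by the *_NUM_WARP macros (names illustrative).
const int boundingBoxBlock = BOUNDINGBOXKERNEL_NUM_WARP*WARPSIZE;      // 8 warps  = 256 threads
const int tileListBlock    = TILELISTKERNELNEW_NUM_WARP*WARPSIZE;      // 4 warps  = 128 threads
const int repackBlock      = REPACKTILELISTSKERNEL_NUM_WARP*WARPSIZE;  // 32 warps = 1024 threads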

Typedef Documentation

typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad

Definition at line 646 of file CudaTileListKernel.cu.

typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort

Definition at line 649 of file CudaTileListKernel.cu.

Function Documentation

template<typename keyT , typename valT , bool ascend>
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel ( const int  numTileListsSrc,
const int  numTileListsDst,
const int  begin_bit,
const int  end_bit,
const keyT  oobKey,
keyT *__restrict__  tileListDepthSrc,
keyT *__restrict__  tileListDepthDst,
valT *__restrict__  tileListOrderSrc,
valT *__restrict__  tileListOrderDst 
)

Definition at line 637 of file CudaTileListKernel.cu.
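
Doxygen does not render this kernel's body. Below is a minimal sketch of the CUB pattern implied by this page's typedefs and temp-storage union: load keys padded with oobKey, load values, block-wide radix sort on bits [begin_bit, end_bit), then a striped store. A single block of SORTTILELISTSKERNEL_NUM_THREAD threads holds SORTTILELISTSKERNEL_ITEMS_PER_THREAD items each, so at most 512*22 = 11264 tile lists fit per launch. The store loop and the descending branch are assumptions, not the verbatim body.

// Sketch reconstructed from the documented typedefs; not the verbatim NAMD body.
template <typename keyT, typename valT, bool ascend>
__launch_bounds__(SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
    const int begin_bit, const int end_bit, const keyT oobKey,
    keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst,
    valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst) {

  typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
      SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
      SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
      SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;

  // The load and sort phases never overlap, so their temp storage can share memory.
  __shared__ union {
    typename BlockLoadU::TempStorage loadU;
    typename BlockLoad::TempStorage load;
    typename BlockRadixSort::TempStorage sort;
  } tempStorage;

  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];

  // Out-of-range keys are padded with oobKey so they sort past the valid range.
  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
  BLOCK_SYNC;
  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
  BLOCK_SYNC;

  if (ascend)
    BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
  else
    BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);

  // Striped layout: item i of thread t holds global element i*NUM_THREAD + t.
  for (int i = 0; i < SORTTILELISTSKERNEL_ITEMS_PER_THREAD; i++) {
    int ind = i*SORTTILELISTSKERNEL_NUM_THREAD + threadIdx.x;
    if (ind < numTileListsDst) {
      tileListDepthDst[ind] = keys[i];
      tileListOrderDst[ind] = values[i];
    }
  }
}
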
__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  outputOrder,
const unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 141 of file CudaTileListKernel.cu.

References begin_bit.

143  {
144 
145  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
146  {
147  int j = outputOrder[numTileLists - i - 1];
148  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
149  }
150 
151 }
 
__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
unsigned int *__restrict__  tileListDepth 
)

Definition at line 688 of file CudaTileListKernel.cu.

References begin_bit, and itileList.

689  {
690 
691  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
692  {
693  unsigned int a = tileListDepth[itileList];
694  a >>= begin_bit;
695  a &= 65535;
696  tileListDepth[itileList] = a;
697  }
698 
699 }
 
__global__ void buildBoundingBoxesKernel ( const int  atomStorageSize,
const float4 *__restrict__  xyzq,
BoundingBox *__restrict__  boundingBoxes 
)

Definition at line 234 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, tempStorage, WARPSIZE, BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

235  {
236 
237  const int warpId = threadIdx.x / WARPSIZE;
238  const int wid = threadIdx.x % WARPSIZE;
239 
240  // Loop with warp-aligned index to avoid warp-divergence
241  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
242  // Full atom index
243  const int i = iwarp + wid;
244  // Bounding box index
245  const int ibb = i/WARPSIZE;
246 
247  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
248 
249  volatile float3 minxyz, maxxyz;
250 
251  typedef cub::WarpReduce<float> WarpReduce;
252  __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
253  minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
254  minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
255  minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
256  maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
257  maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
258  maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
259 
260  if (wid == 0) {
261  BoundingBox boundingBox;
262  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
263  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
264  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
265  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
266  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
267  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
268  boundingBoxes[ibb] = boundingBox;
269  }
270  }
271 
272 }
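
Each warp reduces WARPSIZE consecutive atoms into one BoundingBox, so one box is written per 32-atom tile. A minimal host-side launch sketch; nblock and stream are assumptions:

// Sketch: one thread per atom, BOUNDINGBOXKERNEL_NUM_WARP warps per block.
int nthread = BOUNDINGBOXKERNEL_NUM_WARP*WARPSIZE;
int nblock  = (atomStorageSize - 1)/nthread + 1;
buildBoundingBoxesKernel<<<nblock, nthread, 0, stream>>>(
    atomStorageSize, xyzq, boundingBoxes);  // boundingBoxes[i] covers atoms [i*WARPSIZE, (i+1)*WARPSIZE)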
 
__global__ void buildRemoveZerosSortKey ( const int  numTileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
unsigned int *__restrict__  sortKey 
)

Definition at line 72 of file CudaTileListKernel.cu.

References begin_bit, and itileList.

73  {
74 
75  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
76  {
77  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
78  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
79  }
80 
81 }
 
__global__ void buildSortKeys ( const int  numTileListsDst,
const int  maxTileListLen,
const TileList *__restrict__  tileListsSrc,
const int *__restrict__  tileListOrderDst,
const unsigned int *__restrict__  tileListDepthDst,
int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 169 of file CudaTileListKernel.cu.

References x, and y.

173  {
174 
175  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
176  {
177  int k = tileListOrderDst[i];
178  int icompute = tileListsSrc[k].icompute;
179  int depth = tileListDepthDst[i] & 65535;
180  // depth is in range [1 ... maxTileListLen]
181  int j = icompute*maxTileListLen + (depth-1);
182  sortKeys[j] = i;
183  int2 minmax = minmaxListLen[icompute];
184  int2 minmaxOrig = minmax;
185  if (minmax.x > depth) minmax.x = depth;
186  if (minmax.y < depth) minmax.y = depth;
187  if (minmax.x != minmaxOrig.x) {
188  atomicMin(&minmaxListLen[icompute].x, minmax.x);
189  }
190  if (minmax.y != minmaxOrig.y) {
191  atomicMax(&minmaxListLen[icompute].y, minmax.y);
192  }
193  }
194 
195 }
 
__global__ void buildTileListsBBKernel ( const int  numTileLists,
TileList *__restrict__  tileLists,
const CudaPatchRecord *__restrict__  patches,
const int *__restrict__  tileListPos,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const int  maxTileListLen,
const BoundingBox *__restrict__  boundingBoxes,
int *__restrict__  tileJatomStart,
const int  tileJatomStartSize,
unsigned int *__restrict__  tileListDepth,
int *__restrict__  tileListOrder,
PatchPairRecord *__restrict__  patchPairs,
TileListStat *__restrict__  tileListStat 
)

Definition at line 397 of file CudaTileListKernel.cu.

References CudaPatchRecord::atomStart, buildTileListsBBKernel_shmem_sizePerThread(), distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, TileList::iatomStart, TileList::icompute, itileList, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, CudaPatchRecord::numAtoms, CudaPatchRecord::numFreeAtoms, numTileLists, TileList::offsetXYZ, TileList::patchInd, tempStorage, WARP_FULL_MASK, WARPSIZE, and BoundingBox::x.

409  {
410 
411  extern __shared__ char sh_buffer[];
412  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
413  int pos = threadIdx.x*sizePerThread;
414  volatile char* sh_tile = (char*)&sh_buffer[pos];
415 
416  // Loop with warp-aligned index to avoid warp-divergence
417  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
418 
419  // Use one thread per tile list
420  const int wid = threadIdx.x % WARPSIZE;
421  const int itileList = iwarp + wid;
422 
423  int i;
424  int itileListLen = 0;
425  CudaPatchRecord patch1;
426  CudaPatchRecord patch2;
427  float3 offsetXYZ;
428  int2 patchInd;
429  int numTiles2;
430  int icompute;
431 
432  if (itileList < numTileLists) {
433  offsetXYZ = tileLists[itileList].offsetXYZ;
434  patchInd = tileLists[itileList].patchInd;
435  icompute = tileLists[itileList].icompute;
436  // Get i-column
437  i = itileList - tileListPos[icompute];
438 
439  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
440  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
441  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
442 
443  // DH - set zeroShift flag if magnitude of shift vector is zero
444  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
445 
446  // Load patches
447  patch1 = patches[patchInd.x];
448  patch2 = patches[patchInd.y];
449  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
450  numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
451  int tileStart1 = patch1.atomStart/WARPSIZE;
452  int tileStart2 = patch2.atomStart/WARPSIZE;
453 
454  // DH - self requires that zeroShift is also set
455  bool self = zeroShift && (tileStart1 == tileStart2);
456 
457  // Load i-atom data (and shift coordinates)
458  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
459  boundingBoxI.x += shx;
460  boundingBoxI.y += shy;
461  boundingBoxI.z += shz;
462 
463  for (int j=0;j < numTiles2;j++) {
464  sh_tile[j] = 0;
465  if (!self || j >= i) {
466  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
467  float r2bb = distsq(boundingBoxI, boundingBoxJ);
468  if (r2bb < cutoff2) {
469  sh_tile[j] = 1;
470  itileListLen++;
471  }
472  }
473  }
474 
475  tileListDepth[itileList] = (unsigned int)itileListLen;
476  tileListOrder[itileList] = itileList;
477  }
478 
479  typedef cub::WarpScan<int> WarpScan;
480  __shared__ typename WarpScan::TempStorage tempStorage;
481  int active = (itileListLen > 0);
482  int activePos;
483  WarpScan(tempStorage).ExclusiveSum(active, activePos);
484  int itileListPos;
485  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
486 
487  int jtileStart, numJtiles;
488  // Last thread in the warp knows the total number
489  if (wid == WARPSIZE-1) {
490  atomicAdd(&tileListStat->numTileLists, activePos + active);
491  numJtiles = itileListPos + itileListLen;
492  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
493  }
494  numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
495  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
496  if (jtileStart + numJtiles > tileJatomStartSize) {
497  // tileJatomStart out of memory, exit
498  if (wid == 0) tileListStat->tilesSizeExceeded = true;
499  return;
500  }
501 
502  int jStart = itileListPos;
503  int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
504  if (wid == WARPSIZE-1) jEnd = numJtiles;
505 
506  if (itileListLen > 0) {
507  // Setup tileLists[]
508  TileList TLtmp;
509  TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
510  TLtmp.jtileStart = jtileStart + jStart;
511  TLtmp.jtileEnd = jtileStart + jEnd - 1;
512  TLtmp.patchInd = patchInd;
513  TLtmp.offsetXYZ = offsetXYZ;
514  TLtmp.icompute = icompute;
515  // TLtmp.patchNumList.x = 0;
516  // TLtmp.patchNumList.y = 0;
517  tileLists[itileList] = TLtmp;
518  // PatchPair
519  PatchPairRecord patchPair;
520  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
521  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
522  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
523  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
524  patchPairs[itileList] = patchPair;
525 
526  // Write tiles
527  int jtile = jtileStart + jStart;
528  for (int j=0;j < numTiles2;j++) {
529  if (sh_tile[j]) {
530  tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
531  jtile++;
532  }
533  }
534 
535  }
536 
537  }
538 
539 }
 
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread ( const int  maxTileListLen)

Definition at line 388 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists(), and buildTileListsBBKernel().

388  {
389  // Size in bytes
390  int size = (
391  maxTileListLen*sizeof(char)
392  );
393  return size;
394 }
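
The helper returns bytes per thread (one flag byte per possible j-tile), so the dynamic shared-memory size for buildTileListsBBKernel is blockDim.x times this value, matching the kernel's extern __shared__ sh_buffer indexing. A minimal launch sketch, assuming the TILELISTKERNELNEW_NUM_WARP block size; nblock and stream are assumptions:

int nthread = TILELISTKERNELNEW_NUM_WARP*WARPSIZE;
int shmemSize = nthread*buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
buildTileListsBBKernel<<<nblock, nthread, shmemSize, stream>>>(
    numTileLists, tileLists, patches, tileListPos, lata, latb, latc, cutoff2,
    maxTileListLen, boundingBoxes, tileJatomStart, tileJatomStartSize,
    tileListDepth, tileListOrder, patchPairs, tileListStat);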
__global__ void calcPatchNumLists ( const int  numTileLists,
const int  numPatches,
const TileList *__restrict__  tileLists,
int *__restrict__  patchNumLists 
)

Definition at line 32 of file CudaTileListKernel.cu.

33  {
34 
35  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
36  {
37  int2 patchInd = tileLists[i].patchInd;
38  atomicAdd(&patchNumLists[patchInd.x], 1);
39  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
40  }
41 
42 }
 
template<int nthread>
__global__ void calcTileListPosKernel ( const int  numComputes,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
int *__restrict__  tilePos 
)

Definition at line 324 of file CudaTileListKernel.cu.

References BLOCK_SYNC, tempStorage, and WARPSIZE.

327  {
328 
329  typedef cub::BlockScan<int, nthread> BlockScan;
330 
331  __shared__ typename BlockScan::TempStorage tempStorage;
332  __shared__ int shTilePos0;
333 
334  if (threadIdx.x == nthread-1) {
335  shTilePos0 = 0;
336  }
337 
338  for (int base=0;base < numComputes;base+=nthread) {
339  int k = base + threadIdx.x;
340 
341  int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
342 
343  // Calculate positions in tile list and jtile list
344  int tilePosVal;
345  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
346 
347  // Store into global memory
348  if (k < numComputes) {
349  tilePos[k] = shTilePos0 + tilePosVal;
350  }
351 
352  BLOCK_SYNC;
353  // Store block end position
354  if (threadIdx.x == nthread-1) {
355  shTilePos0 += tilePosVal + numTiles1;
356  }
357  }
358 }
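
Because the running offset shTilePos0 lives in block shared memory and is carried across loop iterations, the scan is only coherent within a single thread block. A minimal launch sketch under that assumption (nthread choice and stream are illustrative):

// Sketch: one block of nthread threads scans all computes in nthread-sized chunks.
const int nthread = 1024;
calcTileListPosKernel<nthread><<<1, nthread, 0, stream>>>(
    numComputes, computes, patches, tilePos);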
 
__device__ __forceinline__ float distsq ( const BoundingBox  a,
const BoundingBox  b 
)

Definition at line 277 of file CudaTileListKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

277  {
278  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
279  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
280  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
281  float r2 = dx*dx + dy*dy + dz*dz;
282  return r2;
283 }
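
distsq is the squared gap between two bounding boxes: zero when they overlap, otherwise the squared distance between their nearest faces. A quick worked check, assuming BoundingBox lays out its fields in the order x, y, z, wx, wy, wz:

// Centers 3 apart on x, half-widths 1 each:
// dx = max(0, |0-3| - 1 - 1) = 1, dy = dz = 0, so distsq == 1.0f
BoundingBox a = {0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f};
BoundingBox b = {3.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f};
float r2 = distsq(a, b);  // 1.0f; the tile pair is kept when r2 < cutoff2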
__global__ void fillSortKeys ( const int  numComputes,
const int  maxTileListLen,
const int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 197 of file CudaTileListKernel.cu.

References WARPSIZE.

198  {
199 
200  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
201  const int wid = threadIdx.x % WARPSIZE;
202  int2 minmax = minmaxListLen[i];
203  int minlen = minmax.x;
204  int maxlen = minmax.y;
205  // minlen, maxlen are in range [1 ... maxTileListLen]
206  // as long as i is in tileListsSrc[].icompute above
207  if ( maxlen < minlen ) {
208  minlen = 1;
209  maxlen = maxTileListLen;
210  }
211  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
212  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
213  unsigned int aveKey = (maxKey + minKey)/2;
214  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
215  sortKeys[i*maxTileListLen + j] = minKey;
216  }
217  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
218  sortKeys[i*maxTileListLen + j] = maxKey;
219  }
220  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
221  if (sortKeys[i*maxTileListLen + j] == 0) {
222  sortKeys[i*maxTileListLen + j] = aveKey;
223  }
224  }
225  }
226 
227 }
 
int ilog2 ( int  a)

Definition at line 1156 of file CudaTileListKernel.cu.

1156  {
1157  // if (a < 0)
1158  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1159  int k = 1;
1160  while (a >>= 1) k++;
1161  return k;
1162 }
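
Note that for a > 0 this returns the bit width floor(log2 a) + 1, not floor(log2 a); the extra bit is harmless where the result bounds a radix-sort bit span:

// ilog2(1) == 1, ilog2(2) == 2, ilog2(16) == 5, ilog2(65535) == 16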
__global__ void initMinMaxListLen ( const int  numComputes,
const int  maxTileListLen,
int2 *__restrict__  minmaxListLen 
)

Definition at line 153 of file CudaTileListKernel.cu.

154  {
155 
156  int2 val;
157  val.x = maxTileListLen+1;
158  val.y = 0;
159  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
160  {
161  minmaxListLen[i] = val;
162  }
163 
164 }
template<int width>
__global__ void localSort ( const int  n,
const int  begin_bit,
const int  num_bit,
unsigned int *__restrict__  keys,
int *__restrict__  vals 
)

Definition at line 98 of file CudaTileListKernel.cu.

References begin_bit, BLOCK_SYNC, and tempStorage.

99  {
100 
101  // NOTE: blockDim.x = width
102 
103  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
104  {
105  int i = base + threadIdx.x;
106  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
107  __shared__ typename BlockRadixSort::TempStorage tempStorage;
108  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
109  int val[1] = {(i < n) ? vals[i] : 0};
110  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
111  if (i < n) {
112  keys[i] = key[0];
113  vals[i] = val[0];
114  }
115  BLOCK_SYNC;
116  }
117 
118 }
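
The template parameter width must equal the block size (the body notes blockDim.x = width); each block sorts its own width-element chunk in descending order on the low num_bit bits of the shifted, masked key. A minimal launch sketch; nblock and stream are assumptions:

// Sketch: width must match blockDim.x; each block sorts one width-sized chunk.
const int width = 256;
int nblock = (n - 1)/width + 1;
localSort<width><<<nblock, width, 0, stream>>>(n, begin_bit, num_bit, keys, vals);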
 
void NAMD_die ( const char *  )

Definition at line 83 of file common.C.

84 {
85  if ( ! err_msg ) err_msg = "(unknown error)";
86  CkPrintf("FATAL ERROR: %s\n", err_msg);
87  fflush(stdout);
88  char repstr[24] = "";
89  if (CmiNumPartitions() > 1) {
90  sprintf(repstr,"REPLICA %d ", CmiMyPartition());
91  }
92  CkError("%sFATAL ERROR: %s\n", repstr, err_msg);
93 #if CHARM_VERSION < 61000
94  CkExit();
95 #else
96  CkExit(1);
97 #endif
98 }
__global__ void reOrderTileListDepth ( const int  numTileLists,
const int *__restrict__  tileListOrder,
unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 674 of file CudaTileListKernel.cu.

675  {
676 
677  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
678  {
679  int j = tileListOrder[i];
680  tileListDepthDst[i] = tileListDepthSrc[j];
681  }
682 
683 }
 
__global__ void repackTileListsKernel ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrder,
const int *__restrict__  jtiles,
const TileList *__restrict__  tileListsSrc,
TileList *__restrict__  tileListsDst,
const PatchPairRecord *__restrict__  patchPairsSrc,
PatchPairRecord *__restrict__  patchPairsDst,
const int *__restrict__  tileJatomStartSrc,
int *__restrict__  tileJatomStartDst,
const TileExcl *__restrict__  tileExclsSrc,
TileExcl *__restrict__  tileExclsDst 
)

Definition at line 543 of file CudaTileListKernel.cu.

References __ldg, begin_bit, TileList::iatomStart, TileList::icompute, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, TileList::patchInd, REPACKTILELISTSKERNEL_NUM_WARP, WARP_BALLOT, WARP_FULL_MASK, and WARPSIZE.

549  {
550 
551  const int wid = threadIdx.x % WARPSIZE;
552 
553  // One warp does one tile list
554  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
555  {
556  int j = tileListOrder[i];
557  int start = tileListPos[i];
558  int end = tileListPos[i+1]-1;
559  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
560  // TileList
561  int startOld = __ldg(&tileListsSrc[j].jtileStart);
562  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
563  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
564  float3 offsetXYZ;
565  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
566  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
567  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
568  int2 patchInd = tileListsSrc[j].patchInd;
569  int icompute = __ldg(&tileListsSrc[j].icompute);
570  if (wid == 0) {
571  TileList tileList;
572  tileList.iatomStart = iatomStart;
573  tileList.offsetXYZ = offsetXYZ;
574  tileList.jtileStart = start;
575  tileList.jtileEnd = end;
576  tileList.patchInd = patchInd;
577  tileList.icompute = icompute;
578  tileListsDst[i] = tileList;
579  }
580 
581  if (jtiles == NULL) {
582  // No jtiles, simple copy will do
583  int jtile = start;
584  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
585  if (jtileOld + wid <= endOld) {
586  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
587  }
588  }
589  if (tileExclsSrc != NULL) {
590  int jtile = start;
591  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
592  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
593  }
594  }
595  } else {
596  int jtile0 = start;
597  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
598  int t = jtileOld + wid;
599  int jtile = (t <= endOld) ? jtiles[t] : 0;
600  jtile >>= begin_bit;
601  jtile &= 65535;
602  typedef cub::WarpScan<int> WarpScan;
603  __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
604  int warpId = threadIdx.x / WARPSIZE;
605  int jtilePos;
606  WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
607 
608  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
609 
610  if (tileExclsSrc != NULL) {
611  unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
612  while (b != 0) {
613  // k = index of thread that has data
614  int k = __ffs(b) - 1;
615  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
616  // remove 1 bit and advance jtile0
617  b ^= ((unsigned int)1 << k);
618  jtile0++;
619  }
620  } else {
621  jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
622  }
623  }
624  }
625  }
626 
627 }
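
In the exclusion-copy path the warp serializes over the set bits of a ballot: __ffs selects the lowest lane whose jtile survived, all WARPSIZE lanes cooperatively copy that tile's excl[] row to the next packed slot, and the bit is cleared. Distilled to its core (hasData, src, dst, base, and pos are illustrative names, not from this file):

// Warp-cooperative stream compaction sketch (pattern only).
unsigned int b = WARP_BALLOT(WARP_FULL_MASK, hasData);  // one bit per live lane
while (b != 0) {
  int k = __ffs(b) - 1;                          // lowest lane that still has data
  dst[pos].excl[wid] = src[base + k].excl[wid];  // all lanes copy lane k's row
  b ^= (1u << k);                                // retire lane k
  pos++;                                         // advance packed output position
}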
 
__global__ void setPatchNumLists_findEmptyPatches ( const int  numTileLists,
TileList *__restrict__  tileLists,
const int *__restrict__  patchNumLists,
const int  numPatches,
int *__restrict__  numEmptyPatches,
int *__restrict__  emptyPatches 
)

Definition at line 48 of file CudaTileListKernel.cu.

50  {
51 
52  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
53  {
54  int2 patchInd = tileLists[i].patchInd;
55  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
56  tileLists[i].patchNumList = patchNumList;
57  }
58 
59  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
60  {
61  if (patchNumLists[i] == 0) {
62  int ind = atomicAdd(numEmptyPatches, 1);
63  emptyPatches[ind] = i;
64  }
65  }
66 
67 }
 
__global__ void setupSortKey ( const int  numTileLists,
const int  maxTileListLen,
const TileList *__restrict__  tileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
const unsigned int *__restrict__  sortKeys,
unsigned int *__restrict__  sortKey 
)

Definition at line 83 of file CudaTileListKernel.cu.

References itileList.

85  {
86 
87  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
88  {
89  int icompute = tileLists[itileList].icompute;
90  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
91  int i = icompute*maxTileListLen + (depth - 1);
92  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
93  }
94 
95 }
 
__global__ void storeInReverse ( const int  numTileListsSrc,
const int  begin_bit,
const int *__restrict__  outputOrder,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrderSrc,
const unsigned int *__restrict__  tileListDepthSrc,
int *__restrict__  tileListOrderDst,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 120 of file CudaTileListKernel.cu.

125  {
126 
127  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
128  {
129  int j = outputOrder[numTileListsSrc - i - 1];
130  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
131  int k = tileListPos[i];
132  tileListDepthDst[k] = tileListDepthSrc[j];
133  tileListOrderDst[k] = j; //tileListOrderSrc[j];
134  }
135  }
136 }
 
template<int nthread>
__global__ void updatePatchesKernel ( const int  numComputes,
const int *__restrict__  tilePos,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
TileList *__restrict__  tileLists 
)

Definition at line 362 of file CudaTileListKernel.cu.

References CudaComputeRecord::offsetXYZ, CudaComputeRecord::patchInd, and WARPSIZE.

366  {
367 
368  const int tid = threadIdx.x % nthread;
369 
370  // nthread threads takes care of one compute
371  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
372  {
373  CudaComputeRecord compute = computes[k];
374  float3 offsetXYZ = compute.offsetXYZ;
375  int2 patchInd = compute.patchInd;
376  int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
377  int itileList0 = tilePos[k];
378  for (int i=tid;i < numTiles1;i+=nthread) {
379  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
380  tileLists[itileList0 + i].patchInd = patchInd;
381  tileLists[itileList0 + i].icompute = k;
382  }
383  }
384 
385 }
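
Here nthread threads cooperate on one compute, so blockDim.x must be a multiple of nthread. A minimal launch sketch; the warp-sized instantiation, nblock, and stream are assumptions:

// Sketch: WARPSIZE threads per compute; blockDim.x is a multiple of nthread.
const int nthread = WARPSIZE;
int nblock = (numComputes*nthread - 1)/512 + 1;
updatePatchesKernel<nthread><<<nblock, 512, 0, stream>>>(
    numComputes, tilePos, computes, patches, tileLists);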

Variable Documentation

const int begin_bit
BLOCK_SYNC
typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU

Definition at line 637 of file CudaTileListKernel.cu.

__thread DeviceCUDA* deviceCUDA

Definition at line 18 of file DeviceCUDA.C.

const int end_bit

Definition at line 637 of file CudaTileListKernel.cu.

keyT keys [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]

Definition at line 657 of file CudaTileListKernel.cu.

BlockLoad::TempStorage load
BlockLoadU::TempStorage loadU

Definition at line 653 of file CudaTileListKernel.cu.

const int numTileListsDst

Definition at line 637 of file CudaTileListKernel.cu.

const int numTileListsSrc

Definition at line 660 of file CudaTileListKernel.cu.

const keyT oobKey

Definition at line 637 of file CudaTileListKernel.cu.

BlockRadixSort::TempStorage sort
__shared__ union { ... } tempStorage
keyT* __restrict__ tileListDepthDst

Definition at line 637 of file CudaTileListKernel.cu.

keyT* __restrict__ tileListDepthSrc

Definition at line 637 of file CudaTileListKernel.cu.

valT* __restrict__ tileListOrderSrc

Definition at line 637 of file CudaTileListKernel.cu.

valT values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]

Definition at line 658 of file CudaTileListKernel.cu.

Referenced by Controller::printEnergies(), and Parameters::read_parm().