CudaTileListKernel.cu File Reference

#include <cuda.h>
#include <namd_cub/device/device_radix_sort.cuh>
#include <namd_cub/device/device_scan.cuh>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"


Defines

#define OVERALLOC   1.2f
#define __ldg   *
#define BOUNDINGBOXKERNEL_NUM_WARP   8
#define TILELISTKERNELNEW_NUM_WARP   4
#define REPACKTILELISTSKERNEL_NUM_WARP   32
#define SORTTILELISTSKERNEL_NUM_THREAD   512
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Typedefs

typedef cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoad
typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU
typedef cub::BlockRadixSort< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > BlockRadixSort

Functions

void NAMD_die (const char *)
__global__ void calcPatchNumLists (const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
__global__ void setPatchNumLists_findEmptyPatches (const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
__global__ void buildRemoveZerosSortKey (const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
__global__ void setupSortKey (const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
template<int width>
__global__ void localSort (const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
__global__ void storeInReverse (const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__global__ void initMinMaxListLen (const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
__global__ void buildSortKeys (const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void fillSortKeys (const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void buildBoundingBoxesKernel (const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
__device__ __forceinline__ float distsq (const BoundingBox a, const BoundingBox b)
template<int nthread>
__global__ void calcTileListPosKernel (const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
template<int nthread>
__global__ void updatePatchesKernel (const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread (const int maxTileListLen)
__global__ void buildTileListsBBKernel (const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
__global__ void repackTileListsKernel (const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
template<typename keyT , typename valT , bool ascend>
 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel (const int numTileListsSrc, const int numTileListsDst, const int begin_bit, const int end_bit, const keyT oobKey, keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst, valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst)
__global__ void reOrderTileListDepth (const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, unsigned int *__restrict__ tileListDepth)
int ilog2 (int a)

Variables

__thread DeviceCUDA * deviceCUDA

Define Documentation

#define __ldg   *

Compatibility fallback for builds where the __ldg() read-only-cache intrinsic is unavailable: __ldg(&x) then expands to the plain load *(&x).

Referenced by repackTileListsKernel().

#define BOUNDINGBOXKERNEL_NUM_WARP   8

Referenced by buildBoundingBoxesKernel().

#define OVERALLOC   1.2f

Definition at line 21 of file CudaTileListKernel.cu.

#define REPACKTILELISTSKERNEL_NUM_WARP   32

Definition at line 541 of file CudaTileListKernel.cu.

Referenced by repackTileListsKernel().

#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Definition at line 634 of file CudaTileListKernel.cu.

#define SORTTILELISTSKERNEL_NUM_THREAD   512

Definition at line 633 of file CudaTileListKernel.cu.

#define TILELISTKERNELNEW_NUM_WARP   4

Definition at line 318 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().


Typedef Documentation

typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad

Definition at line 646 of file CudaTileListKernel.cu.

typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU

typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort

Definition at line 649 of file CudaTileListKernel.cu.


Function Documentation

template<typename keyT , typename valT , bool ascend>
__launch_bounds__ ( SORTTILELISTSKERNEL_NUM_THREAD, 1 ) __global__ void sortTileListsKernel ( const int numTileListsSrc, const int numTileListsDst, const int begin_bit, const int end_bit, const keyT oobKey, keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst, valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst ) [inline]

Definition at line 637 of file CudaTileListKernel.cu.
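
Loosely, this kernel loads a block-sized tile of keys (tile-list depths) and values (tile-list order indices) through the BlockLoad typedefs above, padding out-of-range slots with oobKey, block-radix-sorts them ascending or descending according to the ascend template parameter (SortBlockedToStriped vs. SortDescendingBlockedToStriped), and stores the result striped. A minimal self-contained sketch of that pattern follows; the concrete key/value types, the kernel name blockSortSketch, and the striped store loop are assumptions, not the NAMD source:

#include <cub/cub.cuh>

#define NUM_THREAD 512        // mirrors SORTTILELISTSKERNEL_NUM_THREAD
#define ITEMS_PER_THREAD 22   // mirrors SORTTILELISTSKERNEL_ITEMS_PER_THREAD

__launch_bounds__(NUM_THREAD, 1) __global__
void blockSortSketch(const unsigned int* keysIn, const int* valsIn,
                     unsigned int* keysOut, int* valsOut,
                     const int n, const int begin_bit, const int end_bit,
                     const unsigned int oobKey) {
  typedef cub::BlockLoad<unsigned int, NUM_THREAD, ITEMS_PER_THREAD,
    cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
  typedef cub::BlockLoad<int, NUM_THREAD, ITEMS_PER_THREAD,
    cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
  typedef cub::BlockRadixSort<unsigned int, NUM_THREAD, ITEMS_PER_THREAD, int>
    BlockRadixSort;

  // The load and sort phases run one after another, so they can share storage.
  __shared__ union {
    BlockLoadU::TempStorage loadU;
    BlockLoad::TempStorage load;
    BlockRadixSort::TempStorage sort;
  } tempStorage;

  unsigned int keys[ITEMS_PER_THREAD];
  int values[ITEMS_PER_THREAD];

  // Out-of-range keys are padded with oobKey so they sort to the far end.
  BlockLoadU(tempStorage.loadU).Load(keysIn, keys, n, oobKey);
  __syncthreads();
  BlockLoad(tempStorage.load).Load(valsIn, values, n);
  __syncthreads();

  BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);

  // Striped layout: item k of thread t holds element k*NUM_THREAD + t.
  for (int k = 0; k < ITEMS_PER_THREAD; k++) {
    int i = k * NUM_THREAD + threadIdx.x;
    if (i < n) { keysOut[i] = keys[k]; valsOut[i] = values[k]; }
  }
}
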
__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
unsigned int *__restrict__  tileListDepth 
)

Definition at line 688 of file CudaTileListKernel.cu.

References itileList.

00689                                             {
00690 
00691   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
00692   {
00693     unsigned int a = tileListDepth[itileList];
00694     a >>= begin_bit;
00695     a &= 65535;
00696     tileListDepth[itileList] = a;
00697   }
00698 
00699 }

__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  outputOrder,
const unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 141 of file CudaTileListKernel.cu.

References j.

00143                                                {
00144 
00145   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
00146   {
00147     int j = outputOrder[numTileLists - i - 1];
00148     tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
00149   }
00150 
00151 }
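
This variant writes 0/1 survival flags in reversed output order; an exclusive prefix sum over such flags yields packed destination slots like the tileListPos array consumed by storeInReverse(). A host-side sketch using the usual two-phase cub::DeviceScan call (the helper name buildTileListPosSketch is hypothetical):

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Hypothetical helper: exclusive-scan 0/1 flags into packed output positions.
void buildTileListPosSketch(const unsigned int* d_flags, int* d_pos, int n,
                            cudaStream_t stream) {
  void* d_temp = NULL;
  size_t tempBytes = 0;
  // First call only computes the temporary-storage size; second call scans.
  cub::DeviceScan::ExclusiveSum(d_temp, tempBytes, d_flags, d_pos, n, stream);
  cudaMalloc(&d_temp, tempBytes);
  cub::DeviceScan::ExclusiveSum(d_temp, tempBytes, d_flags, d_pos, n, stream);
  cudaFree(d_temp);
}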

__global__ void buildBoundingBoxesKernel ( const int  atomStorageSize,
const float4 *__restrict__  xyzq,
BoundingBox *__restrict__  boundingBoxes 
)

Definition at line 234 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, tempStorage, WARPSIZE, BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00235                                            {
00236 
00237   const int warpId = threadIdx.x / WARPSIZE;
00238   const int wid = threadIdx.x % WARPSIZE;
00239 
00240   // Loop with warp-aligned index to avoid warp-divergence
00241   for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
00242     // Full atom index
00243     const int i = iwarp + wid;
00244     // Bounding box index
00245     const int ibb = i/WARPSIZE;
00246 
00247     float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
00248 
00249     volatile float3 minxyz, maxxyz;
00250 
00251     typedef cub::WarpReduce<float> WarpReduce;
00252     __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
00253     minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
00254     minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
00255     minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
00256     maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
00257     maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
00258     maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
00259 
00260     if (wid == 0) {
00261       BoundingBox boundingBox;
00262       boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
00263       boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
00264       boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
00265       boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
00266       boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
00267       boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
00268       boundingBoxes[ibb] = boundingBox;
00269     }
00270   }
00271 
00272 }
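
Each warp reduces WARPSIZE consecutive atoms into one bounding box, so the launch only needs to cover atomStorageSize atoms. A host-side launch sketch; the wrapper name callBuildBoundingBoxes is hypothetical and WARPSIZE is taken as 32 here:

#include <cuda_runtime.h>

// Hypothetical wrapper: one warp per WARPSIZE-atom tile, 8 warps per block.
void callBuildBoundingBoxes(const int atomStorageSize, const float4* d_xyzq,
                            BoundingBox* d_boundingBoxes, cudaStream_t stream) {
  const int warpsize = 32;                              // assumed WARPSIZE
  int nthread = BOUNDINGBOXKERNEL_NUM_WARP * warpsize;  // 256 threads per block
  int nblock  = (atomStorageSize - 1) / nthread + 1;    // cover all atoms
  buildBoundingBoxesKernel<<<nblock, nthread, 0, stream>>>(
    atomStorageSize, d_xyzq, d_boundingBoxes);
}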

__global__ void buildRemoveZerosSortKey ( const int  numTileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
unsigned int *__restrict__  sortKey 
)

Definition at line 72 of file CudaTileListKernel.cu.

References itileList.

00073                                                                                                            {
00074 
00075   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
00076   {
00077     int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
00078     sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
00079   }
00080 
00081 }
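
Non-empty lists keep their own index as the key while empty lists get numTileLists, so a stable ascending radix sort over these keys moves every empty list to the tail while preserving the relative order of the rest. A host-side sketch using the two-phase cub::DeviceRadixSort call (the wrapper name sortAwayZeroDepthLists is hypothetical):

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Hypothetical wrapper: sort tile-list indices by the keys built above.
void sortAwayZeroDepthLists(int numTileLists,
                            unsigned int* d_keysIn, unsigned int* d_keysOut,
                            int* d_orderIn, int* d_orderOut, cudaStream_t stream) {
  void* d_temp = NULL;
  size_t tempBytes = 0;
  // First call sizes the temporary buffer, second call sorts.
  cub::DeviceRadixSort::SortPairs(d_temp, tempBytes, d_keysIn, d_keysOut,
                                  d_orderIn, d_orderOut, numTileLists, 0, 32, stream);
  cudaMalloc(&d_temp, tempBytes);
  cub::DeviceRadixSort::SortPairs(d_temp, tempBytes, d_keysIn, d_keysOut,
                                  d_orderIn, d_orderOut, numTileLists, 0, 32, stream);
  cudaFree(d_temp);
}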

__global__ void buildSortKeys ( const int  numTileListsDst,
const int  maxTileListLen,
const TileList *__restrict__  tileListsSrc,
const int *__restrict__  tileListOrderDst,
const unsigned int *__restrict__  tileListDepthDst,
int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 169 of file CudaTileListKernel.cu.

References j.

00173                                                                          {
00174 
00175   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
00176   {
00177     int k = tileListOrderDst[i];
00178     int icompute = tileListsSrc[k].icompute;
00179     int depth    = tileListDepthDst[i] & 65535;
00180     // depth is in range [1 ... maxTileListLen]
00181     int j        = icompute*maxTileListLen + (depth-1);
00182     sortKeys[j] = i;
00183     int2 minmax = minmaxListLen[icompute];
00184     int2 minmaxOrig = minmax;
00185     if (minmax.x > depth) minmax.x = depth;
00186     if (minmax.y < depth) minmax.y = depth;
00187     if (minmax.x != minmaxOrig.x) {
00188       atomicMin(&minmaxListLen[icompute].x, minmax.x);
00189     }
00190     if (minmax.y != minmaxOrig.y) {
00191       atomicMax(&minmaxListLen[icompute].y, minmax.y);
00192     }
00193   }
00194 
00195 }

__global__ void buildTileListsBBKernel ( const int  numTileLists,
TileList *__restrict__  tileLists,
const CudaPatchRecord *__restrict__  patches,
const int *__restrict__  tileListPos,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const int  maxTileListLen,
const BoundingBox *__restrict__  boundingBoxes,
int *__restrict__  tileJatomStart,
const int  tileJatomStartSize,
unsigned int *__restrict__  tileListDepth,
int *__restrict__  tileListOrder,
PatchPairRecord *__restrict__  patchPairs,
TileListStat *__restrict__  tileListStat 
)

Definition at line 397 of file CudaTileListKernel.cu.

References CudaPatchRecord::atomStart, buildTileListsBBKernel_shmem_sizePerThread(), distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, TileList::iatomStart, TileList::icompute, itileList, j, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, CudaPatchRecord::numAtoms, CudaPatchRecord::numFreeAtoms, TileList::offsetXYZ, TileList::patchInd, WARP_FULL_MASK, and WARPSIZE.

00409                                            {
00410 
00411   extern __shared__ char sh_buffer[];
00412   int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
00413   int pos = threadIdx.x*sizePerThread;
00414   volatile char* sh_tile = (char*)&sh_buffer[pos];
00415 
00416   // Loop with warp-aligned index to avoid warp-divergence
00417   for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
00418 
00419     // Use one thread per tile list
00420     const int wid = threadIdx.x % WARPSIZE;
00421     const int itileList = iwarp + wid;
00422 
00423     int i;
00424     int itileListLen = 0;
00425     CudaPatchRecord patch1;
00426     CudaPatchRecord patch2;
00427     float3 offsetXYZ;
00428     int2 patchInd;
00429     int numTiles2;
00430     int icompute;
00431 
00432     if (itileList < numTileLists) {
00433       offsetXYZ = tileLists[itileList].offsetXYZ;
00434       patchInd  = tileLists[itileList].patchInd;
00435       icompute  = tileLists[itileList].icompute;
00436       // Get i-column
00437       i = itileList - tileListPos[icompute];
00438 
00439       float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
00440       float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
00441       float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
00442 
00443       // DH - set zeroShift flag if magnitude of shift vector is zero
00444       bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
00445 
00446       // Load patches
00447       patch1 = patches[patchInd.x];
00448       patch2 = patches[patchInd.y];
00449       // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
00450       numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
00451       int tileStart1 = patch1.atomStart/WARPSIZE;
00452       int tileStart2 = patch2.atomStart/WARPSIZE;
00453 
00454       // DH - self requires that zeroShift is also set
00455       bool self = zeroShift && (tileStart1 == tileStart2);
00456 
00457       // Load i-atom data (and shift coordinates)
00458       BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
00459       boundingBoxI.x += shx;
00460       boundingBoxI.y += shy;
00461       boundingBoxI.z += shz;
00462 
00463       for (int j=0;j < numTiles2;j++) {
00464         sh_tile[j] = 0;
00465         if (!self || j >= i) {
00466           BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
00467           float r2bb = distsq(boundingBoxI, boundingBoxJ);
00468           if (r2bb < cutoff2) {
00469             sh_tile[j] = 1;
00470             itileListLen++;
00471           }
00472         }
00473       }
00474 
00475       tileListDepth[itileList] = (unsigned int)itileListLen;
00476       tileListOrder[itileList] = itileList;
00477     }
00478 
00479     typedef cub::WarpScan<int> WarpScan;
00480     __shared__ typename WarpScan::TempStorage tempStorage;
00481     int active = (itileListLen > 0);
00482     int activePos;
00483     WarpScan(tempStorage).ExclusiveSum(active, activePos);
00484     int itileListPos;
00485     WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
00486 
00487     int jtileStart, numJtiles;
00488     // Last thread in the warp knows the total number
00489     if (wid == WARPSIZE-1) {
00490       atomicAdd(&tileListStat->numTileLists, activePos + active);
00491       numJtiles = itileListPos + itileListLen;
00492       jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
00493     }
00494     numJtiles  = cub::ShuffleIndex<WARPSIZE>(numJtiles,  WARPSIZE-1, WARP_FULL_MASK);
00495     jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);    
00496     if (jtileStart + numJtiles > tileJatomStartSize) {
00497       // tileJatomStart out of memory, exit
00498       if (wid == 0) tileListStat->tilesSizeExceeded = true;
00499       return;
00500     }
00501 
00502     int jStart = itileListPos;
00503     int jEnd   = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);    
00504     if (wid == WARPSIZE-1) jEnd = numJtiles;
00505 
00506     if (itileListLen > 0) {
00507       // Setup tileLists[]
00508       TileList TLtmp;
00509       TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
00510       TLtmp.jtileStart = jtileStart + jStart;
00511       TLtmp.jtileEnd   = jtileStart + jEnd - 1;
00512       TLtmp.patchInd   = patchInd;
00513       TLtmp.offsetXYZ  = offsetXYZ;
00514       TLtmp.icompute   = icompute;
00515       // TLtmp.patchNumList.x = 0;
00516       // TLtmp.patchNumList.y = 0;
00517       tileLists[itileList] = TLtmp;
00518       // PatchPair
00519       PatchPairRecord patchPair;
00520       patchPair.iatomSize     = patch1.atomStart + patch1.numAtoms;
00521       patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
00522       patchPair.jatomSize     = patch2.atomStart + patch2.numAtoms;
00523       patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
00524       patchPairs[itileList] = patchPair;
00525 
00526       // Write tiles
00527       int jtile = jtileStart + jStart;
00528       for (int j=0;j < numTiles2;j++) {
00529         if (sh_tile[j]) {
00530           tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
00531           jtile++;
00532         }
00533       }
00534 
00535     }
00536 
00537   }
00538 
00539 }

__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread ( const int  maxTileListLen  ) 

Definition at line 388 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists(), and buildTileListsBBKernel().

00388                                                                          {
00389   // Size in bytes
00390   int size = (
00391     maxTileListLen*sizeof(char)
00392     );
00393   return size;
00394 }
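
Each thread needs maxTileListLen bytes of scratch for its sh_tile flags, so the dynamic shared-memory size at launch is this per-thread size times the block size. A launch sketch; the wrapper name launchBuildTileListsBB and the grid/block sizing are assumptions, not NAMD's actual host code:

#include <cuda_runtime.h>

// Hypothetical wrapper: sizes dynamic shared memory from the helper above.
void launchBuildTileListsBB(int numTileLists, int maxTileListLen, cudaStream_t stream) {
  int nthread = TILELISTKERNELNEW_NUM_WARP * 32;   // assumed WARPSIZE == 32
  int nblock  = (numTileLists - 1) / nthread + 1;
  int shmemSize =
    buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen) * nthread;
  // buildTileListsBBKernel<<<nblock, nthread, shmemSize, stream>>>(
  //   numTileLists, /* remaining arguments as documented above */ ...);
}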

__global__ void calcPatchNumLists ( const int  numTileLists,
const int  numPatches,
const TileList *__restrict__  tileLists,
int *__restrict__  patchNumLists 
)

Definition at line 32 of file CudaTileListKernel.cu.

00033                                                                            {
00034 
00035   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
00036   {
00037     int2 patchInd = tileLists[i].patchInd;
00038     atomicAdd(&patchNumLists[patchInd.x], 1);
00039     if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
00040   }
00041 
00042 }

template<int nthread>
__global__ void calcTileListPosKernel ( const int  numComputes,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
int *__restrict__  tilePos 
) [inline]

Definition at line 324 of file CudaTileListKernel.cu.

References BLOCK_SYNC, and WARPSIZE.

00327                              {
00328 
00329   typedef cub::BlockScan<int, nthread> BlockScan;
00330 
00331   __shared__ typename BlockScan::TempStorage tempStorage;
00332   __shared__ int shTilePos0;
00333 
00334   if (threadIdx.x == nthread-1) {
00335     shTilePos0 = 0;
00336   }
00337 
00338   for (int base=0;base < numComputes;base+=nthread) {
00339     int k = base + threadIdx.x;
00340 
00341     int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
00342 
00343     // Calculate positions in tile list and jtile list
00344     int tilePosVal;
00345     BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
00346 
00347     // Store into global memory
00348     if (k < numComputes) {
00349       tilePos[k] = shTilePos0 + tilePosVal;
00350     }
00351 
00352     BLOCK_SYNC;
00353     // Store block end position
00354     if (threadIdx.x == nthread-1) {
00355       shTilePos0 += tilePosVal + numTiles1;
00356     }
00357   }
00358 }
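
Functionally this is an exclusive prefix sum over the per-compute i-tile counts, carried across nthread-wide chunks through shTilePos0. A serial host equivalent for reference (illustrative only; tilePosHost is not a NAMD function):

// Serial reference: tilePos[k] = sum of numTiles1 over computes before k.
void tilePosHost(int numComputes, const int* numTiles1, int* tilePos) {
  int sum = 0;
  for (int k = 0; k < numComputes; k++) {
    tilePos[k] = sum;
    sum += numTiles1[k];
  }
}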

__device__ __forceinline__ float distsq ( const BoundingBox  a,
const BoundingBox  b 
)

Definition at line 277 of file CudaTileListKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00277                                                                                   {
00278   float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
00279   float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
00280   float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
00281   float r2 = dx*dx + dy*dy + dz*dz;
00282   return r2;
00283 }
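
distsq() returns the squared distance between the closest faces of two axis-aligned bounding boxes, clamping each per-axis gap at zero so that overlapping boxes give 0. A host-side check of the same arithmetic; the Box struct stands in for BoundingBox, which this sketch assumes stores a center (x,y,z) and half-widths (wx,wy,wz):

#include <algorithm>
#include <cmath>
#include <cstdio>

struct Box { float x, y, z, wx, wy, wz; };   // stand-in for BoundingBox

float distsqHost(const Box& a, const Box& b) {
  float dx = std::max(0.0f, std::fabs(a.x - b.x) - a.wx - b.wx);
  float dy = std::max(0.0f, std::fabs(a.y - b.y) - a.wy - b.wy);
  float dz = std::max(0.0f, std::fabs(a.z - b.z) - a.wz - b.wz);
  return dx*dx + dy*dy + dz*dz;
}

int main() {
  Box a = {0, 0, 0, 1, 1, 1};
  Box b = {3, 0, 0, 1, 1, 1};                // gap of 1 along x
  Box c = {1, 0, 0, 1, 1, 1};                // overlaps a
  std::printf("%f\n", distsqHost(a, b));     // prints 1.000000
  std::printf("%f\n", distsqHost(a, c));     // prints 0.000000
}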

__global__ void fillSortKeys ( const int  numComputes,
const int  maxTileListLen,
const int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 197 of file CudaTileListKernel.cu.

References j, and WARPSIZE.

00198                                                                                {
00199 
00200   for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
00201     const int wid = threadIdx.x % WARPSIZE;
00202     int2 minmax = minmaxListLen[i];
00203     int minlen = minmax.x;
00204     int maxlen = minmax.y;
00205     // minlen, maxlen are in range [1 ... maxTileListLen]
00206     // as long as i is in tileListsSrc[].icompute above
00207     if ( maxlen < minlen ) {
00208       minlen = 1;
00209       maxlen = maxTileListLen;
00210     }
00211     unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
00212     unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
00213     unsigned int aveKey = (maxKey + minKey)/2;
00214     for (int j=wid;j < minlen-1;j+=WARPSIZE) {
00215       sortKeys[i*maxTileListLen + j] = minKey;
00216     }
00217     for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
00218       sortKeys[i*maxTileListLen + j] = maxKey;
00219     }
00220     for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
00221       if (sortKeys[i*maxTileListLen + j] == 0) {
00222         sortKeys[i*maxTileListLen + j] = aveKey;
00223       }
00224     }
00225   }
00226 
00227 }

int ilog2 ( int  a  ) 

Definition at line 1156 of file CudaTileListKernel.cu.

01156                  {
01157   // if (a < 0)
01158   //   NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
01159   int k = 1;
01160   while (a >>= 1) k++;
01161   return k;
01162 }
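
Note that for a >= 1 this returns floor(log2(a)) + 1, i.e. the number of significant bits rather than the pure logarithm: ilog2(1) == 1, ilog2(3) == 2, ilog2(4) == 3. A typical use (assumed, not shown on this page) is sizing the bit range of a radix sort:

// Hypothetical usage: key bits needed when depths never exceed maxTileListLen.
int num_bit = ilog2(maxTileListLen);   // e.g. ilog2(32) == 6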

__global__ void initMinMaxListLen ( const int  numComputes,
const int  maxTileListLen,
int2 *__restrict__  minmaxListLen 
)

Definition at line 153 of file CudaTileListKernel.cu.

00154                                     {
00155 
00156   int2 val;
00157   val.x = maxTileListLen+1;
00158   val.y = 0;
00159   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
00160   {
00161     minmaxListLen[i] = val;
00162   }
00163 
00164 }

template<int width>
__global__ void localSort ( const int  n,
const int  begin_bit,
const int  num_bit,
unsigned int *__restrict__  keys,
int *__restrict__  vals 
) [inline]

Definition at line 98 of file CudaTileListKernel.cu.

References BLOCK_SYNC, and tempStorage.

00099                                                            {
00100 
00101   // NOTE: blockDim.x = width
00102 
00103   for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
00104   {
00105     int i = base + threadIdx.x;
00106     typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
00107     __shared__ typename BlockRadixSort::TempStorage tempStorage;
00108     unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
00109     int val[1] = {(i < n) ? vals[i] : 0};
00110     BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
00111     if (i < n) {
00112       keys[i] = key[0];
00113       vals[i] = val[0];
00114     }
00115     BLOCK_SYNC;
00116   }
00117 
00118 }
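
Since the kernel assumes blockDim.x equals the width template parameter (see the NOTE above), a launch must pass the same value in both places. A sketch with an assumed width of 512 (the wrapper name callLocalSort is hypothetical):

// Hypothetical wrapper: width must equal the template parameter.
void callLocalSort(int n, int begin_bit, int num_bit,
                   unsigned int* d_keys, int* d_vals, cudaStream_t stream) {
  const int width = 512;                   // assumed block size
  int nblock = (n - 1) / width + 1;
  localSort<width><<<nblock, width, 0, stream>>>(n, begin_bit, num_bit, d_keys, d_vals);
}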

void NAMD_die ( const char *  err_msg  ) 

Definition at line 83 of file common.C.

00084 {
00085   if ( ! err_msg ) err_msg = "(unknown error)";
00086   CkPrintf("FATAL ERROR: %s\n", err_msg);
00087   fflush(stdout);
00088   char repstr[24] = "";
00089   if (CmiNumPartitions() > 1) {
00090     sprintf(repstr,"REPLICA %d ", CmiMyPartition());
00091   }
00092   CkError("%sFATAL ERROR: %s\n", repstr, err_msg);
00093 #if CHARM_VERSION < 61000
00094   CkExit();
00095 #else
00096   CkExit(1);
00097 #endif
00098 }

__global__ void reOrderTileListDepth ( const int  numTileLists,
const int *__restrict__  tileListOrder,
unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 674 of file CudaTileListKernel.cu.

References j.

00675                                                                                             {
00676 
00677   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
00678   {
00679     int j = tileListOrder[i];
00680     tileListDepthDst[i] = tileListDepthSrc[j];
00681   }
00682 
00683 }

__global__ void repackTileListsKernel ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrder,
const int *__restrict__  jtiles,
const TileList *__restrict__  tileListsSrc,
TileList *__restrict__  tileListsDst,
const PatchPairRecord *__restrict__  patchPairsSrc,
PatchPairRecord *__restrict__  patchPairsDst,
const int *__restrict__  tileJatomStartSrc,
int *__restrict__  tileJatomStartDst,
const TileExcl *__restrict__  tileExclsSrc,
TileExcl *__restrict__  tileExclsDst 
)

Definition at line 543 of file CudaTileListKernel.cu.

References __ldg, TileList::iatomStart, TileList::icompute, j, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, TileList::patchInd, REPACKTILELISTSKERNEL_NUM_WARP, WARP_BALLOT, WARP_FULL_MASK, and WARPSIZE.

00549                                                                                   {
00550 
00551   const int wid = threadIdx.x % WARPSIZE;
00552 
00553   // One warp does one tile list
00554   for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
00555   {
00556     int j = tileListOrder[i];
00557     int start = tileListPos[i];
00558     int end   = tileListPos[i+1]-1;
00559     if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
00560     // TileList
00561     int startOld   = __ldg(&tileListsSrc[j].jtileStart);
00562     int endOld     = __ldg(&tileListsSrc[j].jtileEnd);
00563     int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
00564     float3 offsetXYZ;
00565     offsetXYZ.x  = __ldg(&tileListsSrc[j].offsetXYZ.x);
00566     offsetXYZ.y  = __ldg(&tileListsSrc[j].offsetXYZ.y);
00567     offsetXYZ.z  = __ldg(&tileListsSrc[j].offsetXYZ.z);
00568     int2 patchInd = tileListsSrc[j].patchInd;
00569     int icompute = __ldg(&tileListsSrc[j].icompute);
00570     if (wid == 0) {
00571       TileList tileList;
00572       tileList.iatomStart = iatomStart;
00573       tileList.offsetXYZ  = offsetXYZ;
00574       tileList.jtileStart = start;
00575       tileList.jtileEnd   = end;
00576       tileList.patchInd   = patchInd;
00577       tileList.icompute   = icompute;
00578       tileListsDst[i] = tileList;
00579     }
00580 
00581     if (jtiles == NULL) {
00582       // No jtiles, simple copy will do
00583       int jtile = start;
00584       for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
00585         if (jtileOld + wid <= endOld) {
00586           tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
00587         }
00588       }
00589       if (tileExclsSrc != NULL) {
00590         int jtile = start;
00591         for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
00592           tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
00593         }
00594       }
00595     } else {
00596       int jtile0 = start;
00597       for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
00598         int t = jtileOld + wid;
00599         int jtile = (t <= endOld) ? jtiles[t] : 0;
00600         jtile >>= begin_bit;
00601         jtile &= 65535;
00602         typedef cub::WarpScan<int> WarpScan;
00603         __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
00604         int warpId = threadIdx.x / WARPSIZE;
00605         int jtilePos;
00606         WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
00607 
00608         if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
00609 
00610         if (tileExclsSrc != NULL) {
00611           unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
00612           while (b != 0) {
00613             // k = index of thread that has data
00614             int k = __ffs(b) - 1;
00615             tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
00616             // remove 1 bit and advance jtile0
00617             b ^= ((unsigned int)1 << k);
00618             jtile0++;
00619           }
00620         } else {
00621           jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
00622         }
00623       }
00624     }
00625   }
00626 
00627 }
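
The jtiles branch above is a warp-level stream compaction: each lane's 0/1 flag goes through an exclusive WarpScan to obtain its packed output slot. The same idiom in isolation, using 32-thread blocks for simplicity (the kernel name warpCompactSketch and the array names are illustrative):

#include <cub/cub.cuh>

__global__ void warpCompactSketch(const int* in, const int* flags, int* out, int n) {
  const int lane = threadIdx.x % 32;
  const int i = blockIdx.x * 32 + lane;
  int flag = (i < n) ? flags[i] : 0;

  typedef cub::WarpScan<int> WarpScan;
  __shared__ WarpScan::TempStorage tempStorage;   // one warp per block here

  int pos;
  WarpScan(tempStorage).ExclusiveSum(flag, pos);  // packed slot within the warp
  if (flag) out[blockIdx.x * 32 + pos] = in[i];   // per-warp compacted segment
}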

__global__ void setPatchNumLists_findEmptyPatches ( const int  numTileLists,
TileList *__restrict__  tileLists,
const int *__restrict__  patchNumLists,
const int  numPatches,
int *__restrict__  numEmptyPatches,
int *__restrict__  emptyPatches 
)

Definition at line 48 of file CudaTileListKernel.cu.

00050                                                                                            {
00051 
00052   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
00053   {
00054     int2 patchInd = tileLists[i].patchInd;
00055     int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
00056     tileLists[i].patchNumList = patchNumList;
00057   }
00058 
00059   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
00060   {
00061     if (patchNumLists[i] == 0) {
00062       int ind = atomicAdd(numEmptyPatches, 1);
00063       emptyPatches[ind] = i;
00064     }
00065   }
00066 
00067 }

__global__ void setupSortKey ( const int  numTileLists,
const int  maxTileListLen,
const TileList *__restrict__  tileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
const unsigned int *__restrict__  sortKeys,
unsigned int *__restrict__  sortKey 
)

Definition at line 83 of file CudaTileListKernel.cu.

References itileList.

00085                                                                                                       {
00086 
00087   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
00088   {
00089     int icompute = tileLists[itileList].icompute;
00090     int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
00091     int i = icompute*maxTileListLen + (depth - 1);
00092     sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
00093   }
00094 
00095 }

__global__ void storeInReverse ( const int  numTileListsSrc,
const int  begin_bit,
const int *__restrict__  outputOrder,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrderSrc,
const unsigned int *__restrict__  tileListDepthSrc,
int *__restrict__  tileListOrderDst,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 120 of file CudaTileListKernel.cu.

References j.

00125                                                {
00126 
00127   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
00128   {
00129     int j = outputOrder[numTileListsSrc - i - 1];
00130     if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
00131       int k = tileListPos[i];
00132       tileListDepthDst[k] = tileListDepthSrc[j];
00133       tileListOrderDst[k] = j; //tileListOrderSrc[j];
00134     }
00135   }
00136 }

template<int nthread>
__global__ void updatePatchesKernel ( const int  numComputes,
const int *__restrict__  tilePos,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
TileList *__restrict__  tileLists 
) [inline]

Definition at line 362 of file CudaTileListKernel.cu.

References CudaComputeRecord::offsetXYZ, CudaComputeRecord::patchInd, and WARPSIZE.

00366                                     {
00367 
00368   const int tid = threadIdx.x % nthread;
00369 
00370   // nthread threads takes care of one compute
00371   for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
00372   {
00373     CudaComputeRecord compute = computes[k];
00374     float3 offsetXYZ = compute.offsetXYZ;
00375     int2 patchInd = compute.patchInd;
00376     int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
00377     int itileList0 = tilePos[k];
00378     for (int i=tid;i < numTiles1;i+=nthread) {
00379       tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
00380       tileLists[itileList0 + i].patchInd  = patchInd;
00381       tileLists[itileList0 + i].icompute  = k;
00382     }
00383   }
00384 
00385 }


Variable Documentation

__thread DeviceCUDA * deviceCUDA

Definition at line 18 of file DeviceCUDA.C.


Generated on 12 Jul 2020 for NAMD by  doxygen 1.6.1