CudaTileListKernel.cu File Reference

#include <cuda.h>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"


Defines

#define OVERALLOC   1.2f
#define __ldg   *
#define BOUNDINGBOXKERNEL_NUM_WARP   8
#define TILELISTKERNELNEW_NUM_WARP   4
#define REPACKTILELISTSKERNEL_NUM_WARP   32
#define SORTTILELISTSKERNEL_NUM_THREAD   512
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Typedefs

typedef cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoad
typedef cub::BlockRadixSort< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > BlockRadixSort

Functions

void NAMD_die (const char *)
__global__ void calcPatchNumLists (const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
__global__ void setPatchNumLists_findEmptyPatches (const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
__global__ void buildRemoveZerosSortKey (const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
__global__ void setupSortKey (const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
template<int width>
__global__ void localSort (const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
__global__ void storeInReverse (const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__global__ void initMinMaxListLen (const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
__global__ void buildSortKeys (const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void fillSortKeys (const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void buildBoundingBoxesKernel (const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
__device__ __forceinline__ float distsq (const BoundingBox a, const BoundingBox b)
template<int nthread>
__global__ void calcTileListPosKernel (const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
template<int nthread>
__global__ void updatePatchesKernel (const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread (const int maxTileListLen)
__global__ void buildTileListsBBKernel (const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
__global__ void repackTileListsKernel (const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
template<typename keyT , typename valT , bool ascend>
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel (const int numTileListsSrc, const int numTileListsDst, const int begin_bit, const int end_bit, const keyT oobKey, keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst, valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst)
__global__ void reOrderTileListDepth (const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, unsigned int *__restrict__ tileListDepth)
int ilog2 (int a)

Variables

const int numTileListsSrc
const int numTileListsDst
const int begin_bit
const int end_bit
const keyT oobKey
keyT *__restrict__ tileListDepthSrc
keyT *__restrict__ tileListDepthDst
valT *__restrict__ tileListOrderSrc
valT *__restrict__ tileListOrderDst
typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU
__thread DeviceCUDA * deviceCUDA
union {
   BlockLoad::TempStorage   load
   BlockLoadU::TempStorage   loadU
   BlockRadixSort::TempStorage   sort
} tempStorage
keyT keys [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
valT values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]

Define Documentation

#define __ldg   *
#define BOUNDINGBOXKERNEL_NUM_WARP   8
#define OVERALLOC   1.2f

Definition at line 15 of file CudaTileListKernel.cu.
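
OVERALLOC is the growth factor applied when a device buffer must be resized, so a stream of slightly increasing sizes triggers few reallocations. A minimal sketch of the pattern (the names sizeNeeded, capacity, and reallocate are ours, not the file's):

  // Sketch: grow by OVERALLOC (1.2x) rather than to the exact size needed.
  if (sizeNeeded > capacity) {
    capacity = (int)(sizeNeeded * OVERALLOC);
    reallocate(&buffer, capacity);  // hypothetical device-buffer helper
  }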

#define REPACKTILELISTSKERNEL_NUM_WARP   32

Definition at line 535 of file CudaTileListKernel.cu.

Referenced by repackTileListsKernel().

#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Definition at line 628 of file CudaTileListKernel.cu.

#define SORTTILELISTSKERNEL_NUM_THREAD   512

Definition at line 627 of file CudaTileListKernel.cu.

#define TILELISTKERNELNEW_NUM_WARP   4

Definition at line 312 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().


Typedef Documentation

typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad

Definition at line 640 of file CudaTileListKernel.cu.

typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort

Definition at line 643 of file CudaTileListKernel.cu.
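
These two cub primitives cooperate inside sortTileListsKernel: BlockLoadU/BlockLoad stage keys and values into registers, and BlockRadixSort sorts them block-wide. A minimal standalone sketch of the same pattern, assuming keyT = unsigned int and valT = int (the kernel name blockSortSketch and the striped store loop are ours; the constants are the file's):

  #include <cub/cub.cuh>

  __global__ void blockSortSketch(const int n,
      const unsigned int *__restrict__ keysIn, const int *__restrict__ valsIn,
      unsigned int *__restrict__ keysOut, int *__restrict__ valsOut) {
    enum { NT = SORTTILELISTSKERNEL_NUM_THREAD, IPT = SORTTILELISTSKERNEL_ITEMS_PER_THREAD };
    typedef cub::BlockLoad<unsigned int, NT, IPT, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadK;
    typedef cub::BlockLoad<int, NT, IPT, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadV;
    typedef cub::BlockRadixSort<unsigned int, NT, IPT, int> BlockSort;
    // The stages run strictly one after another, so their scratch space can
    // share one union, as in the file.
    __shared__ union {
      typename BlockLoadK::TempStorage loadK;
      typename BlockLoadV::TempStorage loadV;
      typename BlockSort::TempStorage  sort;
    } tempStorage;
    unsigned int keys[IPT];
    int values[IPT];
    BlockLoadK(tempStorage.loadK).Load(keysIn, keys, n, 0xffffffffu); // out-of-bounds filler
    __syncthreads();
    BlockLoadV(tempStorage.loadV).Load(valsIn, values, n);
    __syncthreads();
    BlockSort(tempStorage.sort).SortBlockedToStriped(keys, values);
    // Striped result: item i of this thread belongs at threadIdx.x + i*NT.
    for (int i = 0; i < IPT; i++) {
      int pos = (int)threadIdx.x + i * NT;
      if (pos < n) { keysOut[pos] = keys[i]; valsOut[pos] = values[i]; }
    }
  }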


Function Documentation

template<typename keyT , typename valT , bool ascend>
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel ( const int  numTileListsSrc,
const int  numTileListsDst,
const int  begin_bit,
const int  end_bit,
const keyT  oobKey,
keyT *__restrict__  tileListDepthSrc,
keyT *__restrict__  tileListDepthDst,
valT *__restrict__  tileListOrderSrc,
valT *__restrict__  tileListOrderDst 
)

Definition at line 631 of file CudaTileListKernel.cu.

The body stages tileListDepthSrc into keys and tileListOrderSrc into values with the BlockLoadU/BlockLoad typedefs, then sorts block-wide:

  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
  BLOCK_SYNC;
  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
  BLOCK_SYNC;
  if (ascend)
    BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
  else
    BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
unsigned int *__restrict__  tileListDepth 
)

Definition at line 682 of file CudaTileListKernel.cu.

References itileList.

00683                                             {
00684 
00685   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
00686   {
00687     unsigned int a = tileListDepth[itileList];
00688     a >>= begin_bit;
00689     a &= 65535;
00690     tileListDepth[itileList] = a;
00691   }
00692 
00693 }

__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  outputOrder,
const unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 135 of file CudaTileListKernel.cu.

References j.

00137                                                {
00138 
00139   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
00140   {
00141     int j = outputOrder[numTileLists - i - 1];
00142     tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
00143   }
00144 
00145 }

__global__ void buildBoundingBoxesKernel ( const int  atomStorageSize,
const float4 *__restrict__  xyzq,
BoundingBox *__restrict__  boundingBoxes 
)

Definition at line 228 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, tempStorage, WARPSIZE, BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00229                                            {
00230 
00231   const int warpId = threadIdx.x / WARPSIZE;
00232   const int wid = threadIdx.x % WARPSIZE;
00233 
00234   // Loop with warp-aligned index to avoid warp-divergence
00235   for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
00236     // Full atom index
00237     const int i = iwarp + wid;
00238     // Bounding box index
00239     const int ibb = i/WARPSIZE;
00240 
00241     float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
00242 
00243     volatile float3 minxyz, maxxyz;
00244 
00245     typedef cub::WarpReduce<float> WarpReduce;
00246     __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
00247     minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
00248     minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
00249     minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
00250     maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
00251     maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
00252     maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
00253 
00254     if (wid == 0) {
00255       BoundingBox boundingBox;
00256       boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
00257       boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
00258       boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
00259       boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
00260       boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
00261       boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
00262       boundingBoxes[ibb] = boundingBox;
00263     }
00264   }
00265 
00266 }
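
Each warp reduces the coordinates of WARPSIZE consecutive atoms into one BoundingBox, and the kernel uses a grid-stride loop, so any grid size works. A hedged host-side launch sketch, sizing one thread per atom (the stream variable is ours):

  // Sketch: BOUNDINGBOXKERNEL_NUM_WARP warps per block; one bounding box
  // is produced per WARPSIZE atoms.
  int nthread = BOUNDINGBOXKERNEL_NUM_WARP * WARPSIZE;
  int nblock  = (atomStorageSize + nthread - 1) / nthread;
  buildBoundingBoxesKernel<<<nblock, nthread, 0, stream>>>(
      atomStorageSize, xyzq, boundingBoxes);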

__global__ void buildRemoveZerosSortKey ( const int  numTileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
unsigned int *__restrict__  sortKey 
)

Definition at line 66 of file CudaTileListKernel.cu.

References itileList.

00067                                                                                                            {
00068 
00069   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
00070   {
00071     int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
00072     sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
00073   }
00074 
00075 }
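
A concrete trace (ours, for illustration): zero-depth lists receive key numTileLists, which sorts them past every surviving list.

  // numTileLists = 4, begin_bit = 0, tileListDepth = {2, 0, 5, 1}
  // => sortKey = {0, 4, 2, 3}
  // An ascending sort by sortKey leaves lists 0, 2, 3 first and the
  // zero-depth list 1 last, where it can be trimmed off.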

__global__ void buildSortKeys ( const int  numTileListsDst,
const int  maxTileListLen,
const TileList *__restrict__  tileListsSrc,
const int *__restrict__  tileListOrderDst,
const unsigned int *__restrict__  tileListDepthDst,
int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 163 of file CudaTileListKernel.cu.

References j.

00167                                                                          {
00168 
00169   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
00170   {
00171     int k = tileListOrderDst[i];
00172     int icompute = tileListsSrc[k].icompute;
00173     int depth    = tileListDepthDst[i] & 65535;
00174     // depth is in range [1 ... maxTileListLen]
00175     int j        = icompute*maxTileListLen + (depth-1);
00176     sortKeys[j] = i;
00177     int2 minmax = minmaxListLen[icompute];
00178     int2 minmaxOrig = minmax;
00179     if (minmax.x > depth) minmax.x = depth;
00180     if (minmax.y < depth) minmax.y = depth;
00181     if (minmax.x != minmaxOrig.x) {
00182       atomicMin(&minmaxListLen[icompute].x, minmax.x);
00183     }
00184     if (minmax.y != minmaxOrig.y) {
00185       atomicMax(&minmaxListLen[icompute].y, minmax.y);
00186     }
00187   }
00188 
00189 }

__global__ void buildTileListsBBKernel ( const int  numTileLists,
TileList *__restrict__  tileLists,
const CudaPatchRecord *__restrict__  patches,
const int *__restrict__  tileListPos,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const int  maxTileListLen,
const BoundingBox *__restrict__  boundingBoxes,
int *__restrict__  tileJatomStart,
const int  tileJatomStartSize,
unsigned int *__restrict__  tileListDepth,
int *__restrict__  tileListOrder,
PatchPairRecord *__restrict__  patchPairs,
TileListStat *__restrict__  tileListStat 
)

Definition at line 391 of file CudaTileListKernel.cu.

References CudaPatchRecord::atomStart, buildTileListsBBKernel_shmem_sizePerThread(), distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, TileList::iatomStart, TileList::icompute, itileList, j, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, CudaPatchRecord::numAtoms, CudaPatchRecord::numFreeAtoms, TileList::offsetXYZ, TileList::patchInd, WARP_FULL_MASK, and WARPSIZE.

00403                                            {
00404 
00405   extern __shared__ char sh_buffer[];
00406   int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
00407   int pos = threadIdx.x*sizePerThread;
00408   volatile char* sh_tile = (char*)&sh_buffer[pos];
00409 
00410   // Loop with warp-aligned index to avoid warp-divergence
00411   for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
00412 
00413     // Use one thread per tile list
00414     const int wid = threadIdx.x % WARPSIZE;
00415     const int itileList = iwarp + wid;
00416 
00417     int i;
00418     int itileListLen = 0;
00419     CudaPatchRecord patch1;
00420     CudaPatchRecord patch2;
00421     float3 offsetXYZ;
00422     int2 patchInd;
00423     int numTiles2;
00424     int icompute;
00425 
00426     if (itileList < numTileLists) {
00427       offsetXYZ = tileLists[itileList].offsetXYZ;
00428       patchInd  = tileLists[itileList].patchInd;
00429       icompute  = tileLists[itileList].icompute;
00430       // Get i-column
00431       i = itileList - tileListPos[icompute];
00432 
00433       float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
00434       float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
00435       float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
00436 
00437       // DH - set zeroShift flag if magnitude of shift vector is zero
00438       bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
00439 
00440       // Load patches
00441       patch1 = patches[patchInd.x];
00442       patch2 = patches[patchInd.y];
00443       // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
00444       numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
00445       int tileStart1 = patch1.atomStart/WARPSIZE;
00446       int tileStart2 = patch2.atomStart/WARPSIZE;
00447 
00448       // DH - self requires that zeroShift is also set
00449       bool self = zeroShift && (tileStart1 == tileStart2);
00450 
00451       // Load i-atom data (and shift coordinates)
00452       BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
00453       boundingBoxI.x += shx;
00454       boundingBoxI.y += shy;
00455       boundingBoxI.z += shz;
00456 
00457       for (int j=0;j < numTiles2;j++) {
00458         sh_tile[j] = 0;
00459         if (!self || j >= i) {
00460           BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
00461           float r2bb = distsq(boundingBoxI, boundingBoxJ);
00462           if (r2bb < cutoff2) {
00463             sh_tile[j] = 1;
00464             itileListLen++;
00465           }
00466         }
00467       }
00468 
00469       tileListDepth[itileList] = (unsigned int)itileListLen;
00470       tileListOrder[itileList] = itileList;
00471     }
00472 
00473     typedef cub::WarpScan<int> WarpScan;
00474     __shared__ typename WarpScan::TempStorage tempStorage;
00475     int active = (itileListLen > 0);
00476     int activePos;
00477     WarpScan(tempStorage).ExclusiveSum(active, activePos);
00478     int itileListPos;
00479     WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
00480 
00481     int jtileStart, numJtiles;
00482     // Last thread in the warp knows the total number
00483     if (wid == WARPSIZE-1) {
00484       atomicAdd(&tileListStat->numTileLists, activePos + active);
00485       numJtiles = itileListPos + itileListLen;
00486       jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
00487     }
00488     numJtiles  = cub::ShuffleIndex(numJtiles,  WARPSIZE-1, WARPSIZE, WARP_FULL_MASK);
00489     jtileStart = cub::ShuffleIndex(jtileStart, WARPSIZE-1, WARPSIZE, WARP_FULL_MASK);
00490     if (jtileStart + numJtiles > tileJatomStartSize) {
00491       // tileJatomStart out of memory, exit
00492       if (wid == 0) tileListStat->tilesSizeExceeded = true;
00493       return;
00494     }
00495 
00496     int jStart = itileListPos;
00497     int jEnd   = cub::ShuffleDown(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
00498     if (wid == WARPSIZE-1) jEnd = numJtiles;
00499 
00500     if (itileListLen > 0) {
00501       // Setup tileLists[]
00502       TileList TLtmp;
00503       TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
00504       TLtmp.jtileStart = jtileStart + jStart;
00505       TLtmp.jtileEnd   = jtileStart + jEnd - 1;
00506       TLtmp.patchInd   = patchInd;
00507       TLtmp.offsetXYZ  = offsetXYZ;
00508       TLtmp.icompute   = icompute;
00509       // TLtmp.patchNumList.x = 0;
00510       // TLtmp.patchNumList.y = 0;
00511       tileLists[itileList] = TLtmp;
00512       // PatchPair
00513       PatchPairRecord patchPair;
00514       patchPair.iatomSize     = patch1.atomStart + patch1.numAtoms;
00515       patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
00516       patchPair.jatomSize     = patch2.atomStart + patch2.numAtoms;
00517       patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
00518       patchPairs[itileList] = patchPair;
00519 
00520       // Write tiles
00521       int jtile = jtileStart + jStart;
00522       for (int j=0;j < numTiles2;j++) {
00523         if (sh_tile[j]) {
00524           tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
00525           jtile++;
00526         }
00527       }
00528 
00529     }
00530 
00531   }
00532 
00533 }

__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread ( const int  maxTileListLen  ) 

Definition at line 382 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists(), and buildTileListsBBKernel().

00382                                                                          {
00383   // Size in bytes
00384   int size = (
00385     maxTileListLen*sizeof(char)
00386     );
00387   return size;
00388 }
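
Because the helper is __host__ __device__, the host can use it to size the dynamic shared-memory allocation when launching buildTileListsBBKernel. A sketch under assumed launch parameters (TILELISTKERNELNEW_NUM_WARP warps per block, matching CudaTileListKernel::buildTileLists, which references both functions; the stream variable is ours):

  // Sketch: dynamic shared memory = bytes per thread * threads per block.
  int nthread = TILELISTKERNELNEW_NUM_WARP * WARPSIZE;
  int nblock  = (numTileLists + nthread - 1) / nthread;
  int shmem   = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen) * nthread;
  buildTileListsBBKernel<<<nblock, nthread, shmem, stream>>>(
      numTileLists, tileLists, patches, tileListPos, lata, latb, latc,
      cutoff2, maxTileListLen, boundingBoxes, tileJatomStart,
      tileJatomStartSize, tileListDepth, tileListOrder, patchPairs,
      tileListStat);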

__global__ void calcPatchNumLists ( const int  numTileLists,
const int  numPatches,
const TileList *__restrict__  tileLists,
int *__restrict__  patchNumLists 
)

Definition at line 26 of file CudaTileListKernel.cu.

00027                                                                            {
00028 
00029   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
00030   {
00031     int2 patchInd = tileLists[i].patchInd;
00032     atomicAdd(&patchNumLists[patchInd.x], 1);
00033     if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
00034   }
00035 
00036 }

template<int nthread>
__global__ void calcTileListPosKernel ( const int  numComputes,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
int *__restrict__  tilePos 
) [inline]

Definition at line 318 of file CudaTileListKernel.cu.

References BLOCK_SYNC, and WARPSIZE.

00321                              {
00322 
00323   typedef cub::BlockScan<int, nthread> BlockScan;
00324 
00325   __shared__ typename BlockScan::TempStorage tempStorage;
00326   __shared__ int shTilePos0;
00327 
00328   if (threadIdx.x == nthread-1) {
00329     shTilePos0 = 0;
00330   }
00331 
00332   for (int base=0;base < numComputes;base+=nthread) {
00333     int k = base + threadIdx.x;
00334 
00335     int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
00336 
00337     // Calculate positions in tile list and jtile list
00338     int tilePosVal;
00339     BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
00340 
00341     // Store into global memory
00342     if (k < numComputes) {
00343       tilePos[k] = shTilePos0 + tilePosVal;
00344     }
00345 
00346     BLOCK_SYNC;
00347     // Store block end position
00348     if (threadIdx.x == nthread-1) {
00349       shTilePos0 += tilePosVal + numTiles1;
00350     }
00351   }
00352 }
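
The kernel is an exclusive prefix sum over the number of i-tiles each compute contributes, carried across nthread-sized chunks through shTilePos0. A serial host equivalent, as a sketch for clarity only:

  // Serial equivalent of calcTileListPosKernel: tilePos[k] is the exclusive
  // prefix sum of numTiles1 over computes 0..k-1.
  int pos = 0;
  for (int k = 0; k < numComputes; k++) {
    tilePos[k] = pos;
    pos += (patches[computes[k].patchInd.x].numAtoms - 1) / WARPSIZE + 1;
  }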

__device__ __forceinline__ float distsq ( const BoundingBox  a,
const BoundingBox  b 
)

Definition at line 271 of file CudaTileListKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00271                                                                                   {
00272   float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
00273   float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
00274   float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
00275   float r2 = dx*dx + dy*dy + dz*dz;
00276   return r2;
00277 }
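
distsq is the squared distance between the closest points of the two boxes, zero when they touch or overlap, which is why buildTileListsBBKernel compares it directly against cutoff2. A worked case (assuming the member order x, y, z, wx, wy, wz):

  // Unit-half-width boxes centered 3 apart on x:
  //   dx = max(0, |0 - 3| - 1 - 1) = 1, dy = dz = 0  =>  distsq = 1
  BoundingBox a = {0.f, 0.f, 0.f, 1.f, 1.f, 1.f};
  BoundingBox b = {3.f, 0.f, 0.f, 1.f, 1.f, 1.f};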

__global__ void fillSortKeys ( const int  numComputes,
const int  maxTileListLen,
const int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 191 of file CudaTileListKernel.cu.

References j, and WARPSIZE.

00192                                                                                {
00193 
00194   for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
00195     const int wid = threadIdx.x % WARPSIZE;
00196     int2 minmax = minmaxListLen[i];
00197     int minlen = minmax.x;
00198     int maxlen = minmax.y;
00199     // minlen, maxlen are in range [1 ... maxTileListLen]
00200     // as long as i is in tileListsSrc[].icompute above
00201     if ( maxlen < minlen ) {
00202       minlen = 1;
00203       maxlen = maxTileListLen;
00204     }
00205     unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
00206     unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
00207     unsigned int aveKey = (maxKey + minKey)/2;
00208     for (int j=wid;j < minlen-1;j+=WARPSIZE) {
00209       sortKeys[i*maxTileListLen + j] = minKey;
00210     }
00211     for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
00212       sortKeys[i*maxTileListLen + j] = maxKey;
00213     }
00214     for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
00215       if (sortKeys[i*maxTileListLen + j] == 0) {
00216         sortKeys[i*maxTileListLen + j] = aveKey;
00217       }
00218     }
00219   }
00220 
00221 }

int ilog2 ( int  a  ) 

Definition at line 1150 of file CudaTileListKernel.cu.

01150                  {
01151   // if (a < 0)
01152   //   NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
01153   int k = 1;
01154   while (a >>= 1) k++;
01155   return k;
01156 }
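
Note that this returns the bit width of a, i.e. floor(log2 a) + 1 for a >= 1, not floor(log2 a):

  // ilog2(1) == 1, ilog2(2) == 2, ilog2(16) == 5, ilog2(65535) == 16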

__global__ void initMinMaxListLen ( const int  numComputes,
const int  maxTileListLen,
int2 *__restrict__  minmaxListLen 
)

Definition at line 147 of file CudaTileListKernel.cu.

00148                                     {
00149 
00150   int2 val;
00151   val.x = maxTileListLen+1;
00152   val.y = 0;
00153   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
00154   {
00155     minmaxListLen[i] = val;
00156   }
00157 
00158 }

template<int width>
__global__ void localSort ( const int  n,
const int  begin_bit,
const int  num_bit,
unsigned int *__restrict__  keys,
int *__restrict__  vals 
) [inline]

Definition at line 92 of file CudaTileListKernel.cu.

References BLOCK_SYNC, and tempStorage.

00093                                                            {
00094 
00095   // NOTE: blockDim.x = width
00096 
00097   for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
00098   {
00099     int i = base + threadIdx.x;
00100     typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
00101     __shared__ typename BlockRadixSort::TempStorage tempStorage;
00102     unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
00103     int val[1] = {(i < n) ? vals[i] : 0};
00104     BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
00105     if (i < n) {
00106       keys[i] = key[0];
00107       vals[i] = val[0];
00108     }
00109     BLOCK_SYNC;
00110   }
00111 
00112 }

void NAMD_die ( const char *   ) 

Definition at line 81 of file common.C.

Referenced by Controller::adaptTempInit(), ConfigList::add_element(), ComputeMsmMgr::addPotential(), after_backend_init(), Controller::algorithm(), AnisoElem::AnisoElem(), msm::PatchData::anterpolation(), msm::PatchData::anterpolationC1Hermite(), Parameters::assign_angle_index(), Parameters::assign_bond_index(), Parameters::assign_crossterm_index(), Parameters::assign_dihedral_index(), Parameters::assign_improper_index(), Parameters::assign_vdw_index(), WorkDistrib::assignNodeToPatch(), ComputeNonbondedCUDA::build_exclusions(), Molecule::build_go_arrays(), Molecule::build_go_params(), Molecule::build_go_sigmas(), Molecule::build_go_sigmas2(), Molecule::build_gridforce_params(), buildBondData(), HomePatch::buildRattleList(), ReductionMgr::buildSpanTree(), CudaTileListKernel::buildTileLists(), ParallelIOMgr::calcAtomsInEachPatch(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), GlobalMasterTest::calculate(), GlobalMasterTcl::calculate(), GlobalMaster::calculate(), ComputeQMMgr::calcUSR(), GlobalMaster::check(), ComputeMsmMgr::compute(), PmeRealSpace::compute_forces(), AngleElem::computeForce(), ComputeMgr::ComputeMgr(), ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeTclBC::ComputeTclBC(), ConfigList::ConfigList(), NamdState::configListInit(), ConfigList::ConfigListNode::ConfigListNode(), Sequencer::correctMomentum(), Controller::correctMomentum(), WorkDistrib::createAtomLists(), cuda_errcheck(), cuda_getargs(), cuda_init_bspline_coeffs(), CudaComputeNonbonded::CudaComputeNonbonded(), cudaDie(), ComputeMsmMgr::d_stencil_1d(), Molecule::delete_qm_bonded(), ComputeNonbondedSelf::doForce(), ComputeGridForce::doForce(), HomePatch::doMarginCheck(), Parameters::done_reading_files(), ComputeTclBC::doWork(), ComputeFullDirect::doWork(), Node::earlyExit(), ARestraint::EarlyExit(), GlobalMasterMisc::easy_calc(), msm::Grid< BlockDiagram >::elem(), msm::GridFixed< T, N >::elem(), msm::Array< BlockDiagram >::elem(), ScriptTcl::eval(), colvarproxy_namd::fatal_error(), PmeXPencil::fft_init(), PmeYPencil::fft_init(), PmeZPencil::fft_init(), PmeRealSpace::fill_charges(), parm::get(), Molecule::get_atom_from_index_in_residue(), Molecule::get_atom_from_name(), Molecule::get_atomtype(), Parameters::get_dihedral_params(), Molecule::get_fep_bonded_type(), Parameters::get_improper_params(), Molecule::get_residue_size(), Parameters::get_vdw_params(), TholeElem::getMoleculePointers(), ExclElem::getMoleculePointers(), ImproperElem::getMoleculePointers(), GromacsPairElem::getMoleculePointers(), DihedralElem::getMoleculePointers(), CrosstermElem::getMoleculePointers(), BondElem::getMoleculePointers(), AnisoElem::getMoleculePointers(), AngleElem::getMoleculePointers(), CudaComputeNonbondedKernel::getPatchReadyQueue(), TholeElem::getTupleInfo(), AnisoElem::getTupleInfo(), GlobalMasterIMD::GlobalMasterIMD(), GlobalMasterSymmetry::GlobalMasterSymmetry(), GromacsTopFile::GromacsTopFile(), BackEnd::init(), LdbCoordinator::initialize(), GridforceLiteGrid::initialize(), GridforceFullSubGrid::initialize(), GridforceFullMainGrid::initialize(), DeviceCUDA::initialize(), ComputePmeMgr::initialize(), ComputeMsmMgr::initialize(), SimParameters::initialize_config_data(), msm::PatchData::interpolation(), msm::PatchData::interpolationC1Hermite(), ScriptTcl::load(), NamdState::loadStructure(), ludcmp(), main::main(), Node::mallocTest(), WorkDistrib::mapComputes(), HomePatch::minimize_rattle2(), Molecule::Molecule(), HomePatch::mollyAverage(), HomePatch::mollyMollify(), NAMD_new_handler(), NAMD_read_int(), NAMD_read_line(), 
NAMD_seek(), NAMD_write(), ComputeMsmMgr::ndsplitting(), CudaComputeNonbondedKernel::nonbondedForce(), Vector::operator[](), Node::outputPatchComputeMaps(), MGridforceParamsList::pack_data(), WorkDistrib::patchMapInit(), PDB::PDB(), PDBUnknown::PDBUnknown(), parm::preadln(), Molecule::prepare_qm(), ProblemParsing(), GlobalMaster::processData(), MsmC1HermiteBlockProxyMsg::put(), MsmBlockProxyMsg::put(), HomePatch::rattle1old(), HomePatch::rattle2(), read_binary_file(), Parameters::read_charmm_parameter_file(), Parameters::read_ener_table(), Parameters::read_energy_type(), Parameters::read_energy_type_bothcubspline(), Parameters::read_energy_type_cubspline(), Molecule::read_go_file(), Parameters::read_parameter_file(), Parameters::read_parm(), SimParameters::readExtendedSystem(), GridforceFullBaseGrid::readSubgridHierarchy(), RecBisection::RecBisection(), Molecule::receive_GoMolecule(), Parameters::receive_Parameters(), CollectionMaster::receiveDataStream(), ParallelIOMgr::recvAtomsCntPerPatch(), HomePatch::recvCheckpointReq(), Controller::recvCheckpointReq(), ComputeMgr::recvComputeDPMEData(), ComputeMgr::recvComputeDPMEResults(), ComputeMgr::recvComputeEwaldData(), ComputeMgr::recvComputeEwaldResults(), ComputeMgr::recvComputeGlobalData(), ComputeMgr::recvComputeGlobalResults(), ComputeMsmSerialMgr::recvCoord(), ComputeFmmSerialMgr::recvCoord(), ComputeExtMgr::recvCoord(), ComputeQMMgr::recvPartQM(), ComputeQMMgr::recvPntChrg(), Output::recvReplicaDcdData(), Node::reloadCharges(), Node::reloadGridforceGrid(), ReductionMgr::remoteRegister(), ReductionMgr::remoteUnregister(), ReductionSet::removeData(), Sequencer::rescaleaccelMD(), Controller::rescaleaccelMD(), CudaTileListKernel::reSortTileLists(), ScriptTcl::run(), ComputeMsm::saveResults(), SimParameters::scriptSet(), ComputeNonbondedUtil::select(), Molecule::send_GoMolecule(), Parameters::send_Parameters(), ComputeMgr::sendComputeDPMEData(), ComputeMgr::sendComputeEwaldData(), StringList::set(), msm::Array< T >::setmax(), CudaTileListKernel::setTileListVirialEnergyGBISLength(), CudaTileListKernel::setTileListVirialEnergyLength(), ComputeMsmMgr::setup_hgrid_1d(), ComputeMsmMgr::setup_periodic_blocksize(), SimParameters::setupIDWS(), PatchMap::sizeGrid(), ComputeMsmMgr::splitting(), ComputeMsmMgr::stencil_1d(), NamdCentLB::Strategy(), StringList::StringList(), ScriptTcl::tclmain(), TholeElem::TholeElem(), Node::updateGridScale(), ReductionMgr::willRequire(), and ReductionMgr::willSubmit().

00083 {
00084    if ( ! err_msg ) err_msg = "(unknown error)";
00085    char *new_err_msg = new char[strlen(err_msg) + 40];
00086    sprintf(new_err_msg,"FATAL ERROR: %s\n",err_msg);
00087    CkPrintf(new_err_msg);
00088    fflush(stdout);
00089    if ( CmiNumPartitions() > 1 ) {
00090      sprintf(new_err_msg,"REPLICA %d FATAL ERROR: %s\n", CmiMyPartition(), err_msg);
00091    }
00092    CmiAbort(new_err_msg);
00093    delete [] new_err_msg;
00094 }

__global__ void reOrderTileListDepth ( const int  numTileLists,
const int *__restrict__  tileListOrder,
unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 668 of file CudaTileListKernel.cu.

References j.

00669                                                                                             {
00670 
00671   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
00672   {
00673     int j = tileListOrder[i];
00674     tileListDepthDst[i] = tileListDepthSrc[j];
00675   }
00676 
00677 }

__global__ void repackTileListsKernel ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrder,
const int *__restrict__  jtiles,
const TileList *__restrict__  tileListsSrc,
TileList *__restrict__  tileListsDst,
const PatchPairRecord *__restrict__  patchPairsSrc,
PatchPairRecord *__restrict__  patchPairsDst,
const int *__restrict__  tileJatomStartSrc,
int *__restrict__  tileJatomStartDst,
const TileExcl *__restrict__  tileExclsSrc,
TileExcl *__restrict__  tileExclsDst 
)

Definition at line 537 of file CudaTileListKernel.cu.

References __ldg, TileList::iatomStart, TileList::icompute, j, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, TileList::patchInd, REPACKTILELISTSKERNEL_NUM_WARP, WARP_BALLOT, WARP_FULL_MASK, and WARPSIZE.

00543                                                                                   {
00544 
00545   const int wid = threadIdx.x % WARPSIZE;
00546 
00547   // One warp does one tile list
00548   for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
00549   {
00550     int j = tileListOrder[i];
00551     int start = tileListPos[i];
00552     int end   = tileListPos[i+1]-1;
00553     if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
00554     // TileList
00555     int startOld   = __ldg(&tileListsSrc[j].jtileStart);
00556     int endOld     = __ldg(&tileListsSrc[j].jtileEnd);
00557     int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
00558     float3 offsetXYZ;
00559     offsetXYZ.x  = __ldg(&tileListsSrc[j].offsetXYZ.x);
00560     offsetXYZ.y  = __ldg(&tileListsSrc[j].offsetXYZ.y);
00561     offsetXYZ.z  = __ldg(&tileListsSrc[j].offsetXYZ.z);
00562     int2 patchInd = tileListsSrc[j].patchInd;
00563     int icompute = __ldg(&tileListsSrc[j].icompute);
00564     if (wid == 0) {
00565       TileList tileList;
00566       tileList.iatomStart = iatomStart;
00567       tileList.offsetXYZ  = offsetXYZ;
00568       tileList.jtileStart = start;
00569       tileList.jtileEnd   = end;
00570       tileList.patchInd   = patchInd;
00571       tileList.icompute   = icompute;
00572       tileListsDst[i] = tileList;
00573     }
00574 
00575     if (jtiles == NULL) {
00576       // No jtiles, simple copy will do
00577       int jtile = start;
00578       for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
00579         if (jtileOld + wid <= endOld) {
00580           tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
00581         }
00582       }
00583       if (tileExclsSrc != NULL) {
00584         int jtile = start;
00585         for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
00586           tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
00587         }
00588       }
00589     } else {
00590       int jtile0 = start;
00591       for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
00592         int t = jtileOld + wid;
00593         int jtile = (t <= endOld) ? jtiles[t] : 0;
00594         jtile >>= begin_bit;
00595         jtile &= 65535;
00596         typedef cub::WarpScan<int> WarpScan;
00597         __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
00598         int warpId = threadIdx.x / WARPSIZE;
00599         int jtilePos;
00600         WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
00601 
00602         if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
00603 
00604         if (tileExclsSrc != NULL) {
00605           unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
00606           while (b != 0) {
00607             // k = index of thread that has data
00608             int k = __ffs(b) - 1;
00609             tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
00610             // remove 1 bit and advance jtile0
00611             b ^= ((unsigned int)1 << k);
00612             jtile0++;
00613           }
00614         } else {
00615           jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
00616         }
00617       }
00618     }
00619   }
00620 
00621 }
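
The jtiles path is a warp-level stream compaction: each lane flags whether its tile survived, a WarpScan turns the flags into output slots, and WARP_BALLOT/__popc advance the output cursor. The same pattern in isolation, as a sketch using plain CUDA intrinsics instead of the file's WARP_BALLOT macro (the function warpCompact and its parameters are ours):

  // Sketch: warp-level stream compaction. Each of the 32 lanes holds 'keep'
  // (0/1) and 'value'; survivors are packed into consecutive output slots
  // starting at 'outBase'. Returns the advanced cursor.
  __device__ int warpCompact(int keep, int value, int outBase, int *out) {
    unsigned int ballot = __ballot_sync(0xffffffffu, keep);
    int lane = threadIdx.x % 32;
    // Exclusive count of earlier survivors in this warp.
    int pos = __popc(ballot & ((1u << lane) - 1));
    if (keep) out[outBase + pos] = value;
    return outBase + __popc(ballot);
  }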

__global__ void setPatchNumLists_findEmptyPatches ( const int  numTileLists,
TileList *__restrict__  tileLists,
const int *__restrict__  patchNumLists,
const int  numPatches,
int *__restrict__  numEmptyPatches,
int *__restrict__  emptyPatches 
)

Definition at line 42 of file CudaTileListKernel.cu.

00044                                                                                            {
00045 
00046   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
00047   {
00048     int2 patchInd = tileLists[i].patchInd;
00049     int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
00050     tileLists[i].patchNumList = patchNumList;
00051   }
00052 
00053   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
00054   {
00055     if (patchNumLists[i] == 0) {
00056       int ind = atomicAdd(numEmptyPatches, 1);
00057       emptyPatches[ind] = i;
00058     }
00059   }
00060 
00061 }

__global__ void setupSortKey ( const int  numTileLists,
const int  maxTileListLen,
const TileList *__restrict__  tileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
const unsigned int *__restrict__  sortKeys,
unsigned int *__restrict__  sortKey 
)

Definition at line 77 of file CudaTileListKernel.cu.

References itileList.

00079                                                                                                       {
00080 
00081   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
00082   {
00083     int icompute = tileLists[itileList].icompute;
00084     int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
00085     int i = icompute*maxTileListLen + (depth - 1);
00086     sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
00087   }
00088 
00089 }

__global__ void storeInReverse ( const int  numTileListsSrc,
const int  begin_bit,
const int *__restrict__  outputOrder,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrderSrc,
const unsigned int *__restrict__  tileListDepthSrc,
int *__restrict__  tileListOrderDst,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 114 of file CudaTileListKernel.cu.

References j.

00119                                                {
00120 
00121   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
00122   {
00123     int j = outputOrder[numTileListsSrc - i - 1];
00124     if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
00125       int k = tileListPos[i];
00126       tileListDepthDst[k] = tileListDepthSrc[j];
00127       tileListOrderDst[k] = j; //tileListOrderSrc[j];
00128     }
00129   }
00130 }

template<int nthread>
__global__ void updatePatchesKernel ( const int  numComputes,
const int *__restrict__  tilePos,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
TileList *__restrict__  tileLists 
) [inline]

Definition at line 356 of file CudaTileListKernel.cu.

References CudaComputeRecord::offsetXYZ, CudaComputeRecord::patchInd, and WARPSIZE.

00360                                     {
00361 
00362   const int tid = threadIdx.x % nthread;
00363 
00364   // nthread threads takes care of one compute
00365   for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
00366   {
00367     CudaComputeRecord compute = computes[k];
00368     float3 offsetXYZ = compute.offsetXYZ;
00369     int2 patchInd = compute.patchInd;
00370     int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
00371     int itileList0 = tilePos[k];
00372     for (int i=tid;i < numTiles1;i+=nthread) {
00373       tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
00374       tileLists[itileList0 + i].patchInd  = patchInd;
00375       tileLists[itileList0 + i].icompute  = k;
00376     }
00377   }
00378 
00379 }


Variable Documentation

const int begin_bit

Definition at line 631 of file CudaTileListKernel.cu.

Definition at line 655 of file CudaTileListKernel.cu.

typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU

Definition at line 631 of file CudaTileListKernel.cu.

__thread DeviceCUDA * deviceCUDA

Definition at line 18 of file DeviceCUDA.C.

const int end_bit

Definition at line 631 of file CudaTileListKernel.cu.

Definition at line 651 of file CudaTileListKernel.cu.

BlockLoad::TempStorage load
BlockLoadU::TempStorage loadU

Definition at line 647 of file CudaTileListKernel.cu.

const int numTileListsDst

Definition at line 631 of file CudaTileListKernel.cu.

Definition at line 654 of file CudaTileListKernel.cu.

const int numTileListsSrc

Definition at line 631 of file CudaTileListKernel.cu.

BlockRadixSort::TempStorage sort
__shared__ { ... } tempStorage

keyT *__restrict__ tileListDepthDst

Definition at line 631 of file CudaTileListKernel.cu.

keyT *__restrict__ tileListDepthSrc

Definition at line 631 of file CudaTileListKernel.cu.

valT *__restrict__ tileListOrderSrc

Definition at line 631 of file CudaTileListKernel.cu.

valT values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]

Definition at line 652 of file CudaTileListKernel.cu.

Referenced by Controller::printEnergies(), and Parameters::read_parm().


Generated on 15 Sep 2019 for NAMD by  doxygen 1.6.1