CudaComputeNonbondedKernel.cu File Reference

#include <cuda.h>
#include <cub/cub.cuh>
#include "CudaComputeNonbondedKernel.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"

Go to the source code of this file.

Defines

#define OVERALLOC   1.2f
#define MAX_CONST_EXCLUSIONS   2048
#define NONBONDKERNEL_NUM_WARP   4
#define LARGE_FLOAT   (float)(1.0e10)
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP   32
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP   32
#define REDUCEGBISENERGYKERNEL_NUM_WARP   32
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING)

Functions

void NAMD_die (const char *)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcForceEnergy (const float r2, const float qi, const float qj, const float dx, const float dy, const float dz, const int vdwtypei, const int vdwtypej, const float2 *__restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, float3 &iforce, float3 &iforceSlow, float3 &jforce, float3 &jforceSlow, float &energyVdw, float &energyElec, float &energySlow)
template<bool doSlow>
__device__ __forceinline__ void storeForces (const int pos, const float3 force, const float3 forceSlow, float4 *__restrict__ devForces, float4 *__restrict__ devForcesSlow)
template<bool doSlow>
__device__ __forceinline__ void storeForces (const int pos, const float3 force, const float3 forceSlow, float3 *__restrict__ forces, float3 *__restrict__ forcesSlow)
template<bool doPairlist>
__device__ __forceinline__ void shuffleNext (float &xyzq_j_w, int &vdwtypej, int &jatomIndex, int &jexclMaxdiff, int &jexclIndex)
template<bool doPairlist>
__device__ __forceinline__ void shuffleNext (float &xyzq_j_w, int &vdwtypej, int &jatomIndex)
template<bool doSlow>
__device__ __forceinline__ void shuffleNext (float3 &jforce, float3 &jforceSlow)
__device__ __forceinline__ float distsq (const BoundingBox a, const float4 b)
template<bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
__global__ void __launch_bounds__ (WARPSIZE*NONBONDKERNEL_NUM_WARP, doPairlist ? (10) : (doEnergy ? (10) : (10))) nonbondedForceKernel (const int start, ...) (full parameter list under Variables below)
 if (itileList < numTileLists)
__global__ void reduceNonbondedVirialKernel (const bool doSlow, const int atomStorageSize, const float4 *__restrict__ xyzq, const float4 *__restrict__ devForces, const float4 *__restrict__ devForcesSlow, VirialEnergy *__restrict__ virialEnergy)
__global__ void reduceVirialEnergyKernel (const bool doEnergy, const bool doVirial, const bool doSlow, const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
__global__ void reduceGBISEnergyKernel (const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)

Variables

__thread DeviceCUDA *deviceCUDA
__constant__ unsigned int constExclusions [MAX_CONST_EXCLUSIONS]
nonbondedForceKernel kernel parameters:

const int numTileLists
const TileList *__restrict__ tileLists
TileExcl *__restrict__ tileExcls
const int *__restrict__ tileJatomStart
const int vdwCoefTableWidth
const float2 *__restrict__ vdwCoefTable
const int *__restrict__ vdwTypes
const float3 lata
const float3 latb
const float3 latc
const float4 *__restrict__ xyzq
const float cutoff2
cudaTextureObject_t vdwCoefTableTex
cudaTextureObject_t forceTableTex
cudaTextureObject_t energyTableTex
const int atomStorageSize
const float plcutoff2
const PatchPairRecord *__restrict__ patchPairs
const int *__restrict__ atomIndex
const int2 *__restrict__ exclIndexMaxDiff
const unsigned int *__restrict__ overflowExclusions
unsigned int *__restrict__ tileListDepth
int *__restrict__ tileListOrder
int *__restrict__ jtiles
TileListStat *__restrict__ tileListStat
const BoundingBox *__restrict__ boundingBoxes
float4 *__restrict__ devForces
float4 *__restrict__ devForcesSlow
const int numPatches
unsigned int *__restrict__ patchNumCount
const CudaPatchRecord *__restrict__ cudaPatches
float4 *__restrict__ mapForces
float4 *__restrict__ mapForcesSlow
int *__restrict__ mapPatchReadyQueue
int *__restrict__ outputOrder
TileListVirialEnergy *__restrict__ virialEnergy

int itileList = start + threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x
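
The itileList expression assigns each warp one tile list: warp w of block b handles list start + w + (warps per block)*b. A host-side sketch of the implied launch geometry, using the nblock/nthread names that appear in the CALL macro below (the helper function itself is illustrative, not NAMD code):

    // Grid dimensions so that one warp covers one tile list.
    void launchGeometry(int numTileLists, int &nblock, int &nthread) {
      const int nwarp    = 4;                          // NONBONDKERNEL_NUM_WARP
      const int warpsize = 32;                         // WARPSIZE on NVIDIA hardware
      nthread = warpsize * nwarp;                      // threads per block
      nblock  = (numTileLists + nwarp - 1) / nwarp;    // ceil-divide: one warp per list
    }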

Define Documentation

#define CALL ( DOENERGY,
DOVIRIAL,
DOSLOW,
DOPAIRLIST,
DOSTREAMING   ) 
Value:
nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
  <<< nblock, nthread, shMemSize, stream >>>  \
  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
    cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
    vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
    cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
    atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
    tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
    tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
    numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
    outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
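
The CALL macro pairs each combination of the five runtime booleans with one compile-time instantiation of nonbondedForceKernel, so disabled features cost nothing inside the kernel. A self-contained sketch of the pattern (the toy kernel and names are illustrative, not NAMD code):

    #include <cstdio>

    template <bool doEnergy, bool doSlow>
    __global__ void toyKernel(float *out) {
      float v = 1.0f;
      if (doEnergy) v += 2.0f;   // branch compiled out when doEnergy == false
      if (doSlow)   v += 4.0f;   // branch compiled out when doSlow == false
      out[threadIdx.x] = v;
    }

    #define TOY_CALL(DOENERGY, DOSLOW) \
      toyKernel<DOENERGY, DOSLOW><<< 1, 32 >>>(d_out); called = true

    void launchToy(bool doEnergy, bool doSlow, float *d_out) {
      bool called = false;
      switch ((doEnergy ? 1 : 0) | (doSlow ? 2 : 0)) {
        case 0: TOY_CALL(false, false); break;
        case 1: TOY_CALL(true,  false); break;
        case 2: TOY_CALL(false, true);  break;
        case 3: TOY_CALL(true,  true);  break;
      }
      if (!called) printf("no kernel variant selected\n");
    }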
#define LARGE_FLOAT   (float)(1.0e10)

Definition at line 151 of file CudaComputeNonbondedKernel.cu.

#define MAX_CONST_EXCLUSIONS   2048

Definition at line 15 of file CudaComputeNonbondedKernel.cu.

#define NONBONDKERNEL_NUM_WARP   4

Definition at line 18 of file CudaComputeNonbondedKernel.cu.

#define OVERALLOC   1.2f
#define REDUCEGBISENERGYKERNEL_NUM_WARP   32
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP   32
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP   32

Function Documentation

template<bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
__global__ void __launch_bounds__ ( WARPSIZE*NONBONDKERNEL_NUM_WARP,
doPairlist ? (10) : (doEnergy ? (10) : (10))
) [inline]
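
The qualifier tells the compiler the block size the kernel is launched with and a minimum number of resident blocks per multiprocessor, which bounds register usage. A generic sketch of the construct (the occupancy values here are illustrative, not NAMD's per-architecture numbers):

    __global__ void
    __launch_bounds__(32 * 4 /* maxThreadsPerBlock */, 10 /* minBlocksPerMultiprocessor */)
    exampleKernel(float *out) {
      out[blockIdx.x * blockDim.x + threadIdx.x] = 0.0f;
    }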
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcForceEnergy ( const float  r2,
const float  qi,
const float  qj,
const float  dx,
const float  dy,
const float  dz,
const int  vdwtypei,
const int  vdwtypej,
const float2 *__restrict__  vdwCoefTable,
cudaTextureObject_t  vdwCoefTableTex,
cudaTextureObject_t  forceTableTex,
cudaTextureObject_t  energyTableTex,
float3 &  iforce,
float3 &  iforceSlow,
float3 &  jforce,
float3 &  jforceSlow,
float &  energyVdw,
float &  energyElec,
float &  energySlow 
) [inline]

Definition at line 22 of file CudaComputeNonbondedKernel.cu.

References __ldg, float2::x, and float2::y.

00028                                                           {
00029 
00030   int vdwIndex = vdwtypej + vdwtypei;
00031 #if __CUDA_ARCH__ >= 350
00032   float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
00033 #else
00034   float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
00035 #endif
00036 
00037   float rinv = rsqrtf(r2);
00038   float4 ei;
00039   float4 fi = tex1D<float4>(forceTableTex, rinv);
00040   if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
00041 
00042   float fSlow = qi * qj;
00043   float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
00044 
00045   if (doEnergy) {
00046     energyVdw  += ljab.x * ei.z + ljab.y * ei.y;
00047     energyElec += fSlow * ei.x;
00048     if (doSlow) energySlow += fSlow * ei.w;
00049   }
00050   if (doSlow) fSlow *= fi.w;
00051 
00052   float fx = dx * f;
00053   float fy = dy * f;
00054   float fz = dz * f;
00055   iforce.x += fx;
00056   iforce.y += fy;
00057   iforce.z += fz;
00058   jforce.x -= fx;
00059   jforce.y -= fy;
00060   jforce.z -= fz;
00061 
00062   if (doSlow) {
00063     float fxSlow = dx * fSlow;
00064     float fySlow = dy * fSlow;
00065     float fzSlow = dz * fSlow;
00066     iforceSlow.x += fxSlow;
00067     iforceSlow.y += fySlow;
00068     iforceSlow.z += fzSlow;
00069     jforceSlow.x -= fxSlow;
00070     jforceSlow.y -= fySlow;
00071     jforceSlow.z -= fzSlow;
00072   }
00073 }
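
The pair interaction is fully tabulated: the textures are sampled at rinv = rsqrtf(r2) with hardware interpolation, and the fetched float4 channels are weighted by the Lennard-Jones coefficients ljab and the charge product. A host-side restatement of the combination step (the lookup callback stands in for tex1D; the channel usage is read off the code above, nothing more is assumed):

    #include <cmath>

    struct Tab4 { float x, y, z, w; };   // mirrors the float4 texture texel

    // Scale factor applied to (dx, dy, dz), as in the kernel:
    //   f = ljab.x*fi.z + ljab.y*fi.y + qi*qj*fi.x
    float pairForceScale(float r2, float qi, float qj, float ljabx, float ljaby,
                         Tab4 (*lookup)(float rinv)) {
      float rinv = 1.0f / std::sqrt(r2);   // rsqrtf(r2) in the kernel
      Tab4 fi = lookup(rinv);
      return ljabx * fi.z + ljaby * fi.y + qi * qj * fi.x;
    }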

__device__ __forceinline__ float distsq ( const BoundingBox  a,
const float4  b 
)

Definition at line 143 of file CudaComputeNonbondedKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

Referenced by buildTileListsBBKernel(), and if().

00143                                                                              {
00144   float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
00145   float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
00146   float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
00147   float r2 = dx*dx + dy*dy + dz*dz;
00148   return r2;
00149 }
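
distsq() returns the squared distance from point b to bounding box a (zero when b lies inside the box, given the box center x,y,z and half-widths wx,wy,wz), which lets a whole warp skip a j-tile when every lane's j-atom is beyond the pairlist cutoff. A host-side restatement of that early-bail test (names are illustrative):

    #include <algorithm>
    #include <cmath>

    struct Box { float x, y, z, wx, wy, wz; };   // center and half-widths

    float distsqHost(const Box &a, float bx, float by, float bz) {
      float dx = std::max(0.0f, std::fabs(a.x - bx) - a.wx);
      float dy = std::max(0.0f, std::fabs(a.y - by) - a.wy);
      float dz = std::max(0.0f, std::fabs(a.z - bz) - a.wz);
      return dx*dx + dy*dy + dz*dz;
    }

    // Per-lane equivalent of: if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
    bool laneBeyondCutoff(const Box &a, float bx, float by, float bz, float plcutoff2) {
      return distsqHost(a, bx, by, bz) > plcutoff2;   // true for all lanes => skip tile
    }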

if ( itileList < numTileLists )

Definition at line 195 of file CudaComputeNonbondedKernel.cu.

References CudaPatchRecord::atomStart, constExclusions, distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, TileList::iatomStart, j, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, LARGE_FLOAT, MAX_CONST_EXCLUSIONS, CudaPatchRecord::numAtoms, TileList::offsetXYZ, TileList::patchInd, TileList::patchNumList, tempStorage, WARP_ALL, WARP_ANY, WARP_FULL_MASK, WARP_SHUFFLE, WARP_SHUFFLE_XOR, WARPSIZE, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00196   {
00197 
00198     float3 iforce;
00199     float3 iforceSlow;
00200     float energyVdw, energyElec, energySlow;
00201     int nexcluded;
00202     unsigned int itileListLen;
00203     int2 patchInd;
00204     int2 patchNumList;
00205 
00206     // Start computation
00207     {
00208       // Warp index (0...warpsize-1)
00209       const int wid = threadIdx.x % WARPSIZE;
00210 
00211       TileList tmp = tileLists[itileList];
00212       int iatomStart = tmp.iatomStart;
00213       int jtileStart = tmp.jtileStart;
00214       int jtileEnd   = tmp.jtileEnd;
00215       patchInd     = tmp.patchInd;
00216       patchNumList = tmp.patchNumList;
00217 
00218       float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
00219       float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
00220       float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
00221 
00222       // DH - set zeroShift flag if magnitude of shift vector is zero
00223       bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
00224 
00225       int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
00226       if (doPairlist) {
00227         PatchPairRecord PPStmp = patchPairs[itileList];
00228         iatomSize     = PPStmp.iatomSize;
00229         iatomFreeSize = PPStmp.iatomFreeSize;
00230         jatomSize     = PPStmp.jatomSize;
00231         jatomFreeSize = PPStmp.jatomFreeSize;
00232       }
00233 
00234       // Write to global memory here to avoid register spilling
00235       if (doVirial) {
00236         if (wid == 0) {
00237           virialEnergy[itileList].shx = shx;
00238           virialEnergy[itileList].shy = shy;
00239           virialEnergy[itileList].shz = shz;
00240         }
00241       }
00242 
00243       // Load i-atom data (and shift coordinates)
00244       float4 xyzq_i = xyzq[iatomStart + wid];
00245       xyzq_i.x += shx;
00246       xyzq_i.y += shy;
00247       xyzq_i.z += shz;
00248       int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
00249 
00250       // Load i-atom bounding box (and shift coordinates)
00251       BoundingBox boundingBoxI;
00252       if (doPairlist) {
00253         boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
00254         boundingBoxI.x += shx;
00255         boundingBoxI.y += shy;
00256         boundingBoxI.z += shz;
00257       }
00258 
00259       // Get i-atom global index
00260 #ifdef USE_NEW_EXCL_METHOD
00261       int iatomIndex, minExclAtom, maxExclAtom;
00262 #else
00263       int iatomIndex;
00264 #endif
00265       if (doPairlist) {
00266 #ifdef USE_NEW_EXCL_METHOD
00267         iatomIndex = atomIndex[iatomStart + wid];
00268         int2 tmp = minmaxExclAtom[iatomStart + wid];
00269         minExclAtom = tmp.x;
00270         maxExclAtom = tmp.y;
00271 #else
00272         iatomIndex = atomIndex[iatomStart + wid];
00273 #endif
00274       }
00275 
00276       // i-forces in registers
00277       // float3 iforce;
00278       iforce.x = 0.0f;
00279       iforce.y = 0.0f;
00280       iforce.z = 0.0f;
00281 
00282       // float3 iforceSlow;
00283       if (doSlow) {
00284         iforceSlow.x = 0.0f;
00285         iforceSlow.y = 0.0f;
00286         iforceSlow.z = 0.0f;
00287       }
00288 
00289       // float energyVdw, energyElec, energySlow;
00290       if (doEnergy) {
00291         energyVdw = 0.0f;
00292         energyElec = 0.0f;
00293         if (doSlow) energySlow = 0.0f;
00294       }
00295 
00296       // Number of exclusions
00297       // NOTE: Lowest bit is used as indicator bit for tile pairs:
00298       //       bit = 0: tile has no atoms within pairlist cutoff
00299       //       bit = 1: tile has atoms within pairlist cutoff
00300       // int nexcluded;
00301       if (doPairlist) nexcluded = 0;
00302 
00303       // Number of i loops and free atoms
00304       int nfreei;
00305       if (doPairlist) {
00306         int nloopi = min(iatomSize - iatomStart, WARPSIZE);
00307         nfreei = max(iatomFreeSize - iatomStart, 0);
00308         if (wid >= nloopi) {
00309           xyzq_i.x = -LARGE_FLOAT;
00310           xyzq_i.y = -LARGE_FLOAT;
00311           xyzq_i.z = -LARGE_FLOAT;
00312         }
00313       }
00314 
00315       // tile list stuff
00316       // int itileListLen;
00317       // int minJatomStart;
00318       if (doPairlist) {
00319         // minJatomStart = tileJatomStart[jtileStart];
00320         itileListLen = 0;
00321       }
00322 
00323       // Exclusion index and maxdiff
00324       int iexclIndex, iexclMaxdiff;
00325       if (doPairlist) {
00326         int2 tmp = exclIndexMaxDiff[iatomStart + wid];
00327         iexclIndex   = tmp.x;
00328         iexclMaxdiff = tmp.y;
00329       }
00330 
00331       for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
00332 
00333         // Load j-atom starting index and exclusion mask
00334         int jatomStart = tileJatomStart[jtile];
00335 
00336         float4 xyzq_j = xyzq[jatomStart + wid];
00337 
00338         // Check for early bail
00339         if (doPairlist) {
00340           float r2bb = distsq(boundingBoxI, xyzq_j);
00341           if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
00342         }
00343         unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
00344         int vdwtypej = vdwTypes[jatomStart + wid];
00345 
00346         // Get j-atom global index
00347         int jatomIndex;
00348         if (doPairlist) {
00349           jatomIndex = atomIndex[jatomStart + wid];
00350         }
00351 
00352         // Number of j loops and free atoms
00353         int nfreej;
00354         if (doPairlist) {
00355           int nloopj = min(jatomSize - jatomStart, WARPSIZE);
00356           nfreej = max(jatomFreeSize - jatomStart, 0);
00357           //if (nfreei == 0 && nfreej == 0) continue;
00358           if (wid >= nloopj) {
00359             xyzq_j.x = LARGE_FLOAT;
00360             xyzq_j.y = LARGE_FLOAT;
00361             xyzq_j.z = LARGE_FLOAT;
00362           }
00363         }
00364 
00365         // DH - self requires that zeroShift is also set
00366         const bool self = zeroShift && (iatomStart == jatomStart);
00367         const int modval = (self) ? 2*WARPSIZE-1 : WARPSIZE-1;
00368 
00369         float3 jforce;
00370         jforce.x = 0.0f;
00371         jforce.y = 0.0f;
00372         jforce.z = 0.0f;
00373         
00374         float3 jforceSlow;
00375         if (doSlow) {
00376           jforceSlow.x = 0.0f;
00377           jforceSlow.y = 0.0f;
00378           jforceSlow.z = 0.0f;
00379         }
00380 
00381         int t = (self) ? 1 : 0;
00382 
00383         if (doPairlist) {
00384           // Build pair list
00385           // NOTE: Pairlist update, we must also include the diagonal since this is used
00386           //       in GBIS phase 2.
00387           // Clear the lowest (indicator) bit
00388           nexcluded &= (~1);
00389 
00390           // For self tiles, do the diagonal term (t=0).
00391           // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
00392           if (self) {
00393             int j = (0 + wid) & modval;
00394             // NOTE: __shfl() operation can give nonsense here because j may be >= WARPSIZE.
00395             //       However, if (j < WARPSIZE ..) below makes sure that these nonsense
00396             //       results are not actually ever used
00397             float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
00398             float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
00399             float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
00400 
00401             float r2 = dx*dx + dy*dy + dz*dz;
00402 
00403             if (j < WARPSIZE && r2 < plcutoff2) {
00404               // We have atom pair within the pairlist cutoff => Set indicator bit
00405               nexcluded |= 1;
00406             }
00407             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00408           }
00409 
00410           for (;t < WARPSIZE;t++) {
00411             int j = (t + wid) & modval;
00412 
00413             // NOTE: __shfl() operation can give nonsense here because j may be >= WARPSIZE.
00414             //       However, if (j < WARPSIZE ..) below makes sure that these nonsense
00415             //       results are not used
00416             float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
00417             float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
00418             float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
00419 
00420             float r2 = dx*dx + dy*dy + dz*dz;
00421 
00422             excl >>= 1;
00423             if (j < WARPSIZE && r2 < plcutoff2) {
00424               // We have atom pair within the pairlist cutoff => Set indicator bit
00425               nexcluded |= 1;
00426               if (j < nfreej || wid < nfreei) {
00427                 bool excluded = false;
00428                 int indexdiff = jatomIndex - iatomIndex;
00429                 if ( abs(indexdiff) <= iexclMaxdiff) {
00430                   indexdiff += iexclIndex;
00431                   int indexword = ((unsigned int) indexdiff) >> 5;
00432 
00433                   if ( indexword < MAX_CONST_EXCLUSIONS ) {
00434                     indexword = constExclusions[indexword];
00435                   } else {
00436                     indexword = overflowExclusions[indexword];
00437                   }
00438 
00439                   excluded = ((indexword & (1<<(indexdiff&31))) != 0);
00440                 }
00441                 if (excluded) nexcluded += 2;
00442                 if (!excluded) excl |= 0x80000000;
00443                 if (!excluded && r2 < cutoff2) {
00444                   calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
00445                     vdwtypei, vdwtypej,
00446                     vdwCoefTable,
00447                     vdwCoefTableTex, forceTableTex, energyTableTex,
00448                     iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);
00449                 }
00450               }
00451             }
00452             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00453             shuffleNext<doSlow>(jforce, jforceSlow);
00454           } // t
00455         } else {
00456           // Just compute forces
00457           if (self) {
00458             excl >>= 1;
00459             xyzq_j.x = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00460             xyzq_j.y = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00461             xyzq_j.z = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00462             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00463           }
00464           for (;t < WARPSIZE;t++) {
00465             if ((excl & 1)) {
00466               float dx = xyzq_j.x - xyzq_i.x;
00467               float dy = xyzq_j.y - xyzq_i.y;
00468               float dz = xyzq_j.z - xyzq_i.z;
00469 
00470               float r2 = dx*dx + dy*dy + dz*dz;
00471 
00472               if (r2 < cutoff2) {
00473                 calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
00474                   vdwtypei, vdwtypej,
00475                   vdwCoefTable,
00476                   vdwCoefTableTex, forceTableTex, energyTableTex,
00477                   iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);
00478               } // (r2 < cutoff2)
00479             } // (excl & 1)
00480             excl >>= 1;
00481             xyzq_j.x = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00482             xyzq_j.y = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00483             xyzq_j.z = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00484             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00485             shuffleNext<doSlow>(jforce, jforceSlow);
00486           } // t
00487         }
00488 
00489         // Write j-forces
00490         storeForces<doSlow>(jatomStart + wid, jforce, jforceSlow, devForces, devForcesSlow);
00491 
00492         // Write exclusions
00493         if (doPairlist && WARP_ANY(WARP_FULL_MASK, nexcluded & 1)) {
00494           int anyexcl = (65536 | WARP_ANY(WARP_FULL_MASK, excl));
00495           // Mark this jtile as non-empty:
00496           //  VdW:      1 if tile has atom pairs within pairlist cutoff and some of these atoms interact
00497           //  GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessarily interacting (i.e. these atoms are fixed or excluded)
00498           if (wid == 0) jtiles[jtile] = anyexcl;
00499           // Store exclusions
00500           tileExcls[jtile].excl[wid] = excl;
00501           // itileListLen:
00502           // lower 16 bits: number of tiles with atom pairs within pairlist cutoff that interact
00503           // upper 16 bits: number of tiles with atom pairs within pairlist cutoff (but not necessarily interacting)
00504           itileListLen += anyexcl;
00505           // NOTE, this minJatomStart is only stored once for the first tile list entry
00506           // minJatomStart = min(minJatomStart, jatomStart);
00507         }
00508 
00509       } // jtile
00510 
00511       // Write i-forces
00512       storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow, devForces, devForcesSlow);
00513     }
00514     // Done with computation
00515 
00516     // Save pairlist stuff
00517     if (doPairlist) {
00518 
00519       // Warp index (0...warpsize-1)
00520       const int wid = threadIdx.x % WARPSIZE;
00521 
00522       if (wid == 0) {
00523         // minJatomStart is in range [0 ... atomStorageSize-1]
00524         //int atom0 = (minJatomStart)/WARPSIZE;
00525         // int atom0 = 0;
00526         // int storageOffset = atomStorageSize/WARPSIZE;
00527         // int itileListLen = 0;
00528         // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
00529         // Store 0 if itileListLen == 0
00530         // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
00531         tileListDepth[itileList] = itileListLen;
00532         tileListOrder[itileList] = itileList;
00533         // Number of active tile lists with tiles with atom pairs within pairlist cutoff that interact
00534         if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
00535         // Number of active tile lists with tiles with atom pairs within pairlist cutoff (but not necessarily interacting)
00536         if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
00537         // NOTE: always numTileListsGBIS >= numTileLists
00538       }
00539 
00540       typedef cub::WarpReduce<int> WarpReduceInt;
00541       __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
00542       int warpId = threadIdx.x / WARPSIZE;
00543       // Remove indicator bit
00544       nexcluded >>= 1;
00545       volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
00546       if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
00547 
00548     }
00549 
00550     if (doVirial) {
00551       // Warp index (0...warpsize-1)
00552       const int wid = threadIdx.x % WARPSIZE;
00553 
00554       typedef cub::WarpReduce<float> WarpReduce;
00555       __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
00556       int warpId = threadIdx.x / WARPSIZE;
00557       volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
00558       volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
00559       volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
00560       if (wid == 0) {
00561         virialEnergy[itileList].forcex = iforcexSum;
00562         virialEnergy[itileList].forcey = iforceySum;
00563         virialEnergy[itileList].forcez = iforcezSum;
00564       }
00565 
00566       if (doSlow) {
00567         iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
00568         iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
00569         iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
00570         if (wid == 0) {
00571           virialEnergy[itileList].forceSlowx = iforcexSum;
00572           virialEnergy[itileList].forceSlowy = iforceySum;
00573           virialEnergy[itileList].forceSlowz = iforcezSum;
00574         }
00575       }
00576     }
00577 
00578     // Reduce energy
00579     if (doEnergy) {
00580       // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
00581       //       (Why does CUB suck here?)
00582 #pragma unroll
00583       for (int i=16;i >= 1;i/=2) {
00584         energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
00585         energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, 32);
00586         if (doSlow) energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
00587       }
00588 
00589       if (threadIdx.x % WARPSIZE == 0) {
00590         virialEnergy[itileList].energyVdw  = energyVdw;
00591         virialEnergy[itileList].energyElec = energyElec;
00592         if (doSlow) virialEnergy[itileList].energySlow = energySlow;
00593       }
00594     }
00595 
00596     if (doStreaming) {
00597       // Make sure devForces and devForcesSlow have been written into device memory
00598       // NO NEED TO SYNCHRONIZE THREADS, THIS IS WARP-LEVEL
00599       __threadfence();
00600 
00601       int patchDone[2] = {false, false};
00602       const int wid = threadIdx.x % WARPSIZE;
00603       if (wid == 0) {
00604         int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
00605         patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
00606         if (patchInd.x != patchInd.y) {
00607           int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
00608           patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
00609         }
00610       }
00611 
00612       patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
00613       patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
00614 
00615       if (patchDone[0]) {
00616         // Patch 1 is done, write onto host-mapped memory
00617         CudaPatchRecord patch = cudaPatches[patchInd.x];
00618         int start = patch.atomStart;
00619         int end   = start + patch.numAtoms;
00620         for (int i=start+wid;i < end;i+=WARPSIZE) {
00621           mapForces[i] = devForces[i];
00622           if (doSlow) mapForcesSlow[i] = devForcesSlow[i];
00623         }
00624       }
00625       if (patchDone[1]) {
00626         // Patch 2 is done
00627         CudaPatchRecord patch = cudaPatches[patchInd.y];
00628         int start = patch.atomStart;
00629         int end   = start + patch.numAtoms;
00630         for (int i=start+wid;i < end;i+=WARPSIZE) {
00631           mapForces[i] = devForces[i];
00632           if (doSlow) mapForcesSlow[i] = devForcesSlow[i];
00633         }
00634       }
00635 
00636       if (patchDone[0] || patchDone[1]) {
00637         // Make sure mapForces and mapForcesSlow are up-to-date
00638         __threadfence_system();
00639         // Add patch into "patchReadyQueue"
00640         if (wid == 0) {
00641           if (patchDone[0]) {
00642             int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
00643             // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
00644             mapPatchReadyQueue[ind] = patchInd.x;
00645           }
00646           if (patchDone[1]) {
00647             int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
00648             // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
00649             mapPatchReadyQueue[ind] = patchInd.y;
00650           }
00651         }
00652         // Make sure "patchReadyQueue" is visible in page-locked host memory
00653         __threadfence_system();
00654       }
00655     }
00656 
00657     if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
00658       int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
00659       outputOrder[index] = itileList;
00660     }
00661   } // if (itileList < numTileLists)
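
The exclusion test in the pairlist branch (listing lines 428-439 above) offsets the signed atom-index difference by the i-atom's exclusion-table start and decodes it as a bit in a packed array of 32-bit words, read from constant memory when the word index fits and from overflowExclusions otherwise. A host-side restatement of that decoding (the function name is illustrative):

    #include <cstdlib>

    bool isExcluded(int iatomIndex, int jatomIndex,
                    int iexclIndex, int iexclMaxdiff,
                    const unsigned int *constExcl,      // MAX_CONST_EXCLUSIONS words
                    const unsigned int *overflowExcl) {
      int indexdiff = jatomIndex - iatomIndex;
      if (std::abs(indexdiff) > iexclMaxdiff) return false;       // outside the table
      indexdiff += iexclIndex;
      unsigned int indexword = ((unsigned int)indexdiff) >> 5;    // word = diff / 32
      unsigned int word = (indexword < MAX_CONST_EXCLUSIONS)
                            ? constExcl[indexword] : overflowExcl[indexword];
      return (word & (1u << (indexdiff & 31))) != 0;              // bit = diff % 32
    }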

void NAMD_die ( const char *   ) 

Definition at line 81 of file common.C.

00083 {
00084    if ( ! err_msg ) err_msg = "(unknown error)";
00085    char *new_err_msg = new char[strlen(err_msg) + 40];
00086    sprintf(new_err_msg,"FATAL ERROR: %s\n",err_msg);
00087    CkPrintf(new_err_msg);
00088    fflush(stdout);
00089    if ( CmiNumPartitions() > 1 ) {
00090      sprintf(new_err_msg,"REPLICA %d FATAL ERROR: %s\n", CmiMyPartition(), err_msg);
00091    }
00092    CmiAbort(new_err_msg);
00093    delete [] new_err_msg;
00094 }

__global__ void reduceGBISEnergyKernel ( const int  numTileLists,
const TileListVirialEnergy *__restrict__  tileListVirialEnergy,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 910 of file CudaComputeNonbondedKernel.cu.

References BLOCK_SYNC, and tempStorage.

00912                                            {
00913 
00914   for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
00915   {
00916     int itileList = ibase + threadIdx.x;
00917     double energyGBISt = 0.0;
00918     if (itileList < numTileLists) {
00919       energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
00920     }
00921 
00922     typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00923     __shared__ typename BlockReduce::TempStorage tempStorage;
00924     volatile double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
00925     if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
00926   }
00927 
00928 }
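
All three reduction kernels use the same grid-stride outer loop, so any grid size covers any input size. A self-contained sketch of the pattern (not NAMD code; double-precision atomicAdd assumes compute capability 6.0 or newer):

    __global__ void gridStrideSum(const double *in, int n, double *out) {
      double acc = 0.0;
      for (int ibase = blockIdx.x * blockDim.x; ibase < n;
           ibase += blockDim.x * gridDim.x) {
        int i = ibase + threadIdx.x;
        if (i < n) acc += in[i];       // guarded tail, as in the kernels above
      }
      atomicAdd(out, acc);             // accumulate across blocks
    }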

__global__ void reduceNonbondedVirialKernel ( const bool  doSlow,
const int  atomStorageSize,
const float4 *__restrict__  xyzq,
const float4 *__restrict__  devForces,
const float4 *__restrict__  devForcesSlow,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 668 of file CudaComputeNonbondedKernel.cu.

References BLOCK_SYNC, and tempStorage.

00672                                            {
00673 
00674   for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
00675   {
00676     int i = ibase + threadIdx.x;
00677 
00678     // Set to zero to avoid nan*0
00679     float4 pos;
00680     pos.x = 0.0f;
00681     pos.y = 0.0f;
00682     pos.z = 0.0f;
00683     float4 force, forceSlow;
00684     force.x = 0.0f;
00685     force.y = 0.0f;
00686     force.z = 0.0f;
00687     forceSlow.x = 0.0f;
00688     forceSlow.y = 0.0f;
00689     forceSlow.z = 0.0f;
00690     if (i < atomStorageSize) {
00691       pos = xyzq[i];
00692       force = devForces[i];
00693       if (doSlow) forceSlow = devForcesSlow[i];
00694     }
00695     // Reduce across the entire thread block
00696     float vxxt = force.x*pos.x;
00697     float vxyt = force.x*pos.y;
00698     float vxzt = force.x*pos.z;
00699     float vyxt = force.y*pos.x;
00700     float vyyt = force.y*pos.y;
00701     float vyzt = force.y*pos.z;
00702     float vzxt = force.z*pos.x;
00703     float vzyt = force.z*pos.y;
00704     float vzzt = force.z*pos.z;
00705     // atomicAdd(&virialEnergy->virial[0], (double)vxx);
00706     // atomicAdd(&virialEnergy->virial[1], (double)vxy);
00707     // atomicAdd(&virialEnergy->virial[2], (double)vxz);
00708     // atomicAdd(&virialEnergy->virial[3], (double)vyx);
00709     // atomicAdd(&virialEnergy->virial[4], (double)vyy);
00710     // atomicAdd(&virialEnergy->virial[5], (double)vyz);
00711     // atomicAdd(&virialEnergy->virial[6], (double)vzx);
00712     // atomicAdd(&virialEnergy->virial[7], (double)vzy);
00713     // atomicAdd(&virialEnergy->virial[8], (double)vzz);
00714 
00715     typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00716     __shared__ typename BlockReduce::TempStorage tempStorage;
00717     volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
00718     volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
00719     volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
00720     volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
00721     volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
00722     volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
00723     volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
00724     volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
00725     volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
00726     if (threadIdx.x == 0) {
00727       atomicAdd(&virialEnergy->virial[0], (double)vxx);
00728       atomicAdd(&virialEnergy->virial[1], (double)vxy);
00729       atomicAdd(&virialEnergy->virial[2], (double)vxz);
00730       atomicAdd(&virialEnergy->virial[3], (double)vyx);
00731       atomicAdd(&virialEnergy->virial[4], (double)vyy);
00732       atomicAdd(&virialEnergy->virial[5], (double)vyz);
00733       atomicAdd(&virialEnergy->virial[6], (double)vzx);
00734       atomicAdd(&virialEnergy->virial[7], (double)vzy);
00735       atomicAdd(&virialEnergy->virial[8], (double)vzz);
00736     }
00737 
00738     if (doSlow) {
00739       // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
00740       float vxxSlowt = forceSlow.x*pos.x;
00741       float vxySlowt = forceSlow.x*pos.y;
00742       float vxzSlowt = forceSlow.x*pos.z;
00743       float vyxSlowt = forceSlow.y*pos.x;
00744       float vyySlowt = forceSlow.y*pos.y;
00745       float vyzSlowt = forceSlow.y*pos.z;
00746       float vzxSlowt = forceSlow.z*pos.x;
00747       float vzySlowt = forceSlow.z*pos.y;
00748       float vzzSlowt = forceSlow.z*pos.z;
00749       // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
00750       // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
00751       // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
00752       // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
00753       // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
00754       // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
00755       // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
00756       // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
00757       // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
00758       volatile float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
00759       volatile float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
00760       volatile float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
00761       volatile float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
00762       volatile float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
00763       volatile float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
00764       volatile float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
00765       volatile float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
00766       volatile float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
00767       if (threadIdx.x == 0) {
00768         atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
00769         atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
00770         atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
00771         atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
00772         atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
00773         atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
00774         atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
00775         atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
00776         atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
00777       }
00778     }
00779   
00780   }
00781 
00782 }
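
Note how a single shared TempStorage is reused for the nine successive block-wide sums, with BLOCK_SYNC between them. A minimal self-contained version of that CUB pattern (not NAMD code; BLOCK_SYNC is assumed to be __syncthreads()):

    #include <cub/cub.cuh>

    #define NUM_THREADS 1024   // e.g. REDUCENONBONDEDVIRIALKERNEL_NUM_WARP warps of 32 threads

    __global__ void twoSumsKernel(const float *a, const float *b, float *out) {
      typedef cub::BlockReduce<float, NUM_THREADS> BlockReduce;
      __shared__ typename BlockReduce::TempStorage tempStorage;

      float sumA = BlockReduce(tempStorage).Sum(a[threadIdx.x]);
      __syncthreads();   // required before tempStorage is reused
      float sumB = BlockReduce(tempStorage).Sum(b[threadIdx.x]);

      // BlockReduce::Sum() returns the valid total only on thread 0
      if (threadIdx.x == 0) { out[0] = sumA; out[1] = sumB; }
    }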

__global__ void reduceVirialEnergyKernel ( const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const int  numTileLists,
const TileListVirialEnergy *__restrict__  tileListVirialEnergy,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 785 of file CudaComputeNonbondedKernel.cu.

References BLOCK_SYNC, TileListVirialEnergy::energyElec, TileListVirialEnergy::energySlow, TileListVirialEnergy::energyVdw, TileListVirialEnergy::forceSlowx, TileListVirialEnergy::forceSlowy, TileListVirialEnergy::forceSlowz, TileListVirialEnergy::forcex, TileListVirialEnergy::forcey, TileListVirialEnergy::forcez, TileListVirialEnergy::shx, TileListVirialEnergy::shy, TileListVirialEnergy::shz, and tempStorage.

00789                                            {
00790 
00791   for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
00792   {
00793     int itileList = ibase + threadIdx.x;
00794     TileListVirialEnergy ve;
00795     if (itileList < numTileLists) {
00796       ve = tileListVirialEnergy[itileList];
00797     } else {
00798       // Set to zero to avoid nan*0
00799       if (doVirial) {
00800         ve.shx = 0.0f;
00801         ve.shy = 0.0f;
00802         ve.shz = 0.0f;
00803         ve.forcex = 0.0f;
00804         ve.forcey = 0.0f;
00805         ve.forcez = 0.0f;
00806         ve.forceSlowx = 0.0f;
00807         ve.forceSlowy = 0.0f;
00808         ve.forceSlowz = 0.0f;
00809       }
00810       if (doEnergy) {
00811         ve.energyVdw = 0.0;
00812         ve.energyElec = 0.0;
00813         ve.energySlow = 0.0;
00814         // ve.energyGBIS = 0.0;
00815       }
00816     }
00817 
00818     if (doVirial) {
00819       typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00820       __shared__ typename BlockReduce::TempStorage tempStorage;
00821       float vxxt = ve.forcex*ve.shx;
00822       float vxyt = ve.forcex*ve.shy;
00823       float vxzt = ve.forcex*ve.shz;
00824       float vyxt = ve.forcey*ve.shx;
00825       float vyyt = ve.forcey*ve.shy;
00826       float vyzt = ve.forcey*ve.shz;
00827       float vzxt = ve.forcez*ve.shx;
00828       float vzyt = ve.forcez*ve.shy;
00829       float vzzt = ve.forcez*ve.shz;
00830       volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
00831       volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
00832       volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
00833       volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
00834       volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
00835       volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
00836       volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
00837       volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
00838       volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
00839       if (threadIdx.x == 0) {
00840         atomicAdd(&virialEnergy->virial[0], (double)vxx);
00841         atomicAdd(&virialEnergy->virial[1], (double)vxy);
00842         atomicAdd(&virialEnergy->virial[2], (double)vxz);
00843         atomicAdd(&virialEnergy->virial[3], (double)vyx);
00844         atomicAdd(&virialEnergy->virial[4], (double)vyy);
00845         atomicAdd(&virialEnergy->virial[5], (double)vyz);
00846         atomicAdd(&virialEnergy->virial[6], (double)vzx);
00847         atomicAdd(&virialEnergy->virial[7], (double)vzy);
00848         atomicAdd(&virialEnergy->virial[8], (double)vzz);
00849       }
00850 
00851       if (doSlow) {
00852         typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00853         __shared__ typename BlockReduce::TempStorage tempStorage;
00854         float vxxt = ve.forceSlowx*ve.shx;
00855         float vxyt = ve.forceSlowx*ve.shy;
00856         float vxzt = ve.forceSlowx*ve.shz;
00857         float vyxt = ve.forceSlowy*ve.shx;
00858         float vyyt = ve.forceSlowy*ve.shy;
00859         float vyzt = ve.forceSlowy*ve.shz;
00860         float vzxt = ve.forceSlowz*ve.shx;
00861         float vzyt = ve.forceSlowz*ve.shy;
00862         float vzzt = ve.forceSlowz*ve.shz;
00863         volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
00864         volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
00865         volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
00866         volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
00867         volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
00868         volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
00869         volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
00870         volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
00871         volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
00872         if (threadIdx.x == 0) {
00873           atomicAdd(&virialEnergy->virialSlow[0], (double)vxx);
00874           atomicAdd(&virialEnergy->virialSlow[1], (double)vxy);
00875           atomicAdd(&virialEnergy->virialSlow[2], (double)vxz);
00876           atomicAdd(&virialEnergy->virialSlow[3], (double)vyx);
00877           atomicAdd(&virialEnergy->virialSlow[4], (double)vyy);
00878           atomicAdd(&virialEnergy->virialSlow[5], (double)vyz);
00879           atomicAdd(&virialEnergy->virialSlow[6], (double)vzx);
00880           atomicAdd(&virialEnergy->virialSlow[7], (double)vzy);
00881           atomicAdd(&virialEnergy->virialSlow[8], (double)vzz);
00882         }
00883       }
00884     }
00885 
00886     if (doEnergy) {
00887       typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00888       __shared__ typename BlockReduce::TempStorage tempStorage;
00889       volatile double energyVdw  = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
00890       volatile double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
00891       if (threadIdx.x == 0) {
00892           atomicAdd(&virialEnergy->energyVdw, (double)energyVdw);
00893           atomicAdd(&virialEnergy->energyElec, (double)energyElec);
00894       }
00895       if (doSlow) {
00896         volatile double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
00897         if (threadIdx.x == 0) atomicAdd(&virialEnergy->energySlow, (double)energySlow);
00898       }
00899       // if (doGBIS) {
00900       //   double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
00901       //   if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
00902       // }
00903     }
00904 
00905   }
00906 
00907 }
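
Each virial component above is reduced in two stages: cub::BlockReduce sums the per-thread contributions within the thread block (the BLOCK_SYNC after every call is needed because the calls reuse the same tempStorage), and thread 0 then issues a single double-precision atomicAdd per component into the global accumulator. A minimal sketch of the same pattern, with hypothetical names and a fixed block size of 128 (not NAMD source):

  #include <cub/cub.cuh>

  __global__ void reduceSketch(const float* __restrict__ in, int n,
                               double* __restrict__ out) {
    typedef cub::BlockReduce<float, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage tempStorage;
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    float v = (i < n) ? in[i] : 0.0f;                  // inactive lanes add zero
    float blockSum = BlockReduce(tempStorage).Sum(v);  // valid in thread 0 only
    // One atomic per block keeps global-memory traffic low; accumulating in
    // double limits rounding error (native double atomicAdd needs sm_60+).
    if (threadIdx.x == 0) atomicAdd(out, (double)blockSum);
  }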

template<bool doSlow>
__device__ __forceinline__ void shuffleNext ( float3 &  jforce,
float3 &  jforceSlow 
) [inline]

Definition at line 127 of file CudaComputeNonbondedKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

00127                                                      {
00128   jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00129   jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00130   jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00131   if (doSlow) {
00132     jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00133     jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00134     jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00135   }
00136 }
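
The rotation above passes each lane's accumulated j-forces to the next lane of the warp: lane k reads lane k+1 (source lane (threadIdx.x+1)&(WARPSIZE-1)), so over WARPSIZE iterations of the tile loop every i-atom lane is paired with every j-atom while the j-force accumulators travel with their atoms. A sketch of one such rotation written directly against __shfl_sync, assuming WARP_SHUFFLE expands to it on recent CUDA (hypothetical helper name):

  __device__ __forceinline__ float rotateLanes(float v) {
    // Each lane reads the value currently held by the next-higher lane
    // (mod 32); after 32 rotations every lane has seen every value.
    return __shfl_sync(0xffffffffu, v, (threadIdx.x + 1) & 31, 32);
  }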

template<bool doPairlist>
__device__ __forceinline__ void shuffleNext ( float &  xyzq_j_w,
int &  vdwtypej,
int &  jatomIndex 
) [inline]

Definition at line 117 of file CudaComputeNonbondedKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

00117                                                                   {
00118   xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00119   vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00120   if (doPairlist) {
00121     jatomIndex   = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);    
00122   }
00123 }

template<bool doPairlist>
__device__ __forceinline__ void shuffleNext ( float &  xyzq_j_w,
int &  vdwtypej,
int &  jatomIndex,
int &  jexclMaxdiff,
int &  jexclIndex 
) [inline]

Definition at line 105 of file CudaComputeNonbondedKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

00105                                                                                                       {
00106   xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00107   vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00108   if (doPairlist) {
00109     jatomIndex   = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);    
00110     jexclIndex   = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00111     jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00112   }
00113 }

template<bool doSlow>
__device__ __forceinline__ void storeForces ( const int  pos,
const float3  force,
const float3  forceSlow,
float3 *__restrict__  forces,
float3 *__restrict__  forcesSlow 
) [inline]

Definition at line 91 of file CudaComputeNonbondedKernel.cu.

00092                                                                 {
00093   atomicAdd(&forces[pos].x, force.x);
00094   atomicAdd(&forces[pos].y, force.y);
00095   atomicAdd(&forces[pos].z, force.z);
00096   if (doSlow) {
00097     atomicAdd(&forcesSlow[pos].x, forceSlow.x);
00098     atomicAdd(&forcesSlow[pos].y, forceSlow.y);
00099     atomicAdd(&forcesSlow[pos].z, forceSlow.z);
00100   }
00101 }

template<bool doSlow>
__device__ __forceinline__ void storeForces ( const int  pos,
const float3  force,
const float3  forceSlow,
float4 *__restrict__  devForces,
float4 *__restrict__  devForcesSlow 
) [inline]

Definition at line 77 of file CudaComputeNonbondedKernel.cu.

00078                                                                       {
00079   atomicAdd(&devForces[pos].x, force.x);
00080   atomicAdd(&devForces[pos].y, force.y);
00081   atomicAdd(&devForces[pos].z, force.z);
00082   if (doSlow) {
00083     atomicAdd(&devForcesSlow[pos].x, forceSlow.x);
00084     atomicAdd(&devForcesSlow[pos].y, forceSlow.y);
00085     atomicAdd(&devForcesSlow[pos].z, forceSlow.z);
00086   }
00087 }
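
Both overloads accumulate with atomicAdd rather than plain stores because several tile lists, and therefore several warps, may contribute forces to the same atom concurrently. A consequence is that the destination buffers must be zeroed before the force kernel runs; a hedged host-side sketch of that precondition (hypothetical helper; the actual clearing happens elsewhere in the host code):

  #include <cuda_runtime.h>

  // storeForces only ever adds, so its targets must start at zero.
  void clearForceBuffers(float4* devForces, float4* devForcesSlow,
                         int atomStorageSize, bool doSlow, cudaStream_t stream) {
    cudaMemsetAsync(devForces, 0, atomStorageSize*sizeof(float4), stream);
    if (doSlow)
      cudaMemsetAsync(devForcesSlow, 0, atomStorageSize*sizeof(float4), stream);
  }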


Variable Documentation

Unless noted otherwise, the entries below are parameters of the nonbondedForceKernel kernel (signature at line 161 of CudaComputeNonbondedKernel.cu); each is shown as its type and name.

const int* __restrict__ atomIndex

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

Referenced by HomePatch::doAtomMigration().

const int atomStorageSize

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const BoundingBox* __restrict__ boundingBoxes

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]

Definition at line 16 of file CudaComputeNonbondedKernel.cu.

Referenced by CudaComputeNonbondedKernel::bindExclusions(), and if().
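
The pairing of constExclusions with the overflowExclusions kernel parameter below suggests the usual split: the first MAX_CONST_EXCLUSIONS words of the exclusion bitmap sit in fast, broadcast-cached __constant__ memory and anything beyond is fetched from global memory. A hedged sketch of such a lookup, assuming the file-scope declarations above (not necessarily the kernel's exact code):

  __device__ __forceinline__ unsigned int fetchExclusion(
      int pos, const unsigned int* __restrict__ overflowExclusions) {
    return (pos < MAX_CONST_EXCLUSIONS) ? constExclusions[pos]
                                        : overflowExclusions[pos];
  }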

const CudaPatchRecord* __restrict__ cudaPatches

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float cutoff2

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

float4* __restrict__ devForces

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

float4* __restrict__ devForcesSlow

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

__thread DeviceCUDA* deviceCUDA

Definition at line 18 of file DeviceCUDA.C.

cudaTextureObject_t energyTableTex

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const int2* __restrict__ exclIndexMaxDiff

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

cudaTextureObject_t forceTableTex

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

int itileList = start + threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x
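
The expression assigns one warp to each tile list: threadIdx.x/WARPSIZE is the warp's index within its block, and blockDim.x/WARPSIZE warps are consumed per block. A small sketch of the arithmetic (hypothetical kernel, assuming WARPSIZE = 32 and the NONBONDKERNEL_NUM_WARP = 4 launch configuration, i.e. blockDim.x = 128):

  __global__ void indexSketch(int start) {
    int warpInBlock   = threadIdx.x / 32;  // 0..3 in a 128-thread block
    int warpsPerBlock = blockDim.x / 32;   // 4
    int itileList = start + warpInBlock + warpsPerBlock*blockIdx.x;
    // e.g. thread 70 of block 2: warpInBlock = 2, itileList = start + 10.
    (void)itileList;
  }
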
int* __restrict__ jtiles

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float3 lata

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float3 latb

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float3 latc

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

float4* __restrict__ mapForces

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

float4* __restrict__ mapForcesSlow

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

int* __restrict__ mapPatchReadyQueue

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const int numPatches

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const int numTileLists

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

int* __restrict__ outputOrder

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const unsigned int* __restrict__ overflowExclusions

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

unsigned int* __restrict__ patchNumCount

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const PatchPairRecord* __restrict__ patchPairs

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float plcutoff2

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

TileExcl* __restrict__ tileExcls

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const int* __restrict__ tileJatomStart

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

unsigned int* __restrict__ tileListDepth

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

int* __restrict__ tileListOrder

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const TileList* __restrict__ tileLists

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

TileListStat* __restrict__ tileListStat

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float2* __restrict__ vdwCoefTable

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

cudaTextureObject_t vdwCoefTableTex

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const int vdwCoefTableWidth

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const int* __restrict__ vdwTypes

Definition at line 161 of file CudaComputeNonbondedKernel.cu.

const float4* __restrict__ xyzq

Definition at line 161 of file CudaComputeNonbondedKernel.cu.


Generated on 21 Nov 2019 for NAMD by  doxygen 1.6.1