NAMD
Classes | Macros | Typedefs | Functions
ComputeNonbondedCUDAKernel.h File Reference

Go to the source code of this file.

Classes

struct  exclmask
 

Macros

#define __align__(X)
 
#define NUM_WARP   4
 
#define WARPSIZE   32
 
#define PATCH_PAIR_SIZE   (sizeof(patch_pair)/4)
 
#define COPY_ATOM(DEST, SOURCE)
 
#define COPY_PARAM(DEST, SOURCE)
 
#define COPY_ATOM_TO_SHARED(ATOM, PARAM, SHARED)
 
#define COPY_ATOM_FROM_SHARED(ATOM, PARAM, SHARED)
 
#define MAX_EXCLUSIONS   (1<<27)
 
#define MAX_CONST_EXCLUSIONS   2048
 
#define FORCE_TABLE_SIZE   4096
 

Typedefs

typedef float GBReal
 

Functions

void cuda_errcheck (const char *msg)
 
struct __align__ (16) patch_pair
 
void cuda_bind_exclusions (const unsigned int *t, int n)
 
void cuda_bind_lj_table (const float2 *t, int _lj_table_size)
 
void cuda_bind_force_table (const float4 *t, const float4 *et)
 
void cuda_init ()
 
void cuda_bind_patch_pairs (patch_pair *h_patch_pairs, int npatch_pairs, int npatches, int natoms, int nexclmask, int plist_len)
 
void cuda_bind_atom_params (const atom_param *t)
 
void cuda_bind_vdw_types (const int *t)
 
void cuda_bind_atoms (const atom *a)
 
void cuda_bind_forces (float4 *f, float4 *f_slow)
 
void cuda_bind_virials (float *v, int *queue, int *blockorder)
 
void cuda_nonbonded_forces (float3 lata, float3 latb, float3 latc, float cutoff2, float plcutoff2, int cbegin, int ccount, int ctotal, int doSlow, int doEnergy, int usePairlists, int savePairlists, int doStreaming, int saveOrder, cudaStream_t &strm)
 
void cuda_GBIS_P1 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
 
void cuda_GBIS_P2 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float r_cut, float scaling, float kappa, float smoothDist, float epsilon_p, float epsilon_s, float3 lata, float3 latb, float3 latc, int doEnergy, int doFullElec, cudaStream_t &strm)
 
void cuda_GBIS_P3 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float scaling, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
 
void cuda_bind_GBIS_intRad (float *intRad0H, float *intRadSH)
 
void cuda_bind_GBIS_energy (float *energy_gbis)
 
void cuda_bind_GBIS_psiSum (GBReal *psiSumH)
 
void cuda_bind_GBIS_bornRad (float *bornRadH)
 
void cuda_bind_GBIS_dEdaSum (GBReal *dEdaSumH)
 
void cuda_bind_GBIS_dHdrPrefix (float *dHdrPrefixH)
 
int cuda_stream_finished ()
 

Macro Definition Documentation

#define __align__ (   X)

Definition at line 9 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM (   DEST,
  SOURCE 
)
Value:
{ \
DEST.position.x = SOURCE.position.x; \
DEST.position.y = SOURCE.position.y; \
DEST.position.z = SOURCE.position.z; \
DEST.charge = SOURCE.charge; \
}

Definition at line 59 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM_FROM_SHARED (   ATOM,
  PARAM,
  SHARED 
)
Value:
{ \
COPY_ATOM( ATOM, SHARED ) \
COPY_PARAM( PARAM, SHARED ) \
}
#define COPY_PARAM(DEST, SOURCE)
#define ATOM
#define COPY_ATOM(DEST, SOURCE)

Definition at line 79 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM_TO_SHARED (   ATOM,
  PARAM,
  SHARED 
)
Value:
{ \
COPY_ATOM( SHARED, ATOM ) \
COPY_PARAM( SHARED, PARAM ) \
}
#define COPY_PARAM(DEST, SOURCE)
#define ATOM
#define COPY_ATOM(DEST, SOURCE)

Definition at line 74 of file ComputeNonbondedCUDAKernel.h.

#define COPY_PARAM (   DEST,
  SOURCE 
)
Value:
{ \
DEST.sqrt_epsilon = SOURCE.sqrt_epsilon; \
DEST.half_sigma = SOURCE.half_sigma; \
DEST.index = SOURCE.index; \
DEST.excl_index = SOURCE.excl_index; \
DEST.excl_maxdiff = SOURCE.excl_maxdiff; \
}

Definition at line 66 of file ComputeNonbondedCUDAKernel.h.

#define FORCE_TABLE_SIZE   4096
#define MAX_CONST_EXCLUSIONS   2048

Definition at line 87 of file ComputeNonbondedCUDAKernel.h.

Referenced by cuda_bind_exclusions(), and void().

#define MAX_EXCLUSIONS   (1<<27)

Definition at line 86 of file ComputeNonbondedCUDAKernel.h.

Referenced by ComputeNonbondedCUDA::build_exclusions().

#define NUM_WARP   4
#define PATCH_PAIR_SIZE   (sizeof(patch_pair)/4)

Definition at line 45 of file ComputeNonbondedCUDAKernel.h.

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), and void().

#define WARPSIZE   32

Typedef Documentation

typedef float GBReal

Definition at line 3 of file ComputeNonbondedCUDAKernel.h.

Function Documentation

struct __align__ ( 16  )

Definition at line 22 of file ComputeNonbondedCUDAKernel.h.

References plist_size.

22  {
23  float3 offset;
24  int patch1_start; // Coordinate/force start for this patch
25  int patch1_size; // Size of the patch
26  int patch2_start;
27  int patch2_size;
28  int patch1_ind; // Patch index
29  int patch2_ind;
30  int patch1_num_pairs; // Number of pairs that involve this patch
31  int patch2_num_pairs;
32  union {
33  bool patch_done[2]; // After-GPU-computation shared memory temporary storage
34  struct {
35  int plist_start; // Pair list start
36  int plist_size; // Pair list size
37  };
38  };
39  int exclmask_start; // Exclusion mask start
40  int patch1_free_size; // Size of the free atoms in patch
41  int patch2_free_size; // Size of the free atoms in patch
42 // int pad1, pad2;
43 };
static __thread int plist_size
void cuda_bind_atom_params ( const atom_param *  t)

Definition at line 352 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

352  {
353  cudaMemcpyAsync(atom_params, t, num_atoms * sizeof(atom_param),
354  cudaMemcpyHostToDevice, stream);
355  cuda_errcheck("memcpy to atom_params");
356 }
__thread cudaStream_t stream
static __thread atom_param * atom_params
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_atoms ( const atom *  a)

Definition at line 364 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

364  {
365  cuda_errcheck("before memcpy to atoms");
366  cudaMemcpyAsync(atoms, a, num_atoms * sizeof(atom),
367  cudaMemcpyHostToDevice, stream);
368  cuda_errcheck("memcpy to atoms");
369 }
static __thread atom * atoms
__thread cudaStream_t stream
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_exclusions ( const unsigned int *  t,
int  n 
)

Definition at line 22 of file ComputeNonbondedCUDAKernel.cu.

References const_exclusions, cuda_errcheck(), exclusions, exclusions_size, MAX_CONST_EXCLUSIONS, overflow_exclusions, and tex_exclusions.

Referenced by ComputeNonbondedCUDA::build_exclusions().

22  {
23  exclusions_size = n;
24  static __thread int exclusions_alloc;
25  if ( exclusions && exclusions_alloc < exclusions_size ) {
26  cudaFree(exclusions);
27  cuda_errcheck("freeing exclusions");
28  cudaFree(overflow_exclusions);
29  cuda_errcheck("freeing overflow_exclusions");
30  exclusions = 0;
31  }
32  if ( ! exclusions ) {
33  exclusions_alloc = exclusions_size;
34  cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));
35  cuda_errcheck("malloc exclusions");
36  cudaMalloc((void**) &overflow_exclusions, n*sizeof(unsigned int));
37  cuda_errcheck("malloc overflow_exclusions");
38  }
39  cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);
40  cuda_errcheck("memcpy exclusions");
41  tex_exclusions.normalized = false;
42  tex_exclusions.addressMode[0] = cudaAddressModeClamp;
43  tex_exclusions.filterMode = cudaFilterModePoint;
44  cudaBindTexture(NULL, tex_exclusions, exclusions, n*sizeof(unsigned int));
45  cuda_errcheck("binding exclusions to texture");
46 
47  cudaMemcpy(overflow_exclusions, t,
48  n*sizeof(unsigned int), cudaMemcpyHostToDevice);
49  cuda_errcheck("memcpy to overflow_exclusions");
50  int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
51  cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
52  cuda_errcheck("memcpy to const_exclusions");
53 }
static __thread unsigned int * exclusions
static __thread unsigned int * overflow_exclusions
__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS]
texture< unsigned int, 1, cudaReadModeElementType > tex_exclusions
static __thread int exclusions_size
#define MAX_CONST_EXCLUSIONS
void cuda_errcheck(const char *msg)
void cuda_bind_force_table ( const float4 *  t,
const float4 *  et 
)

Definition at line 90 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), energy_table, force_table, and FORCE_TABLE_SIZE.

Referenced by ComputeNonbondedCUDA::build_force_table().

90  {
91  static __thread cudaArray *ct;
92  static __thread cudaArray *ect;
93  if ( ! ct ) {
94  cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
95  cuda_errcheck("allocating force table");
96  }
97  if ( ! ect ) {
98  cudaMallocArray(&ect, &energy_table.channelDesc, FORCE_TABLE_SIZE, 1);
99  cuda_errcheck("allocating energy table");
100  }
101  cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
102  // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
103  cuda_errcheck("memcpy to force table");
104  cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
105  cuda_errcheck("memcpy to energy table");
106 
107  force_table.normalized = true;
108  force_table.addressMode[0] = cudaAddressModeClamp;
109  force_table.addressMode[1] = cudaAddressModeClamp;
110  force_table.filterMode = cudaFilterModeLinear;
111 
112  energy_table.normalized = true;
113  energy_table.addressMode[0] = cudaAddressModeClamp;
114  energy_table.addressMode[1] = cudaAddressModeClamp;
115  energy_table.filterMode = cudaFilterModeLinear;
116 
117  cudaBindTextureToArray(force_table, ct);
118  cuda_errcheck("binding force table to texture");
119 
120  cudaBindTextureToArray(energy_table, ect);
121  cuda_errcheck("binding energy table to texture");
122 }
#define FORCE_TABLE_SIZE
void cuda_errcheck(const char *msg)
texture< float4, 1, cudaReadModeElementType > energy_table
texture< float4, 1, cudaReadModeElementType > force_table
void cuda_bind_forces ( float4 *  f,
float4 *  f_slow 
)

Definition at line 371 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), forces, and slow_forces.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

371  {
372  cudaHostGetDevicePointer(&forces, f, 0);
373  cuda_errcheck("cudaHostGetDevicePointer forces");
374  cudaHostGetDevicePointer(&slow_forces, f_slow, 0);
375  cuda_errcheck("cudaHostGetDevicePointer slow_forces");
376 }
static __thread float4 * forces
static __thread float4 * slow_forces
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_bornRad ( float *  bornRadH)

Definition at line 406 of file ComputeNonbondedCUDAKernel.cu.

References bornRadD, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

406  {
407  cudaMemcpyAsync(bornRadD, bornRadH, num_atoms * sizeof(float),
408  cudaMemcpyHostToDevice, stream);
409  cuda_errcheck("memcpy to bornRad");
410 }
static __thread float * bornRadD
static __thread float * bornRadH
__thread cudaStream_t stream
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_dEdaSum ( GBReal *  dEdaSumH)

Definition at line 412 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and dEdaSumD.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

412  {
413  cudaHostGetDevicePointer(&dEdaSumD, dEdaSumH, 0);
414  cuda_errcheck("cudaHostGetDevicePointer dEdaSum");
415 }
static __thread GBReal * dEdaSumD
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_dHdrPrefix ( float *  dHdrPrefixH)

Definition at line 417 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), dHdrPrefixD, num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

417  {
418  cudaMemcpyAsync(dHdrPrefixD, dHdrPrefixH, num_atoms * sizeof(float),
419  cudaMemcpyHostToDevice, stream);
420  cuda_errcheck("memcpy to dHdrPrefix");
421 }
static __thread float * dHdrPrefixH
__thread cudaStream_t stream
static __thread float * dHdrPrefixD
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_energy ( float *  energy_gbis)

Definition at line 389 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and energy_gbis.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

389  {
390  cudaHostGetDevicePointer(&energy_gbis, e, 0);
391  cuda_errcheck("cudaHostGetDevicePointer energy_gbis");
392 }
static __thread float * energy_gbis
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_intRad ( float *  intRad0H,
float *  intRadSH 
)

Definition at line 393 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), intRad0D, intRadSD, num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

393  {
394  cudaMemcpyAsync(intRad0D, intRad0H, num_atoms * sizeof(float),
395  cudaMemcpyHostToDevice, stream);
396  cudaMemcpyAsync(intRadSD, intRadSH, num_atoms * sizeof(float),
397  cudaMemcpyHostToDevice, stream);
398  cuda_errcheck("memcpy to intRad");
399 }
static __thread float * intRadSD
static __thread float * intRad0D
__thread cudaStream_t stream
static __thread float * intRadSH
static __thread int num_atoms
void cuda_errcheck(const char *msg)
static __thread float * intRad0H
void cuda_bind_GBIS_psiSum ( GBReal *  psiSumH)

Definition at line 401 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and psiSumD.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

401  {
402  cudaHostGetDevicePointer(&psiSumD, psiSumH, 0);
403  cuda_errcheck("cudaHostGetDevicePointer psiSum");
404 }
void cuda_errcheck(const char *msg)
static __thread GBReal * psiSumD
void cuda_bind_lj_table ( const float2 *  t,
int  _lj_table_size 
)

Definition at line 59 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), lj_table, and lj_table_size.

Referenced by ComputeNonbondedCUDA::build_lj_table().

59  {
60  static __thread float2 *ct;
61  static __thread int lj_table_alloc;
62  lj_table_size = _lj_table_size;
63  if ( ct && lj_table_alloc < lj_table_size ) {
64  cudaFree(ct);
65  cuda_errcheck("freeing lj table");
66  ct = 0;
67  }
68  if ( ! ct ) {
69  lj_table_alloc = lj_table_size;
70  cudaMalloc((void**) &ct, lj_table_size*lj_table_size*sizeof(float2));
71  cuda_errcheck("allocating lj table");
72  }
73  cudaMemcpy(ct, t, lj_table_size*lj_table_size*sizeof(float2),
74  cudaMemcpyHostToDevice);
75  cuda_errcheck("memcpy to lj table");
76 
77  lj_table.normalized = false;
78  lj_table.addressMode[0] = cudaAddressModeClamp;
79  lj_table.filterMode = cudaFilterModePoint;
80 
81  cudaBindTexture((size_t*)0, lj_table, ct,
82  lj_table_size*lj_table_size*sizeof(float2));
83  cuda_errcheck("binding lj table to texture");
84 }
static __thread int lj_table_size
texture< float2, 1, cudaReadModeElementType > lj_table
void cuda_errcheck(const char *msg)
void cuda_bind_patch_pairs ( patch_pair *  h_patch_pairs,
int  npatch_pairs,
int  npatches,
int  natoms,
int  nexclmask,
int  plist_len 
)

Definition at line 293 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, atom_params_size, atoms, atoms_size, bornRadD, bornRadD_size, cuda_errcheck(), dHdrPrefixD, dHdrPrefixD_size, exclmasks, exclmasks_size, GBIS_P1_counters, GBIS_P1_counters_size, GBIS_P2_counters, GBIS_P2_counters_size, GBIS_P3_counters, GBIS_P3_counters_size, global_counters, global_counters_size, intRad0D, intRad0D_size, intRadSD, intRadSD_size, num_atoms, num_patches, num_virials, patch_pairs, patch_pairs_size, plist, plist_size, slow_tmpforces, slow_tmpforces_size, slow_tmpvirials, slow_tmpvirials_size, tmp_dEdaSumD, tmp_dEdaSumD_size, tmp_energy_gbis, tmp_energy_gbis_size, tmp_psiSumD, tmp_psiSumD_size, tmpforces, tmpforces_size, tmpvirials, tmpvirials_size, vdw_types, and vdw_types_size.

Referenced by ComputeNonbondedCUDA::doWork().

295  {
296  num_patches = npatches;
297  num_virials = npatches;
298  num_atoms = natoms;
299  reallocate_device<patch_pair>(&patch_pairs, &patch_pairs_size, npatch_pairs, 1.2f);
300  reallocate_device<atom>(&atoms, &atoms_size, num_atoms, 1.2f);
301  reallocate_device<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
302  reallocate_device<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
303  reallocate_device<unsigned int>(&global_counters, &global_counters_size, num_patches+2, 1.2f);
304  reallocate_device<float4>(&tmpforces, &tmpforces_size, num_atoms, 1.2f);
305  reallocate_device<float4>(&slow_tmpforces, &slow_tmpforces_size, num_atoms, 1.2f);
306  reallocate_device<unsigned int>(&plist, &plist_size, plist_len, 1.2f);
307  reallocate_device<exclmask>(&exclmasks, &exclmasks_size, nexclmask, 1.2f);
308  reallocate_device<float>(&tmpvirials, &tmpvirials_size, num_patches*16, 1.2f);
309  reallocate_device<float>(&slow_tmpvirials, &slow_tmpvirials_size, num_patches*16, 1.2f);
310 
311  // For GBIS
312  reallocate_device<unsigned int>(&GBIS_P1_counters, &GBIS_P1_counters_size, num_patches, 1.2f);
313  reallocate_device<unsigned int>(&GBIS_P2_counters, &GBIS_P2_counters_size, num_patches, 1.2f);
314  reallocate_device<unsigned int>(&GBIS_P3_counters, &GBIS_P3_counters_size, num_patches, 1.2f);
315  reallocate_device<float>(&intRad0D, &intRad0D_size, num_atoms, 1.2f);
316  reallocate_device<float>(&intRadSD, &intRadSD_size, num_atoms, 1.2f);
317  reallocate_device<GBReal>(&tmp_psiSumD, &tmp_psiSumD_size, num_atoms, 1.2f);
318  reallocate_device<float>(&bornRadD, &bornRadD_size, num_atoms, 1.2f);
319  reallocate_device<GBReal>(&tmp_dEdaSumD, &tmp_dEdaSumD_size, num_atoms, 1.2f);
320  reallocate_device<float>(&dHdrPrefixD, &dHdrPrefixD_size, num_atoms, 1.2f);
321  reallocate_device<float>(&tmp_energy_gbis, &tmp_energy_gbis_size, num_patches, 1.2f);
322 
323  cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);
324  cuda_errcheck("memcpy to patch_pairs");
325 
326  cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
327  cuda_errcheck("memset global_counters");
328 
329  cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
330  cuda_errcheck("memset GBIS_P1_counters");
331 
332  cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
333  cuda_errcheck("memset GBIS_P2_counters");
334 
335  cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
336  cuda_errcheck("memset GBIS_P3_counters");
337 
338  cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
339  cuda_errcheck("memset tmpforces");
340 
341  cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
342  cuda_errcheck("memset tmpvirials");
343 
344  cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
345  cuda_errcheck("memset slow_tmpforces");
346 
347  cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
348  cuda_errcheck("memset slow_tmpvirials");
349 
350 }
static __thread unsigned int * GBIS_P2_counters
static __thread GBReal * tmp_dEdaSumD
static __thread float * tmp_energy_gbis
static __thread float * intRadSD
static __thread int patch_pairs_size
static __thread int intRad0D_size
static __thread float * bornRadD
static __thread atom * atoms
static __thread float * intRad0D
static __thread GBReal * tmp_psiSumD
static __thread unsigned int * plist
static __thread int GBIS_P1_counters_size
static __thread int intRadSD_size
static __thread exclmask * exclmasks
static __thread float * slow_tmpvirials
static __thread int plist_size
static __thread int vdw_types_size
static __thread int atom_params_size
static __thread float * dHdrPrefixD
static __thread int global_counters_size
static __thread int GBIS_P3_counters_size
static __thread float * tmpvirials
static __thread patch_pair * patch_pairs
static __thread unsigned int * GBIS_P1_counters
static __thread int num_patches
static __thread int num_virials
static __thread int tmpvirials_size
static __thread int tmp_psiSumD_size
static __thread int bornRadD_size
static __thread atom_param * atom_params
static __thread int GBIS_P2_counters_size
static __thread int tmp_dEdaSumD_size
static __thread int tmp_energy_gbis_size
static __thread unsigned int * GBIS_P3_counters
static __thread int num_atoms
static __thread int slow_tmpforces_size
static __thread float4 * slow_tmpforces
static __thread int tmpforces_size
static __thread unsigned int * global_counters
void cuda_errcheck(const char *msg)
static __thread int slow_tmpvirials_size
static __thread int dHdrPrefixD_size
static __thread int exclmasks_size
static __thread int * vdw_types
static __thread float4 * tmpforces
static __thread int atoms_size
void cuda_bind_vdw_types ( const int *  t)

Definition at line 358 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), num_atoms, stream, and vdw_types.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

358  {
359  cudaMemcpyAsync(vdw_types, t, num_atoms * sizeof(int),
360  cudaMemcpyHostToDevice, stream);
361  cuda_errcheck("memcpy to vdw_types");
362 }
__thread cudaStream_t stream
static __thread int num_atoms
void cuda_errcheck(const char *msg)
static __thread int * vdw_types
void cuda_bind_virials ( float *  v,
int *  queue,
int *  blockorder 
)

Definition at line 378 of file ComputeNonbondedCUDAKernel.cu.

References block_order, cuda_errcheck(), force_ready_queue, num_virials, slow_virials, and virials.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

378  {
379  cudaHostGetDevicePointer(&virials, v, 0);
380  cuda_errcheck("cudaHostGetDevicePointer virials");
381  slow_virials = virials + 16*num_virials;
382  cudaHostGetDevicePointer(&force_ready_queue, queue, 0);
383  cuda_errcheck("cudaHostGetDevicePointer force_ready_queue");
384  cudaHostGetDevicePointer(&block_order, blockorder, 0);
385  cuda_errcheck("cudaHostGetDevicePointer block_order");
386 }
static __thread float * slow_virials
static __thread float * virials
static __thread int * force_ready_queue
static __thread int num_virials
void cuda_errcheck(const char *msg)
static __thread int * block_order
void cuda_errcheck ( const char *  msg)

Definition at line 41 of file ComputeNonbondedCUDA.C.

41  {
42  cudaError_t err;
43  if ((err = cudaGetLastError()) != cudaSuccess) {
44  char host[128];
45  gethostname(host, 128); host[127] = 0;
46  char devstr[128] = "";
47  int devnum;
48  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
49  sprintf(devstr, " device %d", devnum);
50  }
51  cudaDeviceProp deviceProp;
52  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
53  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
54  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
55  }
56  char errmsg[1024];
57  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
58  NAMD_die(errmsg);
59  }
60 }
void NAMD_die(const char *err_msg)
Definition: common.C:83
void cuda_GBIS_P1 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  rho_0,
float3  lata,
float3  latb,
float3  latc,
cudaStream_t &  strm 
)

Definition at line 569 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), GBIS_P1_counters, intRad0D, intRadSD, lata, latb, latc, max_grid_size, num_atoms, NUM_WARP, patch_pairs, psiSumD, tmp_psiSumD, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

580  {
581 
582  if ( ccount ) {
583  cudaMemsetAsync(tmp_psiSumD, 0, num_atoms*sizeof(GBReal), strm);
584 
585  int grid_dim = max_grid_size; // maximum allowed
586  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
587  if (grid_dim > ccount - cstart) {
588  grid_dim = ccount - cstart;
589  }
590 
591  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
592  GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(
593  patch_pairs+cbegin+cstart,
594  atoms,
595  intRad0D,
596  intRadSD,
597  tmp_psiSumD,
598  psiSumD,
599  a_cut,
600  rho_0,
601  lata,
602  latb,
603  latc,
604  GBIS_P1_counters
605  );
606  cuda_errcheck("dev_GBIS_P1");
607  } // end for
608  }
609 } // end GBIS P1
static __thread float * intRadSD
static __thread atom * atoms
static __thread float * intRad0D
static __thread GBReal * tmp_psiSumD
static __thread patch_pair * patch_pairs
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
static __thread unsigned int * GBIS_P1_counters
static __thread int num_atoms
void cuda_errcheck(const char *msg)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
__thread int max_grid_size
#define NUM_WARP
#define WARPSIZE
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
static __thread GBReal * psiSumD
float GBReal
Definition: ComputeGBIS.inl:17
void cuda_GBIS_P2 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  r_cut,
float  scaling,
float  kappa,
float  smoothDist,
float  epsilon_p,
float  epsilon_s,
float3  lata,
float3  latb,
float3  latc,
int  doEnergy,
int  doFullElec,
cudaStream_t &  strm 
)

Definition at line 614 of file ComputeNonbondedCUDAKernel.cu.

References atoms, bornRadD, cuda_errcheck(), dEdaSumD, energy_gbis, forces, GBIS_P2_counters, lata, latb, latc, max_grid_size, num_atoms, num_patches, NUM_WARP, patch_pairs, tmp_dEdaSumD, tmp_energy_gbis, tmpforces, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

632  {
633 
634  if ( ccount ) {
635  cudaMemsetAsync(tmp_dEdaSumD, 0, num_atoms*sizeof(GBReal), strm);
636  cudaMemsetAsync(tmp_energy_gbis, 0, num_patches*sizeof(float), strm);
637 
638  int grid_dim = max_grid_size; // maximum allowed
639  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
640  if (grid_dim > ccount - cstart)
641  grid_dim = ccount - cstart;
642 
643  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
644  GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(
645  patch_pairs+cbegin+cstart,
646  atoms,
647  bornRadD,
648  tmp_dEdaSumD,
649  dEdaSumD,
650  a_cut,
651  r_cut,
652  scaling,
653  kappa,
654  smoothDist,
655  epsilon_p,
656  epsilon_s,
657  lata,
658  latb,
659  latc,
660  doEnergy,
661  doFullElec,
662  tmpforces,
663  forces,
664  tmp_energy_gbis,
665  energy_gbis,
666  GBIS_P2_counters
667  );
668  cuda_errcheck("dev_GBIS_P2");
669  } // end for
670  }
671 } // end P2
static __thread unsigned int * GBIS_P2_counters
static __thread GBReal * tmp_dEdaSumD
static __thread float * tmp_energy_gbis
static __thread float * bornRadD
static __thread atom * atoms
static __thread float4 * forces
static __thread GBReal * dEdaSumD
static __thread float * energy_gbis
static __thread patch_pair * patch_pairs
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
static __thread int num_patches
static __thread int num_atoms
void cuda_errcheck(const char *msg)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
__thread int max_grid_size
#define NUM_WARP
#define WARPSIZE
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
static __thread float4 * tmpforces
float GBReal
Definition: ComputeGBIS.inl:17
void cuda_GBIS_P3 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  rho_0,
float  scaling,
float3  lata,
float3  latb,
float3  latc,
cudaStream_t &  strm 
)

Definition at line 676 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), dHdrPrefixD, GBIS_P3_counters, intRad0D, intRadSD, lata, latb, latc, max_grid_size, NUM_WARP, patch_pairs, slow_forces, slow_tmpforces, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

688  {
689  int grid_dim = max_grid_size; // maximum allowed
690  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
691  if (grid_dim > ccount - cstart)
692  grid_dim = ccount - cstart;
693 
694  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
695  GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(
696  patch_pairs+cbegin+cstart,
697  atoms,
698  intRad0D,
699  intRadSD,
700  dHdrPrefixD,
701  a_cut,
702  rho_0,
703  scaling,
704  lata,
705  latb,
706  latc,
707  slow_tmpforces,
708  slow_forces,
709  GBIS_P3_counters
710  );
711  cuda_errcheck("dev_GBIS_P3");
712  }
713 }
static __thread float * intRadSD
static __thread atom * atoms
static __thread float * intRad0D
static __thread float4 * slow_forces
static __thread float * dHdrPrefixD
static __thread patch_pair * patch_pairs
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
static __thread unsigned int * GBIS_P3_counters
static __thread float4 * slow_tmpforces
void cuda_errcheck(const char *msg)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
__thread int max_grid_size
#define NUM_WARP
#define WARPSIZE
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb
void cuda_init ( )

Definition at line 203 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, atom_params_size, atoms, atoms_size, bornRadD, bornRadD_size, cuda_errcheck(), dEdaSumD, dHdrPrefixD, dHdrPrefixD_size, energy_gbis, exclmasks, exclmasks_size, exclusions, exclusions_size, force_ready_queue, forces, GBIS_P1_counters, GBIS_P1_counters_size, GBIS_P2_counters, GBIS_P2_counters_size, GBIS_P3_counters, GBIS_P3_counters_size, global_counters, global_counters_size, intRad0D, intRad0D_size, intRadSD, intRadSD_size, max_grid_size, patch_pairs, patch_pairs_size, plist, plist_size, psiSumD, slow_forces, slow_tmpforces, slow_tmpforces_size, slow_tmpvirials, slow_tmpvirials_size, tmp_dEdaSumD, tmp_dEdaSumD_size, tmp_energy_gbis, tmp_energy_gbis_size, tmp_psiSumD, tmp_psiSumD_size, tmpforces, tmpforces_size, tmpvirials, tmpvirials_size, vdw_types, and vdw_types_size.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

203  {
204  patch_pairs_size = 0;
205  patch_pairs = NULL;
206 
207  atom_params_size = 0;
208  atom_params = NULL;
209 
210  vdw_types_size = 0;
211  vdw_types = NULL;
212 
213  atoms_size = 0;
214  atoms = NULL;
215 
216  tmpforces_size = 0;
217  tmpforces = NULL;
218 
219  slow_tmpforces_size = 0;
220  slow_tmpforces = NULL;
221 
222  tmpvirials_size = 0;
223  tmpvirials = NULL;
224 
225  slow_tmpvirials_size = 0;
226  slow_tmpvirials = NULL;
227 
228  global_counters_size = 0;
229  global_counters = NULL;
230 
231  plist_size = 0;
232  plist = NULL;
233 
234  exclmasks_size = 0;
235  exclmasks = NULL;
236 
237  forces = NULL;
238  slow_forces = NULL;
239 
240  force_ready_queue = NULL;
241 
242  exclusions_size = 0;
243  exclusions = NULL;
244 
245  // --------------------
246  // For GBIS
247  // --------------------
248  intRad0D_size = 0;
249  intRad0D = NULL;
250 
251  intRadSD_size = 0;
252  intRadSD = NULL;
253 
254  psiSumD = NULL; // host-mapped memory
255 
256  tmp_psiSumD_size = 0;
257  tmp_psiSumD = NULL;
258 
259  bornRadD_size = 0;
260  bornRadD = NULL;
261 
262  dEdaSumD = NULL; // host-mapped memory
263 
264  tmp_dEdaSumD_size = 0;
265  tmp_dEdaSumD = NULL;
266 
267  dHdrPrefixD_size = 0;
268  dHdrPrefixD = NULL;
269 
270  GBIS_P1_counters_size = 0;
271  GBIS_P1_counters = NULL;
272 
273  GBIS_P2_counters_size = 0;
274  GBIS_P2_counters = NULL;
275 
276  GBIS_P3_counters_size = 0;
277  GBIS_P3_counters = NULL;
278 
279  energy_gbis = NULL; // host-mapped memory
280 
281  tmp_energy_gbis_size = 0;
282  tmp_energy_gbis = NULL;
283 
284  int dev;
285  cudaGetDevice(&dev);
286  cuda_errcheck("cudaGetDevice");
287  cudaDeviceProp deviceProp;
288  cudaGetDeviceProperties(&deviceProp, dev);
289  cuda_errcheck("cudaGetDeviceProperties");
290  max_grid_size = deviceProp.maxGridSize[1];
291 }
static __thread unsigned int * GBIS_P2_counters
static __thread GBReal * tmp_dEdaSumD
static __thread float * tmp_energy_gbis
static __thread float * intRadSD
static __thread int patch_pairs_size
static __thread int intRad0D_size
static __thread float * bornRadD
static __thread unsigned int * exclusions
static __thread atom * atoms
static __thread float * intRad0D
static __thread float4 * forces
static __thread GBReal * tmp_psiSumD
static __thread unsigned int * plist
static __thread GBReal * dEdaSumD
static __thread int GBIS_P1_counters_size
static __thread int intRadSD_size
static __thread float * energy_gbis
static __thread exclmask * exclmasks
static __thread float * slow_tmpvirials
static __thread int plist_size
static __thread float4 * slow_forces
static __thread int vdw_types_size
static __thread int atom_params_size
static __thread float * dHdrPrefixD
static __thread int global_counters_size
static __thread int GBIS_P3_counters_size
static __thread float * tmpvirials
static __thread int * force_ready_queue
static __thread patch_pair * patch_pairs
static __thread unsigned int * GBIS_P1_counters
static __thread int tmpvirials_size
static __thread int tmp_psiSumD_size
static __thread int bornRadD_size
static __thread atom_param * atom_params
static __thread int GBIS_P2_counters_size
static __thread int tmp_dEdaSumD_size
static __thread int tmp_energy_gbis_size
static __thread unsigned int * GBIS_P3_counters
static __thread int slow_tmpforces_size
static __thread int exclusions_size
static __thread float4 * slow_tmpforces
static __thread int tmpforces_size
static __thread unsigned int * global_counters
void cuda_errcheck(const char *msg)
static __thread int slow_tmpvirials_size
static __thread int dHdrPrefixD_size
static __thread int exclmasks_size
static __thread int * vdw_types
__thread int max_grid_size
static __thread float4 * tmpforces
static __thread GBReal * psiSumD
static __thread int atoms_size
void cuda_nonbonded_forces ( float3  lata,
float3  latb,
float3  latc,
float  cutoff2,
float  plcutoff2,
int  cbegin,
int  ccount,
int  ctotal,
int  doSlow,
int  doEnergy,
int  usePairlists,
int  savePairlists,
int  doStreaming,
int  saveOrder,
cudaStream_t &  strm 
)

Definition at line 500 of file ComputeNonbondedCUDAKernel.cu.

References CALL, cuda_errcheck(), cutoff2, max_grid_size, num_atoms, num_patches, NUM_WARP, slow_tmpforces, slow_tmpvirials, tmpforces, tmpvirials, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

504  {
505 
506  if ( ccount ) {
507  if ( usePairlists ) {
508  if ( ! savePairlists ) plcutoff2 = 0.;
509  } else {
510  plcutoff2 = cutoff2;
511  }
512 
513  cudaMemsetAsync(tmpforces, 0, num_atoms*sizeof(float4), strm);
514  cudaMemsetAsync(tmpvirials, 0, num_patches*sizeof(float)*16, strm);
515  if ( doSlow ) {
516  cudaMemsetAsync(slow_tmpforces, 0, num_atoms*sizeof(float4), strm);
517  cudaMemsetAsync(slow_tmpvirials, 0, num_patches*sizeof(float)*16, strm);
518  }
519 
520  int grid_dim = max_grid_size; // maximum allowed
521  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
522  if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;
523 
524  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
525 
526 #define CALL(X) X<<< grid_dim, nthread3, 0, strm >>> \
527  (patch_pairs, atoms, atom_params, vdw_types, plist, \
528  tmpforces, (doSlow?slow_tmpforces:NULL), \
529  forces, (doSlow?slow_forces:NULL), \
530  tmpvirials, (doSlow?slow_tmpvirials:NULL), \
531  virials, (doSlow?slow_virials:NULL), \
532  global_counters, (doStreaming?force_ready_queue:NULL), \
533  overflow_exclusions, num_patches, \
534  cbegin+cstart, ctotal, (saveOrder?block_order:NULL), \
535  exclmasks, lj_table_size, \
536  lata, latb, latc, cutoff2, plcutoff2, doSlow)
537 //end definition
538 
539  if ( doEnergy ) {
540  if ( doSlow ) {
541  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
542  else CALL(dev_nonbonded_slow_energy);
543  } else {
544  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
545  else CALL(dev_nonbonded_energy);
546  }
547  } else {
548  if ( doSlow ) {
549  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
550  else CALL(dev_nonbonded_slow);
551  } else {
552  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
553  else CALL(dev_nonbonded);
554  }
555  }
556 
557  cuda_errcheck("dev_nonbonded");
558  }
559  }
560 
561 }
static __thread float * slow_tmpvirials
static __thread float * tmpvirials
static __thread int num_patches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float plcutoff2
static __thread int num_atoms
static __thread float4 * slow_tmpforces
void cuda_errcheck(const char *msg)
__thread int max_grid_size
#define NUM_WARP
#define WARPSIZE
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
#define CALL(DOENERGY, DOVIRIAL)
static __thread float4 * tmpforces
int cuda_stream_finished ( )