NAMD
Classes | Macros | Typedefs | Functions
ComputeNonbondedCUDAKernel.h File Reference
#include "HipDefines.h"

Go to the source code of this file.

Classes

struct  exclmask
 

Macros

#define NUM_WARP   4
 
#define PATCH_PAIR_SIZE   (sizeof(patch_pair)/4)
 
#define COPY_ATOM(DEST, SOURCE)
 
#define COPY_PARAM(DEST, SOURCE)
 
#define COPY_ATOM_TO_SHARED(ATOM, PARAM, SHARED)
 
#define COPY_ATOM_FROM_SHARED(ATOM, PARAM, SHARED)
 
#define MAX_EXCLUSIONS   (1<<27)
 
#define MAX_CONST_EXCLUSIONS   2048
 
#define FORCE_TABLE_SIZE   4096
 

Typedefs

typedef float GBReal
 

Functions

void cuda_errcheck (const char *msg)
 
struct __align__ (16) patch_pair
 
void cuda_bind_exclusions (const unsigned int *t, int n)
 
void cuda_bind_lj_table (const float2 *t, int _lj_table_size)
 
void cuda_bind_force_table (const float4 *t, const float4 *et)
 
void cuda_init ()
 
void cuda_bind_patch_pairs (patch_pair *h_patch_pairs, int npatch_pairs, int npatches, int natoms, int nexclmask, int plist_len)
 
void cuda_bind_atom_params (const atom_param *t)
 
void cuda_bind_vdw_types (const int *t)
 
void cuda_bind_atoms (const atom *a)
 
void cuda_bind_forces (float4 *f, float4 *f_slow)
 
void cuda_bind_virials (float *v, int *queue, int *blockorder)
 
void cuda_nonbonded_forces (float3 lata, float3 latb, float3 latc, float cutoff2, float plcutoff2, int cbegin, int ccount, int ctotal, int doSlow, int doEnergy, int usePairlists, int savePairlists, int doStreaming, int saveOrder, cudaStream_t &strm)
 
void cuda_GBIS_P1 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
 
void cuda_GBIS_P2 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float r_cut, float scaling, float kappa, float smoothDist, float epsilon_p, float epsilon_s, float3 lata, float3 latb, float3 latc, int doEnergy, int doFullElec, cudaStream_t &strm)
 
void cuda_GBIS_P3 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float scaling, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
 
void cuda_bind_GBIS_intRad (float *intRad0H, float *intRadSH)
 
void cuda_bind_GBIS_energy (float *energy_gbis)
 
void cuda_bind_GBIS_psiSum (GBReal *psiSumH)
 
void cuda_bind_GBIS_bornRad (float *bornRadH)
 
void cuda_bind_GBIS_dEdaSum (GBReal *dEdaSumH)
 
void cuda_bind_GBIS_dHdrPrefix (float *dHdrPrefixH)
 
int cuda_stream_finished ()
 

Macro Definition Documentation

#define COPY_ATOM (   DEST,
  SOURCE 
)
Value:
{ \
DEST.position.x = SOURCE.position.x; \
DEST.position.y = SOURCE.position.y; \
DEST.position.z = SOURCE.position.z; \
DEST.charge = SOURCE.charge; \
}

Definition at line 53 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM_FROM_SHARED (   ATOM,
  PARAM,
  SHARED 
)
Value:
{ \
COPY_ATOM( ATOM, SHARED ) \
COPY_PARAM( PARAM, SHARED ) \
}
#define COPY_PARAM(DEST, SOURCE)
#define ATOM
#define COPY_ATOM(DEST, SOURCE)

Definition at line 73 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM_TO_SHARED (   ATOM,
  PARAM,
  SHARED 
)
Value:
{ \
COPY_ATOM( SHARED, ATOM ) \
COPY_PARAM( SHARED, PARAM ) \
}
#define COPY_PARAM(DEST, SOURCE)
#define ATOM
#define COPY_ATOM(DEST, SOURCE)

Definition at line 68 of file ComputeNonbondedCUDAKernel.h.

#define COPY_PARAM (   DEST,
  SOURCE 
)
Value:
{ \
DEST.sqrt_epsilon = SOURCE.sqrt_epsilon; \
DEST.half_sigma = SOURCE.half_sigma; \
DEST.index = SOURCE.index; \
DEST.excl_index = SOURCE.excl_index; \
DEST.excl_maxdiff = SOURCE.excl_maxdiff; \
}

Definition at line 60 of file ComputeNonbondedCUDAKernel.h.

#define FORCE_TABLE_SIZE   4096
#define MAX_CONST_EXCLUSIONS   2048

Definition at line 81 of file ComputeNonbondedCUDAKernel.h.

Referenced by cuda_bind_exclusions(), and void().

#define MAX_EXCLUSIONS   (1<<27)

Definition at line 80 of file ComputeNonbondedCUDAKernel.h.

Referenced by ComputeNonbondedCUDA::build_exclusions().

#define NUM_WARP   4
#define PATCH_PAIR_SIZE   (sizeof(patch_pair)/4)

Definition at line 39 of file ComputeNonbondedCUDAKernel.h.

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), and void().

Typedef Documentation

typedef float GBReal

Definition at line 4 of file ComputeNonbondedCUDAKernel.h.

Function Documentation

struct __align__ ( 16  )

Definition at line 16 of file ComputeNonbondedCUDAKernel.h.

References plist_size.

16  {
17  float3 offset;
18  int patch1_start; // Coordinate/force start for this patch
19  int patch1_size; // Size of the patch
20  int patch2_start;
21  int patch2_size;
22  int patch1_ind; // Patch index
23  int patch2_ind;
24  int patch1_num_pairs; // Number of pairs that involve this patch
25  int patch2_num_pairs;
26  union {
27  bool patch_done[2]; // After-GPU-computation shared memory temporary storage
28  struct {
29  int plist_start; // Pair list start
30  int plist_size; // Pair list size
31  };
32  };
33  int exclmask_start; // Exclusion mask start
34  int patch1_free_size; // Size of the free atoms in patch
35  int patch2_free_size; // Size of the free atoms in patch
36 // int pad1, pad2;
37 };
static __thread int plist_size
void cuda_bind_atom_params ( const atom_param *  t)

Definition at line 352 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

352  {
353  cudaMemcpyAsync(atom_params, t, num_atoms * sizeof(atom_param),
354  cudaMemcpyHostToDevice, stream);
355  cuda_errcheck("memcpy to atom_params");
356 }
__thread cudaStream_t stream
static __thread atom_param * atom_params
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_atoms ( const atom *  a)

Definition at line 364 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

364  {
365  cuda_errcheck("before memcpy to atoms");
366  cudaMemcpyAsync(atoms, a, num_atoms * sizeof(atom),
367  cudaMemcpyHostToDevice, stream);
368  cuda_errcheck("memcpy to atoms");
369 }
static __thread atom * atoms
__thread cudaStream_t stream
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_exclusions ( const unsigned int *  t,
int  n 
)

Definition at line 22 of file ComputeNonbondedCUDAKernel.cu.

References const_exclusions, cuda_errcheck(), exclusions, exclusions_size, MAX_CONST_EXCLUSIONS, overflow_exclusions, and tex_exclusions.

Referenced by ComputeNonbondedCUDA::build_exclusions().

22  {
23  exclusions_size = n;
24  static __thread int exclusions_alloc;
25  if ( exclusions && exclusions_alloc < exclusions_size ) {
26  cudaFree(exclusions);
27  cuda_errcheck("freeing exclusions");
28  cudaFree(overflow_exclusions);
29  cuda_errcheck("freeing overflow_exclusions");
30  exclusions = 0;
31  }
32  if ( ! exclusions ) {
33  exclusions_alloc = exclusions_size;
34  cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));
35  cuda_errcheck("malloc exclusions");
36  cudaMalloc((void**) &overflow_exclusions, n*sizeof(unsigned int));
37  cuda_errcheck("malloc overflow_exclusions");
38  }
39  cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);
40  cuda_errcheck("memcpy exclusions");
41  tex_exclusions.normalized = false;
42  tex_exclusions.addressMode[0] = cudaAddressModeClamp;
43  tex_exclusions.filterMode = cudaFilterModePoint;
44  cudaBindTexture(NULL, tex_exclusions, exclusions, n*sizeof(unsigned int));
45  cuda_errcheck("binding exclusions to texture");
46 
47  cudaMemcpy(overflow_exclusions, t,
48  n*sizeof(unsigned int), cudaMemcpyHostToDevice);
49  cuda_errcheck("memcpy to overflow_exclusions");
50  int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
51  cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
52  cuda_errcheck("memcpy to const_exclusions");
53 }
static __thread unsigned int * exclusions
static __thread unsigned int * overflow_exclusions
__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS]
texture< unsigned int, 1, cudaReadModeElementType > tex_exclusions
static __thread int exclusions_size
#define MAX_CONST_EXCLUSIONS
void cuda_errcheck(const char *msg)
void cuda_bind_force_table ( const float4 *  t,
const float4 *  et 
)

Definition at line 90 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), energy_table, force_table, and FORCE_TABLE_SIZE.

Referenced by ComputeNonbondedCUDA::build_force_table().

90  {
91  static __thread cudaArray *ct;
92  static __thread cudaArray *ect;
93  if ( ! ct ) {
94  cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
95  cuda_errcheck("allocating force table");
96  }
97  if ( ! ect ) {
98  cudaMallocArray(&ect, &energy_table.channelDesc, FORCE_TABLE_SIZE, 1);
99  cuda_errcheck("allocating energy table");
100  }
101  cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
102  // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
103  cuda_errcheck("memcpy to force table");
104  cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
105  cuda_errcheck("memcpy to energy table");
106 
107  force_table.normalized = true;
108  force_table.addressMode[0] = cudaAddressModeClamp;
109  force_table.addressMode[1] = cudaAddressModeClamp;
110  force_table.filterMode = cudaFilterModeLinear;
111 
112  energy_table.normalized = true;
113  energy_table.addressMode[0] = cudaAddressModeClamp;
114  energy_table.addressMode[1] = cudaAddressModeClamp;
115  energy_table.filterMode = cudaFilterModeLinear;
116 
117  cudaBindTextureToArray(force_table, ct);
118  cuda_errcheck("binding force table to texture");
119 
120  cudaBindTextureToArray(energy_table, ect);
121  cuda_errcheck("binding energy table to texture");
122 }
#define FORCE_TABLE_SIZE
void cuda_errcheck(const char *msg)
texture< float4, 1, cudaReadModeElementType > energy_table
texture< float4, 1, cudaReadModeElementType > force_table
void cuda_bind_forces ( float4 *  f,
float4 *  f_slow 
)

Definition at line 371 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), forces, and slow_forces.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

371  {
372  cudaHostGetDevicePointer((void **)&forces, f, 0);
373  cuda_errcheck("cudaHostGetDevicePointer forces");
374  cudaHostGetDevicePointer((void **)&slow_forces, f_slow, 0);
375  cuda_errcheck("cudaHostGetDevicePointer slow_forces");
376 }
static __thread float4 * forces
static __thread float4 * slow_forces
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_bornRad ( float *  bornRadH)

Definition at line 406 of file ComputeNonbondedCUDAKernel.cu.

References bornRadD, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

406  {
407  cudaMemcpyAsync(bornRadD, bornRadH, num_atoms * sizeof(float),
408  cudaMemcpyHostToDevice, stream);
409  cuda_errcheck("memcpy to bornRad");
410 }
static __thread float * bornRadD
static __thread float * bornRadH
__thread cudaStream_t stream
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_dEdaSum ( GBReal dEdaSumH)

Definition at line 412 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and dEdaSumD.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

412  {
413  cudaHostGetDevicePointer((void **)&dEdaSumD, dEdaSumH, 0);
414  cuda_errcheck("cudaHostGetDevicePointer dEdaSum");
415 }
static __thread GBReal * dEdaSumD
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_dHdrPrefix ( float *  dHdrPrefixH)

Definition at line 417 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), dHdrPrefixD, num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

417  {
418  cudaMemcpyAsync(dHdrPrefixD, dHdrPrefixH, num_atoms * sizeof(float),
419  cudaMemcpyHostToDevice, stream);
420  cuda_errcheck("memcpy to dHdrPrefix");
421 }
static __thread float * dHdrPrefixH
__thread cudaStream_t stream
static __thread float * dHdrPrefixD
static __thread int num_atoms
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_energy ( float *  energy_gbis)

Definition at line 389 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and energy_gbis.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

389  {
390  cudaHostGetDevicePointer((void **)&energy_gbis, e, 0);
391  cuda_errcheck("cudaHostGetDevicePointer energy_gbis");
392 }
static __thread float * energy_gbis
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_intRad ( float *  intRad0H,
float *  intRadSH 
)

Definition at line 393 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), intRad0D, intRadSD, num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

393  {
394  cudaMemcpyAsync(intRad0D, intRad0H, num_atoms * sizeof(float),
395  cudaMemcpyHostToDevice, stream);
396  cudaMemcpyAsync(intRadSD, intRadSH, num_atoms * sizeof(float),
397  cudaMemcpyHostToDevice, stream);
398  cuda_errcheck("memcpy to intRad");
399 }
static __thread float * intRadSD
static __thread float * intRad0D
__thread cudaStream_t stream
static __thread float * intRadSH
static __thread int num_atoms
void cuda_errcheck(const char *msg)
static __thread float * intRad0H
void cuda_bind_GBIS_psiSum ( GBReal psiSumH)

Definition at line 401 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and psiSumD.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

401  {
402  cudaHostGetDevicePointer((void **)&psiSumD, psiSumH, 0);
403  cuda_errcheck("cudaHostGetDevicePointer psiSum");
404 }
void cuda_errcheck(const char *msg)
static __thread GBReal * psiSumD
void cuda_bind_lj_table ( const float2 t,
int  _lj_table_size 
)

Definition at line 59 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), lj_table, and lj_table_size.

Referenced by ComputeNonbondedCUDA::build_lj_table().

59  {
60  static __thread float2 *ct;
61  static __thread int lj_table_alloc;
62  lj_table_size = _lj_table_size;
63  if ( ct && lj_table_alloc < lj_table_size ) {
64  cudaFree(ct);
65  cuda_errcheck("freeing lj table");
66  ct = 0;
67  }
68  if ( ! ct ) {
69  lj_table_alloc = lj_table_size;
70  cudaMalloc((void**) &ct, lj_table_size*lj_table_size*sizeof(float2));
71  cuda_errcheck("allocating lj table");
72  }
73  cudaMemcpy(ct, t, lj_table_size*lj_table_size*sizeof(float2),
74  cudaMemcpyHostToDevice);
75  cuda_errcheck("memcpy to lj table");
76 
77  lj_table.normalized = false;
78  lj_table.addressMode[0] = cudaAddressModeClamp;
79  lj_table.filterMode = cudaFilterModePoint;
80 
81  cudaBindTexture((size_t*)0, lj_table, ct,
83  cuda_errcheck("binding lj table to texture");
84 }
static __thread int lj_table_size
texture< float2, 1, cudaReadModeElementType > lj_table
void cuda_errcheck(const char *msg)
void cuda_bind_patch_pairs ( patch_pair *  h_patch_pairs,
int  npatch_pairs,
int  npatches,
int  natoms,
int  nexclmask,
int  plist_len 
)

Definition at line 293 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, atom_params_size, atoms, atoms_size, bornRadD, bornRadD_size, cuda_errcheck(), dHdrPrefixD, dHdrPrefixD_size, exclmasks, exclmasks_size, GBIS_P1_counters, GBIS_P1_counters_size, GBIS_P2_counters, GBIS_P2_counters_size, GBIS_P3_counters, GBIS_P3_counters_size, global_counters, global_counters_size, intRad0D, intRad0D_size, intRadSD, intRadSD_size, num_atoms, num_patches, num_virials, patch_pairs, patch_pairs_size, plist, plist_size, slow_tmpforces, slow_tmpforces_size, slow_tmpvirials, slow_tmpvirials_size, tmp_dEdaSumD, tmp_dEdaSumD_size, tmp_energy_gbis, tmp_energy_gbis_size, tmp_psiSumD, tmp_psiSumD_size, tmpforces, tmpforces_size, tmpvirials, tmpvirials_size, vdw_types, and vdw_types_size.

Referenced by ComputeNonbondedCUDA::doWork().

295  {
296  num_patches = npatches;
297  num_virials = npatches;
298  num_atoms = natoms;
299  reallocate_device<patch_pair>(&patch_pairs, &patch_pairs_size, npatch_pairs, 1.2f);
300  reallocate_device<atom>(&atoms, &atoms_size, num_atoms, 1.2f);
301  reallocate_device<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
302  reallocate_device<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
303  reallocate_device<unsigned int>(&global_counters, &global_counters_size, num_patches+2, 1.2f);
304  reallocate_device<float4>(&tmpforces, &tmpforces_size, num_atoms, 1.2f);
305  reallocate_device<float4>(&slow_tmpforces, &slow_tmpforces_size, num_atoms, 1.2f);
306  reallocate_device<unsigned int>(&plist, &plist_size, plist_len, 1.2f);
307  reallocate_device<exclmask>(&exclmasks, &exclmasks_size, nexclmask, 1.2f);
308  reallocate_device<float>(&tmpvirials, &tmpvirials_size, num_patches*16, 1.2f);
309  reallocate_device<float>(&slow_tmpvirials, &slow_tmpvirials_size, num_patches*16, 1.2f);
310 
311  // For GBIS
312  reallocate_device<unsigned int>(&GBIS_P1_counters, &GBIS_P1_counters_size, num_patches, 1.2f);
313  reallocate_device<unsigned int>(&GBIS_P2_counters, &GBIS_P2_counters_size, num_patches, 1.2f);
314  reallocate_device<unsigned int>(&GBIS_P3_counters, &GBIS_P3_counters_size, num_patches, 1.2f);
315  reallocate_device<float>(&intRad0D, &intRad0D_size, num_atoms, 1.2f);
316  reallocate_device<float>(&intRadSD, &intRadSD_size, num_atoms, 1.2f);
317  reallocate_device<GBReal>(&tmp_psiSumD, &tmp_psiSumD_size, num_atoms, 1.2f);
318  reallocate_device<float>(&bornRadD, &bornRadD_size, num_atoms, 1.2f);
319  reallocate_device<GBReal>(&tmp_dEdaSumD, &tmp_dEdaSumD_size, num_atoms, 1.2f);
320  reallocate_device<float>(&dHdrPrefixD, &dHdrPrefixD_size, num_atoms, 1.2f);
321  reallocate_device<float>(&tmp_energy_gbis, &tmp_energy_gbis_size, num_patches, 1.2f);
322 
323  cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);
324  cuda_errcheck("memcpy to patch_pairs");
325 
326  cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
327  cuda_errcheck("memset global_counters");
328 
329  cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
330  cuda_errcheck("memset GBIS_P1_counters");
331 
332  cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
333  cuda_errcheck("memset GBIS_P2_counters");
334 
335  cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
336  cuda_errcheck("memset GBIS_P3_counters");
337 
338  cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
339  cuda_errcheck("memset tmpforces");
340 
341  cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
342  cuda_errcheck("memset tmpvirials");
343 
344  cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
345  cuda_errcheck("memset slow_tmpforces");
346 
347  cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
348  cuda_errcheck("memset slow_tmpvirials");
349 
350 }
static __thread unsigned int * GBIS_P2_counters
static __thread GBReal * tmp_dEdaSumD
static __thread float * tmp_energy_gbis
static __thread float * intRadSD
static __thread int patch_pairs_size
static __thread int intRad0D_size
static __thread float * bornRadD
static __thread atom * atoms
static __thread float * intRad0D
static __thread GBReal * tmp_psiSumD
static __thread unsigned int * plist
static __thread int GBIS_P1_counters_size
static __thread int intRadSD_size
static __thread exclmask * exclmasks
static __thread float * slow_tmpvirials
static __thread int plist_size
static __thread int vdw_types_size
static __thread int atom_params_size
static __thread float * dHdrPrefixD
static __thread int global_counters_size
static __thread int GBIS_P3_counters_size
static __thread float * tmpvirials
static __thread patch_pair * patch_pairs
static __thread unsigned int * GBIS_P1_counters
static __thread int num_patches
static __thread int num_virials
static __thread int tmpvirials_size
static __thread int tmp_psiSumD_size
static __thread int bornRadD_size
static __thread atom_param * atom_params
static __thread int GBIS_P2_counters_size
static __thread int tmp_dEdaSumD_size
static __thread int tmp_energy_gbis_size
static __thread unsigned int * GBIS_P3_counters
static __thread int num_atoms
static __thread int slow_tmpforces_size
static __thread float4 * slow_tmpforces
static __thread int tmpforces_size
static __thread unsigned int * global_counters
void cuda_errcheck(const char *msg)
static __thread int slow_tmpvirials_size
static __thread int dHdrPrefixD_size
static __thread int exclmasks_size
static __thread int * vdw_types
static __thread float4 * tmpforces
static __thread int atoms_size
void cuda_bind_vdw_types ( const int *  t)

Definition at line 358 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), num_atoms, stream, and vdw_types.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

358  {
359  cudaMemcpyAsync(vdw_types, t, num_atoms * sizeof(int),
360  cudaMemcpyHostToDevice, stream);
361  cuda_errcheck("memcpy to vdw_types");
362 }
__thread cudaStream_t stream
static __thread int num_atoms
void cuda_errcheck(const char *msg)
static __thread int * vdw_types
void cuda_bind_virials ( float *  v,
int *  queue,
int *  blockorder 
)

Definition at line 378 of file ComputeNonbondedCUDAKernel.cu.

References block_order, cuda_errcheck(), force_ready_queue, num_virials, slow_virials, and virials.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

378  {
379  cudaHostGetDevicePointer((void **)&virials, v, 0);
380  cuda_errcheck("cudaHostGetDevicePointer virials");
382  cudaHostGetDevicePointer((void **)&force_ready_queue, queue, 0);
383  cuda_errcheck("cudaHostGetDevicePointer force_ready_queue");
384  cudaHostGetDevicePointer((void **)&block_order, blockorder, 0);
385  cuda_errcheck("cudaHostGetDevicePointer block_order");
386 }
static __thread float * slow_virials
static __thread float * virials
static __thread int * force_ready_queue
static __thread int num_virials
void cuda_errcheck(const char *msg)
static __thread int * block_order
void cuda_errcheck ( const char *  msg)

Definition at line 45 of file ComputeNonbondedCUDA.C.

45  {
46  cudaError_t err;
47  if ((err = cudaGetLastError()) != cudaSuccess) {
48  char host[128];
49  gethostname(host, 128); host[127] = 0;
50  char devstr[128] = "";
51  int devnum;
52  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
53  sprintf(devstr, " device %d", devnum);
54  }
55  cudaDeviceProp deviceProp;
56  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
57  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
58  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
59  }
60  char errmsg[1024];
61  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
62  NAMD_die(errmsg);
63  }
64 }
void NAMD_die(const char *err_msg)
Definition: common.C:83
void cuda_GBIS_P1 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  rho_0,
float3  lata,
float3  latb,
float3  latc,
cudaStream_t &  strm 
)

Definition at line 568 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), GBIS_P1_counters, intRad0D, intRadSD, lata, latb, latc, max_grid_size, num_atoms, NUM_WARP, patch_pairs, psiSumD, tmp_psiSumD, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

579  {
580 
581  if ( ccount ) {
582  cudaMemsetAsync(tmp_psiSumD, 0, num_atoms*sizeof(GBReal), strm);
583 
584  int grid_dim = max_grid_size; // maximum allowed
585  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
586  if (grid_dim > ccount - cstart) {
587  grid_dim = ccount - cstart;
588  }
589 
590  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
591  GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(
592  patch_pairs+cbegin+cstart,
593  atoms,
594  intRad0D,
595  intRadSD,
596  tmp_psiSumD,
597  psiSumD,
598  a_cut,
599  rho_0,
600  lata,
601  latb,
602  latc,
604  );
605  cuda_errcheck("dev_GBIS_P1");
606  } // end for
607  }
608 } // end GBIS P1
static __thread float * intRadSD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread atom * atoms
static __thread float * intRad0D
static __thread GBReal * tmp_psiSumD
#define WARPSIZE
Definition: CudaUtils.h:10
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
static __thread patch_pair * patch_pairs
static __thread unsigned int * GBIS_P1_counters
static __thread int num_atoms
void cuda_errcheck(const char *msg)
__thread int max_grid_size
#define NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
static __thread GBReal * psiSumD
float GBReal
Definition: ComputeGBIS.inl:17
void cuda_GBIS_P2 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  r_cut,
float  scaling,
float  kappa,
float  smoothDist,
float  epsilon_p,
float  epsilon_s,
float3  lata,
float3  latb,
float3  latc,
int  doEnergy,
int  doFullElec,
cudaStream_t &  strm 
)

Definition at line 613 of file ComputeNonbondedCUDAKernel.cu.

References atoms, bornRadD, cuda_errcheck(), dEdaSumD, energy_gbis, forces, GBIS_P2_counters, lata, latb, latc, max_grid_size, num_atoms, num_patches, NUM_WARP, patch_pairs, tmp_dEdaSumD, tmp_energy_gbis, tmpforces, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

631  {
632 
633  if ( ccount ) {
634  cudaMemsetAsync(tmp_dEdaSumD, 0, num_atoms*sizeof(GBReal), strm);
635  cudaMemsetAsync(tmp_energy_gbis, 0, num_patches*sizeof(float), strm);
636 
637  int grid_dim = max_grid_size; // maximum allowed
638  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
639  if (grid_dim > ccount - cstart)
640  grid_dim = ccount - cstart;
641 
642  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
643  GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(
644  patch_pairs+cbegin+cstart,
645  atoms,
646  bornRadD,
647  tmp_dEdaSumD,
648  dEdaSumD,
649  a_cut,
650  r_cut,
651  scaling,
652  kappa,
653  smoothDist,
654  epsilon_p,
655  epsilon_s,
656  lata,
657  latb,
658  latc,
659  doEnergy,
660  doFullElec,
661  tmpforces,
662  forces,
664  energy_gbis,
666  );
667  cuda_errcheck("dev_GBIS_P2");
668  } // end for
669  }
670 } // end P2
static __thread unsigned int * GBIS_P2_counters
static __thread GBReal * tmp_dEdaSumD
static __thread float * tmp_energy_gbis
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread float * bornRadD
static __thread atom * atoms
static __thread float4 * forces
static __thread GBReal * dEdaSumD
static __thread float * energy_gbis
#define WARPSIZE
Definition: CudaUtils.h:10
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
static __thread patch_pair * patch_pairs
static __thread int num_patches
static __thread int num_atoms
void cuda_errcheck(const char *msg)
__thread int max_grid_size
#define NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
static __thread float4 * tmpforces
float GBReal
Definition: ComputeGBIS.inl:17
void cuda_GBIS_P3 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  rho_0,
float  scaling,
float3  lata,
float3  latb,
float3  latc,
cudaStream_t &  strm 
)

Definition at line 675 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), dHdrPrefixD, GBIS_P3_counters, intRad0D, intRadSD, lata, latb, latc, max_grid_size, NUM_WARP, patch_pairs, slow_forces, slow_tmpforces, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

687  {
688  int grid_dim = max_grid_size; // maximum allowed
689  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
690  if (grid_dim > ccount - cstart)
691  grid_dim = ccount - cstart;
692 
693  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
694  GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(
695  patch_pairs+cbegin+cstart,
696  atoms,
697  intRad0D,
698  intRadSD,
699  dHdrPrefixD,
700  a_cut,
701  rho_0,
702  scaling,
703  lata,
704  latb,
705  latc,
707  slow_forces,
709  );
710  cuda_errcheck("dev_GBIS_P3");
711  }
712 }
static __thread float * intRadSD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread atom * atoms
static __thread float * intRad0D
#define WARPSIZE
Definition: CudaUtils.h:10
static __thread float4 * slow_forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
static __thread float * dHdrPrefixD
static __thread patch_pair * patch_pairs
static __thread unsigned int * GBIS_P3_counters
static __thread float4 * slow_tmpforces
void cuda_errcheck(const char *msg)
__thread int max_grid_size
#define NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
void cuda_init ( )

Definition at line 203 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, atom_params_size, atoms, atoms_size, bornRadD, bornRadD_size, cuda_errcheck(), dEdaSumD, dHdrPrefixD, dHdrPrefixD_size, energy_gbis, exclmasks, exclmasks_size, exclusions, exclusions_size, force_ready_queue, forces, GBIS_P1_counters, GBIS_P1_counters_size, GBIS_P2_counters, GBIS_P2_counters_size, GBIS_P3_counters, GBIS_P3_counters_size, global_counters, global_counters_size, intRad0D, intRad0D_size, intRadSD, intRadSD_size, max_grid_size, patch_pairs, patch_pairs_size, plist, plist_size, psiSumD, slow_forces, slow_tmpforces, slow_tmpforces_size, slow_tmpvirials, slow_tmpvirials_size, tmp_dEdaSumD, tmp_dEdaSumD_size, tmp_energy_gbis, tmp_energy_gbis_size, tmp_psiSumD, tmp_psiSumD_size, tmpforces, tmpforces_size, tmpvirials, tmpvirials_size, vdw_types, and vdw_types_size.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

203  {
204  patch_pairs_size = 0;
205  patch_pairs = NULL;
206 
207  atom_params_size = 0;
208  atom_params = NULL;
209 
210  vdw_types_size = 0;
211  vdw_types = NULL;
212 
213  atoms_size = 0;
214  atoms = NULL;
215 
216  tmpforces_size = 0;
217  tmpforces = NULL;
218 
220  slow_tmpforces = NULL;
221 
222  tmpvirials_size = 0;
223  tmpvirials = NULL;
224 
226  slow_tmpvirials = NULL;
227 
229  global_counters = NULL;
230 
231  plist_size = 0;
232  plist = NULL;
233 
234  exclmasks_size = 0;
235  exclmasks = NULL;
236 
237  forces = NULL;
238  slow_forces = NULL;
239 
240  force_ready_queue = NULL;
241 
242  exclusions_size = 0;
243  exclusions = NULL;
244 
245  // --------------------
246  // For GBIS
247  // --------------------
248  intRad0D_size = 0;
249  intRad0D = NULL;
250 
251  intRadSD_size = 0;
252  intRadSD = NULL;
253 
254  psiSumD = NULL; // host-mapped memory
255 
256  tmp_psiSumD_size = 0;
257  tmp_psiSumD = NULL;
258 
259  bornRadD_size = 0;
260  bornRadD = NULL;
261 
262  dEdaSumD = NULL; // host-mapped memory
263 
264  tmp_dEdaSumD_size = 0;
265  tmp_dEdaSumD = NULL;
266 
267  dHdrPrefixD_size = 0;
268  dHdrPrefixD = NULL;
269 
271  GBIS_P1_counters = NULL;
272 
274  GBIS_P2_counters = NULL;
275 
277  GBIS_P3_counters = NULL;
278 
279  energy_gbis = NULL; // host-mapped memory
280 
282  tmp_energy_gbis = NULL;
283 
284  int dev;
285  cudaGetDevice(&dev);
286  cuda_errcheck("cudaGetDevice");
287  cudaDeviceProp deviceProp;
288  cudaGetDeviceProperties(&deviceProp, dev);
289  cuda_errcheck("cudaGetDeviceProperties");
290  max_grid_size = deviceProp.maxGridSize[1];
291 }
static __thread unsigned int * GBIS_P2_counters
static __thread GBReal * tmp_dEdaSumD
static __thread float * tmp_energy_gbis
static __thread float * intRadSD
static __thread int patch_pairs_size
static __thread int intRad0D_size
static __thread float * bornRadD
static __thread unsigned int * exclusions
static __thread atom * atoms
static __thread float * intRad0D
static __thread float4 * forces
static __thread GBReal * tmp_psiSumD
static __thread unsigned int * plist
static __thread GBReal * dEdaSumD
static __thread int GBIS_P1_counters_size
static __thread int intRadSD_size
static __thread float * energy_gbis
static __thread exclmask * exclmasks
static __thread float * slow_tmpvirials
static __thread int plist_size
static __thread float4 * slow_forces
static __thread int vdw_types_size
static __thread int atom_params_size
static __thread float * dHdrPrefixD
static __thread int global_counters_size
static __thread int GBIS_P3_counters_size
static __thread float * tmpvirials
static __thread int * force_ready_queue
static __thread patch_pair * patch_pairs
static __thread unsigned int * GBIS_P1_counters
static __thread int tmpvirials_size
static __thread int tmp_psiSumD_size
static __thread int bornRadD_size
static __thread atom_param * atom_params
static __thread int GBIS_P2_counters_size
static __thread int tmp_dEdaSumD_size
static __thread int tmp_energy_gbis_size
static __thread unsigned int * GBIS_P3_counters
static __thread int slow_tmpforces_size
static __thread int exclusions_size
static __thread float4 * slow_tmpforces
static __thread int tmpforces_size
static __thread unsigned int * global_counters
void cuda_errcheck(const char *msg)
static __thread int slow_tmpvirials_size
static __thread int dHdrPrefixD_size
static __thread int exclmasks_size
static __thread int * vdw_types
__thread int max_grid_size
static __thread float4 * tmpforces
static __thread GBReal * psiSumD
static __thread int atoms_size
void cuda_nonbonded_forces ( float3  lata,
float3  latb,
float3  latc,
float  cutoff2,
float  plcutoff2,
int  cbegin,
int  ccount,
int  ctotal,
int  doSlow,
int  doEnergy,
int  usePairlists,
int  savePairlists,
int  doStreaming,
int  saveOrder,
cudaStream_t &  strm 
)

Definition at line 499 of file ComputeNonbondedCUDAKernel.cu.

References CALL, cuda_errcheck(), cutoff2, max_grid_size, num_atoms, num_patches, NUM_WARP, slow_tmpforces, slow_tmpvirials, tmpforces, tmpvirials, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

503  {
504 
505  if ( ccount ) {
506  if ( usePairlists ) {
507  if ( ! savePairlists ) plcutoff2 = 0.;
508  } else {
509  plcutoff2 = cutoff2;
510  }
511 
512  cudaMemsetAsync(tmpforces, 0, num_atoms*sizeof(float4), strm);
513  cudaMemsetAsync(tmpvirials, 0, num_patches*sizeof(float)*16, strm);
514  if ( doSlow ) {
515  cudaMemsetAsync(slow_tmpforces, 0, num_atoms*sizeof(float4), strm);
516  cudaMemsetAsync(slow_tmpvirials, 0, num_patches*sizeof(float)*16, strm);
517  }
518 
519  int grid_dim = max_grid_size; // maximum allowed
520  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
521  if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;
522 
523  dim3 nthread3(WARPSIZE, NUM_WARP, 1);
524 #define CALL(X) X<<< grid_dim, nthread3, 0, strm >>> ( \
525  patch_pairs, atoms, atom_params, vdw_types, plist, \
526  tmpforces, (doSlow?slow_tmpforces:NULL), \
527  forces, (doSlow?slow_forces:NULL), \
528  tmpvirials, (doSlow?slow_tmpvirials:NULL), \
529  virials, (doSlow?slow_virials:NULL), \
530  global_counters, (doStreaming?force_ready_queue:NULL), \
531  overflow_exclusions, num_patches, \
532  cbegin+cstart, ctotal, (saveOrder?block_order:NULL), \
533  exclmasks, lj_table_size, \
534  lata, latb, latc, cutoff2, plcutoff2, doSlow)
535 
536 //end definition
537 
538  if ( doEnergy ) {
539  if ( doSlow ) {
540  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
541  else CALL(dev_nonbonded_slow_energy);
542  } else {
543  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
544  else CALL(dev_nonbonded_energy);
545  }
546  } else {
547  if ( doSlow ) {
548  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
549  else CALL(dev_nonbonded_slow);
550  } else {
551  if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
552  else CALL(dev_nonbonded);
553  }
554  }
555 
556  cuda_errcheck("dev_nonbonded");
557  }
558  }
559 
560 }
static __thread float * slow_tmpvirials
#define WARPSIZE
Definition: CudaUtils.h:10
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
static __thread float * tmpvirials
static __thread int num_patches
static __thread int num_atoms
static __thread float4 * slow_tmpforces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
void cuda_errcheck(const char *msg)
__thread int max_grid_size
#define NUM_WARP
#define CALL(DOENERGY, DOVIRIAL)
static __thread float4 * tmpforces
int cuda_stream_finished ( )