#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define __thread __declspec(thread)

// Set one exclusion bit: (BASE)+(DIFF) names the bit; >>5 selects the
// 32-bit word, &31 the bit position within it.
#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
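// Illustrative counterpart (not in the original file): reading a bit back
// out of the same packed layout. For example, bit 43 lives in word
// 43>>5 = 1, at position 43&31 = 11.
#define TEST_EXCL(EXCL,BASE,DIFF) \
  (((EXCL)[((BASE)+(DIFF))>>5] >> (((BASE)+(DIFF))&31)) & 1)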
static __thread int exclusions_alloc;

cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));

cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);

n*sizeof(unsigned int), cudaMemcpyHostToDevice);
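// Sketch of how the head of the exclusion table would reach the
// __constant__ const_exclusions array declared elsewhere in this file (the
// actual call site is elided in this listing; nconst is an assumed local):
//   int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
//   cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
// Lookups beyond the constant-memory window presumably fall back to the
// overflow_exclusions buffer in global memory.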
texture<float2, 1, cudaReadModeElementType> lj_table;

static __thread float2 *ct;
static __thread int lj_table_alloc;

cudaMemcpyHostToDevice);

lj_table.addressMode[0] = cudaAddressModeClamp;
lj_table.filterMode = cudaFilterModePoint;

cudaBindTexture((size_t*)0, lj_table, ct,
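// Illustrative alternative (not in the original file): the texture-reference
// API used above is deprecated and removed in CUDA 12. The same binding via
// the cudaTextureObject_t API, which the tile-list kernel signatures in this
// file already take, would look roughly like this; lj_table_tex and
// make_lj_table_tex are hypothetical names.
#include <string.h>

static __thread cudaTextureObject_t lj_table_tex;

static void make_lj_table_tex(const float2 *d_table, size_t n_elems) {
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = (void *)d_table;
  resDesc.res.linear.desc = cudaCreateChannelDesc<float2>();
  resDesc.res.linear.sizeInBytes = n_elems * sizeof(float2);

  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.filterMode = cudaFilterModePoint;  // matches the reference setup
  texDesc.readMode = cudaReadModeElementType;

  cudaCreateTextureObject(&lj_table_tex, &resDesc, &texDesc, NULL);
  // device side: float2 ab = tex1Dfetch<float2>(lj_table_tex, i);
}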
static __thread cudaArray *ct;
static __thread cudaArray *ect;

cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
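// Allocation sketch (assumed; elided in this listing): both arrays would be
// created once with a float4 channel descriptor, e.g.
//   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
//   cudaMallocArray(&ct, &desc, FORCE_TABLE_SIZE, 1);
//   cudaMallocArray(&ect, &desc, FORCE_TABLE_SIZE, 1);
// Note cudaMemcpyToArray is deprecated since CUDA 10.1; the equivalent
// modern call is
//   cudaMemcpy2DToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4),
//       FORCE_TABLE_SIZE*sizeof(float4), 1, cudaMemcpyHostToDevice);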
static __thread unsigned int *plist;

cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
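// Sketch (assumed use; the surrounding code is elided in this listing): the
// property of interest for the chunked launches below is the 1-D grid limit,
// which plausibly feeds the max_grid_size variable declared in this file:
//   max_grid_size = deviceProp.maxGridSize[0];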
int npatches, int natoms, int plist_len,

cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);

cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
cudaMemcpyHostToDevice, stream);
cudaMemcpyHostToDevice, stream);
cudaMemcpyHostToDevice, stream);

cudaHostGetDevicePointer((void **)&forces, f, 0);
cudaHostGetDevicePointer((void **)&slow_forces, f_slow, 0);

cudaHostGetDevicePointer((void **)&virials, v, 0);

cudaHostGetDevicePointer((void **)&block_order, blockorder, 0);

cudaHostGetDevicePointer((void **)&energy_gbis, e, 0);

cudaMemcpyHostToDevice, stream);
cudaMemcpyHostToDevice, stream);

cudaHostGetDevicePointer((void **)&psiSumD, psiSumH, 0);

cudaMemcpyHostToDevice, stream);

cudaHostGetDevicePointer((void **)&dEdaSumD, dEdaSumH, 0);

cudaMemcpyHostToDevice, stream);
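// Illustrative sketch (not in the original file): cudaHostGetDevicePointer
// only succeeds on mapped pinned host memory, so each buffer handed to these
// bind calls must come from an allocation like the following (hypothetical
// call site):
//   float4 *f;
//   cudaHostAlloc((void **)&f, natoms*sizeof(float4), cudaHostAllocMapped);
//   cudaHostGetDevicePointer((void **)&forces, f, 0);
// Kernels then write through the device alias "forces" while the host reads
// "f" directly: zero-copy, with no explicit device-to-host transfer.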
void cuda_load_forces(float4 *f, float4 *f_slow, int begin, int count) {
  cudaMemcpyAsync(f+begin, forces+begin, count * sizeof(float4),
      cudaMemcpyDeviceToHost, stream);
  cudaMemcpyAsync(f_slow+begin, slow_forces+begin, count * sizeof(float4),
      cudaMemcpyDeviceToHost, stream);
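// Usage sketch (hypothetical caller): the copies above are asynchronous on
// "stream", so the host must synchronize before touching f or f_slow:
//   cuda_load_forces(h_forces, h_slow_forces, 0, num_atoms);
//   cudaStreamSynchronize(stream);   // or poll cuda_stream_finished()
//   // h_forces is now safe to read
// For the copy to overlap host work, h_forces must be pinned memory
// (cudaMallocHost / cudaHostAlloc).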
void cuda_load_virials(float *v, int doSlow) {
  int count = force_lists_size;
  if ( doSlow ) count *= 2;
  cudaMemcpyAsync(v, virials, count * 16*sizeof(float),
      cudaMemcpyDeviceToHost, stream);
__host__ __device__ static int3 patch_coords_from_id(
    dim3 PATCH_GRID, int id) {
  return make_int3( id % PATCH_GRID.x,
      ( id / PATCH_GRID.x ) % PATCH_GRID.y,
      id / ( PATCH_GRID.x * PATCH_GRID.y ) );
}

__host__ __device__ static int patch_id_from_coords(
    dim3 PATCH_GRID, int3 coords) {
  // shift by 4 grid widths so C's % behaves as a periodic wrap for
  // moderately negative coordinates
  int x = (coords.x + 4 * PATCH_GRID.x) % PATCH_GRID.x;
  int y = (coords.y + 4 * PATCH_GRID.y) % PATCH_GRID.y;
  int z = (coords.z + 4 * PATCH_GRID.z) % PATCH_GRID.z;
  return ( z * PATCH_GRID.y + y ) * PATCH_GRID.x + x;
}

__host__ __device__ static int3 patch_offset_from_neighbor(int neighbor) {
  // neighbors are numbered 0..26 over a 3x3x3 stencil; 13 is the center
  int3 coords = patch_coords_from_id(make_uint3(3,3,3), neighbor);
  return make_int3(coords.x - 1, coords.y - 1, coords.z - 1);
}
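// Host-side sanity sketch (not in the original file) showing how the three
// helpers compose: id -> coords -> id is the identity, and neighbor 13 of
// the 3x3x3 stencil is the zero offset (the patch itself).
#include <assert.h>

static void check_patch_indexing(dim3 PATCH_GRID) {
  int n = (int)(PATCH_GRID.x * PATCH_GRID.y * PATCH_GRID.z);
  for ( int id = 0; id < n; ++id ) {
    int3 c = patch_coords_from_id(PATCH_GRID, id);
    assert(patch_id_from_coords(PATCH_GRID, c) == id);
  }
  int3 self = patch_offset_from_neighbor(13);  // coords (1,1,1) - 1
  assert(self.x == 0 && self.y == 0 && self.z == 0);
}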
#define BLOCK_SIZE 128

#define MAKE_PAIRLIST
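// The MAKE_PAIRLIST define and the paired *_pairlist/plain kernels
// dispatched below suggest the usual multiple-inclusion trick; a sketch of
// that pattern (the actual header name is an assumption):
//   #define MAKE_PAIRLIST
//   #include "ComputeNonbondedCUDAKernelBase.h"  // *_pairlist variants
//   #undef MAKE_PAIRLIST
//   #include "ComputeNonbondedCUDAKernelBase.h"  // plain variants
// Each inclusion compiles the same kernel body, with pairlist construction
// compiled in or out.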
int cbegin, int ccount, int ctotal,
int doSlow, int doEnergy, int usePairlists, int savePairlists,
int doStreaming, int saveOrder, cudaStream_t &strm) {

if ( usePairlists ) {
  if ( ! savePairlists ) plcutoff2 = 0.;

for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
  if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;
#define CALL(X) X<<< grid_dim, nthread3, 0, strm >>> ( \
    patch_pairs, atoms, atom_params, vdw_types, plist, \
    tmpforces, (doSlow?slow_tmpforces:NULL), \
    forces, (doSlow?slow_forces:NULL), \
    tmpvirials, (doSlow?slow_tmpvirials:NULL), \
    virials, (doSlow?slow_virials:NULL), \
    global_counters, (doStreaming?force_ready_queue:NULL), \
    overflow_exclusions, num_patches, \
    cbegin+cstart, ctotal, (saveOrder?block_order:NULL), \
    exclmasks, lj_table_size, \
    lata, latb, latc, cutoff2, plcutoff2, doSlow)
// kernel dispatch: the (elided) doEnergy/doSlow branches select among the
// eight variants; nonzero plcutoff2 selects the pairlist-building form
if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
else CALL(dev_nonbonded_slow_energy);

if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
else CALL(dev_nonbonded_energy);

if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
else CALL(dev_nonbonded_slow);

if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
else CALL(dev_nonbonded);
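// Minimal standalone sketch of the same chunked-launch pattern used above
// (hypothetical kernel; BLOCK_SIZE as defined earlier): each launch is
// capped at max_grid blocks so the device's 1-D grid limit is never
// exceeded while the start offset walks the full range.
__global__ void chunk_kernel(int offset) {
  int i = offset + blockIdx.x;  // global work-unit index (placeholder body)
  (void)i;
}

static void launch_chunked(int total, int max_grid, cudaStream_t s) {
  int grid_dim = max_grid;
  for ( int start = 0; start < total; start += grid_dim ) {
    if ( grid_dim > total - start ) grid_dim = total - start;
    chunk_kernel<<< grid_dim, BLOCK_SIZE, 0, s >>>(start);
  }
}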
for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
  if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;

  GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(

for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
  if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;

  GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(

for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
  if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;

  GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(

return ( cudaStreamQuery(stream) == cudaSuccess );
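// Usage sketch (hypothetical caller): cudaStreamQuery is a non-blocking
// poll; it returns cudaSuccess once all work queued on the stream has
// completed and cudaErrorNotReady otherwise, so a host thread can overlap
// other work instead of blocking in cudaStreamSynchronize:
//   while ( ! cuda_stream_finished() ) {
//     do_other_host_work();  // hypothetical
//   }
//   // results are now ready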
void cuda_bind_force_table(const float4 *t, const float4 *et)
static __thread unsigned int * GBIS_P2_counters
void cuda_bind_forces(float4 *f, float4 *f_slow)
static __thread GBReal * tmp_dEdaSumD
int cuda_stream_finished()
static __thread float * tmp_energy_gbis
void cuda_bind_exclusions(const unsigned int *t, int n)
static __thread float * intRadSD
void cuda_bind_atoms(const atom *a)
static __thread int patch_pairs_size
static __thread int intRad0D_size
__global__ void /* tile-list nonbonded kernel; name elided in extraction */ (const int, const TileList *__restrict__, TileExcl *__restrict__, const int *__restrict__, const int, const float2 *__restrict__, cudaTextureObject_t, const int *__restrict__, const float3 lata, const float3 latb, const float3 latc, const float4 *__restrict__, const float cutoff2, cudaTextureObject_t, cudaTextureObject_t, float plcutoff2, ...)
static __thread int lj_table_size
static __thread float * bornRadD
static __thread unsigned int * exclusions
static __thread atom * atoms
static __thread float * intRad0D
static __thread float4 * forces
static __thread GBReal * tmp_psiSumD
static __thread unsigned int * plist
static __thread unsigned int * overflow_exclusions
static __thread GBReal * dEdaSumD
static __thread float * slow_virials
static __thread float * bornRadH
static __thread int GBIS_P1_counters_size
static __thread int intRadSD_size
static __thread float * energy_gbis
static __thread exclmask * exclmasks
static __thread float * slow_tmpvirials
static __thread float * dHdrPrefixH
static __thread int plist_size
void cuda_bind_GBIS_energy(float *e)
void cuda_bind_GBIS_dEdaSum(GBReal *dEdaSumH)
static __thread float * virials
static __thread float4 * slow_forces
static __thread int vdw_types_size
void cuda_nonbonded_forces(float3 lata, float3 latb, float3 latc, float cutoff2, float plcutoff2, int cbegin, int ccount, int ctotal, int doSlow, int doEnergy, int usePairlists, int savePairlists, int doStreaming, int saveOrder, cudaStream_t &strm)
static __thread int atom_params_size
__thread cudaStream_t stream
static __thread float * dHdrPrefixD
static __thread int global_counters_size
static __thread int GBIS_P3_counters_size
static __thread float * tmpvirials
static __thread int * force_ready_queue
static __thread patch_pair * patch_pairs
__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS]
static __thread float * intRadSH
void cuda_bind_patch_pairs(patch_pair *h_patch_pairs, int npatch_pairs, int npatches, int natoms, int plist_len, int nexclmask)
void cuda_bind_vdw_types(const int *t)
void cuda_bind_lj_table(const float2 *t, int _lj_table_size)
static __thread unsigned int * GBIS_P1_counters
static __thread int num_patches
texture< float2, 1, cudaReadModeElementType > lj_table
void cuda_bind_GBIS_dHdrPrefix(float *dHdrPrefixH)
static __thread int num_virials
texture< unsigned int, 1, cudaReadModeElementType > tex_exclusions
static __thread int tmpvirials_size
static __thread int tmp_psiSumD_size
static __thread int bornRadD_size
static __thread atom_param * atom_params
void cuda_bind_GBIS_bornRad(float *bornRadH)
static __thread int GBIS_P2_counters_size
static __thread int tmp_dEdaSumD_size
static __thread int tmp_energy_gbis_size
void cuda_bind_virials(float *v, int *queue, int *blockorder)
static __thread unsigned int * GBIS_P3_counters
static __thread int num_atoms
static __thread int slow_tmpforces_size
static __thread int exclusions_size
void cuda_bind_GBIS_psiSum(GBReal *psiSumH)
void cuda_GBIS_P3(int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float scaling, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
#define MAX_CONST_EXCLUSIONS
static __thread float4 * slow_tmpforces
static __thread int tmpforces_size
void cuda_GBIS_P1(int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
static __thread unsigned int * global_counters
void cuda_errcheck(const char *msg)
void cuda_bind_GBIS_intRad(float *intRad0H, float *intRadSH)
static __thread int slow_tmpvirials_size
texture< float4, 1, cudaReadModeElementType > energy_table
static __thread int dHdrPrefixD_size
static __thread int exclmasks_size
static __thread int * vdw_types
__thread int max_grid_size
texture< float4, 1, cudaReadModeElementType > force_table
void cuda_GBIS_P2(int cbegin, int ccount, int pbegin, int pcount, float a_cut, float r_cut, float scaling, float kappa, float smoothDist, float epsilon_p, float epsilon_s, float3 lata, float3 latb, float3 latc, int doEnergy, int doFullElec, cudaStream_t &strm)
__thread cudaStream_t stream2
static __thread int * block_order
void cuda_bind_atom_params(const atom_param *t)
#define CALL(DOENERGY, DOVIRIAL)
static __thread float4 * tmpforces
static __thread GBReal * psiSumD
static __thread float * intRad0H
static __thread int atoms_size