NAMD
ComputeNonbondedCUDAKernel.cu

#include "CudaUtils.h"
#include "ComputeNonbondedCUDAKernel.h"
#include <stdio.h>

#ifdef NAMD_CUDA

#ifdef WIN32
#define __thread __declspec(thread)
#endif

texture<unsigned int, 1, cudaReadModeElementType> tex_exclusions;
static __thread int exclusions_size;
static __thread unsigned int *exclusions;

__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS];
static __thread unsigned int *overflow_exclusions;

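// SET_EXCL marks one pair as excluded in a packed bit array: bit (BASE+DIFF)
// is set, i.e. word index (BASE+DIFF)>>5 and bit position (BASE+DIFF)&31.
// For example, SET_EXCL(excl,40,3) sets bit 11 of excl[1], since 40+3 = 43 = 32*1 + 11.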
#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))

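// Copy the exclusion bit-flag table to the device and make it available three
// ways: bound to tex_exclusions, as the overflow_exclusions copy in global
// memory, and (its first MAX_CONST_EXCLUSIONS words) in const_exclusions
// constant memory. The device buffers are reallocated only when the table grows.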
void cuda_bind_exclusions(const unsigned int *t, int n) {
  exclusions_size = n;
  static __thread int exclusions_alloc;
  if ( exclusions && exclusions_alloc < exclusions_size ) {
    cudaFree(exclusions);
    cuda_errcheck("freeing exclusions");
    cudaFree(overflow_exclusions);
    cuda_errcheck("freeing overflow_exclusions");
    exclusions = 0;
  }
  if ( ! exclusions ) {
    exclusions_alloc = exclusions_size;
    cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));
    cuda_errcheck("malloc exclusions");
    cudaMalloc((void**) &overflow_exclusions, n*sizeof(unsigned int));
    cuda_errcheck("malloc overflow_exclusions");
  }
  cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy exclusions");
  tex_exclusions.normalized = false;
  tex_exclusions.addressMode[0] = cudaAddressModeClamp;
  tex_exclusions.filterMode = cudaFilterModePoint;
  cudaBindTexture(NULL, tex_exclusions, exclusions, n*sizeof(unsigned int));
  cuda_errcheck("binding exclusions to texture");

  cudaMemcpy(overflow_exclusions, t,
      n*sizeof(unsigned int), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to overflow_exclusions");
  int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
  cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
  cuda_errcheck("memcpy to const_exclusions");
}

texture<float2, 1, cudaReadModeElementType> lj_table;
static __thread int lj_table_size;

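// Upload the lj_table_size x lj_table_size table of Lennard-Jones coefficients
// (a float2 entry per pair of vdW types) and bind it to the lj_table texture
// with point sampling. The device buffer is reallocated only when the table grows.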
void cuda_bind_lj_table(const float2 *t, int _lj_table_size) {
  static __thread float2 *ct;
  static __thread int lj_table_alloc;
  lj_table_size = _lj_table_size;
  if ( ct && lj_table_alloc < lj_table_size ) {
    cudaFree(ct);
    cuda_errcheck("freeing lj table");
    ct = 0;
  }
  if ( ! ct ) {
    lj_table_alloc = lj_table_size;
    cudaMalloc((void**) &ct, lj_table_size*lj_table_size*sizeof(float2));
    cuda_errcheck("allocating lj table");
  }
  cudaMemcpy(ct, t, lj_table_size*lj_table_size*sizeof(float2),
      cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to lj table");

  lj_table.normalized = false;
  lj_table.addressMode[0] = cudaAddressModeClamp;
  lj_table.filterMode = cudaFilterModePoint;

  cudaBindTexture((size_t*)0, lj_table, ct,
      lj_table_size*lj_table_size*sizeof(float2));
  cuda_errcheck("binding lj table to texture");
}

texture<float4, 1, cudaReadModeElementType> force_table;
texture<float4, 1, cudaReadModeElementType> energy_table;

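// Copy the FORCE_TABLE_SIZE-entry force and energy interpolation tables into
// cudaArrays and bind them to normalized textures with linear filtering, so
// kernels can fetch interpolated table values with a single texture lookup.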
void cuda_bind_force_table(const float4 *t, const float4 *et) {
  static __thread cudaArray *ct;
  static __thread cudaArray *ect;
  if ( ! ct ) {
    cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
    cuda_errcheck("allocating force table");
  }
  if ( ! ect ) {
    cudaMallocArray(&ect, &energy_table.channelDesc, FORCE_TABLE_SIZE, 1);
    cuda_errcheck("allocating energy table");
  }
  cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
  // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to force table");
  cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to energy table");

  force_table.normalized = true;
  force_table.addressMode[0] = cudaAddressModeClamp;
  force_table.addressMode[1] = cudaAddressModeClamp;
  force_table.filterMode = cudaFilterModeLinear;

  energy_table.normalized = true;
  energy_table.addressMode[0] = cudaAddressModeClamp;
  energy_table.addressMode[1] = cudaAddressModeClamp;
  energy_table.filterMode = cudaFilterModeLinear;

  cudaBindTextureToArray(force_table, ct);
  cuda_errcheck("binding force table to texture");

  cudaBindTextureToArray(energy_table, ect);
  cuda_errcheck("binding energy table to texture");
}

static __thread int num_patches;
static __thread int num_virials;
static __thread int num_atoms;
// Size of each device array, followed by the array pointer
static __thread int patch_pairs_size;
static __thread patch_pair* patch_pairs;
static __thread int atom_params_size;
static __thread atom_param* atom_params;
static __thread int vdw_types_size;
static __thread int* vdw_types;
static __thread int atoms_size;
static __thread atom* atoms;

static __thread int tmpforces_size;
static __thread float4* tmpforces;
static __thread int slow_tmpforces_size;
static __thread float4* slow_tmpforces;

static __thread int tmpvirials_size;
static __thread float* tmpvirials;
static __thread int slow_tmpvirials_size;
static __thread float* slow_tmpvirials;

static __thread int global_counters_size;
static __thread unsigned int* global_counters;
static __thread int plist_size;
static __thread unsigned int* plist;
static __thread int exclmasks_size;
static __thread exclmask* exclmasks;
// Device pointers to the page-locked host arrays (provided by the ComputeNonbondedCUDA class)
static __thread float4* forces;
static __thread float4* slow_forces;
static __thread int* force_ready_queue;
static __thread float* virials;
static __thread float* slow_virials;
static __thread int* block_order;

// GBIS arrays
static __thread int intRad0D_size;
static __thread float *intRad0D;

static __thread int intRadSD_size;
static __thread float *intRadSD;

static __thread GBReal *psiSumD;  // host-mapped memory

static __thread int tmp_psiSumD_size;
static __thread GBReal *tmp_psiSumD;

static __thread int bornRadD_size;
static __thread float *bornRadD;

static __thread GBReal *dEdaSumD;  // host-mapped memory

static __thread int tmp_dEdaSumD_size;
static __thread GBReal *tmp_dEdaSumD;

static __thread int dHdrPrefixD_size;
static __thread float *dHdrPrefixD;

static __thread int GBIS_P1_counters_size;
static __thread unsigned int *GBIS_P1_counters;

static __thread int GBIS_P2_counters_size;
static __thread unsigned int *GBIS_P2_counters;

static __thread int GBIS_P3_counters_size;
static __thread unsigned int *GBIS_P3_counters;

static __thread float *energy_gbis;  // host-mapped memory

static __thread int tmp_energy_gbis_size;
static __thread float *tmp_energy_gbis;

__thread int max_grid_size;

__thread cudaStream_t stream;
__thread cudaStream_t stream2;

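// cuda_init() resets every per-thread device pointer and size counter to a
// known empty state and records the device's maximum grid dimension, which
// cuda_nonbonded_forces() and the GBIS drivers use to split large launches.
//
// Illustrative host-side ordering (a sketch only; the real driver is the
// ComputeNonbondedCUDA class, and the argument names below are placeholders):
//
//   cuda_init();
//   cuda_bind_exclusions(excl_flags, n_excl_words);
//   cuda_bind_lj_table(lj_entries, lj_dim);
//   cuda_bind_force_table(force_tbl, energy_tbl);
//   cuda_bind_patch_pairs(pairs, n_pairs, n_patches, n_atoms, plist_len, n_exclmask);
//   // each timestep:
//   cuda_bind_atoms(atom_array);
//   cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
//                         0, n_pairs, n_pairs, doSlow, doEnergy,
//                         usePairlists, savePairlists, doStreaming, saveOrder, stream);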
void cuda_init() {
  patch_pairs_size = 0;
  patch_pairs = NULL;

  atom_params_size = 0;
  atom_params = NULL;

  vdw_types_size = 0;
  vdw_types = NULL;

  atoms_size = 0;
  atoms = NULL;

  tmpforces_size = 0;
  tmpforces = NULL;

  slow_tmpforces_size = 0;
  slow_tmpforces = NULL;

  tmpvirials_size = 0;
  tmpvirials = NULL;

  slow_tmpvirials_size = 0;
  slow_tmpvirials = NULL;

  global_counters_size = 0;
  global_counters = NULL;

  plist_size = 0;
  plist = NULL;

  exclmasks_size = 0;
  exclmasks = NULL;

  forces = NULL;
  slow_forces = NULL;

  force_ready_queue = NULL;

  exclusions_size = 0;
  exclusions = NULL;

  // --------------------
  // For GBIS
  // --------------------
  intRad0D_size = 0;
  intRad0D = NULL;

  intRadSD_size = 0;
  intRadSD = NULL;

  psiSumD = NULL;  // host-mapped memory

  tmp_psiSumD_size = 0;
  tmp_psiSumD = NULL;

  bornRadD_size = 0;
  bornRadD = NULL;

  dEdaSumD = NULL;  // host-mapped memory

  tmp_dEdaSumD_size = 0;
  tmp_dEdaSumD = NULL;

  dHdrPrefixD_size = 0;
  dHdrPrefixD = NULL;

  GBIS_P1_counters_size = 0;
  GBIS_P1_counters = NULL;

  GBIS_P2_counters_size = 0;
  GBIS_P2_counters = NULL;

  GBIS_P3_counters_size = 0;
  GBIS_P3_counters = NULL;

  energy_gbis = NULL;  // host-mapped memory

  tmp_energy_gbis_size = 0;
  tmp_energy_gbis = NULL;

  int dev;
  cudaGetDevice(&dev);
  cuda_errcheck("cudaGetDevice");
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  cuda_errcheck("cudaGetDeviceProperties");
  max_grid_size = deviceProp.maxGridSize[1];
}

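// cuda_bind_patch_pairs() sizes (or grows, with a 1.2x over-allocation factor)
// every per-atom and per-patch device buffer, uploads the patch-pair list, and
// zeroes the counters and temporary force/virial accumulators.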
void cuda_bind_patch_pairs(patch_pair *h_patch_pairs, int npatch_pairs,
                           int npatches, int natoms, int plist_len,
                           int nexclmask) {
  num_patches = npatches;
  num_virials = npatches;
  num_atoms = natoms;
  reallocate_device<patch_pair>(&patch_pairs, &patch_pairs_size, npatch_pairs, 1.2f);
  reallocate_device<atom>(&atoms, &atoms_size, num_atoms, 1.2f);
  reallocate_device<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
  reallocate_device<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
  reallocate_device<unsigned int>(&global_counters, &global_counters_size, num_patches+2, 1.2f);
  reallocate_device<float4>(&tmpforces, &tmpforces_size, num_atoms, 1.2f);
  reallocate_device<float4>(&slow_tmpforces, &slow_tmpforces_size, num_atoms, 1.2f);
  reallocate_device<unsigned int>(&plist, &plist_size, plist_len, 1.2f);
  reallocate_device<exclmask>(&exclmasks, &exclmasks_size, nexclmask, 1.2f);
  reallocate_device<float>(&tmpvirials, &tmpvirials_size, num_patches*16, 1.2f);
  reallocate_device<float>(&slow_tmpvirials, &slow_tmpvirials_size, num_patches*16, 1.2f);

  // For GBIS
  reallocate_device<unsigned int>(&GBIS_P1_counters, &GBIS_P1_counters_size, num_patches, 1.2f);
  reallocate_device<unsigned int>(&GBIS_P2_counters, &GBIS_P2_counters_size, num_patches, 1.2f);
  reallocate_device<unsigned int>(&GBIS_P3_counters, &GBIS_P3_counters_size, num_patches, 1.2f);
  reallocate_device<float>(&intRad0D, &intRad0D_size, num_atoms, 1.2f);
  reallocate_device<float>(&intRadSD, &intRadSD_size, num_atoms, 1.2f);
  reallocate_device<GBReal>(&tmp_psiSumD, &tmp_psiSumD_size, num_atoms, 1.2f);
  reallocate_device<float>(&bornRadD, &bornRadD_size, num_atoms, 1.2f);
  reallocate_device<GBReal>(&tmp_dEdaSumD, &tmp_dEdaSumD_size, num_atoms, 1.2f);
  reallocate_device<float>(&dHdrPrefixD, &dHdrPrefixD_size, num_atoms, 1.2f);
  reallocate_device<float>(&tmp_energy_gbis, &tmp_energy_gbis_size, num_patches, 1.2f);

  cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to patch_pairs");

  cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
  cuda_errcheck("memset global_counters");

  cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
  cuda_errcheck("memset GBIS_P1_counters");

  cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
  cuda_errcheck("memset GBIS_P2_counters");

  cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
  cuda_errcheck("memset GBIS_P3_counters");

  cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
  cuda_errcheck("memset tmpforces");

  cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
  cuda_errcheck("memset tmpvirials");

  cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
  cuda_errcheck("memset slow_tmpforces");

  cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
  cuda_errcheck("memset slow_tmpvirials");

}

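// Per-timestep uploads: atom parameters, vdW type indices, and atom positions
// are copied host-to-device asynchronously on `stream`, so the copies can
// overlap with other work already queued on that stream.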
void cuda_bind_atom_params(const atom_param *t) {
  cudaMemcpyAsync(atom_params, t, num_atoms * sizeof(atom_param),
      cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to atom_params");
}

void cuda_bind_vdw_types(const int *t) {
  cudaMemcpyAsync(vdw_types, t, num_atoms * sizeof(int),
      cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to vdw_types");
}

void cuda_bind_atoms(const atom *a) {
  cuda_errcheck("before memcpy to atoms");
  cudaMemcpyAsync(atoms, a, num_atoms * sizeof(atom),
      cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to atoms");
}

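// The force, virial, and queue buffers live in page-locked host memory mapped
// into the device address space; these calls just translate the host pointers
// into device pointers so kernels can write results directly back to the host
// (zero-copy).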
void cuda_bind_forces(float4 *f, float4 *f_slow) {
  cudaHostGetDevicePointer(&forces, f, 0);
  cuda_errcheck("cudaHostGetDevicePointer forces");
  cudaHostGetDevicePointer(&slow_forces, f_slow, 0);
  cuda_errcheck("cudaHostGetDevicePointer slow_forces");
}

void cuda_bind_virials(float *v, int *queue, int *blockorder) {
  cudaHostGetDevicePointer(&virials, v, 0);
  cuda_errcheck("cudaHostGetDevicePointer virials");
  slow_virials = virials + num_virials*16;
  cudaHostGetDevicePointer(&force_ready_queue, queue, 0);
  cuda_errcheck("cudaHostGetDevicePointer force_ready_queue");
  cudaHostGetDevicePointer(&block_order, blockorder, 0);
  cuda_errcheck("cudaHostGetDevicePointer block_order");
}

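// GBIS bindings: per-atom inputs (intrinsic radii, Born radii, dH/dr prefix)
// are uploaded asynchronously each step, while the output sums (psiSum,
// dEdaSum, GB energy) are host-mapped buffers resolved to device pointers.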
// GBIS bindings
void cuda_bind_GBIS_energy(float *e) {
  cudaHostGetDevicePointer(&energy_gbis, e, 0);
  cuda_errcheck("cudaHostGetDevicePointer energy_gbis");
}
void cuda_bind_GBIS_intRad(float *intRad0H, float *intRadSH) {
  cudaMemcpyAsync(intRad0D, intRad0H, num_atoms * sizeof(float),
      cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(intRadSD, intRadSH, num_atoms * sizeof(float),
      cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to intRad");
}

void cuda_bind_GBIS_psiSum(GBReal *psiSumH) {
  cudaHostGetDevicePointer(&psiSumD, psiSumH, 0);
  cuda_errcheck("cudaHostGetDevicePointer psiSum");
}

void cuda_bind_GBIS_bornRad(float *bornRadH) {
  cudaMemcpyAsync(bornRadD, bornRadH, num_atoms * sizeof(float),
      cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to bornRad");
}

void cuda_bind_GBIS_dEdaSum(GBReal *dEdaSumH) {
  cudaHostGetDevicePointer(&dEdaSumD, dEdaSumH, 0);
  cuda_errcheck("cudaHostGetDevicePointer dEdaSum");
}

void cuda_bind_GBIS_dHdrPrefix(float *dHdrPrefixH) {
  cudaMemcpyAsync(dHdrPrefixD, dHdrPrefixH, num_atoms * sizeof(float),
      cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to dHdrPrefix");
}
// end GBIS methods

#if 0
void cuda_load_forces(float4 *f, float4 *f_slow, int begin, int count) {
  // printf("load forces %d %d %d\n",begin,count,num_atoms);
  cudaMemcpyAsync(f+begin, forces+begin, count * sizeof(float4),
      cudaMemcpyDeviceToHost, stream);
  if ( f_slow ) {
    cudaMemcpyAsync(f_slow+begin, slow_forces+begin, count * sizeof(float4),
        cudaMemcpyDeviceToHost, stream);
  }
  cuda_errcheck("memcpy from forces");
}

void cuda_load_virials(float *v, int doSlow) {
  int count = force_lists_size;
  if ( doSlow ) count *= 2;
  cudaMemcpyAsync(v, virials, count * 16*sizeof(float),
      cudaMemcpyDeviceToHost, stream);
  cuda_errcheck("memcpy from virials");
}
#endif

#if 0
__host__ __device__ static int3 patch_coords_from_id(
    dim3 PATCH_GRID, int id) {

  return make_int3( id % PATCH_GRID.x,
                    ( id / PATCH_GRID.x ) % PATCH_GRID.y,
                    id / ( PATCH_GRID.x * PATCH_GRID.y ) );
}

__host__ __device__ static int patch_id_from_coords(
    dim3 PATCH_GRID, int3 coords) {

  // handles periodic boundaries
  int x = (coords.x + 4 * PATCH_GRID.x) % PATCH_GRID.x;
  int y = (coords.y + 4 * PATCH_GRID.y) % PATCH_GRID.y;
  int z = (coords.z + 4 * PATCH_GRID.z) % PATCH_GRID.z;

  return ( z * PATCH_GRID.y + y ) * PATCH_GRID.x + x;
}

__host__ __device__ static int3 patch_offset_from_neighbor(int neighbor) {

  // int3 coords = patch_coords_from_id(make_uint3(3,3,3), 13 + neighbor);
  int3 coords = patch_coords_from_id(make_uint3(3,3,3), neighbor);
  return make_int3(coords.x - 1, coords.y - 1, coords.z - 1);

}
#endif

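// The nonbonded kernels are generated by textual inclusion: each #include of
// the kernel base header below compiles the kernel body for one combination of
// MAKE_PAIRLIST / DO_SLOW / DO_ENERGY, yielding the eight dev_nonbonded*
// variants dispatched by cuda_nonbonded_forces().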
#define BLOCK_SIZE 128
#define SHARED_SIZE 32

#define MAKE_PAIRLIST
#define DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef MAKE_PAIRLIST
#define DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"

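// cuda_nonbonded_forces() clears the temporary accumulators, then picks one of
// the eight kernel variants based on doEnergy, doSlow, and whether a pairlist
// is being built this step (plcutoff2 != 0) or an existing one reused,
// launching it in chunks of at most max_grid_size patch pairs per grid.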
void cuda_nonbonded_forces(float3 lata, float3 latb, float3 latc,
    float cutoff2, float plcutoff2,
    int cbegin, int ccount, int ctotal,
    int doSlow, int doEnergy, int usePairlists, int savePairlists,
    int doStreaming, int saveOrder, cudaStream_t &strm) {

  if ( ccount ) {
    if ( usePairlists ) {
      if ( ! savePairlists ) plcutoff2 = 0.;
    } else {
      plcutoff2 = cutoff2;
    }

    cudaMemsetAsync(tmpforces, 0, num_atoms*sizeof(float4), strm);
    cudaMemsetAsync(tmpvirials, 0, num_patches*sizeof(float)*16, strm);
    if ( doSlow ) {
      cudaMemsetAsync(slow_tmpforces, 0, num_atoms*sizeof(float4), strm);
      cudaMemsetAsync(slow_tmpvirials, 0, num_patches*sizeof(float)*16, strm);
    }

    int grid_dim = max_grid_size;  // maximum allowed
    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
      if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;

      dim3 nthread3(WARPSIZE, NUM_WARP, 1);

#define CALL(X) X<<< grid_dim, nthread3, 0, strm >>> \
  (patch_pairs, atoms, atom_params, vdw_types, plist, \
   tmpforces, (doSlow?slow_tmpforces:NULL), \
   forces, (doSlow?slow_forces:NULL), \
   tmpvirials, (doSlow?slow_tmpvirials:NULL), \
   virials, (doSlow?slow_virials:NULL), \
   global_counters, (doStreaming?force_ready_queue:NULL), \
   overflow_exclusions, num_patches, \
   cbegin+cstart, ctotal, (saveOrder?block_order:NULL), \
   exclmasks, lj_table_size, \
   lata, latb, latc, cutoff2, plcutoff2, doSlow)
// end definition

      if ( doEnergy ) {
        if ( doSlow ) {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
          else CALL(dev_nonbonded_slow_energy);
        } else {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
          else CALL(dev_nonbonded_energy);
        }
      } else {
        if ( doSlow ) {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
          else CALL(dev_nonbonded_slow);
        } else {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
          else CALL(dev_nonbonded);
        }
      }

      cuda_errcheck("dev_nonbonded");
    }
  }

}

// import GBIS kernel definitions
#include "ComputeGBISCUDAKernel.h"

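// Generalized Born (GBIS) is computed in three phases, each driven the same
// way as the nonbonded kernels (one launch per chunk of patch pairs):
//   P1 reads intRad0D/intRadSD and accumulates psi sums into psiSumD,
//   P2 reads bornRadD and accumulates dEdaSumD, GB energies, and forces,
//   P3 reads dHdrPrefixD and accumulates the remaining forces into slow_forces.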
// GBIS P1
void cuda_GBIS_P1(
    int cbegin,
    int ccount,
    int pbegin,
    int pcount,
    float a_cut,
    float rho_0,
    float3 lata,
    float3 latb,
    float3 latc,
    cudaStream_t &strm
    ) {

  if ( ccount ) {
    cudaMemsetAsync(tmp_psiSumD, 0, num_atoms*sizeof(GBReal), strm);

    int grid_dim = max_grid_size;  // maximum allowed
    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
      if (grid_dim > ccount - cstart) {
        grid_dim = ccount - cstart;
      }

      dim3 nthread3(WARPSIZE, NUM_WARP, 1);
      GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(
          patch_pairs+cbegin+cstart,
          atoms,
          intRad0D,
          intRadSD,
          tmp_psiSumD,
          psiSumD,
          a_cut,
          rho_0,
          lata,
          latb,
          latc,
          GBIS_P1_counters
          );
      cuda_errcheck("dev_GBIS_P1");
    } // end for
  }
} // end GBIS P1

// GBIS P2
void cuda_GBIS_P2(
    int cbegin,
    int ccount,
    int pbegin,
    int pcount,
    float a_cut,
    float r_cut,
    float scaling,
    float kappa,
    float smoothDist,
    float epsilon_p,
    float epsilon_s,
    float3 lata,
    float3 latb,
    float3 latc,
    int doEnergy,
    int doFullElec,
    cudaStream_t &strm
    ) {

  if ( ccount ) {
    cudaMemsetAsync(tmp_dEdaSumD, 0, num_atoms*sizeof(GBReal), strm);
    cudaMemsetAsync(tmp_energy_gbis, 0, num_patches*sizeof(float), strm);

    int grid_dim = max_grid_size;  // maximum allowed
    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
      if (grid_dim > ccount - cstart)
        grid_dim = ccount - cstart;

      dim3 nthread3(WARPSIZE, NUM_WARP, 1);
      GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(
          patch_pairs+cbegin+cstart,
          atoms,
          bornRadD,
          tmp_dEdaSumD,
          dEdaSumD,
          a_cut,
          r_cut,
          scaling,
          kappa,
          smoothDist,
          epsilon_p,
          epsilon_s,
          lata,
          latb,
          latc,
          doEnergy,
          doFullElec,
          tmpforces,
          forces,
          tmp_energy_gbis,
          energy_gbis,
          GBIS_P2_counters
          );
      cuda_errcheck("dev_GBIS_P2");
    } // end for
  }
} // end P2

// GBIS P3
void cuda_GBIS_P3(
    int cbegin,
    int ccount,
    int pbegin,
    int pcount,
    float a_cut,
    float rho_0,
    float scaling,
    float3 lata,
    float3 latb,
    float3 latc,
    cudaStream_t &strm
    ) {
  int grid_dim = max_grid_size;  // maximum allowed
  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
    if (grid_dim > ccount - cstart)
      grid_dim = ccount - cstart;

    dim3 nthread3(WARPSIZE, NUM_WARP, 1);
    GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(
        patch_pairs+cbegin+cstart,
        atoms,
        intRad0D,
        intRadSD,
        dHdrPrefixD,
        a_cut,
        rho_0,
        scaling,
        lata,
        latb,
        latc,
        slow_tmpforces,
        slow_forces,
        GBIS_P3_counters
        );
    cuda_errcheck("dev_GBIS_P3");
  }
}

#if 0
int cuda_stream_finished() {
  return ( cudaStreamQuery(stream) == cudaSuccess );
}
#endif


#else  // NAMD_CUDA

// for make depends
#include "ComputeGBISCUDAKernel.h"

#endif  // NAMD_CUDA