NAMD
ComputeNonbondedCUDA.C
1 
2 #include "common.h"
3 #include "charm++.h"
4 
5 #ifdef NAMD_CUDA
6 #include <cuda_runtime.h>
7 #include <cuda.h>
8 #endif
9 
10 #include "WorkDistrib.h"
11 #include "ComputeMgr.h"
12 #include "ProxyMgr.h"
14 #include "ComputeNonbondedCUDA.h"
15 #include "LJTable.h"
16 #include "ObjectArena.h"
17 #include "SortAtoms.h"
18 #include "Priorities.h"
19 #include <algorithm>
20 
21 #include "NamdTypes.h"
22 #include "DeviceCUDA.h"
23 #include "CudaUtils.h"
24 
25 //#define PRINT_GBIS
26 #undef PRINT_GBIS
27 
28 #ifdef NAMD_CUDA
29 
30 #ifdef WIN32
31 #define __thread __declspec(thread)
32 #endif
33 
34 extern __thread int max_grid_size;
35 
36 extern __thread cudaStream_t stream;
37 extern __thread cudaStream_t stream2;
38 
39 extern __thread DeviceCUDA *deviceCUDA;
40 
41 void cuda_errcheck(const char *msg) {
42  cudaError_t err;
43  if ((err = cudaGetLastError()) != cudaSuccess) {
44  char host[128];
45  gethostname(host, 128); host[127] = 0;
46  char devstr[128] = "";
47  int devnum;
48  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
49  sprintf(devstr, " device %d", devnum);
50  }
51  cudaDeviceProp deviceProp;
52  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
53  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
54  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
55  }
56  char errmsg[1024];
57  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
58  NAMD_die(errmsg);
59  }
60 }
61 
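// Compares two integers by their bit-reversed values (the lowest differing
// bit decides), so consecutive ranks interleave rather than cluster.  Used
// by cr_sortop_reverse_priority below, presumably to spread streamed work
// across the PE ranks sharing this device.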
62 static inline bool sortop_bitreverse(int a, int b) {
63  if ( a == b ) return 0;
64  for ( int bit = 1; bit; bit *= 2 ) {
65  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
66  }
67  return 0;
68 }
69 
70 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
71 static __thread ComputeMgr *computeMgr = 0;
72 
75 }
76 
78  if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
81 }
82 
84 
86  const int dim = ljTable->get_table_dim();
87 
88  // round dim up to odd multiple of 16
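  // e.g. dim = 42: (42+16+31)/32*32 - 16 = 48 = 3*16, the smallest odd
  // multiple of 16 that is >= 42; the odd stride presumably avoids
  // bank/alignment conflicts when the padded table is bound on the GPU.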
89  int tsize = (((dim+16+31)/32)*32)-16;
90  if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
91 
92  float2 *t = new float2[tsize*tsize];
93  float2 *row = t;
94  for ( int i=0; i<dim; ++i, row += tsize ) {
95  for ( int j=0; j<dim; ++j ) {
96  const LJTable::TableEntry *e = ljTable->table_val(i,j);
97  row[j].x = e->A * scaling;
98  row[j].y = e->B * scaling;
99  }
100  }
101 
102  cuda_bind_lj_table(t,tsize);
103  delete [] t;
104 
105  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
106  CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
107  }
108 }
109 
111 
112  float4 t[FORCE_TABLE_SIZE];
113  float4 et[FORCE_TABLE_SIZE]; // energy table
114 
117  // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
118  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
119 
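  // Table index i corresponds to r = FORCE_TABLE_SIZE/(i+0.5), i.e. the index
  // is roughly proportional to 1/r, so resolution is finest at short range
  // where the potentials vary fastest; r2list[] holds the matching
  // r^2 + r2_delta values in double precision so the lookup matches the CPU code.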
120  double r2list[FORCE_TABLE_SIZE]; // double to match cpu code
121  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
122  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
123  r2list[i] = r*r + r2_delta;
124  }
125 
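  // The spline tables are indexed by the IEEE-754 exponent and top mantissa
  // bits of r^2.  The union below detects which 32-bit half of a double holds
  // the high (sign/exponent) word on this host; (r2iilist[2*i] >> 14) then
  // extracts those bits, and r2_delta_expc rebases the result so that table_i
  // starts at 0 for r^2 near r2_delta (see the "table_i >= 0" comment below).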
126  union { double f; int32 i[2]; } byte_order_test;
127  byte_order_test.f = 1.0; // should occupy high-order bits only
128  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
129 
130  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
131  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
132  int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0
133 
134  if ( r > cutoff ) {
135  t[i].x = 0.;
136  t[i].y = 0.;
137  t[i].z = 0.;
138  t[i].w = 0.;
139  et[i].x = 0.;
140  et[i].y = 0.;
141  et[i].z = 0.;
142  et[i].w = 0.;
143  continue;
144  }
145 
146  BigReal diffa = r2list[i] - r2_table[table_i];
147 
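  // Each table entry holds four cubic-spline coefficients (a,b,c,d) in powers
  // of diffa = r^2 - r2_table[table_i]: ener = a + b*diffa + c*diffa^2 + d*diffa^3
  // evaluated by Horner's rule, and grad = d(ener)/d(r^2).  The factor of 2 in
  // the blocks below converts d/d(r^2) into the (1/r) d/dr force prefactor
  // noted in the comments; sign conventions are folded into the tables.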
148  // coulomb 1/r or fast force
149  // t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
150  {
151  BigReal table_a = fast_table[4*table_i];
152  BigReal table_b = fast_table[4*table_i+1];
153  BigReal table_c = fast_table[4*table_i+2];
154  BigReal table_d = fast_table[4*table_i+3];
155  BigReal grad =
156  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
157  t[i].x = 2. * grad;
158  BigReal ener = table_a + diffa *
159  ( ( table_d * diffa + table_c ) * diffa + table_b);
160  et[i].x = ener;
161  }
162 
163 
164  // pme correction for slow force
165  // t[i].w = 0.;
166  {
167  BigReal table_a = scor_table[4*table_i];
168  BigReal table_b = scor_table[4*table_i+1];
169  BigReal table_c = scor_table[4*table_i+2];
170  BigReal table_d = scor_table[4*table_i+3];
171  BigReal grad =
172  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
173  t[i].w = 2. * grad;
174  BigReal ener = table_a + diffa *
175  ( ( table_d * diffa + table_c ) * diffa + table_b);
176  et[i].w = ener;
177  }
178 
179 
180  // vdw 1/r^6
181  // t[i].y = 6. / (r8); // -1/r * d/dr r^-6
182  {
183  BigReal table_a = vdwb_table[4*table_i];
184  BigReal table_b = vdwb_table[4*table_i+1];
185  BigReal table_c = vdwb_table[4*table_i+2];
186  BigReal table_d = vdwb_table[4*table_i+3];
187  BigReal grad =
188  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
189  t[i].y = 2. * -1. * grad;
190  BigReal ener = table_a + diffa *
191  ( ( table_d * diffa + table_c ) * diffa + table_b);
192  et[i].y = -1. * ener;
193  }
194 
195 
196  // vdw 1/r^12
 197  // t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
198  {
199  BigReal table_a = vdwa_table[4*table_i];
200  BigReal table_b = vdwa_table[4*table_i+1];
201  BigReal table_c = vdwa_table[4*table_i+2];
202  BigReal table_d = vdwa_table[4*table_i+3];
203  BigReal grad =
204  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
205  t[i].z = 2. * grad;
206  BigReal ener = table_a + diffa *
207  ( ( table_d * diffa + table_c ) * diffa + table_b);
208  et[i].z = ener;
209  }
210 
211  // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
212  // t[i].x, t[i].y, t[i].z, t[i].w);
213 
214 /*
215  double r2 = r * r;
216  double r4 = r2 * r2;
217  double r8 = r4 * r4;
218 
219  t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
220  t[i].y = 6. / (r8); // -1/r * d/dr r^-6
221  t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
222  t[i].w = 0.;
223 */
224  }
225 
226  t[0].x = 0.f;
227  t[0].y = 0.f;
228  t[0].z = 0.f;
229  t[0].w = 0.f;
230  et[0].x = et[1].x;
231  et[0].y = et[1].y;
232  et[0].z = et[1].z;
233  et[0].w = et[1].w;
234 
235  cuda_bind_force_table(t,et);
236 
237  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
238  CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
239  }
240 }
241 
 242 struct exlist_sortop {
 243  bool operator() (int32 *li, int32 *lj) {
244  return ( li[1] < lj[1] );
245  }
246 };
247 
249  if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
251 }
252 
253 static __thread int2 *exclusionsByAtom;
254 
257 
258 #ifdef MEM_OPT_VERSION
259  int natoms = mol->exclSigPoolSize;
260 #else
261  int natoms = mol->numAtoms;
262 #endif
263 
264  delete [] exclusionsByAtom;
265  exclusionsByAtom = new int2[natoms];
266 
267  // create unique sorted lists
268 
269  ObjectArena<int32> listArena;
270  ResizeArray<int32*> unique_lists;
271  int32 **listsByAtom = new int32*[natoms];
273  for ( int i=0; i<natoms; ++i ) {
274  curList.resize(0);
275  curList.add(0); // always excluded from self
276 #ifdef MEM_OPT_VERSION
277  const ExclusionSignature *sig = mol->exclSigPool + i;
278  int n = sig->fullExclCnt;
279  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
280  n += 1;
281 #else
282  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
283  int n = mol_list[0] + 1;
284  for ( int j=1; j<n; ++j ) {
285  curList.add(mol_list[j] - i);
286  }
287 #endif
288  curList.sort();
289 
290  int j;
291  for ( j=0; j<unique_lists.size(); ++j ) {
292  if ( n != unique_lists[j][0] ) continue; // no match
293  int k;
294  for ( k=0; k<n; ++k ) {
295  if ( unique_lists[j][k+3] != curList[k] ) break;
296  }
297  if ( k == n ) break; // found match
298  }
299  if ( j == unique_lists.size() ) { // no match
300  int32 *list = listArena.getNewArray(n+3);
301  list[0] = n;
302  int maxdiff = 0;
303  maxdiff = -1 * curList[0];
304  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
305  list[1] = maxdiff;
306  for ( int k=0; k<n; ++k ) {
307  list[k+3] = curList[k];
308  }
309  unique_lists.add(list);
310  }
311  listsByAtom[i] = unique_lists[j];
312  }
313  // sort lists by maxdiff
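  // Each unique list becomes a window of 2*maxdiff+1 bits in one shared bit
  // array: list[1] = maxdiff (largest |relative index| in the list) and
  // list[2] = the absolute bit position of relative index 0, so the exclusion
  // bit for a partner at signed offset d is bit list[2]+d.  Sorting by maxdiff
  // presumably keeps windows of similar width adjacent in the packed array.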
314  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
315  long int totalbits = 0;
316  int nlists = unique_lists.size();
317  for ( int j=0; j<nlists; ++j ) {
318  int32 *list = unique_lists[j];
319  int maxdiff = list[1];
320  list[2] = totalbits + maxdiff;
321  totalbits += 2*maxdiff + 1;
322  }
323  for ( int i=0; i<natoms; ++i ) {
324  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
325  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
326  }
327  delete [] listsByAtom;
328 
329  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
330 
331  {
332  long int bytesneeded = totalbits / 8;
333  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
334  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
335  unique_lists.size(), bytesneeded);
336  }
337 
338  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
339  if ( bytesneeded > bytesavail ) {
340  char errmsg[512];
341  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
342  "but only %ld bytes can be addressed with 32-bit int.",
343  unique_lists.size(), bytesneeded, bytesavail);
344  NAMD_die(errmsg);
345  }
346  }
347 
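// Sets the bit at absolute index BASE+DIFF in the packed exclusion array:
// word (BASE+DIFF)/32, bit (BASE+DIFF)%32.  For example BASE = 40, DIFF = -3
// sets bit 37, i.e. bit 5 of word 1.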
348 #define SET_EXCL(EXCL,BASE,DIFF) \
349  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
350 
351  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
352  memset(exclusion_bits, 0, totalbits/8);
353 
354  long int base = 0;
355  for ( int i=0; i<unique_lists.size(); ++i ) {
356  base += unique_lists[i][1];
357  if ( unique_lists[i][2] != (int32)base ) {
358  NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
359  }
360  int n = unique_lists[i][0];
361  for ( int j=0; j<n; ++j ) {
362  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
363  }
364  base += unique_lists[i][1] + 1;
365  }
366 
367  cuda_bind_exclusions(exclusion_bits, totalbits/32);
368 
369  delete [] exclusion_bits;
370 }
371 
372 
374 
375  if ( ! cudaCompute ) NAMD_bug("register_self called early");
376 
378 
380  cr.c = c;
381  cr.pid[0] = pid; cr.pid[1] = pid;
382  cr.offset = 0.;
383  if ( cudaCompute->patchRecords[pid].isLocal ) {
385  } else {
387  }
388 }
389 
391 
392  if ( ! cudaCompute ) NAMD_bug("register_pair called early");
393 
394  cudaCompute->requirePatch(pid[0]);
395  cudaCompute->requirePatch(pid[1]);
396 
398  cr.c = c;
399  cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
400 
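  // t[0] and t[1] are periodic-image transform indices: each encodes a
  // (-1,0,+1) shift along the three lattice directions as a base-3 digit
  // triple, so the decoding below recovers the relative image shift of the
  // two patches and adds it to the difference of their centers.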
401  int t1 = t[0];
402  int t2 = t[1];
403  Vector offset = cudaCompute->patchMap->center(pid[0])
404  - cudaCompute->patchMap->center(pid[1]);
405  offset.x += (t1%3-1) - (t2%3-1);
406  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
407  offset.z += (t1/9-1) - (t2/9-1);
408  cr.offset = offset;
409 
410  if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
412  } else {
414  }
415 }
416 
418 
419  NAMD_bug("unregister_compute unimplemented");
420 
421 }
422 
423 // static __thread cudaEvent_t start_upload;
424 static __thread cudaEvent_t start_calc;
425 static __thread cudaEvent_t end_remote_download;
426 static __thread cudaEvent_t end_local_download;
427 
430 
431 void init_arrays();
432 
434  ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
435 #ifdef PRINT_GBIS
436  CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
437 #endif
438 
439  if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
440  if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
441  if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
442 
443  // CkPrintf("create ComputeNonbondedCUDA\n");
444  master = m ? m : this;
445  cudaCompute = this;
446  computeMgr = mgr;
449  reduction = 0;
450 
452  if (params->pressureProfileOn) {
453  NAMD_die("pressure profile not supported in CUDA");
454  }
455 
456  atomsChanged = 1;
457  computesChanged = 1;
459 
460  pairlistsValid = 0;
461  pairlistTolerance = 0.;
462  usePairlists = 0;
463  savePairlists = 0;
464  plcutoff2 = 0.;
465 
466  workStarted = 0;
469 
470  // Zero array sizes and pointers
471  init_arrays();
472 
473  atoms_size = 0;
474  atoms = NULL;
475 
476  forces_size = 0;
477  forces = NULL;
478 
479  slow_forces_size = 0;
480  slow_forces = NULL;
481 
482  psiSumH_size = 0;
483  psiSumH = NULL;
484 
485  dEdaSumH_size = 0;
486  dEdaSumH = NULL;
487 
488  if ( master != this ) { // I am slave
490  master->slaves[slaveIndex] = this;
491  if ( master->slavePes[slaveIndex] != CkMyPe() ) {
492  NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
493  }
495  registerPatches();
496  return;
497  }
498  masterPe = CkMyPe();
499 
500  const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
501  if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
503 
504  // Sanity checks for New CUDA kernel
505  if (params->GBISOn) {
506  // GBIS
507  if (deviceCUDA->getNoMergeGrids()) {
508  NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
509  }
510  // Set mergegrids ON as long as user hasn't defined +nomergegrids
512  // Final sanity check
513  if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
514  } else {
515  // non-GBIS
517  NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
518  }
519  }
520 
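  // If the runtime exposes stream priorities (CUDA >= 5.5), the main nonbonded
  // stream is created at the highest priority and stream2 at the lowest,
  // except when PME shares this device, in which case both use the lowest
  // priority, presumably so PME is not starved.  Otherwise stream2 is created
  // only if the device supports concurrent kernels; if not, stream2 aliases
  // stream.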
521 #if CUDA_VERSION >= 5050
522  int leastPriority, greatestPriority;
523  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
524  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
525  if ( leastPriority != greatestPriority ) {
526  if ( CkMyNode() == 0 ) {
527  int dev = deviceCUDA->getDeviceID();
528  CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
529  }
530  if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
531  greatestPriority = leastPriority;
532  }
533  if (params->usePMECUDA) greatestPriority = leastPriority;
534  cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
535  cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
536  } else
537 #endif
538  {
539  cudaStreamCreate(&stream);
540  cuda_errcheck("cudaStreamCreate");
541  int dev = deviceCUDA->getDeviceID();
542  cudaDeviceProp deviceProp;
543  cudaGetDeviceProperties(&deviceProp, dev);
544  cuda_errcheck("cudaGetDeviceProperties");
545  if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
546  if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
547  cudaStreamCreate(&stream2);
548  } else {
549  stream2 = stream;
550  }
551  }
552  cuda_errcheck("cudaStreamCreate");
553 
554  // Get GPU device ID
556 
557  cuda_init();
558  if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
560  // cudaEventCreate(&start_upload);
561  cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
562  cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
563  cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
564 
565  patchRecords.resize(patchMap->numPatches());
568 
569  if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
570 }
571 
572 
574 
576 
577  computesChanged = 1;
578  patch_record &pr = patchRecords[pid];
579  if ( pr.refCount == 0 ) {
580  pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
581  pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
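  // "Local" vs "remote" selects which of the two kernel/download batches the
  // patch joins (remote work is launched and downloaded first).  With merged
  // grids everything is treated as remote; on a single node a parity
  // checkerboard of the patch indices splits patches between the two batches,
  // presumably to overlap the force download with the remaining computation.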
582  if ( deviceCUDA->getMergeGrids() ) {
583  pr.isLocal = 0;
584  } else if ( CkNumNodes() < 2 ) {
585  pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
586  patchMap->index_b(pid) ^ patchMap->index_c(pid) );
587  } else {
588  pr.isLocal = pr.isSameNode;
589  }
590  if ( pr.isLocal ) {
591  localActivePatches.add(pid);
592  } else {
 593  remoteActivePatches.add(pid);
 594  }
595  activePatches.add(pid);
596  pr.patchID = pid;
597  pr.hostPe = -1;
598  pr.x = NULL;
599  pr.xExt = NULL;
600  pr.r = NULL;
601  pr.f = NULL;
602  pr.intRad = NULL;
603  pr.psiSum = NULL;
604  pr.bornRad = NULL;
605  pr.dEdaSum = NULL;
606  pr.dHdrPrefix = NULL;
607  }
608  pr.refCount += 1;
609 }
610 
612 
614  int npatches = master->activePatches.size();
615  int *pids = master->activePatches.begin();
616  patch_record *recs = master->patchRecords.begin();
617  for ( int i=0; i<npatches; ++i ) {
618  int pid = pids[i];
619  patch_record &pr = recs[pid];
620  if ( pr.hostPe == CkMyPe() ) {
621  pr.slave = this;
622  pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
623  hostedPatches.add(pid);
624  if ( pr.isLocal ) {
625  localHostedPatches.add(pid);
626  } else {
 627  remoteHostedPatches.add(pid);
 628  }
630  pr.p = patchMap->patch(pid);
631  pr.positionBox = pr.p->registerPositionPickup(this);
632  pr.forceBox = pr.p->registerForceDeposit(this);
633  if (simParams->GBISOn) {
634  pr.intRadBox = pr.p->registerIntRadPickup(this);
635  pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
636  pr.bornRadBox = pr.p->registerBornRadPickup(this);
637  pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
639  }
640  }
641  }
642  if ( master == this ) setNumPatches(activePatches.size());
644  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
645  CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
646 }
647 
649  bool operator() (int pidj, int pidi) { // i and j reversed
650  int ppi = PATCH_PRIORITY(pidi);
651  int ppj = PATCH_PRIORITY(pidj);
652  if ( ppi != ppj ) return ppi < ppj;
653  return pidi < pidj;
654  }
655 };
656 
658 
659  int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
660  int numPesOnNodeSharingDevice = 0;
661  int masterIndex = -1;
662  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
663  int pe = deviceCUDA->getPesSharingDevice(i);
664  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
665  if ( CkNodeOf(pe) == CkMyNode() ) {
666  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
667  }
668  }
669 
670  int npatches = activePatches.size();
671 
672  if ( npatches ) {
674  }
675 
676  // calculate priority rank of local home patch within pe
677  {
678  ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
679  for ( int i=0; i<npatches; ++i ) {
680  int pid = activePatches[i];
681  int homePe = patchMap->node(pid);
682  if ( CkNodeOf(homePe) == CkMyNode() ) {
683  homePatchByRank[CkRankOf(homePe)].add(pid);
684  }
685  }
686  for ( int i=0; i<CkMyNodeSize(); ++i ) {
688  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
689  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
690  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
691  int pid = homePatchByRank[i][j];
692  patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
693  }
694  }
695  }
696 
697  int *count = new int[npatches];
698  memset(count, 0, sizeof(int)*npatches);
699  int *pcount = new int[numPesOnNodeSharingDevice];
700  memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
701  int *rankpcount = new int[CkMyNodeSize()];
702  memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
703  char *table = new char[npatches*numPesOnNodeSharingDevice];
704  memset(table, 0, npatches*numPesOnNodeSharingDevice);
705 
706  int unassignedpatches = npatches;
707 
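  // Greedy assignment of each active patch to a host PE on this node:
  // first the patch's home PE (or home rank if it does not share the device),
  // then any PE that is the only one holding a proxy for the patch, and
  // finally round-robin over the device-sharing PEs so no patch is left
  // unassigned.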
708  if ( 0 ) { // assign all to device pe
709  for ( int i=0; i<npatches; ++i ) {
710  int pid = activePatches[i];
711  patch_record &pr = patchRecords[pid];
712  pr.hostPe = CkMyPe();
713  }
714  unassignedpatches = 0;
715  pcount[masterIndex] = npatches;
716  } else
717 
718  // assign if home pe and build table of natural proxies
719  for ( int i=0; i<npatches; ++i ) {
720  int pid = activePatches[i];
721  patch_record &pr = patchRecords[pid];
722  int homePe = patchMap->node(pid);
723  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
724  int pe = pesOnNodeSharingDevice[j];
725  if ( pe == homePe ) {
726  pr.hostPe = pe; --unassignedpatches;
727  pcount[j] += 1;
728  }
729  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
730  table[i*numPesOnNodeSharingDevice+j] = 1;
731  }
732  }
733  if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
734  pr.hostPe = homePe; --unassignedpatches;
735  rankpcount[CkRankOf(homePe)] += 1;
736  }
737  }
738  // assign if only one pe has a required proxy
739  int assignj = 0;
740  for ( int i=0; i<npatches; ++i ) {
741  int pid = activePatches[i];
742  patch_record &pr = patchRecords[pid];
743  if ( pr.hostPe != -1 ) continue;
744  int c = 0;
745  int lastj;
746  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
747  if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
748  }
749  count[i] = c;
750  if ( c == 1 ) {
751  pr.hostPe = pesOnNodeSharingDevice[lastj];
752  --unassignedpatches;
753  pcount[lastj] += 1;
754  }
755  }
756  while ( unassignedpatches ) {
757  int i;
758  for ( i=0; i<npatches; ++i ) {
759  if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
760  int pid = activePatches[i];
761  patch_record &pr = patchRecords[pid];
762  if ( pr.hostPe != -1 ) continue;
763  pr.hostPe = pesOnNodeSharingDevice[assignj];
764  --unassignedpatches;
765  pcount[assignj] += 1;
766  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
767  break;
768  }
769  if ( i<npatches ) continue; // start search again
770  for ( i=0; i<npatches; ++i ) {
771  int pid = activePatches[i];
772  patch_record &pr = patchRecords[pid];
773  if ( pr.hostPe != -1 ) continue;
774  if ( count[i] ) continue;
775  pr.hostPe = pesOnNodeSharingDevice[assignj];
776  --unassignedpatches;
777  pcount[assignj] += 1;
778  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
779  break;
780  }
781  if ( i<npatches ) continue; // start search again
782  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
783  }
784 
785  for ( int i=0; i<npatches; ++i ) {
786  int pid = activePatches[i];
787  patch_record &pr = patchRecords[pid];
788  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
789  }
790 
791  slavePes = new int[CkMyNodeSize()];
792  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
793  numSlaves = 0;
794  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
795  int pe = pesOnNodeSharingDevice[j];
796  int rank = pe - CkNodeFirst(CkMyNode());
797  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
798  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
799  if ( pe == CkMyPe() ) continue;
800  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
801  rankpcount[rank] = 0; // skip in rank loop below
802  slavePes[numSlaves] = pe;
804  ++numSlaves;
805  }
806  for ( int j=0; j<CkMyNodeSize(); ++j ) {
807  int pe = CkNodeFirst(CkMyNode()) + j;
808  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
809  // CkMyPe(),j,pe,rankpcount[j]);
810  if ( ! rankpcount[j] ) continue;
811  if ( pe == CkMyPe() ) continue;
812  slavePes[numSlaves] = pe;
814  ++numSlaves;
815  }
816  registerPatches();
817 
818  delete [] pesOnNodeSharingDevice;
819  delete [] count;
820  delete [] pcount;
821  delete [] rankpcount;
822  delete [] table;
823 }
824 
825 static __thread int atom_params_size;
826 static __thread atom_param* atom_params;
827 
828 static __thread int vdw_types_size;
829 static __thread int* vdw_types;
830 
831 static __thread int dummy_size;
832 static __thread float* dummy_dev;
833 
834 static __thread int force_ready_queue_size;
835 static __thread int *force_ready_queue;
836 static __thread int force_ready_queue_len;
837 static __thread int force_ready_queue_next;
838 
839 static __thread int block_order_size;
840 static __thread int *block_order;
841 
842 static __thread int num_atoms;
843 static __thread int num_local_atoms;
844 static __thread int num_remote_atoms;
845 
846 static __thread int virials_size;
847 static __thread float *virials;
848 static __thread int num_virials;
 849 // NOTE: slow_virials is a computed pointer from "virials" - do not deallocate
850 static __thread float *slow_virials;
851 
852 static __thread int energy_gbis_size;
853 static __thread float *energy_gbis;
854 
855 //GBIS host pointers
856 static __thread int intRad0H_size;
857 static __thread float *intRad0H;
858 static __thread int intRadSH_size;
859 static __thread float *intRadSH;
860 static __thread int bornRadH_size;
861 static __thread float *bornRadH;
862 static __thread int dHdrPrefixH_size;
863 static __thread float *dHdrPrefixH;
864 
865 static __thread int cuda_timer_count;
866 static __thread double cuda_timer_total;
867 static __thread double kernel_time;
868 static __thread double remote_submit_time;
869 static __thread double local_submit_time;
870 
871 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
872 
873 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime); // fix Charm++
874 
875 #ifdef PRINT_GBIS
876 #define GBISP(...) CkPrintf(__VA_ARGS__);
877 #else
878 #define GBISP(...)
879 #endif
880 
881 #define count_limit 1000000
882 static __thread int check_count;
883 static __thread int check_remote_count;
884 static __thread int check_local_count;
885 
886 void init_arrays() {
887 
888  atom_params_size = 0;
889  atom_params = NULL;
890 
891  vdw_types_size = 0;
892  vdw_types = NULL;
893 
894  dummy_size = 0;
895  dummy_dev = NULL;
896 
898  force_ready_queue = NULL;
901 
902  block_order_size = 0;
903  block_order = NULL;
904 
905  num_atoms = 0;
906  num_local_atoms = 0;
907  num_remote_atoms = 0;
908 
909  virials_size = 0;
910  virials = NULL;
911  num_virials = 0;
912 
913  energy_gbis_size = 0;
914  energy_gbis = NULL;
915 
916  intRad0H_size = 0;
917  intRad0H = NULL;
918  intRadSH_size = 0;
919  intRadSH = NULL;
920  bornRadH_size = 0;
921  bornRadH = NULL;
922  dHdrPrefixH_size = 0;
923  dHdrPrefixH = NULL;
924 
925 }
926 
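// Polled via CUDA_POLL from the Charm++ scheduler: the streaming kernels
// write the index of each finished patch into the mapped host array
// force_ready_queue (initialized to -1), and this callback drains the queue,
// sending a finish message per patch and closing out the step once
// force_ready_queue_len entries have appeared.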
927 void cuda_check_progress(void *arg, double walltime) {
929 
930  int flindex;
931  int poll_again = 1;
932  while ( -1 != (flindex = force_ready_queue[force_ready_queue_next]) ) {
933  // CkPrintf("Pe %d forces ready %d is index %d at %lf\n",
934  // CkMyPe(), force_ready_queue_next, flindex, walltime);
937  check_count = 0;
938  if ( force_ready_queue_next == force_ready_queue_len ) {
939  poll_again = 0;
940  CUDA_TRACE_LOCAL(kernel_time,walltime);
941  kernel_time = walltime - kernel_time;
942  // need to guarantee this finishes before the last patch message!
943  ((ComputeNonbondedCUDA *) arg)->workStarted = 0;
944  ((ComputeNonbondedCUDA *) arg)->finishReductions();
945  }
946  ((ComputeNonbondedCUDA *) arg)->messageFinishPatch(flindex);
947  if ( force_ready_queue_next == force_ready_queue_len ) break;
948  }
949  if ( ++check_count >= count_limit ) {
950  char errmsg[256];
951  sprintf(errmsg,"cuda_check_progress polled %d times over %f s on step %d",
952  check_count, walltime - remote_submit_time,
953  ((ComputeNonbondedCUDA *) arg)->step);
954  cudaDie(errmsg,cudaSuccess);
955  }
956  if ( poll_again ) {
957  CcdCallBacksReset(0,walltime); // fix Charm++
959  }
960 }
961 
962 void cuda_check_remote_progress(void *arg, double walltime) {
963 
965  cudaError_t err = cudaEventQuery(end_remote_download);
966  if ( err == cudaSuccess ) {
967  local_submit_time = walltime;
969  if ( deviceCUDA->getMergeGrids() ) { // no local
971  }
972  check_remote_count = 0;
973  cuda_errcheck("at cuda remote stream completed");
975  } else if ( err != cudaErrorNotReady ) {
976  char errmsg[256];
977  sprintf(errmsg,"in cuda_check_remote_progress after polling %d times over %f s on step %d",
979  ((ComputeNonbondedCUDA *) arg)->step);
980  cudaDie(errmsg,err);
981  } else if ( ++check_remote_count >= count_limit ) {
982  char errmsg[256];
983  sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
985  ((ComputeNonbondedCUDA *) arg)->step);
986  cudaDie(errmsg,err);
987  } else if ( check_local_count ) {
988  NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
989  } else {
990  CcdCallBacksReset(0,walltime); // fix Charm++
992  }
993 }
994 
995 void cuda_check_local_progress(void *arg, double walltime) {
996 
998  cudaError_t err = cudaEventQuery(end_local_download);
999  if ( err == cudaSuccess ) {
1001  kernel_time = walltime - kernel_time;
1002  check_local_count = 0;
1003  cuda_errcheck("at cuda local stream completed");
1005  } else if ( err != cudaErrorNotReady ) {
1006  char errmsg[256];
1007  sprintf(errmsg,"in cuda_check_local_progress after polling %d times over %f s on step %d",
1009  ((ComputeNonbondedCUDA *) arg)->step);
1010  cudaDie(errmsg,err);
1011  } else if ( ++check_local_count >= count_limit ) {
1012  char errmsg[256];
1013  sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
1015  ((ComputeNonbondedCUDA *) arg)->step);
1016  cudaDie(errmsg,err);
1017  } else if ( check_remote_count ) {
1018  NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
1019  } else {
1020  CcdCallBacksReset(0,walltime); // fix Charm++
1022  }
1023 }
1024 
1025 #if 0
1026 // don't use this one unless timer is part of stream, above is better
1027 void cuda_check_progress(void *arg, double walltime) {
1028  if ( cuda_stream_finished() ) {
1029  kernel_time = walltime - kernel_time;
1030  CcdCallBacksReset(0,walltime); // fix Charm++
1031  CUDA_POLL(ccd_index);
1032  // ((ComputeNonbondedCUDA *) arg)->finishWork();
1034  }
1035 }
1036 #endif
1037 
1039  //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
1040  atomsChanged = 1;
1041 }
1042 
1043 static __thread int kernel_launch_state = 0;
1044 
1046  const Lattice &l;
1047  cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
1050  Vector a = l.a();
1051  Vector b = l.b();
1052  Vector c = l.c();
1053  BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
1054  BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
1055  return ( ri < rj );
1056  }
1057 };
1058 
1063  const ComputeNonbondedCUDA::patch_record *patchrecs) : distop(sod), pr(patchrecs) { }
1064  bool pid_compare_priority(int pidi, int pidj) {
1065  const ComputeNonbondedCUDA::patch_record &pri = pr[pidi];
1066  const ComputeNonbondedCUDA::patch_record &prj = pr[pidj];
1067  if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
1068  if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
1069  if ( pri.isSameNode && ! prj.isSameNode ) return 0;
1070  if ( prj.isSameNode && ! pri.isSameNode ) return 1;
1071  if ( pri.isSameNode ) { // and prj.isSameNode
1072  int rpri = pri.reversePriorityRankInPe;
1073  int rprj = prj.reversePriorityRankInPe;
1074  if ( rpri != rprj ) return rpri > rprj;
1075  return sortop_bitreverse(CkRankOf(pri.hostPe),CkRankOf(prj.hostPe));
1076  }
1077  int ppi = PATCH_PRIORITY(pidi);
1078  int ppj = PATCH_PRIORITY(pidj);
1079  if ( ppi != ppj ) return ppi < ppj;
1080  return pidi < pidj;
1081  }
1083  ComputeNonbondedCUDA::compute_record i) { // i and j reversed
1084  int pidi = pid_compare_priority(i.pid[0],i.pid[1]) ? i.pid[0] : i.pid[1];
1085  int pidj = pid_compare_priority(j.pid[0],j.pid[1]) ? j.pid[0] : j.pid[1];
1086  if ( pidi != pidj ) return pid_compare_priority(pidi, pidj);
1087  return distop(i,j);
1088  }
1089 };
1090 
1092  //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
1094  for ( int i=0; i<hostedPatches.size(); ++i ) {
1096  pr.positionBox->skip();
1097  pr.forceBox->skip();
1098  if (simParams->GBISOn) {
1099  pr.intRadBox->skip();
1100  pr.psiSumBox->skip();
1101  pr.bornRadBox->skip();
1102  pr.dEdaSumBox->skip();
1103  pr.dHdrPrefixBox->skip();
1104  }
1105  }
1106 }
1107 
1109 
1111  Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
1112  lattice = flags.lattice;
1113  doSlow = flags.doFullElectrostatics;
1114  doEnergy = flags.doEnergy;
1115  step = flags.step;
1116 
1117  if ( ! flags.doNonbonded ) {
1118  GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1119  if ( master != this ) {
1122  } else {
1123  for ( int i = 0; i < numSlaves; ++i ) {
1125  }
1126  skip();
1127  }
1128  if ( reduction ) {
1129  reduction->submit();
1130  }
1131 
1132  return 1;
1133  }
1134 
1135  for ( int i=0; i<hostedPatches.size(); ++i ) {
1137  if (!simParams->GBISOn || gbisPhase == 1) {
1138  GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1139  pr.x = pr.positionBox->open();
1140  pr.xExt = pr.p->getCompAtomExtInfo();
1141  }
1142 
1143  if (simParams->GBISOn) {
1144  if (gbisPhase == 1) {
1145  GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
1146  pr.intRad = pr.intRadBox->open();
1147  pr.psiSum = pr.psiSumBox->open();
1148  } else if (gbisPhase == 2) {
1149  GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
1150  pr.bornRad = pr.bornRadBox->open();
1151  pr.dEdaSum = pr.dEdaSumBox->open();
1152  } else if (gbisPhase == 3) {
1153  GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
1154  pr.dHdrPrefix = pr.dHdrPrefixBox->open();
1155  }
1156  GBISP("opened GBIS boxes");
1157  }
1158  }
1159 
1160  if ( master == this ) return 0; //work to do, enqueue as usual
1161 
1162  // message masterPe
1165  atomsChanged = 0;
1166 
1167  workStarted = 1;
1168 
1169  return 1;
1170 }
1171 
1173 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
1174 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
1175 
1176  // Set GPU device ID
1177  cudaCheck(cudaSetDevice(deviceID));
1178 
1180  ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
1181 
1182  if ( workStarted ) { //if work already started, check if finished
1183  if ( finishWork() ) { // finished
1184  workStarted = 0;
1185  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
1186  } else { // need to call again
1187  workStarted = 2;
1188  basePriority = PROXY_RESULTS_PRIORITY; // lower for local
1189  if ( master == this && kernel_launch_state > 2 ) {
1190  cuda_check_local_progress(this,0.); // launches polling
1191  }
1192  }
1193  return;
1194  }
1195 
1196  workStarted = 1;
1198 
1200  Parameters *params = Node::Object()->parameters;
1202 
1203  //execute only during GBIS phase 1, or if not using GBIS
1204  if (!simParams->GBISOn || gbisPhase == 1) {
1205 
1206  //bind new patches to GPU
1207  if ( atomsChanged || computesChanged ) {
1208  int npatches = activePatches.size();
1209 
1210  pairlistsValid = 0;
1211  pairlistTolerance = 0.;
1212 
1213  if ( computesChanged ) {
1214  computesChanged = 0;
1215 
1216  if ( ! dummy_size ) {
1217  dummy_size = 1024*1024;
1218  cudaMalloc((void**)&dummy_dev,dummy_size);
1219  cuda_errcheck("in cudaMalloc dummy_dev");
1220  }
1221 
1222  bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
1223  1.2f, cudaHostAllocMapped);
1224  if (did_realloc) {
1225  for (int k=0; k < force_ready_queue_size; ++k)
1226  force_ready_queue[k] = -1;
1227  }
1228  force_ready_queue_len = npatches;
1229  reallocate_host<int>(&block_order, &block_order_size,
1230  2*(localComputeRecords.size()+remoteComputeRecords.size()),
1231  1.2f, cudaHostAllocMapped);
1232 
1233  num_virials = npatches;
1234  reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
1235  1.2f, cudaHostAllocMapped);
1236  slow_virials = virials + 16*num_virials;
1237 
1238  reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
1239  1.2f, cudaHostAllocMapped);
1240  for (int i = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
1241 
1242  int *ap = activePatches.begin();
1243  for ( int i=0; i<localActivePatches.size(); ++i ) {
1244  *(ap++) = localActivePatches[i];
1245  }
1246  for ( int i=0; i<remoteActivePatches.size(); ++i ) {
1247  *(ap++) = remoteActivePatches[i];
1248  }
1249 
1250  // sort computes by distance between patches
1252  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
1253  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
1254 
1255  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1256 
1257  if ( streaming ) {
1258  // iout << "Reverse-priority sorting...\n" << endi;
1259  cr_sortop_reverse_priority sorp(so,patchRecords.begin());
1260  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
1261  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
1262  patchPairsReordered = 0;
1263  //patchPairsReordered = 1;
1264  // int len = remoteComputeRecords.size();
1265  // for ( int i=0; i<len; ++i ) {
1266  // iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
1267  // }
1268  // int len2 = localComputeRecords.size();
1269  // for ( int i=0; i<len2; ++i ) {
1270  // iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
1271  // }
1272  // iout << endi;
1273  } else {
1274  patchPairsReordered = 1;
1275  }
1276 
1277  int nlc = localComputeRecords.size();
1278  int nrc = remoteComputeRecords.size();
1279  computeRecords.resize(nlc+nrc);
1280  compute_record *cr = computeRecords.begin();
1281  for ( int i=0; i<nrc; ++i ) {
1282  *(cr++) = remoteComputeRecords[i];
1283  }
1284  for ( int i=0; i<nlc; ++i ) {
1285  *(cr++) = localComputeRecords[i];
1286  }
1287 
1288  // patch_pair_num[i] = number of patch pairs that involve patch i
1289  patch_pair_num.resize(npatches);
1290  for ( int i=0; i<npatches; ++i ) {
1291  patchRecords[activePatches[i]].localIndex = i;
1292  patch_pair_num[i] = 0;
1293  }
1294 
1295  int ncomputes = computeRecords.size();
1296  patch_pairs.resize(ncomputes);
1297  for ( int i=0; i<ncomputes; ++i ) {
1299  int lp1 = patchRecords[cr.pid[0]].localIndex;
1300  int lp2 = patchRecords[cr.pid[1]].localIndex;
1301  patch_pair_num[lp1]++;
1302  if (lp1 != lp2) patch_pair_num[lp2]++;
1303  patch_pair &pp = patch_pairs[i];
1304  pp.offset.x = cr.offset.x;
1305  pp.offset.y = cr.offset.y;
1306  pp.offset.z = cr.offset.z;
1307  }
1308 
1309  for ( int i=0; i<ncomputes; ++i ) {
1311  int lp1 = patchRecords[cr.pid[0]].localIndex;
1312  int lp2 = patchRecords[cr.pid[1]].localIndex;
1313  patch_pair &pp = patch_pairs[i];
1314  pp.patch1_ind = lp1;
1315  pp.patch2_ind = lp2;
1316  pp.patch1_num_pairs = patch_pair_num[lp1];
1317  pp.patch2_num_pairs = patch_pair_num[lp2];
1318  }
1319 
1320  if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
1321  CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1323  localComputeRecords.size(), remoteComputeRecords.size());
1324  }
1325  } // computesChanged
1326  else if ( ! patchPairsReordered ) {
1327  patchPairsReordered = 1;
1328  int len = patch_pairs.size();
1329  int nlc = localComputeRecords.size();
1330  int nrc = remoteComputeRecords.size();
1331  if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
1332  ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
1333  ResizeArray<patch_pair> new_patch_pairs(len);
1334  int irc=nrc;
1335  int ilc=len;
1336  for ( int i=0; i<len; ++i ) {
1337  int boi = block_order[i];
1338  int dest;
1339  if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
1340  new_computeRecords[dest] = computeRecords[boi];
1341  new_patch_pairs[dest] = patch_pairs[boi];
1342  }
1343  if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
1344  computeRecords.swap(new_computeRecords);
1345  patch_pairs.swap(new_patch_pairs);
1346  }
1347 
1348  int istart = 0;
1349  int i;
1350  for ( i=0; i<npatches; ++i ) {
1351  if ( i == localActivePatches.size() ) {
1352  num_local_atoms = istart;
1353  }
1355  pr.localStart = istart;
1356  int natoms = pr.p->getNumAtoms();
1357  int nfreeatoms = natoms;
1358  if ( fixedAtomsOn ) {
1359  const CompAtomExt *aExt = pr.xExt;
1360  for ( int j=0; j<natoms; ++j ) {
1361  if ( aExt[j].atomFixed ) --nfreeatoms;
1362  }
1363  }
1364  pr.numAtoms = natoms;
1365  pr.numFreeAtoms = nfreeatoms;
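  // Advance istart past this patch and pad its slot to a 16-atom boundary
  // (note that a patch whose atom count is already a multiple of 16 still
  // receives a full 16-atom pad, as written), keeping per-patch offsets
  // aligned for the atom/atom_param/force arrays below.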
1366  istart += natoms;
1367  istart += 16 - (natoms & 15);
1368  }
1369  if ( i == localActivePatches.size() ) {
1370  num_local_atoms = istart;
1371  }
1372  num_atoms = istart;
1374  reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
1375  reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
1376  reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
1377  reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1378  reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1379  if (simParams->GBISOn) {
1380  reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
1381  reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
1382  reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1383  reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
1384  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1385  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
1386  }
1387 
1388  int bfstart = 0;
1389  int exclmask_start = 0;
1390  int ncomputes = computeRecords.size();
1391  for ( int i=0; i<ncomputes; ++i ) {
1393  int p1 = cr.pid[0];
1394  int p2 = cr.pid[1];
1395  patch_pair &pp = patch_pairs[i];
1396  pp.patch1_start = patchRecords[p1].localStart;
1397  pp.patch1_size = patchRecords[p1].numAtoms;
1398  pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
1399  pp.patch2_start = patchRecords[p2].localStart;
1400  pp.patch2_size = patchRecords[p2].numAtoms;
1401  pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
1402  pp.plist_start = bfstart;
 1403  // size1*size2 = number of WARPSIZE-atom tile pairs within this patch pair
1404  int size1 = (pp.patch1_size-1)/WARPSIZE+1;
1405  int size2 = (pp.patch2_size-1)/WARPSIZE+1;
1406  pp.plist_size = (size1*size2-1)/32+1;
1407  bfstart += pp.plist_size;
1408  pp.exclmask_start = exclmask_start;
1409  exclmask_start += size1*size2;
1410  } //for ncomputes
1411 
1412  cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
1413  activePatches.size(), num_atoms, bfstart,
1414  exclmask_start);
1415 
1416  } // atomsChanged || computesChanged
1417 
1418  double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
1419 
1420  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1421  float maxAtomMovement = 0.;
1422  float maxPatchTolerance = 0.;
1423 
1424  for ( int i=0; i<activePatches.size(); ++i ) {
1426 
1427  float maxMove = pr.p->flags.maxAtomMovement;
1428  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1429 
1430  float maxTol = pr.p->flags.pairlistTolerance;
1431  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1432 
1433  int start = pr.localStart;
1434  int n = pr.numAtoms;
1435  const CompAtom *a = pr.x;
1436  const CompAtomExt *aExt = pr.xExt;
1437  if ( atomsChanged ) {
1438 
1439  atom_param *ap = atom_params + start;
1440  for ( int k=0; k<n; ++k ) {
1441  int j = aExt[k].sortOrder;
1442  ap[k].vdw_type = a[j].vdwType;
1443  vdw_types[start + k] = a[j].vdwType;
1444  ap[k].index = aExt[j].id;
1445 #ifdef MEM_OPT_VERSION
1446  ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1447  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1448 #else // ! MEM_OPT_VERSION
1449  ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
1450  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
1451 #endif // MEM_OPT_VERSION
1452  }
1453  }
1454  {
1455 #if 1
1456  const CudaAtom *ac = pr.p->getCudaAtomList();
1457  CudaAtom *ap = atoms + start;
1458  memcpy(ap, ac, sizeof(atom)*n);
1459 #else
1460  Vector center =
1462  atom *ap = atoms + start;
1463  for ( int k=0; k<n; ++k ) {
1464  int j = aExt[k].sortOrder;
1465  ap[k].position.x = a[j].position.x - center.x;
1466  ap[k].position.y = a[j].position.y - center.y;
1467  ap[k].position.z = a[j].position.z - center.z;
1468  ap[k].charge = charge_scaling * a[j].charge;
1469  }
1470 #endif
1471  }
1472  }
1473 
1474  savePairlists = 0;
1475  usePairlists = 0;
1476  if ( flags.savePairlists ) {
1477  savePairlists = 1;
1478  usePairlists = 1;
1479  } else if ( flags.usePairlists ) {
1480  if ( ! pairlistsValid ||
1481  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1483  } else {
1484  usePairlists = 1;
1485  }
1486  }
1487  if ( ! usePairlists ) {
1488  pairlistsValid = 0;
1489  }
1490  float plcutoff = cutoff;
1491  if ( savePairlists ) {
1492  pairlistsValid = 1;
1493  pairlistTolerance = 2. * maxPatchTolerance;
1494  plcutoff += pairlistTolerance;
1495  }
1496  plcutoff2 = plcutoff * plcutoff;
1497 
1498  //CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1499  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1500 
1501 #if 0
1502  // calculate warp divergence
1503  if ( 1 ) {
1504  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1505  Lattice &lattice = flags.lattice;
1506  float3 lata, latb, latc;
1507  lata.x = lattice.a().x;
1508  lata.y = lattice.a().y;
1509  lata.z = lattice.a().z;
1510  latb.x = lattice.b().x;
1511  latb.y = lattice.b().y;
1512  latb.z = lattice.b().z;
1513  latc.x = lattice.c().x;
1514  latc.y = lattice.c().y;
1515  latc.z = lattice.c().z;
1516 
1517  int ncomputes = computeRecords.size();
1518  for ( int ic=0; ic<ncomputes; ++ic ) {
1519  patch_pair &pp = patch_pairs[ic];
1520  atom *a1 = atoms + pp.patch1_atom_start;
1521  int n1 = pp.patch1_size;
1522  atom *a2 = atoms + pp.patch2_atom_start;
1523  int n2 = pp.patch2_size;
1524  float offx = pp.offset.x * lata.x
1525  + pp.offset.y * latb.x
1526  + pp.offset.z * latc.x;
1527  float offy = pp.offset.x * lata.y
1528  + pp.offset.y * latb.y
1529  + pp.offset.z * latc.y;
1530  float offz = pp.offset.x * lata.z
1531  + pp.offset.y * latb.z
1532  + pp.offset.z * latc.z;
1533  // CkPrintf("%f %f %f\n", offx, offy, offz);
1534  int atoms_tried = 0;
1535  int blocks_tried = 0;
1536  int atoms_used = 0;
1537  int blocks_used = 0;
1538  for ( int ii=0; ii<n1; ii+=32 ) { // warps
1539  for ( int jj=0; jj<n2; jj+=16 ) { // shared atom loads
1540  int block_used = 0;
1541  for ( int j=jj; j<jj+16 && j<n2; ++j ) { // shared atoms
1542  int atom_used = 0;
1543  for ( int i=ii; i<ii+32 && i<n1; ++i ) { // threads
1544  float dx = offx + a1[i].position.x - a2[j].position.x;
1545  float dy = offy + a1[i].position.y - a2[j].position.y;
1546  float dz = offz + a1[i].position.z - a2[j].position.z;
1547  float r2 = dx*dx + dy*dy + dz*dz;
1548  if ( r2 < cutoff2 ) atom_used = 1;
1549  }
1550  ++atoms_tried;
1551  if ( atom_used ) { block_used = 1; ++atoms_used; }
1552  }
1553  ++blocks_tried;
1554  if ( block_used ) { ++blocks_used; }
1555  }
1556  }
1557  CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
1558  blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
1559  atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
1560  }
1561  }
1562 #endif
1563 
1564  } // !GBISOn || gbisPhase == 1
1565 
1566  //Do GBIS
1567  if (simParams->GBISOn) {
1568  //open GBIS boxes depending on phase
1569  for ( int i=0; i<activePatches.size(); ++i ) {
1571  GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
1572  if (gbisPhase == 1) {
1573  //Copy GBIS intRadius to Host
1574  if (atomsChanged) {
1575  float *intRad0 = intRad0H + pr.localStart;
1576  float *intRadS = intRadSH + pr.localStart;
1577  for ( int k=0; k<pr.numAtoms; ++k ) {
1578  int j = pr.xExt[k].sortOrder;
1579  intRad0[k] = pr.intRad[2*j+0];
1580  intRadS[k] = pr.intRad[2*j+1];
1581  }
1582  }
1583  } else if (gbisPhase == 2) {
1584  float *bornRad = bornRadH + pr.localStart;
1585  for ( int k=0; k<pr.numAtoms; ++k ) {
1586  int j = pr.xExt[k].sortOrder;
1587  bornRad[k] = pr.bornRad[j];
1588  }
1589  } else if (gbisPhase == 3) {
1590  float *dHdrPrefix = dHdrPrefixH + pr.localStart;
1591  for ( int k=0; k<pr.numAtoms; ++k ) {
1592  int j = pr.xExt[k].sortOrder;
1593  dHdrPrefix[k] = pr.dHdrPrefix[j];
1594  }
1595  } // end phases
1596  } // end for patches
1597  } // if GBISOn
1598 
1599  kernel_time = CkWallTimer();
1600  kernel_launch_state = 1;
1601  //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
1602  if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
1603 }
1604 
1605 void cuda_check_remote_calc(void *arg, double walltime) {
1606  // in theory we only need end_remote_calc, but overlap isn't reliable
1607  // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
1608  if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
1609 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
1611 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
1612  } else {
1613  CcdCallBacksReset(0,walltime); // fix Charm++
1615  }
1616 }
1617 
1618 void cuda_check_local_calc(void *arg, double walltime) {
1619  // in theory we only need end_local_calc, but overlap isn't reliable
1620  // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
1621  if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
1622 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
1624 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
1625  } else {
1626  CcdCallBacksReset(0,walltime); // fix Charm++
1628  }
1629 }
1630 
1631 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
1632 
1634 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
1635 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
1636 workStarted, gbisPhase, kernel_launch_state, pe)
1637 
1638  float3 lata, latb, latc;
1639  lata.x = lattice.a().x;
1640  lata.y = lattice.a().y;
1641  lata.z = lattice.a().z;
1642  latb.x = lattice.b().x;
1643  latb.y = lattice.b().y;
1644  latb.z = lattice.b().z;
1645  latc.x = lattice.c().x;
1646  latc.y = lattice.c().y;
1647  latc.z = lattice.c().z;
1649 
1650  // Set GPU device ID
1651  cudaSetDevice(deviceID);
1652 
1653  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1654 
1655  double walltime;
1656  if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
1657  walltime = CkWallTimer();
1658  CcdCallBacksReset(0,walltime); // fix Charm++
1659  }
1660 
1661  switch ( kernel_launch_state ) {
1663 // Remote
1664  case 1:
1665 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
1667  //gpu_is_mine = 0;
1669  remote_submit_time = walltime;
1670 
1671  if (!simParams->GBISOn || gbisPhase == 1) {
1672  // cudaEventRecord(start_upload, stream);
1673  if ( atomsChanged ) {
1676  }
1677  if ( simParams->GBISOn) {
1679  if ( atomsChanged ) {
1681  }
1682  }
1683  atomsChanged = 0;
1684  cuda_bind_atoms((const atom *)atoms);
1687  if ( simParams->GBISOn) {
1689  }
1690  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1691  //call CUDA Kernels
1692 
1693  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1694  0,remoteComputeRecords.size(),
1697  if (simParams->GBISOn) {
1698  cuda_GBIS_P1(
1699  0,remoteComputeRecords.size(),
1701  simParams->alpha_cutoff-simParams->fsMax,
1702  simParams->coulomb_radius_offset,
1703  lata, latb, latc, stream);
1704  }
1705  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1706  // num_local_atom_records,num_remote_atom_records);
1707  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1708  cudaEventRecord(end_remote_download, stream);
1709  }
1710  if ( streaming ) {
1713  } else {
1715  }
1716  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1718  break;
1719  }
1720  } // !GBIS or gbisPhase==1
1721  if (simParams->GBISOn) {
1722  if (gbisPhase == 1) {
1723  //GBIS P1 Kernel launched in previous code block
1724  } else if (gbisPhase == 2) {
1725 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
1726  // cudaEventRecord(start_upload, stream);
1729  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1730  cuda_GBIS_P2(
1731  0,remoteComputeRecords.size(),
1733  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1734  simParams->nonbondedScaling, simParams->kappa,
1735  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1736  simParams->dielectric, simParams->solvent_dielectric,
1737  lata, latb, latc,
1739  );
1740  cudaEventRecord(end_remote_download, stream);
1742  } else if (gbisPhase == 3) {
1743 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
1744  // cudaEventRecord(start_upload, stream);
1746  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1747  if (doSlow)
1748  cuda_GBIS_P3(
1749  0,remoteComputeRecords.size(),
1751  (simParams->alpha_cutoff-simParams->fsMax),
1752  simParams->coulomb_radius_offset,
1753  simParams->nonbondedScaling,
1754  lata, latb, latc, stream
1755  );
1756  cudaEventRecord(end_remote_download, stream);
1758  }
1759  }
1760 
1762 // Local
1763  case 2:
1764 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
1766  //gpu_is_mine = 0;
1768 
1769  if ( stream2 != stream ) {
1770  // needed to ensure that upload is finished
1771  cudaStreamWaitEvent(stream2, start_calc, 0);
1772  // priorities do not prevent local from launching ahead
1773  // of remote, so delay local stream with a small memset
1774  cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
1775  }
1776 
1777  if (!simParams->GBISOn || gbisPhase == 1) {
1778 
1779  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1783  if (simParams->GBISOn) {
1784  cuda_GBIS_P1(
1787  simParams->alpha_cutoff-simParams->fsMax,
1788  simParams->coulomb_radius_offset,
1789  lata, latb, latc, stream2 );
1790  }
1791  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1792  // 0,num_local_atom_records);
1793  //cuda_load_virials(virials, doSlow); // slow_virials follows virials
1794  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1795  cudaEventRecord(end_local_download, stream2);
1796  }
1797  if ( ! streaming && workStarted == 2 ) {
1798  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1799 cuda_check_local_progress\n", CkMyPe())
1801  }
1802  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1803  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1804 cuda_check_local_calc\n", CkMyPe())
1806  break;
1807  }
1808 
1809  } // !GBIS or gbisPhase==1
1810  if (simParams->GBISOn) {
1811  if (gbisPhase == 1) {
1812  //GBIS P1 Kernel launched in previous code block
1813  } else if (gbisPhase == 2) {
1814 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
1815  cuda_GBIS_P2(
1818  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1819  simParams->nonbondedScaling, simParams->kappa,
1820  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1821  simParams->dielectric, simParams->solvent_dielectric,
1822  lata, latb, latc,
1824  );
1825  cudaEventRecord(end_local_download, stream2);
1826  if ( workStarted == 2 ) {
1828  }
1829  } else if (gbisPhase == 3) {
1830 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
1831  if (doSlow)
1832  cuda_GBIS_P3(
1835  (simParams->alpha_cutoff-simParams->fsMax),
1836  simParams->coulomb_radius_offset,
1837  simParams->nonbondedScaling,
1838  lata, latb, latc, stream2
1839  );
1840  cudaEventRecord(end_local_download, stream2);
1841  if ( workStarted == 2 ) {
1843  }
1844  } // phases
1845  } // GBISOn
1846  if ( simParams->PMEOn && simParams->PMEOffload && !simParams->usePMECUDA) break;
1847 
1848  default:
1849 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
1850  //gpu_is_mine = 1;
1852  break;
1853  } // switch
1854 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
1855 }
1856 
1858  int pid = activePatches[flindex];
1859  patch_record &pr = patchRecords[pid];
1860  //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
1862 }
1863 
1865  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
1867  pr.r = pr.forceBox->open();
1868  finishPatch(pr);
1869  pr.positionBox->close(&(pr.x));
1870  pr.forceBox->close(&(pr.r));
1871 }
1872 
1873 void ComputeNonbondedCUDA::finishPatch(patch_record &pr) {
1874  int start = pr.localStart;
1875  const CompAtomExt *aExt = pr.xExt;
1876  int nfree = pr.numAtoms;
1877  pr.f = pr.r->f[Results::nbond];
1878  Force *f = pr.f;
1879  Force *f_slow = pr.r->f[Results::slow];
1880  const CompAtom *a = pr.x;
1881  // int *ao = atom_order + start;
1882  float4 *af = master->forces + start;
1883  float4 *af_slow = master->slow_forces + start;
1884  // only free atoms return forces
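  // af[]/af_slow[] hold GPU results in CUDA sort order; aExt[k].sortOrder maps
  // slot k back to this patch's atom index j before accumulating into the open
  // force boxes.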
1885  for ( int k=0; k<nfree; ++k ) {
1886  int j = aExt[k].sortOrder;
1887  f[j].x += af[k].x;
1888  f[j].y += af[k].y;
1889  f[j].z += af[k].z;
1890  // wcount += af[k].w;
1891  // virial += af[k].w;
1892  if ( doSlow ) {
1893  f_slow[j].x += af_slow[k].x;
1894  f_slow[j].y += af_slow[k].y;
1895  f_slow[j].z += af_slow[k].z;
1896  // virial_slow += af_slow[k].w;
1897  }
1898  }
1899 }
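finishPatch() above folds the packed GPU output back into the patch's force arrays through each atom's sortOrder index. A stripped-down sketch of that scatter-accumulate, using simplified stand-ins for Force, float4, and CompAtomExt (the real types are defined in NamdTypes.h), looks like this:

#include <vector>

struct Vec3    { double x, y, z; };     // stand-in for Force
struct F4      { float x, y, z, w; };   // stand-in for float4 (w carries auxiliary data)
struct AtomExt { int sortOrder; };      // stand-in for CompAtomExt

// Accumulate per-atom GPU forces (stored in CUDA sort order) into the
// patch-ordered force array; only the first nfree (free) atoms return forces.
void accumulate_forces(const F4 *gpu_forces, const AtomExt *ext,
                       int nfree, std::vector<Vec3> &patch_forces) {
  for (int k = 0; k < nfree; ++k) {
    const int j = ext[k].sortOrder;     // position of atom k in the patch array
    patch_forces[j].x += gpu_forces[k].x;
    patch_forces[j].y += gpu_forces[k].y;
    patch_forces[j].z += gpu_forces[k].z;
  }
}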
1900 
1901 //dtanner
1902 int ComputeNonbondedCUDA::finishWork() {
1903  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
1904 
1905  if ( master == this ) {
1906  for ( int i = 0; i < numSlaves; ++i ) {
1908  }
1909  }
1910 
1911 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
1912 CkMyPe(), workStarted, gbisPhase)
1913 
1914  Molecule *mol = Node::Object()->molecule;
1915  SimParameters *simParams = Node::Object()->simParameters;
1916 
1917  ResizeArray<int> &patches( workStarted == 1 ?
1918  remoteHostedPatches : localHostedPatches );
1919 
1920  // long long int wcount = 0;
1921  // double virial = 0.;
1922  // double virial_slow = 0.;
1923  for ( int i=0; i<patches.size(); ++i ) {
1924  // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
1925  patch_record &pr = master->patchRecords[patches[i]];
1926 
1927  if ( !simParams->GBISOn || gbisPhase == 1 ) {
1928  patch_record &pr = master->patchRecords[patches[i]];
1929 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
1930  pr.r = pr.forceBox->open();
1931  } // !GBISOn || gbisPhase==1
1932 
1933  int start = pr.localStart;
1934  const CompAtomExt *aExt = pr.xExt;
1935  if ( !simParams->GBISOn || gbisPhase == 3 ) {
1936  finishPatch(pr);
1937  } // !GBISOn || gbisPhase == 3
1938 
1939 #if 0
1940  if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
1941  CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
1942  j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
1943  af[j].w);
1944  }
1945 #endif
1946 
1947  //Close Boxes depending on Phase
1948  if (simParams->GBISOn) {
1949  if (gbisPhase == 1) {
1950  //Copy dEdaSum from Host to Patch Box
1951  GBReal *psiSumMaster = master->psiSumH + start;
1952  for ( int k=0; k<pr.numAtoms; ++k ) {
1953  int j = aExt[k].sortOrder;
1954  pr.psiSum[j] += psiSumMaster[k];
1955  }
1956 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
1957  pr.psiSumBox->close(&(pr.psiSum));
1958 
1959  } else if (gbisPhase == 2) {
1960  //Copy dEdaSum from Host to Patch Box
1961  GBReal *dEdaSumMaster = master->dEdaSumH + start;
1962  for ( int k=0; k<pr.numAtoms; ++k ) {
1963  int j = aExt[k].sortOrder;
1964  pr.dEdaSum[j] += dEdaSumMaster[k];
1965  }
1966 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
1967  pr.dEdaSumBox->close(&(pr.dEdaSum));
1968 
1969  } else if (gbisPhase == 3) {
1970 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
1971  pr.intRadBox->close(&(pr.intRad)); //box 6
1972  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1973  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1974  pr.positionBox->close(&(pr.x)); //box 0
1975  pr.forceBox->close(&(pr.r));
1976  } //end phases
1977  } else { //not GBIS
1978 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
1979  pr.positionBox->close(&(pr.x));
1980  pr.forceBox->close(&(pr.r));
1981  }
1982  }// end for
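  // Box traffic per pass, as performed in the loop above:
  //  - without GBIS: forceBox is opened, finishPatch() adds the forces, then
  //    positionBox and forceBox are closed;
  //  - GBIS phase 1: forceBox is opened, psiSum is accumulated, psiSumBox closed;
  //  - GBIS phase 2: dEdaSum is accumulated, dEdaSumBox closed;
  //  - GBIS phase 3: finishPatch() runs, then intRad, bornRad, dHdrPrefix,
  //    position, and force boxes are all closed.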
1983 
1984 #if 0
1985  virial *= (-1./6.);
1986  reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
1987  reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
1988  reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
1989  if ( doSlow ) {
1990  virial_slow *= (-1./6.);
1991  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
1992  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
1993  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
1994  }
1995 #endif
1996 
1997  if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
1998  ( localHostedPatches.size() || master == this ) ) {
1999  GBISP("not finished, call again\n");
2000  return 0; // not finished, call again
2001  }
2002 
2003  if ( master != this ) { // finished
2004  GBISP("finished\n");
2005  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2006  atomsChanged = 0;
2007  return 1;
2008  }
2009 
2011 
2012  if ( !simParams->GBISOn || gbisPhase == 3 ) {
2013 
2014  atomsChanged = 0;
2015  finishReductions();
2016 
2017  } // !GBISOn || gbisPhase==3
2018 
2019  // Next GBIS Phase
2020 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
2021  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2022 
2023  GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
2024  return 1; // finished and ready for next step
2025 }
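// finishWork() is entered once per pass: when the remote and local grids are
// not merged, the workStarted == 1 (remote-patch) pass returns 0 so the
// compute is enqueued again for the workStarted == 2 (local-patch) pass, and
// only the final pass returns 1.  With GBIS enabled, each completed pass also
// advances gbisPhase through the 1 -> 2 -> 3 -> 1 cycle.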
2026 
2027 
2028 void ComputeNonbondedCUDA::finishReductions() {
2029  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
2030 
2031  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
2032 
2033  SimParameters *simParams = Node::Object()->simParameters;
2034 
2035  if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
2036  if ( patchPairsReordered++ == 2) {
2037  int patch_len = patchRecords.size();
2038  ResizeArray<int> plast(patch_len);
2039  for ( int i=0; i<patch_len; ++i ) {
2040  plast[i] = -1;
2041  }
2042  int order_len = localComputeRecords.size()+remoteComputeRecords.size();
2043  for ( int i=0; i<order_len; ++i ) {
2044  plast[computeRecords[block_order[i]].pid[0]] = i;
2045  iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
2046  }
2047  iout << endi;
2048  for ( int i=0; i<patch_len; ++i ) {
2049  iout << "patch_last " << i << " " << plast[i] << "\n";
2050  }
2051  iout << endi;
2052  }
2053  }
2054 
2055  {
2056  Tensor virial_tensor;
2057  BigReal energyv = 0.;
2058  BigReal energye = 0.;
2059  BigReal energys = 0.;
2060  int nexcluded = 0;
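  // Each of the num_virials records in virials[] is 16 floats wide:
  //   [0..8]   the 3x3 virial tensor, row-major;
  //   [9..11]  the energy accumulators read into energyv, energye, and energys;
  //   [12]     the excluded-pair count, stored with an int bit pattern;
  //   [13..15] not read here.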
2061  for ( int i = 0; i < num_virials; ++i ) {
2062  virial_tensor.xx += virials[16*i];
2063  virial_tensor.xy += virials[16*i+1];
2064  virial_tensor.xz += virials[16*i+2];
2065  virial_tensor.yx += virials[16*i+3];
2066  virial_tensor.yy += virials[16*i+4];
2067  virial_tensor.yz += virials[16*i+5];
2068  virial_tensor.zx += virials[16*i+6];
2069  virial_tensor.zy += virials[16*i+7];
2070  virial_tensor.zz += virials[16*i+8];
2071  energyv += virials[16*i+9];
2072  energye += virials[16*i+10];
2073  energys += virials[16*i+11];
2074  nexcluded += ((int *)virials)[16*i+12];
2075  if (simParams->GBISOn) {
2076  energye += energy_gbis[i];
2077  }
2078  }
2080  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
2081  if ( doEnergy ) {
2082  reduction->item(REDUCTION_LJ_ENERGY) += energyv;
2083  reduction->item(REDUCTION_ELECT_ENERGY) += energye;
2085  }
2086  }
2087  if ( doSlow ) {
2088  Tensor virial_slow_tensor;
2089  for ( int i = 0; i < num_virials; ++i ) {
2090  virial_slow_tensor.xx += slow_virials[16*i];
2091  virial_slow_tensor.xy += slow_virials[16*i+1];
2092  virial_slow_tensor.xz += slow_virials[16*i+2];
2093  virial_slow_tensor.yx += slow_virials[16*i+3];
2094  virial_slow_tensor.yy += slow_virials[16*i+4];
2095  virial_slow_tensor.yz += slow_virials[16*i+5];
2096  virial_slow_tensor.zx += slow_virials[16*i+6];
2097  virial_slow_tensor.zy += slow_virials[16*i+7];
2098  virial_slow_tensor.zz += slow_virials[16*i+8];
2099  }
2100  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
2101  }
2102 
2103  reduction->submit();
2104 
2105  cuda_timer_count++;
2106  if ( simParams->outputCudaTiming &&
2107  step % simParams->outputCudaTiming == 0 ) {
2108 
2109  // int natoms = mol->numAtoms;
2110  // double wpa = wcount; wpa /= natoms;
2111 
2112  // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
2113  // kernel_time * 1.e3, time * 1.e3, wpa);
2114 
2115 #if 0
2116  float upload_ms, remote_calc_ms;
2117  float local_calc_ms, total_ms;
2118  cuda_errcheck("before event timers");
2119  cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
2120  cuda_errcheck("in event timer 1");
2121  cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
2122  cuda_errcheck("in event timer 2");
2123  cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
2124  cuda_errcheck("in event timer 3");
2125  cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
2126  cuda_errcheck("in event timer 4");
2127  cuda_errcheck("in event timers");
2128 
2129  CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
2130  CkMyPe(), upload_ms, remote_calc_ms,
2131  local_calc_ms, total_ms);
2132 #endif
2133 
2134  if ( cuda_timer_count >= simParams->outputCudaTiming ) {
2136  CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
2137  step, cuda_timer_total * 1.e3, CkMyPe());
2138  }
2139  cuda_timer_count = 0;
2140  cuda_timer_total = 0;
2141  }
2142 
2143 }
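The #if 0 block in finishReductions() measures the upload, remote, and local segments with CUDA events. For reference, a self-contained version of that event-timing idiom is sketched below; the buffer, kernel, and event names are illustrative only.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void work() { }   // placeholder for the force kernels

int main() {
  cudaEvent_t start, after_upload, after_calc;
  cudaEventCreate(&start);
  cudaEventCreate(&after_upload);
  cudaEventCreate(&after_calc);

  char *buf;
  cudaMalloc(&buf, 1 << 20);

  cudaEventRecord(start, 0);
  cudaMemsetAsync(buf, 0, 1 << 20, 0);   // stands in for the host-to-device upload
  cudaEventRecord(after_upload, 0);
  work<<<1, 32>>>();
  cudaEventRecord(after_calc, 0);

  // Elapsed-time queries require both events to have completed.
  cudaEventSynchronize(after_calc);
  float upload_ms = 0.f, calc_ms = 0.f;
  cudaEventElapsedTime(&upload_ms, start, after_upload);
  cudaEventElapsedTime(&calc_ms, after_upload, after_calc);
  printf("upload %.3f ms, calc %.3f ms\n", upload_ms, calc_ms);

  cudaFree(buf);
  return 0;
}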
2144 
2145 
2146 #endif // NAMD_CUDA
2147 