NAMD
CudaComputeNonbonded.C
1 #include <algorithm>
2 #include <map>
3 #include <vector>
4 #include "NamdTypes.h"
5 #include "charm++.h"
6 #include "Patch.h"
7 #include "PatchMap.h"
8 #include "ProxyMgr.h"
9 #include "LJTable.h"
10 #include "Node.h"
11 #include "ObjectArena.h"
12 // #include "ComputeCUDAMgr.h"
13 #include "ReductionMgr.h"
14 #include "CudaComputeNonbonded.h"
15 #include "WorkDistrib.h"
16 #include "HomePatch.h"
17 #include "Priorities.h"
18 #include "ComputePmeCUDAMgr.h"
19 //#include "CudaUtils.h"
20 
21 #include "DeviceCUDA.h"
22 #ifdef NAMD_CUDA
23 #ifdef WIN32
24 #define __thread __declspec(thread)
25 #endif
26 extern __thread DeviceCUDA *deviceCUDA;
27 #endif
28 
29 #ifdef NAMD_CUDA
30 
31 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
32 
33 //
34 // Class constructor
35 //
36 CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
37  CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
38 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
39 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
40 
41  cudaCheck(cudaSetDevice(deviceID));
42 
43  exclusionsByAtom = NULL;
44 
45  vdwTypes = NULL;
46  vdwTypesSize = 0;
47 
48  exclIndexMaxDiff = NULL;
49  exclIndexMaxDiffSize = 0;
50 
51  atomIndex = NULL;
52  atomIndexSize = 0;
53 
54  atomStorageSize = 0;
55 
56  // Atom and charge storage
57  atoms = NULL;
58  atomsSize = 0;
59 
60  // Force storage
61  h_forces = NULL;
62  h_forcesSize = 0;
63  h_forcesSlow = NULL;
64  h_forcesSlowSize = 0;
65 
66  d_forces = NULL;
67  d_forcesSize = 0;
68  d_forcesSlow = NULL;
69  d_forcesSlowSize = 0;
70 
71  // GBIS
72  intRad0H = NULL;
73  intRad0HSize = 0;
74  intRadSH = NULL;
75  intRadSHSize = 0;
76  psiSumH = NULL;
77  psiSumHSize = 0;
78  bornRadH = NULL;
79  bornRadHSize = 0;
80  dEdaSumH = NULL;
81  dEdaSumHSize = 0;
82  dHdrPrefixH = NULL;
83  dHdrPrefixHSize = 0;
84  maxShmemPerBlock = 0;
85  cudaPatches = NULL;
86 
87  atomsChangedIn = true;
88  atomsChanged = true;
89  computesChanged = true;
90 
91  forceDoneEventRecord = false;
92 
93  SimParameters *simParams = Node::Object()->simParameters;
94  if (simParams->pressureProfileOn) {
95  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
96  }
97 
98  if (simParams->GBISOn) gbisPhase = 3;
99 
100  doSkip = false;
101 }
102 
103 //
104 // Class destructor
105 //
106 CudaComputeNonbonded::~CudaComputeNonbonded() {
107  cudaCheck(cudaSetDevice(deviceID));
108  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
109  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
110  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
111  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
112  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
113  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
114  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
115  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
116 
117  // GBIS
118  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
119  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
120  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
121  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
122  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
123  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
124 
125  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
126 
127  if (patches.size() > 0) {
128  deallocate_host<VirialEnergy>(&h_virialEnergy);
129  deallocate_device<VirialEnergy>(&d_virialEnergy);
130  cudaCheck(cudaStreamDestroy(stream));
131  cudaCheck(cudaEventDestroy(forceDoneEvent));
132  CmiDestroyLock(lock);
133  delete reduction;
134  }
135 
136  // NOTE: unregistering happens in the [sync] entry method
137  computeMgr->sendUnregisterBoxesOnPe(pes, this);
138 
139 }
140 
141 void CudaComputeNonbonded::unregisterBox(int i) {
142  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
143  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
144  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
145  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
146  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
147  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
148  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
149 }
150 
151 void CudaComputeNonbonded::unregisterBoxesOnPe() {
152  if (rankPatches[CkMyRank()].size() == 0)
153  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
154  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
155  unregisterBox(rankPatches[CkMyRank()][i]);
156  }
157 }
158 
159 //
160 // Register inter-patch (self) compute.
161 // Only serialized calls allowed
162 //
163 void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {
164  computesChanged = true;
165  addPatch(pid);
166  addCompute(cid, pid, pid, 0.);
167 }
168 
169 //
170 // Register pair-patch compute.
171 // Only serialized calls allowed
172 //
173 void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID* pid, int* trans) {
174  computesChanged = true;
175  addPatch(pid[0]);
176  addPatch(pid[1]);
177  PatchMap* patchMap = PatchMap::Object();
178  int t1 = trans[0];
179  int t2 = trans[1];
180  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
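  // The transform index encodes the periodic image as base-3 digits (t%3, (t/3)%3, t/9),
  // with digit values {0,1,2} mapping to cell offsets {-1,0,+1} along x, y, z.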
181  offset.x += (t1%3-1) - (t2%3-1);
182  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
183  offset.z += (t1/9-1) - (t2/9-1);
184  addCompute(cid, pid[0], pid[1], offset);
185 }
186 
187 //
188 // Add patch
189 //
190 void CudaComputeNonbonded::addPatch(PatchID pid) {
191  patches.push_back(PatchRecord(pid));
192 }
193 
194 //
195 // Add compute
196 //
197 void CudaComputeNonbonded::addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset) {
198  ComputeRecord cr;
199  cr.cid = cid;
200  cr.pid[0] = pid1;
201  cr.pid[1] = pid2;
202  cr.offset = offset;
203  computes.push_back(cr);
204 }
205 
206 //
207 // Update numAtoms and numFreeAtoms on a patch
208 //
209 void CudaComputeNonbonded::updatePatch(int i) {
210  int numAtoms = patches[i].patch->getNumAtoms();
211  int numFreeAtoms = numAtoms;
212  if ( fixedAtomsOn ) {
213  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
214  for ( int j=0; j< numAtoms; ++j ) {
215  if ( aExt[j].atomFixed ) --numFreeAtoms;
216  }
217  }
218  patches[i].numAtoms = numAtoms;
219  patches[i].numFreeAtoms = numFreeAtoms;
220  cudaPatches[i].numAtoms = numAtoms;
221  cudaPatches[i].numFreeAtoms = numFreeAtoms;
222 }
223 
224 int CudaComputeNonbonded::findPid(PatchID pid) {
225  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
226  int j = rankPatches[CkMyRank()][i];
227  if (patches[j].patchID == pid) return j;
228  }
229  return -1;
230 }
231 
232 void CudaComputeNonbonded::patchReady(PatchID pid, int doneMigration, int seq) {
233  if (doneMigration) {
234  int i = findPid(pid);
235  if (i == -1)
236  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
237  updatePatch(i);
238  }
239  CmiLock(lock);
240  Compute::patchReady(pid, doneMigration, seq);
241  CmiUnlock(lock);
242 }
243 
244 void CudaComputeNonbonded::gbisP2PatchReady(PatchID pid, int seq) {
245  CmiLock(lock);
246  Compute::gbisP2PatchReady(pid, seq);
247  CmiUnlock(lock);
248 }
249 
250 void CudaComputeNonbonded::gbisP3PatchReady(PatchID pid, int seq) {
251  CmiLock(lock);
252  Compute::gbisP3PatchReady(pid, seq);
253  CmiUnlock(lock);
254 }
255 
256 void CudaComputeNonbonded::assignPatch(int i) {
257  PatchMap* patchMap = PatchMap::Object();
258  PatchID pid = patches[i].patchID;
259  Patch* patch = patchMap->patch(pid);
260  if (patch == NULL) {
261  // Create ProxyPatch if none exists
262  ProxyMgr::Object()->createProxy(pid);
263  patch = patchMap->patch(pid);
264  }
265  patches[i].patch = patch;
266  if (patches[i].patch == NULL) {
267  NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
268  }
269  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
270  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);
271  SimParameters *simParams = Node::Object()->simParameters;
272  if (simParams->GBISOn) {
273  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
274  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
275  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
276  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
277  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
278  }
279  // Store Pe where this patch was registered
280 #if 1
281  if (patches[i].pe != CkMyPe()) {
282  NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
283  }
284 #else
285  patches[i].pe = CkMyPe();
286 #endif
287  //
288  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
289  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
290 }
291 
292 struct pid_sortop_reverse_priority {
293  bool operator() (int2 pidj, int2 pidi) { // i and j reversed
294  int ppi = PATCH_PRIORITY(pidi.x);
295  int ppj = PATCH_PRIORITY(pidj.x);
296  if ( ppi != ppj ) return ppi < ppj;
297  return pidi.x < pidj.x;
298  }
299 };
300 
301 void CudaComputeNonbonded::assignPatchesOnPe() {
302  if (rankPatches[CkMyRank()].size() == 0)
303  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
304 
305  // calculate priority rank of local home patch within pe
306  {
307  PatchMap* patchMap = PatchMap::Object();
308  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
309  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
310  int i = rankPatches[CkMyRank()][k];
311  int pid = patches[i].patchID;
312  int homePe = patchMap->node(pid);
313  if ( CkNodeOf(homePe) == CkMyNode() ) {
314  int2 pid_index;
315  pid_index.x = pid;
316  pid_index.y = i;
317  homePatchByRank[CkRankOf(homePe)].add(pid_index);
318  }
319  }
320  for ( int i=0; i<CkMyNodeSize(); ++i ) {
321  pid_sortop_reverse_priority so;
322  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
323  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
324  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
325  int index = homePatchByRank[i][j].y;
326  patches[index].reversePriorityRankInPe = j + masterBoost;
327  }
328  }
329  }
330 
331  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
332  assignPatch(rankPatches[CkMyRank()][i]);
333  }
334 }
335 
336 //
337 // Returns the Pe of patch ID "pid", or -1 if not found
338 //
339 // int findHomePatchPe(std::vector<PatchIDList>& rankPatchIDs, PatchID pid) {
340 int findHomePatchPe(PatchIDList* rankPatchIDs, PatchID pid) {
341  // for (int i=0;i < rankPatchIDs.size();i++) {
342  for (int i=0;i < CkMyNodeSize();i++) {
343  if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
344  }
345  return -1;
346 }
347 
348 //
349 // Find all PEs that have Patch
350 //
351 void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
352  proxyPatchPes.clear();
353  for (int i=0;i < CkMyNodeSize();i++) {
354  int pe = CkNodeFirst(CkMyNode()) + i;
355  if (PatchMap::ObjectOnPe(pe)->patch(pid) != NULL)
356  proxyPatchPes.push_back(pe);
357  }
358 }
359 
360 //
361 // Called after all computes have been registered
362 //
363 void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
364  // Remove duplicate patches
365  std::sort(patches.begin(), patches.end());
366  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
367  patches.erase(last, patches.end());
368  // Set number of patches
369  setNumPatches(patches.size());
370  masterPe = CkMyPe();
371  computeMgr = computeMgrIn;
372  // Start patch counter
373  patchesCounter = getNumPatches();
374  // Patch ID map
375  std::map<PatchID, int> pidMap;
376 #if 1
377  //-------------------------------------------------------
378  // Copied in from ComputeNonbondedCUDA::assignPatches()
379  //-------------------------------------------------------
380 
381  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
382  int numPesOnNodeSharingDevice = 0;
383  int masterIndex = -1;
384  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
385  int pe = deviceCUDA->getPesSharingDevice(i);
386  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
387  if ( CkNodeOf(pe) == CkMyNode() ) {
388  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
389  }
390  }
391 
392  std::vector<int> count(patches.size(), 0);
393  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
394  std::vector<int> rankpcount(CkMyNodeSize(), 0);
395  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
396 
397  PatchMap* patchMap = PatchMap::Object();
398 
399  int unassignedpatches = patches.size();
400 
401  for (int i=0;i < patches.size(); ++i) {
402  patches[i].pe = -1;
403  }
404 
405  // assign if home pe and build table of natural proxies
406  for (int i=0;i < patches.size(); ++i) {
407  int pid = patches[i].patchID;
408  // homePe = PE where the patch currently resides
409  int homePe = patchMap->node(pid);
410  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
411  int pe = pesOnNodeSharingDevice[j];
412  // If homePe is sharing this device, assign this patch to homePe
413  if ( pe == homePe ) {
414  patches[i].pe = pe;
415  --unassignedpatches;
416  pcount[j] += 1;
417  }
418  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
419  table[i*numPesOnNodeSharingDevice + j] = 1;
420  }
421  }
422  // Assign this patch to homePe, if it resides on the same node
423  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
424  patches[i].pe = homePe;
425  --unassignedpatches;
426  rankpcount[CkRankOf(homePe)] += 1;
427  }
428  }
429  // assign if only one pe has a required proxy
430  for (int i=0; i < patches.size(); ++i) {
431  int pid = patches[i].patchID;
432  if ( patches[i].pe != -1 ) continue;
433  int c = 0;
434  int lastj;
435  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
436  if ( table[i*numPesOnNodeSharingDevice + j] ) {
437  ++c;
438  lastj = j;
439  }
440  }
441  count[i] = c;
442  if ( c == 1 ) {
443  patches[i].pe = pesOnNodeSharingDevice[lastj];
444  --unassignedpatches;
445  pcount[lastj] += 1;
446  }
447  }
448  int assignj = 0;
449  while ( unassignedpatches ) {
450  int i;
451  for (i=0;i < patches.size(); ++i) {
452  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
453  int pid = patches[i].patchID;
454  // patch_record &pr = patchRecords[pid];
455  if ( patches[i].pe != -1 ) continue;
456  patches[i].pe = pesOnNodeSharingDevice[assignj];
457  --unassignedpatches;
458  pcount[assignj] += 1;
459  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
460  break;
461  }
462  if (i < patches.size() ) continue; // start search again
463  for ( i=0;i < patches.size(); ++i ) {
464  int pid = patches[i].patchID;
465  // patch_record &pr = patchRecords[pid];
466  if ( patches[i].pe != -1 ) continue;
467  if ( count[i] ) continue;
468  patches[i].pe = pesOnNodeSharingDevice[assignj];
469  --unassignedpatches;
470  pcount[assignj] += 1;
471  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
472  break;
473  }
474  if ( i < patches.size() ) continue; // start search again
475  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
476  }
477 
478  // For each rank, list of patches
479  rankPatches.resize(CkMyNodeSize());
480  for (int i=0; i < patches.size(); ++i) {
481  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
482  pidMap[patches[i].patchID] = i;
483  }
484 
485  // for ( int i=0; i < patches.size(); ++i ) {
486  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
487  // }
488 
489 /*
490  slavePes = new int[CkMyNodeSize()];
491  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
492  numSlaves = 0;
493  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
494  int pe = pesOnNodeSharingDevice[j];
495  int rank = pe - CkNodeFirst(CkMyNode());
496  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
497  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
498  if ( pe == CkMyPe() ) continue;
499  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
500  rankpcount[rank] = 0; // skip in rank loop below
501  slavePes[numSlaves] = pe;
502  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
503  ++numSlaves;
504  }
505  for ( int j=0; j<CkMyNodeSize(); ++j ) {
506  int pe = CkNodeFirst(CkMyNode()) + j;
507  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
508  // CkMyPe(),j,pe,rankpcount[j]);
509  if ( ! rankpcount[j] ) continue;
510  if ( pe == CkMyPe() ) continue;
511  slavePes[numSlaves] = pe;
512  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
513  ++numSlaves;
514  }
515 */
516 
517 #else
518  // For each rank, list of patches
519  rankPatches.resize(CkMyNodeSize());
520  // For each rank, list of home patch IDs
521  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
522  for (int i=0;i < CkMyNodeSize();i++) {
523  int pe = CkNodeFirst(CkMyNode()) + i;
524  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
525  }
526  std::vector<int> proxyPatchPes;
527  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
528  //--------------------------------------------------------
529  // Build a list of PEs to avoid
530  std::vector<int> pesToAvoid;
531 #if 0
532  // Avoid other GPUs' master PEs
533  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
534  int pe = deviceCUDA->getMasterPeForDeviceID(i);
535  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
536  }
537  // Avoid PEs that are involved in PME
538  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
539  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
540  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
541  }
542  // Set counters of avoidable PEs to high numbers
543  for (int i=0;i < pesToAvoid.size();i++) {
544  int pe = pesToAvoid[i];
545  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
546  }
547 #endif
548  // Avoid master Pe somewhat
549  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
550  //--------------------------------------------------------
551  for (int i=0;i < patches.size();i++) {
552  PatchID pid = patches[i].patchID;
553  int pe = findHomePatchPe(rankHomePatchIDs, pid);
554  if (pe == -1) {
555  // Patch not present on this node => try finding a ProxyPatch
556  findProxyPatchPes(proxyPatchPes, pid);
557  if (proxyPatchPes.size() == 0) {
558  // No ProxyPatch => create one on rank that has the least ProxyPatches
559  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
560  pe = CkNodeFirst(CkMyNode()) + rank;
561  peProxyPatchCounter[rank]++;
562  } else {
563  // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
564  // this is done by finding the entry with the minimum peProxyPatchCounter value.
565  // Find the minimum among proxyPatchPes, i.e., find the minimum among
566  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
567  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
568  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
569  // - proxyPatchPes.begin();
570  // pe = proxyPatchPes[pppi];
571  int minCounter = (1 << 30);
572  for (int j=0;j < proxyPatchPes.size();j++) {
573  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
574  pe = proxyPatchPes[j];
575  minCounter = peProxyPatchCounter[CkRankOf(pe)];
576  }
577  }
578  if (pe == -1)
579  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
580  peProxyPatchCounter[CkRankOf(pe)]++;
581  }
582  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
583  // Found home patch on this node, but it's on PE that should be avoided => find a new one
584  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
585  pe = CkNodeFirst(CkMyNode()) + rank;
586  peProxyPatchCounter[rank]++;
587  }
588  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
589  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
590  rankPatches[CkRankOf(pe)].push_back(i);
591  pidMap[pid] = i;
592  }
593 
594  delete [] rankHomePatchIDs;
595 #endif
596  // Setup computes using pidMap
597  for (int i=0;i < computes.size();i++) {
598  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
599  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
600  }
601  for (int i=0;i < CkMyNodeSize();i++) {
602  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
603  }
604  computeMgr->sendAssignPatchesOnPe(pes, this);
605 }
606 
607 void CudaComputeNonbonded::initialize() {
608  if (patches.size() > 0) {
609  // Allocate CUDA version of patches
610  cudaCheck(cudaSetDevice(deviceID));
611  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
612 
613  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
614  allocate_device<VirialEnergy>(&d_virialEnergy, 1);
615 
616  /* JM: Query the maximum shared memory per block on deviceID
617  */
618  cudaDeviceProp props;
619  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); // Get the properties of device 'deviceID'
620  maxShmemPerBlock = props.sharedMemPerBlock;
621 
622 #if CUDA_VERSION >= 5050
623  int leastPriority, greatestPriority;
624  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
625  int priority = (doStreaming) ? leastPriority : greatestPriority;
626  // int priority = greatestPriority;
627  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
628 #else
629  cudaCheck(cudaStreamCreate(&stream));
630 #endif
631  cudaCheck(cudaEventCreate(&forceDoneEvent));
632 
633  buildExclusions();
634 
635  lock = CmiCreateLock();
636 
637  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
638  }
639 }
640 
641 //
642 // atomUpdate() can be called by any Pe
643 //
644 void CudaComputeNonbonded::atomUpdate() {
645  atomsChangedIn = true;
646 }
647 
648 //
649 // Compute patches[].atomStart, patches[].numAtoms, patches[].numFreeAtoms, and atomStorageSize
650 //
651 void CudaComputeNonbonded::updatePatches() {
652 
653  // Maximum number of tiles per tile list
654  maxTileListLen = 0;
655  int atomStart = 0;
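  // Pad each patch's atom range up to a multiple of WARPSIZE so a tile never mixes atoms from two patches.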
656  for (int i=0;i < patches.size();i++) {
657  patches[i].atomStart = atomStart;
658  cudaPatches[i].atomStart = atomStart;
659  int numAtoms = patches[i].numAtoms;
660  int numTiles = ((numAtoms-1)/WARPSIZE+1);
661  maxTileListLen = std::max(maxTileListLen, numTiles);
662  atomStart += numTiles*WARPSIZE;
663  }
664  atomStorageSize = atomStart;
665 
666  if (maxTileListLen >= 65536) {
667  NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
668  }
669 }
670 
671 void CudaComputeNonbonded::skipPatch(int i) {
672  if (CkMyPe() != patches[i].pe)
673  NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
674  Flags &flags = patches[i].patch->flags;
675  patches[i].positionBox->skip();
676  patches[i].forceBox->skip();
677  if (flags.doGBIS) {
678  patches[i].psiSumBox->skip();
679  patches[i].intRadBox->skip();
680  patches[i].bornRadBox->skip();
681  patches[i].dEdaSumBox->skip();
682  patches[i].dHdrPrefixBox->skip();
683  }
684 }
685 
686 void CudaComputeNonbonded::skipPatchesOnPe() {
687  if (rankPatches[CkMyRank()].size() == 0)
688  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
689  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
690  skipPatch(rankPatches[CkMyRank()][i]);
691  }
692  bool done = false;
693  CmiLock(lock);
694  patchesCounter -= rankPatches[CkMyRank()].size();
695  if (patchesCounter == 0) {
696  patchesCounter = getNumPatches();
697  done = true;
698  }
699  CmiUnlock(lock);
700  if (done) {
701  // Reduction must be done on masterPe
702  computeMgr->sendFinishReductions(masterPe, this);
703  }
704 }
705 
706 void CudaComputeNonbonded::skip() {
707  if (CkMyPe() != masterPe)
708  NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
709 
710  if (patches.size() == 0) return;
711 
712  doSkip = true;
713 
714  computeMgr->sendSkipPatchesOnPe(pes, this);
715 }
716 
717 void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
718  if (CkMyPe() != masterPe)
719  NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
720 
721  for (int i=0;i < patches.size();++i) {
722  PatchRecord &pr = patches[i];
723 
724  float maxMove = pr.patch->flags.maxAtomMovement;
725  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
726 
727  float maxTol = pr.patch->flags.pairlistTolerance;
728  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
729  }
730 }
731 
732 inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
733  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
734  c->updateVdwTypesExclSubset(first, last);
735 }
736 
737 void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
738  for (int i=first;i <= last;i++) {
739  PatchRecord &pr = patches[i];
740  int start = pr.atomStart;
741  int numAtoms = pr.numAtoms;
742  const CompAtom *compAtom = pr.compAtom;
743  const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
744  // Atoms have changed, re-do exclusions and vdw types
745  int2* exclp = exclIndexMaxDiff + start;
746  int* aip = atomIndex + start;
747  for ( int k=0;k < numAtoms; ++k ) {
748  int j = compAtomExt[k].sortOrder;
749  vdwTypes[start + k] = compAtom[j].vdwType;
750  aip[k] = compAtomExt[j].id;
751 #ifdef MEM_OPT_VERSION
752  exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
753  exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
754 #else // ! MEM_OPT_VERSION
755  exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
756  exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
757 #endif // MEM_OPT_VERSION
758  }
759  }
760 }
761 
762 //
763 // Called every time atoms changed
764 //
765 void CudaComputeNonbonded::updateVdwTypesExcl() {
766  // Re-allocate (VdwTypes, exclIndexMaxDiff) as needed
767  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
768  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
769  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
770 
771 #if CMK_SMP && USE_CKLOOP
772  int useCkLoop = Node::Object()->simParameters->useCkLoop;
773  if (useCkLoop >= 1) {
774  CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
775  } else
776 #endif
777  {
778  updateVdwTypesExclSubset(0, patches.size()-1);
779  }
780 
781  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
782 }
783 
784 inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
785  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
786  c->copyAtomsSubset(first, last);
787 }
788 
789 void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
790  for (int i=first;i <= last;++i) {
791  PatchRecord &pr = patches[i];
792  int numAtoms = pr.numAtoms;
793  if (numAtoms > 0) {
794  int start = pr.atomStart;
795  const CudaAtom *src = pr.patch->getCudaAtomList();
796  CudaAtom *dst = atoms + start;
797  memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
798  // Fill the rest with the copy of the last atom
799  int numAtomsAlign = ((numAtoms-1)/32+1)*32;
800  CudaAtom lastAtom = src[numAtoms-1];
801  for (int j=numAtoms;j < numAtomsAlign;j++) {
802  dst[j] = lastAtom;
803  }
804  }
805  }
806 }
807 
808 void CudaComputeNonbonded::copyGBISphase(int i) {
809  if (CkMyPe() != patches[i].pe)
810  NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
811  PatchRecord &pr = patches[i];
812  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
813  if (gbisPhase == 1) {
814  //Copy GBIS intRadius to Host
815  if (atomsChanged) {
816  float *intRad0 = intRad0H + pr.atomStart;
817  float *intRadS = intRadSH + pr.atomStart;
818  for (int k=0;k < pr.numAtoms;++k) {
819  int j = aExt[k].sortOrder;
820  intRad0[k] = pr.intRad[2*j+0];
821  intRadS[k] = pr.intRad[2*j+1];
822  }
823  }
824  } else if (gbisPhase == 2) {
825  float *bornRad = bornRadH + pr.atomStart;
826  for ( int k=0; k < pr.numAtoms; ++k ) {
827  int j = aExt[k].sortOrder;
828  bornRad[k] = pr.bornRad[j];
829  }
830  } else if (gbisPhase == 3) {
831  float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
832  for ( int k=0; k < pr.numAtoms; ++k ) {
833  int j = aExt[k].sortOrder;
834  dHdrPrefix[k] = pr.dHdrPrefix[j];
835  }
836  } // end phases
837 }
838 
839 void CudaComputeNonbonded::openBox(int i) {
840  if (CkMyPe() != patches[i].pe)
841  NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
842  SimParameters *simParams = Node::Object()->simParameters;
843  if (!simParams->GBISOn || gbisPhase == 1) {
844  patches[i].compAtom = patches[i].positionBox->open();
845  copyAtomsSubset(i, i);
846  }
847  if (simParams->GBISOn) {
848  if (gbisPhase == 1) {
849  patches[i].intRad = patches[i].intRadBox->open();
850  patches[i].psiSum = patches[i].psiSumBox->open();
851  } else if (gbisPhase == 2) {
852  patches[i].bornRad = patches[i].bornRadBox->open();
853  patches[i].dEdaSum = patches[i].dEdaSumBox->open();
854  } else if (gbisPhase == 3) {
855  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
856  }
857  copyGBISphase(i);
858  }
859 }
860 
861 void CudaComputeNonbonded::messageEnqueueWork() {
862  if (masterPe != CkMyPe())
863  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
864  WorkDistrib::messageEnqueueWork(this);
865 }
866 
867 void CudaComputeNonbonded::openBoxesOnPe() {
868  if (rankPatches[CkMyRank()].size() == 0)
869  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
870  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
871  openBox(rankPatches[CkMyRank()][i]);
872  }
873  bool done = false;
874  CmiLock(lock);
875  patchesCounter -= rankPatches[CkMyRank()].size();
876  if (patchesCounter == 0) {
877  patchesCounter = getNumPatches();
878  done = true;
879  }
880  CmiUnlock(lock);
881  if (done) {
882  computeMgr->sendLaunchWork(masterPe, this);
883  }
884 }
885 
886 int CudaComputeNonbonded::noWork() {
887  // Simply enqueue doWork on masterPe and return "no work"
888  computeMgr->sendMessageEnqueueWork(masterPe, this);
889  return 1;
890 }
891 
892 void CudaComputeNonbonded::reallocateArrays() {
893  cudaCheck(cudaSetDevice(deviceID));
894  SimParameters *simParams = Node::Object()->simParameters;
895 
896  // Re-allocate atoms
897  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
898 
899  // Re-allocate forces
900  if (doStreaming) {
901  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
902  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
903  } else {
904  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
905  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
906  }
907  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
908  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);
909  nonbondedKernel.reallocate_forceSOA(atomStorageSize);
910 
911  if (simParams->GBISOn) {
912  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
913  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
914  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
915  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
916  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
917  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
918  }
919 }
920 
921 void CudaComputeNonbonded::doWork() {
922  if (CkMyPe() != masterPe)
923  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
924 
925  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
926  // atomsChangedIn can be set to true by any Pe
927  // atomsChanged can only be set by masterPe
928  // Using these two variables ensures we don't have a race condition
929  atomsChanged = atomsChangedIn;
930  atomsChangedIn = false;
931 
932  SimParameters *simParams = Node::Object()->simParameters;
933 
934  if (patches.size() == 0) return; // No work to do
935 
936  // Take the flags from the first patch on this Pe
937  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
938  Flags &flags = patches[0].patch->flags;
939 
940  doSlow = flags.doFullElectrostatics;
941  doEnergy = flags.doEnergy;
942  doVirial = flags.doVirial;
943 
944  if (flags.doNonbonded) {
945 
946  if (simParams->GBISOn) {
947  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
948  }
949 
950  if (!simParams->GBISOn || gbisPhase == 1) {
951  if ( computesChanged ) {
952  updateComputes();
953  }
954  if (atomsChanged) {
955  // Re-calculate patch atom numbers and storage
956  updatePatches();
957  reSortDone = false;
958  }
959  reallocateArrays();
960  }
961 
962  // Open boxes on Pes and launch work to masterPe
963  computeMgr->sendOpenBoxesOnPe(pes, this);
964 
965  } else {
966  // No work to do, skip
967  skip();
968  }
969 
970 }
971 
972 void CudaComputeNonbonded::launchWork() {
973  if (CkMyPe() != masterPe)
974  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
975 
976  beforeForceCompute = CkWallTimer();
977 
978  cudaCheck(cudaSetDevice(deviceID));
979  SimParameters *simParams = Node::Object()->simParameters;
980 
981  //execute only during GBIS phase 1, or if not using GBIS
982  if (!simParams->GBISOn || gbisPhase == 1) {
983 
984  if ( atomsChanged || computesChanged ) {
985  // Invalidate pair lists
986  pairlistsValid = false;
987  pairlistTolerance = 0.0f;
988  }
989 
990  // Get maximum atom movement and patch tolerance
991  float maxAtomMovement = 0.0f;
992  float maxPatchTolerance = 0.0f;
993  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
994  // Update pair-list cutoff
995  Flags &flags = patches[0].patch->flags;
996  savePairlists = false;
997  usePairlists = false;
998  if ( flags.savePairlists ) {
999  savePairlists = true;
1000  usePairlists = true;
1001  } else if ( flags.usePairlists ) {
1002  if ( ! pairlistsValid ||
1003  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1004  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1005  } else {
1006  usePairlists = true;
1007  }
1008  }
1009  if ( ! usePairlists ) {
1010  pairlistsValid = false;
1011  }
1012  float plcutoff = cutoff;
1013  if ( savePairlists ) {
1014  pairlistsValid = true;
1015  pairlistTolerance = 2. * maxPatchTolerance;
1016  plcutoff += pairlistTolerance;
1017  }
1018  plcutoff2 = plcutoff * plcutoff;
1019 
1020  // if (atomsChanged)
1021  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1022  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1023 
1024  } // if (!simParams->GBISOn || gbisPhase == 1)
1025 
1026  // Calculate PME & VdW forces
1027  if (!simParams->GBISOn || gbisPhase == 1) {
1028  doForce();
1029  if (doStreaming) {
1030  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1031  patchReadyQueueLen = tileListKernel.getNumPatches();
1032  patchReadyQueueNext = 0;
1033  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1034  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1035  int* emptyPatches = tileListKernel.getEmptyPatches();
1036  for (int i=0;i < numEmptyPatches;i++) {
1037  PatchRecord &pr = patches[emptyPatches[i]];
1038  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1039  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1040  patchReadyQueue[i] = emptyPatches[i];
1041  }
1042  if (patchReadyQueueLen != patches.size())
1043  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1044  }
1045  }
1046 
1047  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1048  // before calling doGBISphase1().
1049  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1050  // In this code path doGBISphase1() is called in forceDone()
1051  forceDoneSetCallback();
1052  return;
1053  }
1054 
1055  // GBIS Phases
1056  if (simParams->GBISOn) {
1057  if (gbisPhase == 1) {
1058  doGBISphase1();
1059  } else if (gbisPhase == 2) {
1060  doGBISphase2();
1061  } else if (gbisPhase == 3) {
1062  doGBISphase3();
1063  }
1064  }
1065 
1066  // Copy forces to host
1067  if (!simParams->GBISOn || gbisPhase == 3) {
1068  if (!doStreaming) {
1069  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1070  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1071  }
1072  }
1073 
1074  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1075  // For GBIS, energies are ready after phase 2
1076  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1077  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1078  d_forces, d_forcesSlow, d_virialEnergy, stream);
1079  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1080  }
1081 
1082  // Setup call back
1083  forceDoneSetCallback();
1084 }
1085 
1086 //
1087 // GBIS Phase 1
1088 //
1089 void CudaComputeNonbonded::doGBISphase1() {
1090  cudaCheck(cudaSetDevice(deviceID));
1091 
1092  if (atomsChanged) {
1093  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
1094  }
1095 
1096  SimParameters *simParams = Node::Object()->simParameters;
1097  Lattice lattice = patches[0].patch->flags.lattice;
1098 
1099  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1100  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1101  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1102 
1103  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
1104  lata, latb, latc,
1105  simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream);
1106 }
1107 
1108 //
1109 // GBIS Phase 2
1110 //
1111 void CudaComputeNonbonded::doGBISphase2() {
1112  cudaCheck(cudaSetDevice(deviceID));
1113 
1114  SimParameters *simParams = Node::Object()->simParameters;
1115  Lattice lattice = patches[0].patch->flags.lattice;
1116 
1117  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1118  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1119  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1120 
1121  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
1122 
1123  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
1124  doEnergy, doSlow,
1125  lata, latb, latc,
1126  simParams->cutoff, simParams->nonbondedScaling, simParams->kappa,
1127  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1128  simParams->dielectric, simParams->solvent_dielectric,
1129  d_forces, dEdaSumH, stream);
1130 }
1131 
1132 //
1133 // GBIS Phase 3
1134 //
1135 void CudaComputeNonbonded::doGBISphase3() {
1136  cudaCheck(cudaSetDevice(deviceID));
1137  SimParameters *simParams = Node::Object()->simParameters;
1138  Lattice lattice = patches[0].patch->flags.lattice;
1139 
1140  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1141  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1142  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1143 
1144  if (doSlow) {
1145  GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream);
1146 
1147  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
1148  lata, latb, latc,
1149  simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream);
1150  }
1151 }
1152 
1153 //
1154 // Calculate electrostatic & VdW forces
1155 //
1156 void CudaComputeNonbonded::doForce() {
1157  cudaCheck(cudaSetDevice(deviceID));
1158 
1159  Lattice lattice = patches[0].patch->flags.lattice;
1160  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1161  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1162  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1163  bool doPairlist = (savePairlists || !usePairlists);
1164 
1165  if (doPairlist) {
1166  int numTileLists = calcNumTileLists();
1167  // Build initial tile lists and sort
1168  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
1169  maxTileListLen, lata, latb, latc,
1170  cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream);
1171  // Prepare tile list for atom-based refinement
1172  tileListKernel.prepareTileList(stream);
1173  }
1174 
1175  if (atomsChanged) {
1176  // Update Vdw types and exclusion index & maxdiff
1177  updateVdwTypesExcl();
1178  }
1179 
1180  beforeForceCompute = CkWallTimer();
1181 
1182  // Calculate forces (and refine tile list if atomsChanged=true)
1183  nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, doPairlist,
1184  doEnergy, doVirial, doSlow, lata, latb, latc,
1185  (const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow,
1186  stream);
1187 
1188  if (doPairlist) {
1189  tileListKernel.finishTileList(stream);
1190  }
1191 
1192  traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
1193 }
1194 
1195 //
1196 // Count an upper estimate for the number of tile lists
1197 //
1198 int CudaComputeNonbonded::calcNumTileLists() {
1199  int numTileLists = 0;
1200  for (int i=0;i < computes.size();i++) {
1201  int pi1 = computes[i].patchInd[0];
1202  int numAtoms1 = patches[pi1].numAtoms;
1203  int numTiles1 = (numAtoms1-1)/WARPSIZE+1;
1204  numTileLists += numTiles1;
1205  }
1206  return numTileLists;
1207 }
1208 
1209 //
1210 // Finish & submit reductions
1211 //
1212 void CudaComputeNonbonded::finishReductions() {
1213 
1214  if (CkMyPe() != masterPe)
1215  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1216 
1217  // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1218 
1219  if (!doSkip) {
1220 
1221  if (doStreaming && (doVirial || doEnergy)) {
1222  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1223  if (!forceDoneEventRecord)
1224  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1225  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1226  forceDoneEventRecord = false;
1227  }
1228 
1229  if (doVirial) {
1230  Tensor virialTensor;
1231  virialTensor.xx = h_virialEnergy->virial[0];
1232  virialTensor.xy = h_virialEnergy->virial[1];
1233  virialTensor.xz = h_virialEnergy->virial[2];
1234  virialTensor.yx = h_virialEnergy->virial[3];
1235  virialTensor.yy = h_virialEnergy->virial[4];
1236  virialTensor.yz = h_virialEnergy->virial[5];
1237  virialTensor.zx = h_virialEnergy->virial[6];
1238  virialTensor.zy = h_virialEnergy->virial[7];
1239  virialTensor.zz = h_virialEnergy->virial[8];
1240  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1241  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1242  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1243  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1244  if (doSlow) {
1245  Tensor virialTensor;
1246  virialTensor.xx = h_virialEnergy->virialSlow[0];
1247  virialTensor.xy = h_virialEnergy->virialSlow[1];
1248  virialTensor.xz = h_virialEnergy->virialSlow[2];
1249  virialTensor.yx = h_virialEnergy->virialSlow[3];
1250  virialTensor.yy = h_virialEnergy->virialSlow[4];
1251  virialTensor.yz = h_virialEnergy->virialSlow[5];
1252  virialTensor.zx = h_virialEnergy->virialSlow[6];
1253  virialTensor.zy = h_virialEnergy->virialSlow[7];
1254  virialTensor.zz = h_virialEnergy->virialSlow[8];
1255  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1256  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1257  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1258  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1259  }
1260  }
1261  if (doEnergy) {
1262  // if (doSlow)
1263  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1264  SimParameters *simParams = Node::Object()->simParameters;
1265  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1266  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1267  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1268  if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1269  // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
1270  }
1271 
1272  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1273  }
1274  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1275  reduction->submit();
1276 
1277  // Reset flags
1278  doSkip = false;
1279  computesChanged = false;
1280 }
1281 
1282 //
1283 // Finish a single patch
1284 //
1285 void CudaComputeNonbonded::finishPatch(int i) {
1286  if (CkMyPe() != patches[i].pe)
1287  NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
1288 
1289  PatchRecord &pr = patches[i];
1290  pr.results = pr.forceBox->open();
1291 
1292  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1293  int atomStart = pr.atomStart;
1294  int numAtoms = pr.numAtoms;
1295  if (numAtoms > 0) {
1296  Force *f = pr.results->f[Results::nbond];
1297  Force *f_slow = pr.results->f[Results::slow];
1298  float4 *af = h_forces + atomStart;
1299  float4 *af_slow = h_forcesSlow + atomStart;
1300  // float maxf = 0.0f;
1301  // int maxf_k;
1302  for ( int k=0; k<numAtoms; ++k ) {
1303  int j = aExt[k].sortOrder;
1304  f[j].x += af[k].x;
1305  f[j].y += af[k].y;
1306  f[j].z += af[k].z;
1307  // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
1308  // maxf = std::max(maxf, fabsf(af[k].x));
1309  // maxf = std::max(maxf, fabsf(af[k].y));
1310  // maxf = std::max(maxf, fabsf(af[k].z));
1311  // maxf_k = k;
1312  // }
1313  if ( doSlow ) {
1314  f_slow[j].x += af_slow[k].x;
1315  f_slow[j].y += af_slow[k].y;
1316  f_slow[j].z += af_slow[k].z;
1317  }
1318  }
1319  // if (maxf > 10000.0f) {
1320  // fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
1321  // cudaCheck(cudaStreamSynchronize(stream));
1322  // NAMD_die("maxf!");
1323  // }
1324  }
1325 
1326  pr.positionBox->close(&(pr.compAtom));
1327  pr.forceBox->close(&(pr.results));
1328 }
1329 
1330 //
1331 // Finish a set of patches on this pe
1332 //
1333 void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
1334  if (patchSet.size() == 0)
1335  NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
1336  SimParameters *simParams = Node::Object()->simParameters;
1337  // Save value of gbisPhase here because it can change after the last finishGBISPhase() or finishPatch() is called
1338  int gbisPhaseSave = gbisPhase;
1339  // Close Boxes depending on Phase
1340  if (simParams->GBISOn) {
1341  for (int i=0;i < patchSet.size();i++) {
1342  finishGBISPhase(patchSet[i]);
1343  }
1344  }
1345  // Finish patches
1346  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1347  for (int i=0;i < patchSet.size();i++) {
1348  finishPatch(patchSet[i]);
1349  }
1350  }
1351  bool done = false;
1352  CmiLock(lock);
1353  patchesCounter -= patchSet.size();
1354  if (patchesCounter == 0) {
1355  patchesCounter = getNumPatches();
1356  done = true;
1357  }
1358  CmiUnlock(lock);
1359  if (done) {
1360  // Do reductions
1361  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1362  // Reduction must be done on masterPe
1363  computeMgr->sendFinishReductions(masterPe, this);
1364  }
1365  }
1366 }
1367 
1368 //
1369 // Finish all patches that are on this pe
1370 //
1371 void CudaComputeNonbonded::finishPatchesOnPe() {
1372  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1373 }
1374 
1375 //
1376 // Finish single patch on this pe
1377 //
1378 void CudaComputeNonbonded::finishPatchOnPe(int i) {
1379  std::vector<int> v(1, i);
1380  finishSetOfPatchesOnPe(v);
1381 }
1382 
1383 void CudaComputeNonbonded::finishPatches() {
1384  computeMgr->sendFinishPatchesOnPe(pes, this);
1385 }
1386 
1387 void CudaComputeNonbonded::finishGBISPhase(int i) {
1388  if (CkMyPe() != patches[i].pe)
1389  NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
1390  PatchRecord &pr = patches[i];
1391  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1392  int atomStart = pr.atomStart;
1393  if (gbisPhase == 1) {
1394  GBReal *psiSumMaster = psiSumH + atomStart;
1395  for ( int k=0; k<pr.numAtoms; ++k ) {
1396  int j = aExt[k].sortOrder;
1397  pr.psiSum[j] += psiSumMaster[k];
1398  }
1399  pr.psiSumBox->close(&(pr.psiSum));
1400  } else if (gbisPhase == 2) {
1401  GBReal *dEdaSumMaster = dEdaSumH + atomStart;
1402  for ( int k=0; k<pr.numAtoms; ++k ) {
1403  int j = aExt[k].sortOrder;
1404  pr.dEdaSum[j] += dEdaSumMaster[k];
1405  }
1406  pr.dEdaSumBox->close(&(pr.dEdaSum));
1407  } else if (gbisPhase == 3) {
1408  pr.intRadBox->close(&(pr.intRad)); //box 6
1409  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1410  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1411  } //end phases
1412 }
1413 
1414 void CudaComputeNonbonded::finishTimers() {
1415  SimParameters *simParams = Node::Object()->simParameters;
1416 
1417  if (simParams->GBISOn) {
1418  if (gbisPhase == 1)
1419  traceUserBracketEvent(CUDA_GBIS1_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1420  if (gbisPhase == 2)
1421  traceUserBracketEvent(CUDA_GBIS2_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1422  if (gbisPhase == 3)
1423  traceUserBracketEvent(CUDA_GBIS3_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1424  } else {
1425  traceUserBracketEvent(CUDA_NONBONDED_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1426  }
1427 }
1428 
1429 //
1430 // Re-sort tile lists if necessary
1431 //
1432 void CudaComputeNonbonded::reSortTileLists() {
1433  // Re-sort tile lists
1434  SimParameters *simParams = Node::Object()->simParameters;
1435  cudaCheck(cudaSetDevice(deviceID));
1436  tileListKernel.reSortTileLists(simParams->GBISOn, stream);
1437 }
1438 
1439 void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
1440  CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;
1441 
1442  if (CkMyPe() != c->masterPe)
1443  NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
1444 
1445  SimParameters *simParams = Node::Object()->simParameters;
1446  cudaCheck(cudaSetDevice(c->deviceID));
1447 
1448  if (c->doStreaming) {
1449  int patchInd;
1450  while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
1451  c->patchReadyQueue[c->patchReadyQueueNext] = -1;
1452  c->patchReadyQueueNext++;
1453  c->checkCount = 0;
1454 
1455  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
1456  c->finishTimers();
1457  if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
1458  c->reSortTileLists();
1459  c->reSortDone = true;
1460  if (simParams->GBISOn && c->gbisPhase == 1) {
1461  // We must do GBIS Phase 1
1462  c->doGBISphase1();
1463  c->forceDoneSetCallback();
1464  return;
1465  }
1466  }
1467  }
1468 
1469  // Finish patch
1470  int pe = c->patches[patchInd].pe;
1471  PatchID patchID = c->patches[patchInd].patchID; // for priority
1472  c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
1473 
1474  // Last patch, return
1475  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
1476 
1477  }
1478  } else {
1479  if (!c->forceDoneEventRecord)
1480  NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
1481  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
1482  if (err == cudaSuccess) {
1483  // Event has occurred
1484  c->forceDoneEventRecord = false;
1485  c->checkCount = 0;
1486  c->finishTimers();
1487  if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
1488  c->reSortTileLists();
1489  c->reSortDone = true;
1490  if (simParams->GBISOn && c->gbisPhase == 1) {
1491  // We must do GBIS Phase 1
1492  c->doGBISphase1();
1493  c->forceDoneSetCallback();
1494  return;
1495  }
1496  }
1497  c->finishPatches();
1498  return;
1499  } else if (err != cudaErrorNotReady) {
1500  // Anything else is an error
1501  char errmsg[256];
1502  sprintf(errmsg,"in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
1503  c->checkCount, walltime - c->beforeForceCompute);
1504  cudaDie(errmsg,err);
1505  }
1506  }
1507 
1508  // if (c->checkCount % 1000 == 0)
1509  // fprintf(stderr, "c->patchReadyQueueNext %d\n", c->patchReadyQueueNext);
1510 
1511  // Event has not occurred
1512  c->checkCount++;
1513  if (c->checkCount >= 1000000) {
1514  char errmsg[256];
1515  sprintf(errmsg,"CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
1516  c->checkCount, walltime - c->beforeForceCompute);
1517  cudaDie(errmsg,cudaSuccess);
1518  }
1519 
1520  // Call again
1521  CcdCallBacksReset(0, walltime);
1522  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
1523 }
1524 
1525 //
1526 // Set call back for all the work in the stream at this point
1527 //
1528 void CudaComputeNonbonded::forceDoneSetCallback() {
1529  if (CkMyPe() != masterPe)
1530  NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
1531  beforeForceCompute = CkWallTimer();
1532  cudaCheck(cudaSetDevice(deviceID));
1533  if (!doStreaming || doVirial || doEnergy) {
1534  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
1535  forceDoneEventRecord = true;
1536  }
1537  checkCount = 0;
1538  CcdCallBacksReset(0, CmiWallTimer());
1539  // Set the call back at 0.1ms
1540  CcdCallFnAfter(forceDoneCheck, this, 0.1);
1541 }
1542 
1543 struct cr_sortop_distance {
1544  const Lattice &l;
1545  cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
1546  bool operator() (CudaComputeNonbonded::ComputeRecord i,
1547  CudaComputeNonbonded::ComputeRecord j) {
1548  Vector a = l.a();
1549  Vector b = l.b();
1550  Vector c = l.c();
1551  BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
1552  BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
1553  return ( ri < rj );
1554  }
1555 };
1556 
1557 static inline bool sortop_bitreverse(int a, int b) {
1558  if ( a == b ) return 0;
1559  for ( int bit = 1; bit; bit *= 2 ) {
1560  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
1561  }
1562  return 0;
1563 }
1564 
1565 struct cr_sortop_reverse_priority {
1566  cr_sortop_distance &distop;
1567  const CudaComputeNonbonded::PatchRecord *pr;
1568  cr_sortop_reverse_priority(cr_sortop_distance& sod,
1569  const CudaComputeNonbonded::PatchRecord *patchrecs) : distop(sod), pr(patchrecs) { }
1570  bool pid_compare_priority(int2 pidi, int2 pidj) {
1571  const CudaComputeNonbonded::PatchRecord &pri = pr[pidi.y];
1572  const CudaComputeNonbonded::PatchRecord &prj = pr[pidj.y];
1573  if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
1574  if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
1575  if ( pri.isSameNode && ! prj.isSameNode ) return 0;
1576  if ( prj.isSameNode && ! pri.isSameNode ) return 1;
1577  if ( pri.isSameNode ) { // and prj.isSameNode
1578  int rpri = pri.reversePriorityRankInPe;
1579  int rprj = prj.reversePriorityRankInPe;
1580  if ( rpri != rprj ) return rpri > rprj;
1581  return sortop_bitreverse(CkRankOf(pri.pe),CkRankOf(prj.pe));
1582  }
1583  int ppi = PATCH_PRIORITY(pidi.x);
1584  int ppj = PATCH_PRIORITY(pidj.x);
1585  if ( ppi != ppj ) return ppi < ppj;
1586  return pidi.x < pidj.x;
1587  }
1588  bool operator() (CudaComputeNonbonded::ComputeRecord j,
1589  CudaComputeNonbonded::ComputeRecord i) { // i and j reversed
1590  // Choose patch i (= patch with greater priority)
1591  int2 pidi = pid_compare_priority(make_int2(i.pid[0], i.patchInd[0]), make_int2(i.pid[1], i.patchInd[1])) ? make_int2(i.pid[0], i.patchInd[0]) : make_int2(i.pid[1], i.patchInd[1]);
1592  // Choose patch j
1593  int2 pidj = pid_compare_priority(make_int2(j.pid[0], j.patchInd[0]), make_int2(j.pid[1], j.patchInd[1])) ? make_int2(j.pid[0], j.patchInd[0]) : make_int2(j.pid[1], j.patchInd[1]);
1594  if ( pidi.x != pidj.x ) return pid_compare_priority(pidi, pidj);
1595  return distop(i,j);
1596  }
1597 };
1598 
1599 //
1600 // Setup computes. This is only done at the beginning and at load balancing, hence the lack of
1601 // consideration for performance in the CPU->GPU memory copy.
1602 //
1603 void CudaComputeNonbonded::updateComputes() {
1604  cudaCheck(cudaSetDevice(deviceID));
1605 
1606  Lattice lattice = patches[0].patch->flags.lattice;
1607  cr_sortop_distance so(lattice);
1608  std::stable_sort(computes.begin(), computes.end(), so);
1609 
1610  if (doStreaming) {
1611  cr_sortop_reverse_priority sorp(so, patches.data());
1612  std::stable_sort(computes.begin(), computes.end(), sorp);
1613  }
1614 
1615  CudaComputeRecord* cudaComputes = new CudaComputeRecord[computes.size()];
1616 
1617  for (int i=0;i < computes.size();i++) {
1618  cudaComputes[i].patchInd.x = computes[i].patchInd[0];
1619  cudaComputes[i].patchInd.y = computes[i].patchInd[1];
1620  cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
1621  cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
1622  cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
1623  }
1624 
1625  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
1626  cudaCheck(cudaStreamSynchronize(stream));
1627 
1628  delete [] cudaComputes;
1629 }
1630 
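 // Orders exclusion lists by their maximum index difference (stored in list[1]);
 // used below to lay the lists out in the shared bit array in order of
 // increasing span.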
1631 struct exlist_sortop {
1632  bool operator() (int32 *li, int32 *lj) {
1633  return ( li[1] < lj[1] );
1634  }
1635 };
1636 
1637 //
1638 // Builds the exclusions table. Swiped from ComputeNonbondedCUDA.C
1639 //
1640 void CudaComputeNonbonded::buildExclusions() {
1641  cudaCheck(cudaSetDevice(deviceID));
1642 
1643  Molecule *mol = Node::Object()->molecule;
1644 
1645 #ifdef MEM_OPT_VERSION
1646  int natoms = mol->exclSigPoolSize;
1647 #else
1648  int natoms = mol->numAtoms;
1649 #endif
1650 
1651  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
1652  exclusionsByAtom = new int2[natoms];
1653 
1654  // create unique sorted lists
1655 
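 // For every atom, build its full exclusion list as sorted offsets relative to
 // the atom itself, then de-duplicate across atoms: identical lists share a
 // single entry in unique_lists, and listsByAtom[i] points at the shared copy.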
1656  ObjectArena<int32> listArena;
1657  ResizeArray<int32*> unique_lists;
1658  int32 **listsByAtom = new int32*[natoms];
1659  SortableResizeArray<int32> curList;
1660  for ( int i=0; i<natoms; ++i ) {
1661  curList.resize(0);
1662  curList.add(0); // always excluded from self
1663 #ifdef MEM_OPT_VERSION
1664  const ExclusionSignature *sig = mol->exclSigPool + i;
1665  int n = sig->fullExclCnt;
1666  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
1667  n += 1;
1668 #else
1669  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
1670  int n = mol_list[0] + 1;
1671  for ( int j=1; j<n; ++j ) {
1672  curList.add(mol_list[j] - i);
1673  }
1674 #endif
1675  curList.sort();
1676 
1677  int j;
1678  for ( j=0; j<unique_lists.size(); ++j ) {
1679  if ( n != unique_lists[j][0] ) continue; // no match
1680  int k;
1681  for ( k=0; k<n; ++k ) {
1682  if ( unique_lists[j][k+3] != curList[k] ) break;
1683  }
1684  if ( k == n ) break; // found match
1685  }
1686  if ( j == unique_lists.size() ) { // no match
1687  int32 *list = listArena.getNewArray(n+3);
1688  list[0] = n;
1689  int maxdiff = 0;
1690  maxdiff = -1 * curList[0];
1691  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
1692  list[1] = maxdiff;
1693  for ( int k=0; k<n; ++k ) {
1694  list[k+3] = curList[k];
1695  }
1696  unique_lists.add(list);
1697  }
1698  listsByAtom[i] = unique_lists[j];
1699  }
1700  // sort lists by maxdiff
1701  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
1702  long int totalbits = 0;
1703  int nlists = unique_lists.size();
1704  for ( int j=0; j<nlists; ++j ) {
1705  int32 *list = unique_lists[j];
1706  int maxdiff = list[1];
1707  list[2] = totalbits + maxdiff;
1708  totalbits += 2*maxdiff + 1;
1709  }
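 // Record, per atom, the span (maxdiff) and the center bit (start) of its shared
 // exclusion list. A lookup for a pair (i,j), sketched here for illustration only
 // (the actual device-side test lives in the nonbonded kernel), would be:
 //   int2 e = exclusionsByAtom[i];
 //   int d  = j - i;
 //   bool excluded = (abs(d) <= e.x) &&
 //       ((exclusion_bits[(e.y + d) >> 5] >> ((e.y + d) & 31)) & 1);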
1710  for ( int i=0; i<natoms; ++i ) {
1711  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
1712  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
1713  }
1714  delete [] listsByAtom;
1715 
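 // Round the total bit count up to a multiple of 32 so the table is a whole
 // number of 32-bit words.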
1716  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
1717 
1718  {
1719  long int bytesneeded = totalbits / 8;
1720  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
1721  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
1722  unique_lists.size(), bytesneeded);
1723  }
1724 
1725  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
1726  if ( bytesneeded > bytesavail ) {
1727  char errmsg[512];
1728  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
1729  "but only %ld bytes can be addressed with 32-bit int.",
1730  unique_lists.size(), bytesneeded, bytesavail);
1731  NAMD_die(errmsg);
1732  }
1733  }
1734 
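 // Sets bit (BASE+DIFF) of the packed 32-bit word array EXCL; DIFF may be
 // negative as long as BASE+DIFF stays non-negative.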
1735 #define SET_EXCL(EXCL,BASE,DIFF) \
1736  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
1737 
1738  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
1739  memset(exclusion_bits, 0, totalbits/8);
1740 
1741  long int base = 0;
1742  for ( int i=0; i<unique_lists.size(); ++i ) {
1743  base += unique_lists[i][1];
1744  if ( unique_lists[i][2] != (int32)base ) {
1745  NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
1746  }
1747  int n = unique_lists[i][0];
1748  for ( int j=0; j<n; ++j ) {
1749  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
1750  }
1751  base += unique_lists[i][1] + 1;
1752  }
1753 
1754  int numExclusions = totalbits/32;
1755 
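 // Hand the packed table to the nonbonded kernel (which copies it to the
 // device), after which the host-side scratch array can be freed.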
1756  nonbondedKernel.bindExclusions(numExclusions, exclusion_bits);
1757 
1758  delete [] exclusion_bits;
1759 }
1760 
1761 #endif // NAMD_CUDA