NAMD
ComputeNonbondedCUDA Class Reference

#include <ComputeNonbondedCUDA.h>

Inheritance diagram for ComputeNonbondedCUDA: inherits Compute and ComputeNonbondedUtil.

Classes

struct  compute_record
 
struct  patch_record
 

Public Member Functions

 ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr, ComputeNonbondedCUDA *m=0, int idx=-1)
 
 ~ComputeNonbondedCUDA ()
 
void atomUpdate ()
 
void doWork ()
 
int noWork ()
 
void skip ()
 
void recvYieldDevice (int pe)
 
int finishWork ()
 
void finishReductions ()
 
void finishPatch (int)
 
void messageFinishPatch (int)
 
void requirePatch (int pid)
 
void assignPatches ()
 
void registerPatches ()
 
- Public Member Functions inherited from Compute
 Compute (ComputeID)
 
int type ()
 
virtual ~Compute ()
 
void setNumPatches (int n)
 
int getNumPatches ()
 
virtual void initialize ()
 
virtual void patchReady (PatchID, int doneMigration, int seq)
 
int sequence (void)
 
int priority (void)
 
int getGBISPhase (void)
 
virtual void gbisP2PatchReady (PatchID, int seq)
 
virtual void gbisP3PatchReady (PatchID, int seq)
 

Static Public Member Functions

static void build_lj_table ()
 
static void build_force_table ()
 
static void build_exclusions ()
 

Public Attributes

LocalWorkMsg *  localWorkMsg2
 
int workStarted
 
Lattice lattice
 
int doSlow
 
int doEnergy
 
int step
 
ResizeArray< int > activePatches
 
ResizeArray< int > localActivePatches
 
ResizeArray< int > remoteActivePatches
 
ResizeArray< int > hostedPatches
 
ResizeArray< int > localHostedPatches
 
ResizeArray< int > remoteHostedPatches
 
ResizeArray< patch_record >  patchRecords
 
ResizeArray< compute_record >  computeRecords
 
ResizeArray< compute_record >  localComputeRecords
 
ResizeArray< compute_record >  remoteComputeRecords
 
int forces_size
 
float4 * forces
 
int slow_forces_size
 
float4 * slow_forces
 
int psiSumH_size
 
GBReal *  psiSumH
 
int dEdaSumH_size
 
GBReal *  dEdaSumH
 
int deviceID
 
PatchMap *  patchMap
 
AtomMap *  atomMap
 
SubmitReduction *  reduction
 
ComputeNonbondedCUDA *  master
 
int masterPe
 
int slaveIndex
 
ComputeNonbondedCUDA ** slaves
 
int * slavePes
 
int numSlaves
 
int atomsChanged
 
int computesChanged
 
int patchPairsReordered
 
int pairlistsValid
 
float pairlistTolerance
 
int usePairlists
 
int savePairlists
 
float plcutoff2
 
int atoms_size
 
CudaAtom *  atoms
 
- Public Attributes inherited from Compute
const ComputeID cid
 
LDObjHandle ldObjHandle
 
LocalWorkMsg *const localWorkMsg
 

Additional Inherited Members

- Protected Member Functions inherited from Compute
void enqueueWork ()
 
- Protected Attributes inherited from Compute
int computeType
 
int basePriority
 
int gbisPhase
 
int gbisPhasePriority [3]
 

Detailed Description

Definition at line 20 of file ComputeNonbondedCUDA.h.

Constructor & Destructor Documentation

ComputeNonbondedCUDA::ComputeNonbondedCUDA ( ComputeID  c,
ComputeMgr *  mgr,
ComputeNonbondedCUDA *  m = 0,
int  idx = -1 
)

Definition at line 433 of file ComputeNonbondedCUDA.C.

References atomMap, atoms, atoms_size, atomsChanged, Compute::basePriority, build_exclusions(), computeMgr, computesChanged, cuda_errcheck(), cuda_init(), cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, end_local_download, end_remote_download, forces, forces_size, SimParameters::GBISOn, DeviceCUDA::getDeviceID(), DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), init_arrays(), localWorkMsg2, master, masterPe, max_grid_size, NAMD_bug(), NAMD_die(), PatchMap::numPatches(), PatchMap::Object(), AtomMap::Object(), Node::Object(), pairlistsValid, pairlistTolerance, patch_pair_num_ptr, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, SimParameters::pressureProfileOn, PRIORITY_SIZE, PROXY_DATA_PRIORITY, psiSumH, psiSumH_size, reduction, registerPatches(), savePairlists, DeviceCUDA::setGpuIsMine(), DeviceCUDA::setMergeGrids(), Node::simParameters, slaveIndex, slavePes, slaves, slow_forces, slow_forces_size, start_calc, stream, stream2, usePairlists, SimParameters::usePMECUDA, and workStarted.

434  : Compute(c), slaveIndex(idx) {
435 #ifdef PRINT_GBIS
436  CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
437 #endif
438 
439  if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
440  if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
441  if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
442 
443  // CkPrintf("create ComputeNonbondedCUDA\n");
444  master = m ? m : this;
445  cudaCompute = this;
446  computeMgr = mgr;
449  reduction = 0;
450 
452  if (params->pressureProfileOn) {
453  NAMD_die("pressure profile not supported in CUDA");
454  }
455 
456  atomsChanged = 1;
457  computesChanged = 1;
459 
460  pairlistsValid = 0;
461  pairlistTolerance = 0.;
462  usePairlists = 0;
463  savePairlists = 0;
464  plcutoff2 = 0.;
465 
466  workStarted = 0;
469 
470  // Zero array sizes and pointers
471  init_arrays();
472 
473  atoms_size = 0;
474  atoms = NULL;
475 
476  forces_size = 0;
477  forces = NULL;
478 
479  slow_forces_size = 0;
480  slow_forces = NULL;
481 
482  psiSumH_size = 0;
483  psiSumH = NULL;
484 
485  dEdaSumH_size = 0;
486  dEdaSumH = NULL;
487 
488  if ( master != this ) { // I am slave
490  master->slaves[slaveIndex] = this;
491  if ( master->slavePes[slaveIndex] != CkMyPe() ) {
492  NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
493  }
495  registerPatches();
496  return;
497  }
498  masterPe = CkMyPe();
499 
500  const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
501  if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
503 
504  // Sanity checks for New CUDA kernel
505  if (params->GBISOn) {
506  // GBIS
507  if (deviceCUDA->getNoMergeGrids()) {
508  NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
509  }
510  // Set mergegrids ON as long as user hasn't defined +nomergegrids
512  // Final sanity check
513  if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
514  } else {
515  // non-GBIS
517  NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
518  }
519  }
520 
521 #if CUDA_VERSION >= 5050
522  int leastPriority, greatestPriority;
523  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
524  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
525  if ( leastPriority != greatestPriority ) {
526  if ( CkMyNode() == 0 ) {
527  int dev = deviceCUDA->getDeviceID();
528  CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
529  }
530  if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
531  greatestPriority = leastPriority;
532  }
533  if (params->usePMECUDA) greatestPriority = leastPriority;
534  cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
535  cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
536  } else
537 #endif
538  {
539  cudaStreamCreate(&stream);
540  cuda_errcheck("cudaStreamCreate");
541  int dev = deviceCUDA->getDeviceID();
542  cudaDeviceProp deviceProp;
543  cudaGetDeviceProperties(&deviceProp, dev);
544  cuda_errcheck("cudaGetDeviceProperties");
545  if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
546  if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
547  cudaStreamCreate(&stream2);
548  } else {
549  stream2 = stream;
550  }
551  }
552  cuda_errcheck("cudaStreamCreate");
553 
554  // Get GPU device ID
556 
557  cuda_init();
558  if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
560  // cudaEventCreate(&start_upload);
561  cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
562  cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
563  cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
564 
565  patchRecords.resize(patchMap->numPatches());
568 
569  if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
570 }
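
The stream setup above chooses between priority streams and a single default stream. The following standalone sketch (an illustration only, not NAMD code; the helper name and fallback structure are assumptions) shows the same CUDA runtime pattern: query the device's stream priority range, give the remote-work stream the higher priority, and fall back to plain streams, shared when concurrent kernels are unavailable.

#include <cuda_runtime.h>

// Hypothetical helper mirroring the stream creation in the constructor above:
// `stream` (remote work) gets the higher priority, `stream2` (local work) the lower.
static void create_streams(cudaStream_t &stream, cudaStream_t &stream2) {
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  if ( leastPriority != greatestPriority ) {
    // Numerically smaller values are higher priority on CUDA devices.
    cudaStreamCreateWithPriority(&stream,  cudaStreamDefault, greatestPriority);
    cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, leastPriority);
  } else {
    cudaStreamCreate(&stream);
    int dev = 0;
    cudaGetDevice(&dev);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    // Only use a second stream if the device can overlap kernels.
    if ( prop.concurrentKernels && prop.major > 2 ) cudaStreamCreate(&stream2);
    else stream2 = stream;
  }
}
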
ComputeNonbondedCUDA::~ComputeNonbondedCUDA ( )

Definition at line 573 of file ComputeNonbondedCUDA.C.

573 { ; }

Member Function Documentation

void ComputeNonbondedCUDA::assignPatches ( )

Definition at line 657 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< T >::add(), computeMgr, deviceCUDA, DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::node(), numSlaves, ReductionMgr::Object(), PatchMap::ObjectOnPe(), patchMap, patchRecords, reduction, REDUCTIONS_BASIC, registerPatches(), ComputeMgr::sendCreateNonbondedCUDASlave(), ResizeArray< T >::size(), slavePes, slaves, sort, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

657  {
658 
659  int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
660  int numPesOnNodeSharingDevice = 0;
661  int masterIndex = -1;
662  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
663  int pe = deviceCUDA->getPesSharingDevice(i);
664  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
665  if ( CkNodeOf(pe) == CkMyNode() ) {
666  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
667  }
668  }
669 
670  int npatches = activePatches.size();
671 
672  if ( npatches ) {
674  }
675 
676  // calculate priority rank of local home patch within pe
677  {
678  ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
679  for ( int i=0; i<npatches; ++i ) {
680  int pid = activePatches[i];
681  int homePe = patchMap->node(pid);
682  if ( CkNodeOf(homePe) == CkMyNode() ) {
683  homePatchByRank[CkRankOf(homePe)].add(pid);
684  }
685  }
686  for ( int i=0; i<CkMyNodeSize(); ++i ) {
688  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
689  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
690  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
691  int pid = homePatchByRank[i][j];
692  patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
693  }
694  }
695  }
696 
697  int *count = new int[npatches];
698  memset(count, 0, sizeof(int)*npatches);
699  int *pcount = new int[numPesOnNodeSharingDevice];
700  memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
701  int *rankpcount = new int[CkMyNodeSize()];
702  memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
703  char *table = new char[npatches*numPesOnNodeSharingDevice];
704  memset(table, 0, npatches*numPesOnNodeSharingDevice);
705 
706  int unassignedpatches = npatches;
707 
708  if ( 0 ) { // assign all to device pe
709  for ( int i=0; i<npatches; ++i ) {
710  int pid = activePatches[i];
711  patch_record &pr = patchRecords[pid];
712  pr.hostPe = CkMyPe();
713  }
714  unassignedpatches = 0;
715  pcount[masterIndex] = npatches;
716  } else
717 
718  // assign if home pe and build table of natural proxies
719  for ( int i=0; i<npatches; ++i ) {
720  int pid = activePatches[i];
721  patch_record &pr = patchRecords[pid];
722  int homePe = patchMap->node(pid);
723  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
724  int pe = pesOnNodeSharingDevice[j];
725  if ( pe == homePe ) {
726  pr.hostPe = pe; --unassignedpatches;
727  pcount[j] += 1;
728  }
729  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
730  table[i*numPesOnNodeSharingDevice+j] = 1;
731  }
732  }
733  if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
734  pr.hostPe = homePe; --unassignedpatches;
735  rankpcount[CkRankOf(homePe)] += 1;
736  }
737  }
738  // assign if only one pe has a required proxy
739  int assignj = 0;
740  for ( int i=0; i<npatches; ++i ) {
741  int pid = activePatches[i];
742  patch_record &pr = patchRecords[pid];
743  if ( pr.hostPe != -1 ) continue;
744  int c = 0;
745  int lastj;
746  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
747  if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
748  }
749  count[i] = c;
750  if ( c == 1 ) {
751  pr.hostPe = pesOnNodeSharingDevice[lastj];
752  --unassignedpatches;
753  pcount[lastj] += 1;
754  }
755  }
756  while ( unassignedpatches ) {
757  int i;
758  for ( i=0; i<npatches; ++i ) {
759  if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
760  int pid = activePatches[i];
761  patch_record &pr = patchRecords[pid];
762  if ( pr.hostPe != -1 ) continue;
763  pr.hostPe = pesOnNodeSharingDevice[assignj];
764  --unassignedpatches;
765  pcount[assignj] += 1;
766  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
767  break;
768  }
769  if ( i<npatches ) continue; // start search again
770  for ( i=0; i<npatches; ++i ) {
771  int pid = activePatches[i];
772  patch_record &pr = patchRecords[pid];
773  if ( pr.hostPe != -1 ) continue;
774  if ( count[i] ) continue;
775  pr.hostPe = pesOnNodeSharingDevice[assignj];
776  --unassignedpatches;
777  pcount[assignj] += 1;
778  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
779  break;
780  }
781  if ( i<npatches ) continue; // start search again
782  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
783  }
784 
785  for ( int i=0; i<npatches; ++i ) {
786  int pid = activePatches[i];
787  patch_record &pr = patchRecords[pid];
788  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
789  }
790 
791  slavePes = new int[CkMyNodeSize()];
792  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
793  numSlaves = 0;
794  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
795  int pe = pesOnNodeSharingDevice[j];
796  int rank = pe - CkNodeFirst(CkMyNode());
797  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
798  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
799  if ( pe == CkMyPe() ) continue;
800  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
801  rankpcount[rank] = 0; // skip in rank loop below
802  slavePes[numSlaves] = pe;
804  ++numSlaves;
805  }
806  for ( int j=0; j<CkMyNodeSize(); ++j ) {
807  int pe = CkNodeFirst(CkMyNode()) + j;
808  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
809  // CkMyPe(),j,pe,rankpcount[j]);
810  if ( ! rankpcount[j] ) continue;
811  if ( pe == CkMyPe() ) continue;
812  slavePes[numSlaves] = pe;
814  ++numSlaves;
815  }
816  registerPatches();
817 
818  delete [] pesOnNodeSharingDevice;
819  delete [] count;
820  delete [] pcount;
821  delete [] rankpcount;
822  delete [] table;
823 }
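
assignPatches() above fills hostPe in stages: a patch goes to its home PE when that PE shares the device, then to the single device-sharing PE that already holds a proxy of it, and whatever remains is spread round-robin over the device-sharing PEs. A minimal sketch of that decision order, using simplified containers rather than NAMD's patch_record bookkeeping (all names here are hypothetical):

#include <vector>

// hasProxy[i][j]: device-sharing PE j already has a proxy of patch i.
void assign_hosts(const std::vector<int> &homePe,
                  const std::vector<std::vector<char> > &hasProxy,
                  const std::vector<int> &devicePes,
                  std::vector<int> &hostPe) {          // hostPe initialized to -1
  const int npatches = homePe.size(), npes = devicePes.size();
  // Stage 1: home PE, if it shares the device.
  for ( int i=0; i<npatches; ++i )
    for ( int j=0; j<npes; ++j )
      if ( devicePes[j] == homePe[i] ) hostPe[i] = devicePes[j];
  // Stage 2: the unique PE already holding a proxy.
  for ( int i=0; i<npatches; ++i ) {
    if ( hostPe[i] != -1 ) continue;
    int c = 0, lastj = -1;
    for ( int j=0; j<npes; ++j ) if ( hasProxy[i][j] ) { ++c; lastj = j; }
    if ( c == 1 ) hostPe[i] = devicePes[lastj];
  }
  // Stage 3: round-robin over the device-sharing PEs.
  int assignj = 0;
  for ( int i=0; i<npatches; ++i ) {
    if ( hostPe[i] != -1 ) continue;
    hostPe[i] = devicePes[assignj];
    if ( ++assignj == npes ) assignj = 0;
  }
}

Note that the real code also hosts a patch on its off-device home PE when that PE is on the same node (tracked via rankpcount) and, in the round-robin stage, prefers PEs that already hold a proxy of the patch; the sketch keeps only the ordering of the three stages.
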
void ComputeNonbondedCUDA::atomUpdate ( void  )
virtual

Reimplemented from Compute.

Definition at line 1038 of file ComputeNonbondedCUDA.C.

References atomsChanged.

1038  {
1039  //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
1040  atomsChanged = 1;
1041 }
void ComputeNonbondedCUDA::build_exclusions ( )
static

Definition at line 255 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ResizeArray< T >::add(), ResizeArray< T >::begin(), cuda_bind_exclusions(), ResizeArray< T >::end(), exclusionsByAtom, ExclusionSignature::fullExclCnt, ExclusionSignature::fullOffset, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), MAX_EXCLUSIONS, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), NAMD_die(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< T >::size(), and SortableResizeArray< Type >::sort().

Referenced by build_cuda_exclusions(), and ComputeNonbondedCUDA().

255  {
257 
258 #ifdef MEM_OPT_VERSION
259  int natoms = mol->exclSigPoolSize;
260 #else
261  int natoms = mol->numAtoms;
262 #endif
263 
264  delete [] exclusionsByAtom;
265  exclusionsByAtom = new int2[natoms];
266 
267  // create unique sorted lists
268 
269  ObjectArena<int32> listArena;
270  ResizeArray<int32*> unique_lists;
271  int32 **listsByAtom = new int32*[natoms];
273  for ( int i=0; i<natoms; ++i ) {
274  curList.resize(0);
275  curList.add(0); // always excluded from self
276 #ifdef MEM_OPT_VERSION
277  const ExclusionSignature *sig = mol->exclSigPool + i;
278  int n = sig->fullExclCnt;
279  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
280  n += 1;
281 #else
282  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
283  int n = mol_list[0] + 1;
284  for ( int j=1; j<n; ++j ) {
285  curList.add(mol_list[j] - i);
286  }
287 #endif
288  curList.sort();
289 
290  int j;
291  for ( j=0; j<unique_lists.size(); ++j ) {
292  if ( n != unique_lists[j][0] ) continue; // no match
293  int k;
294  for ( k=0; k<n; ++k ) {
295  if ( unique_lists[j][k+3] != curList[k] ) break;
296  }
297  if ( k == n ) break; // found match
298  }
299  if ( j == unique_lists.size() ) { // no match
300  int32 *list = listArena.getNewArray(n+3);
301  list[0] = n;
302  int maxdiff = 0;
303  maxdiff = -1 * curList[0];
304  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
305  list[1] = maxdiff;
306  for ( int k=0; k<n; ++k ) {
307  list[k+3] = curList[k];
308  }
309  unique_lists.add(list);
310  }
311  listsByAtom[i] = unique_lists[j];
312  }
313  // sort lists by maxdiff
314  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
315  long int totalbits = 0;
316  int nlists = unique_lists.size();
317  for ( int j=0; j<nlists; ++j ) {
318  int32 *list = unique_lists[j];
319  int maxdiff = list[1];
320  list[2] = totalbits + maxdiff;
321  totalbits += 2*maxdiff + 1;
322  }
323  for ( int i=0; i<natoms; ++i ) {
324  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
325  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
326  }
327  delete [] listsByAtom;
328 
329  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
330 
331  {
332  long int bytesneeded = totalbits / 8;
333  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
334  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
335  unique_lists.size(), bytesneeded);
336  }
337 
338  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
339  if ( bytesneeded > bytesavail ) {
340  char errmsg[512];
341  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
342  "but only %ld bytes can be addressed with 32-bit int.",
343  unique_lists.size(), bytesneeded, bytesavail);
344  NAMD_die(errmsg);
345  }
346  }
347 
348 #define SET_EXCL(EXCL,BASE,DIFF) \
349  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
350 
351  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
352  memset(exclusion_bits, 0, totalbits/8);
353 
354  long int base = 0;
355  for ( int i=0; i<unique_lists.size(); ++i ) {
356  base += unique_lists[i][1];
357  if ( unique_lists[i][2] != (int32)base ) {
358  NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
359  }
360  int n = unique_lists[i][0];
361  for ( int j=0; j<n; ++j ) {
362  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
363  }
364  base += unique_lists[i][1] + 1;
365  }
366 
367  cuda_bind_exclusions(exclusion_bits, totalbits/32);
368 
369  delete [] exclusion_bits;
370 }
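
The exclusion data built above is packed into one flat bit array: each unique list occupies a window of 2*maxdiff+1 bits centered on its stored start offset, and SET_EXCL raises one bit per excluded index difference. A minimal sketch of that packing and of how a lookup would test it (hypothetical names, not the CUDA-side code), assuming the exclusionsByAtom layout shown above:

#include <cstdint>

// Mirror of the SET_EXCL macro: raise the bit for an excluded pair whose
// atom-index difference is `diff`, relative to the list's `base` bit offset.
static inline void set_excl(uint32_t *bits, long base, int diff) {
  long b = base + diff;               // absolute bit position (always >= 0)
  bits[b >> 5] |= (1u << (b & 31));   // word = b/32, bit = b%32
}

// Lookup side: atoms i and j (global indices) are excluded when
//   |j - i| <= maxdiff_i  and  bit (start_i + (j - i)) is set,
// where start_i = exclusionsByAtom[i].y and maxdiff_i = exclusionsByAtom[i].x.
static inline bool is_excluded(const uint32_t *bits, int start, int maxdiff,
                               int i, int j) {
  int diff = j - i;
  if ( diff < -maxdiff || diff > maxdiff ) return false;
  long b = (long)start + diff;
  return ( bits[b >> 5] >> (b & 31) ) & 1;
}
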
void ComputeNonbondedCUDA::build_force_table ( )
static

Definition at line 110 of file ComputeNonbondedCUDA.C.

References cuda_bind_force_table(), ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::fast_table, FORCE_TABLE_SIZE, ComputeNonbondedUtil::r2_delta, ComputeNonbondedUtil::r2_delta_exp, ComputeNonbondedUtil::r2_table, ComputeNonbondedUtil::scor_table, ComputeNonbondedUtil::vdwa_table, and ComputeNonbondedUtil::vdwb_table.

Referenced by build_cuda_force_table().

110  { // static
111 
112  float4 t[FORCE_TABLE_SIZE];
113  float4 et[FORCE_TABLE_SIZE]; // energy table
114 
117  // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
118  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
119 
120  double r2list[FORCE_TABLE_SIZE]; // double to match cpu code
121  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
122  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
123  r2list[i] = r*r + r2_delta;
124  }
125 
126  union { double f; int32 i[2]; } byte_order_test;
127  byte_order_test.f = 1.0; // should occupy high-order bits only
128  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
129 
130  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
131  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
132  int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0
133 
134  if ( r > cutoff ) {
135  t[i].x = 0.;
136  t[i].y = 0.;
137  t[i].z = 0.;
138  t[i].w = 0.;
139  et[i].x = 0.;
140  et[i].y = 0.;
141  et[i].z = 0.;
142  et[i].w = 0.;
143  continue;
144  }
145 
146  BigReal diffa = r2list[i] - r2_table[table_i];
147 
148  // coulomb 1/r or fast force
149  // t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
150  {
151  BigReal table_a = fast_table[4*table_i];
152  BigReal table_b = fast_table[4*table_i+1];
153  BigReal table_c = fast_table[4*table_i+2];
154  BigReal table_d = fast_table[4*table_i+3];
155  BigReal grad =
156  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
157  t[i].x = 2. * grad;
158  BigReal ener = table_a + diffa *
159  ( ( table_d * diffa + table_c ) * diffa + table_b);
160  et[i].x = ener;
161  }
162 
163 
164  // pme correction for slow force
165  // t[i].w = 0.;
166  {
167  BigReal table_a = scor_table[4*table_i];
168  BigReal table_b = scor_table[4*table_i+1];
169  BigReal table_c = scor_table[4*table_i+2];
170  BigReal table_d = scor_table[4*table_i+3];
171  BigReal grad =
172  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
173  t[i].w = 2. * grad;
174  BigReal ener = table_a + diffa *
175  ( ( table_d * diffa + table_c ) * diffa + table_b);
176  et[i].w = ener;
177  }
178 
179 
180  // vdw 1/r^6
181  // t[i].y = 6. / (r8); // -1/r * d/dr r^-6
182  {
183  BigReal table_a = vdwb_table[4*table_i];
184  BigReal table_b = vdwb_table[4*table_i+1];
185  BigReal table_c = vdwb_table[4*table_i+2];
186  BigReal table_d = vdwb_table[4*table_i+3];
187  BigReal grad =
188  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
189  t[i].y = 2. * -1. * grad;
190  BigReal ener = table_a + diffa *
191  ( ( table_d * diffa + table_c ) * diffa + table_b);
192  et[i].y = -1. * ener;
193  }
194 
195 
196  // vdw 1/r^12
197  // t[i].z = 12e / (r8 * r4 * r2); // -1/r * d/dr r^-12
198  {
199  BigReal table_a = vdwa_table[4*table_i];
200  BigReal table_b = vdwa_table[4*table_i+1];
201  BigReal table_c = vdwa_table[4*table_i+2];
202  BigReal table_d = vdwa_table[4*table_i+3];
203  BigReal grad =
204  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
205  t[i].z = 2. * grad;
206  BigReal ener = table_a + diffa *
207  ( ( table_d * diffa + table_c ) * diffa + table_b);
208  et[i].z = ener;
209  }
210 
211  // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
212  // t[i].x, t[i].y, t[i].z, t[i].w);
213 
214 /*
215  double r2 = r * r;
216  double r4 = r2 * r2;
217  double r8 = r4 * r4;
218 
219  t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
220  t[i].y = 6. / (r8); // -1/r * d/dr r^-6
221  t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
222  t[i].w = 0.;
223 */
224  }
225 
226  t[0].x = 0.f;
227  t[0].y = 0.f;
228  t[0].z = 0.f;
229  t[0].w = 0.f;
230  et[0].x = et[1].x;
231  et[0].y = et[1].y;
232  et[0].z = et[1].z;
233  et[0].w = et[1].w;
234 
235  cuda_bind_force_table(t,et);
236 
237  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
238  CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
239  }
240 }
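
Each table entry above holds cubic spline coefficients (a, b, c, d) for one r² interval, and the loop evaluates both the interpolated energy and its derivative at the offset diffa = r2list[i] - r2_table[table_i]. A small standalone sketch of that evaluation pattern (hypothetical names, detached from the table layout):

// Evaluate one cubic spline segment at offset diffa from its knot.
struct SplineSeg { double a, b, c, d; };

static inline double spline_energy(const SplineSeg &s, double diffa) {
  return s.a + diffa * ((s.d * diffa + s.c) * diffa + s.b);
}

static inline double spline_gradient(const SplineSeg &s, double diffa) {
  // Derivative with respect to r^2; the force table stores 2 * gradient.
  return (3. * s.d * diffa + 2. * s.c) * diffa + s.b;
}
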
void ComputeNonbondedCUDA::build_lj_table ( )
static

Definition at line 83 of file ComputeNonbondedCUDA.C.

References LJTable::TableEntry::A, LJTable::TableEntry::B, cuda_bind_lj_table(), LJTable::get_table_dim(), ComputeNonbondedUtil::ljTable, NAMD_bug(), ComputeNonbondedUtil::scaling, and LJTable::table_val().

Referenced by build_cuda_force_table().

83  { // static
84 
86  const int dim = ljTable->get_table_dim();
87 
88  // round dim up to odd multiple of 16
89  int tsize = (((dim+16+31)/32)*32)-16;
90  if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
91 
92  float2 *t = new float2[tsize*tsize];
93  float2 *row = t;
94  for ( int i=0; i<dim; ++i, row += tsize ) {
95  for ( int j=0; j<dim; ++j ) {
96  const LJTable::TableEntry *e = ljTable->table_val(i,j);
97  row[j].x = e->A * scaling;
98  row[j].y = e->B * scaling;
99  }
100  }
101 
102  cuda_bind_lj_table(t,tsize);
103  delete [] t;
104 
105  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
106  CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
107  }
108 }
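
The comment "round dim up to odd multiple of 16" describes the padding of the LJ table: tsize is the smallest odd multiple of 16 that is at least dim. A quick check of the formula (a standalone test, not NAMD code):

#include <cassert>

// Smallest odd multiple of 16 >= dim, same expression as in the listing above.
static int round_up_odd16(int dim) {
  return (((dim + 16 + 31) / 32) * 32) - 16;
}

int main() {
  assert(round_up_odd16(1)  == 16);   // 16 * 1
  assert(round_up_odd16(16) == 16);
  assert(round_up_odd16(17) == 48);   // 16 * 3
  assert(round_up_odd16(49) == 80);   // 16 * 5
  return 0;
}
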
void ComputeNonbondedCUDA::doWork ( void  )
virtual

Reimplemented from Compute.

Definition at line 1172 of file ComputeNonbondedCUDA.C.

References Lattice::a(), activePatches, atom_params, atom_params_size, atoms, atoms_size, atomsChanged, Lattice::b(), Compute::basePriority, ResizeArray< T >::begin(), block_order, block_order_size, ComputeNonbondedCUDA::patch_record::bornRad, bornRadH, bornRadH_size, Lattice::c(), PatchMap::center(), CompAtom::charge, COMPUTE_PROXY_PRIORITY, computeRecords, computesChanged, COULOMB, cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_errcheck(), cudaCheck, cudaCompute, ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::cutoff2, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, ComputeNonbondedCUDA::patch_record::dHdrPrefix, dHdrPrefixH, dHdrPrefixH_size, ComputeNonbondedUtil::dielectric_1, doSlow, dummy_dev, dummy_size, energy_gbis, energy_gbis_size, exclusionsByAtom, finishWork(), ComputeNonbondedUtil::fixedAtomsOn, Patch::flags, force_ready_queue, force_ready_queue_len, force_ready_queue_size, forces, forces_size, SimParameters::GBISOn, GBISP, Compute::gbisPhase, Patch::getCudaAtomList(), DeviceCUDA::getGpuIsMine(), DeviceCUDA::getNoStreaming(), Patch::getNumAtoms(), hostedPatches, CompAtomExt::id, ComputeNonbondedCUDA::patch_record::intRad, intRad0H, intRad0H_size, intRadSH, intRadSH_size, SubmitReduction::item(), kernel_time, lata, latb, latc, Flags::lattice, lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, master, Flags::maxAtomMovement, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), num_atoms, num_local_atoms, num_remote_atoms, num_virials, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patch_record::p, pairlistsValid, Flags::pairlistTolerance, pairlistTolerance, Node::parameters, patch_pair_num_ptr, patch_pairs, patch_pairs_ptr, ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchPairsReordered, patchRecords, ComputeNonbondedCUDA::compute_record::pid, plcutoff2, CompAtom::position, PROXY_DATA_PRIORITY, PROXY_RESULTS_PRIORITY, psiSumH, psiSumH_size, recvYieldDevice(), reduction, REDUCTION_PAIRLIST_WARNINGS, remoteActivePatches, remoteComputeRecords, ResizeArray< T >::resize(), Flags::savePairlists, savePairlists, ComputeNonbondedUtil::scaling, Compute::sequence(), Node::simParameters, simParams, ResizeArray< T >::size(), slow_forces, slow_forces_size, slow_virials, CompAtomExt::sortOrder, ResizeArray< T >::swap(), Lattice::unscale(), Flags::usePairlists, usePairlists, vdw_types, vdw_types_size, CompAtom::vdwType, virials, virials_size, WARPSIZE, workStarted, ComputeNonbondedCUDA::patch_record::x, Vector::x, CudaAtom::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

1172  {
1173 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
1174 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
1175 
1176  // Set GPU device ID
1177  cudaCheck(cudaSetDevice(deviceID));
1178 
1180  ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
1181 
1182  if ( workStarted ) { //if work already started, check if finished
1183  if ( finishWork() ) { // finished
1184  workStarted = 0;
1185  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
1186  } else { // need to call again
1187  workStarted = 2;
1188  basePriority = PROXY_RESULTS_PRIORITY; // lower for local
1189  if ( master == this && kernel_launch_state > 2 ) {
1190  cuda_check_local_progress(this,0.); // launches polling
1191  }
1192  }
1193  return;
1194  }
1195 
1196  workStarted = 1;
1198 
1200  Parameters *params = Node::Object()->parameters;
1202 
1203  //execute only during GBIS phase 1, or if not using GBIS
1204  if (!simParams->GBISOn || gbisPhase == 1) {
1205 
1206  //bind new patches to GPU
1207  if ( atomsChanged || computesChanged ) {
1208  int npatches = activePatches.size();
1209 
1210  pairlistsValid = 0;
1211  pairlistTolerance = 0.;
1212 
1213  if ( computesChanged ) {
1214  computesChanged = 0;
1215 
1216  if ( ! dummy_size ) {
1217  dummy_size = 1024*1024;
1218  cudaMalloc((void**)&dummy_dev,dummy_size);
1219  cuda_errcheck("in cudaMalloc dummy_dev");
1220  }
1221 
1222  bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
1223  1.2f, cudaHostAllocMapped);
1224  if (did_realloc) {
1225  for (int k=0; k < force_ready_queue_size; ++k)
1226  force_ready_queue[k] = -1;
1227  }
1228  force_ready_queue_len = npatches;
1229  reallocate_host<int>(&block_order, &block_order_size,
1230  2*(localComputeRecords.size()+remoteComputeRecords.size()),
1231  1.2f, cudaHostAllocMapped);
1232 
1233  num_virials = npatches;
1234  reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
1235  1.2f, cudaHostAllocMapped);
1236  slow_virials = virials + 16*num_virials;
1237 
1238  reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
1239  1.2f, cudaHostAllocMapped);
1240  for (int i = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
1241 
1242  int *ap = activePatches.begin();
1243  for ( int i=0; i<localActivePatches.size(); ++i ) {
1244  *(ap++) = localActivePatches[i];
1245  }
1246  for ( int i=0; i<remoteActivePatches.size(); ++i ) {
1247  *(ap++) = remoteActivePatches[i];
1248  }
1249 
1250  // sort computes by distance between patches
1252  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
1253  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
1254 
1255  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1256 
1257  if ( streaming ) {
1258  // iout << "Reverse-priority sorting...\n" << endi;
1259  cr_sortop_reverse_priority sorp(so,patchRecords.begin());
1260  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
1261  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
1262  patchPairsReordered = 0;
1263  //patchPairsReordered = 1;
1264  // int len = remoteComputeRecords.size();
1265  // for ( int i=0; i<len; ++i ) {
1266  // iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
1267  // }
1268  // int len2 = localComputeRecords.size();
1269  // for ( int i=0; i<len2; ++i ) {
1270  // iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
1271  // }
1272  // iout << endi;
1273  } else {
1274  patchPairsReordered = 1;
1275  }
1276 
1277  int nlc = localComputeRecords.size();
1278  int nrc = remoteComputeRecords.size();
1279  computeRecords.resize(nlc+nrc);
1280  compute_record *cr = computeRecords.begin();
1281  for ( int i=0; i<nrc; ++i ) {
1282  *(cr++) = remoteComputeRecords[i];
1283  }
1284  for ( int i=0; i<nlc; ++i ) {
1285  *(cr++) = localComputeRecords[i];
1286  }
1287 
1288  // patch_pair_num[i] = number of patch pairs that involve patch i
1289  patch_pair_num.resize(npatches);
1290  for ( int i=0; i<npatches; ++i ) {
1291  patchRecords[activePatches[i]].localIndex = i;
1292  patch_pair_num[i] = 0;
1293  }
1294 
1295  int ncomputes = computeRecords.size();
1296  patch_pairs.resize(ncomputes);
1297  for ( int i=0; i<ncomputes; ++i ) {
1299  int lp1 = patchRecords[cr.pid[0]].localIndex;
1300  int lp2 = patchRecords[cr.pid[1]].localIndex;
1301  patch_pair_num[lp1]++;
1302  if (lp1 != lp2) patch_pair_num[lp2]++;
1303  patch_pair &pp = patch_pairs[i];
1304  pp.offset.x = cr.offset.x;
1305  pp.offset.y = cr.offset.y;
1306  pp.offset.z = cr.offset.z;
1307  }
1308 
1309  for ( int i=0; i<ncomputes; ++i ) {
1311  int lp1 = patchRecords[cr.pid[0]].localIndex;
1312  int lp2 = patchRecords[cr.pid[1]].localIndex;
1313  patch_pair &pp = patch_pairs[i];
1314  pp.patch1_ind = lp1;
1315  pp.patch2_ind = lp2;
1316  pp.patch1_num_pairs = patch_pair_num[lp1];
1317  pp.patch2_num_pairs = patch_pair_num[lp2];
1318  }
1319 
1320  if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
1321  CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1323  localComputeRecords.size(), remoteComputeRecords.size());
1324  }
1325  } // computesChanged
1326  else if ( ! patchPairsReordered ) {
1327  patchPairsReordered = 1;
1328  int len = patch_pairs.size();
1329  int nlc = localComputeRecords.size();
1330  int nrc = remoteComputeRecords.size();
1331  if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
1332  ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
1333  ResizeArray<patch_pair> new_patch_pairs(len);
1334  int irc=nrc;
1335  int ilc=len;
1336  for ( int i=0; i<len; ++i ) {
1337  int boi = block_order[i];
1338  int dest;
1339  if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
1340  new_computeRecords[dest] = computeRecords[boi];
1341  new_patch_pairs[dest] = patch_pairs[boi];
1342  }
1343  if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
1344  computeRecords.swap(new_computeRecords);
1345  patch_pairs.swap(new_patch_pairs);
1346  }
1347 
1348  int istart = 0;
1349  int i;
1350  for ( i=0; i<npatches; ++i ) {
1351  if ( i == localActivePatches.size() ) {
1352  num_local_atoms = istart;
1353  }
1354  patch_record &pr = patchRecords[activePatches[i]];
1355  pr.localStart = istart;
1356  int natoms = pr.p->getNumAtoms();
1357  int nfreeatoms = natoms;
1358  if ( fixedAtomsOn ) {
1359  const CompAtomExt *aExt = pr.xExt;
1360  for ( int j=0; j<natoms; ++j ) {
1361  if ( aExt[j].atomFixed ) --nfreeatoms;
1362  }
1363  }
1364  pr.numAtoms = natoms;
1365  pr.numFreeAtoms = nfreeatoms;
1366  istart += natoms;
1367  istart += 16 - (natoms & 15);
1368  }
1369  if ( i == localActivePatches.size() ) {
1370  num_local_atoms = istart;
1371  }
1372  num_atoms = istart;
1374  reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
1375  reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
1376  reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
1377  reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1378  reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1379  if (simParams->GBISOn) {
1380  reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
1381  reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
1382  reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1383  reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
1384  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1385  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
1386  }
1387 
1388  int bfstart = 0;
1389  int exclmask_start = 0;
1390  int ncomputes = computeRecords.size();
1391  for ( int i=0; i<ncomputes; ++i ) {
1393  int p1 = cr.pid[0];
1394  int p2 = cr.pid[1];
1395  patch_pair &pp = patch_pairs[i];
1396  pp.patch1_start = patchRecords[p1].localStart;
1397  pp.patch1_size = patchRecords[p1].numAtoms;
1398  pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
1399  pp.patch2_start = patchRecords[p2].localStart;
1400  pp.patch2_size = patchRecords[p2].numAtoms;
1401  pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
1402  pp.plist_start = bfstart;
1403  // size1*size2 = number of patch pairs
1404  int size1 = (pp.patch1_size-1)/WARPSIZE+1;
1405  int size2 = (pp.patch2_size-1)/WARPSIZE+1;
1406  pp.plist_size = (size1*size2-1)/32+1;
1407  bfstart += pp.plist_size;
1408  pp.exclmask_start = exclmask_start;
1409  exclmask_start += size1*size2;
1410  } //for ncomputes
1411 
1413  activePatches.size(), num_atoms, bfstart,
1414  exclmask_start);
1415 
1416  } // atomsChanged || computesChanged
1417 
1418  double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
1419 
1420  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1421  float maxAtomMovement = 0.;
1422  float maxPatchTolerance = 0.;
1423 
1424  for ( int i=0; i<activePatches.size(); ++i ) {
1425  patch_record &pr = patchRecords[activePatches[i]];
1426 
1427  float maxMove = pr.p->flags.maxAtomMovement;
1428  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1429 
1430  float maxTol = pr.p->flags.pairlistTolerance;
1431  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1432 
1433  int start = pr.localStart;
1434  int n = pr.numAtoms;
1435  const CompAtom *a = pr.x;
1436  const CompAtomExt *aExt = pr.xExt;
1437  if ( atomsChanged ) {
1438 
1439  atom_param *ap = atom_params + start;
1440  for ( int k=0; k<n; ++k ) {
1441  int j = aExt[k].sortOrder;
1442  ap[k].vdw_type = a[j].vdwType;
1443  vdw_types[start + k] = a[j].vdwType;
1444  ap[k].index = aExt[j].id;
1445 #ifdef MEM_OPT_VERSION
1446  ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1447  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1448 #else // ! MEM_OPT_VERSION
1449  ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
1450  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
1451 #endif // MEM_OPT_VERSION
1452  }
1453  }
1454  {
1455 #if 1
1456  const CudaAtom *ac = pr.p->getCudaAtomList();
1457  CudaAtom *ap = atoms + start;
1458  memcpy(ap, ac, sizeof(atom)*n);
1459 #else
1460  Vector center =
1461  pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
1462  atom *ap = atoms + start;
1463  for ( int k=0; k<n; ++k ) {
1464  int j = aExt[k].sortOrder;
1465  ap[k].position.x = a[j].position.x - center.x;
1466  ap[k].position.y = a[j].position.y - center.y;
1467  ap[k].position.z = a[j].position.z - center.z;
1468  ap[k].charge = charge_scaling * a[j].charge;
1469  }
1470 #endif
1471  }
1472  }
1473 
1474  savePairlists = 0;
1475  usePairlists = 0;
1476  if ( flags.savePairlists ) {
1477  savePairlists = 1;
1478  usePairlists = 1;
1479  } else if ( flags.usePairlists ) {
1480  if ( ! pairlistsValid ||
1481  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1483  } else {
1484  usePairlists = 1;
1485  }
1486  }
1487  if ( ! usePairlists ) {
1488  pairlistsValid = 0;
1489  }
1490  float plcutoff = cutoff;
1491  if ( savePairlists ) {
1492  pairlistsValid = 1;
1493  pairlistTolerance = 2. * maxPatchTolerance;
1494  plcutoff += pairlistTolerance;
1495  }
1496  plcutoff2 = plcutoff * plcutoff;
1497 
1498  //CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1499  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1500 
1501 #if 0
1502  // calculate warp divergence
1503  if ( 1 ) {
1504  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1505  Lattice &lattice = flags.lattice;
1506  float3 lata, latb, latc;
1507  lata.x = lattice.a().x;
1508  lata.y = lattice.a().y;
1509  lata.z = lattice.a().z;
1510  latb.x = lattice.b().x;
1511  latb.y = lattice.b().y;
1512  latb.z = lattice.b().z;
1513  latc.x = lattice.c().x;
1514  latc.y = lattice.c().y;
1515  latc.z = lattice.c().z;
1516 
1517  int ncomputes = computeRecords.size();
1518  for ( int ic=0; ic<ncomputes; ++ic ) {
1519  patch_pair &pp = patch_pairs[ic];
1520  atom *a1 = atoms + pp.patch1_atom_start;
1521  int n1 = pp.patch1_size;
1522  atom *a2 = atoms + pp.patch2_atom_start;
1523  int n2 = pp.patch2_size;
1524  float offx = pp.offset.x * lata.x
1525  + pp.offset.y * latb.x
1526  + pp.offset.z * latc.x;
1527  float offy = pp.offset.x * lata.y
1528  + pp.offset.y * latb.y
1529  + pp.offset.z * latc.y;
1530  float offz = pp.offset.x * lata.z
1531  + pp.offset.y * latb.z
1532  + pp.offset.z * latc.z;
1533  // CkPrintf("%f %f %f\n", offx, offy, offz);
1534  int atoms_tried = 0;
1535  int blocks_tried = 0;
1536  int atoms_used = 0;
1537  int blocks_used = 0;
1538  for ( int ii=0; ii<n1; ii+=32 ) { // warps
1539  for ( int jj=0; jj<n2; jj+=16 ) { // shared atom loads
1540  int block_used = 0;
1541  for ( int j=jj; j<jj+16 && j<n2; ++j ) { // shared atoms
1542  int atom_used = 0;
1543  for ( int i=ii; i<ii+32 && i<n1; ++i ) { // threads
1544  float dx = offx + a1[i].position.x - a2[j].position.x;
1545  float dy = offy + a1[i].position.y - a2[j].position.y;
1546  float dz = offz + a1[i].position.z - a2[j].position.z;
1547  float r2 = dx*dx + dy*dy + dz*dz;
1548  if ( r2 < cutoff2 ) atom_used = 1;
1549  }
1550  ++atoms_tried;
1551  if ( atom_used ) { block_used = 1; ++atoms_used; }
1552  }
1553  ++blocks_tried;
1554  if ( block_used ) { ++blocks_used; }
1555  }
1556  }
1557  CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
1558  blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
1559  atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
1560  }
1561  }
1562 #endif
1563 
1564  } // !GBISOn || gbisPhase == 1
1565 
1566  //Do GBIS
1567  if (simParams->GBISOn) {
1568  //open GBIS boxes depending on phase
1569  for ( int i=0; i<activePatches.size(); ++i ) {
1570  patch_record &pr = master->patchRecords[activePatches[i]];
1571  GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
1572  if (gbisPhase == 1) {
1573  //Copy GBIS intRadius to Host
1574  if (atomsChanged) {
1575  float *intRad0 = intRad0H + pr.localStart;
1576  float *intRadS = intRadSH + pr.localStart;
1577  for ( int k=0; k<pr.numAtoms; ++k ) {
1578  int j = pr.xExt[k].sortOrder;
1579  intRad0[k] = pr.intRad[2*j+0];
1580  intRadS[k] = pr.intRad[2*j+1];
1581  }
1582  }
1583  } else if (gbisPhase == 2) {
1584  float *bornRad = bornRadH + pr.localStart;
1585  for ( int k=0; k<pr.numAtoms; ++k ) {
1586  int j = pr.xExt[k].sortOrder;
1587  bornRad[k] = pr.bornRad[j];
1588  }
1589  } else if (gbisPhase == 3) {
1590  float *dHdrPrefix = dHdrPrefixH + pr.localStart;
1591  for ( int k=0; k<pr.numAtoms; ++k ) {
1592  int j = pr.xExt[k].sortOrder;
1593  dHdrPrefix[k] = pr.dHdrPrefix[j];
1594  }
1595  } // end phases
1596  } // end for patches
1597  } // if GBISOn
1598 
1599  kernel_time = CkWallTimer();
1600  kernel_launch_state = 1;
1601  //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
1602  if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
1603 }
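
Near the end of the listing, the pairlist flags decide whether the GPU can reuse saved pairlists: lists are rebuilt (and a new tolerance recorded, extending the list cutoff) when flags.savePairlists is set, reused only while twice the maximum atom movement stays within the recorded tolerance, and invalidated otherwise. A compact sketch of that decision, with plain arguments instead of NAMD's Flags and member variables (all names hypothetical):

// Hypothetical standalone version of the pairlist reuse logic in doWork().
struct PairlistState {
  int   valid;      // pairlistsValid
  float tolerance;  // pairlistTolerance (twice the per-patch margin)
};

static void decide_pairlists(int flagSave, int flagUse,
                             float maxAtomMovement, float maxPatchTolerance,
                             float cutoff, PairlistState &st,
                             int &save, int &use, float &plcutoff2) {
  save = 0; use = 0;
  if ( flagSave ) { save = 1; use = 1; }
  else if ( flagUse && st.valid && 2.f * maxAtomMovement <= st.tolerance ) use = 1;
  // (the real code also counts a pairlist warning when reuse fails)
  if ( ! use ) st.valid = 0;
  float plcutoff = cutoff;
  if ( save ) {               // rebuilding: extend the cutoff by the new margin
    st.valid = 1;
    st.tolerance = 2.f * maxPatchTolerance;
    plcutoff += st.tolerance;
  }
  plcutoff2 = plcutoff * plcutoff;
}
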
void ComputeNonbondedCUDA::finishPatch ( int  flindex)
virtual

Reimplemented from Compute.

Definition at line 1864 of file ComputeNonbondedCUDA.C.

References activePatches, Box< Owner, Data >::close(), ComputeNonbondedCUDA::patch_record::forceBox, master, Box< Owner, Data >::open(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, and ComputeNonbondedCUDA::patch_record::x.

Referenced by finishWork().

1864  {
1865  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
1866  patch_record &pr = master->patchRecords[master->activePatches[flindex]];
1867  pr.r = pr.forceBox->open();
1868  finishPatch(pr);
1869  pr.positionBox->close(&(pr.x));
1870  pr.forceBox->close(&(pr.r));
1871 }
void ComputeNonbondedCUDA::finishReductions ( )

Definition at line 2028 of file ComputeNonbondedCUDA.C.

References ADD_TENSOR_OBJECT, Compute::basePriority, block_order, computeRecords, cuda_errcheck(), cuda_timer_count, cuda_timer_total, doEnergy, doSlow, end_local_download, end_remote_download, endi(), energy_gbis, SimParameters::GBISOn, iout, SubmitReduction::item(), localComputeRecords, num_virials, Node::Object(), SimParameters::outputCudaTiming, patchPairsReordered, patchRecords, PROXY_DATA_PRIORITY, reduction, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, remoteComputeRecords, Node::simParameters, simParams, slow_virials, start_calc, step, SubmitReduction::submit(), virials, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by finishWork().

2028  {
2029  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
2030 
2031  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
2032 
2034 
2035  if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
2036  if ( patchPairsReordered++ == 2) {
2037  int patch_len = patchRecords.size();
2038  ResizeArray<int> plast(patch_len);
2039  for ( int i=0; i<patch_len; ++i ) {
2040  plast[i] = -1;
2041  }
2042  int order_len = localComputeRecords.size()+remoteComputeRecords.size();
2043  for ( int i=0; i<order_len; ++i ) {
2044  plast[computeRecords[block_order[i]].pid[0]] = i;
2045  iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
2046  }
2047  iout << endi;
2048  for ( int i=0; i<patch_len; ++i ) {
2049  iout << "patch_last " << i << " " << plast[i] << "\n";
2050  }
2051  iout << endi;
2052  }
2053  }
2054 
2055  {
2056  Tensor virial_tensor;
2057  BigReal energyv = 0.;
2058  BigReal energye = 0.;
2059  BigReal energys = 0.;
2060  int nexcluded = 0;
2061  for ( int i = 0; i < num_virials; ++i ) {
2062  virial_tensor.xx += virials[16*i];
2063  virial_tensor.xy += virials[16*i+1];
2064  virial_tensor.xz += virials[16*i+2];
2065  virial_tensor.yx += virials[16*i+3];
2066  virial_tensor.yy += virials[16*i+4];
2067  virial_tensor.yz += virials[16*i+5];
2068  virial_tensor.zx += virials[16*i+6];
2069  virial_tensor.zy += virials[16*i+7];
2070  virial_tensor.zz += virials[16*i+8];
2071  energyv += virials[16*i+9];
2072  energye += virials[16*i+10];
2073  energys += virials[16*i+11];
2074  nexcluded += ((int *)virials)[16*i+12];
2075  if (simParams->GBISOn) {
2076  energye += energy_gbis[i];
2077  }
2078  }
2080  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
2081  if ( doEnergy ) {
2082  reduction->item(REDUCTION_LJ_ENERGY) += energyv;
2083  reduction->item(REDUCTION_ELECT_ENERGY) += energye;
2085  }
2086  }
2087  if ( doSlow ) {
2088  Tensor virial_slow_tensor;
2089  for ( int i = 0; i < num_virials; ++i ) {
2090  virial_slow_tensor.xx += slow_virials[16*i];
2091  virial_slow_tensor.xy += slow_virials[16*i+1];
2092  virial_slow_tensor.xz += slow_virials[16*i+2];
2093  virial_slow_tensor.yx += slow_virials[16*i+3];
2094  virial_slow_tensor.yy += slow_virials[16*i+4];
2095  virial_slow_tensor.yz += slow_virials[16*i+5];
2096  virial_slow_tensor.zx += slow_virials[16*i+6];
2097  virial_slow_tensor.zy += slow_virials[16*i+7];
2098  virial_slow_tensor.zz += slow_virials[16*i+8];
2099  }
2100  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
2101  }
2102 
2103  reduction->submit();
2104 
2105  cuda_timer_count++;
2106  if ( simParams->outputCudaTiming &&
2107  step % simParams->outputCudaTiming == 0 ) {
2108 
2109  // int natoms = mol->numAtoms;
2110  // double wpa = wcount; wpa /= natoms;
2111 
2112  // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
2113  // kernel_time * 1.e3, time * 1.e3, wpa);
2114 
2115 #if 0
2116  float upload_ms, remote_calc_ms;
2117  float local_calc_ms, total_ms;
2118  cuda_errcheck("before event timers");
2119  cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
2120  cuda_errcheck("in event timer 1");
2121  cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
2122  cuda_errcheck("in event timer 2");
2123  cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
2124  cuda_errcheck("in event timer 3");
2125  cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
2126  cuda_errcheck("in event timer 4");
2127  cuda_errcheck("in event timers");
2128 
2129  CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
2130  CkMyPe(), upload_ms, remote_calc_ms,
2131  local_calc_ms, total_ms);
2132 #endif
2133 
2134  if ( cuda_timer_count >= simParams->outputCudaTiming ) {
2136  CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
2137  step, cuda_timer_total * 1.e3, CkMyPe());
2138  }
2139  cuda_timer_count = 0;
2140  cuda_timer_total = 0;
2141  }
2142 
2143 }
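
The loop above consumes a fixed 16-float block per patch from the host-mapped virials buffer: nine virial tensor components, three energy terms, and an exclusion count stored as an int in the thirteenth slot; slow_virials uses the same layout for the slow (PME-corrected) contribution. A hypothetical annotated view of one record (these names are not NAMD declarations):

// One 16-float record of the `virials` array, as indexed by the loop above.
struct VirialRecord {
  float vxx, vxy, vxz;   // virials[16*i + 0 .. 2]
  float vyx, vyy, vyz;   // virials[16*i + 3 .. 5]
  float vzx, vzy, vzz;   // virials[16*i + 6 .. 8]
  float energy_vdw;      // virials[16*i + 9]  -> REDUCTION_LJ_ENERGY
  float energy_elect;    // virials[16*i + 10] -> REDUCTION_ELECT_ENERGY
  float energy_slow;     // virials[16*i + 11] -> REDUCTION_ELECT_ENERGY_SLOW
  int   excluded;        // virials[16*i + 12], read through an int cast
  float unused[3];       // remaining slots of the 16-float block
};
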
int ComputeNonbondedCUDA::finishWork ( )

Definition at line 1902 of file ComputeNonbondedCUDA.C.

References atomsChanged, ComputeNonbondedCUDA::patch_record::bornRad, ComputeNonbondedCUDA::patch_record::bornRadBox, Box< Owner, Data >::close(), computeMgr, cuda_timer_total, ComputeNonbondedCUDA::patch_record::dEdaSum, ComputeNonbondedCUDA::patch_record::dEdaSumBox, dEdaSumH, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, doSlow, finishPatch(), finishReductions(), ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::intRadBox, SubmitReduction::item(), kernel_time, localHostedPatches, ComputeNonbondedCUDA::patch_record::localStart, master, ComputeNonbondedUtil::mol, Node::molecule, ComputeNonbondedCUDA::patch_record::numAtoms, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::patchID, patchRecords, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, Compute::priority(), ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::psiSumBox, psiSumH, ComputeNonbondedCUDA::patch_record::r, reduction, remoteHostedPatches, ComputeMgr::sendNonbondedCUDASlaveEnqueue(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< T >::size(), slavePes, slaves, CompAtomExt::sortOrder, workStarted, ComputeNonbondedCUDA::patch_record::x, Vector::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

Referenced by doWork().

1902  {
1903  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
1904 
1905  if ( master == this ) {
1906  for ( int i = 0; i < numSlaves; ++i ) {
1907  computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),i);
1908  }
1909  }
1910 
1911 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
1912 CkMyPe(), workStarted, gbisPhase)
1913 
1914  Molecule *mol = Node::Object()->molecule;
1915  SimParameters *simParams = Node::Object()->simParameters;
1916 
1917  ResizeArray<int> &patches( workStarted == 1 ?
1918  remoteHostedPatches : localHostedPatches );
1919 
1920  // long long int wcount = 0;
1921  // double virial = 0.;
1922  // double virial_slow = 0.;
1923  for ( int i=0; i<patches.size(); ++i ) {
1924  // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
1925  patch_record &pr = master->patchRecords[patches[i]];
1926 
1927  if ( !simParams->GBISOn || gbisPhase == 1 ) {
1928  patch_record &pr = master->patchRecords[patches[i]];
1929 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
1930  pr.r = pr.forceBox->open();
1931  } // !GBISOn || gbisPhase==1
1932 
1933  int start = pr.localStart;
1934  const CompAtomExt *aExt = pr.xExt;
1935  if ( !simParams->GBISOn || gbisPhase == 3 ) {
1936  finishPatch(pr);
1937  } // !GBISOn || gbisPhase == 3
1938 
1939 #if 0
1940  if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
1941  CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
1942  j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
1943  af[j].w);
1944  }
1945 #endif
1946 
1947  //Close Boxes depending on Phase
1948  if (simParams->GBISOn) {
1949  if (gbisPhase == 1) {
1950  //Copy psiSum from Host to Patch Box
1951  GBReal *psiSumMaster = master->psiSumH + start;
1952  for ( int k=0; k<pr.numAtoms; ++k ) {
1953  int j = aExt[k].sortOrder;
1954  pr.psiSum[j] += psiSumMaster[k];
1955  }
1956 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
1957  pr.psiSumBox->close(&(pr.psiSum));
1958 
1959  } else if (gbisPhase == 2) {
1960  //Copy dEdaSum from Host to Patch Box
1961  GBReal *dEdaSumMaster = master->dEdaSumH + start;
1962  for ( int k=0; k<pr.numAtoms; ++k ) {
1963  int j = aExt[k].sortOrder;
1964  pr.dEdaSum[j] += dEdaSumMaster[k];
1965  }
1966 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
1967  pr.dEdaSumBox->close(&(pr.dEdaSum));
1968 
1969  } else if (gbisPhase == 3) {
1970 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
1971  pr.intRadBox->close(&(pr.intRad)); //box 6
1972  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1973  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1974  pr.positionBox->close(&(pr.x)); //box 0
1975  pr.forceBox->close(&(pr.r));
1976  } //end phases
1977  } else { //not GBIS
1978 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
1979  pr.positionBox->close(&(pr.x));
1980  pr.forceBox->close(&(pr.r));
1981  }
1982  }// end for
1983 
1984 #if 0
1985  virial *= (-1./6.);
1986  reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
1987  reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
1988  reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
1989  if ( doSlow ) {
1990  virial_slow *= (-1./6.);
1991  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
1992  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
1993  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
1994  }
1995 #endif
1996 
1997  if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
1998  ( localHostedPatches.size() || master == this ) ) {
1999  GBISP("not finished, call again\n");
2000  return 0; // not finished, call again
2001  }
2002 
2003  if ( master != this ) { // finished
2004  GBISP("finished\n");
2005  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2006  atomsChanged = 0;
2007  return 1;
2008  }
2009 
2010  cuda_timer_total += kernel_time;
2011 
2012  if ( !simParams->GBISOn || gbisPhase == 3 ) {
2013 
2014  atomsChanged = 0;
2015  finishReductions();
2016 
2017  } // !GBISOn || gbisPhase==3
2018 
2019  // Next GBIS Phase
2020 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
2021  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2022 
2023  GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
2024  return 1; // finished and ready for next step
2025 }
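For the GBIS phases, the loop above scatters per-atom results from the master's host buffers (psiSumH, dEdaSumH) back into each patch's box, using the atom's sortOrder field to map a host-buffer entry to its destination slot. A minimal sketch of that scatter pattern, with simplified stand-in types rather than the NAMD declarations:

  #include <cstddef>
  #include <vector>

  // hostResults: values for one patch, starting at its localStart offset
  // sortOrder:   per-atom index giving the destination slot in the patch box
  // patchBox:    accumulated into, then closed so dependent computes can run
  void scatterBySortOrder(const std::vector<float> &hostResults,
                          const std::vector<int> &sortOrder,
                          std::vector<float> &patchBox)
  {
    for (std::size_t k = 0; k < hostResults.size(); ++k) {
      const int j = sortOrder[k];     // aExt[k].sortOrder in the listing
      patchBox[j] += hostResults[k];  // pr.psiSum[j] += psiSumMaster[k]
    }
  }

Closing the box afterwards (pr.psiSumBox->close(...) and the like) is what releases the data to whichever compute consumes it in the next GBIS phase.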
void ComputeNonbondedCUDA::messageFinishPatch ( int  flindex)

Definition at line 1857 of file ComputeNonbondedCUDA.C.

References activePatches, computeMgr, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::msg, patchRecords, PROXY_DATA_PRIORITY, ComputeMgr::sendNonbondedCUDASlaveEnqueuePatch(), Compute::sequence(), and ComputeNonbondedCUDA::patch_record::slave.

1857  {
1858  int pid = activePatches[flindex];
1859  patch_record &pr = patchRecords[pid];
1860  //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
1861  computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
1862 }
int ComputeNonbondedCUDA::noWork ( )
virtual

Reimplemented from Compute.

Definition at line 1108 of file ComputeNonbondedCUDA.C.

References atomsChanged, ComputeNonbondedCUDA::patch_record::bornRad, ComputeNonbondedCUDA::patch_record::bornRadBox, computeMgr, ComputeNonbondedCUDA::patch_record::dEdaSum, ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, Flags::doEnergy, doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, doSlow, SimParameters::GBISOn, GBISP, Compute::gbisPhase, Patch::getCompAtomExtInfo(), hostedPatches, ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::intRadBox, Flags::lattice, lattice, master, masterPe, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::p, ComputeNonbondedCUDA::patch_record::patchID, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::psiSumBox, reduction, ComputeMgr::sendNonbondedCUDASlaveReady(), ComputeMgr::sendNonbondedCUDASlaveSkip(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< T >::size(), skip(), slavePes, slaves, Flags::step, step, SubmitReduction::submit(), workStarted, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

1108  {
1109 
1111  Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
1112  lattice = flags.lattice;
1113  doSlow = flags.doFullElectrostatics;
1114  doEnergy = flags.doEnergy;
1115  step = flags.step;
1116 
1117  if ( ! flags.doNonbonded ) {
1118  GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1119  if ( master != this ) {
1122  } else {
1123  for ( int i = 0; i < numSlaves; ++i ) {
1125  }
1126  skip();
1127  }
1128  if ( reduction ) {
1129  reduction->submit();
1130  }
1131 
1132  return 1;
1133  }
1134 
1135  for ( int i=0; i<hostedPatches.size(); ++i ) {
1136  patch_record &pr = master->patchRecords[hostedPatches[i]];
1137  if (!simParams->GBISOn || gbisPhase == 1) {
1138  GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1139  pr.x = pr.positionBox->open();
1140  pr.xExt = pr.p->getCompAtomExtInfo();
1141  }
1142 
1143  if (simParams->GBISOn) {
1144  if (gbisPhase == 1) {
1145  GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
1146  pr.intRad = pr.intRadBox->open();
1147  pr.psiSum = pr.psiSumBox->open();
1148  } else if (gbisPhase == 2) {
1149  GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
1150  pr.bornRad = pr.bornRadBox->open();
1151  pr.dEdaSum = pr.dEdaSumBox->open();
1152  } else if (gbisPhase == 3) {
1153  GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
1154  pr.dHdrPrefix = pr.dHdrPrefixBox->open();
1155  }
1156  GBISP("opened GBIS boxes");
1157  }
1158  }
1159 
1160  if ( master == this ) return 0; //work to do, enqueue as usual
1161 
1162  // message masterPe
1165  atomsChanged = 0;
1166 
1167  workStarted = 1;
1168 
1169  return 1;
1170 }
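noWork() follows the Compute convention that a return value of 1 means nothing needs to be enqueued for this step, while 0 lets the scheduler enqueue doWork(). A condensed sketch of the gate implemented above, with placeholder types (the real method also opens position and GBIS boxes before falling through):

  struct StepFlags { bool doNonbonded, doFullElectrostatics, doEnergy; int step; };

  // Returns 1 if this compute is done for the step, 0 if work must be enqueued.
  int gateStep(const StepFlags &flags, bool isMaster) {
    if (!flags.doNonbonded) {
      // skip every registered box so dependents are not left waiting,
      // and still submit the reduction to keep the sequencing intact
      return 1;
    }
    // open position/force (and GBIS) boxes for all hosted patches here
    if (isMaster) return 0;   // master enqueues its work as usual
    return 1;                 // a slave only signals the master PE that it is ready
  }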
void ComputeNonbondedCUDA::recvYieldDevice ( int  pe)

Definition at line 1633 of file ComputeNonbondedCUDA.C.

References Lattice::a(), SimParameters::alpha_cutoff, atom_params, atoms, atomsChanged, Lattice::b(), block_order, bornRadH, Lattice::c(), CcdCallBacksReset(), SimParameters::coulomb_radius_offset, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_vdw_types(), cuda_bind_virials(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), CUDA_POLL, SimParameters::cutoff, ComputeNonbondedUtil::cutoff2, dEdaSumH, deviceCUDA, deviceID, dHdrPrefixH, SimParameters::dielectric, doEnergy, doSlow, dummy_dev, dummy_size, end_local_download, end_remote_download, energy_gbis, force_ready_queue, force_ready_queue_next, forces, SimParameters::fsMax, SimParameters::GBISOn, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), intRad0H, intRadSH, SimParameters::kappa, kernel_launch_state, lata, latb, latc, lattice, localActivePatches, localComputeRecords, SimParameters::nonbondedScaling, Node::Object(), patchPairsReordered, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, psiSumH, remote_submit_time, remoteActivePatches, remoteComputeRecords, savePairlists, Compute::sequence(), DeviceCUDA::setGpuIsMine(), Node::simParameters, simParams, ResizeArray< T >::size(), slow_forces, SimParameters::solvent_dielectric, start_calc, stream, stream2, SimParameters::switchingActive, SimParameters::switchingDist, usePairlists, SimParameters::usePMECUDA, vdw_types, virials, workStarted, Vector::x, Vector::y, and Vector::z.

Referenced by doWork(), and ComputeMgr::recvYieldDevice().

1633  {
1634 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
1635 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
1636 workStarted, gbisPhase, kernel_launch_state, pe)
1637 
1638  float3 lata, latb, latc;
1639  lata.x = lattice.a().x;
1640  lata.y = lattice.a().y;
1641  lata.z = lattice.a().z;
1642  latb.x = lattice.b().x;
1643  latb.y = lattice.b().y;
1644  latb.z = lattice.b().z;
1645  latc.x = lattice.c().x;
1646  latc.y = lattice.c().y;
1647  latc.z = lattice.c().z;
1648  SimParameters *simParams = Node::Object()->simParameters;
1649 
1650  // Set GPU device ID
1651  cudaSetDevice(deviceID);
1652 
1653  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1654 
1655  double walltime;
1656  if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
1657  walltime = CkWallTimer();
1658  CcdCallBacksReset(0,walltime); // fix Charm++
1659  }
1660 
1661  switch ( kernel_launch_state ) {
1663 // Remote
1664  case 1:
1665 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
1667  //gpu_is_mine = 0;
1668  deviceCUDA->setGpuIsMine(0);
1669  remote_submit_time = walltime;
1670 
1671  if (!simParams->GBISOn || gbisPhase == 1) {
1672  // cudaEventRecord(start_upload, stream);
1673  if ( atomsChanged ) {
1676  }
1677  if ( simParams->GBISOn) {
1679  if ( atomsChanged ) {
1681  }
1682  }
1683  atomsChanged = 0;
1684  cuda_bind_atoms((const atom *)atoms);
1687  if ( simParams->GBISOn) {
1689  }
1690  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1691  //call CUDA Kernels
1692 
1694  0,remoteComputeRecords.size(),
1697  if (simParams->GBISOn) {
1698  cuda_GBIS_P1(
1699  0,remoteComputeRecords.size(),
1701  simParams->alpha_cutoff-simParams->fsMax,
1702  simParams->coulomb_radius_offset,
1703  lata, latb, latc, stream);
1704  }
1705  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1706  // num_local_atom_records,num_remote_atom_records);
1707  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1708  cudaEventRecord(end_remote_download, stream);
1709  }
1710  if ( streaming ) {
1713  } else {
1715  }
1716  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1718  break;
1719  }
1720  } // !GBIS or gbisPhase==1
1721  if (simParams->GBISOn) {
1722  if (gbisPhase == 1) {
1723  //GBIS P1 Kernel launched in previous code block
1724  } else if (gbisPhase == 2) {
1725 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
1726  // cudaEventRecord(start_upload, stream);
1729  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1730  cuda_GBIS_P2(
1731  0,remoteComputeRecords.size(),
1733  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1734  simParams->nonbondedScaling, simParams->kappa,
1735  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1736  simParams->dielectric, simParams->solvent_dielectric,
1737  lata, latb, latc,
1738  doEnergy, doSlow, stream
1739  );
1740  cudaEventRecord(end_remote_download, stream);
1742  } else if (gbisPhase == 3) {
1743 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
1744  // cudaEventRecord(start_upload, stream);
1746  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1747  if (doSlow)
1748  cuda_GBIS_P3(
1749  0,remoteComputeRecords.size(),
1751  (simParams->alpha_cutoff-simParams->fsMax),
1752  simParams->coulomb_radius_offset,
1753  simParams->nonbondedScaling,
1754  lata, latb, latc, stream
1755  );
1756  cudaEventRecord(end_remote_download, stream);
1757  CUDA_POLL(cuda_check_remote_progress,this);
1758  }
1759  }
1760 
1762 // Local
1763  case 2:
1764 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
1766  //gpu_is_mine = 0;
1767  deviceCUDA->setGpuIsMine(0);
1768 
1769  if ( stream2 != stream ) {
1770  // needed to ensure that upload is finished
1771  cudaStreamWaitEvent(stream2, start_calc, 0);
1772  // priorities do not prevent local from launching ahead
1773  // of remote, so delay local stream with a small memset
1774  cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
1775  }
1776 
1777  if (!simParams->GBISOn || gbisPhase == 1) {
1778 
1779  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1783  if (simParams->GBISOn) {
1784  cuda_GBIS_P1(
1787  simParams->alpha_cutoff-simParams->fsMax,
1788  simParams->coulomb_radius_offset,
1789  lata, latb, latc, stream2 );
1790  }
1791  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1792  // 0,num_local_atom_records);
1793  //cuda_load_virials(virials, doSlow); // slow_virials follows virials
1794  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1795  cudaEventRecord(end_local_download, stream2);
1796  }
1797  if ( ! streaming && workStarted == 2 ) {
1798  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1799 cuda_check_local_progress\n", CkMyPe())
1801  }
1802  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1803  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1804 cuda_check_local_calc\n", CkMyPe())
1806  break;
1807  }
1808 
1809  } // !GBIS or gbisPhase==1
1810  if (simParams->GBISOn) {
1811  if (gbisPhase == 1) {
1812  //GBIS P1 Kernel launched in previous code block
1813  } else if (gbisPhase == 2) {
1814 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
1815  cuda_GBIS_P2(
1817  0,localActivePatches.size(),
1818  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1819  simParams->nonbondedScaling, simParams->kappa,
1820  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1821  simParams->dielectric, simParams->solvent_dielectric,
1822  lata, latb, latc,
1823  doEnergy, doSlow, stream2
1824  );
1825  cudaEventRecord(end_local_download, stream2);
1826  if ( workStarted == 2 ) {
1827  CUDA_POLL(cuda_check_local_progress,this);
1828  }
1829  } else if (gbisPhase == 3) {
1830 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
1831  if (doSlow)
1832  cuda_GBIS_P3(
1834  0,localActivePatches.size(),
1835  (simParams->alpha_cutoff-simParams->fsMax),
1836  simParams->coulomb_radius_offset,
1837  simParams->nonbondedScaling,
1838  lata, latb, latc, stream2
1839  );
1840  cudaEventRecord(end_local_download, stream2);
1841  if ( workStarted == 2 ) {
1842  CUDA_POLL(cuda_check_local_progress,this);
1843  }
1844  } // phases
1845  } // GBISOn
1846  if ( simParams->PMEOn && simParams->PMEOffload && !simParams->usePMECUDA) break;
1847 
1848  default:
1849 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
1850  //gpu_is_mine = 1;
1851  deviceCUDA->setGpuIsMine(1);
1852  break;
1853  } // switch
1854 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
1855 }
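Case 2 above orders the local-work stream behind the remote-work stream when two streams are in use: the local stream waits on the start_calc event so the atom upload has completed, and a small cudaMemsetAsync on a dummy buffer delays the local kernels so the remote ones reach the device first. A minimal sketch of that ordering idiom (the names here are local to the example, not the NAMD globals):

  #include <cuda_runtime.h>

  void orderLocalAfterRemote(cudaStream_t remoteStream, cudaStream_t localStream,
                             cudaEvent_t uploadDone, void *dummyDev, size_t dummyBytes)
  {
    // mark the point on the remote stream that the local stream must not pass
    cudaEventRecord(uploadDone, remoteStream);
    // the local stream blocks (on the device) until that event has completed
    cudaStreamWaitEvent(localStream, uploadDone, 0);
    // tiny async memset: stream priorities alone do not keep the local kernels
    // from launching ahead of the remote ones, so add a short delay
    cudaMemsetAsync(dummyDev, 0, dummyBytes, localStream);
    // ... launch local nonbonded / GBIS kernels on localStream here ...
  }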
void ComputeNonbondedCUDA::registerPatches ( )

Definition at line 611 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< T >::add(), ResizeArray< T >::begin(), ComputeNonbondedCUDA::patch_record::bornRadBox, ProxyMgr::createProxy(), ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, hostedPatches, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::intRadBox, ComputeNonbondedCUDA::patch_record::isLocal, localHostedPatches, master, masterPe, ComputeNonbondedCUDA::patch_record::msg, Node::Object(), ProxyMgr::Object(), ComputeNonbondedCUDA::patch_record::p, PatchMap::patch(), patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, PRIORITY_SIZE, ComputeNonbondedCUDA::patch_record::psiSumBox, Patch::registerBornRadPickup(), Patch::registerDEdaSumDeposit(), Patch::registerDHdrPrefixPickup(), Patch::registerForceDeposit(), Patch::registerIntRadPickup(), Patch::registerPositionPickup(), Patch::registerPsiSumDeposit(), remoteHostedPatches, Compute::setNumPatches(), Node::simParameters, simParams, ResizeArray< T >::size(), and ComputeNonbondedCUDA::patch_record::slave.

Referenced by assignPatches(), and ComputeNonbondedCUDA().

611  {
612 
614  int npatches = master->activePatches.size();
615  int *pids = master->activePatches.begin();
616  patch_record *recs = master->patchRecords.begin();
617  for ( int i=0; i<npatches; ++i ) {
618  int pid = pids[i];
619  patch_record &pr = recs[pid];
620  if ( pr.hostPe == CkMyPe() ) {
621  pr.slave = this;
622  pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
623  hostedPatches.add(pid);
624  if ( pr.isLocal ) {
625  localHostedPatches.add(pid);
626  } else {
627  remoteHostedPatches.add(pid);
628  }
629  ProxyMgr::Object()->createProxy(pid);
630  pr.p = patchMap->patch(pid);
631  pr.positionBox = pr.p->registerPositionPickup(this);
632  pr.forceBox = pr.p->registerForceDeposit(this);
633  if (simParams->GBISOn) {
634  pr.intRadBox = pr.p->registerIntRadPickup(this);
635  pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
636  pr.bornRadBox = pr.p->registerBornRadPickup(this);
637  pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
638  pr.dHdrPrefixBox = pr.p->registerDHdrPrefixPickup(this);
639  }
640  }
641  }
642  if ( master == this ) setNumPatches(activePatches.size());
644  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
645  CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
646 }
void ComputeNonbondedCUDA::requirePatch ( int  pid)

Definition at line 575 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< T >::add(), ComputeNonbondedCUDA::patch_record::bornRad, computesChanged, ComputeNonbondedCUDA::patch_record::dEdaSum, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::f, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::index_a(), PatchMap::index_b(), PatchMap::index_c(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::isLocal, ComputeNonbondedCUDA::patch_record::isSameNode, ComputeNonbondedCUDA::patch_record::isSamePhysicalNode, localActivePatches, PatchMap::node(), ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::r, ComputeNonbondedCUDA::patch_record::refCount, remoteActivePatches, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

Referenced by register_cuda_compute_pair(), and register_cuda_compute_self().

575  {
576 
577  computesChanged = 1;
578  patch_record &pr = patchRecords[pid];
579  if ( pr.refCount == 0 ) {
580  pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
581  pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
582  if ( deviceCUDA->getMergeGrids() ) {
583  pr.isLocal = 0;
584  } else if ( CkNumNodes() < 2 ) {
585  pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
586  patchMap->index_b(pid) ^ patchMap->index_c(pid) );
587  } else {
588  pr.isLocal = pr.isSameNode;
589  }
590  if ( pr.isLocal ) {
591  localActivePatches.add(pid);
592  } else {
593  remoteActivePatches.add(pid);
594  }
595  activePatches.add(pid);
596  pr.patchID = pid;
597  pr.hostPe = -1;
598  pr.x = NULL;
599  pr.xExt = NULL;
600  pr.r = NULL;
601  pr.f = NULL;
602  pr.intRad = NULL;
603  pr.psiSum = NULL;
604  pr.bornRad = NULL;
605  pr.dEdaSum = NULL;
606  pr.dHdrPrefix = NULL;
607  }
608  pr.refCount += 1;
609 }
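The first time a patch is required, requirePatch() decides which work list it joins: with merged grids everything is treated as remote, on a single node patches alternate between the two lists by the parity of their grid indices (a 3-D checkerboard that roughly halves the work per list), and otherwise locality is simply whether the patch is homed on the same node. A standalone restatement of that rule, with the index accessors taken as plain ints instead of PatchMap calls:

  // ia, ib, ic: patch grid indices (PatchMap::index_a/b/c in the listing)
  bool classifyAsLocal(int ia, int ib, int ic,
                       bool sameNode, bool mergeGrids, int numNodes)
  {
    if (mergeGrids) return false;            // merged grids: everything is remote
    if (numNodes < 2)                        // single node: checkerboard split
      return (1 & (1 ^ ia ^ ib ^ ic)) != 0;  // same expression as in requirePatch()
    return sameNode;                         // multi-node: same-node patches are local
  }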
void ComputeNonbondedCUDA::skip ( void  )

Definition at line 1091 of file ComputeNonbondedCUDA.C.

References ComputeNonbondedCUDA::patch_record::bornRadBox, ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, hostedPatches, ComputeNonbondedCUDA::patch_record::intRadBox, master, Node::Object(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::psiSumBox, Node::simParameters, simParams, ResizeArray< T >::size(), and Box< Owner, Data >::skip().

Referenced by noWork(), and ComputeMgr::recvNonbondedCUDASlaveSkip().

1091  {
1092  //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
1094  for ( int i=0; i<hostedPatches.size(); ++i ) {
1095  patch_record &pr = master->patchRecords[hostedPatches[i]];
1096  pr.positionBox->skip();
1097  pr.forceBox->skip();
1098  if (simParams->GBISOn) {
1099  pr.intRadBox->skip();
1100  pr.psiSumBox->skip();
1101  pr.bornRadBox->skip();
1102  pr.dEdaSumBox->skip();
1103  pr.dHdrPrefixBox->skip();
1104  }
1105  }
1106 }

Member Data Documentation

ResizeArray<int> ComputeNonbondedCUDA::activePatches
AtomMap* ComputeNonbondedCUDA::atomMap

Definition at line 119 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

CudaAtom* ComputeNonbondedCUDA::atoms

Definition at line 142 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::atoms_size

Definition at line 141 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::atomsChanged
ResizeArray<compute_record> ComputeNonbondedCUDA::computeRecords

Definition at line 98 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), and finishReductions().

int ComputeNonbondedCUDA::computesChanged

Definition at line 132 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and requirePatch().

GBReal* ComputeNonbondedCUDA::dEdaSumH

Definition at line 111 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::dEdaSumH_size

Definition at line 110 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::deviceID

Definition at line 116 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doEnergy

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by finishReductions(), noWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doSlow

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishReductions(), finishWork(), noWork(), and recvYieldDevice().

float4* ComputeNonbondedCUDA::forces

Definition at line 102 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::forces_size

Definition at line 101 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

ResizeArray<int> ComputeNonbondedCUDA::hostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), registerPatches(), and skip().

Lattice ComputeNonbondedCUDA::lattice

Definition at line 79 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), and recvYieldDevice().

ResizeArray<int> ComputeNonbondedCUDA::localActivePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::localComputeRecords
ResizeArray<int> ComputeNonbondedCUDA::localHostedPatches
LocalWorkMsg* ComputeNonbondedCUDA::localWorkMsg2
ComputeNonbondedCUDA* ComputeNonbondedCUDA::master
int ComputeNonbondedCUDA::masterPe

Definition at line 125 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), noWork(), and registerPatches().

int ComputeNonbondedCUDA::numSlaves

Definition at line 129 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), finishWork(), and noWork().

int ComputeNonbondedCUDA::pairlistsValid

Definition at line 135 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

float ComputeNonbondedCUDA::pairlistTolerance

Definition at line 136 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

PatchMap* ComputeNonbondedCUDA::patchMap
int ComputeNonbondedCUDA::patchPairsReordered
ResizeArray<patch_record> ComputeNonbondedCUDA::patchRecords
float ComputeNonbondedCUDA::plcutoff2

Definition at line 139 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

GBReal* ComputeNonbondedCUDA::psiSumH

Definition at line 108 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::psiSumH_size

Definition at line 107 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

SubmitReduction* ComputeNonbondedCUDA::reduction
ResizeArray<int> ComputeNonbondedCUDA::remoteActivePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::remoteComputeRecords
ResizeArray<int> ComputeNonbondedCUDA::remoteHostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), and registerPatches().

int ComputeNonbondedCUDA::savePairlists

Definition at line 138 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slaveIndex

Definition at line 126 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

int* ComputeNonbondedCUDA::slavePes

Definition at line 128 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

ComputeNonbondedCUDA** ComputeNonbondedCUDA::slaves

Definition at line 127 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

float4* ComputeNonbondedCUDA::slow_forces

Definition at line 105 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slow_forces_size

Definition at line 104 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::step

Definition at line 81 of file ComputeNonbondedCUDA.h.

Referenced by finishReductions(), and noWork().

int ComputeNonbondedCUDA::usePairlists

Definition at line 137 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::workStarted

The documentation for this class was generated from the following files: