ComputeNonbondedCUDA Class Reference

#include <ComputeNonbondedCUDA.h>

Inheritance diagram for ComputeNonbondedCUDA: inherits Compute and ComputeNonbondedUtil.


Classes

struct  compute_record
struct  patch_record

Public Member Functions

 ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr, ComputeNonbondedCUDA *m=0, int idx=-1)
 ~ComputeNonbondedCUDA ()
void atomUpdate ()
void doWork ()
int noWork ()
void skip ()
void recvYieldDevice (int pe)
int finishWork ()
void finishReductions ()
void finishPatch (int)
void messageFinishPatch (int)
void requirePatch (int pid)
void assignPatches ()
void registerPatches ()

Static Public Member Functions

static void build_lj_table ()
static void build_force_table ()
static void build_exclusions ()

Public Attributes

LocalWorkMsg * localWorkMsg2
int workStarted
Lattice lattice
int doSlow
int doEnergy
int step
ResizeArray< int > activePatches
ResizeArray< int > localActivePatches
ResizeArray< int > remoteActivePatches
ResizeArray< int > hostedPatches
ResizeArray< int > localHostedPatches
ResizeArray< int > remoteHostedPatches
ResizeArray< patch_record > patchRecords
ResizeArray< compute_record > computeRecords
ResizeArray< compute_record > localComputeRecords
ResizeArray< compute_record > remoteComputeRecords
int forces_size
float4 * forces
int slow_forces_size
float4 * slow_forces
int psiSumH_size
GBReal * psiSumH
int dEdaSumH_size
GBReal * dEdaSumH
int deviceID
PatchMap * patchMap
AtomMap * atomMap
SubmitReduction * reduction
ComputeNonbondedCUDA * master
int masterPe
int slaveIndex
ComputeNonbondedCUDA ** slaves
int * slavePes
int numSlaves
int atomsChanged
int computesChanged
int patchPairsReordered
int pairlistsValid
float pairlistTolerance
int usePairlists
int savePairlists
float plcutoff2
int atoms_size
CudaAtom * atoms

Detailed Description

Definition at line 20 of file ComputeNonbondedCUDA.h.


Constructor & Destructor Documentation

ComputeNonbondedCUDA::ComputeNonbondedCUDA ( ComputeID  c,
ComputeMgr * mgr,
ComputeNonbondedCUDA * m = 0,
int  idx = -1 
)

Definition at line 433 of file ComputeNonbondedCUDA.C.

References atomMap, atoms, atoms_size, atomsChanged, Compute::basePriority, build_exclusions(), computeMgr, computesChanged, cuda_errcheck(), cuda_init(), cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, end_local_download, end_remote_download, forces, forces_size, SimParameters::GBISOn, DeviceCUDA::getDeviceID(), DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), init_arrays(), localWorkMsg2, master, masterPe, max_grid_size, NAMD_bug(), NAMD_die(), PatchMap::numPatches(), Node::Object(), AtomMap::Object(), PatchMap::Object(), pairlistsValid, pairlistTolerance, patch_pair_num_ptr, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, SimParameters::pressureProfileOn, PRIORITY_SIZE, PROXY_DATA_PRIORITY, psiSumH, psiSumH_size, reduction, registerPatches(), ResizeArray< Elem >::resize(), savePairlists, DeviceCUDA::setGpuIsMine(), DeviceCUDA::setMergeGrids(), Node::simParameters, slaveIndex, slavePes, slaves, slow_forces, slow_forces_size, start_calc, stream, stream2, usePairlists, SimParameters::usePMECUDA, and workStarted.

00434                                                                              : Compute(c), slaveIndex(idx) {
00435 #ifdef PRINT_GBIS
00436    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00437 #endif
00438 
00439   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00440   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00441   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00442 
00443   // CkPrintf("create ComputeNonbondedCUDA\n");
00444   master = m ? m : this;
00445   cudaCompute = this;
00446   computeMgr = mgr;
00447   patchMap = PatchMap::Object();
00448   atomMap = AtomMap::Object();
00449   reduction = 0;
00450 
00451   SimParameters *params = Node::Object()->simParameters;
00452   if (params->pressureProfileOn) {
00453     NAMD_die("pressure profile not supported in CUDA");
00454   }
00455 
00456   atomsChanged = 1;
00457   computesChanged = 1;
00458   patchPairsReordered = 1;
00459 
00460   pairlistsValid = 0;
00461   pairlistTolerance = 0.;
00462   usePairlists = 0;
00463   savePairlists = 0;
00464   plcutoff2 = 0.;
00465 
00466   workStarted = 0;
00467   basePriority = PROXY_DATA_PRIORITY;
00468   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00469 
00470   // Zero array sizes and pointers
00471   init_arrays();
00472 
00473   atoms_size = 0;
00474   atoms = NULL;
00475 
00476   forces_size = 0;
00477   forces = NULL;
00478   
00479   slow_forces_size = 0;
00480   slow_forces = NULL;
00481 
00482   psiSumH_size = 0;
00483   psiSumH = NULL;
00484 
00485   dEdaSumH_size = 0;
00486   dEdaSumH = NULL;
00487 
00488   if ( master != this ) { // I am slave
00489     masterPe = master->masterPe;
00490     master->slaves[slaveIndex] = this;
00491     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00492       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00493     }
00494     deviceID = master->deviceID;
00495     registerPatches();
00496     return;
00497   }
00498   masterPe = CkMyPe();
00499 
00500   const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
00501   if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
00502     deviceCUDA->setMergeGrids(1);
00503 
00504   // Sanity checks for New CUDA kernel
00505   if (params->GBISOn) {
00506     // GBIS
00507     if (deviceCUDA->getNoMergeGrids()) {
00508       NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
00509     }
00510     // Set mergegrids ON as long as user hasn't defined +nomergegrids
00511     if (!deviceCUDA->getNoMergeGrids()) deviceCUDA->setMergeGrids(1);
00512     // Final sanity check
00513     if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
00514   } else {
00515     // non-GBIS
00516     if (deviceCUDA->getNoStreaming() && !deviceCUDA->getMergeGrids()) {
00517       NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
00518     }
00519   }
00520 
00521 #if CUDA_VERSION >= 5050
00522   int leastPriority, greatestPriority;
00523   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00524   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00525   if ( leastPriority != greatestPriority ) {
00526     if ( CkMyNode() == 0 ) {
00527       int dev = deviceCUDA->getDeviceID();
00528       CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
00529     }
00530     if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
00531       greatestPriority = leastPriority;
00532     }
00533     if (params->usePMECUDA) greatestPriority = leastPriority;
00534     cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
00535     cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
00536   } else
00537 #endif
00538   {
00539     cudaStreamCreate(&stream);
00540     cuda_errcheck("cudaStreamCreate");
00541     int dev = deviceCUDA->getDeviceID();
00542     cudaDeviceProp deviceProp;
00543     cudaGetDeviceProperties(&deviceProp, dev);
00544     cuda_errcheck("cudaGetDeviceProperties");
00545     if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
00546       if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
00547       cudaStreamCreate(&stream2);
00548     } else {
00549       stream2 = stream;
00550     }
00551   }
00552   cuda_errcheck("cudaStreamCreate");
00553 
00554   // Get GPU device ID
00555   deviceID = deviceCUDA->getDeviceID();
00556 
00557   cuda_init();
00558   if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
00559   build_exclusions();
00560   // cudaEventCreate(&start_upload);
00561   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00562   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00563   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00564 
00565   patchRecords.resize(patchMap->numPatches());
00566   patch_pairs_ptr = new ResizeArray<patch_pair>;
00567   patch_pair_num_ptr = new ResizeArray<int>;
00568 
00569   if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
00570 }

ComputeNonbondedCUDA::~ComputeNonbondedCUDA (  ) 

Definition at line 573 of file ComputeNonbondedCUDA.C.

00573 { ; }


Member Function Documentation

void ComputeNonbondedCUDA::assignPatches (  ) 

Definition at line 657 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), computeMgr, deviceCUDA, DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputeNonbondedCUDA::patch_record::hostPe, j, PatchMap::node(), numSlaves, ReductionMgr::Object(), PatchMap::ObjectOnPe(), patchMap, patchRecords, reduction, REDUCTIONS_BASIC, registerPatches(), ComputeMgr::sendCreateNonbondedCUDASlave(), ResizeArray< Elem >::size(), slavePes, slaves, sort, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00657                                          {
00658 
00659   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00660   int numPesOnNodeSharingDevice = 0;
00661   int masterIndex = -1;
00662   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00663     int pe = deviceCUDA->getPesSharingDevice(i);
00664     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00665     if ( CkNodeOf(pe) == CkMyNode() ) {
00666       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00667     }
00668   }
00669 
00670   int npatches = activePatches.size();
00671 
00672   if ( npatches ) {
00673     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00674   }
00675 
00676   // calculate priority rank of local home patch within pe
00677   {
00678     ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
00679     for ( int i=0; i<npatches; ++i ) {
00680       int pid = activePatches[i];
00681       int homePe = patchMap->node(pid);
00682       if ( CkNodeOf(homePe) == CkMyNode() ) {
00683         homePatchByRank[CkRankOf(homePe)].add(pid);
00684       }
00685     }
00686     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00687       pid_sortop_reverse_priority so;
00688       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00689       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00690       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00691         int pid = homePatchByRank[i][j];
00692         patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
00693       }
00694     }
00695   }
00696 
00697   int *count = new int[npatches];
00698   memset(count, 0, sizeof(int)*npatches);
00699   int *pcount = new int[numPesOnNodeSharingDevice];
00700   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00701   int *rankpcount = new int[CkMyNodeSize()];
00702   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00703   char *table = new char[npatches*numPesOnNodeSharingDevice];
00704   memset(table, 0, npatches*numPesOnNodeSharingDevice);
00705 
00706   int unassignedpatches = npatches;
00707 
00708   if ( 0 ) { // assign all to device pe
00709     for ( int i=0; i<npatches; ++i ) {
00710       int pid = activePatches[i];
00711       patch_record &pr = patchRecords[pid];
00712       pr.hostPe = CkMyPe();
00713     }
00714     unassignedpatches = 0;
00715     pcount[masterIndex] = npatches;
00716   } else 
00717 
00718   // assign if home pe and build table of natural proxies
00719   for ( int i=0; i<npatches; ++i ) {
00720     int pid = activePatches[i];
00721     patch_record &pr = patchRecords[pid];
00722     int homePe = patchMap->node(pid);
00723     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00724       int pe = pesOnNodeSharingDevice[j];
00725       if ( pe == homePe ) {
00726         pr.hostPe = pe;  --unassignedpatches;
00727         pcount[j] += 1;
00728       }
00729       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00730         table[i*numPesOnNodeSharingDevice+j] = 1;
00731       }
00732     }
00733     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00734       pr.hostPe = homePe;  --unassignedpatches;
00735       rankpcount[CkRankOf(homePe)] += 1;
00736     }
00737   }
00738   // assign if only one pe has a required proxy
00739   int assignj = 0;
00740   for ( int i=0; i<npatches; ++i ) {
00741     int pid = activePatches[i];
00742     patch_record &pr = patchRecords[pid];
00743     if ( pr.hostPe != -1 ) continue;
00744     int c = 0;
00745     int lastj;
00746     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00747       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00748     }
00749     count[i] = c;
00750     if ( c == 1 ) {
00751       pr.hostPe = pesOnNodeSharingDevice[lastj];
00752       --unassignedpatches;
00753       pcount[lastj] += 1;
00754     }
00755   }
00756   while ( unassignedpatches ) {
00757     int i;
00758     for ( i=0; i<npatches; ++i ) {
00759       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00760       int pid = activePatches[i];
00761       patch_record &pr = patchRecords[pid];
00762       if ( pr.hostPe != -1 ) continue;
00763       pr.hostPe = pesOnNodeSharingDevice[assignj];
00764       --unassignedpatches;
00765       pcount[assignj] += 1;
00766       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00767       break;
00768     }
00769     if ( i<npatches ) continue;  // start search again
00770     for ( i=0; i<npatches; ++i ) {
00771       int pid = activePatches[i];
00772       patch_record &pr = patchRecords[pid];
00773       if ( pr.hostPe != -1 ) continue;
00774       if ( count[i] ) continue;
00775       pr.hostPe = pesOnNodeSharingDevice[assignj];
00776       --unassignedpatches;
00777       pcount[assignj] += 1;
00778       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00779       break;
00780     }
00781     if ( i<npatches ) continue;  // start search again
00782     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00783   }
00784 
00785   for ( int i=0; i<npatches; ++i ) {
00786     int pid = activePatches[i];
00787     patch_record &pr = patchRecords[pid];
00788     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
00789   }
00790 
00791   slavePes = new int[CkMyNodeSize()];
00792   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00793   numSlaves = 0;
00794   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00795     int pe = pesOnNodeSharingDevice[j];
00796     int rank = pe - CkNodeFirst(CkMyNode());
00797     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00798     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00799     if ( pe == CkMyPe() ) continue;
00800     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00801     rankpcount[rank] = 0;  // skip in rank loop below
00802     slavePes[numSlaves] = pe;
00803     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00804     ++numSlaves;
00805   }
00806   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00807     int pe = CkNodeFirst(CkMyNode()) + j;
00808     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00809     //          CkMyPe(),j,pe,rankpcount[j]);
00810     if ( ! rankpcount[j] ) continue;
00811     if ( pe == CkMyPe() ) continue;
00812     slavePes[numSlaves] = pe;
00813     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00814     ++numSlaves;
00815   }
00816   registerPatches();
00817 
00818   delete [] pesOnNodeSharingDevice;
00819   delete [] count;
00820   delete [] pcount;
00821   delete [] rankpcount;
00822   delete [] table;
00823 }
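
After the home-PE and single-proxy passes above, the remaining patches are handed out round-robin across the PEs sharing the device, preferring PEs that already hold a proxy for the patch. A simplified sketch of that fallback (an assumption for illustration; hostPe, proxyTable, and the helper name are hypothetical, not the actual loop above):

    #include <vector>

    // hostPe[i] is the assigned pe index for patch i, or -1 if still unassigned;
    // proxyTable[i][j] is nonzero when pe j already holds a proxy for patch i.
    void round_robin_assign(std::vector<int> &hostPe,
                            const std::vector<std::vector<char> > &proxyTable,
                            int numPes) {
      int next = 0;
      for (size_t i = 0; i < hostPe.size() && numPes > 0; ++i) {
        if (hostPe[i] != -1) continue;
        int pick = next;                     // default: plain round-robin
        for (int k = 0; k < numPes; ++k) {   // prefer a pe with a natural proxy
          int j = (next + k) % numPes;
          if (proxyTable[i][j]) { pick = j; break; }
        }
        hostPe[i] = pick;
        next = (pick + 1) % numPes;
      }
    }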

void ComputeNonbondedCUDA::atomUpdate ( void   )  [virtual]

Reimplemented from Compute.

Definition at line 1038 of file ComputeNonbondedCUDA.C.

References atomsChanged.

01038                                       {
01039   //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
01040   atomsChanged = 1;
01041 }

void ComputeNonbondedCUDA::build_exclusions (  )  [static]

Definition at line 255 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), cuda_bind_exclusions(), ResizeArray< Elem >::end(), exclusionsByAtom, ExclusionSignature::fullExclCnt, ExclusionSignature::fullOffset, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), j, MAX_EXCLUSIONS, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), NAMD_die(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< Elem >::size(), and SortableResizeArray< Elem >::sort().

Referenced by build_cuda_exclusions(), and ComputeNonbondedCUDA().

00255                                             {
00256   Molecule *mol = Node::Object()->molecule;
00257 
00258 #ifdef MEM_OPT_VERSION
00259   int natoms = mol->exclSigPoolSize;
00260 #else
00261   int natoms = mol->numAtoms; 
00262 #endif
00263 
00264   delete [] exclusionsByAtom;
00265   exclusionsByAtom = new int2[natoms];
00266 
00267   // create unique sorted lists
00268 
00269   ObjectArena<int32> listArena;
00270   ResizeArray<int32*> unique_lists;
00271   int32 **listsByAtom = new int32*[natoms];
00272   SortableResizeArray<int32> curList;
00273   for ( int i=0; i<natoms; ++i ) {
00274     curList.resize(0);
00275     curList.add(0);  // always excluded from self
00276 #ifdef MEM_OPT_VERSION
00277     const ExclusionSignature *sig = mol->exclSigPool + i;
00278     int n = sig->fullExclCnt;
00279     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00280     n += 1;
00281 #else
00282     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00283     int n = mol_list[0] + 1;
00284     for ( int j=1; j<n; ++j ) {
00285       curList.add(mol_list[j] - i);
00286     }
00287 #endif
00288     curList.sort();
00289 
00290     int j;
00291     for ( j=0; j<unique_lists.size(); ++j ) {
00292       if ( n != unique_lists[j][0] ) continue;  // no match
00293       int k;
00294       for ( k=0; k<n; ++k ) {
00295         if ( unique_lists[j][k+3] != curList[k] ) break;
00296       }
00297       if ( k == n ) break;  // found match
00298     }
00299     if ( j == unique_lists.size() ) {  // no match
00300       int32 *list = listArena.getNewArray(n+3);
00301       list[0] = n;
00302       int maxdiff = 0;
00303       maxdiff = -1 * curList[0];
00304       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00305       list[1] = maxdiff;
00306       for ( int k=0; k<n; ++k ) {
00307         list[k+3] = curList[k];
00308       }
00309       unique_lists.add(list);
00310     }
00311     listsByAtom[i] = unique_lists[j];
00312   }
00313   // sort lists by maxdiff
00314   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00315   long int totalbits = 0;
00316   int nlists = unique_lists.size();
00317   for ( int j=0; j<nlists; ++j ) {
00318     int32 *list = unique_lists[j];
00319     int maxdiff = list[1];
00320     list[2] = totalbits + maxdiff;
00321     totalbits += 2*maxdiff + 1;
00322   }
00323   for ( int i=0; i<natoms; ++i ) {
00324     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00325     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00326   }
00327   delete [] listsByAtom;
00328 
00329   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00330 
00331   {
00332     long int bytesneeded = totalbits / 8;
00333     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00334     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00335                 unique_lists.size(), bytesneeded);
00336     }
00337 
00338     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00339     if ( bytesneeded > bytesavail ) {
00340       char errmsg[512];
00341       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
00342                      "but only %ld bytes can be addressed with 32-bit int.",
00343                      unique_lists.size(), bytesneeded, bytesavail);
00344       NAMD_die(errmsg);
00345     }
00346   }
00347 
00348 #define SET_EXCL(EXCL,BASE,DIFF) \
00349          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00350 
00351   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00352   memset(exclusion_bits, 0, totalbits/8);
00353 
00354   long int base = 0;
00355   for ( int i=0; i<unique_lists.size(); ++i ) {
00356     base += unique_lists[i][1];
00357     if ( unique_lists[i][2] != (int32)base ) {
00358       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00359     }
00360     int n = unique_lists[i][0];
00361     for ( int j=0; j<n; ++j ) {
00362       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00363     }
00364     base += unique_lists[i][1] + 1;
00365   }
00366 
00367   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00368 
00369   delete [] exclusion_bits;
00370 }
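
The SET_EXCL macro above packs one bit per (base, diff) pair into a flat array of 32-bit words, with each atom's exclusion list occupying the window of offsets around its stored base. A hedged illustration of the matching read side (the helper name is hypothetical and not part of the NAMD source):

    // Test whether the pair offset 'diff' relative to 'base' was marked excluded
    // by SET_EXCL; uses the same word/bit addressing as the macro above.
    inline bool is_excluded(const unsigned int *exclusion_bits, int base, int diff) {
      int bit = base + diff;
      return (exclusion_bits[bit >> 5] >> (bit & 31)) & 1u;
    }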

void ComputeNonbondedCUDA::build_force_table (  )  [static]

Definition at line 110 of file ComputeNonbondedCUDA.C.

References cuda_bind_force_table(), ComputeNonbondedUtil::cutoff, diffa, ComputeNonbondedUtil::fast_table, FORCE_TABLE_SIZE, ComputeNonbondedUtil::r2_delta, ComputeNonbondedUtil::r2_delta_exp, ComputeNonbondedUtil::r2_table, ComputeNonbondedUtil::scor_table, table_i, ComputeNonbondedUtil::vdwa_table, and ComputeNonbondedUtil::vdwb_table.

Referenced by build_cuda_force_table().

00110                                              {  // static
00111 
00112   float4 t[FORCE_TABLE_SIZE];
00113   float4 et[FORCE_TABLE_SIZE];  // energy table
00114 
00115   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00116   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00117   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00118   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00119 
00120   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00121   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00122     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00123     r2list[i] = r*r + r2_delta;
00124   }
00125 
00126   union { double f; int32 i[2]; } byte_order_test;
00127   byte_order_test.f = 1.0;  // should occupy high-order bits only
00128   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00129 
00130   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00131     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00132     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00133 
00134     if ( r > cutoff ) {
00135       t[i].x = 0.;
00136       t[i].y = 0.;
00137       t[i].z = 0.;
00138       t[i].w = 0.;
00139       et[i].x = 0.;
00140       et[i].y = 0.;
00141       et[i].z = 0.;
00142       et[i].w = 0.;
00143       continue;
00144     }
00145 
00146     BigReal diffa = r2list[i] - r2_table[table_i];
00147 
00148     // coulomb 1/r or fast force
00149     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00150     {
00151       BigReal table_a = fast_table[4*table_i];
00152       BigReal table_b = fast_table[4*table_i+1];
00153       BigReal table_c = fast_table[4*table_i+2];
00154       BigReal table_d = fast_table[4*table_i+3];
00155       BigReal grad =
00156                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00157       t[i].x = 2. * grad;
00158       BigReal ener = table_a + diffa *
00159                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00160       et[i].x = ener;
00161     }
00162 
00163 
00164     // pme correction for slow force
00165     // t[i].w = 0.;
00166     {
00167       BigReal table_a = scor_table[4*table_i];
00168       BigReal table_b = scor_table[4*table_i+1];
00169       BigReal table_c = scor_table[4*table_i+2];
00170       BigReal table_d = scor_table[4*table_i+3];
00171       BigReal grad =
00172                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00173       t[i].w = 2. * grad;
00174       BigReal ener = table_a + diffa *
00175                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00176       et[i].w = ener;
00177     }
00178 
00179 
00180     // vdw 1/r^6
00181     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00182     {
00183       BigReal table_a = vdwb_table[4*table_i];
00184       BigReal table_b = vdwb_table[4*table_i+1];
00185       BigReal table_c = vdwb_table[4*table_i+2];
00186       BigReal table_d = vdwb_table[4*table_i+3];
00187       BigReal grad =
00188                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00189       t[i].y = 2. * -1. * grad;
00190       BigReal ener = table_a + diffa *
00191                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00192       et[i].y = -1. * ener;
00193     }
00194 
00195 
00196     // vdw 1/r^12
00197     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00198     {
00199       BigReal table_a = vdwa_table[4*table_i];
00200       BigReal table_b = vdwa_table[4*table_i+1];
00201       BigReal table_c = vdwa_table[4*table_i+2];
00202       BigReal table_d = vdwa_table[4*table_i+3];
00203       BigReal grad =
00204                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00205       t[i].z = 2. * grad;
00206       BigReal ener = table_a + diffa *
00207                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00208       et[i].z = ener;
00209     }
00210 
00211     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00212     //   t[i].x, t[i].y, t[i].z, t[i].w);
00213 
00214 /*
00215     double r2 = r * r;
00216     double r4 = r2 * r2;
00217     double r8 = r4 * r4;
00218 
00219     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00220     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00221     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00222     t[i].w = 0.;
00223 */
00224   }
00225 
00226   t[0].x = 0.f;
00227   t[0].y = 0.f;
00228   t[0].z = 0.f;
00229   t[0].w = 0.f;
00230   et[0].x = et[1].x;
00231   et[0].y = et[1].y;
00232   et[0].z = et[1].z;
00233   et[0].w = et[1].w;
00234 
00235   cuda_bind_force_table(t,et);
00236 
00237   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00238     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00239   }
00240 }
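
Each table slot read in the loop above holds cubic-spline coefficients (a, b, c, d) in r-squared; the energy is the polynomial evaluated at diffa = r2list[i] - r2_table[table_i], and the tabulated force factor is twice its derivative, exactly as in the grad/ener expressions. A small sketch of that evaluation (struct and function names are hypothetical):

    struct SplineEntry { double a, b, c, d; };  // one 4-coefficient table slot

    double spline_energy(const SplineEntry &e, double diffa) {
      return e.a + diffa * ((e.d * diffa + e.c) * diffa + e.b);
    }

    double spline_force_factor(const SplineEntry &e, double diffa) {
      double grad = (3. * e.d * diffa + 2. * e.c) * diffa + e.b;
      return 2. * grad;   // matches the t[i].x / t[i].w / t[i].z assignments above
    }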

void ComputeNonbondedCUDA::build_lj_table (  )  [static]

Definition at line 83 of file ComputeNonbondedCUDA.C.

References LJTable::TableEntry::A, LJTable::TableEntry::B, cuda_bind_lj_table(), LJTable::get_table_dim(), j, ComputeNonbondedUtil::ljTable, NAMD_bug(), ComputeNonbondedUtil::scaling, and LJTable::table_val().

Referenced by build_cuda_force_table().

00083                                           {  // static
00084 
00085   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00086   const int dim = ljTable->get_table_dim();
00087 
00088   // round dim up to odd multiple of 16
00089   int tsize = (((dim+16+31)/32)*32)-16;
00090   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00091 
00092   float2 *t = new float2[tsize*tsize];
00093   float2 *row = t;
00094   for ( int i=0; i<dim; ++i, row += tsize ) {
00095     for ( int j=0; j<dim; ++j ) {
00096       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00097       row[j].x = e->A * scaling;
00098       row[j].y = e->B * scaling;
00099     }
00100   }
00101 
00102   cuda_bind_lj_table(t,tsize);
00103   delete [] t;
00104 
00105   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00106     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00107   }
00108 }
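
The tsize expression above pads the table dimension up to the smallest odd multiple of 16 that covers it (16, 48, 80, ...). A quick illustrative check of that formula (the helper name is an assumption, not from NAMD):

    #include <cassert>

    // Reproduces the tsize expression above: smallest odd multiple of 16 >= dim.
    int lj_table_stride(int dim) {
      int tsize = (((dim + 16 + 31) / 32) * 32) - 16;
      assert(tsize >= dim && (tsize / 16) % 2 == 1);
      return tsize;
    }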

void ComputeNonbondedCUDA::doWork ( void   )  [virtual]

Reimplemented from Compute.

Definition at line 1172 of file ComputeNonbondedCUDA.C.

References Lattice::a(), activePatches, atom_params, atom_params_size, atoms, atoms_size, atomsChanged, Lattice::b(), Compute::basePriority, ResizeArray< Elem >::begin(), block_order, block_order_size, ComputeNonbondedCUDA::patch_record::bornRad, bornRadH, bornRadH_size, Lattice::c(), PatchMap::center(), CompAtom::charge, COMPUTE_PROXY_PRIORITY, computeRecords, computesChanged, COULOMB, cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_errcheck(), cudaCheck, cudaCompute, ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::cutoff2, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, ComputeNonbondedCUDA::patch_record::dHdrPrefix, dHdrPrefixH, dHdrPrefixH_size, ComputeNonbondedUtil::dielectric_1, doSlow, dummy_dev, dummy_size, ResizeArray< Elem >::end(), energy_gbis, energy_gbis_size, exclusionsByAtom, finishWork(), ComputeNonbondedUtil::fixedAtomsOn, Patch::flags, force_ready_queue, force_ready_queue_len, force_ready_queue_size, forces, forces_size, SimParameters::GBISOn, GBISP, Compute::gbisPhase, Patch::getCudaAtomList(), DeviceCUDA::getGpuIsMine(), DeviceCUDA::getNoStreaming(), Patch::getNumAtoms(), hostedPatches, CompAtomExt::id, ComputeNonbondedCUDA::patch_record::intRad, intRad0H, intRad0H_size, intRadSH, intRadSH_size, SubmitReduction::item(), j, kernel_time, lata, latb, latc, Flags::lattice, lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, master, Flags::maxAtomMovement, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), num_atoms, num_local_atoms, num_remote_atoms, num_virials, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patch_record::p, pairlistsValid, Flags::pairlistTolerance, pairlistTolerance, Node::parameters, patch_pair_num_ptr, patch_pairs, patch_pairs_ptr, ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchPairsReordered, patchRecords, ComputeNonbondedCUDA::compute_record::pid, plcutoff2, CompAtom::position, PROXY_DATA_PRIORITY, PROXY_RESULTS_PRIORITY, psiSumH, psiSumH_size, recvYieldDevice(), reduction, REDUCTION_PAIRLIST_WARNINGS, remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::resize(), Flags::savePairlists, savePairlists, ComputeNonbondedUtil::scaling, Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, slow_forces_size, slow_virials, CompAtomExt::sortOrder, ResizeArray< Elem >::swap(), Lattice::unscale(), Flags::usePairlists, usePairlists, vdw_types, vdw_types_size, CompAtom::vdwType, virials, virials_size, WARPSIZE, workStarted, CudaAtom::x, ComputeNonbondedCUDA::patch_record::x, Vector::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

01172                                   {
01173 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
01174 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
01175 
01176   // Set GPU device ID
01177   cudaCheck(cudaSetDevice(deviceID));
01178 
01179   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
01180   ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
01181 
01182   if ( workStarted ) { //if work already started, check if finished
01183     if ( finishWork() ) {  // finished
01184       workStarted = 0;
01185       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01186     } else {  // need to call again
01187       workStarted = 2;
01188       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01189       if ( master == this && kernel_launch_state > 2 ) {
01190         cuda_check_local_progress(this,0.);  // launches polling
01191       }
01192     }
01193     return;
01194   }
01195 
01196   workStarted = 1;
01197   basePriority = COMPUTE_PROXY_PRIORITY;
01198 
01199   Molecule *mol = Node::Object()->molecule;
01200   Parameters *params = Node::Object()->parameters;
01201   SimParameters *simParams = Node::Object()->simParameters;
01202 
01203   //execute only during GBIS phase 1, or if not using GBIS
01204   if (!simParams->GBISOn || gbisPhase == 1) {
01205 
01206     //bind new patches to GPU
01207     if ( atomsChanged || computesChanged ) {
01208       int npatches = activePatches.size();
01209 
01210       pairlistsValid = 0;
01211       pairlistTolerance = 0.;
01212 
01213       if ( computesChanged ) {
01214         computesChanged = 0;
01215 
01216         if ( ! dummy_size ) {
01217           dummy_size = 1024*1024;
01218           cudaMalloc((void**)&dummy_dev,dummy_size);
01219           cuda_errcheck("in cudaMalloc dummy_dev");
01220         }
01221 
01222         bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
01223           1.2f, cudaHostAllocMapped);
01224         if (did_realloc) {
01225           for (int k=0; k < force_ready_queue_size; ++k)
01226             force_ready_queue[k] = -1;
01227         }
01228         force_ready_queue_len = npatches;
01229         reallocate_host<int>(&block_order, &block_order_size,
01230           2*(localComputeRecords.size()+remoteComputeRecords.size()),
01231           1.2f, cudaHostAllocMapped);
01232     
01233         num_virials = npatches;
01234         reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
01235           1.2f, cudaHostAllocMapped);
01236         slow_virials = virials + 16*num_virials;
01237 
01238         reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
01239           1.2f, cudaHostAllocMapped);
01240         for (int i  = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
01241 
01242         int *ap = activePatches.begin();
01243         for ( int i=0; i<localActivePatches.size(); ++i ) {
01244           *(ap++) = localActivePatches[i];
01245         }
01246         for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01247           *(ap++) = remoteActivePatches[i];
01248         }
01249 
01250         // sort computes by distance between patches
01251         cr_sortop_distance so(lattice);
01252         std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01253         std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01254 
01255         const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01256 
01257         if ( streaming ) {
01258           // iout << "Reverse-priority sorting...\n" << endi;
01259           cr_sortop_reverse_priority sorp(so,patchRecords.begin());
01260           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
01261           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
01262           patchPairsReordered = 0;
01263           //patchPairsReordered = 1;
01264       // int len = remoteComputeRecords.size();
01265       // for ( int i=0; i<len; ++i ) {
01266       //   iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
01267       // }
01268       // int len2 = localComputeRecords.size();
01269       // for ( int i=0; i<len2; ++i ) {
01270       //   iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
01271       // }
01272       // iout << endi;
01273         } else {
01274           patchPairsReordered = 1;
01275         }
01276  
01277         int nlc = localComputeRecords.size();
01278         int nrc = remoteComputeRecords.size();
01279         computeRecords.resize(nlc+nrc);
01280         compute_record *cr = computeRecords.begin();
01281         for ( int i=0; i<nrc; ++i ) {
01282           *(cr++) = remoteComputeRecords[i];
01283         }
01284         for ( int i=0; i<nlc; ++i ) {
01285           *(cr++) = localComputeRecords[i];
01286         }
01287 
01288         // patch_pair_num[i] = number of patch pairs that involve patch i
01289         patch_pair_num.resize(npatches);
01290         for ( int i=0; i<npatches; ++i ) {
01291           patchRecords[activePatches[i]].localIndex = i;
01292           patch_pair_num[i] = 0;
01293         }
01294 
01295         int ncomputes = computeRecords.size();
01296         patch_pairs.resize(ncomputes);
01297         for ( int i=0; i<ncomputes; ++i ) {
01298           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01299           int lp1 = patchRecords[cr.pid[0]].localIndex;
01300           int lp2 = patchRecords[cr.pid[1]].localIndex;
01301           patch_pair_num[lp1]++;
01302           if (lp1 != lp2) patch_pair_num[lp2]++;
01303           patch_pair &pp = patch_pairs[i];
01304           pp.offset.x = cr.offset.x;
01305           pp.offset.y = cr.offset.y;
01306           pp.offset.z = cr.offset.z;
01307         }
01308 
01309         for ( int i=0; i<ncomputes; ++i ) {
01310           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01311           int lp1 = patchRecords[cr.pid[0]].localIndex;
01312           int lp2 = patchRecords[cr.pid[1]].localIndex;
01313           patch_pair &pp = patch_pairs[i];
01314           pp.patch1_ind = lp1;
01315           pp.patch2_ind = lp2;
01316           pp.patch1_num_pairs = patch_pair_num[lp1];
01317           pp.patch2_num_pairs = patch_pair_num[lp2];
01318         }
01319 
01320       if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
01321         CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01322                  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01323                  localComputeRecords.size(), remoteComputeRecords.size());
01324       }
01325     }  // computesChanged
01326     else if ( ! patchPairsReordered ) {
01327       patchPairsReordered = 1;
01328       int len = patch_pairs.size();
01329       int nlc = localComputeRecords.size();
01330       int nrc = remoteComputeRecords.size();
01331       if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
01332       ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
01333       ResizeArray<patch_pair> new_patch_pairs(len);
01334       int irc=nrc;
01335       int ilc=len;
01336       for ( int i=0; i<len; ++i ) {
01337         int boi = block_order[i];
01338         int dest;
01339         if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
01340         new_computeRecords[dest] = computeRecords[boi];
01341         new_patch_pairs[dest] = patch_pairs[boi];
01342       }
01343       if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
01344       computeRecords.swap(new_computeRecords);
01345       patch_pairs.swap(new_patch_pairs);
01346     }
01347 
01348     int istart = 0;
01349     int i;
01350     for ( i=0; i<npatches; ++i ) {
01351       if ( i == localActivePatches.size() ) {
01352         num_local_atoms = istart;
01353       }
01354       patch_record &pr = patchRecords[activePatches[i]];
01355       pr.localStart = istart;
01356       int natoms = pr.p->getNumAtoms();
01357       int nfreeatoms = natoms;
01358       if ( fixedAtomsOn ) {
01359         const CompAtomExt *aExt = pr.xExt;
01360         for ( int j=0; j<natoms; ++j ) {
01361           if ( aExt[j].atomFixed ) --nfreeatoms;
01362         }
01363       }
01364       pr.numAtoms = natoms;
01365       pr.numFreeAtoms = nfreeatoms;
01366       istart += natoms;
01367       istart += 16 - (natoms & 15);
01368     }
01369     if ( i == localActivePatches.size() ) {
01370       num_local_atoms = istart;
01371     }
01372     num_atoms = istart;
01373     num_remote_atoms = num_atoms - num_local_atoms;
01374     reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
01375     reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
01376     reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
01377     reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01378     reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01379     if (simParams->GBISOn) {
01380       reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
01381       reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
01382       reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01383       reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
01384       reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01385       reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
01386     }
01387 
01388     int bfstart = 0;
01389     int exclmask_start = 0;
01390     int ncomputes = computeRecords.size();
01391     for ( int i=0; i<ncomputes; ++i ) {
01392       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01393       int p1 = cr.pid[0];
01394       int p2 = cr.pid[1];
01395       patch_pair &pp = patch_pairs[i];
01396       pp.patch1_start = patchRecords[p1].localStart;
01397       pp.patch1_size  = patchRecords[p1].numAtoms;
01398       pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
01399       pp.patch2_start = patchRecords[p2].localStart;
01400       pp.patch2_size  = patchRecords[p2].numAtoms;
01401       pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
01402       pp.plist_start = bfstart;
01403       // size1*size2 = number of patch pairs
01404       int size1 = (pp.patch1_size-1)/WARPSIZE+1;
01405       int size2 = (pp.patch2_size-1)/WARPSIZE+1;
01406       pp.plist_size = (size1*size2-1)/32+1;
01407       bfstart += pp.plist_size;
01408       pp.exclmask_start = exclmask_start;
01409       exclmask_start += size1*size2;
01410     } //for ncomputes
01411 
01412     cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01413       activePatches.size(), num_atoms, bfstart, 
01414       exclmask_start);
01415 
01416   }  // atomsChanged || computesChanged
01417 
01418   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01419 
01420   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01421   float maxAtomMovement = 0.;
01422   float maxPatchTolerance = 0.;
01423 
01424   for ( int i=0; i<activePatches.size(); ++i ) {
01425     patch_record &pr = patchRecords[activePatches[i]];
01426 
01427     float maxMove = pr.p->flags.maxAtomMovement;
01428     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01429 
01430     float maxTol = pr.p->flags.pairlistTolerance;
01431     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01432 
01433     int start = pr.localStart;
01434     int n = pr.numAtoms;
01435     const CompAtom *a = pr.x;
01436     const CompAtomExt *aExt = pr.xExt;
01437     if ( atomsChanged ) {
01438 
01439       atom_param *ap = atom_params + start;
01440       for ( int k=0; k<n; ++k ) {
01441         int j = aExt[k].sortOrder;
01442         ap[k].vdw_type = a[j].vdwType;
01443         vdw_types[start + k] = a[j].vdwType;
01444         ap[k].index = aExt[j].id;
01445 #ifdef MEM_OPT_VERSION
01446         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01447         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01448 #else // ! MEM_OPT_VERSION
01449         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01450         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01451 #endif // MEM_OPT_VERSION
01452       }
01453     }
01454     {
01455 #if 1
01456       const CudaAtom *ac = pr.p->getCudaAtomList();
01457       CudaAtom *ap = atoms + start;
01458       memcpy(ap, ac, sizeof(atom)*n);
01459 #else
01460       Vector center =
01461         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01462       atom *ap = atoms + start;
01463       for ( int k=0; k<n; ++k ) {
01464         int j = aExt[k].sortOrder;
01465         ap[k].position.x = a[j].position.x - center.x;
01466         ap[k].position.y = a[j].position.y - center.y;
01467         ap[k].position.z = a[j].position.z - center.z;
01468         ap[k].charge = charge_scaling * a[j].charge;
01469       }
01470 #endif
01471     }
01472   }
01473 
01474   savePairlists = 0;
01475   usePairlists = 0;
01476   if ( flags.savePairlists ) {
01477     savePairlists = 1;
01478     usePairlists = 1;
01479   } else if ( flags.usePairlists ) {
01480     if ( ! pairlistsValid ||
01481          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01482       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01483     } else {
01484       usePairlists = 1;
01485     }
01486   }
01487   if ( ! usePairlists ) {
01488     pairlistsValid = 0;
01489   }
01490   float plcutoff = cutoff;
01491   if ( savePairlists ) {
01492     pairlistsValid = 1;
01493     pairlistTolerance = 2. * maxPatchTolerance;
01494     plcutoff += pairlistTolerance;
01495   }
01496   plcutoff2 = plcutoff * plcutoff;
01497 
01498   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01499   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01500 
01501 #if 0
01502   // calculate warp divergence
01503   if ( 1 ) {
01504     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01505     Lattice &lattice = flags.lattice;
01506     float3 lata, latb, latc;
01507     lata.x = lattice.a().x;
01508     lata.y = lattice.a().y;
01509     lata.z = lattice.a().z;
01510     latb.x = lattice.b().x;
01511     latb.y = lattice.b().y;
01512     latb.z = lattice.b().z;
01513     latc.x = lattice.c().x;
01514     latc.y = lattice.c().y;
01515     latc.z = lattice.c().z;
01516 
01517     int ncomputes = computeRecords.size();
01518     for ( int ic=0; ic<ncomputes; ++ic ) {
01519       patch_pair &pp = patch_pairs[ic];
01520       atom *a1 = atoms + pp.patch1_atom_start;
01521       int n1 = pp.patch1_size;
01522       atom *a2 = atoms + pp.patch2_atom_start;
01523       int n2 = pp.patch2_size;
01524       float offx = pp.offset.x * lata.x
01525                + pp.offset.y * latb.x
01526                + pp.offset.z * latc.x;
01527       float offy = pp.offset.x * lata.y
01528                + pp.offset.y * latb.y
01529                + pp.offset.z * latc.y;
01530       float offz = pp.offset.x * lata.z
01531                + pp.offset.y * latb.z
01532                + pp.offset.z * latc.z;
01533       // CkPrintf("%f %f %f\n", offx, offy, offz);
01534       int atoms_tried = 0;
01535       int blocks_tried = 0;
01536       int atoms_used = 0;
01537       int blocks_used = 0;
01538       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01539         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01540           int block_used = 0;
01541           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01542             int atom_used = 0;
01543             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01544               float dx = offx + a1[i].position.x - a2[j].position.x;
01545               float dy = offy + a1[i].position.y - a2[j].position.y;
01546               float dz = offz + a1[i].position.z - a2[j].position.z;
01547               float r2 = dx*dx + dy*dy + dz*dz;
01548               if ( r2 < cutoff2 ) atom_used = 1;
01549             }
01550             ++atoms_tried;
01551             if ( atom_used ) { block_used = 1; ++atoms_used; }
01552           }
01553           ++blocks_tried;
01554           if ( block_used ) { ++blocks_used; }
01555         }
01556       }
01557       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01558                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01559                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01560     }
01561   }
01562 #endif
01563 
01564   } // !GBISOn || gbisPhase == 1
01565 
01566   //Do GBIS
01567   if (simParams->GBISOn) {
01568     //open GBIS boxes depending on phase
01569     for ( int i=0; i<activePatches.size(); ++i ) {
01570       patch_record &pr = master->patchRecords[activePatches[i]];
01571       GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01572       if (gbisPhase == 1) {
01573         //Copy GBIS intRadius to Host
01574         if (atomsChanged) {
01575           float *intRad0 = intRad0H + pr.localStart;
01576           float *intRadS = intRadSH + pr.localStart;
01577           for ( int k=0; k<pr.numAtoms; ++k ) {
01578             int j = pr.xExt[k].sortOrder;
01579             intRad0[k] = pr.intRad[2*j+0];
01580             intRadS[k] = pr.intRad[2*j+1];
01581           }
01582         }
01583       } else if (gbisPhase == 2) {
01584         float *bornRad = bornRadH + pr.localStart;
01585         for ( int k=0; k<pr.numAtoms; ++k ) {
01586           int j = pr.xExt[k].sortOrder;
01587           bornRad[k] = pr.bornRad[j];
01588         }
01589       } else if (gbisPhase == 3) {
01590         float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01591         for ( int k=0; k<pr.numAtoms; ++k ) {
01592           int j = pr.xExt[k].sortOrder;
01593           dHdrPrefix[k] = pr.dHdrPrefix[j];
01594         }
01595       } // end phases
01596     } // end for patches
01597   } // if GBISOn
01598 
01599   kernel_time = CkWallTimer();
01600   kernel_launch_state = 1;
01601   //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
01602   if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
01603 }
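
The pairlist flags set near the end of doWork() encode a simple reuse rule: a saved list remains valid only while no atom has moved more than half the tolerance banked when the list was built; otherwise a pairlist warning is counted and the full cutoff search is repeated. A condensed restatement (illustrative only; the struct and function are not part of NAMD):

    struct PairlistDecision { bool save, use; };

    PairlistDecision decide_pairlists(bool flagSave, bool flagUse,
                                      bool pairlistsValid,
                                      float maxAtomMovement,
                                      float pairlistTolerance) {
      PairlistDecision d = { false, false };
      if ( flagSave ) {
        d.save = true;            // rebuild and keep the new list
        d.use  = true;
      } else if ( flagUse && pairlistsValid &&
                  2.f * maxAtomMovement <= pairlistTolerance ) {
        d.use = true;             // reuse the previously saved list
      }
      return d;
    }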

void ComputeNonbondedCUDA::finishPatch ( int  flindex  )  [virtual]

Reimplemented from Compute.

Definition at line 1864 of file ComputeNonbondedCUDA.C.

References activePatches, Box< Owner, Data >::close(), ComputeNonbondedCUDA::patch_record::forceBox, master, Box< Owner, Data >::open(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, and ComputeNonbondedCUDA::patch_record::x.

Referenced by finishWork().

01864                                                   {
01865   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
01866   patch_record &pr = master->patchRecords[master->activePatches[flindex]];
01867   pr.r = pr.forceBox->open();
01868   finishPatch(pr);
01869   pr.positionBox->close(&(pr.x));
01870   pr.forceBox->close(&(pr.r));
01871 }

void ComputeNonbondedCUDA::finishReductions (  ) 

Definition at line 2028 of file ComputeNonbondedCUDA.C.

References ADD_TENSOR_OBJECT, Compute::basePriority, block_order, computeRecords, cuda_errcheck(), cuda_timer_count, cuda_timer_total, doEnergy, doSlow, end_local_download, end_remote_download, endi(), energy_gbis, SimParameters::GBISOn, iout, SubmitReduction::item(), localComputeRecords, num_virials, Node::Object(), SimParameters::outputCudaTiming, patchPairsReordered, patchRecords, PROXY_DATA_PRIORITY, reduction, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, remoteComputeRecords, Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_virials, start_calc, step, SubmitReduction::submit(), virials, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by finishWork().

02028                                             {
02029   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
02030 
02031   basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
02032 
02033   SimParameters *simParams = Node::Object()->simParameters;
02034 
02035   if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
02036     if ( patchPairsReordered++ == 2) {
02037       int patch_len = patchRecords.size();
02038       ResizeArray<int> plast(patch_len);
02039       for ( int i=0; i<patch_len; ++i ) {
02040         plast[i] = -1;
02041       }
02042       int order_len = localComputeRecords.size()+remoteComputeRecords.size();
02043       for ( int i=0; i<order_len; ++i ) {
02044         plast[computeRecords[block_order[i]].pid[0]] = i;
02045         iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
02046       }
02047       iout << endi;
02048       for ( int i=0; i<patch_len; ++i ) {
02049         iout << "patch_last " << i << " " << plast[i] << "\n";
02050       }
02051       iout << endi;
02052     }
02053   }
02054 
02055   {
02056     Tensor virial_tensor;
02057     BigReal energyv = 0.;
02058     BigReal energye = 0.;
02059     BigReal energys = 0.;
02060     int nexcluded = 0;
02061     for ( int i = 0; i < num_virials; ++i ) {
02062       virial_tensor.xx += virials[16*i];
02063       virial_tensor.xy += virials[16*i+1];
02064       virial_tensor.xz += virials[16*i+2];
02065       virial_tensor.yx += virials[16*i+3];
02066       virial_tensor.yy += virials[16*i+4];
02067       virial_tensor.yz += virials[16*i+5];
02068       virial_tensor.zx += virials[16*i+6];
02069       virial_tensor.zy += virials[16*i+7];
02070       virial_tensor.zz += virials[16*i+8];
02071       energyv += virials[16*i+9];
02072       energye += virials[16*i+10];
02073       energys += virials[16*i+11];
02074       nexcluded += ((int *)virials)[16*i+12];
02075       if (simParams->GBISOn) {
02076         energye += energy_gbis[i];
02077       }
02078     }
02079     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += nexcluded;
02080     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02081     if ( doEnergy ) {
02082       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02083       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02084       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02085     }
02086   }
02087   if ( doSlow ) {
02088     Tensor virial_slow_tensor;
02089     for ( int i = 0; i < num_virials; ++i ) {
02090       virial_slow_tensor.xx += slow_virials[16*i];
02091       virial_slow_tensor.xy += slow_virials[16*i+1];
02092       virial_slow_tensor.xz += slow_virials[16*i+2];
02093       virial_slow_tensor.yx += slow_virials[16*i+3];
02094       virial_slow_tensor.yy += slow_virials[16*i+4];
02095       virial_slow_tensor.yz += slow_virials[16*i+5];
02096       virial_slow_tensor.zx += slow_virials[16*i+6];
02097       virial_slow_tensor.zy += slow_virials[16*i+7];
02098       virial_slow_tensor.zz += slow_virials[16*i+8];
02099     }
02100     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02101   }
02102 
02103   reduction->submit();
02104 
02105   cuda_timer_count++;
02106   if ( simParams->outputCudaTiming &&
02107         step % simParams->outputCudaTiming == 0 ) {
02108 
02109     // int natoms = mol->numAtoms; 
02110     // double wpa = wcount;  wpa /= natoms;
02111 
02112     // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02113     //  kernel_time * 1.e3, time * 1.e3, wpa);
02114 
02115 #if 0
02116     float upload_ms, remote_calc_ms;
02117     float local_calc_ms, total_ms;
02118     cuda_errcheck("before event timers");
02119     cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02120     cuda_errcheck("in event timer 1");
02121     cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02122     cuda_errcheck("in event timer 2");
02123     cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02124     cuda_errcheck("in event timer 3");
02125     cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02126     cuda_errcheck("in event timer 4");
02127     cuda_errcheck("in event timers");
02128 
02129     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02130              CkMyPe(), upload_ms, remote_calc_ms,
02131              local_calc_ms, total_ms);
02132 #endif
02133 
02134     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02135       cuda_timer_total /= cuda_timer_count;
02136       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02137                step, cuda_timer_total * 1.e3, CkMyPe());
02138     }
02139     cuda_timer_count = 0;
02140     cuda_timer_total = 0;
02141   }
02142 
02143 }
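
finishReductions() reads per-patch results from a fixed 16-float stride: nine virial tensor components, then the vdW, electrostatic, and slow energies, then an exclusion counter stored as an int in the same slot. A layout sketch matching those indices (the struct is an assumption for illustration, not a NAMD type):

    struct VirialSlot {                           // one 16-float record per patch
      float xx, xy, xz, yx, yy, yz, zx, zy, zz;   // virials[16*i + 0..8]
      float energy_vdw;                           // +9
      float energy_elect;                         // +10
      float energy_slow;                          // +11
      int   num_excluded;                         // +12 (read via an int cast)
      float pad[3];                               // +13..15
    };
    static_assert(sizeof(VirialSlot) == 16 * sizeof(float),
                  "slot must keep the 16-float stride");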

int ComputeNonbondedCUDA::finishWork (  ) 

Definition at line 1902 of file ComputeNonbondedCUDA.C.

References atomsChanged, ComputeNonbondedCUDA::patch_record::bornRad, ComputeNonbondedCUDA::patch_record::bornRadBox, Box< Owner, Data >::close(), computeMgr, cuda_timer_total, ComputeNonbondedCUDA::patch_record::dEdaSum, ComputeNonbondedCUDA::patch_record::dEdaSumBox, dEdaSumH, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, doSlow, finishPatch(), finishReductions(), ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::intRadBox, SubmitReduction::item(), j, kernel_time, localHostedPatches, ComputeNonbondedCUDA::patch_record::localStart, master, ComputeNonbondedUtil::mol, Node::molecule, ComputeNonbondedCUDA::patch_record::numAtoms, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::patchID, patchRecords, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, Compute::priority(), ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::psiSumBox, psiSumH, ComputeNonbondedCUDA::patch_record::r, reduction, remoteHostedPatches, ComputeMgr::sendNonbondedCUDASlaveEnqueue(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slavePes, slaves, CompAtomExt::sortOrder, workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

Referenced by doWork().

01902                                      {
01903   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
01904 
01905   if ( master == this ) {
01906     for ( int i = 0; i < numSlaves; ++i ) {
01907       computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01908     }
01909   }
01910 
01911 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01912 CkMyPe(), workStarted, gbisPhase)
01913 
01914   Molecule *mol = Node::Object()->molecule;
01915   SimParameters *simParams = Node::Object()->simParameters;
01916 
01917   ResizeArray<int> &patches( workStarted == 1 ?
01918                                 remoteHostedPatches : localHostedPatches );
01919 
01920   // long long int wcount = 0;
01921   // double virial = 0.;
01922   // double virial_slow = 0.;
01923   for ( int i=0; i<patches.size(); ++i ) {
01924     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
01925     patch_record &pr = master->patchRecords[patches[i]];
01926 
01927     if ( !simParams->GBISOn || gbisPhase == 1 ) {
01928       patch_record &pr = master->patchRecords[patches[i]];
01929 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01930       pr.r = pr.forceBox->open();
01931     } // !GBISOn || gbisPhase==1
01932 
01933     int start = pr.localStart;
01934     const CompAtomExt *aExt = pr.xExt;
01935     if ( !simParams->GBISOn || gbisPhase == 3 ) {
01936       finishPatch(pr);
01937     } // !GBISOn || gbisPhase == 3
01938 
01939 #if 0
01940     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01941       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01942         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01943         af[j].w);
01944     }
01945 #endif
01946 
01947     //Close Boxes depending on Phase
01948     if (simParams->GBISOn) {
01949       if (gbisPhase == 1) {
01950         //Copy psiSum from Host to Patch Box
01951         GBReal *psiSumMaster = master->psiSumH + start;
01952         for ( int k=0; k<pr.numAtoms; ++k ) {
01953           int j = aExt[k].sortOrder;
01954           pr.psiSum[j] += psiSumMaster[k];
01955         }
01956 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01957         pr.psiSumBox->close(&(pr.psiSum));
01958 
01959       } else if (gbisPhase == 2) {
01960         //Copy dEdaSum from Host to Patch Box
01961         GBReal *dEdaSumMaster = master->dEdaSumH + start;
01962         for ( int k=0; k<pr.numAtoms; ++k ) {
01963           int j = aExt[k].sortOrder;
01964           pr.dEdaSum[j] += dEdaSumMaster[k];
01965         }
01966 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01967         pr.dEdaSumBox->close(&(pr.dEdaSum));
01968 
01969       } else if (gbisPhase == 3) {
01970 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01971         pr.intRadBox->close(&(pr.intRad)); //box 6
01972         pr.bornRadBox->close(&(pr.bornRad)); //box 7
01973         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01974         pr.positionBox->close(&(pr.x)); //box 0
01975         pr.forceBox->close(&(pr.r));
01976       } //end phases
01977     } else { //not GBIS
01978 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
01979       pr.positionBox->close(&(pr.x));
01980       pr.forceBox->close(&(pr.r));
01981     }
01982   }// end for
01983 
01984 #if 0
01985   virial *= (-1./6.);
01986   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01987   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01988   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01989   if ( doSlow ) {
01990     virial_slow *= (-1./6.);
01991     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01992     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01993     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01994   }
01995 #endif
01996 
01997   if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
01998        ( localHostedPatches.size() || master == this ) ) {
01999     GBISP("not finished, call again\n");
02000     return 0;  // not finished, call again
02001   }
02002 
02003   if ( master != this ) {  // finished
02004     GBISP("finished\n");
02005     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02006     atomsChanged = 0;
02007     return 1;
02008   }
02009 
02010   cuda_timer_total += kernel_time;
02011 
02012   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02013 
02014     atomsChanged = 0;
02015     finishReductions();
02016 
02017   } // !GBISOn || gbisPhase==3  
02018 
02019   // Next GBIS Phase
02020 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02021     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02022 
02023   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02024   return 1;  // finished and ready for next step
02025 }
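
finishWork() is called once after the remote pass (workStarted == 1) and again after the local pass; returning 0 asks for another call, returning 1 marks the compute finished for the step. A hedged sketch of that return convention, with a hypothetical driver loop standing in for the Charm++ scheduling that ComputeMgr actually performs:

// Hedged sketch of the two-pass finish convention; the driver loop below is
// hypothetical, the real re-enqueueing is done by Charm++ and ComputeMgr.
#include <cstdio>

struct TwoPassCompute {
  int workStarted = 1;      // 1 = remote results pending, 2 = local results pending
  int finishWork() {
    if (workStarted == 1) {
      printf("remote pass finished, local pass still pending\n");
      workStarted = 2;
      return 0;             // not finished, call again
    }
    printf("local pass finished\n");
    return 1;               // finished, ready for the next step
  }
};

int main() {
  TwoPassCompute c;
  while (c.finishWork() == 0) { /* re-enqueue and call again */ }
  return 0;
}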

void ComputeNonbondedCUDA::messageFinishPatch ( int  flindex  ) 

Definition at line 1857 of file ComputeNonbondedCUDA.C.

References activePatches, computeMgr, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::msg, patchRecords, PROXY_DATA_PRIORITY, ComputeMgr::sendNonbondedCUDASlaveEnqueuePatch(), Compute::sequence(), and ComputeNonbondedCUDA::patch_record::slave.

01857                                                          {
01858   int pid = activePatches[flindex];
01859   patch_record &pr = patchRecords[pid];
01860   //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
01861   computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
01862 }

int ComputeNonbondedCUDA::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 1108 of file ComputeNonbondedCUDA.C.

References atomsChanged, ComputeNonbondedCUDA::patch_record::bornRad, ComputeNonbondedCUDA::patch_record::bornRadBox, computeMgr, ComputeNonbondedCUDA::patch_record::dEdaSum, ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, Flags::doEnergy, doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, doSlow, SimParameters::GBISOn, GBISP, Compute::gbisPhase, Patch::getCompAtomExtInfo(), hostedPatches, ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::intRadBox, Flags::lattice, lattice, master, masterPe, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::p, ComputeNonbondedCUDA::patch_record::patchID, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::psiSumBox, reduction, ComputeMgr::sendNonbondedCUDASlaveReady(), ComputeMgr::sendNonbondedCUDASlaveSkip(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), skip(), slavePes, slaves, Flags::step, step, SubmitReduction::submit(), workStarted, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

01108                                  {
01109 
01110   SimParameters *simParams = Node::Object()->simParameters;
01111   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01112   lattice = flags.lattice;
01113   doSlow = flags.doFullElectrostatics;
01114   doEnergy = flags.doEnergy;
01115   step = flags.step;
01116 
01117   if ( ! flags.doNonbonded ) {
01118     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01119     if ( master != this ) {
01120       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01121                         hostedPatches.size(),atomsChanged,sequence());
01122     } else {
01123       for ( int i = 0; i < numSlaves; ++i ) {
01124         computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
01125       }
01126       skip();
01127     }
01128     if ( reduction ) {
01129        reduction->submit();
01130     }
01131 
01132     return 1;
01133   }
01134 
01135   for ( int i=0; i<hostedPatches.size(); ++i ) {
01136     patch_record &pr = master->patchRecords[hostedPatches[i]];
01137     if (!simParams->GBISOn || gbisPhase == 1) {
01138       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01139       pr.x = pr.positionBox->open();
01140       pr.xExt = pr.p->getCompAtomExtInfo();
01141     }
01142 
01143     if (simParams->GBISOn) {
01144       if (gbisPhase == 1) {
01145         GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01146         pr.intRad     = pr.intRadBox->open();
01147         pr.psiSum     = pr.psiSumBox->open();
01148       } else if (gbisPhase == 2) {
01149         GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01150         pr.bornRad    = pr.bornRadBox->open();
01151         pr.dEdaSum    = pr.dEdaSumBox->open();
01152       } else if (gbisPhase == 3) {
01153         GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01154         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01155       }
01156       GBISP("opened GBIS boxes");
01157     }
01158   }
01159 
01160   if ( master == this ) return 0; //work to do, enqueue as usual
01161 
01162   // message masterPe
01163   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01164                         hostedPatches.size(),atomsChanged,sequence());
01165   atomsChanged = 0;
01166 
01167   workStarted = 1;
01168 
01169   return 1;
01170 }
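
noWork() returns 1 when nothing needs to be enqueued on this processor (either the step does no nonbonded work, or a slave has only signalled readiness to its master) and 0 when doWork() should be queued as usual. A simplified, hypothetical sketch of that convention, ignoring GBIS phases and the master/slave messaging:

// Simplified, hypothetical sketch of the noWork()/doWork() convention;
// the real code also opens GBIS boxes and handles master/slave messaging.
#include <cstdio>

struct StepFlags { bool doNonbonded; };

struct MiniCompute {
  // Return 1 when nothing should be enqueued this step, 0 to enqueue doWork().
  int noWork(const StepFlags &flags) {
    if (!flags.doNonbonded) {
      printf("nonbonded work skipped this step\n");
      return 1;
    }
    return 0;
  }
  void doWork() { printf("nonbonded kernels launched\n"); }
};

int main() {
  MiniCompute c;
  StepFlags flags{true};
  if (c.noWork(flags) == 0) c.doWork();
  return 0;
}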

void ComputeNonbondedCUDA::recvYieldDevice ( int  pe  ) 

Definition at line 1633 of file ComputeNonbondedCUDA.C.

References Lattice::a(), SimParameters::alpha_cutoff, atom_params, atoms, atomsChanged, Lattice::b(), block_order, bornRadH, Lattice::c(), CcdCallBacksReset(), SimParameters::coulomb_radius_offset, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_vdw_types(), cuda_bind_virials(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), CUDA_POLL, SimParameters::cutoff, ComputeNonbondedUtil::cutoff2, dEdaSumH, deviceCUDA, deviceID, dHdrPrefixH, SimParameters::dielectric, doEnergy, doSlow, dummy_dev, dummy_size, end_local_download, end_remote_download, energy_gbis, force_ready_queue, force_ready_queue_next, forces, SimParameters::fsMax, SimParameters::GBISOn, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), intRad0H, intRadSH, SimParameters::kappa, lata, latb, latc, lattice, localActivePatches, localComputeRecords, SimParameters::nonbondedScaling, Node::Object(), patchPairsReordered, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, psiSumH, remote_submit_time, remoteActivePatches, remoteComputeRecords, savePairlists, Compute::sequence(), DeviceCUDA::setGpuIsMine(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, SimParameters::solvent_dielectric, start_calc, stream, stream2, SimParameters::switchingActive, SimParameters::switchingDist, usePairlists, SimParameters::usePMECUDA, vdw_types, virials, workStarted, Vector::x, Vector::y, and Vector::z.

Referenced by doWork(), and ComputeMgr::recvYieldDevice().

01633                                                  {
01634 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01635 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01636 workStarted, gbisPhase, kernel_launch_state, pe)
01637 
01638   float3 lata, latb, latc;
01639   lata.x = lattice.a().x;
01640   lata.y = lattice.a().y;
01641   lata.z = lattice.a().z;
01642   latb.x = lattice.b().x;
01643   latb.y = lattice.b().y;
01644   latb.z = lattice.b().z;
01645   latc.x = lattice.c().x;
01646   latc.y = lattice.c().y;
01647   latc.z = lattice.c().z;
01648   SimParameters *simParams = Node::Object()->simParameters;
01649 
01650   // Set GPU device ID
01651   cudaSetDevice(deviceID);
01652 
01653   const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01654 
01655   double walltime;
01656   if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
01657     walltime = CkWallTimer();
01658     CcdCallBacksReset(0,walltime);  // fix Charm++
01659   }
01660 
01661   switch ( kernel_launch_state ) {
01663 // Remote
01664   case 1:
01665 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01666     ++kernel_launch_state;
01667     //gpu_is_mine = 0;
01668     deviceCUDA->setGpuIsMine(0);
01669     remote_submit_time = walltime;
01670 
01671     if (!simParams->GBISOn || gbisPhase == 1) {
01672       // cudaEventRecord(start_upload, stream);
01673       if ( atomsChanged ) {
01674         cuda_bind_atom_params(atom_params);
01675         cuda_bind_vdw_types(vdw_types);
01676       }
01677       if ( simParams->GBISOn) {
01678         cuda_bind_GBIS_psiSum(psiSumH);
01679         if ( atomsChanged ) {
01680           cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01681         }
01682       }
01683       atomsChanged = 0;
01684       cuda_bind_atoms((const atom *)atoms);
01685       cuda_bind_forces(forces, slow_forces);
01686       cuda_bind_virials(virials, force_ready_queue, block_order);
01687       if ( simParams->GBISOn) {
01688         cuda_bind_GBIS_energy(energy_gbis);
01689       }
01690       if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01691       //call CUDA Kernels
01692 
01693       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01694                             0,remoteComputeRecords.size(),
01695                             remoteComputeRecords.size()+localComputeRecords.size(),
01696                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream);
01697       if (simParams->GBISOn) {
01698         cuda_GBIS_P1(
01699           0,remoteComputeRecords.size(),
01700                       localActivePatches.size(),remoteActivePatches.size(),
01701                       simParams->alpha_cutoff-simParams->fsMax,
01702                       simParams->coulomb_radius_offset,
01703                       lata, latb, latc, stream);
01704       }
01705       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01706       //    num_local_atom_records,num_remote_atom_records);
01707       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01708         cudaEventRecord(end_remote_download, stream);
01709       }
01710       if ( streaming ) {
01711         force_ready_queue_next = 0;
01712         CUDA_POLL(cuda_check_progress,this);
01713       } else {
01714         CUDA_POLL(cuda_check_remote_progress,this);
01715       }
01716       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01717         CUDA_POLL(cuda_check_remote_calc,this);
01718         break;
01719       }
01720     } // !GBIS or gbisPhase==1
01721     if (simParams->GBISOn) {
01722       if (gbisPhase == 1) {
01723         //GBIS P1 Kernel launched in previous code block
01724       } else if (gbisPhase == 2) {
01725 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01726         // cudaEventRecord(start_upload, stream);
01727         cuda_bind_GBIS_bornRad(bornRadH);
01728         cuda_bind_GBIS_dEdaSum(dEdaSumH);
01729         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01730         cuda_GBIS_P2(
01731           0,remoteComputeRecords.size(),
01732           localActivePatches.size(),remoteActivePatches.size(),
01733           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01734           simParams->nonbondedScaling, simParams->kappa,
01735           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01736           simParams->dielectric, simParams->solvent_dielectric,
01737           lata, latb, latc,
01738           doEnergy, doSlow, stream
01739           );
01740         cudaEventRecord(end_remote_download, stream);
01741         CUDA_POLL(cuda_check_remote_progress,this);
01742       } else if (gbisPhase == 3) {
01743 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01744         // cudaEventRecord(start_upload, stream);
01745         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01746         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01747         if (doSlow)
01748         cuda_GBIS_P3(
01749           0,remoteComputeRecords.size(),
01750           localActivePatches.size(),remoteActivePatches.size(),
01751           (simParams->alpha_cutoff-simParams->fsMax),
01752           simParams->coulomb_radius_offset,
01753           simParams->nonbondedScaling,
01754           lata, latb, latc, stream
01755           );
01756         cudaEventRecord(end_remote_download, stream);
01757         CUDA_POLL(cuda_check_remote_progress,this);
01758       }
01759     }
01760 
01762 // Local
01763   case 2:
01764 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01765     ++kernel_launch_state;
01766     //gpu_is_mine = 0;
01767     deviceCUDA->setGpuIsMine(0);
01768 
01769     if ( stream2 != stream ) {
01770       // needed to ensure that upload is finished
01771       cudaStreamWaitEvent(stream2, start_calc, 0);
01772       // priorities do not prevent local from launching ahead
01773       // of remote, so delay local stream with a small memset
01774       cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
01775     }
01776 
01777     if (!simParams->GBISOn || gbisPhase == 1) {
01778 
01779       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01780                             remoteComputeRecords.size(),localComputeRecords.size(),
01781                             remoteComputeRecords.size()+localComputeRecords.size(),
01782                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream2);
01783       if (simParams->GBISOn) {
01784         cuda_GBIS_P1(
01785                      remoteComputeRecords.size(),localComputeRecords.size(),
01786                      0,localActivePatches.size(),
01787                      simParams->alpha_cutoff-simParams->fsMax,
01788                      simParams->coulomb_radius_offset,
01789                      lata, latb, latc, stream2 );
01790       }
01791       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01792       //    0,num_local_atom_records);
01793       //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
01794       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01795         cudaEventRecord(end_local_download, stream2);
01796       }
01797       if ( ! streaming && workStarted == 2 ) {
01798         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01799 cuda_check_local_progress\n", CkMyPe())
01800         CUDA_POLL(cuda_check_local_progress,this);
01801       }
01802       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01803         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01804 cuda_check_local_calc\n", CkMyPe())
01805           CUDA_POLL(cuda_check_local_calc,this);
01806         break;
01807       }
01808 
01809     } // !GBIS or gbisPhase==1
01810     if (simParams->GBISOn) {
01811       if (gbisPhase == 1) {
01812         //GBIS P1 Kernel launched in previous code block
01813       } else if (gbisPhase == 2) {
01814 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01815         cuda_GBIS_P2(
01816           remoteComputeRecords.size(),localComputeRecords.size(),
01817           0,localActivePatches.size(),
01818           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01819           simParams->nonbondedScaling, simParams->kappa,
01820           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01821           simParams->dielectric, simParams->solvent_dielectric,
01822           lata, latb, latc,
01823           doEnergy, doSlow, stream2
01824           );
01825         cudaEventRecord(end_local_download, stream2);
01826         if ( workStarted == 2 ) {
01827           CUDA_POLL(cuda_check_local_progress,this);
01828         }
01829       } else if (gbisPhase == 3) {
01830 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01831         if (doSlow)
01832         cuda_GBIS_P3(
01833           remoteComputeRecords.size(),localComputeRecords.size(),
01834           0,localActivePatches.size(),
01835           (simParams->alpha_cutoff-simParams->fsMax),
01836           simParams->coulomb_radius_offset,
01837           simParams->nonbondedScaling,
01838           lata, latb, latc, stream2
01839           );
01840         cudaEventRecord(end_local_download, stream2);
01841         if ( workStarted == 2 ) {
01842           CUDA_POLL(cuda_check_local_progress,this);
01843         }
01844       } // phases
01845     } // GBISOn
01846     if ( simParams->PMEOn && simParams->PMEOffload  && !simParams->usePMECUDA) break;
01847 
01848   default:
01849 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01850     //gpu_is_mine = 1;
01851     deviceCUDA->setGpuIsMine(1);
01852     break;
01853   } // switch
01854 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01855 }
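
In the local case above, stream2 is ordered behind the remote upload by waiting on the start_calc event and then issuing a small cudaMemsetAsync so the remote kernels keep priority. A stand-alone sketch of that cross-stream ordering trick, with placeholder buffers and a dummy kernel rather than the NAMD kernels:

// Stand-alone sketch of ordering a second stream behind work on a first
// stream via an event plus a small async memset (placeholder buffers and
// kernel, not the NAMD kernels).
#include <cuda_runtime.h>

__global__ void work(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0f;
}

int main() {
  const int n = 1 << 20;
  float *d_x, *d_dummy;
  cudaMalloc(&d_x, n * sizeof(float));
  cudaMalloc(&d_dummy, 4096);

  cudaStream_t remote, local;
  cudaStreamCreate(&remote);
  cudaStreamCreate(&local);

  cudaEvent_t uploaded;
  cudaEventCreate(&uploaded);

  // "Remote" stream: do its work, then record an event.
  work<<<(n + 255) / 256, 256, 0, remote>>>(d_x, n);
  cudaEventRecord(uploaded, remote);

  // "Local" stream: wait on the event, then a tiny memset delays its kernel
  // so the remote stream's work is not overtaken.
  cudaStreamWaitEvent(local, uploaded, 0);
  cudaMemsetAsync(d_dummy, 0, 4096, local);
  work<<<(n + 255) / 256, 256, 0, local>>>(d_x, n);

  cudaDeviceSynchronize();
  cudaEventDestroy(uploaded);
  cudaStreamDestroy(remote);
  cudaStreamDestroy(local);
  cudaFree(d_x);
  cudaFree(d_dummy);
  return 0;
}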

void ComputeNonbondedCUDA::registerPatches (  ) 

Definition at line 611 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), ComputeNonbondedCUDA::patch_record::bornRadBox, ProxyMgr::createProxy(), ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, hostedPatches, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::intRadBox, ComputeNonbondedCUDA::patch_record::isLocal, localHostedPatches, master, masterPe, ComputeNonbondedCUDA::patch_record::msg, ProxyMgr::Object(), Node::Object(), ComputeNonbondedCUDA::patch_record::p, PatchMap::patch(), patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, PRIORITY_SIZE, ComputeNonbondedCUDA::patch_record::psiSumBox, Patch::registerBornRadPickup(), Patch::registerDEdaSumDeposit(), Patch::registerDHdrPrefixPickup(), Patch::registerForceDeposit(), Patch::registerIntRadPickup(), Patch::registerPositionPickup(), Patch::registerPsiSumDeposit(), remoteHostedPatches, Compute::setNumPatches(), Node::simParameters, simParams, ResizeArray< Elem >::size(), and ComputeNonbondedCUDA::patch_record::slave.

Referenced by assignPatches(), and ComputeNonbondedCUDA().

00611                                            {
00612 
00613   SimParameters *simParams = Node::Object()->simParameters;
00614   int npatches = master->activePatches.size();
00615   int *pids = master->activePatches.begin();
00616   patch_record *recs = master->patchRecords.begin();
00617   for ( int i=0; i<npatches; ++i ) {
00618     int pid = pids[i];
00619     patch_record &pr = recs[pid];
00620     if ( pr.hostPe == CkMyPe() ) {
00621       pr.slave = this;
00622       pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
00623       hostedPatches.add(pid);
00624       if ( pr.isLocal ) {
00625         localHostedPatches.add(pid);
00626       } else {
00627         remoteHostedPatches.add(pid);
00628       }
00629       ProxyMgr::Object()->createProxy(pid);
00630       pr.p = patchMap->patch(pid);
00631       pr.positionBox = pr.p->registerPositionPickup(this);
00632       pr.forceBox = pr.p->registerForceDeposit(this);
00633       if (simParams->GBISOn) {
00634         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00635         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00636         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00637         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00638         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00639       }
00640     }
00641   }
00642   if ( master == this ) setNumPatches(activePatches.size());
00643   else setNumPatches(hostedPatches.size());
00644   if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
00645   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00646 }

void ComputeNonbondedCUDA::requirePatch ( int  pid  ) 

Definition at line 575 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), ComputeNonbondedCUDA::patch_record::bornRad, computesChanged, ComputeNonbondedCUDA::patch_record::dEdaSum, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::f, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::index_a(), PatchMap::index_b(), PatchMap::index_c(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::isLocal, ComputeNonbondedCUDA::patch_record::isSameNode, ComputeNonbondedCUDA::patch_record::isSamePhysicalNode, localActivePatches, PatchMap::node(), ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::r, ComputeNonbondedCUDA::patch_record::refCount, remoteActivePatches, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

Referenced by register_cuda_compute_pair(), and register_cuda_compute_self().

00575                                                {
00576 
00577   computesChanged = 1;
00578   patch_record &pr = patchRecords[pid];
00579   if ( pr.refCount == 0 ) {
00580     pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00581     pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00582     if ( deviceCUDA->getMergeGrids() ) {
00583       pr.isLocal = 0;
00584     } else if ( CkNumNodes() < 2 ) {
00585       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00586          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00587     } else {
00588       pr.isLocal = pr.isSameNode;
00589     }
00590     if ( pr.isLocal ) {
00591       localActivePatches.add(pid);
00592     } else {
00593       remoteActivePatches.add(pid);
00594     }
00595     activePatches.add(pid);
00596     pr.patchID = pid;
00597     pr.hostPe = -1;
00598     pr.x = NULL;
00599     pr.xExt = NULL;
00600     pr.r = NULL;
00601     pr.f = NULL;
00602     pr.intRad      = NULL;
00603     pr.psiSum      = NULL;
00604     pr.bornRad     = NULL;
00605     pr.dEdaSum     = NULL;
00606     pr.dHdrPrefix  = NULL;
00607   }
00608   pr.refCount += 1;
00609 }
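
On a single node, requirePatch() splits patches between the local and remote streams by the parity of their grid indices, giving a checkerboard pattern that roughly balances the two kernel launches. A small sketch of that parity test with made-up index values:

// Sketch of the parity ("checkerboard") test used above to split patches
// between the local and remote streams on a single node (made-up indices).
#include <cstdio>

int isLocalByParity(int ia, int ib, int ic) {
  // Even index-sum parity maps to "local", odd parity to "remote".
  return 1 & (1 ^ ia ^ ib ^ ic);
}

int main() {
  for (int ia = 0; ia < 2; ++ia)
    for (int ib = 0; ib < 2; ++ib)
      for (int ic = 0; ic < 2; ++ic)
        printf("patch (%d,%d,%d) -> %s\n", ia, ib, ic,
               isLocalByParity(ia, ib, ic) ? "local" : "remote");
  return 0;
}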

void ComputeNonbondedCUDA::skip ( void   ) 

Definition at line 1091 of file ComputeNonbondedCUDA.C.

References ComputeNonbondedCUDA::patch_record::bornRadBox, ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, hostedPatches, ComputeNonbondedCUDA::patch_record::intRadBox, master, Node::Object(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::psiSumBox, Node::simParameters, simParams, ResizeArray< Elem >::size(), and Box< Owner, Data >::skip().

Referenced by noWork(), and ComputeMgr::recvNonbondedCUDASlaveSkip().

01091                                 {
01092   //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
01093   SimParameters *simParams = Node::Object()->simParameters;
01094   for ( int i=0; i<hostedPatches.size(); ++i ) {
01095     patch_record &pr = master->patchRecords[hostedPatches[i]];
01096     pr.positionBox->skip();
01097     pr.forceBox->skip();
01098     if (simParams->GBISOn) {
01099       pr.intRadBox->skip();
01100       pr.psiSumBox->skip();
01101       pr.bornRadBox->skip();
01102       pr.dEdaSumBox->skip();
01103       pr.dHdrPrefixBox->skip();
01104     }
01105   }
01106 }
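
skip(), noWork(), and finishWork() together follow the usual Box protocol: each step a compute either opens a box it registered with the patch and later closes it, or calls skip() when the step does no work. A toy, self-contained sketch of that open/close/skip contract (a simplified stand-in, not the NAMD Box<Owner,Data> template):

// Toy sketch of the open/close/skip box contract (simplified stand-in,
// not the NAMD Box<Owner,Data> template).
#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

template <typename Data>
class ToyBox {
  std::vector<Data> payload;
  bool opened = false;
public:
  explicit ToyBox(std::vector<Data> d) : payload(std::move(d)) {}
  Data *open() { opened = true; return payload.data(); }   // borrow the data
  void close(Data **ptr) { assert(opened); *ptr = nullptr; opened = false; }
  void skip() { assert(!opened); }                         // step with no work
};

int main() {
  ToyBox<float> positionBox(std::vector<float>(16, 0.0f));

  // Normal step: open, use, close.
  float *x = positionBox.open();
  x[0] = 1.0f;
  positionBox.close(&x);

  // Skipped step: no open/close, just acknowledge the box.
  positionBox.skip();
  printf("box protocol exercised\n");
  return 0;
}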


Member Data Documentation

Definition at line 119 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

Definition at line 142 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

Definition at line 141 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 98 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), and finishReductions().

Definition at line 132 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and requirePatch().

Definition at line 111 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

Definition at line 110 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 116 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by finishReductions(), noWork(), and recvYieldDevice().

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishReductions(), finishWork(), noWork(), and recvYieldDevice().

Definition at line 102 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

Definition at line 101 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), registerPatches(), and skip().

Definition at line 79 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), and recvYieldDevice().

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

Definition at line 125 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), noWork(), and registerPatches().

Definition at line 129 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), finishWork(), and noWork().

Definition at line 135 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 136 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 139 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

Definition at line 108 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

Definition at line 107 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), and registerPatches().

Definition at line 138 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

Definition at line 126 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

Definition at line 128 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

Definition at line 127 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

Definition at line 105 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

Definition at line 104 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

Definition at line 81 of file ComputeNonbondedCUDA.h.

Referenced by finishReductions(), and noWork().

Definition at line 137 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().


The documentation for this class was generated from the following files:

ComputeNonbondedCUDA.h
ComputeNonbondedCUDA.C

Generated on 11 Nov 2019 for NAMD by  doxygen 1.6.1