CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance: CudaComputeNonbonded derives from Compute and ComputeNonbondedUtil.


Classes

struct  ComputeRecord
struct  PatchRecord

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbonded ()
void registerComputeSelf (ComputeID cid, PatchID pid)
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
void assignPatches (ComputeMgr *computeMgrIn)
virtual void initialize ()
virtual void atomUpdate ()
virtual int noWork ()
virtual void doWork ()
void launchWork ()
void finishReductions ()
void unregisterBoxesOnPe ()
void assignPatchesOnPe ()
void openBoxesOnPe ()
void skipPatchesOnPe ()
void finishPatchesOnPe ()
void finishPatchOnPe (int i)
void messageEnqueueWork ()
virtual void patchReady (PatchID, int doneMigration, int seq)
virtual void gbisP2PatchReady (PatchID, int seq)
virtual void gbisP3PatchReady (PatchID, int seq)

Detailed Description

Definition at line 21 of file CudaComputeNonbonded.h.
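
CudaComputeNonbonded implements GPU-offloaded nonbonded force evaluation for all patches assigned to one CUDA device. The member functions below imply the following lifecycle; the sketch is illustrative only, and the surrounding variable names (cid, pids, trans, tables, computeMgr) are assumptions, not NAMD source:

  // Registration and setup, normally driven by ComputeMgr::createComputes():
  CudaComputeNonbonded *cudaNbond =
      new CudaComputeNonbonded(cid, deviceID, tables, /*doStreaming=*/true);
  cudaNbond->registerComputeSelf(selfCid, pid);          // patch interacting with itself
  cudaNbond->registerComputePair(pairCid, pids, trans);  // patch pair plus image transforms
  cudaNbond->assignPatches(computeMgr);  // distribute patches over PEs sharing the device
  cudaNbond->initialize();               // buffers, stream, event, reduction handle
  // Per timestep: noWork() forwards control to masterPe, doWork() opens boxes on
  // all PEs, launchWork() runs the kernels, finishReductions() submits results.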


Constructor & Destructor Documentation

CudaComputeNonbonded::CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables & cudaNonbondedTables, bool doStreaming)

Definition at line 36 of file CudaComputeNonbonded.C.

References cudaCheck, SimParameters::GBISOn, Compute::gbisPhase, NAMD_die(), Node::Object(), SimParameters::pressureProfileOn, Node::simParameters, and simParams.

00037                                                               :
00038 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
00039 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
00040 
00041   cudaCheck(cudaSetDevice(deviceID));
00042 
00043   exclusionsByAtom = NULL;
00044 
00045   vdwTypes = NULL;
00046   vdwTypesSize = 0;
00047 
00048   exclIndexMaxDiff = NULL;
00049   exclIndexMaxDiffSize = 0;
00050 
00051   atomIndex = NULL;
00052   atomIndexSize = 0;
00053 
00054   atomStorageSize = 0;
00055 
00056   // Atom and charge storage
00057   atoms = NULL;
00058   atomsSize = 0;
00059 
00060   // Force storage
00061   h_forces = NULL;
00062   h_forcesSize = 0;
00063   h_forcesSlow = NULL;
00064   h_forcesSlowSize = 0;
00065 
00066   d_forces = NULL;
00067   d_forcesSize = 0;
00068   d_forcesSlow = NULL;
00069   d_forcesSlowSize = 0;
00070 
00071   // GBIS
00072   intRad0H = NULL;
00073   intRad0HSize = 0;
00074   intRadSH = NULL;
00075   intRadSHSize = 0;
00076   psiSumH = NULL;
00077   psiSumHSize = 0;
00078   bornRadH = NULL;
00079   bornRadHSize = 0;
00080   dEdaSumH = NULL;
00081   dEdaSumHSize = 0;
00082   dHdrPrefixH = NULL;
00083   dHdrPrefixHSize = 0;
00084   maxShmemPerBlock = 0;
00085   cudaPatches = NULL;
00086 
00087   atomsChangedIn = true;
00088   atomsChanged = true;
00089   computesChanged = true;
00090 
00091   forceDoneEventRecord = false;
00092 
00093   SimParameters *simParams = Node::Object()->simParameters;
00094   if (simParams->pressureProfileOn) {
00095     NAMD_die("CudaComputeNonbonded, pressure profile not supported");
00096   }
00097 
00098   if (simParams->GBISOn) gbisPhase = 3;
00099 
00100   doSkip = false;
00101 }

CudaComputeNonbonded::~CudaComputeNonbonded (  ) 

Definition at line 106 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

00106                                             {
00107   cudaCheck(cudaSetDevice(deviceID));
00108   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
00109   if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
00110   if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
00111   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00112   if (h_forces != NULL) deallocate_host<float4>(&h_forces);
00113   if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
00114   if (d_forces != NULL) deallocate_device<float4>(&d_forces);
00115   if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
00116 
00117   // GBIS
00118   if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
00119   if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
00120   if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
00121   if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
00122   if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
00123   if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
00124 
00125   if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
00126 
00127   if (patches.size() > 0) {
00128     deallocate_host<VirialEnergy>(&h_virialEnergy);
00129     deallocate_device<VirialEnergy>(&d_virialEnergy);
00130     cudaCheck(cudaStreamDestroy(stream));
00131     cudaCheck(cudaEventDestroy(forceDoneEvent));
00132     CmiDestroyLock(lock);
00133     delete reduction;
00134   }
00135 
00136   // NOTE: unregistering happens in the [sync] entry method
00137   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00138 
00139 }


Member Function Documentation

void CudaComputeNonbonded::assignPatches (ComputeMgr * computeMgrIn)

Definition at line 363 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), j, NAMD_bug(), ComputePmeCUDAMgr::Object(), PatchMap::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), Compute::setNumPatches(), and sort.

Referenced by ComputeMgr::createComputes().

00363                                                                  {
00364   // Remove duplicate patches
00365   std::sort(patches.begin(), patches.end());
00366   std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
00367   patches.erase(last, patches.end());
00368   // Set number of patches
00369   setNumPatches(patches.size());
00370   masterPe = CkMyPe();
00371   computeMgr = computeMgrIn;
00372   // Start patch counter
00373   patchesCounter = getNumPatches();
00374   // Patch ID map
00375   std::map<PatchID, int> pidMap;
00376 #if 1
00377   //-------------------------------------------------------
00378   // Copied in from ComputeNonbondedCUDA::assignPatches()
00379   //-------------------------------------------------------
00380 
00381   std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
00382   int numPesOnNodeSharingDevice = 0;
00383   int masterIndex = -1;
00384   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00385     int pe = deviceCUDA->getPesSharingDevice(i);
00386     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00387     if ( CkNodeOf(pe) == CkMyNode() ) {
00388       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00389     }
00390   }
00391 
00392   std::vector<int> count(patches.size(), 0);
00393   std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
00394   std::vector<int> rankpcount(CkMyNodeSize(), 0);
00395   std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
00396 
00397   PatchMap* patchMap = PatchMap::Object();
00398 
00399   int unassignedpatches = patches.size();
00400 
00401   for (int i=0;i < patches.size(); ++i) {
00402     patches[i].pe = -1;
00403   }
00404 
00405   // assign if home pe and build table of natural proxies
00406   for (int i=0;i < patches.size(); ++i) {
00407     int pid = patches[i].patchID;
00408     // homePe = PE where the patch currently resides
00409     int homePe = patchMap->node(pid);
00410     for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
00411       int pe = pesOnNodeSharingDevice[j];
00412       // If homePe is sharing this device, assign this patch to homePe
00413       if ( pe == homePe ) {
00414         patches[i].pe = pe;
00415         --unassignedpatches;
00416         pcount[j] += 1;
00417       }
00418       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00419         table[i*numPesOnNodeSharingDevice + j] = 1;
00420       }
00421     }
00422     // Assign this patch to homePe, if it resides on the same node
00423     if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00424       patches[i].pe = homePe;
00425       --unassignedpatches;
00426       rankpcount[CkRankOf(homePe)] += 1;
00427     }
00428   }
00429   // assign if only one pe has a required proxy
00430   for (int i=0; i < patches.size(); ++i) {
00431     int pid = patches[i].patchID;
00432     if ( patches[i].pe != -1 ) continue;
00433     int c = 0;
00434     int lastj;
00435     for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
00436       if ( table[i*numPesOnNodeSharingDevice + j] ) {
00437         ++c;
00438         lastj = j;
00439       }
00440     }
00441     count[i] = c;
00442     if ( c == 1 ) {
00443       patches[i].pe = pesOnNodeSharingDevice[lastj];
00444       --unassignedpatches;
00445       pcount[lastj] += 1;
00446     }
00447   }
00448   int assignj = 0;
00449   while ( unassignedpatches ) {
00450     int i;
00451     for (i=0;i < patches.size(); ++i) {
00452       if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
00453       int pid = patches[i].patchID;
00454       // patch_record &pr = patchRecords[pid];
00455       if ( patches[i].pe != -1 ) continue;
00456       patches[i].pe = pesOnNodeSharingDevice[assignj];
00457       --unassignedpatches;
00458       pcount[assignj] += 1;
00459       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00460       break;
00461     }
00462     if (i < patches.size() ) continue;  // start search again
00463     for ( i=0;i < patches.size(); ++i ) {
00464       int pid = patches[i].patchID;
00465       // patch_record &pr = patchRecords[pid];
00466       if ( patches[i].pe != -1 ) continue;
00467       if ( count[i] ) continue;
00468       patches[i].pe = pesOnNodeSharingDevice[assignj];
00469       --unassignedpatches;
00470       pcount[assignj] += 1;
00471       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00472       break;
00473     }
00474     if ( i < patches.size() ) continue;  // start search again
00475     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00476   }
00477 
00478   // For each rank, list of patches
00479   rankPatches.resize(CkMyNodeSize());
00480   for (int i=0; i < patches.size(); ++i) {
00481     rankPatches[CkRankOf(patches[i].pe)].push_back(i);
00482     pidMap[patches[i].patchID] = i;
00483   }
00484 
00485   // for ( int i=0; i < patches.size(); ++i ) {
00486   //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
00487   // }
00488 
00489 /*
00490   slavePes = new int[CkMyNodeSize()];
00491   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00492   numSlaves = 0;
00493   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00494     int pe = pesOnNodeSharingDevice[j];
00495     int rank = pe - CkNodeFirst(CkMyNode());
00496     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00497     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00498     if ( pe == CkMyPe() ) continue;
00499     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00500     rankpcount[rank] = 0;  // skip in rank loop below
00501     slavePes[numSlaves] = pe;
00502     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00503     ++numSlaves;
00504   }
00505   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00506     int pe = CkNodeFirst(CkMyNode()) + j;
00507     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00508     //          CkMyPe(),j,pe,rankpcount[j]);
00509     if ( ! rankpcount[j] ) continue;
00510     if ( pe == CkMyPe() ) continue;
00511     slavePes[numSlaves] = pe;
00512     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00513     ++numSlaves;
00514   }
00515 */
00516 
00517 #else
00518   // For each rank, list of patches
00519   rankPatches.resize(CkMyNodeSize());
00520   // For each rank, list of home patch IDs
00521   PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
00522   for (int i=0;i < CkMyNodeSize();i++) {
00523     int pe = CkNodeFirst(CkMyNode()) + i;
00524     PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
00525   }
00526   std::vector<int> proxyPatchPes;
00527   std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
00528   //--------------------------------------------------------
00529   // Build a list of PEs to avoid
00530   std::vector<int> pesToAvoid;
00531 #if 0
00532   // Avoid other GPUs' master PEs
00533   for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
00534     int pe = deviceCUDA->getMasterPeForDeviceID(i);
00535     if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
00536   }
00537   // Avoid PEs that are involved in PME
00538   ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
00539   for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
00540     if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
00541   }
00542   // Set counters of avoidable PEs to high numbers
00543   for (int i=0;i < pesToAvoid.size();i++) {
00544     int pe = pesToAvoid[i];
00545     peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
00546   }
00547 #endif
00548   // Avoid master Pe somewhat
00549   peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
00550   //--------------------------------------------------------
00551   for (int i=0;i < patches.size();i++) {
00552     PatchID pid = patches[i].patchID;
00553     int pe = findHomePatchPe(rankHomePatchIDs, pid);
00554     if (pe == -1) {
00555       // Patch not present on this node => try finding a ProxyPatch
00556       findProxyPatchPes(proxyPatchPes, pid);
00557       if (proxyPatchPes.size() == 0) {
00558         // No ProxyPatch => create one on rank that has the least ProxyPatches
00559         int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00560         pe = CkNodeFirst(CkMyNode()) + rank;
00561         peProxyPatchCounter[rank]++;
00562       } else {
00563         // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
00564         // this is done by finding the entry with the minimum peProxyPatchCounter value.
00565         // Find the minimum among proxyPatchPes, i.e., the minimum among
00566         // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
00567         // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
00568         //   [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
00569         //   - proxyPatchPes.begin();
00570         // pe = proxyPatchPes[pppi];
00571         int minCounter = (1 << 30);
00572         for (int j=0;j < proxyPatchPes.size();j++) {
00573           if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
00574             pe = proxyPatchPes[j];
00575             minCounter = peProxyPatchCounter[CkRankOf(pe)];
00576           }
00577         }
00578         if (pe == -1)
00579           NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
00580         peProxyPatchCounter[CkRankOf(pe)]++;
00581       }
00582     } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
00583       // Found home patch on this node, but it's on PE that should be avoided => find a new one
00584       int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00585       pe = CkNodeFirst(CkMyNode()) + rank;
00586       peProxyPatchCounter[rank]++;
00587     }
00588     if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
00589       NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
00590     rankPatches[CkRankOf(pe)].push_back(i);
00591     pidMap[pid] = i;
00592   }
00593 
00594   delete [] rankHomePatchIDs;
00595 #endif
00596   // Setup computes using pidMap
00597   for (int i=0;i < computes.size();i++) {
00598     computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
00599     computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
00600   }
00601   for (int i=0;i < CkMyNodeSize();i++) {
00602     if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
00603   }
00604   computeMgr->sendAssignPatchesOnPe(pes, this);
00605 }

void CudaComputeNonbonded::assignPatchesOnPe (  ) 

Definition at line 301 of file CudaComputeNonbonded.C.

References ResizeArray< Elem >::add(), j, NAMD_bug(), PatchMap::node(), PatchMap::Object(), ResizeArray< Elem >::size(), and sort.

Referenced by ComputeMgr::recvAssignPatchesOnPe().

00301                                              {
00302   if (rankPatches[CkMyRank()].size() == 0)
00303     NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
00304 
00305   // calculate priority rank of local home patch within pe
00306   {
00307     PatchMap* patchMap = PatchMap::Object();
00308     ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
00309     for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
00310       int i = rankPatches[CkMyRank()][k];
00311       int pid = patches[i].patchID;
00312       int homePe = patchMap->node(pid);
00313       if ( CkNodeOf(homePe) == CkMyNode() ) {
00314         int2 pid_index;
00315         pid_index.x = pid;
00316         pid_index.y = i;
00317         homePatchByRank[CkRankOf(homePe)].add(pid_index);
00318       }
00319     }
00320     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00321       pid_sortop_reverse_priority so;
00322       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00323       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00324       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00325         int index = homePatchByRank[i][j].y;
00326         patches[index].reversePriorityRankInPe = j + masterBoost;
00327       }
00328     }
00329   }
00330 
00331   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00332     assignPatch(rankPatches[CkMyRank()][i]);
00333   }
00334 }

void CudaComputeNonbonded::atomUpdate ( void   )  [virtual]

Reimplemented from Compute.

Definition at line 644 of file CudaComputeNonbonded.C.

00644                                       {
00645   atomsChangedIn = true;
00646 }

void CudaComputeNonbonded::doWork ( void   )  [virtual]

Reimplemented from Compute.

Definition at line 920 of file CudaComputeNonbonded.C.

References Flags::doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, Flags::doVirial, SimParameters::GBISOn, Compute::gbisPhase, NAMD_bug(), Node::Object(), ComputeMgr::sendOpenBoxesOnPe(), and Node::simParameters.

00920                                   {
00921   if (CkMyPe() != masterPe)
00922     NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
00923 
00924   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00925   // atomsChangedIn can be set to true by any Pe
00926   // atomsChanged can only be set by masterPe
00927   // This use of two variables makes sure we don't have a race condition
00928   atomsChanged = atomsChangedIn;
00929   atomsChangedIn = false;
00930 
00931   SimParameters *simParams = Node::Object()->simParameters;
00932 
00933   if (patches.size() == 0) return;  // No work to do
00934 
00935   // Take the flags from the first patch on this Pe
00936   // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
00937   Flags &flags = patches[0].patch->flags;
00938 
00939   doSlow = flags.doFullElectrostatics;
00940   doEnergy = flags.doEnergy;
00941   doVirial = flags.doVirial;
00942 
00943   if (flags.doNonbonded) {
00944 
00945     if (simParams->GBISOn) {
00946       gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
00947     }
00948 
00949     if (!simParams->GBISOn || gbisPhase == 1) {
00950       if ( computesChanged ) {
00951         updateComputes();
00952       }
00953       if (atomsChanged) {
00954         // Re-calculate patch atom numbers and storage
00955         updatePatches();
00956         reSortDone = false;
00957       }
00958       reallocateArrays();
00959     }
00960 
00961     // Open boxes on Pes and launch work to masterPe
00962     computeMgr->sendOpenBoxesOnPe(pes, this);
00963 
00964   } else {
00965     // No work to do, skip
00966     skip();
00967   }
00968 
00969 }
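
The atomsChangedIn / atomsChanged handoff at the top of doWork() avoids a race by separating writer and reader roles. A minimal standalone sketch of the pattern (illustrative, not NAMD source):

  // Any PE may raise the "in" flag via atomUpdate(); only masterPe latches and
  // clears it, so the flag that masterPe reads is never written concurrently.
  struct AtomUpdateFlag {
    bool changedIn = false;  // written by any PE
    bool changed   = false;  // read and written only by masterPe
    void latchOnMaster() { changed = changedIn; changedIn = false; }
  };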

void CudaComputeNonbonded::finishPatchesOnPe (  ) 

Definition at line 1369 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchesOnPe().

01369                                              {
01370   finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
01371 }

void CudaComputeNonbonded::finishPatchOnPe ( int  i  ) 

Definition at line 1376 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

01376                                                 {
01377   std::vector<int> v(1, i);
01378   finishSetOfPatchesOnPe(v);
01379 }

void CudaComputeNonbonded::finishReductions (  ) 

Definition at line 1210 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energyVdw, SimParameters::GBISOn, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), Node::Object(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, Node::simParameters, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

01210                                             {
01211 
01212   if (CkMyPe() != masterPe)
01213     NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
01214 
01215   // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
01216 
01217   if (!doSkip) {
01218 
01219     if (doStreaming && (doVirial || doEnergy)) {
01220       // For streaming kernels, we must wait for virials and forces to be copied back to CPU
01221       if (!forceDoneEventRecord)
01222         NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
01223       cudaCheck(cudaEventSynchronize(forceDoneEvent));
01224       forceDoneEventRecord = false;
01225     }
01226 
01227     if (doVirial) {
01228       Tensor virialTensor;
01229       virialTensor.xx = h_virialEnergy->virial[0];
01230       virialTensor.xy = h_virialEnergy->virial[1];
01231       virialTensor.xz = h_virialEnergy->virial[2];
01232       virialTensor.yx = h_virialEnergy->virial[3];
01233       virialTensor.yy = h_virialEnergy->virial[4];
01234       virialTensor.yz = h_virialEnergy->virial[5];
01235       virialTensor.zx = h_virialEnergy->virial[6];
01236       virialTensor.zy = h_virialEnergy->virial[7];
01237       virialTensor.zz = h_virialEnergy->virial[8];
01238       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01239       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01240       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01241       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
01242       if (doSlow) {
01243         Tensor virialTensor;
01244         virialTensor.xx = h_virialEnergy->virialSlow[0];
01245         virialTensor.xy = h_virialEnergy->virialSlow[1];
01246         virialTensor.xz = h_virialEnergy->virialSlow[2];
01247         virialTensor.yx = h_virialEnergy->virialSlow[3];
01248         virialTensor.yy = h_virialEnergy->virialSlow[4];
01249         virialTensor.yz = h_virialEnergy->virialSlow[5];
01250         virialTensor.zx = h_virialEnergy->virialSlow[6];
01251         virialTensor.zy = h_virialEnergy->virialSlow[7];
01252         virialTensor.zz = h_virialEnergy->virialSlow[8];
01253         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01254         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01255         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01256         ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
01257       }
01258     }
01259     if (doEnergy) {
01260       // if (doSlow)
01261       //   printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
01262       SimParameters *simParams = Node::Object()->simParameters;
01263       reduction->item(REDUCTION_LJ_ENERGY)    += h_virialEnergy->energyVdw;
01264       reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
01265       // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
01266       if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
01267       // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
01268     }
01269 
01270     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
01271   }
01272   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01273   reduction->submit();
01274 
01275   // Reset flags
01276   doSkip = false;
01277   computesChanged = false;
01278 }
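
The nine-element virial arrays above are flattened row-major. A small helper showing the mapping (an illustrative sketch, not part of the class):

  // virial[3*i + j] holds component (i, j) of the 3x3 tensor:
  // indices 0..2 = xx, xy, xz; 3..5 = yx, yy, yz; 6..8 = zx, zy, zz.
  void unpackVirial(const double v[9], double t[3][3]) {
    for (int i = 0; i < 3; ++i)
      for (int j = 0; j < 3; ++j)
        t[i][j] = v[3*i + j];
  }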

void CudaComputeNonbonded::gbisP2PatchReady (PatchID pid, int seq) [virtual]

Reimplemented from Compute.

Definition at line 244 of file CudaComputeNonbonded.C.

00244                                                                 {
00245   CmiLock(lock);
00246   Compute::gbisP2PatchReady(pid, seq);
00247   CmiUnlock(lock);
00248 }

void CudaComputeNonbonded::gbisP3PatchReady (PatchID pid, int seq) [virtual]

Reimplemented from Compute.

Definition at line 250 of file CudaComputeNonbonded.C.

00250                                                                 {
00251   CmiLock(lock);
00252   Compute::gbisP3PatchReady(pid, seq);
00253   CmiUnlock(lock);
00254 }

void CudaComputeNonbonded::initialize ( void   )  [virtual]

Reimplemented from Compute.

Definition at line 607 of file CudaComputeNonbonded.C.

References cudaCheck, ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00607                                       {
00608   if (patches.size() > 0) {
00609     // Allocate CUDA version of patches
00610     cudaCheck(cudaSetDevice(deviceID));
00611     allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
00612 
00613     allocate_host<VirialEnergy>(&h_virialEnergy, 1);
00614     allocate_device<VirialEnergy>(&d_virialEnergy, 1);
00615 
00616     /* JM: Queries the maximum shared memory per block on deviceID
00617      */
00618     cudaDeviceProp props;
00619     cudaCheck(cudaGetDeviceProperties(&props, deviceID)); // get properties of the device deviceID
00620     maxShmemPerBlock = props.sharedMemPerBlock;
00621 
00622 #if CUDA_VERSION >= 5050
00623     int leastPriority, greatestPriority;
00624     cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00625     int priority = (doStreaming) ? leastPriority : greatestPriority;
00626     // int priority = greatestPriority;
00627     cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
00628 #else
00629     cudaCheck(cudaStreamCreate(&stream));
00630 #endif
00631     cudaCheck(cudaEventCreate(&forceDoneEvent));
00632 
00633     buildExclusions();
00634 
00635     lock = CmiCreateLock();
00636 
00637     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00638   }
00639 }

void CudaComputeNonbonded::launchWork (  ) 

Definition at line 971 of file CudaComputeNonbonded.C.

References CudaComputeNonbonded::PatchRecord::atomStart, cudaCheck, ComputeNonbondedUtil::cutoff, SimParameters::GBISOn, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), CudaComputeNonbondedKernel::getPatchReadyQueue(), SubmitReduction::item(), NAMD_bug(), CudaComputeNonbonded::PatchRecord::numAtoms, Node::Object(), CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, Flags::savePairlists, Node::simParameters, and Flags::usePairlists.

Referenced by ComputeMgr::recvLaunchWork().

00971                                       {
00972   if (CkMyPe() != masterPe)
00973     NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
00974 
00975   beforeForceCompute = CkWallTimer();
00976 
00977   cudaCheck(cudaSetDevice(deviceID));
00978   SimParameters *simParams = Node::Object()->simParameters;
00979 
00980   //execute only during GBIS phase 1, or if not using GBIS
00981   if (!simParams->GBISOn || gbisPhase == 1) {
00982 
00983     if ( atomsChanged || computesChanged ) {
00984       // Invalidate pair lists
00985       pairlistsValid = false;
00986       pairlistTolerance = 0.0f;
00987     }
00988 
00989     // Get maximum atom movement and patch tolerance
00990     float maxAtomMovement = 0.0f;
00991     float maxPatchTolerance = 0.0f;
00992     getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
00993     // Update pair-list cutoff
00994     Flags &flags = patches[0].patch->flags;
00995     savePairlists = false;
00996     usePairlists = false;
00997     if ( flags.savePairlists ) {
00998       savePairlists = true;
00999       usePairlists = true;
01000     } else if ( flags.usePairlists ) {
01001       if ( ! pairlistsValid ||
01002            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01003         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01004       } else {
01005         usePairlists = true;
01006       }
01007     }
01008     if ( ! usePairlists ) {
01009       pairlistsValid = false;
01010     }
01011     float plcutoff = cutoff;
01012     if ( savePairlists ) {
01013       pairlistsValid = true;
01014       pairlistTolerance = 2. * maxPatchTolerance;
01015       plcutoff += pairlistTolerance;
01016     }
01017     plcutoff2 = plcutoff * plcutoff;
01018 
01019     // if (atomsChanged)
01020     //   CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01021     //     plcutoff, pairlistTolerance, savePairlists, usePairlists);
01022 
01023   } // if (!simParams->GBISOn || gbisPhase == 1)
01024 
01025   // Calculate PME & VdW forces
01026   if (!simParams->GBISOn || gbisPhase == 1) {
01027     doForce();
01028     if (doStreaming) {
01029       patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
01030       patchReadyQueueLen = tileListKernel.getNumPatches();
01031       patchReadyQueueNext = 0;
01032       // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
01033       int numEmptyPatches = tileListKernel.getNumEmptyPatches();
01034       int* emptyPatches = tileListKernel.getEmptyPatches();
01035       for (int i=0;i < numEmptyPatches;i++) {
01036         PatchRecord &pr = patches[emptyPatches[i]];
01037         memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
01038         if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
01039         patchReadyQueue[i] = emptyPatches[i];
01040       }
01041       if (patchReadyQueueLen != patches.size())
01042         NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
01043     }
01044   }
01045 
01046   // For GBIS phase 1 at pairlist update, we must re-sort tile list
01047   // before calling doGBISphase1().
01048   if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
01049     // In this code path doGBISphase1() is called in forceDone()
01050     forceDoneSetCallback();
01051     return;
01052   }
01053 
01054   // GBIS Phases
01055   if (simParams->GBISOn) {
01056     if (gbisPhase == 1) {
01057       doGBISphase1();
01058     } else if (gbisPhase == 2) {
01059       doGBISphase2();
01060     } else if (gbisPhase == 3) {
01061       doGBISphase3();
01062     }
01063   }
01064 
01065   // Copy forces to host
01066   if (!simParams->GBISOn || gbisPhase == 3) {
01067     if (!doStreaming) {
01068       copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
01069       if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
01070     }
01071   }
01072 
01073   if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
01074     // For GBIS, energies are ready after phase 2
01075     nonbondedKernel.reduceVirialEnergy(tileListKernel,
01076       atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
01077       d_forces, d_forcesSlow, d_virialEnergy, stream);
01078     copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
01079   }
01080 
01081   // Setup call back
01082   forceDoneSetCallback();
01083 }
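
The pairlist block above widens the cutoff by twice the maximum patch tolerance whenever lists are saved, so the lists stay valid until atoms have moved half that margin. A worked sketch of the arithmetic with illustrative numbers (not values from the source):

  float cutoff = 12.0f;                      // illustrative cutoff
  float maxPatchTolerance = 0.7f;            // from getMaxMovementTolerance()
  float pairlistTolerance = 2.0f * maxPatchTolerance;  // 1.4
  float plcutoff  = cutoff + pairlistTolerance;        // 13.4 when savePairlists
  float plcutoff2 = plcutoff * plcutoff;               // kernels compare squared distances
  // Lists are reused while 2 * maxAtomMovement <= pairlistTolerance; otherwise
  // REDUCTION_PAIRLIST_WARNINGS is incremented and the lists are rebuilt.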

void CudaComputeNonbonded::messageEnqueueWork (  ) 

Definition at line 861 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

00861                                               {
00862   if (masterPe != CkMyPe())
00863     NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
00864   WorkDistrib::messageEnqueueWork(this);
00865 }

int CudaComputeNonbonded::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 886 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

00886                                  {
00887   // Simply enqueue doWork on masterPe and return "no work"
00888   computeMgr->sendMessageEnqueueWork(masterPe, this);
00889   return 1;
00890 }

void CudaComputeNonbonded::openBoxesOnPe (  ) 

Definition at line 867 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendLaunchWork().

Referenced by ComputeMgr::recvOpenBoxesOnPe().

00867                                          {
00868   if (rankPatches[CkMyRank()].size() == 0)
00869     NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
00870   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00871     openBox(rankPatches[CkMyRank()][i]);
00872   }
00873   bool done = false;
00874   CmiLock(lock);
00875   patchesCounter -= rankPatches[CkMyRank()].size();
00876   if (patchesCounter == 0) {
00877     patchesCounter = getNumPatches();
00878     done = true;
00879   }
00880   CmiUnlock(lock);
00881   if (done) {
00882     computeMgr->sendLaunchWork(masterPe, this);
00883   }
00884 }
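
openBoxesOnPe() and skipPatchesOnPe() share the same lock-protected countdown; whichever PE brings the counter to zero resets it and fires the continuation exactly once. A minimal sketch of the pattern (illustrative, not NAMD source):

  // Returns true on exactly one PE per round: the one that empties the counter.
  bool countdownAndReset(CmiNodeLock lock, int &counter, int share, int total) {
    bool done = false;
    CmiLock(lock);
    counter -= share;
    if (counter == 0) { counter = total; done = true; }
    CmiUnlock(lock);
    return done;
  }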

void CudaComputeNonbonded::patchReady (PatchID pid, int doneMigration, int seq) [virtual]

Reimplemented from Compute.

Definition at line 232 of file CudaComputeNonbonded.C.

References NAMD_bug().

00232                                                                              {
00233   if (doneMigration) {
00234     int i = findPid(pid);
00235     if (i == -1)
00236       NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
00237     updatePatch(i);
00238   }
00239   CmiLock(lock);
00240   Compute::patchReady(pid, doneMigration, seq);
00241   CmiUnlock(lock);
00242 }

void CudaComputeNonbonded::registerComputePair (ComputeID cid, PatchID * pid, int * trans)

Definition at line 173 of file CudaComputeNonbonded.C.

References PatchMap::center(), PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

00173                                                                                       {
00174   computesChanged = true;
00175   addPatch(pid[0]);
00176   addPatch(pid[1]);
00177   PatchMap* patchMap = PatchMap::Object();
00178   int t1 = trans[0];
00179   int t2 = trans[1];
00180   Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
00181   offset.x += (t1%3-1) - (t2%3-1);
00182   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00183   offset.z += (t1/9-1) - (t2/9-1);
00184   addCompute(cid, pid[0], pid[1], offset);
00185 }
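
The offset arithmetic above decodes each transform index as a base-3 digit triple. A self-contained sketch of the decoding (an illustrative helper, not part of the class):

  // Decode a transform code t in [0, 26] into periodic image shifts in {-1, 0, 1}.
  // Mirrors the t%3, (t/3)%3, t/9 expressions in registerComputePair().
  void decodeTransform(int t, int &ix, int &iy, int &iz) {
    ix =  t % 3      - 1;
    iy = (t / 3) % 3 - 1;
    iz =  t / 9      - 1;
  }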

void CudaComputeNonbonded::registerComputeSelf (ComputeID cid, PatchID pid)

Definition at line 163 of file CudaComputeNonbonded.C.

00163                                                                          {
00164   computesChanged = true;
00165   addPatch(pid);
00166   addCompute(cid, pid, pid, 0.);
00167 }

void CudaComputeNonbonded::skipPatchesOnPe (  ) 

Definition at line 686 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

00686                                            {
00687   if (rankPatches[CkMyRank()].size() == 0)
00688     NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
00689   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00690     skipPatch(rankPatches[CkMyRank()][i]);
00691   }
00692   bool done = false;
00693   CmiLock(lock);
00694   patchesCounter -= rankPatches[CkMyRank()].size();
00695   if (patchesCounter == 0) {
00696     patchesCounter = getNumPatches();
00697     done = true;
00698   }
00699   CmiUnlock(lock);
00700   if (done) {
00701     // Reduction must be done on masterPe
00702     computeMgr->sendFinishReductions(masterPe, this);
00703   }
00704 }

void CudaComputeNonbonded::unregisterBoxesOnPe (  ) 

Definition at line 151 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

00151                                                {
00152   if (rankPatches[CkMyRank()].size() == 0)
00153     NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
00154   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00155     unregisterBox(rankPatches[CkMyRank()][i]);
00156   }
00157 }


The documentation for this class was generated from the following files:

CudaComputeNonbonded.h
CudaComputeNonbonded.C