ComputePmeCUDADevice Class Reference

#include <ComputePmeCUDAMgr.h>

List of all members.

Classes

struct  PencilLocation

Public Member Functions

 ComputePmeCUDADevice ()
 ComputePmeCUDADevice (CkMigrateMessage *m)
 ~ComputePmeCUDADevice ()
void initialize (PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
int getDeviceID ()
cudaStream_t getStream ()
CProxy_ComputePmeCUDAMgr getMgrProxy ()
void setPencilProxy (CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void setPencilProxy (CProxy_CudaPmePencilXY pmePencilXY_in)
void setPencilProxy (CProxy_CudaPmePencilX pmePencilX_in)
void activate_pencils ()
void initializePatches (int numHomePatches_in)
void registerNeighbor ()
void recvAtoms (PmeAtomMsg *msg)
void sendAtomsToNeighbors ()
void sendAtomsToNeighbor (int y, int z, int atomIval)
void recvAtomsFromNeighbor (PmeAtomPencilMsg *msg)
void registerRecvAtomsFromNeighbor ()
void spreadCharge ()
void gatherForce ()
void gatherForceDone ()
void sendForcesToNeighbors ()
void recvForcesFromNeighbor (PmeForcePencilMsg *msg)
void mergeForcesOnPatch (int homePatchIndex)
void sendForcesToPatch (PmeForceMsg *forceMsg)
void gatherForceDoneSubset (int first, int last)

Detailed Description

Definition at line 293 of file ComputePmeCUDAMgr.h.


Constructor & Destructor Documentation

ComputePmeCUDADevice::ComputePmeCUDADevice (  ) 

Definition at line 938 of file ComputePmeCUDAMgr.C.

00938                                            {
00939   // __sdag_init();
00940   numHomePatches = 0;
00941   forceCapacity = 0;
00942   force = NULL;
00943   pmeRealSpaceCompute = NULL;
00944   streamCreated = false;
00945   lock_numHomePatchesMerged = CmiCreateLock();
00946   lock_numPencils = CmiCreateLock();
00947   lock_numNeighborsRecv = CmiCreateLock();
00948   lock_recvAtoms = CmiCreateLock();
00949   numNeighborsExpected = 0;
00950   numStrayAtoms = 0;
00951   // Reset counters
00952   numNeighborsRecv = 0;
00953   numHomePatchesRecv = 0;
00954   numHomePatchesMerged = 0;
00955   atomI = 0;
00956   forceI = 1;
00957 }
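
Both constructors leave the object in the same initial state: counters zeroed, the double-buffer indices set to atomI = 0 and forceI = 1, and four Converse node-level locks created for the counters that several PEs of the node may update concurrently. A minimal sketch of the Converse lock lifecycle this class relies on (illustrative only, not taken from the NAMD sources):

    #include "converse.h"

    // Hypothetical node-level counter guarded the same way as
    // numHomePatchesMerged, numPencils, numNeighborsRecv and recvAtoms.
    struct NodeCounter {
      CmiNodeLock lock;
      int value;
      NodeCounter() : value(0) {
        lock = CmiCreateLock();        // create once, as in the constructor above
      }
      ~NodeCounter() {
        CmiDestroyLock(lock);          // destroy exactly once, as in the destructor
      }
      // Returns true only for the caller that performs the final decrement.
      bool decrementAndTest() {
        bool done = false;
        CmiLock(lock);                 // serialize PEs within the node
        if (--value == 0) done = true;
        CmiUnlock(lock);
        return done;
      }
    };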

ComputePmeCUDADevice::ComputePmeCUDADevice ( CkMigrateMessage *  m  ) 

Definition at line 959 of file ComputePmeCUDAMgr.C.

00959                                                               {
00960   // __sdag_init();
00961   numHomePatches = 0;
00962   forceCapacity = 0;
00963   force = NULL;
00964   pmeRealSpaceCompute = NULL;
00965   streamCreated = false;
00966   lock_numHomePatchesMerged = CmiCreateLock();
00967   lock_numPencils = CmiCreateLock();
00968   lock_numNeighborsRecv = CmiCreateLock();
00969   lock_recvAtoms = CmiCreateLock();
00970   numNeighborsExpected = 0;
00971   numStrayAtoms = 0;
00972   // Reset counters
00973   numNeighborsRecv = 0;
00974   numHomePatchesRecv = 0;
00975   numHomePatchesMerged = 0;
00976   atomI = 0;
00977   forceI = 1;
00978 }

ComputePmeCUDADevice::~ComputePmeCUDADevice (  ) 

Definition at line 980 of file ComputePmeCUDAMgr.C.

References cudaCheck, and j.

00980                                             {
00981   if (streamCreated) {
00982     cudaCheck(cudaSetDevice(deviceID));
00983     cudaCheck(cudaStreamDestroy(stream));
00984   }
00985   for (int j=0;j < 2;j++)
00986     for (int i=0;i < pmeAtomStorage[j].size();i++) {
00987       if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
00988     }
00989   if (force != NULL) deallocate_host<CudaForce>(&force);
00990   if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
00991   CmiDestroyLock(lock_numHomePatchesMerged);
00992   CmiDestroyLock(lock_numPencils);
00993   CmiDestroyLock(lock_numNeighborsRecv);
00994   CmiDestroyLock(lock_recvAtoms);
00995 }


Member Function Documentation

void ComputePmeCUDADevice::activate_pencils (  ) 

Definition at line 1097 of file ComputePmeCUDAMgr.C.

References PmeStartMsg::data, PmeStartMsg::dataSize, PmeRealSpaceCompute::getData(), and PmeRealSpaceCompute::getDataSize().

01097                                             {
01098   if (pmePencilType == 1) {
01099     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01100     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01101     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01102     pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
01103   } else if (pmePencilType == 2) {
01104     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01105     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01106     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01107     pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
01108   } else if (pmePencilType == 3) {
01109     PmeStartMsg* pmeStartMsg = new PmeStartMsg();
01110     pmeStartMsg->data = pmeRealSpaceCompute->getData();
01111     pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01112     pmePencilXYZ[0].start(pmeStartMsg);
01113   }
01114 }

void ComputePmeCUDADevice::gatherForce (  ) 

Definition at line 1415 of file ComputePmeCUDAMgr.C.

References CUDA_PME_SPREADCHARGE_EVENT, and PmeRealSpaceCompute::gatherForce().

01415                                        {
01416   traceUserBracketEvent(CUDA_PME_SPREADCHARGE_EVENT, beforeWalltime, CmiWallTimer());
01417   beforeWalltime = CmiWallTimer();
01418   // (already have the updated lattice)
01419   pmeRealSpaceCompute->gatherForce(lattice, force);
01420   // Set callback that will call gatherForceDone() once gatherForce is done
01421   ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
01422   // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
01423   // gatherForceDone();
01424 }
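
gatherForce() launches the force-gathering work asynchronously on the device stream and then hands this object to the real-space compute, which arranges for gatherForceDone() to be called once the device work has completed. The notification mechanism lives inside CudaPmeRealSpaceCompute and is not reproduced on this page; the sketch below only illustrates the generic CUDA stream-callback pattern such a mechanism can use (the callback function and its registration here are hypothetical):

    #include <cuda_runtime.h>

    // Hypothetical host callback fired after all work queued on the stream
    // (e.g. the gather kernel) has finished executing.
    static void CUDART_CB gatherForceDoneCallback(cudaStream_t /*stream*/,
                                                  cudaError_t status,
                                                  void *userData) {
      if (status != cudaSuccess) return;   // real code would report the error
      // CUDA forbids CUDA API calls inside such a callback; real code would
      // forward a message/notification rather than doing heavy work here.
      // ComputePmeCUDADevice *dev = (ComputePmeCUDADevice *)userData;
      // dev->gatherForceDone();
    }

    // Registration, conceptually what gatherForceSetCallback(this) arranges:
    //   cudaStreamAddCallback(stream, gatherForceDoneCallback, (void *)this, 0);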

void ComputePmeCUDADevice::gatherForceDone (  ) 

Definition at line 1450 of file ComputePmeCUDAMgr.C.

References CUDA_PME_GATHERFORCE_EVENT, gatherForceDoneLoop(), Node::Object(), sendForcesToNeighbors(), Node::simParameters, and SimParameters::useCkLoop.

01450                                            {
01451   // Primary pencil has the forces
01452 
01453   traceUserBracketEvent(CUDA_PME_GATHERFORCE_EVENT, beforeWalltime, CmiWallTimer());
01454 
01455   // Send forces to neighbors
01456   sendForcesToNeighbors();
01457 
01458 #if CMK_SMP && USE_CKLOOP
01459   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01460   if (useCkLoop >= 1) {
01461     CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
01462   } else
01463 #endif
01464 
01465   {
01466     // Loop through home patches and mark the primary pencil as "done"
01467     for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
01468       bool done = false;
01469       // ----------------------------- lock start ---------------------------
01470       // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01471       //       we really only need to lock each element; that would, however, require
01472       //       numHomePatches separate locks.
01473       if (pmePencilType != 3) CmiLock(lock_numPencils);
01474       numPencils[forceI][homePatchIndex]--;
01475       if (numPencils[forceI][homePatchIndex] == 0) done = true;
01476       if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01477       // ----------------------------- lock end  ---------------------------
01478       if (done) {
01479         // This home patch is done, launch force merging
01480         thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01481       }
01482     }
01483   }
01484 
01485   // In case we have no home patches, clear the primary pencil storage here
01486   if (numHomePatches == 0) {
01487     int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01488     pmeAtomStorage[forceI][pp0]->clear();
01489   }
01490 
01491 }
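
When CkLoop is enabled, the per-patch bookkeeping loop above is parallelized over the PEs of the node, with gatherForceDoneSubset() handling each chunk of home patches. The gatherForceDoneLoop helper passed to CkLoop_Parallelize is not reproduced on this page; a plausible sketch, assuming the standard CkLoop helper signature, is:

    #include "CkLoopAPI.h"
    #include "ComputePmeCUDAMgr.h"

    // Hypothetical CkLoop helper: each chunk [first,last] of home patches is
    // handled by gatherForceDoneSubset() on whichever PE CkLoop assigns it to.
    static void gatherForceDoneLoop(int first, int last, void *result,
                                    int paramNum, void *param) {
      ComputePmeCUDADevice *dev = (ComputePmeCUDADevice *)param;
      dev->gatherForceDoneSubset(first, last);
    }

    // Invoked as in gatherForceDone() above:
    //   CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this,
    //                      CkMyNodeSize(), 0, numHomePatches - 1);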

void ComputePmeCUDADevice::gatherForceDoneSubset ( int first, int last )

Definition at line 1431 of file ComputePmeCUDAMgr.C.

References mergeForcesOnPatch().

Referenced by gatherForceDoneLoop().

01431                                                                     {
01432   for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
01433     bool done = false;
01434     // ----------------------------- lock start ---------------------------
01435     // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01436     //       we really only need to lock each element; that would, however, require
01437     //       numHomePatches separate locks.
01438     if (pmePencilType != 3) CmiLock(lock_numPencils);
01439     numPencils[forceI][homePatchIndex]--;
01440     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01441     if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01442     // ----------------------------- lock end  ---------------------------
01443     if (done) {
01444       // This home patch is done, launch force merging
01445       mergeForcesOnPatch(homePatchIndex);
01446     }
01447   }
01448 }

int ComputePmeCUDADevice::getDeviceID (  ) 

Definition at line 1071 of file ComputePmeCUDAMgr.C.

01071                                       {
01072   return deviceID;
01073 }

CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy (  ) 

Definition at line 1075 of file ComputePmeCUDAMgr.C.

01075                                                            {
01076   return mgrProxy;
01077 }

cudaStream_t ComputePmeCUDADevice::getStream (  ) 

Definition at line 1067 of file ComputePmeCUDAMgr.C.

01067                                              {
01068   return stream;
01069 }

void ComputePmeCUDADevice::initialize ( PmeGrid & pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in )

Definition at line 997 of file ComputePmeCUDAMgr.C.

References createStream(), cudaCheck, j, PmeGrid::yBlocks, and PmeGrid::zBlocks.

00999                                        {
01000 
01001   deviceID = deviceID_in;
01002   cudaCheck(cudaSetDevice(deviceID));
01003   pmePencilType = pmePencilType_in;
01004   pmeGrid = pmeGrid_in;
01005   pencilIndexY = pencilIndexY_in;
01006   pencilIndexZ = pencilIndexZ_in;
01007   mgrProxy = mgrProxy_in;
01008   pmeAtomFiler = pmeAtomFiler_in;
01009   // Size of the neighboring pencil grid, max 3x3
01010   yNBlocks = std::min(pmeGrid.yBlocks, 3);
01011   zNBlocks = std::min(pmeGrid.zBlocks, 3);
01012   // Local pencil is at y=0,z=0
01013   if (yNBlocks == 1) {
01014     ylo = 0;
01015     yhi = 0;
01016   } else if (yNBlocks == 2) {
01017     ylo = -1;
01018     yhi = 0;
01019   } else {
01020     ylo = -1;
01021     yhi = 1;
01022   }
01023   if (zNBlocks == 1) {
01024     zlo = 0;
01025     zhi = 0;
01026   } else if (zNBlocks == 2) {
01027     zlo = -1;
01028     zhi = 0;
01029   } else {
01030     zlo = -1;
01031     zhi = 1;
01032   }
01033   
01034   neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
01035   // neighborForcePencils.resize(yNBlocks*zNBlocks);
01036   for (int j=0;j < 2;j++)
01037     homePatchIndexList[j].resize(yNBlocks*zNBlocks);
01038   neighborPatchIndex.resize(yNBlocks*zNBlocks);
01039 
01040   pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
01041   for (int j=0;j < 2;j++) {
01042     pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
01043     for (int z=zlo;z <= zhi;z++) {
01044       for (int y=ylo;y <= yhi;y++) {
01045         int pp = y-ylo + (z-zlo)*yNBlocks;
01046         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01047         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01048         if (y == 0 && z == 0) {
01049           // Primary pencil
01050           pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
01051         } else {
01052           pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
01053         }
01054         pmeAtomStorageAllocatedHere[pp] = true;
01055       }
01056     }
01057   }
01058 
01059   // Create stream for this device
01060   createStream(stream);
01061   streamCreated = true;
01062   pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
01063     deviceID, stream);
01064 
01065 }
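
initialize() sets up a window of at most 3x3 neighboring pencils in (y,z), with the primary pencil at offset (0,0) and the window bounds ylo..yhi and zlo..zhi. Throughout the class a neighbor offset (y,z) is flattened into a storage index pp, and absolute pencil coordinates are obtained with periodic wraparound over yBlocks and zBlocks. A small standalone illustration of that indexing (the helper names are made up for this example):

    // Flatten a neighbor offset (y,z), y in [ylo,yhi], z in [zlo,zhi],
    // into an index for arrays of size yNBlocks*zNBlocks.
    static inline int flatPencilIndex(int y, int z, int ylo, int zlo, int yNBlocks) {
      return (y - ylo) + (z - zlo) * yNBlocks;
    }

    // Map a neighbor offset to an absolute pencil coordinate with periodic
    // wraparound, as done when registering neighbors and addressing proxies.
    static inline int wrapPencil(int pencilIndex, int offset, int nBlocks) {
      return (pencilIndex + offset + nBlocks) % nBlocks;
    }

    // Example: the primary-pencil storage index used throughout this class
    //   int pp0 = flatPencilIndex(0, 0, ylo, zlo, yNBlocks);  // == 0-ylo + (0-zlo)*yNBlocks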

void ComputePmeCUDADevice::initializePatches ( int  numHomePatches_in  ) 

Definition at line 1116 of file ComputePmeCUDAMgr.C.

References j, PmeGrid::yBlocks, and PmeGrid::zBlocks.

01116                                                                   {
01117   numHomePatches = numHomePatches_in;
01118   for (int j=0;j < 2;j++)
01119     numPencils[j].resize(numHomePatches);
01120   for (int j=0;j < 2;j++)
01121     plList[j].resize(numHomePatches);
01122   for (int j=0;j < 2;j++)
01123     homePatchForceMsgs[j].resize(numHomePatches);
01124   // for (int j=0;j < 2;j++)
01125   //   numHomeAtoms[j].resize(numHomePatches);
01126   // If we have home patches, register this pencil with the neighbors and with self
01127   if (numHomePatches > 0) {
01128     for (int z=zlo;z <= zhi;z++) {
01129       for (int y=ylo;y <= yhi;y++) {
01130         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01131         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01132         int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01133         mgrProxy[node].registerNeighbor(yt, zt);
01134       }
01135     }
01136   }
01137 }

void ComputePmeCUDADevice::mergeForcesOnPatch ( int  homePatchIndex  ) 

Definition at line 1584 of file ComputePmeCUDAMgr.C.

References j, sendForcesToPatch(), CudaForce::x, CudaForce::y, and CudaForce::z.

Referenced by gatherForceDoneSubset().

01584                                                                 {
01585   // We have all the forces for this patch => merge on a single Pe
01586 
01587   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01588 
01589   // Message that goes out to the compute
01590   PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
01591 
01592   if (pmePencilType == 3) {
01593     // 3D box => simple memory copy will do
01594     // Location of forces in the force[] array
01595     int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01596     // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
01597     int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01598     int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01599     int atomEnd   = patchPos[pencilPatchIndex];
01600     int numAtoms = atomEnd-atomStart;
01601     if (forceMsg->zeroCopy) {
01602       // Zero-copy, just pass the pointer
01603       forceMsg->force = force+atomStart;
01604     } else {
01605       memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
01606     }
01607   } else {
01608 
01609     // Zero force array
01610     // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
01611     memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
01612 
01613     // Store forces from primary pencil
01614     {
01615       int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01616       int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
01617       int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01618       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01619       int atomEnd   = patchPos[pencilPatchIndex];
01620       int numAtoms = atomEnd-atomStart;
01621 
01622       // Copy in local forces that are stored in the force[] array
01623       for (int i=0;i < numAtoms;i++) {
01624         forceMsg->force[index[atomStart + i]] = force[atomStart + i];
01625       }
01626 
01627     }
01628 
01629     // Add forces from neighboring pencils
01630     for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
01631       int pp               = plList[forceI][homePatchIndex][j].pp;
01632       int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
01633 
01634       int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
01635       int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
01636       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01637       int atomEnd   = patchPos[pencilPatchIndex];
01638       int numAtoms = atomEnd-atomStart;
01639       CudaForce *dstForce = forceMsg->force;
01640       // CudaForce *srcForce = neighborForcePencils[pp].force;
01641       CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
01642 
01643       for (int i=0;i < numAtoms;i++) {
01644         dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
01645         dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
01646         dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
01647       }
01648 
01649     }
01650   }
01651 
01652   // Clear storage
01653   plList[forceI][homePatchIndex].clear();
01654 
01655   // ----------------------------- lock start ---------------------------
01656   // bool done = false;
01657   CmiLock(lock_numHomePatchesMerged);
01658   numHomePatchesMerged++;
01659   if (numHomePatchesMerged == numHomePatches) {
01660     // Reset counter
01661     numHomePatchesMerged = 0;
01662 
01663     // Delete messages
01664     for (int i=0;i < neighborForcePencilMsgs.size();i++) {
01665       if (neighborForcePencilMsgs[i] != NULL) {
01666         delete neighborForcePencilMsgs[i];
01667         neighborForcePencilMsgs[i] = NULL;
01668       }
01669     }
01670 
01671     // Done merging and sending forces => clear storage
01672     for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
01673       homePatchIndexList[forceI][pp].clear();
01674     for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
01675       pmeAtomStorage[forceI][pp]->clear();
01676 
01677   }
01678   CmiUnlock(lock_numHomePatchesMerged);
01679   // ----------------------------- lock end  ---------------------------
01680 
01681   // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
01682   // that node will then relay the message to the Pe that originally sent the atoms
01683   int pe = forceMsg->pe;
01684   if (CkNodeOf(pe) != CkMyNode())
01685     thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
01686   else
01687     sendForcesToPatch(forceMsg);
01688 
01689 }
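
Force merging relies on the per-pencil patchPos arrays returned by the atom storage: entry i holds the cumulative atom count through patch i, so patch i occupies the half-open range [patchPos[i-1], patchPos[i]) of the flat force[] array (with an implicit 0 before the first patch), and getAtomIndex() maps those positions back to the home patch's atom order. A brief standalone sketch of reading a patch's range from such a prefix-sum array (not the NAMD storage class itself):

    #include <cstdio>
    #include <vector>

    // patchPos is an inclusive prefix sum of per-patch atom counts:
    // patch p occupies indices [start, end) of the flat atom/force arrays.
    static void patchRange(const std::vector<int> &patchPos, int p,
                           int &start, int &end) {
      start = (p == 0) ? 0 : patchPos[p - 1];
      end   = patchPos[p];
    }

    int main() {
      std::vector<int> patchPos = {5, 12, 20};   // three patches with 5, 7 and 8 atoms
      for (int p = 0; p < (int)patchPos.size(); p++) {
        int start, end;
        patchRange(patchPos, p, start, end);
        std::printf("patch %d: atoms [%d,%d), count %d\n", p, start, end, end - start);
      }
      return 0;
    }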

void ComputePmeCUDADevice::recvAtoms ( PmeAtomMsg msg  ) 

Definition at line 1148 of file ComputePmeCUDAMgr.C.

References PmeAtomMsg::atoms, PmeAtomMsg::compute, PmeForceMsg::compute, PmeAtomMsg::doEnergy, PmeAtomMsg::doVirial, PmeAtomFiler::fileAtoms(), PmeAtomFiler::getAtomIndex(), PmeAtomFiler::getNumAtoms(), j, PmeAtomMsg::lattice, NAMD_bug(), PmeForceMsg::numAtoms, PmeAtomMsg::numAtoms, PmeForceMsg::numStrayAtoms, PmeForceMsg::pe, PmeAtomMsg::pe, PRIORITY_SIZE, sendAtomsToNeighbors(), CudaAtom::x, CudaAtom::y, PmeGrid::yBlocks, CudaAtom::z, PmeGrid::zBlocks, and PmeForceMsg::zeroCopy.

01148                                                     {
01149 
01150   PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
01151   // Store "virial" and "energy" flags
01152   doVirial = msg->doVirial;
01153   doEnergy = msg->doEnergy;
01154   // Store lattice
01155   lattice = msg->lattice;
01156 
01157   // Primary pencil index
01158   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01159   int p0 = 0;
01160   int pencilPatchIndex[9];
01161   int numStrayAtomsPatch = 0;
01162   if (pmePencilType == 3) {
01163     // 3D box => store atoms directly without index
01164     // NOTE: We don't check for stray atoms here!
01165     pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01166   } else {
01167 
01168     // File atoms
01169     pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
01170       pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
01171 
01172     // Loop through pencils and add atoms to pencil atom lists
01173     // NOTE: we only store to neighboring pencil if there are atoms to store
01174     int numAtomsCheck = 0;
01175     for (int p=0;p < 9;p++) {
01176 
01177       int y = (p % 3);
01178       int z = (p / 3);
01179 
01180       int pp = y + z*yNBlocks;
01181       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01182       if (pp == pp0) p0 = p;
01183       if (pp == pp0 || numAtoms > 0) {
01184         if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
01185           NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
01186         int* index = pmeAtomFilerPtr->getAtomIndex(p);
01187         pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
01188         // Number of patches in this storage tells you how many home patches contributed and
01189         // homePatchIndex (pe) tells you which patch contributed
01190         numAtomsCheck += numAtoms;
01191       }
01192     }
01193 
01194     // Deal with stray atoms
01195     numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
01196     if (numStrayAtomsPatch > 0) {
01197       int* index = pmeAtomFilerPtr->getAtomIndex(9);
01198       CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
01199       for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
01200         int j = index[i];
01201         CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
01202       }
01203     }
01204 
01205     if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
01206       NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
01207   }
01208 
01209   // Create storage for home patch forces
01210   PmeForceMsg *forceMsg;
01211   if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
01212     // 3D FFT and compute resides on the same node => use zero-copy forces
01213     forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
01214     forceMsg->zeroCopy = true;
01215   } else {
01216     forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
01217     forceMsg->zeroCopy = false;
01218   }
01219   forceMsg->numAtoms = msg->numAtoms;
01220   forceMsg->pe = msg->pe;
01221   forceMsg->compute = msg->compute;
01222   forceMsg->numStrayAtoms = numStrayAtomsPatch;
01223 
01224   bool done = false;
01225   // ----------------------------- lock start ---------------------------
01226   // Only after writing has finished do we get homePatchIndex
01227   // This guarantees that for whichever thread receives "done=true", writing has finished on
01228   // ALL threads.
01229   CmiLock(lock_recvAtoms);
01230   numStrayAtoms += numStrayAtomsPatch;
01231   // Secure homePatchIndex. All writes after this must be inside lock-region
01232   int homePatchIndex = numHomePatchesRecv;
01233   // Store primary pencil first
01234   plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
01235   if (pmePencilType != 3) {
01236     // Go back through neighboring pencils and store "homePatchIndex"
01237     for (int p=0;p < 9;p++) {
01238 
01239       int y = (p % 3);
01240       int z = (p / 3);
01241 
01242       int pp = y + z*yNBlocks;
01243       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01244       if (pp != pp0 && numAtoms > 0) {
01245         homePatchIndexList[atomI][pp].push_back(homePatchIndex);
01246         // plList[0...numHomePatches-1] stores, for each home patch, the locations of the
01247         //                                pencils that are sharing it
01248         // plList[homePatchIndex].size() tells the number of pencils that the home patch is shared with
01249         plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
01250       }
01251     }
01252   }
01253   homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
01254   // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
01255   // Set the number of pencils contributing to this home patch
01256   numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
01257   //
01258   numHomePatchesRecv++;
01259   if (numHomePatchesRecv == numHomePatches) {
01260     // Reset counter
01261     numHomePatchesRecv = 0;
01262     done = true;
01263   }
01264   CmiUnlock(lock_recvAtoms);
01265   // ----------------------------- lock end  ---------------------------
01266 
01267   // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
01268 
01269   delete msg;
01270 
01271   if (done) {
01272     // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
01273     sendAtomsToNeighbors();
01274   }
01275 }
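
For pencil decompositions (pmePencilType != 3), the atom filer sorts each home patch's atoms into up to nine lists, indexed p = 0..8, where y = p % 3 and z = p / 3 address the 3x3 neighbor window starting at (ylo, zlo); list index 9 collects stray atoms that fall outside that window, which recvAtoms() only reports and counts. A small standalone illustration of the mapping (helper names invented for the example):

    // Decode an atom-filer list index p = 0..8 into the window coordinates
    // and the flat pencil-storage index used by recvAtoms().
    struct FilerSlot {
      int y;    // column within the 3x3 window, 0..2
      int z;    // row within the 3x3 window, 0..2
      int pp;   // flat storage index: y + z*yNBlocks
    };

    static inline FilerSlot decodeFilerIndex(int p, int yNBlocks) {
      FilerSlot s;
      s.y  = p % 3;
      s.z  = p / 3;
      s.pp = s.y + s.z * yNBlocks;
      return s;
    }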

void ComputePmeCUDADevice::recvAtomsFromNeighbor ( PmeAtomPencilMsg msg  ) 

Definition at line 1323 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, PmeAtomPencilMsg::lattice, NAMD_bug(), PmeAtomPencilMsg::numAtoms, registerRecvAtomsFromNeighbor(), PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, PmeGrid::yBlocks, and PmeGrid::zBlocks.

01323                                                                       {
01324   // Store into primary pencil
01325   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01326   // Compute pencil index relative to primary pencil
01327   int y = msg->srcY - pencilIndexY;
01328   if (y < ylo) y += pmeGrid.yBlocks;
01329   if (y > yhi) y -= pmeGrid.yBlocks;
01330   int z = msg->srcZ - pencilIndexZ;
01331   if (z < zlo) z += pmeGrid.zBlocks;
01332   if (z > zhi) z -= pmeGrid.zBlocks;
01333   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01334     NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
01335   }
01336   // Read energy and virial flags
01337   doEnergy = msg->doEnergy;
01338   doVirial = msg->doVirial;
01339   // Read lattice
01340   lattice = msg->lattice;
01341   // Pencil index where atoms came from
01342   int pp = y-ylo + (z-zlo)*yNBlocks;
01343   // Store atoms and mark down the patch index where these atoms were added
01344   neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01345 
01346   delete msg;
01347 
01348   registerRecvAtomsFromNeighbor();
01349 }
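
The sender identifies itself by absolute pencil coordinates (srcY, srcZ); the receiver converts these back into an offset relative to its own (pencilIndexY, pencilIndexZ) and folds the result into the neighbor window [ylo,yhi] x [zlo,zhi] before computing the storage index. A standalone sketch of that inverse mapping (the helper name is made up):

    // Convert an absolute source pencil coordinate into an offset relative to
    // this pencil, wrapped into the neighbor window [lo, hi].
    static inline int relativeOffset(int src, int self, int lo, int hi, int nBlocks) {
      int d = src - self;
      if (d < lo) d += nBlocks;
      if (d > hi) d -= nBlocks;
      return d;   // callers verify lo <= d <= hi and that (dy,dz) != (0,0)
    }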

void ComputePmeCUDADevice::recvForcesFromNeighbor ( PmeForcePencilMsg msg  ) 

Definition at line 1529 of file ComputePmeCUDAMgr.C.

References NAMD_bug(), PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, PmeGrid::yBlocks, and PmeGrid::zBlocks.

01529                                                                         {
01530 
01531   // Source pencil index
01532   int y = msg->srcY - pencilIndexY;
01533   if (y < ylo) y += pmeGrid.yBlocks;
01534   if (y > yhi) y -= pmeGrid.yBlocks;
01535   int z = msg->srcZ - pencilIndexZ;
01536   if (z < zlo) z += pmeGrid.zBlocks;
01537   if (z > zhi) z -= pmeGrid.zBlocks;
01538 
01539   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01540     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
01541   }
01542 
01543   // Source pencil
01544   int pp = y-ylo + (z-zlo)*yNBlocks;
01545 
01546   // Store message (deleted in mergeForcesOnPatch)
01547   neighborForcePencilMsgs[pp] = msg;
01548 
01549   // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
01550   // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
01551   // neighborForcePencils[pp].numAtoms = msg->numAtoms;
01552   // neighborForcePencils[pp].y = msg->y;
01553   // neighborForcePencils[pp].z = msg->z;
01554   // neighborForcePencils[pp].srcY = msg->srcY;
01555   // neighborForcePencils[pp].srcZ = msg->srcZ;
01556   // delete msg;
01557 
01558   // numPatches = number of home patches this pencil has
01559   int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
01560   if (numPatches != homePatchIndexList[forceI][pp].size()) {
01561     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
01562   }
01563   for (int i=0;i < numPatches;i++) {
01564     // this pencil contributed to home patch with index "homePatchIndex"
01565     int homePatchIndex = homePatchIndexList[forceI][pp][i];
01566     // ----------------------------- lock start ---------------------------
01567     // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01568     //       we really only need to lock each element; that would, however, require
01569     //       numHomePatches separate locks.
01570     bool done = false;
01571     CmiLock(lock_numPencils);
01572     numPencils[forceI][homePatchIndex]--;
01573     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01574     CmiUnlock(lock_numPencils);
01575     // ----------------------------- lock end  ---------------------------
01576     if (done) {
01577       // This home patch is done, launch force merging
01578       thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01579     }
01580   }
01581 
01582 }

void ComputePmeCUDADevice::registerNeighbor (  ) 

Definition at line 1139 of file ComputePmeCUDAMgr.C.

01139                                             {
01140   CmiLock(lock_numHomePatchesMerged);
01141   numNeighborsExpected++;
01142   CmiUnlock(lock_numHomePatchesMerged);
01143 }

void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor (  ) 

Definition at line 1351 of file ComputePmeCUDAMgr.C.

References spreadCharge().

Referenced by recvAtomsFromNeighbor(), and sendAtomsToNeighbors().

01351                                                          {
01352   // Primary pencil
01353   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01354 
01355   bool done = false;
01356   // ----------------------------- lock start ---------------------------
01357   CmiLock(lock_numNeighborsRecv);
01358   numNeighborsRecv++;
01359   if (numNeighborsRecv == numNeighborsExpected) {
01360     // Reset counter
01361     numNeighborsRecv = 0;
01362     done = true;
01363   }
01364   CmiUnlock(lock_numNeighborsRecv);
01365   // ----------------------------- lock end  ---------------------------
01366 
01367   if (done) {
01368     // Primary pencil has received all atoms and writing has finished => spread charge
01369     spreadCharge();
01370   }  
01371 }

void ComputePmeCUDADevice::sendAtomsToNeighbor ( int y, int z, int atomIval )

Definition at line 1295 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, PmeAtomPencilMsg::lattice, PmeAtomPencilMsg::numAtoms, PRIORITY_SIZE, PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, PmeAtomPencilMsg::y, PmeGrid::yBlocks, PmeAtomPencilMsg::z, and PmeGrid::zBlocks.

01295                                                                          {
01296   // Pencil index  
01297   int pp = y-ylo + (z-zlo)*yNBlocks;
01298   // This neighbor pencil is done, finish it up before accessing it
01299   pmeAtomStorage[atomIval][pp]->finish();
01300   // Compute destination neighbor pencil index (yt,zt)
01301   int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01302   int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01303   int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
01304   CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
01305   PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
01306   memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
01307   msgPencil->numAtoms = numAtoms;
01308   // Store destination pencil index
01309   msgPencil->y = yt;
01310   msgPencil->z = zt;
01311   // Store source pencil index
01312   msgPencil->srcY = pencilIndexY;
01313   msgPencil->srcZ = pencilIndexZ;
01314   // Store energy and virial flags
01315   msgPencil->doEnergy = doEnergy;
01316   msgPencil->doVirial = doVirial;
01317   // Store lattice
01318   msgPencil->lattice = lattice;
01319   int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01320   mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
01321 }

void ComputePmeCUDADevice::sendAtomsToNeighbors (  ) 

Definition at line 1280 of file ComputePmeCUDAMgr.C.

References registerRecvAtomsFromNeighbor().

Referenced by recvAtoms().

01280                                                 {
01281   for (int z=zlo;z <= zhi;z++) {
01282     for (int y=ylo;y <= yhi;y++) {
01283       // Only send to neighbors, not self
01284       if (y != 0 || z != 0) {
01285         // NOTE: Must pass the current atomI value since it will change in spreadCharge(), which
01286         // might occur before these sends have been performed
01287         thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
01288       }
01289     }
01290   }
01291   // Register primary pencil
01292   registerRecvAtomsFromNeighbor();
01293 }

void ComputePmeCUDADevice::sendForcesToNeighbors (  ) 

Definition at line 1496 of file ComputePmeCUDAMgr.C.

References PmeForcePencilMsg::force, PmeForcePencilMsg::numAtoms, PRIORITY_SIZE, PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, PmeForcePencilMsg::y, PmeGrid::yBlocks, PmeForcePencilMsg::z, and PmeGrid::zBlocks.

Referenced by gatherForceDone().

01496                                                  {
01497   // Primary pencil has the forces
01498   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01499   int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01500   // Loop through neighboring pencils
01501   for (int z=zlo;z <= zhi;z++) {
01502     for (int y=ylo;y <= yhi;y++) {
01503       // Only send to neighbors, not self
01504       if (y != 0 || z != 0) {
01505         int pp = y-ylo + (z-zlo)*yNBlocks;
01506         int patchIndex = neighborPatchIndex[pp];
01507         int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
01508         int atomEnd   = patchPos[patchIndex];
01509         int natom = atomEnd-atomStart;
01510         // copy forces
01511         PmeForcePencilMsg *msg = new (natom, PRIORITY_SIZE) PmeForcePencilMsg;
01512         msg->numAtoms = natom;
01513         memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
01514         // Calculate destination pencil index (dstY, dstZ) for this neighbor
01515         int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01516         int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01517         int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
01518         msg->y = dstY;
01519         msg->z = dstZ;
01520         // Store source pencil index
01521         msg->srcY = pencilIndexY;
01522         msg->srcZ = pencilIndexZ;
01523         mgrProxy[node].recvForcesFromNeighbor(msg);
01524       }
01525     }
01526   }
01527 }

void ComputePmeCUDADevice::sendForcesToPatch ( PmeForceMsg forceMsg  ) 

Definition at line 1691 of file ComputePmeCUDAMgr.C.

References PmeForceMsg::compute, Compute::localWorkMsg, PmeForceMsg::pe, and ComputePmeCUDA::storePmeForceMsg().

Referenced by mergeForcesOnPatch().

01691                                                                   {
01692   // Now we're on the node that has Pe, hence "compute" -pointer is valid
01693   int pe                  = forceMsg->pe;
01694   ComputePmeCUDA *compute = forceMsg->compute;
01695 
01696   // Store message for use in ComputePmeCUDA, where it'll also be deleted.
01697   if (compute->storePmeForceMsg(forceMsg)) {
01698     // Enqueue on the pe that sent the atoms in the first place
01699     LocalWorkMsg *lmsg = compute->localWorkMsg;
01700     CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
01701     wdProxy[pe].enqueuePme(lmsg);
01702   }
01703 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilX  pmePencilX_in  ) 

Definition at line 1091 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01091                                                                              {
01092   if (pmePencilType != 1)
01093     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
01094   pmePencilX = pmePencilX_in;
01095 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXY  pmePencilXY_in  ) 

Definition at line 1085 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01085                                                                                {
01086   if (pmePencilType != 2)
01087     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
01088   pmePencilXY = pmePencilXY_in;
01089 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXYZ  pmePencilXYZ_in  ) 

Definition at line 1079 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01079                                                                                  {
01080   if (pmePencilType != 3)
01081     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
01082   pmePencilXYZ = pmePencilXYZ_in;
01083 }

void ComputePmeCUDADevice::spreadCharge (  ) 

Definition at line 1373 of file ComputePmeCUDAMgr.C.

References atoms, PmeRealSpaceCompute::copyAtoms(), PmeRunMsg::doEnergy, PmeRunMsg::doVirial, PmeRunMsg::lattice, PmeRunMsg::numStrayAtoms, PmeRealSpaceCompute::spreadCharge(), and msm::swap().

Referenced by registerRecvAtomsFromNeighbor().

01373                                         {
01374   // Spread charges in primary pencil
01375   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01376   // Primary pencil is done, finish it up before accessing it
01377   // (clearing is done in mergeForcesOnPatch)
01378   pmeAtomStorage[atomI][pp0]->finish();
01379   // Get the number of atoms and pointer to atoms
01380   int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
01381   CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
01382   // Flip atomI <-> forceI
01383   std::swap(atomI, forceI);
01384   // Re-allocate force buffer if needed
01385   reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
01386   // (already have the updated lattice)
01387   pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
01388   // Spread charge
01389   beforeWalltime = CmiWallTimer();
01390   pmeRealSpaceCompute->spreadCharge(lattice);
01391   // Send the "charge grid ready" message to the PME solver
01392   PmeRunMsg *pmeRunMsg = new PmeRunMsg();
01393   pmeRunMsg->doVirial = doVirial;
01394   pmeRunMsg->doEnergy = doEnergy;
01395   pmeRunMsg->lattice = lattice;
01396   pmeRunMsg->numStrayAtoms = numStrayAtoms;
01397   // Reset stray atom counter
01398   numStrayAtoms = 0;
01399   switch(pmePencilType) {
01400     case 1:
01401     pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
01402     break;
01403     case 2:
01404     pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
01405     break;
01406     case 3:
01407     pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
01408     break;
01409   }
01410 }
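
spreadCharge() swaps the atomI and forceI buffer indices: the pencil storage that has just been filled with atoms becomes the slot that forces are gathered and merged against, while the other slot is free to receive the next step's atoms before the current step's forces have been sent out. The sketch below shows the two-slot pattern in isolation (a minimal sketch, not the actual class):

    #include <utility>
    #include <vector>

    // Two-slot double buffer: slot atomI receives data for step N+1 while
    // slot forceI is still being consumed for step N.
    struct DoubleBuffer {
      int atomI  = 0;   // slot currently accepting incoming atoms
      int forceI = 1;   // slot whose forces are being gathered and merged
      std::vector<int> storage[2];

      // Corresponds to the std::swap(atomI, forceI) in spreadCharge():
      // from here on, new atoms go into the other slot.
      void flip() { std::swap(atomI, forceI); }
    };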


The documentation for this class was generated from the following files:

ComputePmeCUDAMgr.h
ComputePmeCUDAMgr.C

Generated on 25 May 2020 for NAMD by  doxygen 1.6.1