NAMD
ComputePmeCUDAMgr.h
#ifndef COMPUTEPMECUDAMGR_H
#define COMPUTEPMECUDAMGR_H

#include <vector>

#include "PmeBase.h"
#include "PmeSolver.h"
#include "PmeSolverUtil.h"
#include "ComputePmeCUDAMgr.decl.h"
#ifdef NAMD_CUDA
#include <cuda_runtime.h>  // Needed for cudaStream_t, which is used in the ComputePmeCUDAMgr class
#endif

#ifdef NAMD_CUDA
class ComputePmeCUDA;

//
// Base class for thread-safe atom storage
//
class PmeAtomStorage {
public:
  PmeAtomStorage(const bool useIndex) : useIndex(useIndex) {
    numAtoms = 0;
    atomCapacity = 0;
    atom = NULL;
    atomIndexCapacity = 0;
    atomIndex = NULL;
    overflowStart = 0;
    overflowEnd = 0;
    overflowAtomCapacity = 0;
    overflowAtom = NULL;
    overflowAtomIndexCapacity = 0;
    overflowAtomIndex = NULL;
    lock_ = CmiCreateLock();
  }
  virtual ~PmeAtomStorage() {
    CmiDestroyLock(lock_);
  }

  int addAtoms(const int natom, const CudaAtom* src) {
    return addAtoms_(natom, src, NULL);
  }

  int addAtomsWithIndex(const int natom, const CudaAtom* src, const int* index) {
    return addAtoms_(natom, src, index);
  }

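  // Both entry points funnel into addAtoms_() below and return the index of the
  // patch whose atoms were just appended. With addAtomsWithIndex(), src is
  // gathered through the index array (dst[i] = src[index[i]], see
  // copyWithIndex_()), so src can be a larger buffer that is not packed per patch.
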
47 
48  // Finish up, must be called after "done" is returned by addAtoms.
49  // Only the last thread that gets the "done" signal from addAtoms can enter here.
50  void finish() {
51  if (overflowEnd-overflowStart > 0) {
52  resize_((void **)&atom, numAtoms, atomCapacity, sizeof(CudaAtom));
53  if (useIndex) resize_((void **)&atomIndex, numAtoms, atomIndexCapacity, sizeof(int));
54  memcpy_(atom+overflowStart, overflowAtom, (overflowEnd - overflowStart)*sizeof(CudaAtom));
55  if (useIndex) memcpy_(atomIndex+overflowStart, overflowAtomIndex, (overflowEnd - overflowStart)*sizeof(int));
56  overflowStart = 0;
57  overflowEnd = 0;
58  }
59  }
60 
61  // Clear and reset storage to initial stage.
62  // Only the last thread that gets the "done" signal from addAtoms can enter here.
63  void clear() {
64  patchPos.clear();
65  numAtoms = 0;
66  }
67 
  // Return pointer to atom data
  CudaAtom* getAtoms() {
    return atom;
  }

  // Return pointer to patch positions
  int* getPatchPos() {
    return patchPos.data();
  }

  int getNumPatches() {
    return patchPos.size();
  }

  int getNumAtoms() {
    return numAtoms;
  }

  int* getAtomIndex() {
    if (!useIndex)
      NAMD_bug("PmeAtomStorage::getAtomIndex, no indexing enabled");
    return atomIndex;
  }

protected:
  // Atom array
  CudaAtom* atom;
  // Atom index array
  int* atomIndex;
  // Overflow atom array
  CudaAtom* overflowAtom;
  // Overflow atom index array
  int* overflowAtomIndex;

private:
  // If true, uses indexed atom arrays
  const bool useIndex;
  // Node lock
  CmiNodeLock lock_;
  // Data overflow
  int overflowAtomCapacity;
  int overflowAtomIndexCapacity;
  int overflowStart;
  int overflowEnd;
  // Number of atoms currently in storage
  int numAtoms;
  // Atom patch position
  std::vector<int> patchPos;
  // Atom array capacity
  int atomCapacity;
  // Atom index array capacity
  int atomIndexCapacity;

  // Resize array with 1.5x extra storage
  void resize_(void **array, int sizeRequested, int& arrayCapacity, const int sizeofType) {
    // If array is not NULL and has enough capacity => we have nothing to do
    if (*array != NULL && arrayCapacity >= sizeRequested) return;

    // Otherwise, allocate new array
    int newArrayCapacity = (int)(sizeRequested*1.5);
    void* newArray = alloc_(sizeofType*newArrayCapacity);

    if (*array != NULL) {
      // We have old array => copy contents to new array
      memcpy_(newArray, *array, arrayCapacity*sizeofType);
      // De-allocate old array
      dealloc_(*array);
    }

    // Set new capacity and array pointer
    arrayCapacity = newArrayCapacity;
    *array = newArray;
  }

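  // For example, a request to grow a full array to 1000 elements allocates
  // capacity for 1500, so the number of reallocations grows only
  // logarithmically with the element count. Since alloc_()/dealloc_() are pure
  // virtual and memcpy_() is virtual, subclasses decide where the storage lives
  // (plausibly pinned host memory for CUDA transfers) and how it is copied.
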
  virtual void memcpy_(void *dst, const void* src, const int size) {
    memcpy(dst, src, size);
  }

  virtual void copyWithIndex_(CudaAtom* dst, const CudaAtom* src, const int natom, const int* indexSrc) {
    for (int i=0;i < natom;i++) dst[i] = src[indexSrc[i]];
  }

  // Allocate array of size bytes
  virtual void* alloc_(const int size)=0;

  // Deallocate array
  virtual void dealloc_(void *p)=0;

  // Add atoms in thread-safe manner.
  // Returns the patch index where the atoms were added
  int addAtoms_(const int natom, const CudaAtom* src, const int* index) {
    CmiLock(lock_);
    // Accumulate position for patches:
    // atoms for patch i are in the range [ patchPos[i-1], patchPos[i]-1 ]
    int patchInd = patchPos.size();
    int ppos = (patchInd == 0) ? natom : patchPos[patchInd-1] + natom;
    patchPos.push_back(ppos);
    int pos = numAtoms;
    bool overflow = false;
    numAtoms += natom;
    // Check for overflow
    if (numAtoms > atomCapacity || (useIndex && numAtoms > atomIndexCapacity)) {
      // Number of atoms exceeds capacity, store into overflow buffer.
      // Note: storing to overflow should be very infrequent, most likely only
      // in the initial call
      if (overflowEnd-overflowStart == 0) {
        overflowStart = pos;
        overflowEnd = pos;
      }
      overflowEnd += natom;
      if (overflowEnd-overflowStart > overflowAtomCapacity) {
        resize_((void **)&overflowAtom, overflowEnd-overflowStart, overflowAtomCapacity, sizeof(CudaAtom));
      }
      if (useIndex && overflowEnd-overflowStart > overflowAtomIndexCapacity) {
        resize_((void **)&overflowAtomIndex, overflowEnd-overflowStart, overflowAtomIndexCapacity, sizeof(int));
      }
      if (index != NULL) {
        if (useIndex) memcpy_(overflowAtomIndex+overflowEnd-overflowStart-natom, index, natom*sizeof(int));
        copyWithIndex_(overflowAtom+overflowEnd-overflowStart-natom, src, natom, index);
      } else {
        memcpy_(overflowAtom+overflowEnd-overflowStart-natom, src, natom*sizeof(CudaAtom));
      }
      overflow = true;
    }
    CmiUnlock(lock_);
    // If no overflow, copy to final position
    if (!overflow) {
      if (index != NULL) {
        if (useIndex) memcpy_(atomIndex+pos, index, natom*sizeof(int));
        copyWithIndex_(atom+pos, src, natom, index);
      } else {
        memcpy_(atom+pos, src, natom*sizeof(CudaAtom));
      }
    }
    return patchInd;
  }

};
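
// Usage sketch (illustrative only; assumes a concrete subclass that implements
// alloc_() and dealloc_(), and a caller that tracks when all contributors are done):
//
//   PmeAtomStorage* storage = ...;                  // concrete subclass instance
//   // Each contributing thread, possibly concurrently:
//   int patchInd = storage->addAtoms(natom, src);   // returns this patch's index
//   // Last contributor only, once the caller's own "done" signal fires:
//   storage->finish();                              // folds overflow into the main arrays
//   CudaAtom* atoms = storage->getAtoms();
//   int* patchPos = storage->getPatchPos();         // patch i occupies [patchPos[i-1], patchPos[i]-1]
//   // After the atoms have been consumed:
//   storage->clear();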

class PmeAtomMsg : public CMessage_PmeAtomMsg {
public:
  CudaAtom *atoms;
  int numAtoms;
  int i, j;
  ComputePmeCUDA* compute;
  int pe;
  bool doEnergy, doVirial;
  Lattice lattice;
  // int miny, minz;
};

class PmeForceMsg : public CMessage_PmeForceMsg {
public:
  CudaForce *force;
  int pe;
  int numAtoms;
  int numStrayAtoms;
  bool zeroCopy;
  ComputePmeCUDA* compute;
};

class PmeLaunchMsg : public CMessage_PmeLaunchMsg {
public:
  CudaForce* force;
  int natom;
  int pe;
  ComputePmeCUDA* compute;
};

class RegisterPatchMsg : public CMessage_RegisterPatchMsg {
public:
  int i, j;
};

class NumDevicesMsg : public CMessage_NumDevicesMsg {
public:
  NumDevicesMsg(int numDevices) : numDevices(numDevices) {}
  int numDevices;
};

class PmeAtomPencilMsg : public CMessage_PmeAtomPencilMsg {
public:
  CudaAtom* atoms;
  int numAtoms;
  int y, z;
  int srcY, srcZ;
  bool doEnergy, doVirial;
  Lattice lattice;
};

class PmeForcePencilMsg : public CMessage_PmeForcePencilMsg {
public:
  CudaForce* force;
  int numAtoms;
  int y, z;
  int srcY, srcZ;
};

class CProxy_ComputePmeCUDADevice;
class RecvDeviceMsg : public CMessage_RecvDeviceMsg {
public:
  CProxy_ComputePmeCUDADevice* dev;
  int numDevicesMax;
};

class PmeAtomFiler : public CBase_PmeAtomFiler {
public:
  PmeAtomFiler();
  PmeAtomFiler(CkMigrateMessage *);
  ~PmeAtomFiler();
  void fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
    const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi);
  // static inline int yBlock(int p) {return p % 3;}
  // static inline int zBlock(int p) {return p / 3;}
  int getNumAtoms(int p) {return pencilSize[p];}
  int* getAtomIndex(int p) {return pencil[p];}
private:
  // 9 Pencils + 1 Stay atom pencil
  int pencilSize[9+1];
  int pencilCapacity[9+1];
  int* pencil[9+1];
};
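
// The commented-out helpers above (yBlock(p) = p % 3, zBlock(p) = p / 3) suggest
// that slots 0-8 form a 3x3 grid of neighbor pencils with y and z offsets in
// [-1,1], i.e. p = (zOffset+1)*3 + (yOffset+1), while slot 9 holds the "stay"
// atoms that remain in the home pencil; this indexing is inferred from the
// helpers, not stated explicitly.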


class CProxy_ComputePmeCUDAMgr;
class ComputePmeCUDADevice : public CBase_ComputePmeCUDADevice {
public:
  // ComputePmeCUDADevice_SDAG_CODE;
  ComputePmeCUDADevice();
  ComputePmeCUDADevice(CkMigrateMessage *m);
  ~ComputePmeCUDADevice();
  void initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
    int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
    CProxy_PmeAtomFiler pmeAtomFiler_in);
  int getDeviceID();
  cudaStream_t getStream();
  CProxy_ComputePmeCUDAMgr getMgrProxy();
  void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in);
  void setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in);
  void setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in);
  void activate_pencils();
  void initializePatches(int numHomePatches_in);
  void registerNeighbor();
  void recvAtoms(PmeAtomMsg *msg);
  void sendAtomsToNeighbors();
  void sendAtomsToNeighbor(int y, int z, int atomIval);
  void recvAtomsFromNeighbor(PmeAtomPencilMsg *msg);
  void registerRecvAtomsFromNeighbor();
  void spreadCharge();
  void gatherForce();
  void gatherForceDone();
  void sendForcesToNeighbors();
  void recvForcesFromNeighbor(PmeForcePencilMsg *msg);
  void mergeForcesOnPatch(int homePatchIndex);
  void sendForcesToPatch(PmeForceMsg *forceMsg);

  void gatherForceDoneSubset(int first, int last);

private:
  // Store updated lattice
  Lattice lattice;
  // Store virial and energy flags
  bool doVirial, doEnergy;
  // PME grid definition
  PmeGrid pmeGrid;
  // PME pencil type
  int pmePencilType;
  // Neighboring pencil bounds, [-1,1]
  int ylo, yhi, zlo, zhi;
  // Size of the neighboring pencil grid, maximum value 3. yNBlocks = yhi - ylo + 1
  int yNBlocks, zNBlocks;
  // Number of home patches for this device
  int numHomePatches;
  // Pencil location for this device
  int pencilIndexY, pencilIndexZ;

  // Number of neighbors expected to provide atoms, including self
  int numNeighborsExpected;

  // Number of stray atoms
  int numStrayAtoms;

  // Node locks
  CmiNodeLock lock_numHomePatchesMerged;
  CmiNodeLock lock_numPencils;
  CmiNodeLock lock_numNeighborsRecv;
  CmiNodeLock lock_recvAtoms;

  int atomI, forceI;

  //----------------------------------------------------------------------------------
  // Bookkeeping
  // NOTE: We keep two copies of pmeAtomStorage and homePatchIndexList so that forces can be
  // merged while the next patch of atoms is already being received
  //----------------------------------------------------------------------------------
  // Storage for each pencil on the yNBlocks x zNBlocks grid
  std::vector< PmeAtomStorage* > pmeAtomStorage[2];
  std::vector<bool> pmeAtomStorageAllocatedHere;
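  // The two copies are presumably selected by atomI and forceI (declared above):
  // one index receives incoming atoms for the next step while the other is
  // drained during force merging, letting the two phases overlap.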

  // Size numHomePatches:
  // Tells how many pencils have contributed to each home patch
  std::vector<int> numPencils[2];

  // Pencil location
  struct PencilLocation {
    // Pencil index
    int pp;
    // Patch location in the pencil
    int pencilPatchIndex;
    PencilLocation(int pp, int pencilPatchIndex) : pp(pp), pencilPatchIndex(pencilPatchIndex) {}
  };

  // Size numHomePatches
  std::vector< std::vector<PencilLocation> > plList[2];

  // Size numHomePatches
  std::vector< PmeForceMsg* > homePatchForceMsgs[2];

  // // Size numHomePatches
  // std::vector<int> numHomeAtoms[2];

  std::vector< std::vector<int> > homePatchIndexList[2];

  // Number of neighbors from which we have received atoms
  int numNeighborsRecv;

  // Number of home patches we have received atoms from
  int numHomePatchesRecv;

  // Number of home patches we have merged forces for
  int numHomePatchesMerged;

  // Size yNBlocks*zNBlocks
  std::vector< PmeForcePencilMsg* > neighborForcePencilMsgs;
  // std::vector< PmeForcePencil > neighborForcePencils;

  // Size yNBlocks*zNBlocks
  std::vector<int> neighborPatchIndex;
  //----------------------------------------------------------------------------------

  // CUDA stream
  cudaStream_t stream;
  bool streamCreated;
  // Device ID
  int deviceID;
  // Charge spreading and force gathering
  PmeRealSpaceCompute* pmeRealSpaceCompute;
  // Host memory force array
  int forceCapacity;
  CudaForce* force;

  // Proxy for the manager
  CProxy_ComputePmeCUDAMgr mgrProxy;

  // Atom filer proxy
  CProxy_PmeAtomFiler pmeAtomFiler;

  // Pencil proxies
  CProxy_CudaPmePencilXYZ pmePencilXYZ;
  CProxy_CudaPmePencilXY pmePencilXY;
  CProxy_CudaPmePencilX pmePencilX;

  // For event tracing
  double beforeWalltime;
};
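
// A rough per-step flow suggested by the declarations above (a sketch, not
// authoritative documentation): recvAtoms() collects home-patch atoms,
// sendAtomsToNeighbors()/recvAtomsFromNeighbor() exchange halo atoms between
// neighboring pencils, spreadCharge() deposits charges onto the grid for the
// pencil FFT solver, gatherForce()/gatherForceDone() pull forces back from the
// grid, sendForcesToNeighbors()/recvForcesFromNeighbor() return halo forces,
// and mergeForcesOnPatch() + sendForcesToPatch() deliver the merged per-patch
// forces to the ComputePmeCUDA clients.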

class ComputePmeCUDAMgr : public CBase_ComputePmeCUDAMgr {
public:
  ComputePmeCUDAMgr_SDAG_CODE
  ComputePmeCUDAMgr();
  ComputePmeCUDAMgr(CkMigrateMessage *);
  ~ComputePmeCUDAMgr();
  void setupPencils();
  void initialize(CkQdMsg *msg);
  void initialize_pencils(CkQdMsg *msg);
  void activate_pencils(CkQdMsg *msg);
  PmeGrid getPmeGrid() {return pmeGrid;}
  int getNode(int i, int j);
  int getDevice(int i, int j);
  int getDevicePencilY(int i, int j);
  int getDevicePencilZ(int i, int j);
  int getDeviceIDPencilX(int i, int j);
  int getDeviceIDPencilY(int i, int j);
  int getDeviceIDPencilZ(int i, int j);
  void recvPencils(CProxy_CudaPmePencilXYZ xyz);
  void recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z);
  void recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z);

  void createDevicesAndAtomFiler();
  void recvDevices(RecvDeviceMsg* msg);
  void recvAtomFiler(CProxy_PmeAtomFiler filer);
  void skip();
  void recvAtoms(PmeAtomMsg *msg);
  void getHomePencil(PatchID patchID, int& homey, int& homez);
  int getHomeNode(PatchID patchID);

  bool isPmePe(int pe);
  bool isPmeNode(int node);
  bool isPmeDevice(int deviceID);

  static ComputePmeCUDAMgr* Object() {
    CProxy_ComputePmeCUDAMgr mgrProxy(CkpvAccess(BOCclass_group).computePmeCUDAMgr);
    return mgrProxy.ckLocalBranch();
  }
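
  // Typical access pattern from anywhere on the node (illustrative):
  //
  //   ComputePmeCUDAMgr* mgr = ComputePmeCUDAMgr::Object();
  //   if (mgr->isPmeNode(CkMyNode())) {
  //     int node = mgr->getNode(i, j);   // node that owns pencil (i,j)
  //   }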
protected:

private:
  void restrictToMaxPMEPencils();

  // ---------------------------------------------
  // For .ci file
  // Counter for determining numDevicesMax
  int numNodesContributed;
  int numDevicesMax;

  // Number of home patches for each device on this manager
  std::vector<int> numHomePatchesList;

  // Counter for "registerPatchDone"
  int numTotalPatches;
  // ---------------------------------------------

  // PME pencil type: 1=column, 2=slab, 3=box
  int pmePencilType;
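  // Judging from the recvPencils() overloads above and the proxy members below,
  // type 3 (box) solves with a single CudaPmePencilXYZ object, type 2 (slab)
  // splits the FFT into CudaPmePencilXY + CudaPmePencilZ, and type 1 (column)
  // uses the full CudaPmePencilX/Y/Z decomposition.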

  PmeGrid pmeGrid;

  // Number of CUDA devices on this node that are used for PME computation
  int numDevices;

  std::vector<int> xPes;
  std::vector<int> yPes;
  std::vector<int> zPes;

  // List of pencil coordinates (i,j) for each device held by this node
  struct IJ {
    int i, j;
  };
  std::vector<IJ> ijPencilX;
  std::vector<IJ> ijPencilY;
  std::vector<IJ> ijPencilZ;

  struct NodeDevice {
    int node;
    int device;
  };
  std::vector<NodeDevice> nodeDeviceList;

  // Atom filer proxy
  CProxy_PmeAtomFiler pmeAtomFiler;

  // Device proxies
  std::vector<CProxy_ComputePmeCUDADevice> deviceProxy;

  // Extra devices
  struct ExtraDevice {
    int deviceID;
    cudaStream_t stream;
  };
  std::vector<ExtraDevice> extraDevices;

  // Pencil proxies
  CProxy_CudaPmePencilXYZ pmePencilXYZ;
  CProxy_CudaPmePencilXY pmePencilXY;
  CProxy_CudaPmePencilX pmePencilX;
  CProxy_CudaPmePencilY pmePencilY;
  CProxy_CudaPmePencilZ pmePencilZ;

};
#else // NAMD_CUDA
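// Without CUDA the chare class is still declared in ComputePmeCUDAMgr.decl.h
// (included unconditionally above), so an empty stub is kept here, presumably
// so that non-CUDA builds continue to compile against the generated interface.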
class ComputePmeCUDAMgr : public CBase_ComputePmeCUDAMgr {
};
#endif // NAMD_CUDA

#endif // COMPUTEPMECUDAMGR_H