ComputeBondedCUDA.h
#ifndef COMPUTEBONDEDCUDA_H
#define COMPUTEBONDEDCUDA_H
#include "Compute.h"
#include "ComputeMap.h"
#include "CudaNonbondedTables.h"
#include "ComputeHomeTuples.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#ifdef BONDED_CUDA

#include <vector>
#include <array>
#include <list>    // tupleList below is a std::array of std::list<Tuples*>
#include <atomic>  // tupleWorkIndex below is a std::atomic<int>

class ComputeBondedCUDA : public Compute {

public:

  static const int CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES];
  static const int CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES];

private:
  bool initializeCalled;
  SimParameters *params;
  // Device ID and stream
  const int deviceID;
  cudaStream_t stream;
#ifdef NODEGROUP_FORCE_REGISTER
  std::atomic<int> tupleWorkIndex;
#endif

  // Master PE for this compute
  const int masterPe;

  // List of all patch IDs on this object
  std::vector<int> allPatchIDs;

  // List of tuple patches for the entire compute (i.e. across all PEs)
  TuplePatchList tuplePatchList;

  // For every PE, list of patches that it has registered
  std::vector< std::vector<int> > patchIDsPerRank;

  // List of PEs involved in the computation
  std::vector<int> pes;

  // Self compute
  struct SelfCompute {
    int type;
    std::vector<int> patchIDs;
    Tuples* tuples;
    SelfCompute(int type=-1) : type(type), tuples(NULL) {}
    int operator==(const SelfCompute &elem) const {
      return (elem.type == type);
    }
  };

  // Home compute, each PE has one
  struct HomeCompute {
    std::vector<char> isBasePatch;
    std::vector<int> patchIDs;
    // Multiple tuples per PE, each of a different kind
    std::vector< Tuples* > tuples;
  };

  // Computes for each PE
  struct ComputeRecord {
    HomeCompute homeCompute;
    // Self computes, organized by type
    std::vector< SelfCompute > selfComputes;
  };

  // Collection of all computes for each PE
  std::vector< ComputeRecord > computes;
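  // Illustration only (a sketch, not code from this file; indexing of "computes" by
  // PE rank is assumed): SelfCompute::operator== compares just the tuple type, so a
  // registration could locate or create the matching self compute like this:
  //   auto& sc = computes[rank].selfComputes;
  //   auto it = std::find(sc.begin(), sc.end(), SelfCompute(type));
  //   if (it == sc.end()) it = sc.insert(sc.end(), SelfCompute(type));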

  // For every tuple type, list of tuples
  // NOTE: These are pointers to the data recorded in "computes" and
  // are here to make it easier to traverse across all tuples of a certain kind
  std::array< std::list<Tuples*>, Tuples::NUM_TUPLE_TYPES > tupleList;
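  // Example traversal (a sketch; the BOND enumerator is assumed to come from Tuples,
  // which also provides NUM_TUPLE_TYPES used above):
  //   for (Tuples* t : tupleList[Tuples::BOND]) { /* one Tuples object per bond compute */ }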

  int numTuplesPerType[Tuples::NUM_TUPLE_TYPES];

  AtomMap atomMap;
  std::vector< AtomMapper* > atomMappers;

  /*struct PatchRecord {
    int atomStart;
    int numAtoms;
  };*/
  std::vector<PatchRecord> patches;

  // Patch "patchID" is found in patches[patchIndex[patchID]]
  std::vector<int> patchIndex;
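  // For illustration (hypothetical helper, not part of this class), the indirection
  // above would be used as:
  //   PatchRecord& lookupPatch(int patchID) { return patches[patchIndex[patchID]]; }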

  // Maps multiplicity indices
  std::vector<int> dihedralMultMap;
  std::vector<int> improperMultMap;

  // Number of exclusions per rank, separated into modified and non-modified
  struct NumExcl {
    int numModifiedExclusions;
    int numExclusions;
  };
  std::vector<NumExcl> numExclPerRank;

  // Flags that indicate whether this GPU has exclusions and modified exclusions
  bool hasExclusions;
  bool hasModifiedExclusions;

  // All tuple data
  char* tupleData;
  size_t tupleDataSize;

  std::vector<CudaBondStage> bondTupleData;
  std::vector<CudaAngleStage> angleTupleData;
  std::vector<CudaDihedralStage> dihedralTupleData;
  std::vector<CudaDihedralStage> improperTupleData;
  std::vector<CudaExclusionStage> modifiedExclusionTupleData;
  std::vector<CudaExclusionStage> exclusionTupleData;
  std::vector<CudaCrosstermStage> crosstermTupleData;

  // Bonded CUDA kernel
  ComputeBondedCUDAKernel bondedKernel;
#ifdef NODEGROUP_FORCE_REGISTER
  MigrationBondedCUDAKernel migrationKernel;
#endif // NODEGROUP_FORCE_REGISTER

  // Pointer to computeMgr that created this object
  ComputeMgr* computeMgr;

  // Node-wide counter for patches.
  int patchesCounter;

  // Tuple migration data structures
  double3* h_patchMapCenter;
  double3* d_patchMapCenter;

  PatchRecord* d_patchRecord;
  PatchRecord* h_patchRecord;

  // "Force done event" for event polling
  cudaEvent_t forceDoneEvent;

  // Check counter for event polling
  int checkCount;

  // Node lock
  CmiNodeLock lock;
  CmiNodeLock printLock;

  // This variable is set in atomUpdate() by any Pe
  bool atomsChangedIn;
  // This variable is set in doWork() by masterPe
  bool atomsChanged;

  // Reduction
  SubmitReduction *reduction;
  NodeReduction *nodeReduction;

  // Required storage
  int atomStorageSize;

  // Flags pointer
  Flags* flags;

  // Lattice and energy and virial booleans
  Lattice lattice;
  bool doEnergy;
  bool doVirial;
  bool doSlow;
  bool doMolly;

  // Current step, for alchemical route
  int step;

  // Walltime for force compute start
  double beforeForceCompute;

  bool accelMDdoDihe;

  // Atom storage in pinned host memory
  CudaAtom* atoms;
  size_t atomsSize;

  // Force storage in pinned host memory
  FORCE_TYPE* forces;
  size_t forcesSize;
  int forcesSizeDP;

  double* energies_virials;

  CudaAlchFlags hostAlchFlags;
  CudaAlchParameters hostAlchParameters;
  CudaAlchLambdas hostAlchLambdas;
  int pswitchTable[3*3];

  void mapAtoms();
  void unmapAtoms();

  void updatePatches();

  static void forceDoneCheck(void *arg, double walltime);
  void forceDoneSetCallback();
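  // The members forceDoneEvent/checkCount above and this callback pair support
  // CPU-side polling for GPU completion. A sketch of the intended pattern (an
  // assumption about the implementation, not taken from this file):
  //   forceDoneSetCallback() records forceDoneEvent on "stream" and schedules
  //   forceDoneCheck() as a timed callback; forceDoneCheck() calls
  //   cudaEventQuery(forceDoneEvent) and either proceeds to finishing the patches
  //   or re-arms itself, using checkCount to guard against a stalled kernel.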


  // ------------ For copyTupleData -------------------
  struct TupleCopyWork {
    int tupletype;
    int ntuples;
    void* tupleElemList;
    int64_t tupleDataPos;
  };
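  // Each work item describes one homogeneous block of tuples: "ntuples" elements of
  // type "tupletype", read from "tupleElemList" and packed into tupleData starting
  // at byte offset "tupleDataPos". A minimal sketch of how a copy worker might
  // consume the list below (an assumption, not the actual implementation):
  //   for (int i = first; i <= last; i++) {
  //     const TupleCopyWork& w = tupleCopyWorkList[i];
  //     char* dst = tupleData + w.tupleDataPos;
  //     // dispatch on w.tupletype to the matching copy*Data() routine
  //   }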

  std::vector<TupleCopyWork> tupleCopyWorkList;

  int64_t exclusionStartPos;
  int64_t exclusionStartPos2;
  std::vector<CudaBondStage> hostCudaBondStage;

#ifdef NODEGROUP_FORCE_REGISTER
  template <typename T>
  void sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets);
  void sortAndCopyToDevice();
  void migrateTuples(bool startup);

  template <typename T, typename P, typename D>
  void copyTupleToStage(const T& src, const P* __restrict__ p_array, D& dstval);

  template <typename T, typename P, typename D>
  void copyToStage(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, std::vector<D>& dst);

  void copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
    std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2);
#endif

  void copyBondData(const int ntuples, const BondElem* __restrict__ src,
    const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst);

  void copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
    const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst);

  void copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
    const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst);

  template <bool doDihedral, typename T, typename P>
  void copyDihedralData(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, CudaDihedral* __restrict__ dst);

  template <bool doDihedral, typename T, typename P>
  void copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, CudaDihedral* __restrict__ dst);

  void copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
    CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int64_t& pos, int64_t& pos2);

  void copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
    const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst);

  static void tupleCopyWorker(int first, int last, void *result, int paraNum, void *param);
  void tupleCopyWorker(int first, int last);
  static void tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param);
  void tupleCopyWorkerExcl(int first, int last);

#ifdef NODEGROUP_FORCE_REGISTER
  void tupleCopyWorkerType(int tupletype);
#endif
  // --------------------------------------------------

public:

  ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID, CudaNonbondedTables& cudaNonbondedTables);
  ~ComputeBondedCUDA();
  void registerCompute(int pe, int type, PatchIDList& pids);
  void registerSelfCompute(int pe, int type, int pid);
  void unregisterBoxesOnPe();
  void assignPatchesOnPe();
  virtual void patchReady(PatchID, int doneMigration, int seq);
  virtual void initialize();
  virtual void atomUpdate();
  virtual int noWork();
  virtual void doWork();
  void messageEnqueueWork();
  // void updatePatches();
  void openBoxesOnPe(int startup = 1);
  void loadTuplesOnPe(const int startup = 1);
  void copyTupleData();
  void copyTupleDataSN();
  void launchWork();
  void updateCudaAlchParameters();

  void updateHostCudaAlchFlags();
  void updateKernelCudaAlchFlags();
  void updateHostCudaAlchParameters();
  void updateKernelCudaAlchParameters();
  void updateHostCudaAlchLambdas();
  void updateKernelCudaAlchLambdas();

#ifdef NODEGROUP_FORCE_REGISTER
  void updatePatchRecords();
  void updateMaxTupleCounts(TupleCounts counts);
  TupleCounts getMaxTupleCounts();
  void registerPointersToHost();
  void copyHostRegisterToDevice();
  void copyPatchData();
  void copyTupleDataGPU(const int startup);
  void updatePatchOrder(const std::vector<CudaLocalRecord>& data);
#endif // NODEGROUP_FORCE_REGISTER

  void finishPatchesOnPe();
  void finishPatches();
  void finishReductions();

  std::vector<int>& getBondedPes(void) {return pes;}

  std::vector<PatchRecord>& getPatches() { return patches; }
};
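// A rough sketch of the lifecycle implied by the public interface above (an
// interpretation, not documentation from this file): the owning ComputeMgr constructs
// the object for one device, each participating PE calls registerCompute() /
// registerSelfCompute() for its patches, and the master PE runs initialize(). Per step,
// atomUpdate() flags atom changes, doWork() on the master PE gathers tuples and atoms
// (openBoxesOnPe(), loadTuplesOnPe(), copyTupleData()), launchWork() enqueues the bonded
// kernel on "stream", and completion is handled by finishPatchesOnPe() /
// finishPatches() / finishReductions().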

#endif // BONDED_CUDA
#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)
#endif // COMPUTEBONDEDCUDA_H