NAMD
MigrationBondedCUDAKernel.h
#ifndef MIGRATIONBONDEDCUDA_H
#define MIGRATIONBONDEDCUDA_H
#include "CudaUtils.h"
#include "CudaRecord.h"
#include "TupleTypesCUDA.h"
#include "Lattice.h"
#ifdef NODEGROUP_FORCE_REGISTER

class MigrationBondedCUDAKernel {
private:

  //
  // Tuple level structures
  //
  size_t srcTotalAlloc;
  TupleSizes srcAlloc;
  TupleSizes dstAlloc;

  TupleDataStage dataSrc;
  TupleDataStage dataDst;
  TupleDataStagePeer d_peer_data;

  TupleIntArraysContiguous d_device;
  TupleIntArraysContiguous d_downstreamPatchIndex;
  TupleIntArraysContiguous d_dstIndex;

  //
  // Patch level data structures
  //
  int numPatchesHomePad;
  TupleIntArraysContiguous d_counts;
  TupleIntArraysContiguous d_offsets;
  TupleIntArraysPeer d_peer_counts;
  TupleIntArraysPeer d_peer_offsets;

  //
  // Used to copy tuple counts back to device with single copy
  //
  TupleCounts* d_totalCount;
  TupleCounts* h_totalCount;

  //
  // Scratch space used by CUB
  //
  size_t patchDeviceScan_alloc;
  char* d_patchDeviceScan_scratch;

public:
  static constexpr int kNumThreads = 128;
  static constexpr int kPatchNumberPad = 64; // Pad to 256 bytes. Is this overkill?
  static constexpr int kNumTuplesTypes = 7;

  // TODO Tune these
  static constexpr int kPatchThreads = 1024;
  static constexpr int kPatchItemsPerThread = 8;
  static constexpr int kMaxPatchesForSingleThreadBlock = kPatchThreads * kPatchItemsPerThread;

  static constexpr int kScanThreads = 512;
  static constexpr int kScanItemsPerThread = 4;

  MigrationBondedCUDAKernel();
  ~MigrationBondedCUDAKernel();
  void setup(const int numDevices, const int numPatchesHome);

  TupleDataStage getDstBuffers() { return dataDst; }
  TupleIntArraysContiguous getDeviceTupleCounts() { return d_counts; }
  TupleIntArraysContiguous getDeviceTupleOffsets() { return d_offsets; }

  // Compute, for each tuple, the downstream home patch and device it migrates to
  void computeTupleDestination(
    const int myDeviceIndex,
    TupleCounts count,
    const int numPatchesHome,
    const int4* migrationDestination,
    const int* patchIDtoGPU,
    const int* patchIDtoHomePatchIndex,
    const int aDim,
    const int bDim,
    const int cMaxIndex,
    const int bMaxIndex,
    const int aMaxIndex,
    cudaStream_t stream
  );

  void reserveTupleDestination(
    const int myDeviceIndex,
    const int numPatchesHome,
    cudaStream_t stream
  );

  // Scan per-patch tuple counts into per-patch destination offsets
  void computePatchOffsets(
    const int numPatchesHome,
    cudaStream_t stream
  );

  // Move tuple data from the source staging buffers to their destination buffers
  void performTupleMigration(
    TupleCounts count,
    cudaStream_t stream
  );

  // Update the migrated tuples against the new patch records and atom positions
  void updateTuples(
    TupleCounts count,
    TupleData data,
    const int* ids,
    const PatchRecord* patches,
    const double3* d_patchMapCenter,
    const float4* xyzq,
    const Lattice lattice,
    cudaStream_t stream
  );

  // Copy host-staged tuple data, counts, and offsets to the device
  void copyTupleToDevice(
    TupleCounts count,
    const int numPatchesHome,
    TupleDataStage h_dataStage,
    TupleIntArrays h_counts,
    TupleIntArrays h_offsets,
    cudaStream_t stream
  );

  bool reallocateBufferDst(TupleCounts counts);
  bool reallocateBufferSrc(TupleCounts counts);

  TupleCounts fetchTupleCounts(const int numPatchesHome, cudaStream_t stream);
  void clearTupleCounts(const int numPatchesHome, cudaStream_t stream);

  // Copy the peer devices' staging buffer pointers, counts, and offsets to this device
  void copyPeerDataToDevice(
    TupleDataStagePeer h_peer_data,
    TupleIntArraysPeer h_peer_counts,
    TupleIntArraysPeer h_peer_offsets,
    const int numDevices,
    cudaStream_t stream
  );

};

#endif // NODEGROUP_FORCE_REGISTER
#endif // MIGRATIONBONDEDCUDA_H
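The header declares the API but carries no usage notes, so below is a minimal sketch of how the migration entry points might be driven from host code on a single device. The call ordering is inferred from the method names and the class layout (counts cleared, destinations binned, offsets scanned, tuples moved, then updated); the MigrationInputs aggregate and the migrateBondedTuples driver are illustrative stand-ins rather than NAMD code, and the multi-GPU entry points (copyPeerDataToDevice, reserveTupleDestination) are omitted.

#include "MigrationBondedCUDAKernel.h"

// Illustrative container for the device-side inputs the kernel consumes.
// Not a NAMD type; it only bundles arguments so the sketch is self-contained.
struct MigrationInputs {
  int myDeviceIndex;
  int numPatchesHome;
  const int4* migrationDestination;   // per-atom migration destinations
  const int* patchIDtoGPU;
  const int* patchIDtoHomePatchIndex;
  int aDim, bDim;
  int aMaxIndex, bMaxIndex, cMaxIndex;
  TupleData tupleData;
  const int* ids;
  const PatchRecord* patches;
  const double3* d_patchMapCenter;
  const float4* xyzq;
};

// Hypothetical single-device migration step; the sequence below is an assumption
// based on the declared interface, not the actual NAMD driver code.
void migrateBondedTuples(MigrationBondedCUDAKernel& kernel,
                         const MigrationInputs& in,
                         TupleCounts srcCount,
                         const Lattice& lattice,
                         cudaStream_t stream) {
  // Reset per-patch tuple counters before binning tuples into their new patches.
  kernel.clearTupleCounts(in.numPatchesHome, stream);

  // Decide, for every tuple, which home patch and device it moves to.
  kernel.computeTupleDestination(
      in.myDeviceIndex, srcCount, in.numPatchesHome,
      in.migrationDestination, in.patchIDtoGPU, in.patchIDtoHomePatchIndex,
      in.aDim, in.bDim, in.cMaxIndex, in.bMaxIndex, in.aMaxIndex,
      stream);

  // Turn the per-patch counts into per-patch offsets (the class keeps CUB scratch for scans).
  kernel.computePatchOffsets(in.numPatchesHome, stream);

  // Move tuple data from the source staging buffers into the destination buffers.
  kernel.performTupleMigration(srcCount, stream);

  // Bring the new per-type totals back to the host and regrow the source
  // staging buffers if the new totals no longer fit.
  TupleCounts newCount = kernel.fetchTupleCounts(in.numPatchesHome, stream);
  kernel.reallocateBufferSrc(newCount);

  // Rewrite the migrated tuples against the new patch layout and coordinates.
  kernel.updateTuples(newCount, in.tupleData, in.ids, in.patches,
                      in.d_patchMapCenter, in.xyzq, lattice, stream);
}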