NAMD
CudaTileListKernel.h
Go to the documentation of this file.
1 #ifndef CUDATILELISTKERNEL_H
2 #define CUDATILELISTKERNEL_H
3 #ifdef NAMD_CUDA
4 
5 // Exclusion mask: bit 1 = atom pair is included, 0 = atom pair is excluded
6 struct TileExcl {
7  unsigned int excl[32];
8 };
9 
10 struct TileList {
13  int jtileEnd;
14  float3 offsetXYZ;
15  int2 patchInd; // Patch indices for this list
16  union {
17  int2 patchNumList; // Number of lists contributing to each patch
18  // int icompute;
19  };
20  int icompute;
21 };
22 
24  int iatomSize;
26  int jatomSize;
28 };
29 
30 //
31 // Bounding box structure
32 //
33 struct BoundingBox {
34  float x, y, z; // Center
35  float wx, wy, wz; // Half-width
36 };
37 
38 //
39 // Stripped-down CUDA version of compute record
40 //
42  int2 patchInd;
43  float3 offsetXYZ;
44 };
45 
46 //
47 // Stripped-down CUDA version of patch record
48 //
50  int numAtoms;
52  int atomStart;
53 };
54 
55 //
56 // Tile list status. Used to communicate tile list sizes between GPU and CPU
57 //
58 struct TileListStat {
61  int numJtiles;
66 };
67 
69  float shx, shy, shz;
70  float forcex, forcey, forcez;
72  double energyVdw;
73  double energyElec;
74  double energySlow;
75  double energyGBIS;
76 };
77 
78 struct VirialEnergy {
79  double virial[9];
80  double virialSlow[9];
81  double energyVdw;
82  double energyElec;
83  double energySlow;
84  double energyGBIS;
85 };
86 
88 private:
89 
90  template <typename T>
91  struct PtrSize {
92  PtrSize(T* ptr, int size) : ptr(ptr), size(size) {}
93  T* ptr;
94  int size;
95  };
96 
97  const int deviceID;
98 
99  // Events
100  cudaEvent_t tileListStatEvent;
101  bool tileListStatEventRecord;
102 
103  // Pair list cutoff squared
104  float plcutoff2;
105 
106  // Number of patches
107  int numPatches;
108 
109  // Number of computes
110  int numComputes;
111 
112  // Number of tile lists
113  int numTileLists;
114 
115  // Number of tile lists for GBIS
116  int numTileListsGBIS;
117 
118  // Number of tiles
119  int numJtiles;
120 
121  // Maximum number of tiles per tile list
122  int maxTileListLen;
123 
124  CudaPatchRecord* cudaPatches;
125  int cudaPatchesSize;
126 
127  CudaComputeRecord* cudaComputes;
128  int cudaComputesSize;
129 
130  // --- For Streaming ---
131  const bool doStreaming;
132  int* patchNumLists;
133  int patchNumListsSize;
134 
135  int* emptyPatches;
136  int emptyPatchesSize;
137  int* h_emptyPatches;
138  int h_emptyPatchesSize;
139  int numEmptyPatches;
140 
141  unsigned int* sortKeySrc;
142  int sortKeySrcSize;
143  unsigned int* sortKeyDst;
144  int sortKeyDstSize;
145 
146  int maxTileListLen_sortKeys;
147 
148  unsigned int* sortKeys;
149  int sortKeysSize;
150 
151  int2* minmaxListLen;
152  int minmaxListLenSize;
153 
154  int sortKeys_endbit;
155  // ---------------------
156 
157  // Single entry pinned host and device buffers for communicating tile list status
158  TileListStat* h_tileListStat;
159  TileListStat* d_tileListStat;
160 
161  // Atom coordinates and charge
162  float4* xyzq;
163  int xyzqSize;
164  // Atom coordinate storage size
165  int atomStorageSize;
166 
167  // Tile lists
168  TileList* tileLists1;
169  int tileLists1Size;
170  TileList* tileLists2;
171  int tileLists2Size;
172  TileList* tileListsGBIS;
173  int tileListsGBISSize;
174 
175  // Pair pairs
176  PatchPairRecord* patchPairs1;
177  int patchPairs1Size;
178  PatchPairRecord* patchPairs2;
179  int patchPairs2Size;
180 
181  // j-atom start for tiles
182  int* tileJatomStart1;
183  int tileJatomStart1Size;
184  int* tileJatomStart2;
185  int tileJatomStart2Size;
186  int* tileJatomStartGBIS;
187  int tileJatomStartGBISSize;
188 
189  // Bounding boxes
190  BoundingBox* boundingBoxes;
191  int boundingBoxesSize;
192 
193  // Depth of each tile list
194  unsigned int* tileListDepth1;
195  int tileListDepth1Size;
196  unsigned int* tileListDepth2;
197  int tileListDepth2Size;
198 
199  // Tile list order
200  int* tileListOrder1;
201  int tileListOrder1Size;
202  int* tileListOrder2;
203  int tileListOrder2Size;
204 
205  // Position of each tile list = ExclusiveSum(tileListDepths)
206  int* tileListPos;
207  int tileListPosSize;
208 
209  // jtile occupancy and position
210  int* jtiles;
211  int jtilesSize;
212 
213  // Temporary buffers used in buildTileLists
214  int* tilePos;
215  int tilePosSize;
216 
217  // Exclusions
218  TileExcl* tileExcls1;
219  int tileExcls1Size;
220  TileExcl* tileExcls2;
221  int tileExcls2Size;
222 
223  // Temporary storage for CUB
224  char* tempStorage;
225  int tempStorageSize;
226 
227  // Number of exclusions detected
228  int numExcluded;
229 
230  // Virials and energies for tile lists
231  TileListVirialEnergy* tileListVirialEnergy;
232  int tileListVirialEnergySize;
233 
234  int tileListVirialEnergyLength;
235  int tileListVirialEnergyGBISLength;
236 
237  int activeBuffer;
238 
239  void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}
240 
241  void sortTileLists(
242  const bool useJtiles,
243  const int begin_bit, const bool highDepthBitsSet,
244  // Source
245  const int numTileListsSrc, const int numJtilesSrc,
246  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
247  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
248  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
249  // Destination
250  const int numTileListsDst, const int numJtilesDst,
251  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
252  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
253  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
254  cudaStream_t stream);
255 
256  void writeTileList(const char* filename, const int numTileLists,
257  const TileList* d_tileLists, cudaStream_t stream);
258  void writeTileJatomStart(const char* filename, const int numJtiles,
259  const int* d_tileJatomStart, cudaStream_t stream);
260  // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
261  // const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);
262 
263  int* outputOrder;
264  int outputOrderSize;
265  bool doOutputOrder;
266 
267 public:
268 
269  CudaTileListKernel(int deviceID, bool doStreaming);
271 
272  int getNumEmptyPatches() {return numEmptyPatches;}
273  int* getEmptyPatches() {return h_emptyPatches;}
274 
275  int getNumExcluded() {return numExcluded;}
276 
277  float get_plcutoff2() {return plcutoff2;}
278  int getNumTileLists() {return numTileLists;}
279  int getNumTileListsGBIS() {return numTileListsGBIS;}
280  int getNumJtiles() {return numJtiles;}
281  BoundingBox* getBoundingBoxes() {return boundingBoxes;}
282  int* getJtiles() {return jtiles;}
283  float4* get_xyzq() {return xyzq;}
284 
285  TileListStat* getTileListStatDevPtr() {return d_tileListStat;}
286  void clearTileListStat(cudaStream_t stream);
287 
288  int* getTileJatomStart() {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
290  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
291  }
292  unsigned int* getTileListDepth() {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
293  int* getTileListOrder() {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
294  TileExcl* getTileExcls() {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
295  PatchPairRecord* getPatchPairs() {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
296 
297  int* getTileJatomStartGBIS() {return tileJatomStartGBIS;}
298  TileList* getTileListsGBIS() {return tileListsGBIS;}
299 
300  TileListVirialEnergy* getTileListVirialEnergy() {return tileListVirialEnergy;}
301 
302  CudaPatchRecord* getCudaPatches() {return cudaPatches;}
303 
304  void prepareTileList(cudaStream_t stream);
305  void finishTileList(cudaStream_t stream);
306 
307  void updateComputes(const int numComputesIn,
308  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);
309 
310  void buildTileLists(const int numTileListsPrev,
311  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
312  const float3 lata, const float3 latb, const float3 latc,
313  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
314  const size_t maxShmemPerBlock, cudaStream_t stream);
315 
316  void reSortTileLists(const bool doGBIS, cudaStream_t stream);
317  // void applyOutputOrder(cudaStream_t stream);
318 
319  void setTileListVirialEnergyLength(int len);
320  void setTileListVirialEnergyGBISLength(int len);
321  int getTileListVirialEnergyLength() {return tileListVirialEnergyLength;}
322  int getTileListVirialEnergyGBISLength() {return tileListVirialEnergyGBISLength;}
323 
324  int getNumPatches() {return numPatches;}
325 
326  int getNumComputes() {return numComputes;}
327  int* getOutputOrder() {
328  if (!doStreaming) return NULL;
329  if (doOutputOrder) {
330  return outputOrder;
331  } else {
332  return NULL;
333  }
334  }
335 
336 };
337 #endif // NAMD_CUDA
338 #endif // CUDATILELISTKERNEL_H
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
void setTileListVirialEnergyLength(int len)
numTileListsSrc
const int const int begin_bit
PatchPairRecord * getPatchPairs()
void clearTileListStat(cudaStream_t stream)
void setTileListVirialEnergyGBISLength(int len)
CudaPatchRecord * getCudaPatches()
unsigned int * getTileListDepth()
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ tileListOrderSrc
BoundingBox * getBoundingBoxes()
__thread cudaStream_t stream
unsigned int excl[32]
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
TileList * getTileListsGBIS()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 lata
TileListStat * getTileListStatDevPtr()
const int numTileListsDst
void finishTileList(cudaStream_t stream)
float3 offsetXYZ
double virialSlow[9]
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ tileListDepthDst
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 latc
TileListVirialEnergy * getTileListVirialEnergy()
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)
const int const int const int const keyT keyT *__restrict__ tileListDepthSrc
void reSortTileLists(const bool doGBIS, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 latb