NAMD
CudaPmeSolverUtil.h
#ifndef CUDAPMESOLVERUTIL_H
#define CUDAPMESOLVERUTIL_H
#include <stdio.h>
#ifdef NAMD_CUDA
#include <cuda.h>
#include <cufft.h>
#endif // NAMD_CUDA
#include "PmeSolverUtil.h"
#include "CudaUtils.h"
#include "CudaPmeSolverUtilKernel.h"

#ifdef NAMD_CUDA
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream);
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename);
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream);

#define cufftCheck(stmt) do { \
  cufftResult err = stmt; \
  if (err != CUFFT_SUCCESS) { \
    char msg[128]; \
    sprintf(msg, "%s in file %s, function %s\n", #stmt, __FILE__, __FUNCTION__); \
    cudaDie(msg); \
  } \
} while(0)
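
//
// Usage sketch (illustrative addition, not part of the original header):
// any cuFFT API call can be wrapped in cufftCheck so that a failure aborts
// through cudaDie() with the offending statement, file, and function in the
// message. The transform size here is an arbitrary example value.
//
static inline void cufftCheckExample() {
  cufftHandle plan;
  // Plan a 64-point single-precision complex-to-complex 1D transform.
  cufftCheck(cufftPlan1d(&plan, 64, CUFFT_C2C, 1));
  cufftCheck(cufftDestroy(plan));
}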

//
// CUDA implementation of FFTCompute
//
class CudaFFTCompute : public FFTCompute {
private:
  cufftHandle forwardPlan, backwardPlan;
  cufftType_t forwardType, backwardType;
  int deviceID;
  cudaStream_t stream;
  void setStream();

private:
  float* allocateData(const int dataSizeRequired);
  void plan3D(int *n, int flags);
  void plan2D(int *n, int howmany, int flags);
  void plan1DX(int *n, int howmany, int flags);
  void plan1DY(int *n, int howmany, int flags);
  void plan1DZ(int *n, int howmany, int flags);
  // int ncall, plantype;

public:
  CudaFFTCompute(int deviceID, cudaStream_t stream) : deviceID(deviceID), stream(stream) {}
  ~CudaFFTCompute();
  void forward();
  void backward();
};
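
//
// Usage sketch (illustrative addition, not part of the original header).
// A CudaFFTCompute is bound to one device/stream pair at construction; the
// inherited FFTCompute interface sets up the grid data and cuFFT plans,
// after which forward()/backward() launch the transforms on that stream.
// The device ID below is an assumed example value.
//
static inline void exampleFFTRoundTrip(cudaStream_t stream) {
  CudaFFTCompute fft(0 /* deviceID, assumed */, stream);
  // ... grid/data setup through the inherited FFTCompute interface ...
  fft.forward();   // forward FFT, launched on the stream
  fft.backward();  // inverse FFT back to the original space
}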

//
// CUDA implementation of PmeKSpaceCompute
//
class CudaPmePencilXYZ;
class CudaPmePencilZ;

class CudaPmeKSpaceCompute : public PmeKSpaceCompute {
private:
  int deviceID;
  cudaStream_t stream;
  // Device memory versions of (bm1, bm2, bm3)
  float *d_bm1, *d_bm2, *d_bm3;
  //float *prefac_x, *prefac_y, *prefac_z;
  struct EnergyVirial {
    double energy;
    double virial[9];
  };
  EnergyVirial* d_energyVirial;
  EnergyVirial* h_energyVirial;
  cudaEvent_t copyEnergyVirialEvent;
  bool ortho;
  // Check counter for event polling in energyAndVirialCheck()
  int checkCount;
  static void energyAndVirialCheck(void *arg, double walltime);
  CudaPmePencilXYZ* pencilXYZPtr;
  CudaPmePencilZ* pencilZPtr;
public:
  CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream);
  ~CudaPmeKSpaceCompute();
  void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data);
  // void waitEnergyAndVirial();
  double getEnergy();
  void getVirial(double *virial);
  void energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr);
  void energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr);
};
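
//
// Usage sketch (illustrative addition, not part of the original header):
// one k-space solve followed by an energy/virial readback. The permutation
// constant comes from PmeSolverUtil.h; kappa, the block indices, and the
// device ID are assumed example values. In the real driver the callback
// mechanism (energyAndVirialSetCallback/energyAndVirialCheck) guarantees
// the asynchronous copy tracked by copyEnergyVirialEvent has completed
// before the getters are called.
//
static inline void exampleKSpaceSolve(PmeGrid pmeGrid, int permutation,
  Lattice &lattice, float *fftData, cudaStream_t stream) {
  CudaPmeKSpaceCompute kspace(pmeGrid, permutation, 0 /* jblock */,
    0 /* kblock */, 0.32 /* kappa, assumed */, 0 /* deviceID */, stream);
  kspace.solve(lattice, true /* doEnergy */, true /* doVirial */, fftData);
  // Read back reciprocal-space energy and virial (after event completion).
  double virial[9];
  double energy = kspace.getEnergy();
  kspace.getVirial(virial);
  printf("PME reciprocal energy: %g\n", energy);
}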

//
// CUDA implementation of PmeRealSpaceCompute
//

class ComputePmeCUDADevice;

class CudaPmeRealSpaceCompute : public PmeRealSpaceCompute {
private:
  bool gridTexObjActive;
  cudaTextureObject_t gridTexObj;
  int tex_data_len;
  float* tex_data;
  int deviceID;
  cudaStream_t stream;
  void setupGridTexture(float* data, int data_len);
  // Device memory for atoms
  int d_atomsCapacity;
  CudaAtom* d_atoms;
  // Device memory for patches
  // int d_patchesCapacity;
  // PatchInfo* d_patches;
  // Device memory for forces
  int d_forceCapacity;
  CudaForce* d_force;
  // // Device memory for self energy
  // double* d_selfEnergy;
  // Events
  cudaEvent_t gatherForceEvent;
  // Check counter for event polling
  int checkCount;
  // Store device pointer for event polling
  ComputePmeCUDADevice* devicePtr;
  static void cuda_gatherforce_check(void *arg, double walltime);
public:
  CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock,
    int deviceID, cudaStream_t stream);
  ~CudaPmeRealSpaceCompute();
  void copyAtoms(const int numAtoms, const CudaAtom* atoms);
  void spreadCharge(Lattice &lattice);
  void gatherForce(Lattice &lattice, CudaForce* force);
  void gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in);
  void waitGatherForceDone();
};
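
//
// Usage sketch (illustrative addition, not part of the original header):
// the real-space half of a PME step, i.e. charge spreading before the FFT
// and force gathering after the inverse FFT. The atom/force buffers, block
// indices, and device ID are assumed to be prepared by the caller.
//
static inline void exampleRealSpaceStep(PmeGrid pmeGrid, Lattice &lattice,
  int numAtoms, const CudaAtom *atoms, CudaForce *forces, cudaStream_t stream) {
  CudaPmeRealSpaceCompute rs(pmeGrid, 0 /* jblock */, 0 /* kblock */,
    0 /* deviceID */, stream);
  rs.copyAtoms(numAtoms, atoms);   // upload atoms to device memory
  rs.spreadCharge(lattice);        // spread charges onto the PME grid
  // ... forward FFT, k-space solve, and inverse FFT happen in between ...
  rs.gatherForce(lattice, forces); // interpolate forces back onto atoms
  rs.waitGatherForceDone();        // block until gatherForceEvent completes
}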

//
// CUDA implementation of PmeTranspose
//
class CudaPmeTranspose : public PmeTranspose {
private:
  int deviceID;
  cudaStream_t stream;
  float2* d_data;
#ifndef P2P_ENABLE_3D
  float2* d_buffer;
#endif
  // List of device data pointers for transpose destinations on:
  // (a) this device on a different pencil (e.g. in XYZ->YZX transpose, on Y-pencil)
  // (b) different device on a different pencil
  // If NULL, use the local d_data buffer
  std::vector<float2*> dataPtrsYZX;
  std::vector<float2*> dataPtrsZXY;

  // Batch data
  int max_nx_YZX[3];
  TransposeBatch<float2> *batchesYZX;
  int max_nx_ZXY[3];
  TransposeBatch<float2> *batchesZXY;

  void copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out);
public:
  CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream);
  ~CudaPmeTranspose();
  void setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data);
  void setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data);
  void transposeXYZtoYZX(const float2* data);
  void transposeXYZtoZXY(const float2* data);
  // void waitTransposeDone();
  void waitStreamSynchronize();
  void copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize);
  void copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out);
#ifndef P2P_ENABLE_3D
  void copyDataDeviceToDevice(const int iblock, float2* data_out);
  float2* getBuffer(const int iblock);
#endif
  void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
  void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
};
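
//
// Usage sketch (illustrative addition, not part of the original header):
// transposing pencil data from XYZ to YZX order. The permutation constant
// (from PmeSolverUtil.h), block indices, and device ID are assumed example
// values. setDataPtrsYZX() registers the destination buffers, which may
// live on this device or on a peer device; NULL entries fall back to the
// local d_data buffer.
//
static inline void exampleTransposeXYZtoYZX(PmeGrid pmeGrid, int permutation,
  std::vector<float2*> &dstPtrs, float2 *data, cudaStream_t stream) {
  CudaPmeTranspose tr(pmeGrid, permutation, 0 /* jblock */, 0 /* kblock */,
    0 /* deviceID */, stream);
  tr.setDataPtrsYZX(dstPtrs, data);  // register transpose destinations
  tr.transposeXYZtoYZX(data);        // launch batched transpose on the stream
  tr.waitStreamSynchronize();        // wait for the transpose to finish
}
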
#endif // NAMD_CUDA
#endif // CUDAPMESOLVERUTIL_H