NAMD
CudaUtils.h
#ifndef CUDAUTILS_H
#define CUDAUTILS_H

#ifdef NAMD_CUDA
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define WARPSIZE 32

#define WARP_FULL_MASK 0xffffffff

#if (__CUDACC_VER_MAJOR__ >= 9)
#define NAMD_USE_COOPERATIVE_GROUPS
#endif

#ifdef NAMD_USE_COOPERATIVE_GROUPS
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor_sync(MASK, VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl_sync(MASK, VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P) __all_sync(MASK, P)
  #define WARP_ANY(MASK, P) __any_sync(MASK, P)
  #define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
  #define WARP_SYNC(MASK) __syncwarp(MASK)
  #define BLOCK_SYNC __barrier_sync(0)
#else
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor(VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl(VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P) __all(P)
  #define WARP_ANY(MASK, P) __any(P)
  #define WARP_BALLOT(MASK, P) __ballot(P)
  #define WARP_SYNC(MASK)
  #define BLOCK_SYNC __syncthreads()
#endif
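//
// Illustrative sketch (not part of the original header): a warp-wide sum
// reduction built from the shuffle wrappers above. The helper name
// warp_reduce_sum is hypothetical; it assumes all 32 lanes of the warp are
// active and participate under WARP_FULL_MASK.
//
/*
__device__ __forceinline__ float warp_reduce_sum(float val) {
  // Fold the upper half of the active range onto the lower half, halving the
  // shuffle distance each step, until lane 0 holds the sum of all 32 lanes.
  for (int offset = WARPSIZE/2; offset > 0; offset /= 2) {
    val += WARP_SHUFFLE_DOWN(WARP_FULL_MASK, val, offset, WARPSIZE);
  }
  return val;
}
*/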

/*
// Define float3 + float3 operation
__host__ __device__ inline float3 operator+(const float3 a, const float3 b) {
  float3 c;
  c.x = a.x + b.x;
  c.y = a.y + b.y;
  c.z = a.z + b.z;
  return c;
}
*/

//
// Cuda static assert, copied from Facebook FFT sources. Remove once nvcc has c++11
//
template <bool>
struct CudaStaticAssert;

template <>
struct CudaStaticAssert<true> {
};

#define cuda_static_assert(expr) \
  (CudaStaticAssert<(expr) != 0>())
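//
// Illustrative sketch (not part of the original header): a compile-time size
// check with cuda_static_assert. CudaStaticAssert<false> is only declared, so
// a false condition fails to compile. The helper name is hypothetical.
//
/*
template <class T>
__device__ void store_as_int2(T val, int2 *dst) {
  cuda_static_assert(sizeof(T) == sizeof(int2));  // rejects types of the wrong size
  *dst = *reinterpret_cast<int2 *>(&val);
}
*/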

void cudaDie(const char *msg, cudaError_t err=cudaSuccess);

void cudaNAMD_bug(const char *msg);

//
// Error checking wrapper for CUDA
//
#define cudaCheck(stmt) do { \
  cudaError_t err = stmt; \
  if (err != cudaSuccess) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
    cudaDie(msg, err); \
  } \
} while(0)
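//
// Illustrative sketch (not part of the original header): wrapping CUDA runtime
// calls in cudaCheck so a failure reports the statement, file, function and
// line through cudaDie. The buffer size is an arbitrary example value.
//
/*
float *d_buf;
cudaCheck(cudaMalloc(&d_buf, 1024*sizeof(float)));
cudaCheck(cudaMemsetAsync(d_buf, 0, 1024*sizeof(float)));
cudaCheck(cudaFree(d_buf));
*/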

#ifdef __CUDACC__
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// native implementation available
#else
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
//
// Double precision atomicAdd, copied from CUDA_C_Programming_Guide.pdf (ver 5.0)
//
static __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
#endif
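//
// Illustrative sketch (not part of the original header): accumulating a
// per-thread double into one global total. On sm_60+ with CUDA >= 8 the
// hardware atomicAdd(double*, double) is used; otherwise the CAS loop above
// retries until no other thread has modified the word in between. Kernel and
// variable names are hypothetical.
//
/*
__global__ void sum_energy(const double *energy, int n, double *total) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(total, energy[i]);
}
*/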

void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT);
void clear_device_array_T(void *data, const int ndata, const size_t sizeofT);

template <class T>
void clear_device_array(T *data, const int ndata, cudaStream_t stream=0) {
  clear_device_array_async_T(data, ndata, stream, sizeof(T));
}

template <class T>
void clear_device_array_sync(T *data, const int ndata) {
  clear_device_array_T(data, ndata, sizeof(T));
}

void allocate_host_T(void **pp, const int len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate page-locked host memory
// pp = memory pointer
// len = length of the array
//
template <class T>
void allocate_host(T **pp, const int len) {
  allocate_host_T((void **)pp, len, sizeof(T));
}


void allocate_device_T(void **pp, const int len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate gpu memory
// pp = memory pointer
// len = length of the array
//
template <class T>
void allocate_device(T **pp, const int len) {
  allocate_device_T((void **)pp, len, sizeof(T));
}


void deallocate_device_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate gpu memory
// pp = memory pointer
//
template <class T>
void deallocate_device(T **pp) {
  deallocate_device_T((void **)pp);
}
//----------------------------------------------------------------------------------------

bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate device memory
// pp = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
//
// returns true if reallocation happened
//
template <class T>
bool reallocate_device(T **pp, int *curlen, const int newlen, const float fac=1.0f) {
  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
}
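//
// Illustrative sketch (not part of the original header): growing a device
// buffer only when the required length exceeds the current capacity. With
// fac=1.5f a re-allocation reserves 50% head room, so repeated small growth
// steps do not each trigger a free/malloc. Variable names are hypothetical,
// and previous buffer contents should be assumed lost after a reallocation.
//
/*
float *d_forces = NULL;
int d_forces_len = 0;
// before each step, make sure the buffer can hold natoms entries
if (reallocate_device<float>(&d_forces, &d_forces_len, natoms, 1.5f)) {
  // buffer was (re)allocated; refill it before use
}
*/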
//----------------------------------------------------------------------------------------
bool reallocate_host_T(void **pp, int *curlen, const int newlen, const float fac,
                       const unsigned int flag, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate pinned host memory
// pp = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
// flag = allocation type:
//        cudaHostAllocDefault = default type, emulates cudaMallocHost
//        cudaHostAllocMapped  = maps allocation into CUDA address space
//
// returns true if reallocation happened
//
template <class T>
bool reallocate_host(T **pp, int *curlen, const int newlen,
                     const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) {
  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
}

void deallocate_host_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate page-locked host memory
// pp = memory pointer
//
template <class T>
void deallocate_host(T **pp) {
  deallocate_host_T((void **)pp);
}
//----------------------------------------------------------------------------------------

void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
                 const size_t sizeofT);
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT);

void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT);

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device
//
template <class T>
void copy_HtoD(const T *h_array, T *d_array, int array_len, cudaStream_t stream=0) {
  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device using synchronous calls
//
template <class T>
void copy_HtoD_sync(const T *h_array, T *d_array, int array_len) {
  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host
//
template <class T>
void copy_DtoH(const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0) {
  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host using synchronous calls
//
template <class T>
void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len) {
  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
}
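//
// Illustrative sketch (not part of the original header): the usual
// allocate / copy in / compute / copy back pattern built from the wrappers
// above. The element count and stream are hypothetical; copy_HtoD enqueues an
// asynchronous copy on the given stream, while copy_DtoH_sync blocks until the
// data is back on the host.
//
/*
float *h_coords, *d_coords;
allocate_host<float>(&h_coords, 3*natoms);    // page-locked host buffer
allocate_device<float>(&d_coords, 3*natoms);  // device buffer
copy_HtoD<float>(h_coords, d_coords, 3*natoms, stream);
// ... launch kernels on stream ...
copy_DtoH_sync<float>(d_coords, h_coords, 3*natoms);
deallocate_device<float>(&d_coords);
deallocate_host<float>(&h_coords);
*/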
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device
//
template <class T>
void copy_DtoD(const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) {
  copy_DtoD_async_T(d_src, d_dst, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device using synchronous calls
//
template <class T>
void copy_DtoD_sync(const T *d_src, T *d_dst, const int array_len) {
  copy_DtoD_T(d_src, d_dst, array_len, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory between two peer devices Device -> Device
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
                           const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
                           const size_t sizeofT);

template <class T>
void copy_PeerDtoD(const int src_dev, const int dst_dev,
                   const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) {
  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Host -> Device
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_HtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}
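//
// Illustrative sketch (not part of the original header): copying a
// width x height x depth sub-block out of a larger host grid into the origin
// of a device grid. (x0,y0,z0) are offsets into each array and xsize/ysize
// describe each array's allocated x and y extents; this sketch assumes they
// are element counts, and the grid dimensions used here are hypothetical.
//
/*
// host grid is 64 x 64 x 64 elements, device grid is 32 x 32 x 32 elements
copy3D_HtoD<float>(h_grid, d_grid,
                   16, 16, 16,   // source offset inside the host grid
                   64, 64,       // host grid x/y extents
                   0, 0, 0,      // destination offset
                   32, 32,       // device grid x/y extents
                   32, 32, 32,   // width, height, depth of the copied block
                   stream);
*/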

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Host
//
void copy3D_DtoH_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_DtoH_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Device
//
void copy3D_DtoD_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_DtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block between two peer devices Device -> Device
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
                       void* src_data, void* dst_data,
                       int src_x0, int src_y0, int src_z0,
                       size_t src_xsize, size_t src_ysize,
                       int dst_x0, int dst_y0, int dst_z0,
                       size_t dst_xsize, size_t dst_ysize,
                       size_t width, size_t height, size_t depth,
                       size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
                     T* src_data, T* dst_data,
                     int src_x0, int src_y0, int src_z0,
                     size_t src_xsize, size_t src_ysize,
                     int dst_x0, int dst_y0, int dst_z0,
                     size_t dst_xsize, size_t dst_ysize,
                     size_t width, size_t height, size_t depth,
                     cudaStream_t stream=0) {
  copy3D_PeerDtoD_T(src_dev, dst_dev,
                    src_data, dst_data,
                    src_x0, src_y0, src_z0,
                    src_xsize, src_ysize,
                    dst_x0, dst_y0, dst_z0,
                    dst_xsize, dst_ysize,
                    width, height, depth,
                    sizeof(T), stream);
}

#endif // NAMD_CUDA

#endif // CUDAUTILS_H