NAMD
CudaPmeSolverUtil.C
#include <stdio.h>
#include <algorithm>
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#include "ComputeNonbondedUtil.h"
#include "ComputePmeCUDAMgr.h"
#include "CudaPmeSolver.h"
#include "CudaPmeSolverUtil.h"

#ifdef NAMD_CUDA
extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++

void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}

void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}

void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}
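
// The three helpers above are ad-hoc debugging aids: each synchronizes the
// stream and then dumps grid contents as plain text, one value per line.
// A usage sketch (hypothetical call site; "myStream" and the sizes shown are
// assumptions, not part of this file):
#if 0
  // Dump the real-space charge grid after spreading:
  writeRealToDisk(data, xsize*ysize*zsize, "grid_real.txt", myStream);
  // Dump the transformed grid: an R2C transform of K1*K2*K3 reals yields
  // (K1/2+1)*K2*K3 complex values.
  writeComplexToDisk((float2 *)dataDst, (pmeGrid.K1/2+1)*pmeGrid.K2*pmeGrid.K3,
    "grid_complex.txt", myStream);
#endif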

void CudaFFTCompute::plan3D(int *n, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
  setStream();
  // plantype = 3;
}

void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
  setStream();
  // plantype = 2;
}

void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();
  // plantype = 1;
}
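
// Note on dimension ordering: cuFFT expects sizes from the slowest- to the
// fastest-varying dimension, while n[] here is ordered x (fastest) to z
// (slowest); hence the reversed n[2], n[1], n[0] in the plan calls above.
// A minimal standalone sketch of the same plan creation (sizes illustrative):
#if 0
  int n[3] = {64, 64, 64};   // x, y, z with x fastest-varying in memory
  cufftHandle plan;
  cufftCheck(cufftPlan3d(&plan, n[2], n[1], n[0], CUFFT_R2C));
  // ... cufftExecR2C(plan, ...) ...
  cufftCheck(cufftDestroy(plan));
#endif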

CudaFFTCompute::~CudaFFTCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftDestroy(forwardPlan));
  cufftCheck(cufftDestroy(backwardPlan));
  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
  if (dataDstAllocated) deallocate_device<float>(&dataDst);
}

float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  cudaCheck(cudaSetDevice(deviceID));
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}

// int ncall = 0;

void CudaFFTCompute::forward() {
  cudaCheck(cudaSetDevice(deviceID));
  // ncall++;
  if (forwardType == CUFFT_R2C) {

    cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));

    // if (ncall == 1) {
    //   writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
    // }

    // if (ncall == 1 && plantype == 2) {
    //   writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
    // }

  } else if (forwardType == CUFFT_C2C) {
    // nc2cf++;
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
    cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
    // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
    // if (ncall == 1 && plantype == 1 && isize == 62) {
    //   writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
    // }
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}

void CudaFFTCompute::backward() {
  cudaCheck(cudaSetDevice(deviceID));
  if (backwardType == CUFFT_C2R) {
    // if (ncall == 1) {
    //   if (plantype == 1)
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
    //   else
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
    // }

    cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));

    // if (ncall == 1)
    //   if (plantype == 1)
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
    //   else
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
  } else if (backwardType == CUFFT_C2C) {
    // nc2cb++;
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
    cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}
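
// Note: cuFFT transforms are unnormalized, so a forward R2C followed by a
// backward C2R multiplies every grid value by the number of grid points.
// No explicit rescaling appears in this file; in PME the overall scaling is
// accounted for in the reciprocal-space formulation (the scalar_sum pass)
// rather than applied as a separate step.  If a plain round trip were needed,
// the normalization would look like this sketch ("scale_array" is a
// hypothetical helper and the sizes are illustrative, not part of NAMD):
#if 0
  const float inv_norm = 1.0f/((float)(pmeGrid.K1)*pmeGrid.K2*pmeGrid.K3);
  scale_array(dataSrc, pmeGrid.K1*pmeGrid.K2*pmeGrid.K3, inv_norm, stream);
#endif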

void CudaFFTCompute::setStream() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}

//###########################################################################
//###########################################################################
//###########################################################################

CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream) :
  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa),
  deviceID(deviceID), stream(stream) {

  cudaCheck(cudaSetDevice(deviceID));

  // Copy bm1 -> prefac_x on GPU memory
  float *bm1f = new float[pmeGrid.K1];
  float *bm2f = new float[pmeGrid.K2];
  float *bm3f = new float[pmeGrid.K3];
  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
  // ncall = 0;
}

CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
}

void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
#if 0
  // Check lattice to make sure it is updating for constant pressure
  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
    lattice.a().x, lattice.a().y, lattice.a().z,
    lattice.b().x, lattice.b().y, lattice.b().z,
    lattice.c().x, lattice.c().y, lattice.c().z);
#endif
  cudaCheck(cudaSetDevice(deviceID));

  const bool doEnergyVirial = (doEnergy || doVirial);

  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;

  BigReal volume = lattice.volume();
  Vector a_r = lattice.a_r();
  Vector b_r = lattice.b_r();
  Vector c_r = lattice.c_r();
  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;

  if (permutation == Perm_Z_cX_Y) {
    // Z, X, Y
    nfft1 = pmeGrid.K3;
    nfft2 = pmeGrid.K1;
    nfft3 = pmeGrid.K2;
    prefac1 = d_bm3;
    prefac2 = d_bm1;
    prefac3 = d_bm2;
    recip1x = c_r.z;
    recip1y = c_r.x;
    recip1z = c_r.y;
    recip2x = a_r.z;
    recip2y = a_r.x;
    recip2z = a_r.y;
    recip3x = b_r.z;
    recip3y = b_r.x;
    recip3z = b_r.y;
  } else if (permutation == Perm_cX_Y_Z) {
    // X, Y, Z
    nfft1 = pmeGrid.K1;
    nfft2 = pmeGrid.K2;
    nfft3 = pmeGrid.K3;
    prefac1 = d_bm1;
    prefac2 = d_bm2;
    prefac3 = d_bm3;
    recip1x = a_r.x;
    recip1y = a_r.y;
    recip1z = a_r.z;
    recip2x = b_r.x;
    recip2y = b_r.y;
    recip2z = b_r.z;
    recip3x = c_r.x;
    recip3y = c_r.y;
    recip3z = c_r.z;
  } else {
    NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  }

  // ncall++;
  // if (ncall == 1) {
  //   char filename[256];
  //   sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
  //   writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
  // }

  // if (ncall == 1) {
  //   float2* h_data = new float2[size1*size2*size3];
  //   float2* d_data = (float2*)data;
  //   copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
  //   cudaCheck(cudaStreamSynchronize(stream));
  //   FILE *handle = fopen("dataf.txt", "w");
  //   for (int z=0;z < pmeGrid.K3;z++) {
  //     for (int y=0;y < pmeGrid.K2;y++) {
  //       for (int x=0;x < pmeGrid.K1/2+1;x++) {
  //         int i;
  //         if (permutation == Perm_cX_Y_Z) {
  //           i = x + y*size1 + z*size1*size2;
  //         } else {
  //           i = z + x*size1 + y*size1*size2;
  //         }
  //         fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  //       }
  //     }
  //   }
  //   fclose(handle);
  //   delete [] h_data;
  // }

  // Clear energy and virial array if needed
  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);

  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
    recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
    volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
    &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
    stream);

  // Copy energy and virial to host if needed
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
    // cudaCheck(cudaStreamSynchronize(stream));
  }

}

// void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
//   cudaCheck(cudaSetDevice(deviceID));
//   cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
// }

void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone();
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone();
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg,"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
        c->checkCount);
      cudaDie(errmsg,err);
    }
  } else {
    // Anything else is an error
    char errmsg[256];
    sprintf(errmsg,"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
      c->checkCount);
    cudaDie(errmsg,err);
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}
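
// The polling pattern above avoids blocking a Charm++ scheduler thread on the
// GPU: cudaEventQuery() is cheap, and if the event has not fired the function
// re-arms itself via CcdCallFnAfter() with a 0.1 ms delay.  A stripped-down
// sketch of the same idiom ("MyCompute", "event" and "done()" are
// illustrative names, not part of this file):
#if 0
void myEventCheck(void *arg, double walltime) {
  MyCompute* c = (MyCompute *)arg;
  if (cudaEventQuery(c->event) == cudaSuccess) {
    c->done();                          // event fired, deliver the result
    return;
  }
  CcdCallBacksReset(0, walltime);       // otherwise poll again in 0.1 ms
  CcdCallFnAfter(myEventCheck, arg, 0.1);
}
#endif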

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = pencilPtr;
  pencilZPtr = NULL;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = NULL;
  pencilZPtr = pencilPtr;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}

void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];
    virial[8] = h_energyVirial->virial[0];
  } else if (permutation == Perm_cX_Y_Z) {
    // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
  }
}
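
// The six packed components form the symmetric 3x3 PME virial; the unpacking
// above places each component at both of its symmetric positions (which is
// why virial[7], the ZY element, reuses packed index 2, the same as
// virial[5]).  A compact sketch of the Perm_cX_Y_Z index map (XX, XY, XZ,
// YY, YZ, ZZ packing), useful for checking the assignments above:
#if 0
  static const int packedToFull[9] = { 0, 1, 2,    // XX XY XZ
                                       1, 3, 4,    // YX YY YZ
                                       2, 4, 5 };  // ZX ZY ZZ
  for (int i=0;i < 9;i++) virial[i] = h_energyVirial->virial[packedToFull[i]];
#endif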


//###########################################################################
//###########################################################################
//###########################################################################

//
// Class constructor
//
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
  if (dataSize < xsize*ysize*zsize)
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  cudaCheck(cudaSetDevice(deviceID));
  d_atomsCapacity = 0;
  d_atoms = NULL;
  d_forceCapacity = 0;
  d_force = NULL;
  tex_data = NULL;
  tex_data_len = 0;
  allocate_device<float>(&data, dataSize);
  setupGridTexture(data, xsize*ysize*zsize);
  cudaCheck(cudaEventCreate(&gatherForceEvent));
}

//
// Class destructor
//
CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
  // deallocate_device<double>(&d_selfEnergy);
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
}

// //
// // Copy patches and atoms to device memory
// //
// void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
//   const int numAtoms, const CudaAtom* atoms) {

//   this->numPatches = numPatches;
//   this->numAtoms = numAtoms;

//   // Reallocate device arrays as necessary
//   reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
//   reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);

//   // Copy atom and patch data to device
//   copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
//   copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
// }

//
// Copy atoms to device memory
//
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  cudaCheck(cudaSetDevice(deviceID));
  this->numAtoms = numAtoms;

  // Reallocate device arrays as necessary
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);

  // Copy atom data to device
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
}
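
// reallocate_device() grows the buffer only when the requested size exceeds
// the tracked capacity, over-allocating by the given factor (1.5x here) so
// that a slowly growing atom count does not force a reallocation every step.
// A hedged sketch of the idea behind such a helper (assumed semantics, not
// the actual CudaUtils implementation):
#if 0
template <typename T>
void reallocate_sketch(T **pp, int *capacity, int n, float fac) {
  if (n <= *capacity) return;         // still fits, keep the old buffer
  if (*pp != NULL) deallocate_device<T>(pp);
  *capacity = (int)((double)n*fac);   // over-allocate to amortize growth
  allocate_device<T>(pp, *capacity);
}
#endif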

//
// Spread charges on grid
//
void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
  cudaCheck(cudaSetDevice(deviceID));

  // Clear grid
  clear_device_array<float>(data, xsize*ysize*zsize, stream);

  spread_charge((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
    xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, stream);

  // ncall++;

  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
}

void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
  cudaCheck(cudaSetDevice(c->deviceID));

  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
//    c->deviceProxy[CkMyNode()].gatherForceDone();
    c->devicePtr->gatherForceDone();
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg,"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
        c->checkCount);
      cudaDie(errmsg,err);
    }
  } else {
    // Anything else is an error
    char errmsg[256];
    sprintf(errmsg,"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
      c->checkCount);
    cudaDie(errmsg,err);
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}

void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  cudaCheck(cudaSetDevice(deviceID));
  devicePtr = devicePtr_in;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}

void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}

void CudaPmeRealSpaceCompute::setupGridTexture(float* data, int data_len) {
  if (tex_data == data && tex_data_len == data_len) return;
  tex_data = data;
  tex_data_len = data_len;
  // Use texture objects
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
}
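
// The texture object created above gives the gather kernel cached, read-only
// access to the charge grid.  Inside a kernel, a linear float texture such as
// gridTexObj is read with tex1Dfetch; a minimal sketch (hypothetical kernel
// that would live in a .cu translation unit, not part of this file):
#if 0
__global__ void readGrid(cudaTextureObject_t tex, int n, float *out) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) out[i] = tex1Dfetch<float>(tex, i);  // cached read of grid[i]
}
#endif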

void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  cudaCheck(cudaSetDevice(deviceID));

  // Re-allocate force array if needed
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);

  gather_force((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
    xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, (float3*)d_force,
    gridTexObj,
    stream);

  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);

  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
}

/*
double CudaPmeRealSpaceCompute::calcSelfEnergy() {
  double h_selfEnergy;
  clear_device_array<double>(d_selfEnergy, 1);
  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  // 1.7724538509055160273 = sqrt(pi)
  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
  return h_selfEnergy;
}
*/

//###########################################################################
//###########################################################################
//###########################################################################

CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
  cudaCheck(cudaSetDevice(deviceID));

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  // Set data pointers to NULL; they can be overridden later via setDataPtrs()
  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
}

CudaPmeTranspose::~CudaPmeTranspose() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
}

//
// Set dataPtrsYZX
//
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }
  // Build batched data structures
  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_Z_cX_Y:
      // ZXY -> XYZ
      isize_out = pmeGrid.K1/2+1;
    } else if (iperm == 1) {
      // Perm_cX_Y_Z:
      // XYZ -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> ZXY
      isize_out = pmeGrid.K3;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsYZX[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = jsize;
      } else {
        // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx = nx;
      batch.ysize_out = width_out;
      batch.zsize_out = ksize;
      batch.data_in = data+x0;
      batch.data_out = data_out;

      h_batchesYZX[iperm*nblock + iblock] = batch;

      // transpose_xyz_yzx(
      //   nx, jsize, ksize,
      //   isize, jsize,
      //   width_out, ksize,
      //   data+x0, data_out, stream);
    }

    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}
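
// Batching rationale: rather than one kernel launch per block, the nblock
// descriptors for each of the three possible input permutations are packed
// into a single device array so that one batchTranspose_* launch can handle
// every block at once.  The descriptor for permutation index iperm and block
// iblock sits at iperm*nblock + iblock, e.g. (illustrative only):
#if 0
  TransposeBatch<float2> b = h_batchesYZX[iperm*nblock + iblock];
  // b.data_in is the block's input slab (b.nx columns of the x dimension),
  // b.data_out its destination; ysize_out/zsize_out give the output pitches.
#endif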

//
// Set dataPtrsZXY
//
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }

  // Build batched data structures
  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_cX_Y_Z:
      // XYZ -> ZXY
      isize_out = pmeGrid.K3;
    } else if (iperm == 1) {
      // Perm_Z_cX_Y:
      // ZXY -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> XYZ
      isize_out = pmeGrid.K1/2+1;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsZXY[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = ksize;
      } else {
        // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx = nx;
      batch.zsize_out = width_out;
      batch.xsize_out = nx;
      batch.data_in = data+x0;
      batch.data_out = data_out;

      h_batchesZXY[iperm*nblock + iblock] = batch;
    }

    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}

void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
  case Perm_Z_cX_Y:
    // ZXY -> XYZ
    iperm = 0;
    break;
  case Perm_cX_Y_Z:
    // XYZ -> YZX
    iperm = 1;
    break;
  case Perm_Y_Z_cX:
    // YZX -> ZXY
    iperm = 2;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  batchTranspose_xyz_yzx(
    nblock, batchesYZX + iperm*nblock,
    max_nx_YZX[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
  case Perm_Z_cX_Y:
    // ZXY -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
  case Perm_cX_Y_Z:
    // XYZ -> YZX
    isize_out = pmeGrid.K2;
    break;
  case Perm_Y_Z_cX:
    // YZX -> ZXY
    isize_out = pmeGrid.K3;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsYZX[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = jsize;
    } else {
      // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
      data_out = dataPtrsYZX[iblock];
      width_out = isize_out;
    }

    transpose_xyz_yzx(
      nx, jsize, ksize,
      isize, jsize,
      width_out, ksize,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
  case Perm_cX_Y_Z:
    // XYZ -> ZXY
    iperm = 0;
    break;
  case Perm_Z_cX_Y:
    // ZXY -> YZX
    iperm = 1;
    break;
  case Perm_Y_Z_cX:
    // YZX -> XYZ
    iperm = 2;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  batchTranspose_xyz_zxy(
    nblock, batchesZXY + iperm*nblock,
    max_nx_ZXY[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
  case Perm_cX_Y_Z:
    // XYZ -> ZXY
    isize_out = pmeGrid.K3;
    break;
  case Perm_Z_cX_Y:
    // ZXY -> YZX
    isize_out = pmeGrid.K2;
    break;
  case Perm_Y_Z_cX:
    // YZX -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsZXY[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = ksize;
    } else {
      // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
      data_out = dataPtrsZXY[iblock];
      width_out = isize_out;
    }

    transpose_xyz_zxy(
      nx, jsize, ksize,
      isize, jsize,
      width_out, nx,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::waitStreamSynchronize() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaStreamSynchronize(stream));
}

void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  int copySize  = jsize*ksize*nx;
  int copyStart = jsize*ksize*x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
}

void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  copy3D_HtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}
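
// copy3D_HtoD() moves an ni x nj x nk sub-volume between two row-major 3D
// arrays with independent source/destination pitches and origins.  A hedged
// sketch of how such a helper can be built on the CUDA runtime (an assumed
// equivalent with illustrative variable names, not the actual CudaUtils
// implementation):
#if 0
  cudaMemcpy3DParms p = {0};
  p.srcPtr = make_cudaPitchedPtr(src, src_xsize*sizeof(float2), src_xsize, src_ysize);
  p.dstPtr = make_cudaPitchedPtr(dst, dst_xsize*sizeof(float2), dst_xsize, dst_ysize);
  p.srcPos = make_cudaPos(sx0*sizeof(float2), sy0, sz0);  // x offset in bytes
  p.dstPos = make_cudaPos(dx0*sizeof(float2), dy0, dz0);
  p.extent = make_cudaExtent(ni*sizeof(float2), nj, nk);
  p.kind   = cudaMemcpyHostToDevice;
  cudaCheck(cudaMemcpy3DAsync(&p, stream));
#endif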

#ifndef P2P_ENABLE_3D
//
// Copy from temporary buffer to final buffer
//
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

//
// Return temporary buffer for block "iblock"
//
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  return d_buffer + i0*nj*nk;
}
#endif

void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = jblock;
  int jblock_out = kblock;
  int kblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = kblock;
  int jblock_out = iblock;
  int kblock_out = jblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
  const int iblock_out, const int jblock_out, const int kblock_out,
  int deviceID_out, int permutation_out, float2* data_out) {

  cudaCheck(cudaSetDevice(deviceID));

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];
  float2* data_in = d_data + jsize*ksize*x0;

#ifndef P2P_ENABLE_3D
  // Copy into temporary peer device buffer
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
    0, 0, 0,
    ni, nj,
    0, 0, 0,
    isize_out, jsize_out,
    ni, nj, nk, stream);
#endif
}
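
// Without P2P_ENABLE_3D, the data travel as one contiguous linear chunk into
// a staging buffer on the destination device (copyDataDeviceToDevice above
// then performs the second, strided hop); with it defined, a strided 3D peer
// copy goes directly into place.  The linear path boils down to a peer copy;
// a hedged sketch of the underlying runtime call ("src"/"dst" illustrative,
// assumed semantics of the copy_PeerDtoD helper):
#if 0
  cudaCheck(cudaMemcpyPeerAsync(dst, deviceID_out, src, deviceID,
    ni*nj*nk*sizeof(float2), stream));
#endif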

#endif // NAMD_CUDA