NAMD
CudaPmeSolver.C
Go to the documentation of this file.
1 #include "Node.h"
2 #include "Priorities.h"
3 #include "ComputeNonbondedUtil.h"
4 #include "CudaPmeSolverUtil.h"
5 #include "ComputePmeCUDAMgr.h"
6 #include "ComputePmeCUDAMgr.decl.h"
7 #include "CudaPmeSolver.h"
8 #include "DeviceCUDA.h"
9 
10 #ifdef NAMD_CUDA
11 #ifdef WIN32
12 #define __thread __declspec(thread)
13 #endif
14 extern __thread DeviceCUDA *deviceCUDA;
15 //#define DISABLE_P2P
16 
18  pmeGrid = msg->pmeGrid;
19  delete msg;
20 }
21 
22 //
23 // CUDA specific initialization
24 //
26  // Store device proxy
27  deviceProxy = msg->deviceProxy;
28  delete msg;
29  int deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
30  cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
31  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
32  // Setup fftCompute and pmeKSpaceCompute
33  fftCompute = new CudaFFTCompute(deviceID, stream);
34  pmeKSpaceCompute = new CudaPmeKSpaceCompute(pmeGrid, Perm_cX_Y_Z, 0, 0,
35  ComputeNonbondedUtil::ewaldcof, deviceID, stream);
36 }
37 
38 void CudaPmePencilXYZ::backwardDone() {
39  deviceProxy[CkMyNode()].gatherForce();
40  ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->energyAndVirialSetCallback(this);
41 
42  // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->waitEnergyAndVirial();
43  // submitReductions();
44  // deviceProxy[CkMyNode()].gatherForce();
45 }
46 
48  submitReductions();
49  // deviceProxy[CkMyNode()].gatherForce();
50 }
51 
52 //###########################################################################
53 //###########################################################################
54 //###########################################################################
55 
57  pmeGrid = msg->pmeGrid;
58  pmePencilZ = msg->pmePencilZ;
59  zMap = msg->zMap;
60 
61  delete msg;
62 
63  initBlockSizes();
64 }
65 
67  if (eventCreated) cudaCheck(cudaEventDestroy(event));
68 }
69 
70 //
71 // CUDA specific initialization
72 //
74  // Store device proxy
75  deviceProxy = msg->deviceProxy;
76  delete msg;
77  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
78  stream = deviceProxy.ckLocalBranch()->getStream();
79  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
80  // Setup fftCompute and pmeKSpaceCompute
81  fftCompute = new CudaFFTCompute(deviceID, stream);
82  pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, 0, thisIndex.z, deviceID, stream);
83 
84  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false, NULL));
85  numDeviceBuffers = 0;
86 
87  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
88  cudaCheck(cudaSetDevice(deviceID));
89  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
90  eventCreated = true;
91 
92 /*
93  bool useMultiGPUfft = true;
94  bool allDeviceOnSameNode = true;
95  for (int x=0;x < pmeGrid.xBlocks;x++) {
96  int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
97  allDeviceOnSameNode &= (CkNodeOf(pe) == CkMyNode());
98  }
99 
100  if (useMultiGPUfft && allDeviceOnSameNode && pmeGrid.xBlocks > 1) {
101 
102 
103 
104  } else {
105 */
106 
107  for (int x=0;x < pmeGrid.xBlocks;x++) {
108  int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
109  if (CkNodeOf(pe) == CkMyNode()) {
110  // Get device ID on a device on this node
111  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(x, 0);
112  // Check for Peer-to-Peer access
113  int canAccessPeer = 0;
114  if (deviceID != deviceID0) {
115  cudaCheck(cudaSetDevice(deviceID));
116  cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
117 #ifdef DISABLE_P2P
118  canAccessPeer = 0;
119 #endif
120  if (canAccessPeer) {
121  unsigned int flags = 0;
122  cudaCheck(cudaDeviceEnablePeerAccess(deviceID0, flags));
123  // fprintf(stderr, "device %d can access device %d\n", deviceID, deviceID0);
124  }
125  }
126  numDeviceBuffers++;
127  deviceBuffers[x] = DeviceBuffer(deviceID0, canAccessPeer, NULL);
128  pmePencilZ(x,0,0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
129  }
130  }
131 
132  // }
133 
134 }
135 
136 //
137 // CUDA specific start
138 //
139 void CudaPmePencilXY::start(const CkCallback &cb) {
140  thisProxy[thisIndex].recvDeviceBuffers(cb);
141 }
142 
143 void CudaPmePencilXY::setDeviceBuffers() {
144  std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (float2*)0);
145  for (int x=0;x < pmeGrid.xBlocks;x++) {
146  if (deviceBuffers[x].data != NULL) {
147  if (deviceBuffers[x].deviceID == deviceID || deviceBuffers[x].isPeerDevice) {
148  // Device buffer on same device => directly transpose into destination pencil
149  dataPtrs[x] = deviceBuffers[x].data;
150  // Otherwise, when device buffer on different device on same node => transpose locally and then
151  // use cudaMemcpy3DPeerAsync to perform the copying
152  }
153  }
154  }
155  ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsZXY(dataPtrs, (float2 *)fftCompute->getDataDst());
156 }
157 
158 float2* CudaPmePencilXY::getData(const int i, const bool sameDevice) {
159  float2* data;
160 #ifndef P2P_ENABLE_3D
161  if (sameDevice) {
162  int i0, i1, j0, j1, k0, k1;
163  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
164  data = (float2 *)fftCompute->getDataDst() + i0;
165  } else {
166  data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
167  }
168 #else
169  int i0, i1, j0, j1, k0, k1;
170  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
171  data = (float2 *)fftCompute->getDataDst() + i0;
172 #endif
173  return data;
174 }
175 
176 void CudaPmePencilXY::backwardDone() {
177  deviceProxy[CkMyNode()].gatherForce();
178 }
179 
180 void CudaPmePencilXY::forwardDone() {
181  // Transpose locally
182  pmeTranspose->transposeXYZtoZXY((float2 *)fftCompute->getDataDst());
183 
184  // Direct Device-To-Device communication within node
185  if (numDeviceBuffers > 0) {
186  // Copy data
187  for (int x=0;x < pmeGrid.xBlocks;x++) {
188  if (deviceBuffers[x].data != NULL) {
189  if (deviceBuffers[x].deviceID != deviceID && !deviceBuffers[x].isPeerDevice) {
190  ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(x, deviceBuffers[x].deviceID,
191  Perm_Z_cX_Y, deviceBuffers[x].data);
192  }
193  }
194  }
195  // Record event for this pencil
196  cudaCheck(cudaEventRecord(event, stream));
197  // Send empty message
198  for (int x=0;x < pmeGrid.xBlocks;x++) {
199  if (deviceBuffers[x].data != NULL) {
200  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
201  msg->dataSize = 0;
202  msg->x = x;
203  msg->y = thisIndex.y;
204  msg->z = thisIndex.z;
205  msg->doEnergy = doEnergy;
206  msg->doVirial = doVirial;
207  msg->lattice = lattice;
208  msg->numStrayAtoms = numStrayAtoms;
209  pmePencilZ(x,0,0).recvBlock(msg);
210  }
211  }
212  }
213 
214  // Copy-Via-Host communication
215  for (int x=0;x < pmeGrid.xBlocks;x++) {
216  if (deviceBuffers[x].data == NULL) {
217  PmeBlockMsg* msg = new (blockSizes[x], PRIORITY_SIZE) PmeBlockMsg();
218  msg->dataSize = blockSizes[x];
219  msg->x = x;
220  msg->y = thisIndex.y;
221  msg->z = thisIndex.z;
222  msg->doEnergy = doEnergy;
223  msg->doVirial = doVirial;
224  msg->lattice = lattice;
225  msg->numStrayAtoms = numStrayAtoms;
226  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(x, msg->data, msg->dataSize);
227  ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
228  pmePencilZ(x,0,0).recvBlock(msg);
229  }
230  }
231 }
232 
233 void CudaPmePencilXY::recvDataFromZ(PmeBlockMsg *msg) {
234  if (msg->dataSize != 0) {
235  // Buffer is coming from a different node
236  ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->x, msg->data, (float2 *)fftCompute->getDataDst());
237  } else {
238  // Buffer is coming from the same node
239  // Wait for event that was recorded on the sending pencil
240  // device ID = deviceBuffers[msg->x].deviceID
241  // event = deviceBuffers[msg->x].event
242  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
243 #ifndef P2P_ENABLE_3D
244  if (deviceBuffers[msg->x].data != NULL && deviceBuffers[msg->x].deviceID != deviceID && !deviceBuffers[msg->x].isPeerDevice) {
245  // Data is in temporary device buffer, copy it into final fft-buffer
246  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->x, (float2 *)fftCompute->getDataDst());
247  }
248 #endif
249  }
250  delete msg;
251 }
252 
253 //###########################################################################
254 //###########################################################################
255 //###########################################################################
256 
258  pmeGrid = msg->pmeGrid;
259  pmePencilY = msg->pmePencilY;
260  yMap = msg->yMap;
261 
262  delete msg;
263 
264  initBlockSizes();
265 
266 }
267 
269  if (eventCreated) cudaCheck(cudaEventDestroy(event));
270 }
271 
272 //
273 // CUDA specific initialization
274 //
276  // Store device proxy
277  deviceProxy = msg->deviceProxy;
278  delete msg;
279  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
280  stream = deviceProxy.ckLocalBranch()->getStream();
281  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
282  // Setup fftCompute and pmeKSpaceCompute
283  fftCompute = new CudaFFTCompute(deviceID, stream);
284  pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, thisIndex.y, thisIndex.z, deviceID, stream);
285 
286  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
287  cudaCheck(cudaSetDevice(deviceID));
288  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
289  eventCreated = true;
290 
291  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false, NULL));
292  numDeviceBuffers = 0;
293 
294  for (int x=0;x < pmeGrid.xBlocks;x++) {
295  int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,thisIndex.z));
296  if (CkNodeOf(pe) == CkMyNode()) {
297  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(x, thisIndex.z);
298  numDeviceBuffers++;
299  deviceBuffers[x] = DeviceBuffer(deviceID0, false, NULL);
300  pmePencilY(x,0,thisIndex.z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
301  }
302  }
303 
304 }
305 
306 //
307 // CUDA specific start
308 //
309 void CudaPmePencilX::start(const CkCallback &cb) {
310  thisProxy[thisIndex].recvDeviceBuffers(cb);
311 }
312 
313 //
314 // Setup direct device buffers
315 //
316 void CudaPmePencilX::setDeviceBuffers() {
317  std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (float2*)0);
318  for (int x=0;x < pmeGrid.xBlocks;x++) {
319  if (deviceBuffers[x].data != NULL) {
320  if (deviceBuffers[x].deviceID == deviceID) {
321  // Device buffer on same device => directly transpose into destination pencil
322  dataPtrs[x] = deviceBuffers[x].data;
323  // Otherwise, when device buffer on different device on same node => transpose locally and then
324  // use cudaMemcpy3DPeerAsync to perform the copying
325  }
326  }
327  }
328  ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsYZX(dataPtrs, (float2 *)fftCompute->getDataDst());
329 }
330 
331 float2* CudaPmePencilX::getData(const int i, const bool sameDevice) {
332  float2* data;
333 #ifndef P2P_ENABLE_3D
334  if (sameDevice) {
335  int i0, i1, j0, j1, k0, k1;
336  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
337  data = (float2 *)fftCompute->getDataDst() + i0;
338  } else {
339  data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
340  }
341 #else
342  int i0, i1, j0, j1, k0, k1;
343  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
344  data = (float2 *)fftCompute->getDataDst() + i0;
345 #endif
346  return data;
347 }
348 
349 void CudaPmePencilX::backwardDone() {
350  deviceProxy[CkMyNode()].gatherForce();
351 }
352 
353 void CudaPmePencilX::forwardDone() {
354  if (pmeTranspose == NULL)
355  NAMD_bug("CudaPmePencilX::forwardDone, pmeTranspose not initialized");
356  if (blockSizes.size() == 0)
357  NAMD_bug("CudaPmePencilX::forwardDone, blockSizes not initialized");
358  // Transpose locally
359  pmeTranspose->transposeXYZtoYZX((float2 *)fftCompute->getDataDst());
360 
361  // Send data to y-pencils that share the same z-coordinate. There are pmeGrid.xBlocks of them
362  // Direct-Device-To-Device communication
363  if (numDeviceBuffers > 0) {
364  // Copy data
365  for (int x=0;x < pmeGrid.xBlocks;x++) {
366  if (deviceBuffers[x].data != NULL) {
367  if (deviceBuffers[x].deviceID != deviceID) {
368  ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(x, deviceBuffers[x].deviceID,
369  Perm_Y_Z_cX, deviceBuffers[x].data);
370  }
371  }
372  }
373  // Record event for this pencil
374  cudaCheck(cudaEventRecord(event, stream));
375  // Send empty messages
376  for (int x=0;x < pmeGrid.xBlocks;x++) {
377  if (deviceBuffers[x].data != NULL) {
378  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
379  msg->dataSize = 0;
380  msg->x = x;
381  msg->y = thisIndex.y;
382  msg->z = thisIndex.z;
383  msg->doEnergy = doEnergy;
384  msg->doVirial = doVirial;
385  msg->lattice = lattice;
386  msg->numStrayAtoms = numStrayAtoms;
387  pmePencilY(x,0,thisIndex.z).recvBlock(msg);
388  }
389  }
390  }
391 
392  // Copy-To-Host communication
393  for (int x=0;x < pmeGrid.xBlocks;x++) {
394  if (deviceBuffers[x].data == NULL) {
395  PmeBlockMsg* msg = new (blockSizes[x], PRIORITY_SIZE) PmeBlockMsg();
396  msg->dataSize = blockSizes[x];
397  msg->x = x;
398  msg->y = thisIndex.y;
399  msg->z = thisIndex.z;
400  msg->doEnergy = doEnergy;
401  msg->doVirial = doVirial;
402  msg->lattice = lattice;
403  msg->numStrayAtoms = numStrayAtoms;
404  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(x, msg->data, msg->dataSize);
405  ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
406  pmePencilY(x,0,thisIndex.z).recvBlock(msg);
407  }
408  }
409 }
410 
411 void CudaPmePencilX::recvDataFromY(PmeBlockMsg *msg) {
412  if (msg->dataSize != 0) {
413  // Buffer is coming from a different node
414  ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->x, msg->data, (float2 *)fftCompute->getDataDst());
415  } else {
416  // Buffer is coming from the same node
417  // Wait for event that was recorded on the sending pencil
418  // device ID = deviceBuffers[msg->x].deviceID
419  // event = deviceBuffers[msg->x].event
420  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
421 #ifndef P2P_ENABLE_3D
422  if (deviceBuffers[msg->x].data != NULL && deviceBuffers[msg->x].deviceID != deviceID) {
423  // Data is in temporary device buffer, copy it into final fft-buffer
424  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->x, (float2 *)fftCompute->getDataDst());
425  }
426 #endif
427  }
428  delete msg;
429 }
430 
431 //###########################################################################
432 //###########################################################################
433 //###########################################################################
434 
436  pmeGrid = msg->pmeGrid;
437  pmePencilX = msg->pmePencilX;
438  pmePencilZ = msg->pmePencilZ;
439  xMap = msg->xMap;
440  zMap = msg->zMap;
441 
442  delete msg;
443 
444  initBlockSizes();
445 }
446 
448  if (eventCreated) cudaCheck(cudaEventDestroy(event));
449 }
450 
451 //
452 // CUDA specific initialization
453 //
455  // Get device proxy
456  // CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
457  deviceID = msg->deviceID;
458  stream = msg->stream;
459  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
460  delete msg;
461  // deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
462  // cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
463  // CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
464  // Setup fftCompute and pmeKSpaceCompute
465  fftCompute = new CudaFFTCompute(deviceID, stream);
466  pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_Y_Z_cX, thisIndex.z, thisIndex.x, deviceID, stream);
467 
468  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
469  cudaCheck(cudaSetDevice(deviceID));
470  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
471  eventCreated = true;
472 
473  deviceBuffersZ.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false, NULL));
474  deviceBuffersX.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false, NULL));
475  numDeviceBuffersZ = 0;
476  numDeviceBuffersX = 0;
477 
478  for (int y=0;y < pmeGrid.yBlocks;y++) {
479  int pe;
480  pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x, y, 0));
481  if (CkNodeOf(pe) == CkMyNode()) {
482  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(thisIndex.x, y);
483  numDeviceBuffersZ++;
484  deviceBuffersZ[y] = DeviceBuffer(deviceID0, false, NULL);
485  pmePencilZ(thisIndex.x, y, 0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID), thisProxy);
486  }
487  pe = xMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0, y, thisIndex.z));
488  if (CkNodeOf(pe) == CkMyNode()) {
489  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(y, thisIndex.z);
490  numDeviceBuffersX++;
491  deviceBuffersX[y] = DeviceBuffer(deviceID0, false, NULL);
492  pmePencilX(0, y, thisIndex.z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID), thisProxy);
493  }
494  }
495 
496 }
497 
498 //
499 // CUDA specific start
500 //
501 void CudaPmePencilY::start(const CkCallback &cb) {
502  thisProxy[thisIndex].recvDeviceBuffers(cb);
503 }
504 
505 //
506 // Setup direct device buffers
507 //
508 void CudaPmePencilY::setDeviceBuffers() {
509  std::vector<float2*> dataPtrsYZX(pmeGrid.yBlocks, (float2*)0);
510  std::vector<float2*> dataPtrsZXY(pmeGrid.yBlocks, (float2*)0);
511  for (int y=0;y < pmeGrid.yBlocks;y++) {
512  if (deviceBuffersZ[y].data != NULL) {
513  if (deviceBuffersZ[y].deviceID == deviceID) {
514  dataPtrsYZX[y] = deviceBuffersZ[y].data;
515  }
516  }
517  if (deviceBuffersX[y].data != NULL) {
518  if (deviceBuffersX[y].deviceID == deviceID) {
519  dataPtrsZXY[y] = deviceBuffersX[y].data;
520  }
521  }
522  }
523  ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsYZX(dataPtrsYZX, (float2 *)fftCompute->getDataDst());
524  ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsZXY(dataPtrsZXY, (float2 *)fftCompute->getDataSrc());
525 }
526 
527 float2* CudaPmePencilY::getDataForX(const int i, const bool sameDevice) {
528  float2* data;
529 #ifndef P2P_ENABLE_3D
530  if (sameDevice) {
531  int i0, i1, j0, j1, k0, k1;
532  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
533  data = (float2 *)fftCompute->getDataSrc() + i0;
534  } else {
535  data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
536  }
537 #else
538  int i0, i1, j0, j1, k0, k1;
539  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
540  data = (float2 *)fftCompute->getDataSrc() + i0;
541 #endif
542  return data;
543 }
544 
545 float2* CudaPmePencilY::getDataForZ(const int i, const bool sameDevice) {
546  float2* data;
547 #ifndef P2P_ENABLE_3D
548  if (sameDevice) {
549  int i0, i1, j0, j1, k0, k1;
550  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
551  data = (float2 *)fftCompute->getDataDst() + i0;
552  } else {
553  data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
554  }
555 #else
556  int i0, i1, j0, j1, k0, k1;
557  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
558  data = (float2 *)fftCompute->getDataDst() + i0;
559 #endif
560  return data;
561 }
562 
563 void CudaPmePencilY::backwardDone() {
564  // Transpose locally
565  pmeTranspose->transposeXYZtoZXY((float2 *)fftCompute->getDataSrc());
566 
567  // Send data to x-pencils that share the same x-coordinate. There are pmeGrid.yBlocks of them
568  // Direct-Device-To-Device communication
569  if (numDeviceBuffersX > 0) {
570  for (int y=0;y < pmeGrid.yBlocks;y++) {
571  if (deviceBuffersX[y].data != NULL) {
572  if (deviceBuffersX[y].deviceID != deviceID) {
573  ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(y, deviceBuffersX[y].deviceID,
574  Perm_cX_Y_Z, deviceBuffersX[y].data);
575  }
576  }
577  }
578  // Record event for this pencil
579  cudaCheck(cudaEventRecord(event, stream));
580  // Send empty message
581  for (int y=0;y < pmeGrid.yBlocks;y++) {
582  if (deviceBuffersX[y].data != NULL) {
583  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
584  msg->dataSize = 0;
585  msg->x = thisIndex.x;
586  msg->y = y;
587  msg->z = thisIndex.z;
588  pmePencilX(0,y,thisIndex.z).recvBlock(msg);
589  }
590  }
591  }
592 
593  // Copy via host
594  for (int y=0;y < pmeGrid.yBlocks;y++) {
595  if (deviceBuffersX[y].data == NULL) {
596  PmeBlockMsg* msg = new (blockSizes[y], PRIORITY_SIZE) PmeBlockMsg();
597  msg->dataSize = blockSizes[y];
598  msg->x = thisIndex.x;
599  msg->y = y;
600  msg->z = thisIndex.z;
601  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(y, msg->data, msg->dataSize);
602  ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
603  pmePencilX(0,y,thisIndex.z).recvBlock(msg);
604  }
605  }
606 }
607 
608 void CudaPmePencilY::forwardDone() {
609  if (pmeTranspose == NULL)
610  NAMD_bug("CudaPmePencilY::forwardDone, pmeTranspose not initialized");
611  if (blockSizes.size() == 0)
612  NAMD_bug("CudaPmePencilY::forwardDone, blockSizes not initialized");
613 
614  // Transpose locally
615  pmeTranspose->transposeXYZtoYZX((float2 *)fftCompute->getDataDst());
616 
617  // Send data to z-pencils that share the same x-coordinate. There are pmeGrid.yBlocks of them
618  // Direct-Device-To-Device communication
619  if (numDeviceBuffersZ > 0) {
620  for (int y=0;y < pmeGrid.yBlocks;y++) {
621  if (deviceBuffersZ[y].data != NULL) {
622  if (deviceBuffersZ[y].deviceID != deviceID) {
623  ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(y, deviceBuffersZ[y].deviceID,
624  Perm_Z_cX_Y, deviceBuffersZ[y].data);
625  }
626  }
627  }
628  // Record event for this pencil
629  cudaCheck(cudaEventRecord(event, stream));
630  // Send empty message
631  for (int y=0;y < pmeGrid.yBlocks;y++) {
632  if (deviceBuffersZ[y].data != NULL) {
633  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
634  msg->dataSize = 0;
635  msg->x = thisIndex.x;
636  msg->y = y;
637  msg->z = thisIndex.z;
638  msg->doEnergy = doEnergy;
639  msg->doVirial = doVirial;
640  msg->lattice = lattice;
641  msg->numStrayAtoms = numStrayAtoms;
642  pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
643  }
644  }
645  }
646 
647  // Copy-To-Host communication
648  for (int y=0;y < pmeGrid.yBlocks;y++) {
649  if (deviceBuffersZ[y].data == NULL) {
650  PmeBlockMsg* msg = new (blockSizes[y], PRIORITY_SIZE) PmeBlockMsg();
651  msg->dataSize = blockSizes[y];
652  msg->x = thisIndex.x;
653  msg->y = y;
654  msg->z = thisIndex.z;
655  msg->doEnergy = doEnergy;
656  msg->doVirial = doVirial;
657  msg->lattice = lattice;
658  msg->numStrayAtoms = numStrayAtoms;
659  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(y, msg->data, msg->dataSize);
660  ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
661  pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
662  }
663  }
664 }
665 
666 void CudaPmePencilY::recvDataFromX(PmeBlockMsg *msg) {
667  if (msg->dataSize != 0) {
668  // Buffer is coming from a different node
669  ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->y, msg->data, (float2 *)fftCompute->getDataSrc());
670  } else {
671  // Buffer is coming from the same node
672  // Wait for event that was recorded on the sending pencil
673  // device ID = deviceBuffersX[msg->y].deviceID
674  // event = deviceBuffersX[msg->y].event
675  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersX[msg->y].event, 0));
676 #ifndef P2P_ENABLE_3D
677  if (deviceBuffersX[msg->y].data != NULL && deviceBuffersX[msg->y].deviceID != deviceID) {
678  // Data is in temporary device buffer, copy it into final fft-buffer
679  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->y, (float2 *)fftCompute->getDataSrc());
680  }
681 #endif
682  }
683  delete msg;
684 }
685 
686 void CudaPmePencilY::recvDataFromZ(PmeBlockMsg *msg) {
687  if (msg->dataSize != 0) {
688  // Buffer is coming from a different node
689  ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->y, msg->data, (float2 *)fftCompute->getDataDst());
690  } else {
691  // Buffer is coming from the same node
692  // Wait for event that was recorded on the sending pencil
693  // device ID = deviceBuffersZ[msg->y].deviceID
694  // event = deviceBuffersZ[msg->y].event
695  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersZ[msg->y].event, 0));
696 #ifndef P2P_ENABLE_3D
697  if (deviceBuffersZ[msg->y].data != NULL && deviceBuffersZ[msg->y].deviceID != deviceID) {
698  // Data is in temporary device buffer, copy it into final fft-buffer
699  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->y, (float2 *)fftCompute->getDataDst());
700  }
701 #endif
702  }
703  delete msg;
704 }
705 
706 //###########################################################################
707 //###########################################################################
708 //###########################################################################
709 
711  useXYslab = false;
712  pmeGrid = msg->pmeGrid;
713  pmePencilY = msg->pmePencilY;
714  yMap = msg->yMap;
715 
716  delete msg;
717 
718  initBlockSizes();
719 }
720 
722  useXYslab = true;
723  pmeGrid = msg->pmeGrid;
724  pmePencilXY = msg->pmePencilXY;
725  xyMap = msg->xyMap;
726 
727  delete msg;
728 
729  initBlockSizes();
730 }
731 
733  if (eventCreated) cudaCheck(cudaEventDestroy(event));
734 }
735 
736 //
737 // CUDA specific initialization
738 //
740  // Get device proxy
741  // CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
742  deviceID = msg->deviceID;
743  stream = msg->stream;
744  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
745  delete msg;
746  // deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
747  // cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
748  // CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
749  // Setup fftCompute and pmeKSpaceCompute
750  fftCompute = new CudaFFTCompute(deviceID, stream);
751  pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_Z_cX_Y, thisIndex.x, thisIndex.y, deviceID, stream);
752  pmeKSpaceCompute = new CudaPmeKSpaceCompute(pmeGrid, Perm_Z_cX_Y, thisIndex.x, thisIndex.y,
753  ComputeNonbondedUtil::ewaldcof, deviceID, stream);
754 
755  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
756  cudaCheck(cudaSetDevice(deviceID));
757  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
758  eventCreated = true;
759 
760  deviceBuffers.resize(pmeGrid.zBlocks, DeviceBuffer(-1, false, NULL));
761  numDeviceBuffers = 0;
762 
763  if (useXYslab) {
764  for (int z=0;z < pmeGrid.zBlocks;z++) {
765  int pe = xyMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,0,z));
766  if (CkNodeOf(pe) == CkMyNode()) {
767  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(0, z);
768  // Check for Peer-to-Peer access
769  int canAccessPeer = 0;
770  if (deviceID != deviceID0) {
771  cudaCheck(cudaSetDevice(deviceID));
772  cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
773  }
774 #ifdef DISABLE_P2P
775  canAccessPeer = 0;
776 #endif
777  numDeviceBuffers++;
778  deviceBuffers[z] = DeviceBuffer(deviceID0, canAccessPeer, NULL);
779  pmePencilXY(0,0,z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
780  }
781  }
782  } else {
783  for (int z=0;z < pmeGrid.zBlocks;z++) {
784  int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,0,z));
785  if (CkNodeOf(pe) == CkMyNode()) {
786  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(thisIndex.x, z);
787  numDeviceBuffers++;
788  deviceBuffers[z] = DeviceBuffer(deviceID0, false, NULL);
789  pmePencilY(thisIndex.x,0,z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
790  }
791  }
792  }
793 
794 }
795 
796 //
797 // CUDA specific start
798 //
799 void CudaPmePencilZ::start(const CkCallback &cb) {
800  thisProxy[thisIndex].recvDeviceBuffers(cb);
801 }
802 
803 void CudaPmePencilZ::setDeviceBuffers() {
804  std::vector<float2*> dataPtrs(pmeGrid.zBlocks, (float2*)0);
805  for (int z=0;z < pmeGrid.zBlocks;z++) {
806  if (deviceBuffers[z].data != NULL) {
807  if (deviceBuffers[z].deviceID == deviceID || deviceBuffers[z].isPeerDevice) {
808  dataPtrs[z] = deviceBuffers[z].data;
809  }
810  }
811  }
812  if (useXYslab) {
813  ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsYZX(dataPtrs, (float2 *)fftCompute->getDataSrc());
814  } else {
815  ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsZXY(dataPtrs, (float2 *)fftCompute->getDataSrc());
816  }
817 }
818 
819 float2* CudaPmePencilZ::getData(const int i, const bool sameDevice) {
820  float2* data;
821 #ifndef P2P_ENABLE_3D
822  if (sameDevice) {
823  int i0, i1, j0, j1, k0, k1;
824  getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
825  data = (float2 *)fftCompute->getDataSrc() + i0;
826  } else {
827  data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
828  }
829 #else
830  int i0, i1, j0, j1, k0, k1;
831  getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
832  data = (float2 *)fftCompute->getDataSrc() + i0;
833 #endif
834  return data;
835 }
836 
837 void CudaPmePencilZ::backwardDone() {
838  // Transpose locally
839  if (useXYslab) {
840  pmeTranspose->transposeXYZtoYZX((float2 *)fftCompute->getDataSrc());
841  } else {
842  pmeTranspose->transposeXYZtoZXY((float2 *)fftCompute->getDataSrc());
843  }
844 
845  if (useXYslab) {
846  // Direct-Device-To-Device communication
847  if (numDeviceBuffers > 0) {
848  for (int z=0;z < pmeGrid.zBlocks;z++) {
849  if (deviceBuffers[z].data != NULL) {
850  if (deviceBuffers[z].deviceID != deviceID && !deviceBuffers[z].isPeerDevice) {
851  ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(z, deviceBuffers[z].deviceID,
852  Perm_cX_Y_Z, deviceBuffers[z].data);
853  }
854  }
855  }
856  // Record event for this pencil
857  cudaCheck(cudaEventRecord(event, stream));
858  // Send empty message
859  for (int z=0;z < pmeGrid.zBlocks;z++) {
860  if (deviceBuffers[z].data != NULL) {
861  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
862  msg->dataSize = 0;
863  msg->x = thisIndex.x;
864  msg->y = thisIndex.y;
865  msg->z = z;
866  pmePencilXY(0,0,z).recvBlock(msg);
867  }
868  }
869  }
870 
871  // Copy-To-Host communication
872  for (int z=0;z < pmeGrid.zBlocks;z++) {
873  if (deviceBuffers[z].data == NULL) {
874  PmeBlockMsg* msg = new (blockSizes[z], PRIORITY_SIZE) PmeBlockMsg();
875  msg->dataSize = blockSizes[z];
876  msg->x = thisIndex.x;
877  msg->y = thisIndex.y;
878  msg->z = z;
879  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(z, msg->data, msg->dataSize);
880  ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
881  pmePencilXY(0,0,z).recvBlock(msg);
882  }
883  }
884  } else {
885  // Send data to y-pencils that share the same x-coordinate. There are pmeGrid.zBlocks of them
886  // Direct-Device-To-Device communication
887  if (numDeviceBuffers > 0) {
888  for (int z=0;z < pmeGrid.zBlocks;z++) {
889  if (deviceBuffers[z].data != NULL) {
890  if (deviceBuffers[z].deviceID != deviceID) {
891  ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(z, deviceBuffers[z].deviceID,
892  Perm_Y_Z_cX, deviceBuffers[z].data);
893  }
894  }
895  }
896  // Record event for this pencil
897  cudaCheck(cudaEventRecord(event, stream));
898  // Send empty message
899  for (int z=0;z < pmeGrid.zBlocks;z++) {
900  if (deviceBuffers[z].data != NULL) {
901  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
902  msg->dataSize = 0;
903  msg->x = thisIndex.x;
904  msg->y = thisIndex.y;
905  msg->z = z;
906  pmePencilY(thisIndex.x,0,z).recvBlock(msg);
907  }
908  }
909  }
910 
911  // Copy-To-Host communication
912  for (int z=0;z < pmeGrid.zBlocks;z++) {
913  if (deviceBuffers[z].data == NULL) {
914  PmeBlockMsg* msg = new (blockSizes[z], PRIORITY_SIZE) PmeBlockMsg();
915  msg->dataSize = blockSizes[z];
916  msg->x = thisIndex.x;
917  msg->y = thisIndex.y;
918  msg->z = z;
919  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(z, msg->data, msg->dataSize);
920  ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
921  pmePencilY(thisIndex.x,0,z).recvBlock(msg);
922  }
923  }
924  }
925 
926  // Submit reductions
927  ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->energyAndVirialSetCallback(this);
928  // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->waitEnergyAndVirial();
929  // submitReductions();
930 }
931 
933  submitReductions();
934 }
935 
936 void CudaPmePencilZ::recvDataFromY(PmeBlockMsg *msg) {
937  // NOTE: No need to synchronize stream here since memory copies are in the stream
938  if (msg->dataSize != 0) {
939  // Buffer is coming from a different node
940  ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->z, msg->data, (float2 *)fftCompute->getDataSrc());
941  } else {
942  // Buffer is coming from the same node
943  // Wait for event that was recorded on the sending pencil
944  // device ID = deviceBuffers[msg->z].deviceID
945  // event = deviceBuffers[msg->z].event
946  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->z].event, 0));
947 #ifndef P2P_ENABLE_3D
948  if (deviceBuffers[msg->z].data != NULL && deviceBuffers[msg->z].deviceID != deviceID && !deviceBuffers[msg->z].isPeerDevice) {
949  // Data is in temporary device buffer, copy it into final fft-buffer
950  ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->z, (float2 *)fftCompute->getDataSrc());
951  }
952 #endif
953  }
954  delete msg;
955 }
956 #endif // NAMD_CUDA
957 
958 #include "CudaPmeSolver.def.h"
bool doEnergy
Definition: PmeSolver.h:126
CProxy_PmePencilXMap zMap
Definition: CudaPmeSolver.h:21
void initialize(CudaPmeXInitMsg *msg)
void initialize(CudaPmeXYInitMsg *msg)
Definition: CudaPmeSolver.C:56
CProxy_PmePencilXYMap xyMap
Definition: CudaPmeSolver.h:22
CProxy_CudaPmePencilZ pmePencilZ
Definition: CudaPmeSolver.h:35
CProxy_PmePencilXMap xMap
Definition: CudaPmeSolver.h:36
void energyAndVirialDone()
Definition: CudaPmeSolver.C:47
CProxy_CudaPmePencilX pmePencilX
Definition: CudaPmeSolver.h:33
float2 * data
Definition: PmeSolver.h:123
void initializeDevice(InitDeviceMsg2 *msg)
CProxy_ComputePmeCUDADevice deviceProxy
Definition: CudaPmeSolver.h:44
void initialize(CudaPmeXInitMsg *msg)
void energyAndVirialDone()
int dataSize
Definition: PmeSolver.h:124
__thread cudaStream_t stream
#define PRIORITY_SIZE
Definition: Priorities.h:13
void initializeDevice(InitDeviceMsg *msg)
Definition: CudaPmeSolver.C:73
CProxy_PmePencilXMap zMap
Definition: CudaPmeSolver.h:38
void initializeDevice(InitDeviceMsg *msg)
int numStrayAtoms
Definition: PmeSolver.h:127
void NAMD_bug(const char *err_msg)
Definition: common.C:123
gridSize z
void initializeDevice(InitDeviceMsg2 *msg)
CProxy_CudaPmePencilZ pmePencilZ
Definition: CudaPmeSolver.h:20
cudaStream_t stream
Definition: CudaPmeSolver.h:52
void initialize(CudaPmeXYZInitMsg *msg)
Definition: CudaPmeSolver.C:17
CProxy_CudaPmePencilY pmePencilY
Definition: CudaPmeSolver.h:34
void initialize(CudaPmeXInitMsg *msg)
bool doVirial
Definition: PmeSolver.h:126
void initializeDevice(InitDeviceMsg *msg)
Definition: CudaPmeSolver.C:25
CProxy_CudaPmePencilXY pmePencilXY
Definition: CudaPmeSolver.h:19
CProxy_ComputePmeCUDAMgr mgrProxy
Definition: CudaPmeSolver.h:53
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
gridSize y
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
CProxy_PmePencilXMap yMap
Definition: CudaPmeSolver.h:37
gridSize x
Lattice lattice
Definition: PmeSolver.h:128