NAMD
ComputePmeCUDAMgr.C
Go to the documentation of this file.
1 #include <vector>
2 #include <numeric>
3 #include <algorithm>
4 #include "Node.h"
5 #include "PatchMap.h"
6 #include "WorkDistrib.h"
7 #include "Priorities.h"
8 
9 #include "CudaUtils.h"
10 
11 #include "SimParameters.h"
12 #include "CudaPmeSolverUtil.h"
13 
14 #include "ComputePmeCUDAMgr.h"
15 
16 #include "CudaPmeSolver.h"
17 #include "ComputePmeCUDA.h"
18 
19 #include "DeviceCUDA.h"
20 #ifdef NAMD_CUDA
21 #ifdef WIN32
22 #define __thread __declspec(thread)
23 #endif
24 extern __thread DeviceCUDA *deviceCUDA;
25 
26 void createStream(cudaStream_t& stream) {
27 #if CUDA_VERSION >= 5050
28  int leastPriority, greatestPriority;
29  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
30  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority));
31  // cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,leastPriority));
32 #else
33  cudaCheck(cudaStreamCreate(&stream));
34 #endif
35 }
36 
37 //
38 // CUDA implementation of atom storage
39 //
41 public:
42  CudaPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
44  if (atom != NULL) dealloc_((void *)atom);
45  if (atomIndex != NULL) dealloc_((void *)atomIndex);
46  if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
47  if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
48  }
49 private:
50 
51  // Allocate array of size bytes
52  void* alloc_(const int size) {
53  void* p;
54  cudaCheck(cudaMallocHost(&p, size));
55  return p;
56  }
57 
58  // Deallocate array
59  void dealloc_(void *p) {
60  cudaCheck(cudaFreeHost(p));
61  }
62 
63 };
64 
65 //
66 // CPU implementation of atom storage
67 //
69 public:
70  CpuPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
72  if (atom != NULL) dealloc_((void *)atom);
73  if (atomIndex != NULL) dealloc_((void *)atomIndex);
74  if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
75  if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
76  }
77 private:
78 
79  // Allocate array of size bytes
80  void* alloc_(const int size) {
81  return (void *)(new char[size]);
82  }
83 
84  // Deallocate array
85  void dealloc_(void *p) {
86  delete [] (char *)p;
87  }
88 
89 };
90 
92  for (int p=0;p < 10;p++) {
93  pencil[p] = NULL;
94  pencilCapacity[p] = 0;
95  }
96 }
97 PmeAtomFiler::PmeAtomFiler(CkMigrateMessage *m) {
98  for (int p=0;p < 10;p++) {
99  pencil[p] = NULL;
100  pencilCapacity[p] = 0;
101  }
102 }
104  for (int p=0;p < 10;p++) {
105  if (pencil[p] != NULL) delete [] pencil[p];
106  }
107 }
108 
109 //
110 // File atoms into PME pencils. Each atom can belong to maximum 9 pencils
111 // (oy, oz) = origin of the atoms
112 // (miny, minz) = grid minimum corner for this patch
113 // NOTE: This method can only be called locally from the same Pe
114 //
115 void PmeAtomFiler::fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
116  const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi) {
117 
118  // Make sure there's enough room in the pencil arrays
119  for (int p=0;p < 10;p++) {
120  if (pencil[p] != NULL && pencilCapacity[p] < numAtoms) {
121  delete [] pencil[p];
122  pencil[p] = NULL;
123  pencilCapacity[p] = 0;
124  }
125  if (pencil[p] == NULL) {
126  int newSize = (int)(numAtoms*1.5);
127  pencil[p] = new int[newSize];
128  pencilCapacity[p] = newSize;
129  }
130  pencilSize[p] = 0;
131  }
132 
133  const float recip11 = lattice.a_r().x;
134  const float recip22 = lattice.b_r().y;
135  const float recip33 = lattice.c_r().z;
136  const int order1 = pmeGrid.order - 1;
137  const int K1 = pmeGrid.K1;
138  const int K2 = pmeGrid.K2;
139  const int K3 = pmeGrid.K3;
140  const int yBlocks = pmeGrid.yBlocks;
141  const int zBlocks = pmeGrid.zBlocks;
142 
143  for (int i=0;i < numAtoms;i++) {
144  float frx, fry, frz;
145  // PmeRealSpaceCompute::calcGridCoord(atoms[i].uix, atoms[i].uiy, atoms[i].uiz,
146  // K1, K2, K3, frx, fry, frz);
147  PmeRealSpaceCompute::calcGridCoord(atoms[i].x, atoms[i].y, atoms[i].z, K1, K2, K3, frx, fry, frz);
148  // Charge is spread in the region [y0 ... y0+order-1] x [z0 ... z0+order-1]
149  int y0 = (int)fry;
150  int z0 = (int)frz;
151  if (y0 < 0 || y0 >= K2 || z0 < 0 || z0 >= K3) {
152  // Add to "Stray pencil" and skip to next atom
153  pencil[9][pencilSize[9]++] = i;
154  continue;
155  // fprintf(stderr, "%lf %lf %lf\n", atoms[i].x, atoms[i].y, atoms[i].z);
156  // NAMD_bug("PmeAtomFiler::fileAtoms, charge out of bounds");
157  }
158  // Calculate pencil index for the four corners of the order X order area
159  // The corners determine the pencil indices for this atom.
160  int occupied = 0;
161  int plist[4];
162 #pragma unroll
163  for (int j=0;j < 4;j++) {
164 
165  int py = getPencilIndexY(pmeGrid, (y0 + (j%2)*order1) % K2) - pencilIndexY;
166  if (py < ylo) py += yBlocks;
167  if (py > yhi) py -= yBlocks;
168 
169  int pz = getPencilIndexZ(pmeGrid, (z0 + (j/2)*order1) % K3) - pencilIndexZ;
170  if (pz < zlo) pz += zBlocks;
171  if (pz > zhi) pz -= zBlocks;
172 
173  if (py < ylo || py > yhi || pz < zlo || pz > zhi) {
174  // Add to "Stray pencil" and skip to next atom
175  pencil[9][pencilSize[9]++] = i;
176  goto breakjloop;
177  // fprintf(stderr, "py %d [%d ... %d] pz %d [%d ... %d]\n", pz, zlo, zhi);
178  // NAMD_bug("PmeAtomFiler::fileAtoms, py,pz outside bounds");
179  }
180  // p = 0,1,2,3,4,5,6,7,8 (maximum range)
181  plist[j] = (py-ylo) + (pz-zlo)*3;
182  }
183 
184 #pragma unroll
185  for (int j=0;j < 4;j++) {
186  int p = plist[j];
187  // pbit = 0, 2, 4, 8, 16, 32, 64, 128, 256
188  int pbit = (1 << p);
189  if (!(occupied & pbit)) {
190  pencil[p][pencilSize[p]++] = i;
191  occupied |= pbit;
192  }
193  }
194 
195 breakjloop:
196  continue;
197  }
198 
199 }
200 
201 //
202 // Class constructor
203 //
205  __sdag_init();
206  numDevices = 0;
207  numTotalPatches = 0;
208  numNodesContributed = 0;
209  numDevicesMax = 0;
210 }
211 
212 //
213 // Class constructor
214 //
216  __sdag_init();
217  NAMD_bug("ComputePmeCUDAMgr cannot be migrated");
218  numDevices = 0;
219  numTotalPatches = 0;
220  numNodesContributed = 0;
221  numDevicesMax = 0;
222 }
223 
224 //
225 // Class destructor
226 //
228  for (int i=0;i < extraDevices.size();i++) {
229  cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
230  cudaCheck(cudaStreamDestroy(extraDevices[i].stream));
231  }
232 }
233 
234 //
235 // Returns home pencil (homey, homez)
236 // Home pencil = pencil with most overlap with this patch
237 //
238 void ComputePmeCUDAMgr::getHomePencil(PatchID patchID, int& homey, int& homez) {
239  PatchMap *patchMap = PatchMap::Object();
240 
241  BigReal miny = patchMap->min_b(patchID);
242  BigReal maxy = patchMap->max_b(patchID);
243 
244  BigReal minz = patchMap->min_c(patchID);
245  BigReal maxz = patchMap->max_c(patchID);
246 
247  // Determine home pencil = pencil with most overlap
248 
249  // Calculate patch grid coordinates
250  int patch_y0 = floor((miny+0.5)*pmeGrid.K2);
251  int patch_y1 = floor((maxy+0.5)*pmeGrid.K2)-1;
252  int patch_z0 = floor((minz+0.5)*pmeGrid.K3);
253  int patch_z1 = floor((maxz+0.5)*pmeGrid.K3)-1;
254 
255  if (patch_y0 < 0 || patch_y1 >= pmeGrid.K2 || patch_z0 < 0 || patch_z1 >= pmeGrid.K3) {
256  NAMD_bug("ComputePmeCUDAMgr::getHomePencil, patch bounds are outside grid bounds");
257  }
258 
259  int maxOverlap = 0;
260  homey = -1;
261  homez = -1;
262  for (int iz=0;iz < pmeGrid.zBlocks;iz++) {
263  for (int iy=0;iy < pmeGrid.yBlocks;iy++) {
264  int pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1;
265  getPencilDim(pmeGrid, Perm_X_Y_Z, iy, iz,
266  pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1);
267 
268  if (pencil_y1 - pencil_y0 < pmeGrid.order || pencil_z1 - pencil_z0 < pmeGrid.order)
269  NAMD_bug("ComputePmeCUDAMgr::getHomePencil, pencil size must be >= PMEInterpOrder");
270 
271  int y0 = std::max(patch_y0, pencil_y0);
272  int y1 = std::min(patch_y1, pencil_y1);
273  int z0 = std::max(patch_z0, pencil_z0);
274  int z1 = std::min(patch_z1, pencil_z1);
275 
276  int overlap = (y1-y0 > 0 && z1-z0 > 0) ? (y1-y0)*(z1-z0) : -1;
277 
278  if (overlap > maxOverlap) {
279  maxOverlap = overlap;
280  homey = iy;
281  homez = iz;
282  }
283  }
284  }
285 
286  if (homey == -1 || homez == -1)
287  NAMD_bug("ComputePmeCUDAMgr::getHomePencil, home pencil not found");
288 }
289 
290 //
291 // Calculates maximum number of PME pencils
292 //
293 void ComputePmeCUDAMgr::restrictToMaxPMEPencils() {
294  PatchMap *patchMap = PatchMap::Object();
296  Lattice lattice = simParams->lattice;
297  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
298  BigReal sysdimc = lattice.c_r().unit() * lattice.c();
299  BigReal cutoff = simParams->cutoff;
300  BigReal patchdim = simParams->patchDimension;
301  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
302  BigReal marginc = 0.5 * ( patchdim - cutoff ) / sysdimc;
303  int numPatches = patchMap->numPatches();
304 
305  pmeGrid.xBlocks = std::min(pmeGrid.xBlocks, pmeGrid.K1);
306 
307  int pid = 0;
308  while (pid < numPatches) {
309  // Get home pencil
310  int homey, homez;
311  getHomePencil(pid, homey, homez);
312  // Y
313  {
314  BigReal miny = patchMap->min_b(pid);
315  BigReal maxy = patchMap->max_b(pid);
316  // min2 (max2) is smallest (largest) grid line for this patch
317  int min2 = ((int) floor(pmeGrid.K2 * (miny+0.5 - marginb)));
318  int max2 = ((int) floor(pmeGrid.K2 * (maxy+0.5 + marginb))) + (pmeGrid.order - 1);
319  // Restrict grid lines to [0 ... pmeGrid.K2-1]
320  if (min2 < 0) min2 += pmeGrid.K2;
321  if (max2 >= pmeGrid.K2) max2 -= pmeGrid.K2;
322  // Get pencil indices for the grid lines
323  int min2pi = getPencilIndexY(pmeGrid, min2);
324  int max2pi = getPencilIndexY(pmeGrid, max2);
325  // Distance from home pencil
326  int dmin2pi = homey - min2pi;
327  if (dmin2pi < 0) dmin2pi += pmeGrid.yBlocks;
328  if (dmin2pi < 0)
329  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin2pi");
330  // If distance is > 1, must decrease the number of y-pencils and try again
331  if (dmin2pi > 1) {
332  pmeGrid.yBlocks--;
333  if (pmeGrid.yBlocks <= 0) break;
334  continue;
335  }
336  int dmax2pi = max2pi - homey;
337  if (dmax2pi < 0) dmax2pi += pmeGrid.yBlocks;
338  if (dmax2pi < 0)
339  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax2pi");
340  // If distance is > 1, must decrease the number of y-pencils and try again
341  if (dmax2pi > 1) {
342  pmeGrid.yBlocks--;
343  if (pmeGrid.yBlocks <= 0) break;
344  continue;
345  }
346  }
347 
348  // Z
349  {
350  BigReal minz = patchMap->min_c(pid);
351  BigReal maxz = patchMap->max_c(pid);
352  // min3 (max3) is smallest (largest) grid line for this patch
353  int min3 = ((int) floor(pmeGrid.K3 * (minz+0.5 - marginc)));
354  int max3 = ((int) floor(pmeGrid.K3 * (maxz+0.5 + marginc))) + (pmeGrid.order - 1);
355  // Restrict grid lines to [0 ... pmeGrid.K3-1]
356  if (min3 < 0) min3 += pmeGrid.K3;
357  if (max3 >= pmeGrid.K3) max3 -= pmeGrid.K3;
358  // Get pencil indices for the grid lines
359  int min3pi = getPencilIndexZ(pmeGrid, min3);
360  int max3pi = getPencilIndexZ(pmeGrid, max3);
361  // Distance from home pencil
362  int dmin3pi = homez - min3pi;
363  if (dmin3pi < 0) dmin3pi += pmeGrid.zBlocks;
364  if (dmin3pi < 0)
365  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin3pi");
366  // If distance is > 1, must decrease the number of z-pencils and try again
367  if (dmin3pi > 1) {
368  pmeGrid.zBlocks--;
369  if (pmeGrid.zBlocks <= 0) break;
370  continue;
371  }
372  int dmax3pi = max3pi - homez;
373  if (dmax3pi < 0) dmax3pi += pmeGrid.zBlocks;
374  if (dmax3pi < 0)
375  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax3pi");
376  // If distance is > 1, must decrease the number of z-pencils and try again
377  if (dmax3pi > 1) {
378  pmeGrid.zBlocks--;
379  if (pmeGrid.zBlocks <= 0) break;
380  continue;
381  }
382  }
383 
384  pid++;
385  }
386 
387  // if (CkMyNode() == 0)
388  // fprintf(stderr, "pmeGrid.yBlocks %d pmeGrid.zBlocks %d\n", pmeGrid.yBlocks, pmeGrid.zBlocks);
389 
390  if (pmeGrid.xBlocks <= 0 || pmeGrid.yBlocks <= 0 || pmeGrid.zBlocks <= 0)
391  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, zero PME pencils found");
392 
393  if (pmeGrid.xBlocks > pmeGrid.K1 || pmeGrid.yBlocks > pmeGrid.K2|| pmeGrid.zBlocks > pmeGrid.K3)
394  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, unable to restrict number of PME pencils");
395 }
396 
397 //
398 // Sets up pencils. May be called multiple times.
399 //
401  SimParameters *simParams = Node::Object()->simParameters;
402 
403  pmeGrid.K1 = simParams->PMEGridSizeX;
404  pmeGrid.K2 = simParams->PMEGridSizeY;
405  pmeGrid.K3 = simParams->PMEGridSizeZ;
406  pmeGrid.order = simParams->PMEInterpOrder;
407  pmeGrid.dim2 = pmeGrid.K2;
408  pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
409 
410  // Count the total number of devices assuming all nodes have the same number as this node
411  // NOTE: This should be changed in the future to support heterogeneous nodes!!!
412  int numDevicesTmp = deviceCUDA->getNumDevice();
413 
414  int numDeviceTot = CkNumNodes() * numDevicesTmp;
415  // Use approximately 1/4th of the devices for PME
416  int numDeviceToUse = std::max(1, numDeviceTot/4);
417 
418  if (numDeviceToUse < 4) {
419  // 2D Slab
420  pmeGrid.yBlocks = 1;
421  pmeGrid.xBlocks = pmeGrid.zBlocks = numDeviceToUse;
422  } else {
423  // 1D Pencil
424  pmeGrid.yBlocks = (int)sqrt((double)numDeviceToUse);
425  pmeGrid.zBlocks = numDeviceToUse/pmeGrid.yBlocks;
426  pmeGrid.xBlocks = pmeGrid.zBlocks;
427  }
428 
429  if ( simParams->PMEPencilsX > 0 ) pmeGrid.xBlocks = simParams->PMEPencilsX;
430  if ( simParams->PMEPencilsY > 0 ) pmeGrid.yBlocks = simParams->PMEPencilsY;
431  if ( simParams->PMEPencilsZ > 0 ) pmeGrid.zBlocks = simParams->PMEPencilsZ;
432 
433  // Restrict number of pencils to the maximum number
434  restrictToMaxPMEPencils();
435 
436  // Fix pencil numbers if they don't make sense w.r.t. number of devices
437  if (pmeGrid.yBlocks == 1) {
438  // 2D Slab
439  if (pmeGrid.xBlocks > numDeviceTot) pmeGrid.xBlocks = numDeviceTot;
440  if (pmeGrid.zBlocks > numDeviceTot) pmeGrid.zBlocks = numDeviceTot;
441  } else {
442  // 1D Pencil
443  if (pmeGrid.yBlocks*pmeGrid.zBlocks > numDeviceTot ||
444  pmeGrid.xBlocks*pmeGrid.zBlocks > numDeviceTot ||
445  pmeGrid.xBlocks*pmeGrid.yBlocks > numDeviceTot) {
446  pmeGrid.yBlocks = std::min(pmeGrid.yBlocks, (int)sqrt((double)numDeviceTot));
447  pmeGrid.zBlocks = std::min(pmeGrid.zBlocks, numDeviceTot/pmeGrid.yBlocks);
448  }
449  pmeGrid.xBlocks = std::min(pmeGrid.yBlocks, pmeGrid.zBlocks);
450  }
451 
452  // Here (block1, block2, block3) define the size of charge grid pencil in each direction
453  pmeGrid.block1 = ( pmeGrid.K1 + pmeGrid.xBlocks - 1 ) / pmeGrid.xBlocks;
454  pmeGrid.block2 = ( pmeGrid.K2 + pmeGrid.yBlocks - 1 ) / pmeGrid.yBlocks;
455  pmeGrid.block3 = ( pmeGrid.K3 + pmeGrid.zBlocks - 1 ) / pmeGrid.zBlocks;
456 
457  // Determine type of FFT
458  if (pmeGrid.xBlocks == 1 && pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1) {
459  // Single block => 3D FFT
460  pmePencilType = 3;
461  } else if (pmeGrid.yBlocks == 1) {
462  // Blocks in all but y-dimension => 2D FFT
463  pmePencilType = 2;
464  } else {
465  // Blocks in all dimensions => 1D FFT
466  pmePencilType = 1;
467  }
468 
469  //--------------------------------------------------------------------------
470  // Map pencils into Pes
471  xPes.resize(pmeGrid.yBlocks*pmeGrid.zBlocks);
472 
473  if (pmePencilType == 1 || pmePencilType == 2) {
474  zPes.resize(pmeGrid.xBlocks*pmeGrid.yBlocks);
475  }
476  if (pmePencilType == 1) {
477  yPes.resize(pmeGrid.xBlocks*pmeGrid.zBlocks);
478  }
479 
480  // i % numDeviceTot = device index
481  // (i % numDeviceTot)/deviceCUDA->getNumDevice() = node index
482  // (i % CkNodeSize(node)) = pe displacement
483  for (int i=0;i < xPes.size();i++) {
484  int node = (i % numDeviceTot)/numDevicesTmp;
485  xPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
486  }
487  for (int i=0;i < yPes.size();i++) {
488  int node = (i % numDeviceTot)/numDevicesTmp;
489  yPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
490  }
491  for (int i=0;i < zPes.size();i++) {
492  int node = (i % numDeviceTot)/numDevicesTmp;
493  zPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
494  }
495 
496  // char peStr[256];
497  // char *p = peStr;
498  // p += sprintf(p, "%2d | xPes", CkMyPe());
499  // for (int i=0;i < xPes.size();i++)
500  // p += sprintf(p, " %d", xPes[i]);
501  // p += sprintf(p, " yPes");
502  // for (int i=0;i < yPes.size();i++)
503  // p += sprintf(p, " %d", yPes[i]);
504  // p += sprintf(p, " zPes");
505  // for (int i=0;i < zPes.size();i++)
506  // p += sprintf(p, " %d", zPes[i]);
507  // fprintf(stderr, "%s | %d %d\n",peStr, CkNodeFirst(CkMyNode()), CkNodeSize(CkMyNode()));
508 
509  //--------------------------------------------------------------------------
510  // Build global node list for x-pencils
511  nodeDeviceList.resize(xPes.size());
512  numDevices = 0;
513  for (int k=0;k < xPes.size();k++) {
514  nodeDeviceList[k].node = CkNodeOf(xPes[k]);
515  nodeDeviceList[k].device = -1;
516  if (nodeDeviceList[k].node == CkMyNode()) {
517  nodeDeviceList[k].device = numDevices++;
518  }
519  }
520 
521  ijPencilX.clear();
522  ijPencilY.clear();
523  ijPencilZ.clear();
524 
525  // Construct list of pencil coordinates (i,j) for each device held by this node
526  for (int k=0;k < xPes.size();k++) {
527  if (CkMyNode() == CkNodeOf(xPes[k])) {
528  IJ ij;
529  ij.i = k % pmeGrid.yBlocks;
530  ij.j = k / pmeGrid.yBlocks;
531  ijPencilX.push_back(ij);
532  }
533  }
534  if (ijPencilX.size() != numDevices)
535  NAMD_bug("ComputePmeCUDAMgr::setupPencils, error setting up x-pencils and devices");
536 
537  int numDevicesY = 0;
538  for (int k=0;k < yPes.size();k++) {
539  if (CkMyNode() == CkNodeOf(yPes[k])) {
540  IJ ij;
541  ij.i = k % pmeGrid.xBlocks;
542  ij.j = k / pmeGrid.xBlocks;
543  ijPencilY.push_back(ij);
544  numDevicesY++;
545  }
546  }
547 
548  int numDevicesZ = 0;
549  for (int k=0;k < zPes.size();k++) {
550  if (CkMyNode() == CkNodeOf(zPes[k])) {
551  IJ ij;
552  ij.i = k % pmeGrid.xBlocks;
553  ij.j = k / pmeGrid.xBlocks;
554  ijPencilZ.push_back(ij);
555  numDevicesZ++;
556  }
557  }
558 }
559 
560 //
561 // Returns true if PE "pe" is used in PME
562 //
564  for (int i=0;i < xPes.size();i++) {
565  if (pe == xPes[i]) return true;
566  }
567  return false;
568 }
569 
570 //
571 // Returns true if node "node" is used for PME
572 //
574  for (int i=0;i < nodeDeviceList.size();i++) {
575  if (nodeDeviceList[i].node == node) {
576  return true;
577  }
578  }
579  return false;
580 }
581 
582 //
583 // Returns true if device "deviceID" is used for PME
584 //
585 bool ComputePmeCUDAMgr::isPmeDevice(int deviceID) {
586  for (int i=0;i < nodeDeviceList.size();i++) {
587  if (deviceCUDA->getDeviceIDbyRank(nodeDeviceList[i].device % deviceCUDA->getNumDevice()) == deviceID) {
588  return true;
589  }
590  }
591  return false;
592 }
593 
594 //
595 // Initialize compute manager
596 // This gets called on one Pe on each node from Node::startup()
597 //
598 void ComputePmeCUDAMgr::initialize(CkQdMsg *msg) {
599  if (msg != NULL) delete msg;
600 
601  setupPencils();
602 
603  if ( ! CkMyNode() ) {
604  iout << iINFO << "PME using " << pmeGrid.xBlocks << " x " <<
605  pmeGrid.yBlocks << " x " << pmeGrid.zBlocks <<
606  " pencil grid for FFT and reciprocal sum.\n" << endi;
607  }
608 
609  // Initialize list that contains the number of home patches for each device on this manager
610  numHomePatchesList.resize(numDevices, 0);
611 
612  //--------------------------------------------------------------------------
613  // Create devices and atom filer
614  // numDevices = number of devices we'll be using, possibly different on each node
615  // Use root node to compute the maximum number of devices in use over all nodes
616  thisProxy[0].initializeDevicesAndAtomFiler(new NumDevicesMsg(numDevices));
617  //--------------------------------------------------------------------------
618 
619  if (CkMyNode() == 0) {
620 
621  if (pmePencilType == 3) {
622  // Single block => 3D FFT
623  CProxy_PmePencilXYZMap xyzMap = CProxy_PmePencilXYZMap::ckNew(xPes[0]);
624  CkArrayOptions xyzOpts(1);
625  xyzOpts.setMap(xyzMap);
626  xyzOpts.setAnytimeMigration(false);
627  xyzOpts.setStaticInsertion(true);
628  pmePencilXYZ = CProxy_CudaPmePencilXYZ::ckNew(xyzOpts);
629  pmePencilXYZ[0].initialize(new CudaPmeXYZInitMsg(pmeGrid));
630  thisProxy.recvPencils(pmePencilXYZ);
631  } else if (pmePencilType == 2) {
632  // Blocks in all but y-dimension => 2D FFT
633  CProxy_PmePencilXYMap xyMap = CProxy_PmePencilXYMap::ckNew(xPes);
634  CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
635  CkArrayOptions xyOpts(1, 1, pmeGrid.zBlocks);
636  CkArrayOptions zOpts(pmeGrid.xBlocks, 1, 1);
637  xyOpts.setMap(xyMap);
638  zOpts.setMap(zMap);
639  xyOpts.setAnytimeMigration(false);
640  zOpts.setAnytimeMigration(false);
641  xyOpts.setStaticInsertion(true);
642  zOpts.setStaticInsertion(true);
643  pmePencilXY = CProxy_CudaPmePencilXY::ckNew(xyOpts);
644  pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
645  // Send pencil proxies to other nodes
646  thisProxy.recvPencils(pmePencilXY, pmePencilZ);
647  pmePencilXY.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
648  pmePencilZ.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
649  } else {
650  // Blocks in all dimensions => 1D FFT
651  CProxy_PmePencilXMap xMap = CProxy_PmePencilXMap::ckNew(1, 2, pmeGrid.yBlocks, xPes);
652  CProxy_PmePencilXMap yMap = CProxy_PmePencilXMap::ckNew(0, 2, pmeGrid.xBlocks, yPes);
653  CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
654  CkArrayOptions xOpts(1, pmeGrid.yBlocks, pmeGrid.zBlocks);
655  CkArrayOptions yOpts(pmeGrid.xBlocks, 1, pmeGrid.zBlocks);
656  CkArrayOptions zOpts(pmeGrid.xBlocks, pmeGrid.yBlocks, 1);
657  xOpts.setMap(xMap);
658  yOpts.setMap(yMap);
659  zOpts.setMap(zMap);
660  xOpts.setAnytimeMigration(false);
661  yOpts.setAnytimeMigration(false);
662  zOpts.setAnytimeMigration(false);
663  xOpts.setStaticInsertion(true);
664  yOpts.setStaticInsertion(true);
665  zOpts.setStaticInsertion(true);
666  pmePencilX = CProxy_CudaPmePencilX::ckNew(xOpts);
667  pmePencilY = CProxy_CudaPmePencilY::ckNew(yOpts);
668  pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
669  // Send pencil proxies to other nodes
670  thisProxy.recvPencils(pmePencilX, pmePencilY, pmePencilZ);
671  pmePencilX.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
672  pmePencilY.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
673  pmePencilZ.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
674  }
675  }
676 
677 }
678 
680  if (CkMyNode() != 0)
681  NAMD_bug("ComputePmeCUDAMgr::createDevicesAndAtomFiler can only be called on root node");
682 
683  // Root node creates all device proxies
684  // NOTE: Only root node has numDevicesMax
685  RecvDeviceMsg* msg = new (numDevicesMax, PRIORITY_SIZE) RecvDeviceMsg();
686  msg->numDevicesMax = numDevicesMax;
687  for (int i=0;i < numDevicesMax;i++) {
688  CProxy_ComputePmeCUDADevice dev = CProxy_ComputePmeCUDADevice::ckNew();
689  memcpy(&msg->dev[i], &dev, sizeof(CProxy_ComputePmeCUDADevice));
690  }
691  thisProxy.recvDevices(msg);
692 
693  CProxy_PmeAtomFiler filer = CProxy_PmeAtomFiler::ckNew();
694  thisProxy.recvAtomFiler(filer);
695 
696 }
697 
698 void ComputePmeCUDAMgr::recvAtomFiler(CProxy_PmeAtomFiler filer) {
699  pmeAtomFiler = filer;
700 }
701 
703  numDevicesMax = msg->numDevicesMax;
704  if (numDevices > numDevicesMax)
705  NAMD_bug("ComputePmeCUDAMgr::recvDevices, numDevices > numDevicesMax");
706  deviceProxy.resize(numDevices);
707  for (int i=0;i < numDevices;i++) {
708  deviceProxy[i] = msg->dev[i];
709  }
710  delete msg;
711 }
712 
713 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXYZ xyz) {
714  pmePencilXYZ = xyz;
715 }
716 
717 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z) {
718  pmePencilXY = xy;
719  pmePencilZ = z;
720 }
721 
722 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z) {
723  pmePencilX = x;
724  pmePencilY = y;
725  pmePencilZ = z;
726 }
727 
728 //
729 // Initialize pencils on this node
730 // This gets called on one rank on each node
731 //
733  if (msg != NULL) delete msg;
734 
735  int numDevicesTmp = deviceCUDA->getNumDevice();
736 
737  // Initialize device proxies for real-space interfacing
738  for (int i=0;i < ijPencilX.size();i++) {
739  // NOTE: i is here the device ID
740  int deviceID = deviceCUDA->getDeviceIDbyRank(i % numDevicesTmp);
741  deviceProxy[i].ckLocalBranch()->initialize(pmeGrid, ijPencilX[i].i, ijPencilX[i].j,
742  deviceID, pmePencilType, thisProxy, pmeAtomFiler);
743  if (pmePencilType == 1) {
744  deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilX);
745  } else if (pmePencilType == 2) {
746  deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXY);
747  } else {
748  deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXYZ);
749  }
750  }
751 
752  // Use above initialized device proxies for the PME pencils that interface with real-space
753  for (int i=0;i < ijPencilX.size();i++) {
754  if (pmePencilType == 1) {
755  pmePencilX(0, ijPencilX[i].i, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
756  } else if (pmePencilType == 2) {
757  pmePencilXY(0, 0, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
758  } else {
759  pmePencilXYZ[0].initializeDevice(new InitDeviceMsg(deviceProxy[i]));
760  }
761  }
762 
763  // Create extra devices for Y and Z pencils if necessary
764  int n = std::max(ijPencilY.size(), ijPencilZ.size());
765  if (n > ijPencilX.size()) {
766  int nextra = n - ijPencilX.size();
767  extraDevices.resize(nextra);
768  for (int i=0;i < nextra;i++) {
769  extraDevices[i].deviceID = deviceCUDA->getDeviceIDbyRank((i + ijPencilX.size()) % numDevicesTmp);
770  cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
771  createStream(extraDevices[i].stream);
772  }
773  }
774 
775  // Initialize Y pencils
776  for (int i=0;i < ijPencilY.size();i++) {
777  int deviceID;
778  cudaStream_t stream;
779  if (i < ijPencilX.size()) {
780  deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
781  stream = deviceProxy[i].ckLocalBranch()->getStream();
782  } else {
783  deviceID = extraDevices[i-ijPencilX.size()].deviceID;
784  stream = extraDevices[i-ijPencilX.size()].stream;
785  }
786  pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy));
787  }
788 
789  // Initialize Z pencils
790  for (int i=0;i < ijPencilZ.size();i++) {
791  int deviceID;
792  cudaStream_t stream;
793  if (i < ijPencilX.size()) {
794  deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
795  stream = deviceProxy[i].ckLocalBranch()->getStream();
796  } else {
797  deviceID = extraDevices[i-ijPencilX.size()].deviceID;
798  stream = extraDevices[i-ijPencilX.size()].stream;
799  }
800  pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy));
801  }
802 
803 }
804 
805 //
806 // Activate (start) pencils
807 // This gets called on rank 0 Pe on each node
808 //
810  if (msg != NULL) delete msg;
811 
812  for (int device=0;device < numDevices;device++) {
813  deviceProxy[device].ckLocalBranch()->activate_pencils();
814  }
815 
816  for (int i=0;i < ijPencilY.size();i++) {
817  PmeStartMsg* pmeStartYMsg = new PmeStartMsg();
818  pmeStartYMsg->data = NULL;
819  pmeStartYMsg->dataSize = 0;
820  pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).start(pmeStartYMsg);
821  }
822 
823  for (int i=0;i < ijPencilZ.size();i++) {
824  PmeStartMsg* pmeStartZMsg = new PmeStartMsg();
825  pmeStartZMsg->data = NULL;
826  pmeStartZMsg->dataSize = 0;
827  pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).start(pmeStartZMsg);
828  }
829 
830 }
831 
832 //
833 // Returns node that contains x-pencil i,j
834 //
835 int ComputePmeCUDAMgr::getNode(int i, int j) {
836  if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
837  NAMD_bug("ComputePmeCUDAMgr::getNode, pencil index out of bounds");
838  int ind = i + j*pmeGrid.yBlocks;
839  return nodeDeviceList[ind].node;
840 }
841 
842 //
843 // Returns home node for a patch
844 //
846  int homey, homez;
847  getHomePencil(patchID, homey, homez);
848  return getNode(homey, homez);
849 }
850 
851 //
852 // Returns device index on this node that contains x-pencil i,j
853 //
854 int ComputePmeCUDAMgr::getDevice(int i, int j) {
855  if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
856  NAMD_bug("ComputePmeCUDAMgr::getDevice, pencil index out of bounds");
857  int ind = i + j*pmeGrid.yBlocks;
858  int device = nodeDeviceList[ind].device;
859  if (device == -1)
860  NAMD_bug("ComputePmeCUDAMgr::getDevice, no device found");
861  return device;
862 }
863 
864 //
865 // Returns device index on this node that contains y-pencil i,j
866 //
868  if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.zBlocks)
869  NAMD_bug("ComputePmeCUDAMgr::getDevicePencilY, pencil index out of bounds");
870  for (int device=0;device < ijPencilY.size();device++) {
871  if (ijPencilY[device].i == i && ijPencilY[device].j == j) return device;
872  }
873  char str[256];
874  sprintf(str, "ComputePmeCUDAMgr::getDevicePencilY, no device found at i %d j %d",i,j);
875  NAMD_bug(str);
876  return -1;
877 }
878 
879 //
880 // Returns device index on this node that contains z-pencil i,j
881 //
883  if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.yBlocks)
884  NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, pencil index out of bounds");
885  for (int device=0;device < ijPencilZ.size();device++) {
886  if (ijPencilZ[device].i == i && ijPencilZ[device].j == j) return device;
887  }
888  NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, no device found");
889  return -1;
890 }
891 
892 //
893 // Returns device ID on this node that contains x-pencil i,j
894 //
896  int device = getDevice(i, j);
897  return deviceProxy[device].ckLocalBranch()->getDeviceID();
898 }
899 
900 //
901 // Returns device ID on this node that contains y-pencil i,j
902 //
904  int device = getDevicePencilY(i, j);
905  return deviceProxy[device].ckLocalBranch()->getDeviceID();
906 }
907 
908 //
909 // Returns device ID on this node that contains z-pencil i,j
910 //
912  int device = getDevicePencilZ(i, j);
913  return deviceProxy[device].ckLocalBranch()->getDeviceID();
914 }
915 
916 //
917 // Skip this round of PME, call skip on all Z-pencils (this is needed to get the reductions submitted)
918 //
920  switch(pmePencilType) {
921  case 1:
922  pmePencilZ.skip();
923  break;
924  case 2:
925  pmePencilZ.skip();
926  break;
927  case 3:
928  pmePencilXYZ[0].skip();
929  break;
930  }
931 }
932 
934  int device = getDevice(msg->i, msg->j);
935  deviceProxy[device].ckLocalBranch()->recvAtoms(msg);
936 }
937 
939  // __sdag_init();
940  numHomePatches = 0;
941  forceCapacity = 0;
942  force = NULL;
943  pmeRealSpaceCompute = NULL;
944  streamCreated = false;
945  lock_numHomePatchesMerged = CmiCreateLock();
946  lock_numPencils = CmiCreateLock();
947  lock_numNeighborsRecv = CmiCreateLock();
948  lock_recvAtoms = CmiCreateLock();
949  numNeighborsExpected = 0;
950  numStrayAtoms = 0;
951  // Reset counters
952  numNeighborsRecv = 0;
953  numHomePatchesRecv = 0;
954  numHomePatchesMerged = 0;
955  atomI = 0;
956  forceI = 1;
957 }
958 
960  // __sdag_init();
961  numHomePatches = 0;
962  forceCapacity = 0;
963  force = NULL;
964  pmeRealSpaceCompute = NULL;
965  streamCreated = false;
966  lock_numHomePatchesMerged = CmiCreateLock();
967  lock_numPencils = CmiCreateLock();
968  lock_numNeighborsRecv = CmiCreateLock();
969  lock_recvAtoms = CmiCreateLock();
970  numNeighborsExpected = 0;
971  numStrayAtoms = 0;
972  // Reset counters
973  numNeighborsRecv = 0;
974  numHomePatchesRecv = 0;
975  numHomePatchesMerged = 0;
976  atomI = 0;
977  forceI = 1;
978 }
979 
981  if (streamCreated) {
982  cudaCheck(cudaSetDevice(deviceID));
983  cudaCheck(cudaStreamDestroy(stream));
984  }
985  for (int j=0;j < 2;j++)
986  for (int i=0;i < pmeAtomStorage[j].size();i++) {
987  if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
988  }
989  if (force != NULL) deallocate_host<CudaForce>(&force);
990  if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
991  CmiDestroyLock(lock_numHomePatchesMerged);
992  CmiDestroyLock(lock_numPencils);
993  CmiDestroyLock(lock_numNeighborsRecv);
994  CmiDestroyLock(lock_recvAtoms);
995 }
996 
997 void ComputePmeCUDADevice::initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
998  int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
999  CProxy_PmeAtomFiler pmeAtomFiler_in) {
1000 
1001  deviceID = deviceID_in;
1002  cudaCheck(cudaSetDevice(deviceID));
1003  pmePencilType = pmePencilType_in;
1004  pmeGrid = pmeGrid_in;
1005  pencilIndexY = pencilIndexY_in;
1006  pencilIndexZ = pencilIndexZ_in;
1007  mgrProxy = mgrProxy_in;
1008  pmeAtomFiler = pmeAtomFiler_in;
1009  // Size of the neighboring pencil grid, max 3x3
1010  yNBlocks = std::min(pmeGrid.yBlocks, 3);
1011  zNBlocks = std::min(pmeGrid.zBlocks, 3);
1012  // Local pencil is at y=0,z=0
1013  if (yNBlocks == 1) {
1014  ylo = 0;
1015  yhi = 0;
1016  } else if (yNBlocks == 2) {
1017  ylo = -1;
1018  yhi = 0;
1019  } else {
1020  ylo = -1;
1021  yhi = 1;
1022  }
1023  if (zNBlocks == 1) {
1024  zlo = 0;
1025  zhi = 0;
1026  } else if (zNBlocks == 2) {
1027  zlo = -1;
1028  zhi = 0;
1029  } else {
1030  zlo = -1;
1031  zhi = 1;
1032  }
1033 
1034  neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
1035  // neighborForcePencils.resize(yNBlocks*zNBlocks);
1036  for (int j=0;j < 2;j++)
1037  homePatchIndexList[j].resize(yNBlocks*zNBlocks);
1038  neighborPatchIndex.resize(yNBlocks*zNBlocks);
1039 
1040  pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
1041  for (int j=0;j < 2;j++) {
1042  pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
1043  for (int z=zlo;z <= zhi;z++) {
1044  for (int y=ylo;y <= yhi;y++) {
1045  int pp = y-ylo + (z-zlo)*yNBlocks;
1046  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1047  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1048  if (y == 0 && z == 0) {
1049  // Primary pencil
1050  pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
1051  } else {
1052  pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
1053  }
1054  pmeAtomStorageAllocatedHere[pp] = true;
1055  }
1056  }
1057  }
1058 
1059  // Create stream for this device
1060  createStream(stream);
1061  streamCreated = true;
1062  pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
1063  deviceID, stream);
1064 
1065 }
1066 
1068  return stream;
1069 }
1070 
1072  return deviceID;
1073 }
1074 
1075 CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy() {
1076  return mgrProxy;
1077 }
1078 
1079 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in) {
1080  if (pmePencilType != 3)
1081  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
1082  pmePencilXYZ = pmePencilXYZ_in;
1083 }
1084 
1085 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in) {
1086  if (pmePencilType != 2)
1087  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
1088  pmePencilXY = pmePencilXY_in;
1089 }
1090 
1091 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in) {
1092  if (pmePencilType != 1)
1093  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
1094  pmePencilX = pmePencilX_in;
1095 }
1096 
1098  if (pmePencilType == 1) {
1099  PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
1100  pmeStartXMsg->data = pmeRealSpaceCompute->getData();
1101  pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
1102  pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
1103  } else if (pmePencilType == 2) {
1104  PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
1105  pmeStartXMsg->data = pmeRealSpaceCompute->getData();
1106  pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
1107  pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
1108  } else if (pmePencilType == 3) {
1109  PmeStartMsg* pmeStartMsg = new PmeStartMsg();
1110  pmeStartMsg->data = pmeRealSpaceCompute->getData();
1111  pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
1112  pmePencilXYZ[0].start(pmeStartMsg);
1113  }
1114 }
1115 
1116 void ComputePmeCUDADevice::initializePatches(int numHomePatches_in) {
1117  numHomePatches = numHomePatches_in;
1118  for (int j=0;j < 2;j++)
1119  numPencils[j].resize(numHomePatches);
1120  for (int j=0;j < 2;j++)
1121  plList[j].resize(numHomePatches);
1122  for (int j=0;j < 2;j++)
1123  homePatchForceMsgs[j].resize(numHomePatches);
1124  // for (int j=0;j < 2;j++)
1125  // numHomeAtoms[j].resize(numHomePatches);
1126  // If we have home patches, register this pencil with the neighbors and with self
1127  if (numHomePatches > 0) {
1128  for (int z=zlo;z <= zhi;z++) {
1129  for (int y=ylo;y <= yhi;y++) {
1130  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1131  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1132  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1133  mgrProxy[node].registerNeighbor(yt, zt);
1134  }
1135  }
1136  }
1137 }
1138 
1140  CmiLock(lock_numHomePatchesMerged);
1141  numNeighborsExpected++;
1142  CmiUnlock(lock_numHomePatchesMerged);
1143 }
1144 
1145 //
1146 // Recevice atoms from patch and file them into pencils
1147 //
1149 
1150  PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
1151  // Store "virial" and "energy" flags
1152  doVirial = msg->doVirial;
1153  doEnergy = msg->doEnergy;
1154  // Store lattice
1155  lattice = msg->lattice;
1156 
1157  // Primary pencil index
1158  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1159  int p0 = 0;
1160  int pencilPatchIndex[9];
1161  int numStrayAtomsPatch = 0;
1162  if (pmePencilType == 3) {
1163  // 3D box => store atoms directly without index
1164  // NOTE: We don't check for stray atoms here!
1165  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
1166  } else {
1167 
1168  // File atoms
1169  pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
1170  pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
1171 
1172  // Loop through pencils and add atoms to pencil atom lists
1173  // NOTE: we only store to neighboring pencil if there are atoms to store
1174  int numAtomsCheck = 0;
1175  for (int p=0;p < 9;p++) {
1176 
1177  int y = (p % 3);
1178  int z = (p / 3);
1179 
1180  int pp = y + z*yNBlocks;
1181  int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
1182  if (pp == pp0) p0 = p;
1183  if (pp == pp0 || numAtoms > 0) {
1184  if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
1185  NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
1186  int* index = pmeAtomFilerPtr->getAtomIndex(p);
1187  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
1188  // Number of patches in this storage tells you how many home patches contributed and
1189  // homePatchIndex (pe) tells you which patch contributed
1190  numAtomsCheck += numAtoms;
1191  }
1192  }
1193 
1194  // Deal with stray atoms
1195  numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
1196  if (numStrayAtomsPatch > 0) {
1197  int* index = pmeAtomFilerPtr->getAtomIndex(9);
1198  CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
1199  for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
1200  int j = index[i];
1201  CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
1202  }
1203  }
1204 
1205  if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
1206  NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
1207  }
1208 
1209  // Create storage for home patch forces
1210  PmeForceMsg *forceMsg;
1211  if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
1212  // 3D FFT and compute resides on the same node => use zero-copy forces
1213  forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
1214  forceMsg->zeroCopy = true;
1215  } else {
1216  forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
1217  forceMsg->zeroCopy = false;
1218  }
1219  forceMsg->numAtoms = msg->numAtoms;
1220  forceMsg->pe = msg->pe;
1221  forceMsg->compute = msg->compute;
1222  forceMsg->numStrayAtoms = numStrayAtomsPatch;
1223 
1224  bool done = false;
1225  // ----------------------------- lock start ---------------------------
1226  // Only after writing has finished, we get homePatchIndex
1227  // This quarantees that for whatever thread that receives "done=true", writing has finished on
1228  // ALL threads.
1229  CmiLock(lock_recvAtoms);
1230  numStrayAtoms += numStrayAtomsPatch;
1231  // Secure homePatchIndex. All writes after this must be inside lock-region
1232  int homePatchIndex = numHomePatchesRecv;
1233  // Store primary pencil first
1234  plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
1235  if (pmePencilType != 3) {
1236  // Go back to through neighboring pencils and store "homePatchIndex"
1237  for (int p=0;p < 9;p++) {
1238 
1239  int y = (p % 3);
1240  int z = (p / 3);
1241 
1242  int pp = y + z*yNBlocks;
1243  int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
1244  if (pp != pp0 && numAtoms > 0) {
1245  homePatchIndexList[atomI][pp].push_back(homePatchIndex);
1246  // plList[0...numHomePatches-1] = for each home patch stores the location of pencils that are
1247  // sharing it
1248  // plList[homePatchIndex].size() tells the number of pencils that the home patch is shared with
1249  plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
1250  }
1251  }
1252  }
1253  homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
1254  // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
1255  // Set the number of pencils contributing to this home patch
1256  numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
1257  //
1258  numHomePatchesRecv++;
1259  if (numHomePatchesRecv == numHomePatches) {
1260  // Reset counter
1261  numHomePatchesRecv = 0;
1262  done = true;
1263  }
1264  CmiUnlock(lock_recvAtoms);
1265  // ----------------------------- lock end ---------------------------
1266 
1267  // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
1268 
1269  delete msg;
1270 
1271  if (done) {
1272  // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
1274  }
1275 }
1276 
1277 //
1278 // Loop through pencils and send atoms to neighboring nodes
1279 //
1281  for (int z=zlo;z <= zhi;z++) {
1282  for (int y=ylo;y <= yhi;y++) {
1283  // Only send to neighbors, not self
1284  if (y != 0 || z != 0) {
1285  // NOTE: Must send atomI -value since this will change in spreadCharge(), which might occur
1286  // before these sends have been performed
1287  thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
1288  }
1289  }
1290  }
1291  // Register primary pencil
1293 }
1294 
1295 void ComputePmeCUDADevice::sendAtomsToNeighbor(int y, int z, int atomIval) {
1296  // Pencil index
1297  int pp = y-ylo + (z-zlo)*yNBlocks;
1298  // This neighbor pencil is done, finish it up before accessing it
1299  pmeAtomStorage[atomIval][pp]->finish();
1300  // Compute destination neighbor pencil index (yt,zt)
1301  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1302  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1303  int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
1304  CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
1305  PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
1306  memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
1307  msgPencil->numAtoms = numAtoms;
1308  // Store destination pencil index
1309  msgPencil->y = yt;
1310  msgPencil->z = zt;
1311  // Store source pencil index
1312  msgPencil->srcY = pencilIndexY;
1313  msgPencil->srcZ = pencilIndexZ;
1314  // Store energy and virial flags
1315  msgPencil->doEnergy = doEnergy;
1316  msgPencil->doVirial = doVirial;
1317  // Store lattice
1318  msgPencil->lattice = lattice;
1319  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1320  mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
1321 }
1322 
1324  // Store into primary pencil
1325  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1326  // Compute pencil index relative to primary pencil
1327  int y = msg->srcY - pencilIndexY;
1328  if (y < ylo) y += pmeGrid.yBlocks;
1329  if (y > yhi) y -= pmeGrid.yBlocks;
1330  int z = msg->srcZ - pencilIndexZ;
1331  if (z < zlo) z += pmeGrid.zBlocks;
1332  if (z > zhi) z -= pmeGrid.zBlocks;
1333  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1334  NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
1335  }
1336  // Read energy and virial flags
1337  doEnergy = msg->doEnergy;
1338  doVirial = msg->doVirial;
1339  // Read lattice
1340  lattice = msg->lattice;
1341  // Pencil index where atoms came from
1342  int pp = y-ylo + (z-zlo)*yNBlocks;
1343  // Store atoms and mark down the patch index where these atoms were added
1344  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
1345 
1346  delete msg;
1347 
1349 }
1350 
1352  // Primary pencil
1353  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1354 
1355  bool done = false;
1356  // ----------------------------- lock start ---------------------------
1357  CmiLock(lock_numNeighborsRecv);
1358  numNeighborsRecv++;
1359  if (numNeighborsRecv == numNeighborsExpected) {
1360  // Reset counter
1361  numNeighborsRecv = 0;
1362  done = true;
1363  }
1364  CmiUnlock(lock_numNeighborsRecv);
1365  // ----------------------------- lock end ---------------------------
1366 
1367  if (done) {
1368  // Primary pencil has received all atoms and writing has finished => spread charge
1369  spreadCharge();
1370  }
1371 }
1372 
1374  // Spread charges in primary pencil
1375  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1376  // Primary pencil is done, finish it up before accessing it
1377  // (clearing is done in mergeForcesOnPatch)
1378  pmeAtomStorage[atomI][pp0]->finish();
1379  // Get the number of atoms and pointer to atoms
1380  int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
1381  CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
1382  // Flip atomI <-> forceI
1383  std::swap(atomI, forceI);
1384  // Re-allocate force buffer if needed
1385  reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
1386  // (already have the updated lattice)
1387  pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
1388  // Spread charge
1389  beforeWalltime = CmiWallTimer();
1390  pmeRealSpaceCompute->spreadCharge(lattice);
1391  // Send "charge grid ready to PME solver"
1392  PmeRunMsg *pmeRunMsg = new PmeRunMsg();
1393  pmeRunMsg->doVirial = doVirial;
1394  pmeRunMsg->doEnergy = doEnergy;
1395  pmeRunMsg->lattice = lattice;
1396  pmeRunMsg->numStrayAtoms = numStrayAtoms;
1397  // Reset stray atom counter
1398  numStrayAtoms = 0;
1399  switch(pmePencilType) {
1400  case 1:
1401  pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
1402  break;
1403  case 2:
1404  pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
1405  break;
1406  case 3:
1407  pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
1408  break;
1409  }
1410 }
1411 
1412 //
1413 // After PME solver is done, we return here
1414 //
1416  traceUserBracketEvent(CUDA_PME_SPREADCHARGE_EVENT, beforeWalltime, CmiWallTimer());
1417  beforeWalltime = CmiWallTimer();
1418  // (already have the updated lattice)
1419  pmeRealSpaceCompute->gatherForce(lattice, force);
1420  // Set callback that will call gatherForceDone() once gatherForce is done
1421  ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
1422  // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
1423  // gatherForceDone();
1424 }
1425 
1426 static inline void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param) {
1428  c->gatherForceDoneSubset(first, last);
1429 }
1430 
1432  for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
1433  bool done = false;
1434  // ----------------------------- lock start ---------------------------
1435  // NOTE: We use node-wide lock here for the entire numPencils[] array, while
1436  // we really would only need to each element but this would required
1437  // numHomePatches number of locks.
1438  if (pmePencilType != 3) CmiLock(lock_numPencils);
1439  numPencils[forceI][homePatchIndex]--;
1440  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1441  if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1442  // ----------------------------- lock end ---------------------------
1443  if (done) {
1444  // This home patch is done, launch force merging
1445  mergeForcesOnPatch(homePatchIndex);
1446  }
1447  }
1448 }
1449 
1451  // Primary pencil has the forces
1452 
1453  traceUserBracketEvent(CUDA_PME_GATHERFORCE_EVENT, beforeWalltime, CmiWallTimer());
1454 
1455  // Send forces to neighbors
1457 
1458 #if CMK_SMP && USE_CKLOOP
1459  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1460  if (useCkLoop >= 1) {
1461  CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
1462  } else
1463 #endif
1464 
1465  {
1466  // Loop through home patches and mark the primary pencil as "done"
1467  for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
1468  bool done = false;
1469  // ----------------------------- lock start ---------------------------
1470  // NOTE: We use node-wide lock here for the entire numPencils[] array, while
1471  // we really would only need to each element but this would required
1472  // numHomePatches number of locks.
1473  if (pmePencilType != 3) CmiLock(lock_numPencils);
1474  numPencils[forceI][homePatchIndex]--;
1475  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1476  if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1477  // ----------------------------- lock end ---------------------------
1478  if (done) {
1479  // This home patch is done, launch force merging
1480  thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1481  }
1482  }
1483  }
1484 
1485  // In case we have no home patches, clear the primary pencil storage here
1486  if (numHomePatches == 0) {
1487  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1488  pmeAtomStorage[forceI][pp0]->clear();
1489  }
1490 
1491 }
1492 
1493 //
1494 // After gatherForce is done, we end up here
1495 //
1497  // Primary pencil has the forces
1498  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1499  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1500  // Loop through neighboring pencils
1501  for (int z=zlo;z <= zhi;z++) {
1502  for (int y=ylo;y <= yhi;y++) {
1503  // Only send to neighbors, not self
1504  if (y != 0 || z != 0) {
1505  int pp = y-ylo + (z-zlo)*yNBlocks;
1506  int patchIndex = neighborPatchIndex[pp];
1507  int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
1508  int atomEnd = patchPos[patchIndex];
1509  int natom = atomEnd-atomStart;
1510  // copy forces
1512  msg->numAtoms = natom;
1513  memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
1514  // Calculate destination pencil index (dstY, dstZ) for this neighbor
1515  int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1516  int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1517  int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
1518  msg->y = dstY;
1519  msg->z = dstZ;
1520  // Store source pencil index
1521  msg->srcY = pencilIndexY;
1522  msg->srcZ = pencilIndexZ;
1523  mgrProxy[node].recvForcesFromNeighbor(msg);
1524  }
1525  }
1526  }
1527 }
1528 
1530 
1531  // Source pencil index
1532  int y = msg->srcY - pencilIndexY;
1533  if (y < ylo) y += pmeGrid.yBlocks;
1534  if (y > yhi) y -= pmeGrid.yBlocks;
1535  int z = msg->srcZ - pencilIndexZ;
1536  if (z < zlo) z += pmeGrid.zBlocks;
1537  if (z > zhi) z -= pmeGrid.zBlocks;
1538 
1539  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1540  NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
1541  }
1542 
1543  // Source pencil
1544  int pp = y-ylo + (z-zlo)*yNBlocks;
1545 
1546  // Store message (deleted in mergeForcesOnPatch)
1547  neighborForcePencilMsgs[pp] = msg;
1548 
1549  // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
1550  // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
1551  // neighborForcePencils[pp].numAtoms = msg->numAtoms;
1552  // neighborForcePencils[pp].y = msg->y;
1553  // neighborForcePencils[pp].z = msg->z;
1554  // neighborForcePencils[pp].srcY = msg->srcY;
1555  // neighborForcePencils[pp].srcZ = msg->srcZ;
1556  // delete msg;
1557 
1558  // numPatches = number of home patches this pencil has
1559  int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
1560  if (numPatches != homePatchIndexList[forceI][pp].size()) {
1561  NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
1562  }
1563  for (int i=0;i < numPatches;i++) {
1564  // this pencil contributed to home patch with index "homePatchIndex"
1565  int homePatchIndex = homePatchIndexList[forceI][pp][i];
1566  // ----------------------------- lock start ---------------------------
1567  // NOTE: We use node-wide lock here for the entire numPencils[] array, while
1568  // we really would only need to each element but this would required
1569  // numHomePatches number of locks.
1570  bool done = false;
1571  CmiLock(lock_numPencils);
1572  numPencils[forceI][homePatchIndex]--;
1573  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1574  CmiUnlock(lock_numPencils);
1575  // ----------------------------- lock end ---------------------------
1576  if (done) {
1577  // This home patch is done, launch force merging
1578  thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1579  }
1580  }
1581 
1582 }
1583 
1585  // We have all the forces for this patch => merge on a single Pe
1586 
1587  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1588 
1589  // Message that goes out to the compute
1590  PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
1591 
1592  if (pmePencilType == 3) {
1593  // 3D box => simple memory copy will do
1594  // Location of forces in the force[] array
1595  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1596  // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
1597  int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1598  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1599  int atomEnd = patchPos[pencilPatchIndex];
1600  int numAtoms = atomEnd-atomStart;
1601  if (forceMsg->zeroCopy) {
1602  // Zero-copy, just pass the pointer
1603  forceMsg->force = force+atomStart;
1604  } else {
1605  memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
1606  }
1607  } else {
1608 
1609  // Zero force array
1610  // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
1611  memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
1612 
1613  // Store forces from primary pencil
1614  {
1615  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1616  int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
1617  int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1618  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1619  int atomEnd = patchPos[pencilPatchIndex];
1620  int numAtoms = atomEnd-atomStart;
1621 
1622  // Copy in local forces that are stored in the force[] array
1623  for (int i=0;i < numAtoms;i++) {
1624  forceMsg->force[index[atomStart + i]] = force[atomStart + i];
1625  }
1626 
1627  }
1628 
1629  // Add forces from neighboring pencils
1630  for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
1631  int pp = plList[forceI][homePatchIndex][j].pp;
1632  int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
1633 
1634  int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
1635  int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
1636  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1637  int atomEnd = patchPos[pencilPatchIndex];
1638  int numAtoms = atomEnd-atomStart;
1639  CudaForce *dstForce = forceMsg->force;
1640  // CudaForce *srcForce = neighborForcePencils[pp].force;
1641  CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
1642 
1643  for (int i=0;i < numAtoms;i++) {
1644  dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
1645  dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
1646  dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
1647  }
1648 
1649  }
1650  }
1651 
1652  // Clear storage
1653  plList[forceI][homePatchIndex].clear();
1654 
1655  // ----------------------------- lock start ---------------------------
1656  // bool done = false;
1657  CmiLock(lock_numHomePatchesMerged);
1658  numHomePatchesMerged++;
1659  if (numHomePatchesMerged == numHomePatches) {
1660  // Reset counter
1661  numHomePatchesMerged = 0;
1662 
1663  // Delete messages
1664  for (int i=0;i < neighborForcePencilMsgs.size();i++) {
1665  if (neighborForcePencilMsgs[i] != NULL) {
1666  delete neighborForcePencilMsgs[i];
1667  neighborForcePencilMsgs[i] = NULL;
1668  }
1669  }
1670 
1671  // Done merging and sending forces => clear storage
1672  for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
1673  homePatchIndexList[forceI][pp].clear();
1674  for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
1675  pmeAtomStorage[forceI][pp]->clear();
1676 
1677  }
1678  CmiUnlock(lock_numHomePatchesMerged);
1679  // ----------------------------- lock end ---------------------------
1680 
1681  // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
1682  // this node will then rely the message to the Pe that originally sent the atoms
1683  int pe = forceMsg->pe;
1684  if (CkNodeOf(pe) != CkMyNode())
1685  thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
1686  else
1687  sendForcesToPatch(forceMsg);
1688 
1689 }
1690 
1692  // Now we're on the node that has Pe, hence "compute" -pointer is valid
1693  int pe = forceMsg->pe;
1694  ComputePmeCUDA *compute = forceMsg->compute;
1695 
1696  // Store message for use in ComputePmeCUDA, where it'll also be deleted.
1697  if (compute->storePmeForceMsg(forceMsg)) {
1698  // Enqueue on the pe that sent the atoms in the first place
1699  LocalWorkMsg *lmsg = compute->localWorkMsg;
1700  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
1701  wdProxy[pe].enqueuePme(lmsg);
1702  }
1703 }
1704 #endif // NAMD_CUDA
1705 
1706 #include "ComputePmeCUDAMgr.def.h"
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:19
int getHomeNode(PatchID patchID)
virtual void gatherForce(Lattice &lattice, CudaForce *force)=0
void sendAtomsToNeighbor(int y, int z, int atomIval)
int zBlocks
Definition: PmeBase.h:22
void recvForcesFromNeighbor(PmeForcePencilMsg *msg)
CProxy_ComputePmeCUDAMgr getMgrProxy()
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:107
void initialize(CkQdMsg *msg)
virial xy
ComputePmeCUDA * compute
CpuPmeAtomStorage(const bool useIndex)
Vector a_r() const
Definition: Lattice.h:268
int numStrayAtoms
Definition: PmeSolver.h:111
int dim3
Definition: PmeBase.h:19
int getDeviceIDPencilX(int i, int j)
static void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param)
void sendForcesToPatch(PmeForceMsg *forceMsg)
static PatchMap * Object()
Definition: PatchMap.h:27
void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void getHomePencil(PatchID patchID, int &homey, int &homez)
BigReal max_c(int pid) const
Definition: PatchMap.h:96
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
bool doVirial
Definition: PmeSolver.h:110
int K2
Definition: PmeBase.h:18
SimParameters * simParameters
Definition: Node.h:178
#define CUDA_PME_SPREADCHARGE_EVENT
Definition: DeviceCUDA.h:7
int K1
Definition: PmeBase.h:18
Vector c_r() const
Definition: Lattice.h:270
float z
Definition: NamdTypes.h:154
static __thread atom * atoms
static __thread unsigned int * plist
float x
Definition: NamdTypes.h:158
void recvAtomFiler(CProxy_PmeAtomFiler filer)
void recvAtoms(PmeAtomMsg *msg)
BigReal z
Definition: Vector.h:66
int block1
Definition: PmeBase.h:21
float x
Definition: NamdTypes.h:154
int getNumDevice()
Definition: DeviceCUDA.h:88
static int getPencilIndexY(const PmeGrid &pmeGrid, const int y)
Definition: PmeSolverUtil.h:20
void mergeForcesOnPatch(int homePatchIndex)
if(ComputeNonbondedUtil::goMethod==2)
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:29
#define iout
Definition: InfoStream.h:87
int block2
Definition: PmeBase.h:21
void swap(Array< T > &s, Array< T > &t)
Definition: MsmMap.h:319
void recvDevices(RecvDeviceMsg *msg)
Vector b_r() const
Definition: Lattice.h:269
int getDeviceIDPencilZ(int i, int j)
LocalWorkMsg *const localWorkMsg
Definition: Compute.h:46
BigReal min_b(int pid) const
Definition: PatchMap.h:93
__thread cudaStream_t stream
bool storePmeForceMsg(PmeForceMsg *msg)
void fileAtoms(const int numAtoms, const CudaAtom *atoms, Lattice &lattice, const PmeGrid &pmeGrid, const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi)
static double calcGridCoord(const double x, const double recip11, const int nfftx)
int getNumAtoms(int p)
CudaAtom * overflowAtom
int yBlocks
Definition: PmeBase.h:22
#define PRIORITY_SIZE
Definition: Priorities.h:13
void initialize(PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int getDevicePencilZ(int i, int j)
int block3
Definition: PmeBase.h:21
CudaAtom * atoms
void recvAtomsFromNeighbor(PmeAtomPencilMsg *msg)
gridSize z
int dataSize
Definition: PmeSolver.h:104
BigReal x
Definition: Vector.h:66
void initializePatches(int numHomePatches_in)
float z
Definition: NamdTypes.h:158
int PatchID
Definition: NamdTypes.h:182
void createStream(cudaStream_t &stream)
CudaPmeAtomStorage(const bool useIndex)
int getDevice(int i, int j)
int getDeviceIDPencilY(int i, int j)
BigReal max_b(int pid) const
Definition: PatchMap.h:94
void recvAtoms(PmeAtomMsg *msg)
float y
Definition: NamdTypes.h:154
#define simParams
Definition: Output.C:127
float * data
Definition: PmeSolver.h:103
int K3
Definition: PmeBase.h:18
Lattice lattice
Definition: PmeSolver.h:112
int numPatches(void) const
Definition: PatchMap.h:59
void activate_pencils(CkQdMsg *msg)
int getDeviceIDbyRank(int rank)
Definition: DeviceCUDA.h:108
BigReal y
Definition: Vector.h:66
Vector b() const
Definition: Lattice.h:253
bool isPmeNode(int node)
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
int getNode(int i, int j)
void initialize_pencils(CkQdMsg *msg)
virtual void spreadCharge(Lattice &lattice)=0
BigReal patchDimension
gridSize y
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
float y
Definition: NamdTypes.h:158
ComputePmeCUDA * compute
BigReal min_c(int pid) const
Definition: PatchMap.h:95
infostream & endi(infostream &s)
Definition: InfoStream.C:38
int getDevicePencilY(int i, int j)
int * getAtomIndex(int p)
int xBlocks
Definition: PmeBase.h:22
void recvPencils(CProxy_CudaPmePencilXYZ xyz)
CProxy_ComputePmeCUDADevice * dev
gridSize x
void gatherForceDoneSubset(int first, int last)
bool doEnergy
Definition: PmeSolver.h:110
static int getPencilIndexZ(const PmeGrid &pmeGrid, const int z)
Definition: PmeSolverUtil.h:25
virtual void copyAtoms(const int numAtoms, const CudaAtom *atoms)=0
bool isPmeDevice(int deviceID)
Vector unit(void) const
Definition: Vector.h:182
Vector c() const
Definition: Lattice.h:254
double BigReal
Definition: common.h:112
#define CUDA_PME_GATHERFORCE_EVENT
Definition: DeviceCUDA.h:8