NAMD
ComputePme.C
Go to the documentation of this file.
1 
7 #ifdef NAMD_FFTW
8 //#define MANUAL_DEBUG_FFTW3 1
9 #ifdef NAMD_FFTW_3
10 #include <fftw3.h>
11 #else
12 // fftw2 doesn't have these defined
13 #define fftwf_malloc fftw_malloc
14 #define fftwf_free fftw_free
15 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
16 #include <fftw.h>
17 #include <rfftw.h>
18 #else
19 #include <sfftw.h>
20 #include <srfftw.h>
21 #endif
22 #endif
23 #endif
24 
25 #include <vector>
26 #include <algorithm>
27 #include <deque>
28 using namespace std;
29 
30 #include "InfoStream.h"
31 #include "Node.h"
32 #include "PatchMap.h"
33 #include "PatchMap.inl"
34 #include "AtomMap.h"
35 #include "ComputePme.h"
36 #include "ComputePmeMgr.decl.h"
37 #include "PmeBase.inl"
38 #include "PmeRealSpace.h"
39 #include "PmeKSpace.h"
40 #include "ComputeNonbondedUtil.h"
41 #include "PatchMgr.h"
42 #include "Molecule.h"
43 #include "ReductionMgr.h"
44 #include "ComputeMgr.h"
45 #include "ComputeMgr.decl.h"
46 // #define DEBUGM
47 #define MIN_DEBUG_LEVEL 3
48 #include "Debug.h"
49 #include "SimParameters.h"
50 #include "WorkDistrib.h"
51 #include "varsizemsg.h"
52 #include "Random.h"
53 #include "ckhashtable.h"
54 #include "Priorities.h"
55 
56 #include "ComputeMoa.h"
57 #include "ComputeMoaMgr.decl.h"
58 
59 //#define USE_RANDOM_TOPO 1
60 
61 //#define USE_TOPO_SFC 1
62 //#define USE_CKLOOP 1
63 //#include "TopoManager.h"
64 
65 #include "DeviceCUDA.h"
66 #ifdef NAMD_CUDA
67 #include <cuda_runtime.h>
68 #include <cuda.h>
69 void cuda_errcheck(const char *msg);
70 #ifdef WIN32
71 #define __thread __declspec(thread)
72 #endif
73 extern __thread DeviceCUDA *deviceCUDA;
74 #endif
75 
76 #include "ComputePmeCUDAKernel.h"
77 
78 #ifndef SQRT_PI
79 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
80 #endif
81 
82 #if CMK_PERSISTENT_COMM
83 #define USE_PERSISTENT 1
84 #endif
85 
86 #if USE_PERSISTENT
87 #define Z_PERSIST 1
88 #define Y_PERSIST 1
89 #define X_PERSIST 1
90 #endif
91 
92 #if defined(NAMD_CUDA) && defined(MEM_OPT_VERSION)
93 #define USE_NODE_PAR_RECEIVE 1
94 #endif
95 
106 
108 
109 class PmeAckMsg : public CMessage_PmeAckMsg {
110 };
111 
112 class PmeGridMsg : public CMessage_PmeGridMsg {
113 public:
114 
116  int sequence;
117  int hasData;
119  int start;
120  int len;
121  int zlistlen;
122  int *zlist;
123  char *fgrid;
124  float *qgrid;
125  CkArrayIndex3D destElem;
126 };
127 
128 class PmeTransMsg : public CMessage_PmeTransMsg {
129 public:
130 
132  int sequence;
133  int hasData;
135  int x_start;
136  int nx;
137  float *qgrid;
138  CkArrayIndex3D destElem;
139 };
140 
141 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
142 public:
144  int *count;
145  CmiNodeLock lock;
146 };
147 
148 class PmeUntransMsg : public CMessage_PmeUntransMsg {
149 public:
150 
152  int y_start;
153  int ny;
154  float *qgrid;
155  CkArrayIndex3D destElem;
156 };
157 
158 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
159 public:
161  int *count;
162  CmiNodeLock lock;
163 };
164 
165 class PmeEvirMsg : public CMessage_PmeEvirMsg {
166 public:
168 };
169 
170 class PmePencilMap : public CBase_PmePencilMap {
171 public:
172  PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
173  : ia(i_a), ib(i_b), nb(n_b),
174  size(n), data(newcopyint(n,d)) {
175  }
176  virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
177  //Return an ``arrayHdl'', given some information about the array
178  return 0;
179  }
180  virtual int procNum(int, const CkArrayIndex &i) {
181  //Return the home processor number for this element of this array
182  return data[ i.data()[ia] * nb + i.data()[ib] ];
183  }
184  virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
185  int mype = CkMyPe();
186  for ( int i=0; i < size; ++i ) {
187  if ( data[i] == mype ) {
188  CkArrayIndex3D ai(0,0,0);
189  ai.data()[ia] = i / nb;
190  ai.data()[ib] = i % nb;
191  if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
192  if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
193  mgr->insertInitial(ai,msg);
194  msg = 0;
195  }
196  }
197  mgr->doneInserting();
198  if ( msg ) CkFreeMsg(msg);
199  }
200 private:
201  const int ia, ib, nb, size;
202  const int* const data;
203  static int* newcopyint(int n, int *d) {
204  int *newd = new int[n];
205  memcpy(newd, d, n*sizeof(int));
206  return newd;
207  }
208 };
209 
210 // use this idiom since messages don't have copy constructors
213  int xBlocks, yBlocks, zBlocks;
214  CProxy_PmeXPencil xPencil;
215  CProxy_PmeYPencil yPencil;
216  CProxy_PmeZPencil zPencil;
217  CProxy_ComputePmeMgr pmeProxy;
218  CProxy_NodePmeMgr pmeNodeProxy;
219  CProxy_PmePencilMap xm;
220  CProxy_PmePencilMap ym;
221  CProxy_PmePencilMap zm;
222 };
223 
224 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
225 public:
228 };
229 
230 
231 struct LocalPmeInfo {
232  int nx, x_start;
233  int ny_after_transpose, y_start_after_transpose;
234 };
235 
236 struct NodePmeInfo {
237  int npe, pe_start, real_node;
238 };
239 
240 
241 static int findRecipEvirPe() {
242  PatchMap *patchMap = PatchMap::Object();
243  {
244  int mype = CkMyPe();
245  if ( patchMap->numPatchesOnNode(mype) ) {
246  return mype;
247  }
248  }
249  {
250  int node = CmiMyNode();
251  int firstpe = CmiNodeFirst(node);
252  int nodeSize = CmiNodeSize(node);
253  int myrank = CkMyRank();
254  for ( int i=0; i<nodeSize; ++i ) {
255  int pe = firstpe + (myrank+i)%nodeSize;
256  if ( patchMap->numPatchesOnNode(pe) ) {
257  return pe;
258  }
259  }
260  }
261  {
262  int *pelist;
263  int nodeSize;
264  CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
265  int myrank;
266  for ( int i=0; i<nodeSize; ++i ) {
267  if ( pelist[i] == CkMyPe() ) myrank = i;
268  }
269  for ( int i=0; i<nodeSize; ++i ) {
270  int pe = pelist[(myrank+i)%nodeSize];
271  if ( patchMap->numPatchesOnNode(pe) ) {
272  return pe;
273  }
274  }
275  }
276  {
277  int mype = CkMyPe();
278  int npes = CkNumPes();
279  for ( int i=0; i<npes; ++i ) {
280  int pe = (mype+i)%npes;
281  if ( patchMap->numPatchesOnNode(pe) ) {
282  return pe;
283  }
284  }
285  }
286  NAMD_bug("findRecipEvirPe() failed!");
287  return -999; // should never happen
288 }
289 
290 
291 //Assigns gridPeMap and transPeMap to different set of processors.
292 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
293  int ncpus = CkNumPes();
294 
295  for ( int i=0; i<numGridPes; ++i ) {
296  gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
297  }
298  std::sort(gridPeMap,gridPeMap+numGridPes);
299  int firstTransPe = ncpus - numGridPes - numTransPes;
300  if ( firstTransPe < 0 ) {
301  firstTransPe = 0;
302  // 0 should be first in list, skip if possible
303  if ( ncpus > numTransPes ) firstTransPe = 1;
304  }
305  for ( int i=0; i<numTransPes; ++i ) {
306  transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
307  }
308  std::sort(transPeMap,transPeMap+numTransPes);
309 }
310 
311 #if USE_TOPOMAP
312 //Topology aware PME allocation
313 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
314  int nbpes=0);
315 #endif
316 
317 
318 int compare_bit_reversed(int a, int b) {
319  int d = a ^ b;
320  int c = 1;
321  if ( d ) while ( ! (d & c) ) {
322  c = c << 1;
323  }
324  return (a & c) - (b & c);
325 }
326 
327 inline bool less_than_bit_reversed(int a, int b) {
328  int d = a ^ b;
329  int c = 1;
330  if ( d ) while ( ! (d & c) ) {
331  c = c << 1;
332  }
333  return d && (b & c);
334 }
335 
337  inline bool operator() (int a, int b) const {
338  return less_than_bit_reversed(a,b);
339  }
340 };
341 
342 struct ijpair {
343  int i,j;
344  ijpair() {;}
345  ijpair(int I, int J) : i(I), j(J) {;}
346 };
347 
349  inline bool operator() (const ijpair &a, const ijpair &b) const {
350  return ( less_than_bit_reversed(a.i,b.i)
351  || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
352  }
353 };
354 
355 class ComputePmeMgr : public CBase_ComputePmeMgr, public ComputePmeUtil {
356 public:
357  friend class ComputePme;
358  friend class NodePmeMgr;
359  ComputePmeMgr();
360  ~ComputePmeMgr();
361 
362  void initialize(CkQdMsg*);
363  void initialize_pencils(CkQdMsg*);
364  void activate_pencils(CkQdMsg*);
365  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
366  void initialize_computes();
367 
368  void sendData(Lattice &, int sequence);
369  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
374  void sendPencils(Lattice &, int sequence);
375  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
376  void recvGrid(PmeGridMsg *);
377  void gridCalc1(void);
378  void sendTransBarrier(void);
379  void sendTransSubset(int first, int last);
380  void sendTrans(void);
381  void fwdSharedTrans(PmeTransMsg *);
382  void recvSharedTrans(PmeSharedTransMsg *);
383  void sendDataHelper(int);
384  void sendPencilsHelper(int);
385  void recvTrans(PmeTransMsg *);
386  void procTrans(PmeTransMsg *);
387  void gridCalc2(void);
388  #ifdef OPENATOM_VERSION
389  void gridCalc2Moa(void);
390  #endif // OPENATOM_VERSION
391  void gridCalc2R(void);
392  void fwdSharedUntrans(PmeUntransMsg *);
393  void recvSharedUntrans(PmeSharedUntransMsg *);
394  void sendUntrans(void);
395  void sendUntransSubset(int first, int last);
396  void recvUntrans(PmeUntransMsg *);
397  void procUntrans(PmeUntransMsg *);
398  void gridCalc3(void);
399  void sendUngrid(void);
400  void sendUngridSubset(int first, int last);
401  void recvUngrid(PmeGridMsg *);
402  void recvAck(PmeAckMsg *);
403  void copyResults(PmeGridMsg *);
404  void copyPencils(PmeGridMsg *);
405  void ungridCalc(void);
406  void recvRecipEvir(PmeEvirMsg *);
407  void addRecipEvirClient(void);
408  void submitReductions();
409 
410 #if 0 && USE_PERSISTENT
411  void setup_recvgrid_persistent();
412 #endif
413 
414  static CmiNodeLock fftw_plan_lock;
415  CmiNodeLock pmemgr_lock; // for accessing this object from other threads
416 
417 #ifdef NAMD_CUDA
418  float *a_data_host;
419  float *a_data_dev;
420  float *f_data_host;
421  float *f_data_dev;
424  static CmiNodeLock cuda_lock;
425  void chargeGridSubmitted(Lattice &lattice, int sequence);
426  cudaEvent_t end_charges;
427  cudaEvent_t *end_forces;
430  double charges_time;
431  double forces_time;
435  int this_pe;
436 
437  void cuda_submit_charges(Lattice &lattice, int sequence);
439  ComputePmeMgr *mgr; Lattice *lattice; int sequence;
440  };
441  static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
442  static bool cuda_busy;
443 
445  void sendChargeGridReady();
446 #endif
447  Lattice *saved_lattice; // saved by chargeGridSubmitted
448  int saved_sequence; // saved by chargeGridSubmitted
449  void pollChargeGridReady();
450  void pollForcesReady();
451  void recvChargeGridReady();
452  void chargeGridReady(Lattice &lattice, int sequence);
453 
455 
456 private:
457 
458 #if 0 && USE_PERSISTENT
459  PersistentHandle *recvGrid_handle;
460 #endif
461 
462  CProxy_ComputePmeMgr pmeProxy;
463  CProxy_ComputePmeMgr pmeProxyDir;
464  CProxy_NodePmeMgr pmeNodeProxy;
465  NodePmeMgr *nodePmeMgr;
466  ComputePmeMgr *masterPmeMgr;
467 
468  void addCompute(ComputePme *c) {
469  if ( ! pmeComputes.size() ) initialize_computes();
470  pmeComputes.add(c);
471  c->setMgr(this);
472  }
473 
474  ResizeArray<ComputePme*> heldComputes;
475  PmeGrid myGrid;
476  Lattice lattice;
477  PmeKSpace *myKSpace;
478  float *qgrid;
479  float *kgrid;
480 
481 #ifdef NAMD_FFTW
482 #ifdef NAMD_FFTW_3
483  fftwf_plan *forward_plan_x, *backward_plan_x;
484  fftwf_plan *forward_plan_yz, *backward_plan_yz;
485  fftwf_complex *work;
486 #else
487  fftw_plan forward_plan_x, backward_plan_x;
488  rfftwnd_plan forward_plan_yz, backward_plan_yz;
489  fftw_complex *work;
490 #endif
491 #else
492  float *work;
493 #endif
494 
495  int qsize, fsize, bsize;
496  int offload;
497  BigReal alchLambda; // set on each step in ComputePme::ungridForces()
498  BigReal alchLambda2; // set on each step in ComputePme::ungridForces()
499 
500  float **q_arr;
501  // q_list and q_count not used for offload
502  float **q_list;
503  int q_count;
504  char *f_arr;
505  char *fz_arr;
507  SubmitReduction *reduction;
508 
509  int noWorkCount;
510  int doWorkCount;
511  int ungridForcesCount;
512 
513 #ifdef NAMD_CUDA
514 #define NUM_STREAMS 1
515  cudaStream_t streams[NUM_STREAMS];
516  int stream;
517 
518  float **q_arr_dev;
519  float **v_arr_dev;
520  float *q_data_host;
521  float *q_data_dev;
522  float *v_data_dev;
523  int *ffz_host;
524  int *ffz_dev;
525  int q_data_size;
526  int ffz_size;
527 
528  int f_data_mgr_alloc;
529  float *f_data_mgr_host;
530  float *f_data_mgr_dev;
531  float **afn_host;
532  float **afn_dev;
533 
534  float *bspline_coeffs_dev;
535  float *bspline_dcoeffs_dev;
536 #endif
537  int recipEvirCount; // used in compute only
538  int recipEvirClients; // used in compute only
539  int recipEvirPe; // used in trans only
540 
541  LocalPmeInfo *localInfo;
542  NodePmeInfo *gridNodeInfo;
543  NodePmeInfo *transNodeInfo;
544  int qgrid_size;
545  int qgrid_start;
546  int qgrid_len;
547  int fgrid_start;
548  int fgrid_len;
549 
550  int numSources;
551  int numGridPes;
552  int numTransPes;
553  int numGridNodes;
554  int numTransNodes;
555  int numDestRecipPes;
556  int myGridPe, myGridNode;
557  int myTransPe, myTransNode;
558  int *gridPeMap;
559  int *transPeMap;
560  int *recipPeDest;
561  int *gridPeOrder;
562  int *gridNodeOrder;
563  int *transNodeOrder;
564  int grid_count;
565  int trans_count;
566  int untrans_count;
567  int ungrid_count;
568  PmeGridMsg **gridmsg_reuse;
569  PmeReduction recip_evir2[PME_MAX_EVALS];
570 
571  int compute_sequence; // set from patch computes, used for priorities
572  int grid_sequence; // set from grid messages, used for priorities
573  int useBarrier;
574  int sendTransBarrier_received;
575 
576  int usePencils;
577  int xBlocks, yBlocks, zBlocks;
578  CProxy_PmeXPencil xPencil;
579  CProxy_PmeYPencil yPencil;
580  CProxy_PmeZPencil zPencil;
581  char *pencilActive;
582  ijpair *activePencils;
583  int numPencilsActive;
584  int strayChargeErrors;
585 };
586 
588  return mgr->pmeComputes ;
589 }
590 
591  CmiNodeLock ComputePmeMgr::fftw_plan_lock;
592 #ifdef NAMD_CUDA
593  CmiNodeLock ComputePmeMgr::cuda_lock;
594  std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
596 #endif
597 
598 int isPmeProcessor(int p){
600  if (simParams->usePMECUDA) {
601  return 0;
602  } else {
603  return pencilPMEProcessors[p];
604  }
605 }
606 
607 class NodePmeMgr : public CBase_NodePmeMgr {
608 public:
609  friend class ComputePmeMgr;
610  friend class ComputePme;
611  NodePmeMgr();
612  ~NodePmeMgr();
613  void initialize();
614  void sendDataHelper(int);
615  void sendPencilsHelper(int);
616  void recvTrans(PmeTransMsg *);
617  void recvUntrans(PmeUntransMsg *);
618  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
619  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
620  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
621  void recvXTrans(PmeTransMsg *);
622  void recvYTrans(PmeTransMsg *);
623  void recvYUntrans(PmeUntransMsg *);
624  void recvZGrid(PmeGridMsg *);
625  void recvZUntrans(PmeUntransMsg *);
626 
627  void recvUngrid(PmeGridMsg *);
628 
629  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
630  xm=_xm; ym=_ym; zm=_zm;
631  }
632  CProxy_PmePencilMap xm;
633  CProxy_PmePencilMap ym;
634  CProxy_PmePencilMap zm;
635 
636 private:
637  CProxy_ComputePmeMgr mgrProxy;
638  ComputePmeMgr *mgrObject;
639  ComputePmeMgr **mgrObjects;
640 #ifdef NAMD_CUDA
641  ComputePmeMgr *masterPmeMgr;
642  int master_pe;
643 #endif
644  CProxy_PmeXPencil xPencil;
645  CProxy_PmeYPencil yPencil;
646  CProxy_PmeZPencil zPencil;
647  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
648  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
649  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
650 
651 #ifdef NAMD_CUDA
652  cudaEvent_t end_charge_memset;
653  cudaEvent_t end_all_pme_kernels;
654  cudaEvent_t end_potential_memcpy;
655 #endif
656 };
657 
659  mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
660 }
661 
663  delete [] mgrObjects;
664 }
665 
667  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
668  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
669  if ( CkMyRank() == 0 ) {
670  mgrProxy = proxy;
671  mgrObject = proxy.ckLocalBranch();
672  }
673 }
674 
676  mgrObject->fwdSharedTrans(msg);
677 }
678 
680  mgrObject->fwdSharedUntrans(msg);
681 }
682 
684 #ifdef NAMD_CUDA
685  masterPmeMgr->recvUngrid(msg);
686 #else
687  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
688 #endif
689 }
690 
691 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
692 {
694  xPencilObj.put(idx)=obj;
696 }
697 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
698 {
700  yPencilObj.put(idx)=obj;
702 }
703 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
704 {
706  zPencilObj.put(idx)=obj;
708 }
709 
710 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
711  pmeProxyDir(thisgroup) {
712 
713  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
714  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
715  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
716 
717  pmeNodeProxy.ckLocalBranch()->initialize();
718 
719  if ( CmiMyRank() == 0 ) {
720  fftw_plan_lock = CmiCreateLock();
721  }
722  pmemgr_lock = CmiCreateLock();
723 
724  myKSpace = 0;
725  kgrid = 0;
726  work = 0;
727  grid_count = 0;
728  trans_count = 0;
729  untrans_count = 0;
730  ungrid_count = 0;
731  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
732  useBarrier = 0;
733  sendTransBarrier_received = 0;
734  usePencils = 0;
735 
736 #ifdef NAMD_CUDA
737  // offload has not been set so this happens on every run
738  if ( CmiMyRank() == 0 ) {
739  cuda_lock = CmiCreateLock();
740  }
741 
742 #if CUDA_VERSION >= 5050
743  int leastPriority, greatestPriority;
744  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
745  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
746  //if ( CkMyNode() == 0 ) {
747  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
748  //}
749 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
750 #else
751 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
752 #endif
753 
754  stream = 0;
755  for ( int i=0; i<NUM_STREAMS; ++i ) {
756 #if 1
757  CUDA_STREAM_CREATE(&streams[i]);
758  cuda_errcheck("cudaStreamCreate");
759 #else
760  streams[i] = 0; // XXXX Testing!!!
761 #endif
762  }
763 
764  this_pe = CkMyPe();
765 
766  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
767  end_forces = 0;
769  check_forces_count = 0;
771 
772  cuda_atoms_count = 0;
773  cuda_atoms_alloc = 0;
774 
775  f_data_mgr_alloc = 0;
776  f_data_mgr_host = 0;
777  f_data_mgr_dev = 0;
778  afn_host = 0;
779  afn_dev = 0;
780 
781 #define CUDA_EVENT_ID_PME_CHARGES 80
782 #define CUDA_EVENT_ID_PME_FORCES 81
783 #define CUDA_EVENT_ID_PME_TICK 82
784 #define CUDA_EVENT_ID_PME_COPY 83
785 #define CUDA_EVENT_ID_PME_KERNEL 84
786  if ( 0 == CkMyPe() ) {
787  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
788  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
789  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
790  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
791  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
792  }
793 #endif
794  recipEvirCount = 0;
795  recipEvirClients = 0;
796  recipEvirPe = -999;
797 }
798 
799 
801  CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
802  xPencil = x; yPencil = y; zPencil = z;
803 
804  if(CmiMyRank()==0)
805  {
806  pmeNodeProxy.ckLocalBranch()->xPencil=x;
807  pmeNodeProxy.ckLocalBranch()->yPencil=y;
808  pmeNodeProxy.ckLocalBranch()->zPencil=z;
809  }
810 }
811 
812 #if USE_TOPO_SFC
813  struct Coord
814  {
815  int x, y, z;
816  Coord(): x(0), y(0), z(0) {}
817  Coord(int a, int b, int c): x(a), y(b), z(c) {}
818  };
819  extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
820 
821  void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
822  {
823  SortableResizeArray<int> newprocs(procs.size());
824  int num = 0;
825  for (int i=0; i<result.size(); i++) {
826  Coord &c = result[i];
827  for (int j=0; j<procs.size(); j++) {
828  int pe = procs[j];
829  int x,y,z,t;
830  tmgr.rankToCoordinates(pe, x, y, z, t);
831  if (x==c.x && y==c.y && z==c.z)
832  newprocs[num++] = pe;
833  }
834  }
835  CmiAssert(newprocs.size() == procs.size());
836  procs = newprocs;
837  }
838 
839  int find_level_grid(int x)
840  {
841  int a = sqrt(x);
842  int b;
843  for (; a>0; a--) {
844  if (x%a == 0) break;
845  }
846  if (a==1) a = x;
847  b = x/a;
848  //return a>b?a:b;
849  return b;
850  }
851  CmiNodeLock tmgr_lock;
852 #endif
853 
854 void Pme_init()
855 {
856 #if USE_TOPO_SFC
857  if (CkMyRank() == 0)
858  tmgr_lock = CmiCreateLock();
859 #endif
860 }
861 
862 void ComputePmeMgr::initialize(CkQdMsg *msg) {
863  delete msg;
864 
865  localInfo = new LocalPmeInfo[CkNumPes()];
866  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
867  transNodeInfo = new NodePmeInfo[CkNumNodes()];
868  gridPeMap = new int[CkNumPes()];
869  transPeMap = new int[CkNumPes()];
870  recipPeDest = new int[CkNumPes()];
871  gridPeOrder = new int[CkNumPes()];
872  gridNodeOrder = new int[CkNumNodes()];
873  transNodeOrder = new int[CkNumNodes()];
874 
875  if (CkMyRank() == 0) {
876  pencilPMEProcessors = new char [CkNumPes()];
877  memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
878  }
879 
881  PatchMap *patchMap = PatchMap::Object();
882 
883  offload = simParams->PMEOffload;
884 #ifdef NAMD_CUDA
885  if ( offload && ! deviceCUDA->one_device_per_node() ) {
886  NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
887  }
888  if ( offload ) {
889  int dev;
890  cudaGetDevice(&dev);
891  cuda_errcheck("in cudaGetDevice");
892  if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
893  cudaDeviceProp deviceProp;
894  cudaGetDeviceProperties(&deviceProp, dev);
895  cuda_errcheck("in cudaGetDeviceProperties");
896  if ( deviceProp.major < 2 )
897  NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
898  }
899 #endif
900 
901  alchLambda = -1.; // illegal value to catch if not updated
902  alchLambda2 = -1.;
903  useBarrier = simParams->PMEBarrier;
904 
905  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
906  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
907  else {
908  int nrps = simParams->PMEProcessors;
909  if ( nrps <= 0 ) nrps = CkNumPes();
910  if ( nrps > CkNumPes() ) nrps = CkNumPes();
911  int dimx = simParams->PMEGridSizeX;
912  int dimy = simParams->PMEGridSizeY;
913  int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
914  if ( maxslabs > nrps ) maxslabs = nrps;
915  int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
916  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
917  if ( maxpencils > nrps ) maxpencils = nrps;
918  if ( maxpencils > 3 * maxslabs ) usePencils = 1;
919  else usePencils = 0;
920  }
921 
922  if ( usePencils ) {
923  int nrps = simParams->PMEProcessors;
924  if ( nrps <= 0 ) nrps = CkNumPes();
925  if ( nrps > CkNumPes() ) nrps = CkNumPes();
926  if ( simParams->PMEPencils > 1 &&
927  simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
928  xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
929  } else {
930  int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
931  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
932  if ( nb2 > nrps ) nb2 = nrps;
933  if ( nb2 < 1 ) nb2 = 1;
934  int nb = (int) sqrt((float)nb2);
935  if ( nb < 1 ) nb = 1;
936  xBlocks = zBlocks = nb;
937  yBlocks = nb2 / nb;
938  }
939 
940  if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
941  if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
942  if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
943 
944  int dimx = simParams->PMEGridSizeX;
945  int bx = 1 + ( dimx - 1 ) / xBlocks;
946  xBlocks = 1 + ( dimx - 1 ) / bx;
947 
948  int dimy = simParams->PMEGridSizeY;
949  int by = 1 + ( dimy - 1 ) / yBlocks;
950  yBlocks = 1 + ( dimy - 1 ) / by;
951 
952  int dimz = simParams->PMEGridSizeZ / 2 + 1; // complex
953  int bz = 1 + ( dimz - 1 ) / zBlocks;
954  zBlocks = 1 + ( dimz - 1 ) / bz;
955 
956  if ( xBlocks * yBlocks > CkNumPes() ) {
957  NAMD_die("PME pencils xBlocks * yBlocks > numPes");
958  }
959  if ( xBlocks * zBlocks > CkNumPes() ) {
960  NAMD_die("PME pencils xBlocks * zBlocks > numPes");
961  }
962  if ( yBlocks * zBlocks > CkNumPes() ) {
963  NAMD_die("PME pencils yBlocks * zBlocks > numPes");
964  }
965 
966  if ( ! CkMyPe() ) {
967  iout << iINFO << "PME using " << xBlocks << " x " <<
968  yBlocks << " x " << zBlocks <<
969  " pencil grid for FFT and reciprocal sum.\n" << endi;
970  }
971  } else { // usePencils
972 
973  { // decide how many pes to use for reciprocal sum
974 
975  // rules based on work available
976  int minslices = simParams->PMEMinSlices;
977  int dimx = simParams->PMEGridSizeX;
978  int nrpx = ( dimx + minslices - 1 ) / minslices;
979  int dimy = simParams->PMEGridSizeY;
980  int nrpy = ( dimy + minslices - 1 ) / minslices;
981 
982  // rules based on processors available
983  int nrpp = CkNumPes();
984  // if ( nrpp > 32 ) nrpp = 32; // cap to limit messages
985  if ( nrpp < nrpx ) nrpx = nrpp;
986  if ( nrpp < nrpy ) nrpy = nrpp;
987 
988  // user override
989  int nrps = simParams->PMEProcessors;
990  if ( nrps > CkNumPes() ) nrps = CkNumPes();
991  if ( nrps > 0 ) nrpx = nrps;
992  if ( nrps > 0 ) nrpy = nrps;
993 
994  // make sure there aren't any totally empty processors
995  int bx = ( dimx + nrpx - 1 ) / nrpx;
996  nrpx = ( dimx + bx - 1 ) / bx;
997  int by = ( dimy + nrpy - 1 ) / nrpy;
998  nrpy = ( dimy + by - 1 ) / by;
999  if ( bx != ( dimx + nrpx - 1 ) / nrpx )
1000  NAMD_bug("Error in selecting number of PME processors.");
1001  if ( by != ( dimy + nrpy - 1 ) / nrpy )
1002  NAMD_bug("Error in selecting number of PME processors.");
1003 
1004  numGridPes = nrpx;
1005  numTransPes = nrpy;
1006  }
1007  if ( ! CkMyPe() ) {
1008  iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
1009  " processors for FFT and reciprocal sum.\n" << endi;
1010  }
1011 
1012  int sum_npes = numTransPes + numGridPes;
1013  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
1014 
1015 #if 0 // USE_TOPOMAP
1016  /* This code is being disabled permanently for slab PME on Blue Gene machines */
1017  PatchMap * pmap = PatchMap::Object();
1018 
1019  int patch_pes = pmap->numNodesWithPatches();
1020  TopoManager tmgr;
1021  if(tmgr.hasMultipleProcsPerNode())
1022  patch_pes *= 2;
1023 
1024  bool done = false;
1025  if(CkNumPes() > 2*sum_npes + patch_pes) {
1026  done = generateBGLORBPmePeList(transPeMap, numTransPes);
1027  done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
1028  }
1029  else
1030  if(CkNumPes() > 2 *max_npes + patch_pes) {
1031  done = generateBGLORBPmePeList(transPeMap, max_npes);
1032  gridPeMap = transPeMap;
1033  }
1034 
1035  if (!done)
1036 #endif
1037  {
1038  //generatePmePeList(transPeMap, max_npes);
1039  //gridPeMap = transPeMap;
1040  generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
1041  }
1042 
1043  if ( ! CkMyPe() ) {
1044  iout << iINFO << "PME GRID LOCATIONS:";
1045  int i;
1046  for ( i=0; i<numGridPes && i<10; ++i ) {
1047  iout << " " << gridPeMap[i];
1048  }
1049  if ( i < numGridPes ) iout << " ...";
1050  iout << "\n" << endi;
1051  iout << iINFO << "PME TRANS LOCATIONS:";
1052  for ( i=0; i<numTransPes && i<10; ++i ) {
1053  iout << " " << transPeMap[i];
1054  }
1055  if ( i < numTransPes ) iout << " ...";
1056  iout << "\n" << endi;
1057  }
1058 
1059  // sort based on nodes and physical nodes
1060  std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
1061 
1062  myGridPe = -1;
1063  myGridNode = -1;
1064  int i = 0;
1065  int node = -1;
1066  int real_node = -1;
1067  for ( i=0; i<numGridPes; ++i ) {
1068  if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
1069  if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
1070  int real_node_i = CkNodeOf(gridPeMap[i]);
1071  if ( real_node_i == real_node ) {
1072  gridNodeInfo[node].npe += 1;
1073  } else {
1074  real_node = real_node_i;
1075  ++node;
1076  gridNodeInfo[node].real_node = real_node;
1077  gridNodeInfo[node].pe_start = i;
1078  gridNodeInfo[node].npe = 1;
1079  }
1080  if ( CkMyNode() == real_node_i ) myGridNode = node;
1081  }
1082  numGridNodes = node + 1;
1083  myTransPe = -1;
1084  myTransNode = -1;
1085  node = -1;
1086  real_node = -1;
1087  for ( i=0; i<numTransPes; ++i ) {
1088  if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
1089  if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
1090  int real_node_i = CkNodeOf(transPeMap[i]);
1091  if ( real_node_i == real_node ) {
1092  transNodeInfo[node].npe += 1;
1093  } else {
1094  real_node = real_node_i;
1095  ++node;
1096  transNodeInfo[node].real_node = real_node;
1097  transNodeInfo[node].pe_start = i;
1098  transNodeInfo[node].npe = 1;
1099  }
1100  if ( CkMyNode() == real_node_i ) myTransNode = node;
1101  }
1102  numTransNodes = node + 1;
1103 
1104  if ( ! CkMyPe() ) {
1105  iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
1106  << numTransNodes << " TRANS NODES\n" << endi;
1107  }
1108 
1109  { // generate random orderings for grid and trans messages
1110  int i;
1111  for ( i = 0; i < numGridPes; ++i ) {
1112  gridPeOrder[i] = i;
1113  }
1114  Random rand(CkMyPe());
1115  if ( myGridPe < 0 ) {
1116  rand.reorder(gridPeOrder,numGridPes);
1117  } else { // self last
1118  gridPeOrder[myGridPe] = numGridPes-1;
1119  gridPeOrder[numGridPes-1] = myGridPe;
1120  rand.reorder(gridPeOrder,numGridPes-1);
1121  }
1122  for ( i = 0; i < numGridNodes; ++i ) {
1123  gridNodeOrder[i] = i;
1124  }
1125  if ( myGridNode < 0 ) {
1126  rand.reorder(gridNodeOrder,numGridNodes);
1127  } else { // self last
1128  gridNodeOrder[myGridNode] = numGridNodes-1;
1129  gridNodeOrder[numGridNodes-1] = myGridNode;
1130  rand.reorder(gridNodeOrder,numGridNodes-1);
1131  }
1132  for ( i = 0; i < numTransNodes; ++i ) {
1133  transNodeOrder[i] = i;
1134  }
1135  if ( myTransNode < 0 ) {
1136  rand.reorder(transNodeOrder,numTransNodes);
1137  } else { // self last
1138  transNodeOrder[myTransNode] = numTransNodes-1;
1139  transNodeOrder[numTransNodes-1] = myTransNode;
1140  rand.reorder(transNodeOrder,numTransNodes-1);
1141  }
1142  }
1143 
1144  } // ! usePencils
1145 
1146  myGrid.K1 = simParams->PMEGridSizeX;
1147  myGrid.K2 = simParams->PMEGridSizeY;
1148  myGrid.K3 = simParams->PMEGridSizeZ;
1149  myGrid.order = simParams->PMEInterpOrder;
1150  myGrid.dim2 = myGrid.K2;
1151  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
1152 
1153  if ( ! usePencils ) {
1154  myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
1155  myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
1156  myGrid.block3 = myGrid.dim3 / 2; // complex
1157  }
1158 
1159  if ( usePencils ) {
1160  myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
1161  myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
1162  myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks; // complex
1163 
1164 
1165  int pe = 0;
1166  int x,y,z;
1167 
1168  SortableResizeArray<int> zprocs(xBlocks*yBlocks);
1169  SortableResizeArray<int> yprocs(xBlocks*zBlocks);
1170  SortableResizeArray<int> xprocs(yBlocks*zBlocks);
1171 
1172  // decide which pes to use by bit reversal and patch use
1173  int i;
1174  int ncpus = CkNumPes();
1175  SortableResizeArray<int> patches, nopatches, pmeprocs;
1176  PatchMap *pmap = PatchMap::Object();
1177  for ( int icpu=0; icpu<ncpus; ++icpu ) {
1178  int ri = WorkDistrib::peDiffuseOrdering[icpu];
1179  if ( ri ) { // keep 0 for special case
1180  // pretend pe 1 has patches to avoid placing extra PME load on node
1181  if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
1182  else nopatches.add(ri);
1183  }
1184  }
1185 
1186 #if USE_RANDOM_TOPO
1187  Random rand(CkMyPe());
1188  int *tmp = new int[patches.size()];
1189  int nn = patches.size();
1190  for (i=0;i<nn;i++) tmp[i] = patches[i];
1191  rand.reorder(tmp, nn);
1192  patches.resize(0);
1193  for (i=0;i<nn;i++) patches.add(tmp[i]);
1194  delete [] tmp;
1195  tmp = new int[nopatches.size()];
1196  nn = nopatches.size();
1197  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
1198  rand.reorder(tmp, nn);
1199  nopatches.resize(0);
1200  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
1201  delete [] tmp;
1202 #endif
1203 
1204  // only use zero if it eliminates overloading or has patches
1205  int useZero = 0;
1206  int npens = xBlocks*yBlocks;
1207  if ( npens % ncpus == 0 ) useZero = 1;
1208  if ( npens == nopatches.size() + 1 ) useZero = 1;
1209  npens += xBlocks*zBlocks;
1210  if ( npens % ncpus == 0 ) useZero = 1;
1211  if ( npens == nopatches.size() + 1 ) useZero = 1;
1212  npens += yBlocks*zBlocks;
1213  if ( npens % ncpus == 0 ) useZero = 1;
1214  if ( npens == nopatches.size() + 1 ) useZero = 1;
1215 
1216  // add nopatches then patches in reversed order
1217  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
1218  if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1219  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
1220  if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1221 
1222  int npes = pmeprocs.size();
1223  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
1224  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
1225 #if !USE_RANDOM_TOPO
1226  zprocs.sort();
1227 #endif
1228  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
1229  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
1230 #if !USE_RANDOM_TOPO
1231  yprocs.sort();
1232 #endif
1233  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
1234  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
1235 #if !USE_RANDOM_TOPO
1236  xprocs.sort();
1237 #endif
1238 
1239 #if USE_TOPO_SFC
1240  CmiLock(tmgr_lock);
1241  //{
1242  TopoManager tmgr;
1243  int xdim = tmgr.getDimNX();
1244  int ydim = tmgr.getDimNY();
1245  int zdim = tmgr.getDimNZ();
1246  int xdim1 = find_level_grid(xdim);
1247  int ydim1 = find_level_grid(ydim);
1248  int zdim1 = find_level_grid(zdim);
1249  if(CkMyPe() == 0)
1250  printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
1251 
1252  vector<Coord> result;
1253  SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
1254  sort_sfc(xprocs, tmgr, result);
1255  sort_sfc(yprocs, tmgr, result);
1256  sort_sfc(zprocs, tmgr, result);
1257  //}
1258  CmiUnlock(tmgr_lock);
1259 #endif
1260 
1261 
1262  if(CkMyPe() == 0){
1263  iout << iINFO << "PME Z PENCIL LOCATIONS:";
1264  for ( i=0; i<zprocs.size() && i<10; ++i ) {
1265 #if USE_TOPO_SFC
1266  int x,y,z,t;
1267  tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
1268  iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
1269 #else
1270  iout << " " << zprocs[i];
1271 #endif
1272  }
1273  if ( i < zprocs.size() ) iout << " ...";
1274  iout << "\n" << endi;
1275  }
1276 
1277  if (CkMyRank() == 0) {
1278  for (pe=0, x = 0; x < xBlocks; ++x)
1279  for (y = 0; y < yBlocks; ++y, ++pe ) {
1280  pencilPMEProcessors[zprocs[pe]] = 1;
1281  }
1282  }
1283 
1284  if(CkMyPe() == 0){
1285  iout << iINFO << "PME Y PENCIL LOCATIONS:";
1286  for ( i=0; i<yprocs.size() && i<10; ++i ) {
1287 #if USE_TOPO_SFC
1288  int x,y,z,t;
1289  tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
1290  iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
1291 #else
1292  iout << " " << yprocs[i];
1293 #endif
1294  }
1295  if ( i < yprocs.size() ) iout << " ...";
1296  iout << "\n" << endi;
1297  }
1298 
1299  if (CkMyRank() == 0) {
1300  for (pe=0, z = 0; z < zBlocks; ++z )
1301  for (x = 0; x < xBlocks; ++x, ++pe ) {
1302  pencilPMEProcessors[yprocs[pe]] = 1;
1303  }
1304  }
1305 
1306  if(CkMyPe() == 0){
1307  iout << iINFO << "PME X PENCIL LOCATIONS:";
1308  for ( i=0; i<xprocs.size() && i<10; ++i ) {
1309 #if USE_TOPO_SFC
1310  int x,y,z,t;
1311  tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
1312  iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
1313 #else
1314  iout << " " << xprocs[i];
1315 #endif
1316  }
1317  if ( i < xprocs.size() ) iout << " ...";
1318  iout << "\n" << endi;
1319  }
1320 
1321  if (CkMyRank() == 0) {
1322  for (pe=0, y = 0; y < yBlocks; ++y )
1323  for (z = 0; z < zBlocks; ++z, ++pe ) {
1324  pencilPMEProcessors[xprocs[pe]] = 1;
1325  }
1326  }
1327 
1328 
1329  // creating the pencil arrays
1330  if ( CkMyPe() == 0 ){
1331 #if !USE_RANDOM_TOPO
1332  // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
1333  WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
1334  std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
1335  std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
1336 #endif
1337 #if 1
1338  CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
1339  CProxy_PmePencilMap ym;
1340  if ( simParams->PMEPencilsYLayout )
1341  ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
1342  else
1343  ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
1344  CProxy_PmePencilMap xm;
1345  if ( simParams->PMEPencilsXLayout )
1346  xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
1347  else
1348  xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
1349  pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
1350  CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
1351  CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
1352  CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
1353  zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
1354  yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
1355  xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
1356  zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
1357  yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
1358  xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
1359 #else
1360  zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
1361  yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
1362  xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);
1363 
1364  for (pe=0, x = 0; x < xBlocks; ++x)
1365  for (y = 0; y < yBlocks; ++y, ++pe ) {
1366  zPencil(x,y,0).insert(zprocs[pe]);
1367  }
1368  zPencil.doneInserting();
1369 
1370  for (pe=0, x = 0; x < xBlocks; ++x)
1371  for (z = 0; z < zBlocks; ++z, ++pe ) {
1372  yPencil(x,0,z).insert(yprocs[pe]);
1373  }
1374  yPencil.doneInserting();
1375 
1376 
1377  for (pe=0, y = 0; y < yBlocks; ++y )
1378  for (z = 0; z < zBlocks; ++z, ++pe ) {
1379  xPencil(0,y,z).insert(xprocs[pe]);
1380  }
1381  xPencil.doneInserting();
1382 #endif
1383 
1384  pmeProxy.recvArrays(xPencil,yPencil,zPencil);
1385  PmePencilInitMsgData msgdata;
1386  msgdata.grid = myGrid;
1387  msgdata.xBlocks = xBlocks;
1388  msgdata.yBlocks = yBlocks;
1389  msgdata.zBlocks = zBlocks;
1390  msgdata.xPencil = xPencil;
1391  msgdata.yPencil = yPencil;
1392  msgdata.zPencil = zPencil;
1393  msgdata.pmeProxy = pmeProxyDir;
1394  msgdata.pmeNodeProxy = pmeNodeProxy;
1395  msgdata.xm = xm;
1396  msgdata.ym = ym;
1397  msgdata.zm = zm;
1398  xPencil.init(new PmePencilInitMsg(msgdata));
1399  yPencil.init(new PmePencilInitMsg(msgdata));
1400  zPencil.init(new PmePencilInitMsg(msgdata));
1401  }
1402 
1403  return; // continue in initialize_pencils() at next startup stage
1404  }
1405 
1406 
1407  int pe;
1408  int nx = 0;
1409  for ( pe = 0; pe < numGridPes; ++pe ) {
1410  localInfo[pe].x_start = nx;
1411  nx += myGrid.block1;
1412  if ( nx > myGrid.K1 ) nx = myGrid.K1;
1413  localInfo[pe].nx = nx - localInfo[pe].x_start;
1414  }
1415  int ny = 0;
1416  for ( pe = 0; pe < numTransPes; ++pe ) {
1417  localInfo[pe].y_start_after_transpose = ny;
1418  ny += myGrid.block2;
1419  if ( ny > myGrid.K2 ) ny = myGrid.K2;
1420  localInfo[pe].ny_after_transpose =
1421  ny - localInfo[pe].y_start_after_transpose;
1422  }
1423 
1424  { // decide how many pes this node exchanges charges with
1425 
1426  PatchMap *patchMap = PatchMap::Object();
1427  Lattice lattice = simParams->lattice;
1428  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1429  BigReal cutoff = simParams->cutoff;
1430  BigReal patchdim = simParams->patchDimension;
1431  int numPatches = patchMap->numPatches();
1432  int numNodes = CkNumPes();
1433  int *source_flags = new int[numNodes];
1434  int node;
1435  for ( node=0; node<numNodes; ++node ) {
1436  source_flags[node] = 0;
1437  recipPeDest[node] = 0;
1438  }
1439 
1440  // // make sure that we don't get ahead of ourselves on this node
1441  // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
1442  // source_flags[CkMyPe()] = 1;
1443  // recipPeDest[myRecipPe] = 1;
1444  // }
1445 
1446  for ( int pid=0; pid < numPatches; ++pid ) {
1447  int pnode = patchMap->node(pid);
1448 #ifdef NAMD_CUDA
1449  if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
1450 #endif
1451  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1452  BigReal minx = patchMap->min_a(pid);
1453  BigReal maxx = patchMap->max_a(pid);
1454  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1455  // min1 (max1) is smallest (largest) grid line for this patch
1456  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1457  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1458  for ( int i=min1; i<=max1; ++i ) {
1459  int ix = i;
1460  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1461  while ( ix < 0 ) ix += myGrid.K1;
1462  // set source_flags[pnode] if this patch sends to our node
1463  if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
1464  ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
1465  source_flags[pnode] = 1;
1466  }
1467  // set dest_flags[] for node that our patch sends to
1468 #ifdef NAMD_CUDA
1469  if ( offload ) {
1470  if ( pnode == CkNodeFirst(CkMyNode()) ) {
1471  recipPeDest[ix / myGrid.block1] = 1;
1472  }
1473  } else
1474 #endif
1475  if ( pnode == CkMyPe() ) {
1476  recipPeDest[ix / myGrid.block1] = 1;
1477  }
1478  }
1479  }
1480 
1481  int numSourcesSamePhysicalNode = 0;
1482  numSources = 0;
1483  numDestRecipPes = 0;
1484  for ( node=0; node<numNodes; ++node ) {
1485  if ( source_flags[node] ) ++numSources;
1486  if ( recipPeDest[node] ) ++numDestRecipPes;
1487  if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
1488  }
1489 
1490 #if 0
1491  if ( numSources ) {
1492  CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
1493  CkMyPe(), numSourcesSamePhysicalNode, numSources);
1494  iout << iINFO << "PME " << CkMyPe() << " sources:";
1495  for ( node=0; node<numNodes; ++node ) {
1496  if ( source_flags[node] ) iout << " " << node;
1497  }
1498  iout << "\n" << endi;
1499  }
1500 #endif
1501 
1502  delete [] source_flags;
1503 
1504  // CkPrintf("PME on node %d has %d sources and %d destinations\n",
1505  // CkMyPe(), numSources, numDestRecipPes);
1506 
1507  } // decide how many pes this node exchanges charges with (end)
1508 
1509  ungrid_count = numDestRecipPes;
1510 
1511  sendTransBarrier_received = 0;
1512 
1513  if ( myGridPe < 0 && myTransPe < 0 ) return;
1514  // the following only for nodes doing reciprocal sum
1515 
1516  if ( myTransPe >= 0 ) {
1517  recipEvirPe = findRecipEvirPe();
1518  pmeProxy[recipEvirPe].addRecipEvirClient();
1519  }
1520 
1521  if ( myTransPe >= 0 ) {
1522  int k2_start = localInfo[myTransPe].y_start_after_transpose;
1523  int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
1524  #ifdef OPENATOM_VERSION
1525  if ( simParams->openatomOn ) {
1526  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
1527  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
1528  } else {
1529  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1530  }
1531  #else // OPENATOM_VERSION
1532  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1533  #endif // OPENATOM_VERSION
1534  }
1535 
1536  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
1537  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
1538  if ( local_size < local_size_2 ) local_size = local_size_2;
1539  qgrid = new float[local_size*numGrids];
1540  if ( numGridPes > 1 || numTransPes > 1 ) {
1541  kgrid = new float[local_size*numGrids];
1542  } else {
1543  kgrid = qgrid;
1544  }
1545  qgrid_size = local_size;
1546 
1547  if ( myGridPe >= 0 ) {
1548  qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
1549  qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
1550  fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
1551  fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
1552  }
1553 
1554  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
1555 #ifdef NAMD_FFTW
1556  CmiLock(fftw_plan_lock);
1557 #ifdef NAMD_FFTW_3
1558  work = new fftwf_complex[n[0]];
1559  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
1560  if ( myGridPe >= 0 ) {
1561  forward_plan_yz=new fftwf_plan[numGrids];
1562  backward_plan_yz=new fftwf_plan[numGrids];
1563  }
1564  if ( myTransPe >= 0 ) {
1565  forward_plan_x=new fftwf_plan[numGrids];
1566  backward_plan_x=new fftwf_plan[numGrids];
1567  }
1568  /* need one plan per grid */
1569  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1570  if ( myGridPe >= 0 ) {
1571  for( int g=0; g<numGrids; g++)
1572  {
1573  forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
1574  localInfo[myGridPe].nx,
1575  qgrid + qgrid_size * g,
1576  NULL,
1577  1,
1578  myGrid.dim2 * myGrid.dim3,
1579  (fftwf_complex *)
1580  (qgrid + qgrid_size * g),
1581  NULL,
1582  1,
1583  myGrid.dim2 * (myGrid.dim3/2),
1584  fftwFlags);
1585  }
1586  }
1587  int zdim = myGrid.dim3;
1588  int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
1589  if ( ! CkMyPe() ) iout << " 2..." << endi;
1590  if ( myTransPe >= 0 ) {
1591  for( int g=0; g<numGrids; g++)
1592  {
1593 
1594  forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1595  (fftwf_complex *)
1596  (kgrid+qgrid_size*g),
1597  NULL,
1598  xStride,
1599  1,
1600  (fftwf_complex *)
1601  (kgrid+qgrid_size*g),
1602  NULL,
1603  xStride,
1604  1,
1605  FFTW_FORWARD,fftwFlags);
1606 
1607  }
1608  }
1609  if ( ! CkMyPe() ) iout << " 3..." << endi;
1610  if ( myTransPe >= 0 ) {
1611  for( int g=0; g<numGrids; g++)
1612  {
1613  backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1614  (fftwf_complex *)
1615  (kgrid+qgrid_size*g),
1616  NULL,
1617  xStride,
1618  1,
1619  (fftwf_complex *)
1620  (kgrid+qgrid_size*g),
1621  NULL,
1622  xStride,
1623  1,
1624  FFTW_BACKWARD, fftwFlags);
1625 
1626  }
1627  }
1628  if ( ! CkMyPe() ) iout << " 4..." << endi;
1629  if ( myGridPe >= 0 ) {
1630  for( int g=0; g<numGrids; g++)
1631  {
1632  backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
1633  localInfo[myGridPe].nx,
1634  (fftwf_complex *)
1635  (qgrid + qgrid_size * g),
1636  NULL,
1637  1,
1638  myGrid.dim2*(myGrid.dim3/2),
1639  qgrid + qgrid_size * g,
1640  NULL,
1641  1,
1642  myGrid.dim2 * myGrid.dim3,
1643  fftwFlags);
1644  }
1645  }
1646  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1647 
1648 #else
1649  work = new fftw_complex[n[0]];
1650 
1651  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1652  if ( myGridPe >= 0 ) {
1653  forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
1654  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1655  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1656  }
1657  if ( ! CkMyPe() ) iout << " 2..." << endi;
1658  if ( myTransPe >= 0 ) {
1659  forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
1660  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1661  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1662  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1663  }
1664  if ( ! CkMyPe() ) iout << " 3..." << endi;
1665  if ( myTransPe >= 0 ) {
1666  backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
1667  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1668  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1669  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1670  }
1671  if ( ! CkMyPe() ) iout << " 4..." << endi;
1672  if ( myGridPe >= 0 ) {
1673  backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
1674  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1675  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1676  }
1677  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1678 #endif
1679  CmiUnlock(fftw_plan_lock);
1680 #else
1681  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
1682 #endif
1683 
1684  if ( myGridPe >= 0 && numSources == 0 )
1685  NAMD_bug("PME grid elements exist without sources.");
1686  grid_count = numSources;
1687  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
1688  trans_count = numGridPes;
1689 }
1690 
1691 
1692 
1694  delete msg;
1695  if ( ! usePencils ) return;
1696 
1698 
1699  PatchMap *patchMap = PatchMap::Object();
1700  Lattice lattice = simParams->lattice;
1701  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1702  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1703  BigReal cutoff = simParams->cutoff;
1704  BigReal patchdim = simParams->patchDimension;
1705  int numPatches = patchMap->numPatches();
1706 
1707  pencilActive = new char[xBlocks*yBlocks];
1708  for ( int i=0; i<xBlocks; ++i ) {
1709  for ( int j=0; j<yBlocks; ++j ) {
1710  pencilActive[i*yBlocks+j] = 0;
1711  }
1712  }
1713 
1714  for ( int pid=0; pid < numPatches; ++pid ) {
1715  int pnode = patchMap->node(pid);
1716 #ifdef NAMD_CUDA
1717  if ( offload ) {
1718  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1719  } else
1720 #endif
1721  if ( pnode != CkMyPe() ) continue;
1722 
1723  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1724  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1725 
1726  BigReal minx = patchMap->min_a(pid);
1727  BigReal maxx = patchMap->max_a(pid);
1728  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1729  // min1 (max1) is smallest (largest) grid line for this patch
1730  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1731  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1732 
1733  BigReal miny = patchMap->min_b(pid);
1734  BigReal maxy = patchMap->max_b(pid);
1735  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1736  // min2 (max2) is smallest (largest) grid line for this patch
1737  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1738  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1739 
1740  for ( int i=min1; i<=max1; ++i ) {
1741  int ix = i;
1742  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1743  while ( ix < 0 ) ix += myGrid.K1;
1744  for ( int j=min2; j<=max2; ++j ) {
1745  int jy = j;
1746  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1747  while ( jy < 0 ) jy += myGrid.K2;
1748  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1749  }
1750  }
1751  }
1752 
1753  numPencilsActive = 0;
1754  for ( int i=0; i<xBlocks; ++i ) {
1755  for ( int j=0; j<yBlocks; ++j ) {
1756  if ( pencilActive[i*yBlocks+j] ) {
1757  ++numPencilsActive;
1758 #ifdef NAMD_CUDA
1759  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1760 #endif
1761  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1762  }
1763  }
1764  }
1765  activePencils = new ijpair[numPencilsActive];
1766  numPencilsActive = 0;
1767  for ( int i=0; i<xBlocks; ++i ) {
1768  for ( int j=0; j<yBlocks; ++j ) {
1769  if ( pencilActive[i*yBlocks+j] ) {
1770  activePencils[numPencilsActive++] = ijpair(i,j);
1771  }
1772  }
1773  }
1774  if ( simParams->PMESendOrder ) {
1775  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1776  } else {
1777  Random rand(CkMyPe());
1778  rand.reorder(activePencils,numPencilsActive);
1779  }
1780  //if ( numPencilsActive ) {
1781  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1782  //}
1783 
1784  ungrid_count = numPencilsActive;
1785 }
1786 
1787 
1789  if ( ! usePencils ) return;
1790  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1791 }
1792 
1793 
1795 
1796  if ( CmiMyRank() == 0 ) {
1797  CmiDestroyLock(fftw_plan_lock);
1798  }
1799  CmiDestroyLock(pmemgr_lock);
1800 
1801  delete myKSpace;
1802  delete [] localInfo;
1803  delete [] gridNodeInfo;
1804  delete [] transNodeInfo;
1805  delete [] gridPeMap;
1806  delete [] transPeMap;
1807  delete [] recipPeDest;
1808  delete [] gridPeOrder;
1809  delete [] gridNodeOrder;
1810  delete [] transNodeOrder;
1811  delete [] qgrid;
1812  if ( kgrid != qgrid ) delete [] kgrid;
1813  delete [] work;
1814  delete [] gridmsg_reuse;
1815 
1816  if ( ! offload ) {
1817  for (int i=0; i<q_count; ++i) {
1818  delete [] q_list[i];
1819  }
1820  delete [] q_list;
1821  delete [] fz_arr;
1822  }
1823  delete [] f_arr;
1824  delete [] q_arr;
1825 }
1826 
1828  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1829  if ( grid_count == 0 ) {
1830  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1831  }
1832  if ( grid_count == numSources ) {
1833  lattice = msg->lattice;
1834  grid_sequence = msg->sequence;
1835  }
1836 
1837  int zdim = myGrid.dim3;
1838  int zlistlen = msg->zlistlen;
1839  int *zlist = msg->zlist;
1840  float *qmsg = msg->qgrid;
1841  for ( int g=0; g<numGrids; ++g ) {
1842  char *f = msg->fgrid + fgrid_len * g;
1843  float *q = qgrid + qgrid_size * g;
1844  for ( int i=0; i<fgrid_len; ++i ) {
1845  if ( f[i] ) {
1846  for ( int k=0; k<zlistlen; ++k ) {
1847  q[zlist[k]] += *(qmsg++);
1848  }
1849  }
1850  q += zdim;
1851  }
1852  }
1853 
1854  gridmsg_reuse[numSources-grid_count] = msg;
1855  --grid_count;
1856 
1857  if ( grid_count == 0 ) {
1858  pmeProxyDir[CkMyPe()].gridCalc1();
1859  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1860  }
1861 }
1862 #ifdef MANUAL_DEBUG_FFTW3
1863 
1864 /* utility functions for manual debugging */
1865 void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
1866 {
1867 
1868  char fmt[1000];
1869  char filename[1000];
1870  strncpy(fmt,infilename,999);
1871  strncat(fmt,"_%d.out",999);
1872  sprintf(filename,fmt, pe);
1873  FILE *loutfile = fopen(filename, "w");
1874 #ifdef PAIRCALC_TEST_DUMP
1875  fprintf(loutfile,"%d\n",ydim);
1876 #endif
1877  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
1878  for(int i=0;i<xdim;i++)
1879  for(int j=0;j<ydim;j++)
1880  for(int k=0;k<zdim;k++)
1881  fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
1882  fclose(loutfile);
1883 
1884 }
1885 
1886 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
1887 {
1888  char fmt[1000];
1889  char filename[1000];
1890  strncpy(fmt,infilename,999);
1891  strncat(fmt,"_%d_%d_%d.out",999);
1892  sprintf(filename,fmt, x,y,z);
1893  FILE *loutfile = fopen(filename, "w");
1894  CkAssert(loutfile!=NULL);
1895  CkPrintf("opened %s for dump\n",filename);
1896  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
1897  for(int i=0;i<xdim;i++)
1898  for(int j=0;j<ydim;j++)
1899  for(int k=0;k<zdim;k++)
1900  fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
1901  fclose(loutfile);
1902 }
1903 
1904 #endif
1905 
1907  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1908 
1909 #ifdef NAMD_FFTW
1910  for ( int g=0; g<numGrids; ++g ) {
1911 #ifdef NAMD_FFTW_3
1912  fftwf_execute(forward_plan_yz[g]);
1913 #else
1914  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1915  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1916 #endif
1917 
1918  }
1919 #endif
1920 
1921  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1922 }
1923 
1925  sendTransBarrier_received += 1;
1926  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1927  if ( sendTransBarrier_received < numGridPes ) return;
1928  sendTransBarrier_received = 0;
1929  for ( int i=0; i<numGridPes; ++i ) {
1930  pmeProxyDir[gridPeMap[i]].sendTrans();
1931  }
1932 }
1933 
1934 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
1935  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
1936  mgr->sendTransSubset(first, last);
1937 }
1938 
1940 
1941  untrans_count = numTransPes;
1942 
1943 #if CMK_SMP && USE_CKLOOP
1944  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1945  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1946  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1947  } else
1948 #endif
1949  {
1950  sendTransSubset(0, numTransNodes-1);
1951  }
1952 
1953 }
1954 
1955 void ComputePmeMgr::sendTransSubset(int first, int last) {
1956  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1957 
1958  // send data for transpose
1959  int zdim = myGrid.dim3;
1960  int nx = localInfo[myGridPe].nx;
1961  int x_start = localInfo[myGridPe].x_start;
1962  int slicelen = myGrid.K2 * zdim;
1963 
1964  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1965 
1966 #if CMK_BLUEGENEL
1967  CmiNetworkProgressAfter (0);
1968 #endif
1969 
1970  for (int j=first; j<=last; j++) {
1971  int node = transNodeOrder[j]; // different order on each node
1972  int pe = transNodeInfo[node].pe_start;
1973  int npe = transNodeInfo[node].npe;
1974  int totlen = 0;
1975  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
1976  LocalPmeInfo &li = localInfo[pe];
1977  int cpylen = li.ny_after_transpose * zdim;
1978  totlen += cpylen;
1979  }
1980  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
1982  newmsg->sourceNode = myGridPe;
1983  newmsg->lattice = lattice;
1984  newmsg->x_start = x_start;
1985  newmsg->nx = nx;
1986  for ( int g=0; g<numGrids; ++g ) {
1987  float *qmsg = newmsg->qgrid + nx * totlen * g;
1988  pe = transNodeInfo[node].pe_start;
1989  for (int i=0; i<npe; ++i, ++pe) {
1990  LocalPmeInfo &li = localInfo[pe];
1991  int cpylen = li.ny_after_transpose * zdim;
1992  if ( node == myTransNode ) {
1993  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
1994  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
1995  }
1996  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
1997  for ( int x = 0; x < nx; ++x ) {
1998  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
1999  q += slicelen;
2000  qmsg += cpylen;
2001  }
2002  }
2003  }
2004  newmsg->sequence = grid_sequence;
2005  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2006  if ( node == myTransNode ) newmsg->nx = 0;
2007  if ( npe > 1 ) {
2008  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2009  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2010  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2011  }
2012 }
2013 
2015  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2016  int pe = transNodeInfo[myTransNode].pe_start;
2017  int npe = transNodeInfo[myTransNode].npe;
2018  CmiNodeLock lock = CmiCreateLock();
2019  int *count = new int; *count = npe;
2020  for (int i=0; i<npe; ++i, ++pe) {
2023  shmsg->msg = msg;
2024  shmsg->count = count;
2025  shmsg->lock = lock;
2026  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2027  }
2028 }
2029 
2031  procTrans(msg->msg);
2032  CmiLock(msg->lock);
2033  int count = --(*msg->count);
2034  CmiUnlock(msg->lock);
2035  if ( count == 0 ) {
2036  CmiDestroyLock(msg->lock);
2037  delete msg->count;
2038  delete msg->msg;
2039  }
2040  delete msg;
2041 }
2042 
2044  procTrans(msg);
2045  delete msg;
2046 }
2047 
2049  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2050  if ( trans_count == numGridPes ) {
2051  lattice = msg->lattice;
2052  grid_sequence = msg->sequence;
2053  }
2054 
2055  if ( msg->nx ) {
2056  int zdim = myGrid.dim3;
2057  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2058  int first_pe = nodeInfo.pe_start;
2059  int last_pe = first_pe+nodeInfo.npe-1;
2060  int y_skip = localInfo[myTransPe].y_start_after_transpose
2061  - localInfo[first_pe].y_start_after_transpose;
2062  int ny_msg = localInfo[last_pe].y_start_after_transpose
2063  + localInfo[last_pe].ny_after_transpose
2064  - localInfo[first_pe].y_start_after_transpose;
2065  int ny = localInfo[myTransPe].ny_after_transpose;
2066  int x_start = msg->x_start;
2067  int nx = msg->nx;
2068  for ( int g=0; g<numGrids; ++g ) {
2069  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2070  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2071  nx*ny*zdim*sizeof(float));
2072  }
2073  }
2074 
2075  --trans_count;
2076 
2077  if ( trans_count == 0 ) {
2078  pmeProxyDir[CkMyPe()].gridCalc2();
2079  }
2080 }
2081 
2083  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2084 
2085 #if CMK_BLUEGENEL
2086  CmiNetworkProgressAfter (0);
2087 #endif
2088 
2089  int zdim = myGrid.dim3;
2090  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2091  int ny = localInfo[myTransPe].ny_after_transpose;
2092 
2093  for ( int g=0; g<numGrids; ++g ) {
2094  // finish forward FFT (x dimension)
2095 #ifdef NAMD_FFTW
2096 #ifdef NAMD_FFTW_3
2097  fftwf_execute(forward_plan_x[g]);
2098 #else
2099  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2100  ny * zdim / 2, 1, work, 1, 0);
2101 #endif
2102 #endif
2103  }
2104 
2105 #ifdef OPENATOM_VERSION
2106  if ( ! simParams -> openatomOn ) {
2107 #endif // OPENATOM_VERSION
2108  gridCalc2R();
2109 #ifdef OPENATOM_VERSION
2110  } else {
2111  gridCalc2Moa();
2112  }
2113 #endif // OPENATOM_VERSION
2114 }
2115 
2116 #ifdef OPENATOM_VERSION
2117 void ComputePmeMgr::gridCalc2Moa(void) {
2118 
2119  int zdim = myGrid.dim3;
2120  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2121  int ny = localInfo[myTransPe].ny_after_transpose;
2122 
2124 
2125  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
2126 
2127  for ( int g=0; g<numGrids; ++g ) {
2128  #ifdef OPENATOM_VERSION_DEBUG
2129  CkPrintf("Sending recQ on processor %d \n", CkMyPe());
2130  for ( int i=0; i<=(ny * zdim / 2); ++i)
2131  {
2132  CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
2133  }
2134  #endif // OPENATOM_VERSION_DEBUG
2135 // mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
2136  CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
2137  moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
2138  }
2139 }
2140 #endif // OPENATOM_VERSION
2141 
2143 
2144  int useCkLoop = 0;
2145 #if CMK_SMP && USE_CKLOOP
2146  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2147  && CkNumPes() >= 2 * numTransPes ) {
2148  useCkLoop = 1;
2149  }
2150 #endif
2151 
2152  int zdim = myGrid.dim3;
2153  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2154  int ny = localInfo[myTransPe].ny_after_transpose;
2155 
2156  for ( int g=0; g<numGrids; ++g ) {
2157  // reciprocal space portion of PME
2159  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2160  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2161  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2162 
2163  // start backward FFT (x dimension)
2164 
2165 #ifdef NAMD_FFTW
2166 #ifdef NAMD_FFTW_3
2167  fftwf_execute(backward_plan_x[g]);
2168 #else
2169  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2170  ny * zdim / 2, 1, work, 1, 0);
2171 #endif
2172 #endif
2173  }
2174 
2175  pmeProxyDir[CkMyPe()].sendUntrans();
2176 }
2177 
2178 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
2179  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2180  mgr->sendUntransSubset(first, last);
2181 }
2182 
2184 
2185  trans_count = numGridPes;
2186 
2187  { // send energy and virial
2188  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2189  for ( int g=0; g<numGrids; ++g ) {
2190  newmsg->evir[g] = recip_evir2[g];
2191  }
2192  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2193  CmiEnableUrgentSend(1);
2194  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2195  CmiEnableUrgentSend(0);
2196  }
2197 
2198 #if CMK_SMP && USE_CKLOOP
2199  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2200  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2201  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2202  } else
2203 #endif
2204  {
2205  sendUntransSubset(0, numGridNodes-1);
2206  }
2207 
2208 }
2209 
2210 void ComputePmeMgr::sendUntransSubset(int first, int last) {
2211 
2212  int zdim = myGrid.dim3;
2213  int y_start = localInfo[myTransPe].y_start_after_transpose;
2214  int ny = localInfo[myTransPe].ny_after_transpose;
2215  int slicelen = myGrid.K2 * zdim;
2216 
2217  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2218 
2219 #if CMK_BLUEGENEL
2220  CmiNetworkProgressAfter (0);
2221 #endif
2222 
2223  // send data for reverse transpose
2224  for (int j=first; j<=last; j++) {
2225  int node = gridNodeOrder[j]; // different order on each node
2226  int pe = gridNodeInfo[node].pe_start;
2227  int npe = gridNodeInfo[node].npe;
2228  int totlen = 0;
2229  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2230  LocalPmeInfo &li = localInfo[pe];
2231  int cpylen = li.nx * zdim;
2232  totlen += cpylen;
2233  }
2234  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2235  newmsg->sourceNode = myTransPe;
2236  newmsg->y_start = y_start;
2237  newmsg->ny = ny;
2238  for ( int g=0; g<numGrids; ++g ) {
2239  float *qmsg = newmsg->qgrid + ny * totlen * g;
2240  pe = gridNodeInfo[node].pe_start;
2241  for (int i=0; i<npe; ++i, ++pe) {
2242  LocalPmeInfo &li = localInfo[pe];
2243  if ( node == myGridNode ) {
2244  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2245  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2246  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2247  int cpylen = ny * zdim;
2248  for ( int x = 0; x < li.nx; ++x ) {
2249  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2250  q += cpylen;
2251  qmsg += slicelen;
2252  }
2253  } else {
2254  CmiMemcpy((void*)qmsg,
2255  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2256  li.nx*ny*zdim*sizeof(float));
2257  qmsg += li.nx*ny*zdim;
2258  }
2259  }
2260  }
2261  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2262  if ( node == myGridNode ) newmsg->ny = 0;
2263  if ( npe > 1 ) {
2264  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2265  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2266  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2267  }
2268 }
2269 
2271  int pe = gridNodeInfo[myGridNode].pe_start;
2272  int npe = gridNodeInfo[myGridNode].npe;
2273  CmiNodeLock lock = CmiCreateLock();
2274  int *count = new int; *count = npe;
2275  for (int i=0; i<npe; ++i, ++pe) {
2277  shmsg->msg = msg;
2278  shmsg->count = count;
2279  shmsg->lock = lock;
2280  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2281  }
2282 }
2283 
2285  procUntrans(msg->msg);
2286  CmiLock(msg->lock);
2287  int count = --(*msg->count);
2288  CmiUnlock(msg->lock);
2289  if ( count == 0 ) {
2290  CmiDestroyLock(msg->lock);
2291  delete msg->count;
2292  delete msg->msg;
2293  }
2294  delete msg;
2295 }
2296 
2298  procUntrans(msg);
2299  delete msg;
2300 }
2301 
2303  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2304 
2305 #if CMK_BLUEGENEL
2306  CmiNetworkProgressAfter (0);
2307 #endif
2308 
2309  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2310  int first_pe = nodeInfo.pe_start;
2311  int g;
2312 
2313  if ( msg->ny ) {
2314  int zdim = myGrid.dim3;
2315  int last_pe = first_pe+nodeInfo.npe-1;
2316  int x_skip = localInfo[myGridPe].x_start
2317  - localInfo[first_pe].x_start;
2318  int nx_msg = localInfo[last_pe].x_start
2319  + localInfo[last_pe].nx
2320  - localInfo[first_pe].x_start;
2321  int nx = localInfo[myGridPe].nx;
2322  int y_start = msg->y_start;
2323  int ny = msg->ny;
2324  int slicelen = myGrid.K2 * zdim;
2325  int cpylen = ny * zdim;
2326  for ( g=0; g<numGrids; ++g ) {
2327  float *q = qgrid + qgrid_size * g + y_start * zdim;
2328  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2329  for ( int x = 0; x < nx; ++x ) {
2330  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2331  q += slicelen;
2332  qmsg += cpylen;
2333  }
2334  }
2335  }
2336 
2337  --untrans_count;
2338 
2339  if ( untrans_count == 0 ) {
2340  pmeProxyDir[CkMyPe()].gridCalc3();
2341  }
2342 }
2343 
2345  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2346 
2347  // finish backward FFT
2348 #ifdef NAMD_FFTW
2349  for ( int g=0; g<numGrids; ++g ) {
2350 #ifdef NAMD_FFTW_3
2351  fftwf_execute(backward_plan_yz[g]);
2352 #else
2353  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2354  (fftw_complex *) (qgrid + qgrid_size * g),
2355  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2356 #endif
2357  }
2358 
2359 #endif
2360 
2361  pmeProxyDir[CkMyPe()].sendUngrid();
2362 }
2363 
2364 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
2365  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2366  mgr->sendUngridSubset(first, last);
2367 }
2368 
2370 
2371 #if CMK_SMP && USE_CKLOOP
2372  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2373  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2374  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2375  } else
2376 #endif
2377  {
2378  sendUngridSubset(0, numSources-1);
2379  }
2380 
2381  grid_count = numSources;
2382  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2383 }
2384 
2385 void ComputePmeMgr::sendUngridSubset(int first, int last) {
2386 
2387 #ifdef NAMD_CUDA
2388  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2389 #else
2390  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2391 #endif
2392 
2393  for ( int j=first; j<=last; ++j ) {
2394  // int msglen = qgrid_len;
2395  PmeGridMsg *newmsg = gridmsg_reuse[j];
2396  int pe = newmsg->sourceNode;
2397  int zdim = myGrid.dim3;
2398  int flen = newmsg->len;
2399  int fstart = newmsg->start;
2400  int zlistlen = newmsg->zlistlen;
2401  int *zlist = newmsg->zlist;
2402  float *qmsg = newmsg->qgrid;
2403  for ( int g=0; g<numGrids; ++g ) {
2404  char *f = newmsg->fgrid + fgrid_len * g;
2405  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2406  for ( int i=0; i<flen; ++i ) {
2407  if ( f[i] ) {
2408  for ( int k=0; k<zlistlen; ++k ) {
2409  *(qmsg++) = q[zlist[k]];
2410  }
2411  }
2412  q += zdim;
2413  }
2414  }
2415  newmsg->sourceNode = myGridPe;
2416 
2417  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2418  CmiEnableUrgentSend(1);
2419 #ifdef NAMD_CUDA
2420  if ( offload ) {
2421  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2422  } else
2423 #endif
2424  pmeProxyDir[pe].recvUngrid(newmsg);
2425  CmiEnableUrgentSend(0);
2426  }
2427 }
2428 
2430  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2431 #ifdef NAMD_CUDA
2432  if ( ! offload ) // would need lock
2433 #endif
2434  if ( ungrid_count == 0 ) {
2435  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2436  }
2437 
2438  if ( usePencils ) copyPencils(msg);
2439  else copyResults(msg);
2440  delete msg;
2441  recvAck(0);
2442 }
2443 
2445  if ( msg ) delete msg;
2446 #ifdef NAMD_CUDA
2447  if ( offload ) {
2448  CmiLock(cuda_lock);
2449  if ( ungrid_count == 0 ) {
2450  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2451  }
2452  int uc = --ungrid_count;
2453  CmiUnlock(cuda_lock);
2454 
2455  if ( uc == 0 ) {
2456  pmeProxyDir[master_pe].ungridCalc();
2457  }
2458  return;
2459  }
2460 #endif
2461  --ungrid_count;
2462 
2463  if ( ungrid_count == 0 ) {
2464  pmeProxyDir[CkMyPe()].ungridCalc();
2465  }
2466 }
2467 
2468 #ifdef NAMD_CUDA
2469 #define count_limit 1000000
2470 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
2471 #define EVENT_STRIDE 10
2472 
2473 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime); // fix Charm++
2474 
2475 void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
2476 
2477 void cuda_check_pme_forces(void *arg, double walltime) {
2478  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
2479 
2480  while ( 1 ) { // process multiple events per call
2481  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
2482  if ( err == cudaSuccess ) {
2483  argp->check_forces_count = 0;
2484  for ( int i=0; i<EVENT_STRIDE; ++i ) {
2486  if ( ++(argp->forces_done_count) == argp->forces_count ) break;
2487  }
2488  if ( argp->forces_done_count == argp->forces_count ) { // last event
2489  traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
2490  argp->forces_time = walltime - argp->forces_time;
2491  //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
2492  return;
2493  } else { // more events
2494  continue; // check next event
2495  }
2496  } else if ( err != cudaErrorNotReady ) {
2497  char errmsg[256];
2498  sprintf(errmsg,"in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
2500  argp->check_forces_count, walltime - argp->forces_time,
2501  argp->saved_sequence);
2502  cudaDie(errmsg,err);
2503  } else if ( ++(argp->check_forces_count) >= count_limit ) {
2504  char errmsg[256];
2505  sprintf(errmsg,"cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
2507  argp->check_forces_count, walltime - argp->forces_time,
2508  argp->saved_sequence);
2509  cudaDie(errmsg,err);
2510  } else {
2511  break; // call again
2512  }
2513  } // while ( 1 )
2514  CcdCallBacksReset(0,walltime); // fix Charm++
2516 }
2517 #endif // NAMD_CUDA
2518 
2520  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2521 
2522  ungridForcesCount = pmeComputes.size();
2523 
2524 #ifdef NAMD_CUDA
2525  if ( offload ) {
2526  //CmiLock(cuda_lock);
2527  cudaSetDevice(deviceCUDA->getDeviceID());
2528 
2529  if ( this == masterPmeMgr ) {
2530  double before = CmiWallTimer();
2531  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2532  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2533  // try to make the unspecified launch failures go away
2534  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2535  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2536  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2537 
2538  const int myrank = CkMyRank();
2539  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2540  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2541  nodePmeMgr->mgrObjects[i]->ungridCalc();
2542  }
2543  }
2544  if ( ! pmeComputes.size() ) return;
2545  }
2546 
2547  if ( ! end_forces ) {
2548  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2549  end_forces = new cudaEvent_t[n];
2550  for ( int i=0; i<n; ++i ) {
2551  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2552  }
2553  }
2554 
2555  const int pcsz = pmeComputes.size();
2556  if ( ! afn_host ) {
2557  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2558  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2559  cuda_errcheck("malloc params for pme");
2560  }
2561  int totn = 0;
2562  for ( int i=0; i<pcsz; ++i ) {
2563  int n = pmeComputes[i]->numGridAtoms[0];
2564  totn += n;
2565  }
2566  if ( totn > f_data_mgr_alloc ) {
2567  if ( f_data_mgr_alloc ) {
2568  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2569  cudaFree(f_data_mgr_dev);
2570  cudaFreeHost(f_data_mgr_host);
2571  }
2572  f_data_mgr_alloc = 1.2 * (totn + 100);
2573  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2574  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2575  cuda_errcheck("malloc forces for pme");
2576  }
2577  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2578  float *f_dev = f_data_mgr_dev;
2579  float *f_host = f_data_mgr_host;
2580  for ( int i=0; i<pcsz; ++i ) {
2581  int n = pmeComputes[i]->numGridAtoms[0];
2582  pmeComputes[i]->f_data_dev = f_dev;
2583  pmeComputes[i]->f_data_host = f_host;
2584  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2585  afn_host[3*i+1] = f_dev;
2586  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2587  f_dev += 3*n;
2588  f_host += 3*n;
2589  }
2590  //CmiLock(cuda_lock);
2591  double before = CmiWallTimer();
2592  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2593  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2594  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2595  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2596  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2597  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2598 
2599  for ( int i=0; i<pcsz; ++i ) {
2600  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2601  if ( i%EVENT_STRIDE == 0 ) {
2602  int dimy = pcsz - i;
2603  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2604  int maxn = 0;
2605  int subtotn = 0;
2606  for ( int j=0; j<dimy; ++j ) {
2607  int n = pmeComputes[i+j]->numGridAtoms[0];
2608  subtotn += n;
2609  if ( n > maxn ) maxn = n;
2610  }
2611  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2612  before = CmiWallTimer();
2613  cuda_pme_forces(
2614  bspline_coeffs_dev,
2615  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2616  pmeComputes[i]->a_data_dev,
2617  pmeComputes[i]->f_data_dev,
2618  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2619  streams[stream]);
2620  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2621  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2622  before = CmiWallTimer();
2623  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2624  cudaMemcpyDeviceToHost, streams[stream]);
2625  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2626  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2627  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2628  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2629  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2630  }
2631  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2632  }
2633  //CmiUnlock(cuda_lock);
2634  } else
2635 #endif // NAMD_CUDA
2636  {
2637  for ( int i=0; i<pmeComputes.size(); ++i ) {
2639  // pmeComputes[i]->ungridForces();
2640  }
2641  }
2642  // submitReductions(); // must follow all ungridForces()
2643 
2644 #ifdef NAMD_CUDA
2645  if ( offload ) {
2646  forces_time = CmiWallTimer();
2647  forces_count = ungridForcesCount;
2648  forces_done_count = 0;
2649  pmeProxy[this_pe].pollForcesReady();
2650  }
2651 #endif
2652 
2653  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2654 }
2655 
2657 #ifdef NAMD_CUDA
2658  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
2660 #else
2661  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2662 #endif
2663 }
2664 
2665 void ComputePme::atomUpdate() { atomsChanged = 1; }
2666 
2668 {
2669  DebugM(4,"ComputePme created.\n");
2671  setNumPatches(1);
2672 
2673  CProxy_ComputePmeMgr::ckLocalBranch(
2674  CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
2675 
2676  SimParameters *simParams = Node::Object()->simParameters;
2677 
2678  qmForcesOn = simParams->qmForcesOn;
2679  offload = simParams->PMEOffload;
2680 
2681  numGridsMax = numGrids;
2682 
2683  myGrid.K1 = simParams->PMEGridSizeX;
2684  myGrid.K2 = simParams->PMEGridSizeY;
2685  myGrid.K3 = simParams->PMEGridSizeZ;
2686  myGrid.order = simParams->PMEInterpOrder;
2687  myGrid.dim2 = myGrid.K2;
2688  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
2689 
2690 #ifdef NAMD_CUDA
2691  cuda_atoms_offset = 0;
2692  f_data_host = 0;
2693  f_data_dev = 0;
2694  if ( ! offload )
2695 #endif
2696  {
2697  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
2698  }
2699 
2700  atomsChanged = 0;
2701 
2702  qmLoclIndx = 0;
2703  qmLocalCharges = 0;
2704 }
2705 
2707  if (!(patch = PatchMap::Object()->patch(patchID))) {
2708  NAMD_bug("ComputePme used with unknown patch.");
2709  }
2710  positionBox = patch->registerPositionPickup(this);
2711  avgPositionBox = patch->registerAvgPositionPickup(this);
2712  forceBox = patch->registerForceDeposit(this);
2713 #ifdef NAMD_CUDA
2714  if ( offload ) {
2715  myMgr->cuda_atoms_count += patch->getNumAtoms();
2716  }
2717 #endif
2718 }
2719 
2721 
2722  noWorkCount = 0;
2723  doWorkCount = 0;
2724  ungridForcesCount = 0;
2725 
2727 
2728  SimParameters *simParams = Node::Object()->simParameters;
2729 
2730  strayChargeErrors = 0;
2731 
2732 #ifdef NAMD_CUDA
2733  PatchMap *patchMap = PatchMap::Object();
2734  int pe = master_pe = CkNodeFirst(CkMyNode());
2735  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2736  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2737  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2738  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2739  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2741  && pe != deviceCUDA->getMasterPe() ) {
2742  master_pe = pe;
2743  }
2744  }
2745  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2746  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2747  }
2748 
2749  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2750  bool cudaFirst = 1;
2751  if ( offload ) {
2752  CmiLock(cuda_lock);
2753  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2754  }
2755 
2756  if ( cudaFirst ) {
2757  nodePmeMgr->master_pe = master_pe;
2758  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2759  }
2760 #endif
2761 
2762  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2763  fsize = myGrid.K1 * myGrid.dim2;
2764  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2765 #ifdef NAMD_CUDA
2766  if ( ! offload )
2767 #endif
2768  {
2769  q_arr = new float*[fsize*numGrids];
2770  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2771  q_list = new float*[fsize*numGrids];
2772  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2773  q_count = 0;
2774  }
2775 
2776 #ifdef NAMD_CUDA
2777  if ( cudaFirst || ! offload ) {
2778 #endif
2779  f_arr = new char[fsize*numGrids];
2780  // memset to non-zero value has race condition on BlueGene/Q
2781  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2782  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2783 
2784  for ( int g=0; g<numGrids; ++g ) {
2785  char *f = f_arr + g*fsize;
2786  if ( usePencils ) {
2787  int K1 = myGrid.K1;
2788  int K2 = myGrid.K2;
2789  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2790  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2791  int dim2 = myGrid.dim2;
2792  for (int ap=0; ap<numPencilsActive; ++ap) {
2793  int ib = activePencils[ap].i;
2794  int jb = activePencils[ap].j;
2795  int ibegin = ib*block1;
2796  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2797  int jbegin = jb*block2;
2798  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2799  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2800  for ( int i=ibegin; i<iend; ++i ) {
2801  for ( int j=jbegin; j<jend; ++j ) {
2802  f[i*dim2+j] = 0;
2803  }
2804  }
2805  }
2806  } else {
2807  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2808  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2809  for (int pe=0; pe<numGridPes; pe++) {
2810  if ( ! recipPeDest[pe] ) continue;
2811  int start = pe * bsize;
2812  int len = bsize;
2813  if ( start >= qsize ) { start = 0; len = 0; }
2814  if ( start + len > qsize ) { len = qsize - start; }
2815  int zdim = myGrid.dim3;
2816  int fstart = start / zdim;
2817  int flen = len / zdim;
2818  memset(f + fstart, 0, flen*sizeof(char));
2819  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2820  }
2821  }
2822  }
2823 #ifdef NAMD_CUDA
2824  }
2825  if ( offload ) {
2826  cudaSetDevice(deviceCUDA->getDeviceID());
2827  if ( cudaFirst ) {
2828 
2829  int f_alloc_count = 0;
2830  for ( int n=fsize, i=0; i<n; ++i ) {
2831  if ( f_arr[i] == 0 ) {
2832  ++f_alloc_count;
2833  }
2834  }
2835  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2836 
2837  q_arr = new float*[fsize*numGrids];
2838  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2839 
2840  float **q_arr_dev_host = new float*[fsize];
2841  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2842 
2843  float **v_arr_dev_host = new float*[fsize];
2844  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2845 
2846  int q_stride = myGrid.K3+myGrid.order-1;
2847  q_data_size = f_alloc_count * q_stride * sizeof(float);
2848  ffz_size = (fsize + q_stride) * sizeof(int);
2849 
2850  // tack ffz onto end of q_data to allow merged transfer
2851  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2852  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2853  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2854  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2855  cudaMalloc((void**) &v_data_dev, q_data_size);
2856  cuda_errcheck("malloc grid data for pme");
2857  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2858  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2859  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2860  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2861  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2862 
2863  f_alloc_count = 0;
2864  for ( int n=fsize, i=0; i<n; ++i ) {
2865  if ( f_arr[i] == 0 ) {
2866  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2867  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2868  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2869  ++f_alloc_count;
2870  } else {
2871  q_arr[i] = 0;
2872  q_arr_dev_host[i] = 0;
2873  v_arr_dev_host[i] = 0;
2874  }
2875  }
2876 
2877  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2878  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2879  delete [] q_arr_dev_host;
2880  delete [] v_arr_dev_host;
2881  delete [] f_arr;
2882  f_arr = new char[fsize + q_stride];
2883  fz_arr = f_arr + fsize;
2884  memset(f_arr, 0, fsize + q_stride);
2885  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2886 
2887  cuda_errcheck("initialize grid data for pme");
2888 
2889  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2890  cuda_errcheck("initialize bspline coefficients for pme");
2891 
2892 #define XCOPY(X) masterPmeMgr->X = X;
2893  XCOPY(bspline_coeffs_dev)
2894  XCOPY(bspline_dcoeffs_dev)
2895  XCOPY(q_arr)
2896  XCOPY(q_arr_dev)
2897  XCOPY(v_arr_dev)
2898  XCOPY(q_data_size)
2899  XCOPY(q_data_host)
2900  XCOPY(q_data_dev)
2901  XCOPY(v_data_dev)
2902  XCOPY(ffz_size)
2903  XCOPY(ffz_host)
2904  XCOPY(ffz_dev)
2905  XCOPY(f_arr)
2906  XCOPY(fz_arr)
2907 #undef XCOPY
2908  //CkPrintf("pe %d init first\n", CkMyPe());
2909  } else { // cudaFirst
2910  //CkPrintf("pe %d init later\n", CkMyPe());
2911 #define XCOPY(X) X = masterPmeMgr->X;
2912  XCOPY(bspline_coeffs_dev)
2913  XCOPY(bspline_dcoeffs_dev)
2914  XCOPY(q_arr)
2915  XCOPY(q_arr_dev)
2916  XCOPY(v_arr_dev)
2917  XCOPY(q_data_size)
2918  XCOPY(q_data_host)
2919  XCOPY(q_data_dev)
2920  XCOPY(v_data_dev)
2921  XCOPY(ffz_size)
2922  XCOPY(ffz_host)
2923  XCOPY(ffz_dev)
2924  XCOPY(f_arr)
2925  XCOPY(fz_arr)
2926 #undef XCOPY
2927  } // cudaFirst
2928  CmiUnlock(cuda_lock);
2929  } else // offload
2930 #endif // NAMD_CUDA
2931  {
2932  fz_arr = new char[myGrid.K3+myGrid.order-1];
2933  }
2934 
2935 #if 0 && USE_PERSISTENT
2936  recvGrid_handle = NULL;
2937 #endif
2938 }
2939 
2941 {
2942 #ifdef NAMD_CUDA
2943  if ( ! offload )
2944 #endif
2945  {
2946  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
2947  }
2948 }
2949 
2950 #if 0 && USE_PERSISTENT
2951 void ComputePmeMgr::setup_recvgrid_persistent()
2952 {
2953  int K1 = myGrid.K1;
2954  int K2 = myGrid.K2;
2955  int dim2 = myGrid.dim2;
2956  int dim3 = myGrid.dim3;
2957  int block1 = myGrid.block1;
2958  int block2 = myGrid.block2;
2959 
2960  CkArray *zPencil_local = zPencil.ckLocalBranch();
2961  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
2962  for (int ap=0; ap<numPencilsActive; ++ap) {
2963  int ib = activePencils[ap].i;
2964  int jb = activePencils[ap].j;
2965  int ibegin = ib*block1;
2966  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2967  int jbegin = jb*block2;
2968  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2969  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2970  // f is changing
2971  int fcount = 0;
2972  for ( int g=0; g<numGrids; ++g ) {
2973  char *f = f_arr + g*fsize;
2974  for ( int i=ibegin; i<iend; ++i ) {
2975  for ( int j=jbegin; j<jend; ++j ) {
2976  fcount += f[i*dim2+j];
2977  }
2978  }
2979  }
2980  int zlistlen = 0;
2981  for ( int i=0; i<myGrid.K3; ++i ) {
2982  if ( fz_arr[i] ) ++zlistlen;
2983  }
2984  int hd = ( fcount? 1 : 0 ); // has data?
2985  int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
2986  int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
2987  int compress_size = sizeof(float)*hd*fcount*zlistlen;
2988  int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
2989  recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
2990  }
2991 }
2992 #endif
2993 
2995 
2996  if ( patch->flags.doFullElectrostatics ) {
2997  // In QM/MM simulations, atom charges form QM regions need special treatment.
2998  if ( qmForcesOn ) {
2999  return 1;
3000  }
3001  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0; // work to do, enqueue as usual
3002  myMgr->heldComputes.add(this);
3003  return 1; // don't enqueue yet
3004  }
3005 
3006  positionBox->skip();
3007  forceBox->skip();
3008 
3009  if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
3010  myMgr->noWorkCount = 0;
3011  myMgr->reduction->submit();
3012  }
3013 
3014  atomsChanged = 0;
3015 
3016  return 1; // no work for this step
3017 }
3018 
3020  ++recipEvirClients;
3021 }
3022 
3024  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3025  for ( int g=0; g<numGrids; ++g ) {
3026  evir[g] += msg->evir[g];
3027  }
3028  delete msg;
3029  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3030  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3031 }
3032 
3034 
3035 // iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
3036 
3037 
3038  int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
3039  const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
3040  const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
3041  const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
3042 
3043  const CompAtomExt *xExt = patch->getCompAtomExtInfo();
3044 
3045  // Determine number of qm atoms in this patch for the current step.
3046  numLocalQMAtoms = 0;
3047  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3048  if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
3049  numLocalQMAtoms++;
3050  }
3051  }
3052 
3053  // We prepare a charge vector with QM charges for use in the PME calculation.
3054 
3055  // Clears data from last step, if there is any.
3056  if (qmLoclIndx != 0)
3057  delete [] qmLoclIndx;
3058  if (qmLocalCharges != 0)
3059  delete [] qmLocalCharges;
3060 
3061  qmLoclIndx = new int[numLocalQMAtoms] ;
3062  qmLocalCharges = new Real[numLocalQMAtoms] ;
3063 
3064  // I am assuming there will be (in general) more QM atoms among all QM groups
3065  // than MM atoms in a patch.
3066  int procAtms = 0;
3067 
3068  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3069 
3070  for (int i=0; i<numQMAtms; i++) {
3071 
3072  if (qmAtmIndx[i] == xExt[paIter].id) {
3073 
3074  qmLoclIndx[procAtms] = paIter ;
3075  qmLocalCharges[procAtms] = qmAtmChrg[i];
3076 
3077  procAtms++;
3078  break;
3079  }
3080 
3081  }
3082 
3083  if (procAtms == numLocalQMAtoms)
3084  break;
3085  }
3086 
3087  doWork();
3088  return ;
3089 }
3090 
3092 {
3093  DebugM(4,"Entering ComputePme::doWork().\n");
3094 
3096 #ifdef NAMD_CUDA
3098 #else
3100 #endif
3101  ungridForces();
3102  // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3103  if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
3104  return;
3105  }
3107  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3108 
3109 #ifdef TRACE_COMPUTE_OBJECTS
3110  double traceObjStartTime = CmiWallTimer();
3111 #endif
3112 
3113 #ifdef NAMD_CUDA
3114  if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
3115 #endif
3116 
3117  // allocate storage
3118  numLocalAtoms = patch->getNumAtoms();
3119 
3120  Lattice &lattice = patch->flags.lattice;
3121 
3122  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
3123  localData = localData_alloc.begin();
3124  localPartition_alloc.resize(numLocalAtoms);
3125  localPartition = localPartition_alloc.begin();
3126 
3127  int g;
3128  for ( g=0; g<numGrids; ++g ) {
3129  localGridData[g] = localData + numLocalAtoms*(g+1);
3130  }
3131 
3132  // get positions and charges
3133  PmeParticle * data_ptr = localData;
3134  unsigned char * part_ptr = localPartition;
3135  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
3137 
3138  {
3139  CompAtom *x = positionBox->open();
3140  // CompAtomExt *xExt = patch->getCompAtomExtInfo();
3141  if ( patch->flags.doMolly ) {
3142  positionBox->close(&x);
3143  x = avgPositionBox->open();
3144  }
3145  int numAtoms = patch->getNumAtoms();
3146 
3147  for(int i=0; i<numAtoms; ++i)
3148  {
3149  data_ptr->x = x[i].position.x;
3150  data_ptr->y = x[i].position.y;
3151  data_ptr->z = x[i].position.z;
3152  data_ptr->cg = coulomb_sqrt * x[i].charge;
3153  ++data_ptr;
3154  *part_ptr = x[i].partition;
3155  ++part_ptr;
3156  }
3157 
3158  // QM loop to overwrite charges of QM atoms.
3159  // They are zero for NAMD, but are updated in ComputeQM.
3160  if ( qmForcesOn ) {
3161 
3162  for(int i=0; i<numLocalQMAtoms; ++i)
3163  {
3164  localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
3165  }
3166 
3167  }
3168 
3169  if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
3170  else { positionBox->close(&x); }
3171  }
3172 
3173  // copy to other grids if needed
3174  if ( (alchOn && (!alchDecouple)) || lesOn ) {
3175  for ( g=0; g<numGrids; ++g ) {
3176  PmeParticle *lgd = localGridData[g];
3177  if (g < 2) {
3178  int nga = 0;
3179  for(int i=0; i<numLocalAtoms; ++i) {
3180  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3)) {
3181  // for FEP/TI: grid 0 gets non-alch + partition 1 + partition 3;
3182  // grid 1 gets non-alch + partition 2 + + partition 4;
3183  lgd[nga++] = localData[i];
3184  }
3185  }
3186  numGridAtoms[g] = nga;
3187  } else {
3188  int nga = 0;
3189  for(int i=0; i<numLocalAtoms; ++i) {
3190  if ( localPartition[i] == 0 ) {
3191  // grid 2 (only if called for with numGrids=3) gets only non-alch
3192  lgd[nga++] = localData[i];
3193  }
3194  }
3195  numGridAtoms[g] = nga;
3196  }
3197  }
3198  } else if ( alchOn && alchDecouple) {
3199  // alchemical decoupling: four grids
3200  // g=0: partition 0 and partition 1
3201  // g=1: partition 0 and partition 2
3202  // g=2: only partition 1 atoms
3203  // g=3: only partition 2 atoms
3204  // plus one grid g=4, only partition 0, if numGrids=5
3205  for ( g=0; g<2; ++g ) { // same as before for first 2
3206  PmeParticle *lgd = localGridData[g];
3207  int nga = 0;
3208  for(int i=0; i<numLocalAtoms; ++i) {
3209  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
3210  lgd[nga++] = localData[i];
3211  }
3212  }
3213  numGridAtoms[g] = nga;
3214  }
3215  for (g=2 ; g<4 ; ++g ) { // only alchemical atoms for these 2
3216  PmeParticle *lgd = localGridData[g];
3217  int nga = 0;
3218  for(int i=0; i<numLocalAtoms; ++i) {
3219  if ( localPartition[i] == (g-1) ) {
3220  lgd[nga++] = localData[i];
3221  }
3222  }
3223  numGridAtoms[g] = nga;
3224  }
3225  for (g=4 ; g<numGrids ; ++g ) { // only non-alchemical atoms
3226  // numGrids=5 only if alchElecLambdaStart > 0
3227  PmeParticle *lgd = localGridData[g];
3228  int nga = 0;
3229  for(int i=0; i<numLocalAtoms; ++i) {
3230  if ( localPartition[i] == 0 ) {
3231  lgd[nga++] = localData[i];
3232  }
3233  }
3234  numGridAtoms[g] = nga;
3235  }
3236  } else if ( selfOn ) {
3237  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
3238  g = 0;
3239  PmeParticle *lgd = localGridData[g];
3240  int nga = 0;
3241  for(int i=0; i<numLocalAtoms; ++i) {
3242  if ( localPartition[i] == 1 ) {
3243  lgd[nga++] = localData[i];
3244  }
3245  }
3246  numGridAtoms[g] = nga;
3247  } else if ( pairOn ) {
3248  if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
3249  g = 0;
3250  PmeParticle *lgd = localGridData[g];
3251  int nga = 0;
3252  for(int i=0; i<numLocalAtoms; ++i) {
3253  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
3254  lgd[nga++] = localData[i];
3255  }
3256  }
3257  numGridAtoms[g] = nga;
3258  for ( g=1; g<3; ++g ) {
3259  PmeParticle *lgd = localGridData[g];
3260  int nga = 0;
3261  for(int i=0; i<numLocalAtoms; ++i) {
3262  if ( localPartition[i] == g ) {
3263  lgd[nga++] = localData[i];
3264  }
3265  }
3266  numGridAtoms[g] = nga;
3267  }
3268  } else {
3269  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
3270  localGridData[0] = localData;
3271  numGridAtoms[0] = numLocalAtoms;
3272  }
3273 
3274  if ( ! myMgr->doWorkCount ) {
3275  myMgr->doWorkCount = myMgr->pmeComputes.size();
3276 
3277 #ifdef NAMD_CUDA
3278  if ( ! offload )
3279 #endif // NAMD_CUDA
3280  {
3281  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
3282 
3283  for (int i=0; i<myMgr->q_count; ++i) {
3284  memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
3285  }
3286  }
3287 
3288  for ( g=0; g<numGrids; ++g ) {
3289  myMgr->evir[g] = 0;
3290  }
3291 
3292  myMgr->strayChargeErrors = 0;
3293 
3294  myMgr->compute_sequence = sequence();
3295  }
3296 
3297  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
3298 
3299  int strayChargeErrors = 0;
3300 
3301  // calculate self energy
3303  for ( g=0; g<numGrids; ++g ) {
3304  BigReal selfEnergy = 0;
3305  data_ptr = localGridData[g];
3306  int i;
3307  for(i=0; i<numGridAtoms[g]; ++i)
3308  {
3309  selfEnergy += data_ptr->cg * data_ptr->cg;
3310  ++data_ptr;
3311  }
3312  selfEnergy *= -1. * ewaldcof / SQRT_PI;
3313  myMgr->evir[g][0] += selfEnergy;
3314 
3315  float **q = myMgr->q_arr + g*myMgr->fsize;
3316  char *f = myMgr->f_arr + g*myMgr->fsize;
3317 
3318  scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
3319 #ifdef NAMD_CUDA
3320  if ( offload ) {
3321  if ( myMgr->cuda_atoms_alloc == 0 ) { // first call
3322  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3323  cuda_errcheck("before malloc atom data for pme");
3324  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3325  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3326  cuda_errcheck("malloc atom data for pme");
3327  myMgr->cuda_atoms_count = 0;
3328  }
3329  cuda_atoms_offset = myMgr->cuda_atoms_count;
3330  int n = numGridAtoms[g];
3331  myMgr->cuda_atoms_count += n;
3332  if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
3333  CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
3334  CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
3335  cuda_errcheck("before malloc expanded atom data for pme");
3336  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3337  const float *a_data_host_old = myMgr->a_data_host;
3338  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3339  cuda_errcheck("malloc expanded host atom data for pme");
3340  memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
3341  cudaFreeHost((void*) a_data_host_old);
3342  cuda_errcheck("free expanded host atom data for pme");
3343  cudaFree(myMgr->a_data_dev);
3344  cuda_errcheck("free expanded dev atom data for pme");
3345  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3346  cuda_errcheck("malloc expanded dev atom data for pme");
3347  }
3348  float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
3349  data_ptr = localGridData[g];
3350  double order_1 = myGrid.order - 1;
3351  double K1 = myGrid.K1;
3352  double K2 = myGrid.K2;
3353  double K3 = myGrid.K3;
3354  int found_negative = 0;
3355  for ( int i=0; i<n; ++i ) {
3356  if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
3357  found_negative = 1;
3358  // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
3359  }
3360  double x_int = (int) data_ptr[i].x;
3361  double y_int = (int) data_ptr[i].y;
3362  double z_int = (int) data_ptr[i].z;
3363  a_data_host[7*i ] = data_ptr[i].x - x_int; // subtract in double precision
3364  a_data_host[7*i+1] = data_ptr[i].y - y_int;
3365  a_data_host[7*i+2] = data_ptr[i].z - z_int;
3366  a_data_host[7*i+3] = data_ptr[i].cg;
3367  x_int -= order_1; if ( x_int < 0 ) x_int += K1;
3368  y_int -= order_1; if ( y_int < 0 ) y_int += K2;
3369  z_int -= order_1; if ( z_int < 0 ) z_int += K3;
3370  a_data_host[7*i+4] = x_int;
3371  a_data_host[7*i+5] = y_int;
3372  a_data_host[7*i+6] = z_int;
3373  }
3374  if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
3375  } else
3376 #endif // NAMD_CUDA
3377  {
3378  myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
3379  myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
3380  }
3381  }
3382  myMgr->strayChargeErrors += strayChargeErrors;
3383 
3384 #ifdef TRACE_COMPUTE_OBJECTS
3385  traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
3386 #endif
3387 
3388  if ( --(myMgr->doWorkCount) == 0 ) {
3389 // cudaDeviceSynchronize(); // XXXX
3390 #ifdef NAMD_CUDA
3391  if ( offload ) {
3393  args.mgr = myMgr;
3394  args.lattice = &lattice;
3395  args.sequence = sequence();
3396  CmiLock(ComputePmeMgr::cuda_lock);
3397  if ( ComputePmeMgr::cuda_busy ) {
3399  } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
3400  // avoid adding work to nonbonded data preparation pe
3401  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3402  } else {
3403  ComputePmeMgr::cuda_busy = true;
3404  while ( 1 ) {
3405  CmiUnlock(ComputePmeMgr::cuda_lock);
3406  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3407  CmiLock(ComputePmeMgr::cuda_lock);
3411  } else {
3412  ComputePmeMgr::cuda_busy = false;
3413  break;
3414  }
3415  }
3416  }
3417  CmiUnlock(ComputePmeMgr::cuda_lock);
3418  } else
3419 #endif // NAMD_CUDA
3420  {
3421  myMgr->chargeGridReady(lattice,sequence());
3422  }
3423  }
3424  atomsChanged = 0;
3425 }
3426 
3427 #ifdef NAMD_CUDA
3428 
3429 void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {
3430 
3431  int n = cuda_atoms_count;
3432  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3433  cuda_atoms_count = 0;
3434 
3435  const double before = CmiWallTimer();
3436  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3437  cudaMemcpyHostToDevice, streams[stream]);
3438  const double after = CmiWallTimer();
3439 
3440  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3441 
3442  cuda_pme_charges(
3443  bspline_coeffs_dev,
3444  q_arr_dev, ffz_dev, ffz_dev + fsize,
3445  a_data_dev, n,
3446  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3447  streams[stream]);
3448  const double after2 = CmiWallTimer();
3449 
3450  chargeGridSubmitted(lattice,sequence); // must be inside lock
3451 
3452  masterPmeMgr->charges_time = before;
3453  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3454  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3455 }
3456 
3457 void cuda_check_pme_charges(void *arg, double walltime) {
3458  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
3459 
3460  cudaError_t err = cudaEventQuery(argp->end_charges);
3461  if ( err == cudaSuccess ) {
3462  traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
3463  argp->charges_time = walltime - argp->charges_time;
3464  argp->sendChargeGridReady();
3465  argp->check_charges_count = 0;
3466  } else if ( err != cudaErrorNotReady ) {
3467  char errmsg[256];
3468  sprintf(errmsg,"in cuda_check_pme_charges after polling %d times over %f s on seq %d",
3469  argp->check_charges_count, walltime - argp->charges_time,
3470  argp->saved_sequence);
3471  cudaDie(errmsg,err);
3472  } else if ( ++(argp->check_charges_count) >= count_limit ) {
3473  char errmsg[256];
3474  sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
3475  argp->check_charges_count, walltime - argp->charges_time,
3476  argp->saved_sequence);
3477  cudaDie(errmsg,err);
3478  } else {
3479  CcdCallBacksReset(0,walltime); // fix Charm++
3481  }
3482 }
3483 
3484 void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
3485  saved_lattice = &lattice;
3486  saved_sequence = sequence;
3487 
3488  // cudaDeviceSynchronize(); // XXXX TESTING
3489  //int q_stride = myGrid.K3+myGrid.order-1;
3490  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3491  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3492  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3493  // }
3494  //}
3495  //CmiLock(cuda_lock);
3496 
3497  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3498  double before = CmiWallTimer();
3499  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3500  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3501  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3502  cudaMemcpyDeviceToHost, streams[stream]);
3503  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3504  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3505  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3506  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3507  //CmiUnlock(cuda_lock);
3508  // cudaDeviceSynchronize(); // XXXX TESTING
3509  // cuda_errcheck("after memcpy grid to host");
3510 
3511  SimParameters *simParams = Node::Object()->simParameters;
3512  if ( ! simParams->useCUDA2 ) {
3513  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
3514  cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
3515  }
3516 
3517  pmeProxy[master_pe].pollChargeGridReady();
3518  }
3519 }
3520 
3522  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3523  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3524  int cs = mgr->pmeComputes.size();
3525  if ( cs ) {
3526  mgr->ungridForcesCount = cs;
3527  mgr->recipEvirCount = mgr->recipEvirClients;
3528  masterPmeMgr->chargeGridSubmittedCount++;
3529  }
3530  }
3531  pmeProxy[master_pe].recvChargeGridReady();
3532 }
3533 #endif // NAMD_CUDA
3534 
3536 #ifdef NAMD_CUDA
3537  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
3539 #else
3540  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3541 #endif
3542 }
3543 
3546 }
3547 
3548 void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
3549 
3550 #ifdef NAMD_CUDA
3551  if ( offload ) {
3552  int errcount = 0;
3553  int q_stride = myGrid.K3+myGrid.order-1;
3554  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3555  f_arr[j] = ffz_host[j];
3556  if ( ffz_host[j] & ~1 ) ++errcount;
3557  }
3558  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3559  }
3560 #endif
3561  recipEvirCount = recipEvirClients;
3562  ungridForcesCount = pmeComputes.size();
3563 
3564  for (int j=0; j<myGrid.order-1; ++j) {
3565  fz_arr[j] |= fz_arr[myGrid.K3+j];
3566  }
3567 
3568  if ( usePencils ) {
3569  sendPencils(lattice,sequence);
3570  } else {
3571  sendData(lattice,sequence);
3572  }
3573 }
3574 
3575 
3576 void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {
3577 
3578  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3579 
3580 #if 0 && USE_PERSISTENT
3581  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3582 #endif
3583  int K1 = myGrid.K1;
3584  int K2 = myGrid.K2;
3585  int dim2 = myGrid.dim2;
3586  int dim3 = myGrid.dim3;
3587  int block1 = myGrid.block1;
3588  int block2 = myGrid.block2;
3589 
3590  // int savedMessages = 0;
3591  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3592 
3593  for (int ap=first; ap<=last; ++ap) {
3594  int ib = activePencils[ap].i;
3595  int jb = activePencils[ap].j;
3596  int ibegin = ib*block1;
3597  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3598  int jbegin = jb*block2;
3599  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3600  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3601 
3602  int fcount = 0;
3603  for ( int g=0; g<numGrids; ++g ) {
3604  char *f = f_arr + g*fsize;
3605 #ifdef NAMD_CUDA
3606  if ( offload ) {
3607  int errcount = 0;
3608  for ( int i=ibegin; i<iend; ++i ) {
3609  for ( int j=jbegin; j<jend; ++j ) {
3610  int k = i*dim2+j;
3611  f[k] = ffz_host[k];
3612  fcount += f[k];
3613  if ( ffz_host[k] & ~1 ) ++errcount;
3614  }
3615  }
3616  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3617  } else
3618 #endif
3619  for ( int i=ibegin; i<iend; ++i ) {
3620  for ( int j=jbegin; j<jend; ++j ) {
3621  fcount += f[i*dim2+j];
3622  }
3623  }
3624  }
3625 
3626 #ifdef NETWORK_PROGRESS
3627  CmiNetworkProgress();
3628 #endif
3629 
3630  if ( ! pencilActive[ib*yBlocks+jb] )
3631  NAMD_bug("PME activePencils list inconsistent");
3632 
3633  int zlistlen = 0;
3634  for ( int i=0; i<myGrid.K3; ++i ) {
3635  if ( fz_arr[i] ) ++zlistlen;
3636  }
3637 
3638  int hd = ( fcount? 1 : 0 ); // has data?
3639  // if ( ! hd ) ++savedMessages;
3640 
3641 
3642  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3643  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3644  msg->sourceNode = sourcepe;
3645  msg->hasData = hd;
3646  msg->lattice = lattice;
3647  if ( hd ) {
3648 #if 0
3649  msg->start = fstart;
3650  msg->len = flen;
3651 #else
3652  msg->start = -1; // obsolete?
3653  msg->len = -1; // obsolete?
3654 #endif
3655  msg->zlistlen = zlistlen;
3656  int *zlist = msg->zlist;
3657  zlistlen = 0;
3658  for ( int i=0; i<myGrid.K3; ++i ) {
3659  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3660  }
3661  char *fmsg = msg->fgrid;
3662  float *qmsg = msg->qgrid;
3663  for ( int g=0; g<numGrids; ++g ) {
3664  char *f = f_arr + g*fsize;
3665  float **q = q_arr + g*fsize;
3666  for ( int i=ibegin; i<iend; ++i ) {
3667  for ( int j=jbegin; j<jend; ++j ) {
3668  *(fmsg++) = f[i*dim2+j];
3669  if( f[i*dim2+j] ) {
3670  for (int h=0; h<myGrid.order-1; ++h) {
3671  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3672  }
3673  for ( int k=0; k<zlistlen; ++k ) {
3674  *(qmsg++) = q[i*dim2+j][zlist[k]];
3675  }
3676  }
3677  }
3678  }
3679  }
3680  }
3681 
3682  msg->sequence = compute_sequence;
3683  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3684  CmiEnableUrgentSend(1);
3685 #if USE_NODE_PAR_RECEIVE
3686  msg->destElem=CkArrayIndex3D(ib,jb,0);
3687  CProxy_PmePencilMap lzm = npMgr->zm;
3688  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3689  int destnode = CmiNodeOf(destproc);
3690 
3691 #if 0
3692  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3693 #endif
3694  pmeNodeProxy[destnode].recvZGrid(msg);
3695 #if 0
3696  CmiUsePersistentHandle(NULL, 0);
3697 #endif
3698 #else
3699 #if 0
3700  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3701 #endif
3702  zPencil(ib,jb,0).recvGrid(msg);
3703 #if 0
3704  CmiUsePersistentHandle(NULL, 0);
3705 #endif
3706 #endif
3707  CmiEnableUrgentSend(0);
3708  }
3709 
3710 
3711  // if ( savedMessages ) {
3712  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3713  // }
3714 
3715 }
3716 
3717 
3719  nodePmeMgr->sendPencilsHelper(iter);
3720 }
3721 
3723 #ifdef NAMD_CUDA
3724  ComputePmeMgr *obj = masterPmeMgr;
3726 #else
3727  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
3728 #endif
3729 }
3730 
3731 void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {
3732 
3733  sendDataHelper_lattice = &lattice;
3734  sendDataHelper_sequence = sequence;
3735  sendDataHelper_sourcepe = CkMyPe();
3736 
3737 #ifdef NAMD_CUDA
3738  if ( offload ) {
3739  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3740 #if CMK_MULTICORE
3741  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3742  int ib = activePencils[ap].i;
3743  int jb = activePencils[ap].j;
3744  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3745  pmeProxy[destproc].sendPencilsHelper(ap);
3746 #else
3747  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3748 #endif
3749  }
3750  } else
3751 #endif
3752  {
3753  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3754  }
3755 
3756  if ( strayChargeErrors ) {
3757  strayChargeErrors = 0;
3758  iout << iERROR << "Stray PME grid charges detected: "
3759  << CkMyPe() << " sending to (x,y)";
3760  int K1 = myGrid.K1;
3761  int K2 = myGrid.K2;
3762  int dim2 = myGrid.dim2;
3763  int block1 = myGrid.block1;
3764  int block2 = myGrid.block2;
3765  for (int ib=0; ib<xBlocks; ++ib) {
3766  for (int jb=0; jb<yBlocks; ++jb) {
3767  int ibegin = ib*block1;
3768  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3769  int jbegin = jb*block2;
3770  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3771  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3772 
3773  for ( int g=0; g<numGrids; ++g ) {
3774  char *f = f_arr + g*fsize;
3775  if ( ! pencilActive[ib*yBlocks+jb] ) {
3776  for ( int i=ibegin; i<iend; ++i ) {
3777  for ( int j=jbegin; j<jend; ++j ) {
3778  if ( f[i*dim2+j] == 3 ) {
3779  f[i*dim2+j] = 2;
3780  iout << " (" << i << "," << j << ")";
3781  }
3782  }
3783  }
3784  }
3785  }
3786  }
3787  }
3788  iout << "\n" << endi;
3789  }
3790 
3791 }
3792 
3793 
3795 
3796  int K1 = myGrid.K1;
3797  int K2 = myGrid.K2;
3798  int dim2 = myGrid.dim2;
3799  int dim3 = myGrid.dim3;
3800  int block1 = myGrid.block1;
3801  int block2 = myGrid.block2;
3802 
3803  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3804  int ib = msg->sourceNode / yBlocks;
3805  int jb = msg->sourceNode % yBlocks;
3806 
3807  int ibegin = ib*block1;
3808  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3809  int jbegin = jb*block2;
3810  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3811 
3812  int zlistlen = msg->zlistlen;
3813  int *zlist = msg->zlist;
3814  float *qmsg = msg->qgrid;
3815  int g;
3816  for ( g=0; g<numGrids; ++g ) {
3817  char *f = f_arr + g*fsize;
3818  float **q = q_arr + g*fsize;
3819  for ( int i=ibegin; i<iend; ++i ) {
3820  for ( int j=jbegin; j<jend; ++j ) {
3821  if( f[i*dim2+j] ) {
3822  f[i*dim2+j] = 0;
3823  for ( int k=0; k<zlistlen; ++k ) {
3824  q[i*dim2+j][zlist[k]] = *(qmsg++);
3825  }
3826  for (int h=0; h<myGrid.order-1; ++h) {
3827  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3828  }
3829  }
3830  }
3831  }
3832  }
3833 }
3834 
3835 
3836 void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {
3837 
3838  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3839 
3840  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3841 
3842  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3843  for (int j=first; j<=last; j++) {
3844  int pe = gridPeOrder[j]; // different order
3845  if ( ! recipPeDest[pe] && ! errors ) continue;
3846  int start = pe * bsize;
3847  int len = bsize;
3848  if ( start >= qsize ) { start = 0; len = 0; }
3849  if ( start + len > qsize ) { len = qsize - start; }
3850  int zdim = myGrid.dim3;
3851  int fstart = start / zdim;
3852  int flen = len / zdim;
3853  int fcount = 0;
3854  int i;
3855 
3856  int g;
3857  for ( g=0; g<numGrids; ++g ) {
3858  char *f = f_arr + fstart + g*fsize;
3859 #ifdef NAMD_CUDA
3860  if ( offload ) {
3861  int errcount = 0;
3862  for ( i=0; i<flen; ++i ) {
3863  f[i] = ffz_host[fstart+i];
3864  fcount += f[i];
3865  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3866  }
3867  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3868  } else
3869 #endif
3870  for ( i=0; i<flen; ++i ) {
3871  fcount += f[i];
3872  }
3873  if ( ! recipPeDest[pe] ) {
3874  int errfound = 0;
3875  for ( i=0; i<flen; ++i ) {
3876  if ( f[i] == 3 ) {
3877  errfound = 1;
3878  break;
3879  }
3880  }
3881  if ( errfound ) {
3882  iout << iERROR << "Stray PME grid charges detected: "
3883  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3884  int iz = -1;
3885  for ( i=0; i<flen; ++i ) {
3886  if ( f[i] == 3 ) {
3887  f[i] = 2;
3888  int jz = (i+fstart)/myGrid.K2;
3889  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3890  }
3891  }
3892  iout << "\n" << endi;
3893  }
3894  }
3895  }
3896 
3897 #ifdef NETWORK_PROGRESS
3898  CmiNetworkProgress();
3899 #endif
3900 
3901  if ( ! recipPeDest[pe] ) continue;
3902 
3903  int zlistlen = 0;
3904  for ( i=0; i<myGrid.K3; ++i ) {
3905  if ( fz_arr[i] ) ++zlistlen;
3906  }
3907 
3908  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3909  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3910 
3911  msg->sourceNode = sourcepe;
3912  msg->lattice = lattice;
3913  msg->start = fstart;
3914  msg->len = flen;
3915  msg->zlistlen = zlistlen;
3916  int *zlist = msg->zlist;
3917  zlistlen = 0;
3918  for ( i=0; i<myGrid.K3; ++i ) {
3919  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3920  }
3921  float *qmsg = msg->qgrid;
3922  for ( g=0; g<numGrids; ++g ) {
3923  char *f = f_arr + fstart + g*fsize;
3924  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
3925  float **q = q_arr + fstart + g*fsize;
3926  for ( i=0; i<flen; ++i ) {
3927  if ( f[i] ) {
3928  for (int h=0; h<myGrid.order-1; ++h) {
3929  q[i][h] += q[i][myGrid.K3+h];
3930  }
3931  for ( int k=0; k<zlistlen; ++k ) {
3932  *(qmsg++) = q[i][zlist[k]];
3933  }
3934  }
3935  }
3936  }
3937 
3938  msg->sequence = compute_sequence;
3939  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3940  pmeProxy[gridPeMap[pe]].recvGrid(msg);
3941  }
3942 
3943 }
3944 
3946  nodePmeMgr->sendDataHelper(iter);
3947 }
3948 
3950 #ifdef NAMD_CUDA
3951  ComputePmeMgr *obj = masterPmeMgr;
3953 #else
3954  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
3955 #endif
3956 }
3957 
3958 void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {
3959 
3960  sendDataHelper_lattice = &lattice;
3961  sendDataHelper_sequence = sequence;
3962  sendDataHelper_sourcepe = CkMyPe();
3963  sendDataHelper_errors = strayChargeErrors;
3964  strayChargeErrors = 0;
3965 
3966 #ifdef NAMD_CUDA
3967  if ( offload ) {
3968  for ( int i=0; i < numGridPes; ++i ) {
3969  int pe = gridPeOrder[i]; // different order
3970  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
3971 #if CMK_MULTICORE
3972  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3973  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
3974 #else
3975  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
3976 #endif
3977  }
3978  } else
3979 #endif
3980  {
3981  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
3982  }
3983 
3984 }
3985 
3987 
3988  int zdim = myGrid.dim3;
3989  int flen = msg->len;
3990  int fstart = msg->start;
3991  int zlistlen = msg->zlistlen;
3992  int *zlist = msg->zlist;
3993  float *qmsg = msg->qgrid;
3994  int g;
3995  for ( g=0; g<numGrids; ++g ) {
3996  char *f = msg->fgrid + g*flen;
3997  float **q = q_arr + fstart + g*fsize;
3998  for ( int i=0; i<flen; ++i ) {
3999  if ( f[i] ) {
4000  f[i] = 0;
4001  for ( int k=0; k<zlistlen; ++k ) {
4002  q[i][zlist[k]] = *(qmsg++);
4003  }
4004  for (int h=0; h<myGrid.order-1; ++h) {
4005  q[i][myGrid.K3+h] = q[i][h];
4006  }
4007  }
4008  }
4009  }
4010 }
4011 
4013 
4014  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
4015 
4016  SimParameters *simParams = Node::Object()->simParameters;
4017 
4018  localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
4019  Vector *localResults = localResults_alloc.begin();
4020  Vector *gridResults;
4021 
4022  if ( alchOn || lesOn || selfOn || pairOn ) {
4023  for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
4024  gridResults = localResults + numLocalAtoms;
4025  } else {
4026  gridResults = localResults;
4027  }
4028 
4029  Vector pairForce = 0.;
4030  Lattice &lattice = patch->flags.lattice;
4031  int g = 0;
4032  if(!simParams->commOnly) {
4033  for ( g=0; g<numGrids; ++g ) {
4034 #ifdef NETWORK_PROGRESS
4035  CmiNetworkProgress();
4036 #endif
4037 
4038 #ifdef NAMD_CUDA
4039  if ( offload ) {
4040  int errfound = 0;
4041  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4042  // Neither isnan() nor x != x worked when testing on Cray; this does.
4043  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; } // CUDA NaN
4044  gridResults[i].x = f_data_host[3*i];
4045  gridResults[i].y = f_data_host[3*i+1];
4046  gridResults[i].z = f_data_host[3*i+2];
4047  }
4048  if ( errfound ) {
4049  int errcount = 0;
4050  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4051  float f = f_data_host[3*i];
4052  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { // CUDA NaN
4053  ++errcount;
4054  gridResults[i] = 0.;
4055  }
4056  }
4057  iout << iERROR << "Stray PME grid charges detected: "
4058  << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
4059  }
4060  } else
4061 #endif // NAMD_CUDA
4062  {
4063  myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
4064  }
4065  scale_forces(gridResults, numGridAtoms[g], lattice);
4066 
4067  if (alchOn) {
4068  float scale = 1.;
4069  BigReal elecLambdaUp, elecLambdaDown;
4070  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4071  myMgr->alchLambda = alchLambda;
4072  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4073  myMgr->alchLambda2 = alchLambda2;
4074  elecLambdaUp = simParams->getElecLambda(alchLambda);
4075  elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
4076 
4077  if ( g == 0 ) scale = elecLambdaUp;
4078  else if ( g == 1 ) scale = elecLambdaDown;
4079  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4080 
4081  if (alchDecouple) {
4082  if ( g == 2 ) scale = 1 - elecLambdaUp;
4083  else if ( g == 3 ) scale = 1 - elecLambdaDown;
4084  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4085  }
4086  int nga = 0;
4087  if (!alchDecouple) {
4088  if (g < 2 ) {
4089  for(int i=0; i<numLocalAtoms; ++i) {
4090  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
4091  // (g=0: only partition 0 and partiton 1 and partion 3)
4092  // (g=1: only partition 0 and partiton 2 and partion 4)
4093  localResults[i] += gridResults[nga++] * scale;
4094  }
4095  }
4096  } else {
4097  for(int i=0; i<numLocalAtoms; ++i) {
4098  if ( localPartition[i] == 0 ) {
4099  // (g=2: only partition 0)
4100  localResults[i] += gridResults[nga++] * scale;
4101  }
4102  }
4103  }
4104  } else { // alchDecouple
4105  if ( g < 2 ) {
4106  for(int i=0; i<numLocalAtoms; ++i) {
4107  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4108  // g = 0: partition 0 or partition 1
4109  // g = 1: partition 0 or partition 2
4110  localResults[i] += gridResults[nga++] * scale;
4111  }
4112  }
4113  }
4114  else {
4115  for(int i=0; i<numLocalAtoms; ++i) {
4116  if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
4117  // g = 2: partition 1 only
4118  // g = 3: partition 2 only
4119  // g = 4: partition 0 only
4120  localResults[i] += gridResults[nga++] * scale;
4121  }
4122  }
4123  }
4124  }
4125  } else if ( lesOn ) {
4126  float scale = 1.;
4127  if ( alchFepOn ) {
4128  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4129  myMgr->alchLambda = alchLambda;
4130  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4131  myMgr->alchLambda2 = alchLambda2;
4132  if ( g == 0 ) scale = alchLambda;
4133  else if ( g == 1 ) scale = 1. - alchLambda;
4134  } else if ( lesOn ) {
4135  scale = 1.0 / (float)lesFactor;
4136  }
4137  int nga = 0;
4138  for(int i=0; i<numLocalAtoms; ++i) {
4139  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4140  localResults[i] += gridResults[nga++] * scale;
4141  }
4142  }
4143  } else if ( selfOn ) {
4144  PmeParticle *lgd = localGridData[g];
4145  int nga = 0;
4146  for(int i=0; i<numLocalAtoms; ++i) {
4147  if ( localPartition[i] == 1 ) {
4148  pairForce += gridResults[nga]; // should add up to almost zero
4149  localResults[i] += gridResults[nga++];
4150  }
4151  }
4152  } else if ( pairOn ) {
4153  if ( g == 0 ) {
4154  int nga = 0;
4155  for(int i=0; i<numLocalAtoms; ++i) {
4156  if ( localPartition[i] == 1 ) {
4157  pairForce += gridResults[nga];
4158  }
4159  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
4160  localResults[i] += gridResults[nga++];
4161  }
4162  }
4163  } else if ( g == 1 ) {
4164  int nga = 0;
4165  for(int i=0; i<numLocalAtoms; ++i) {
4166  if ( localPartition[i] == g ) {
4167  pairForce -= gridResults[nga]; // should add up to almost zero
4168  localResults[i] -= gridResults[nga++];
4169  }
4170  }
4171  } else {
4172  int nga = 0;
4173  for(int i=0; i<numLocalAtoms; ++i) {
4174  if ( localPartition[i] == g ) {
4175  localResults[i] -= gridResults[nga++];
4176  }
4177  }
4178  }
4179  }
4180  }
4181  }
4182 
4183  Vector *results_ptr = localResults;
4184 
4185  // add in forces
4186  {
4187  Results *r = forceBox->open();
4188  Force *f = r->f[Results::slow];
4189  int numAtoms = patch->getNumAtoms();
4190 
4191  if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
4192  for(int i=0; i<numAtoms; ++i) {
4193  f[i].x += results_ptr->x;
4194  f[i].y += results_ptr->y;
4195  f[i].z += results_ptr->z;
4196  ++results_ptr;
4197  }
4198  }
4199  forceBox->close(&r);
4200  }
4201 
4202  if ( pairOn || selfOn ) {
4203  ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
4204  }
4205 
4206 }
4207 
4209 
4210  SimParameters *simParams = Node::Object()->simParameters;
4211 
4212  for ( int g=0; g<numGrids; ++g ) {
4213  float scale = 1.;
4214  if (alchOn) {
4215  BigReal elecLambdaUp, elecLambdaDown;
4216  // alchLambda set on each step in ComputePme::ungridForces()
4217  if ( alchLambda < 0 || alchLambda > 1 ) {
4218  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4219  }
4220  elecLambdaUp = simParams->getElecLambda(alchLambda);
4221  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4222  if ( g == 0 ) scale = elecLambdaUp;
4223  else if ( g == 1 ) scale = elecLambdaDown;
4224  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4225  if (alchDecouple) {
4226  if ( g == 2 ) scale = 1-elecLambdaUp;
4227  else if ( g == 3 ) scale = 1-elecLambdaDown;
4228  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4229  }
4230  } else if ( lesOn ) {
4231  scale = 1.0 / lesFactor;
4232  } else if ( pairOn ) {
4233  scale = ( g == 0 ? 1. : -1. );
4234  }
4235  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4236  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4237  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4238  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4239  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4240  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4241  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4242  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4243  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4244  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4245 
4246  float scale2 = 0.;
4247 
4248  // why is this declared/defined again here?
4249  SimParameters *simParams = Node::Object()->simParameters;
4250 
4251  if (alchFepOn) {
4252  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4253  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4254  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4255  if ( g == 0 ) scale2 = elecLambda2Up;
4256  else if ( g == 1 ) scale2 = elecLambda2Down;
4257  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4258  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4259  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4260  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4261  }
4262  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4263 
4264  if (alchThermIntOn) {
4265 
4266  // no decoupling:
4267  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4268  // (interactions between all atoms [partition 0 OR partition 1],
4269  // minus all [within partition 0])
4270  // U = elecLambdaUp * (U[0] - U[2])
4271  // dU/dl = U[0] - U[2];
4272 
4273  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4274  // (interactions between all atoms [partition 0 OR partition 2],
4275  // minus all [within partition 0])
4276  // U = elecLambdaDown * (U[1] - U[2])
4277  // dU/dl = U[1] - U[2];
4278 
4279  // alchDecouple:
4280  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4281  // (interactions between all atoms [partition 0 OR partition 1]
4282  // minus all [within partition 1] minus all [within partition 0]
4283  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4284  // dU/dl = U[0] - U[2] - U[4];
4285 
4286  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4287  // (interactions between all atoms [partition 0 OR partition 2]
4288  // minus all [within partition 2] minus all [within partition 0]
4289  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4290  // dU/dl = U[1] - U[3] - U[4];
4291 
4292 
4293  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4294  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4295  if (!alchDecouple) {
4296  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4297  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4298  }
4299  else { // alchDecouple
4300  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4301  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4302  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4303  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4304  }
4305  }
4306  }
4307 
4308  alchLambda = -1.; // illegal value to catch if not updated
4309 
4310  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4311  reduction->submit();
4312 
4313  for ( int i=0; i<heldComputes.size(); ++i ) {
4314  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4315  }
4316  heldComputes.resize(0);
4317 }
4318 
4319 #if USE_TOPOMAP
4320 
4321 #define NPRIMES 8
4322 const static unsigned int NAMDPrimes[] = {
4323  3,
4324  5,
4325  7,
4326  11,
4327  13,
4328  17,
4329  19,
4330  23,
4331  29,
4332  31,
4333  37,
4334  59,
4335  73,
4336  93,
4337  113,
4338  157,
4339  307,
4340  617,
4341  1217 //This should b enough for 64K nodes of BGL.
4342 };
4343 
4344 #include "RecBisection.h"
4345 
4346 /***-----------------------------------------------------**********
4347  The Orthogonal Recursive Bisection strategy, which allocates PME
4348  objects close to the patches they communicate, and at the same
4349  time spreads them around the grid
4350 ****----------------------------------------------------------****/
4351 
4352 bool generateBGLORBPmePeList(int *pemap, int numPes,
4353  int *block_pes, int nbpes) {
4354 
4355  PatchMap *pmap = PatchMap::Object();
4356  int *pmemap = new int [CkNumPes()];
4357 
4358  if (pemap == NULL)
4359  return false;
4360 
4361  TopoManager tmgr;
4362 
4363  memset(pmemap, 0, sizeof(int) * CkNumPes());
4364 
4365  for(int count = 0; count < CkNumPes(); count++) {
4366  if(count < nbpes)
4367  pmemap[block_pes[count]] = 1;
4368 
4369  if(pmap->numPatchesOnNode(count)) {
4370  pmemap[count] = 1;
4371 
4372  //Assumes an XYZT mapping !!
4373  if(tmgr.hasMultipleProcsPerNode()) {
4374  pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
4375  }
4376  }
4377  }
4378 
4379  if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
4380  //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
4381  return false;
4382 
4383  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4384  Node *node = nd.ckLocalBranch();
4385  SimParameters *simParams = node->simParameters;
4386 
4387  //first split PME processors into patch groups
4388 
4389  int xsize = 0, ysize = 0, zsize = 0;
4390 
4391  xsize = tmgr.getDimNX();
4392  ysize = tmgr.getDimNY();
4393  zsize = tmgr.getDimNZ();
4394 
4395  int nx = xsize, ny = ysize, nz = zsize;
4396  DimensionMap dm;
4397 
4398  dm.x = 0;
4399  dm.y = 1;
4400  dm.z = 2;
4401 
4402  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
4403 
4404  //group size processors have to be allocated to each YZ plane
4405  int group_size = numPes/nx;
4406  if(numPes % nx)
4407  group_size ++;
4408 
4409  int my_prime = NAMDPrimes[0];
4410  int density = (ny * nz)/group_size + 1;
4411  int count = 0;
4412 
4413  //Choose a suitable prime Number
4414  for(count = 0; count < NPRIMES; count ++) {
4415  //Find a prime just greater than the density
4416  if(density < NAMDPrimes[count]) {
4417  my_prime = NAMDPrimes[count];
4418  break;
4419  }
4420  }
4421 
4422  if(count == NPRIMES)
4423  my_prime = NAMDPrimes[NPRIMES-1];
4424 
4425  //int gcount = numPes/2;
4426  int gcount = 0;
4427  int npme_pes = 0;
4428 
4429  int coord[3];
4430 
4431  for(int x = 0; x < nx; x++) {
4432  coord[0] = (x + nx/2)%nx;
4433 
4434  for(count=0; count < group_size && npme_pes < numPes; count++) {
4435  int dest = (count + 1) * my_prime;
4436  dest = dest % (ny * nz);
4437 
4438  coord[2] = dest / ny;
4439  coord[1] = dest - coord[2] * ny;
4440 
4441  //Locate where in the actual grid the processor is
4442  int destPe = coord[dm.x] + coord[dm.y] * xsize +
4443  coord[dm.z] * xsize* ysize;
4444 
4445  if(pmemap[destPe] == 0) {
4446  pemap[gcount++] = destPe;
4447  pmemap[destPe] = 1;
4448 
4449  if(tmgr.hasMultipleProcsPerNode())
4450  pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
4451 
4452  npme_pes ++;
4453  }
4454  else {
4455  for(int pos = 1; pos < ny * nz; pos++) {
4456 
4457  coord[2] += pos / ny;
4458  coord[1] += pos % ny;
4459 
4460  coord[2] = coord[2] % nz;
4461  coord[1] = coord[1] % ny;
4462 
4463  int newdest = coord[dm.x] + coord[dm.y] * xsize +
4464  coord[dm.z] * xsize * ysize;
4465 
4466  if(pmemap[newdest] == 0) {
4467  pemap[gcount++] = newdest;
4468  pmemap[newdest] = 1;
4469 
4470  if(tmgr.hasMultipleProcsPerNode())
4471  pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
4472 
4473  npme_pes ++;
4474  break;
4475  }
4476  }
4477  }
4478  }
4479 
4480  if(gcount == numPes)
4481  gcount = 0;
4482 
4483  if(npme_pes >= numPes)
4484  break;
4485  }
4486 
4487  delete [] pmemap;
4488 
4489  if(npme_pes != numPes)
4490  //NAMD_bug("ORB PME allocator failed\n");
4491  return false;
4492 
4493  return true;
4494 }
4495 
4496 #endif
4497 
4498 template <class T> class PmePencil : public T {
4499 public:
4501  data = 0;
4502  work = 0;
4503  send_order = 0;
4504  needs_reply = 0;
4505 #if USE_PERSISTENT
4506  trans_handle = untrans_handle = ungrid_handle = NULL;
4507 #endif
4508  }
4510 #ifdef NAMD_FFTW
4511  fftwf_free(data);
4512 #endif
4513  delete [] work;
4514  delete [] send_order;
4515  delete [] needs_reply;
4516  }
4518  imsg=0;
4519  imsgb=0;
4520  hasData=0;
4521  initdata = msg->data;
4522  }
4523  void order_init(int nBlocks) {
4524  send_order = new int[nBlocks];
4525  for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
4526  if ( Node::Object()->simParameters->PMESendOrder ) {
4528  } else {
4529  Random rand(CkMyPe());
4530  rand.reorder(send_order,nBlocks);
4531  }
4532  needs_reply = new int[nBlocks];
4534  }
4538  int sequence; // used for priorities
4539 #ifndef CmiMemoryAtomicType
4540  typedef int AtomicInt;
4541 #else
4542  typedef CmiMemoryAtomicInt AtomicInt;
4543 #endif
4544  AtomicInt imsg; // used in sdag code
4545  AtomicInt imsgb; // Node par uses distinct counter for back path
4546  int hasData; // used in message elimination
4547  int offload;
4548  float *data;
4549  float *work;
4552 #if USE_PERSISTENT
4553  PersistentHandle *trans_handle;
4554  PersistentHandle *untrans_handle;
4555  PersistentHandle *ungrid_handle;
4556 #endif
4557 };
4558 
4559 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
4560 public:
4561  PmeZPencil_SDAG_CODE
4562  PmeZPencil() { __sdag_init(); setMigratable(false); }
4563  PmeZPencil(CkMigrateMessage *) { __sdag_init(); setMigratable (false); imsg=imsgb=0;}
4565  #ifdef NAMD_FFTW
4566  #ifdef NAMD_FFTW_3
4567  delete [] forward_plans;
4568  delete [] backward_plans;
4569  #endif
4570  #endif
4571  }
4572  void fft_init();
4573  void recv_grid(const PmeGridMsg *);
4574  void forward_fft();
4575  void send_trans();
4576  void send_subset_trans(int fromIdx, int toIdx);
4577  void recv_untrans(const PmeUntransMsg *);
4578  void recvNodeAck(PmeAckMsg *);
4580  void node_process_grid(PmeGridMsg *);
4581  void backward_fft();
4582  void send_ungrid(PmeGridMsg *);
4583  void send_all_ungrid();
4584  void send_subset_ungrid(int fromIdx, int toIdx);
4585 private:
4586  ResizeArray<PmeGridMsg *> grid_msgs;
4587  ResizeArray<int> work_zlist;
4588 #ifdef NAMD_FFTW
4589 #ifdef NAMD_FFTW_3
4590  fftwf_plan forward_plan, backward_plan;
4591 
4592  //for ckloop usage
4593  int numPlans;
4594  fftwf_plan *forward_plans, *backward_plans;
4595 #else
4596  rfftwnd_plan forward_plan, backward_plan;
4597 #endif
4598 #endif
4599 
4600  int nx, ny;
4601 #if USE_PERSISTENT
4602  void setup_persistent() {
4603  int hd = 1;// ( hasData ? 1 : 0 );
4604  int zBlocks = initdata.zBlocks;
4605  int block3 = initdata.grid.block3;
4606  int dim3 = initdata.grid.dim3;
4607  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4608  CmiAssert(yPencil_local);
4609  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
4610  for ( int isend=0; isend<zBlocks; ++isend ) {
4611  int kb = send_order[isend];
4612  int nz1 = block3;
4613  if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
4614  int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
4615  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
4616  int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
4617  int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
4618  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4619  }
4620  }
4621 
4622  void setup_ungrid_persistent()
4623  {
4624  ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
4625  int limsg;
4626  for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
4627  int peer = grid_msgs[limsg]->sourceNode;
4628  //ungrid_handle[limsg] = CmiCreatePersistent(peer, 0);
4629  }
4630  imsg = limsg;
4631  }
4632 #endif
4633 };
4634 
4635 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
4636 public:
4637  PmeYPencil_SDAG_CODE
4638  PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
4639  PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
4640  void fft_init();
4641  void recv_trans(const PmeTransMsg *);
4642  void forward_fft();
4643  void forward_subset_fft(int fromIdx, int toIdx);
4644  void send_trans();
4645  void send_subset_trans(int fromIdx, int toIdx);
4646  void recv_untrans(const PmeUntransMsg *);
4648  void recvNodeAck(PmeAckMsg *);
4650  void backward_fft();
4651  void backward_subset_fft(int fromIdx, int toIdx);
4652  void send_untrans();
4653  void send_subset_untrans(int fromIdx, int toIdx);
4654 private:
4655 #ifdef NAMD_FFTW
4656 #ifdef NAMD_FFTW_3
4657  fftwf_plan forward_plan, backward_plan;
4658 #else
4659  fftw_plan forward_plan, backward_plan;
4660 #endif
4661 #endif
4662 
4663  int nx, nz;
4664 #if USE_PERSISTENT
4665  void setup_persistent() {
4666  int yBlocks = initdata.yBlocks;
4667  int block2 = initdata.grid.block2;
4668  int K2 = initdata.grid.K2;
4669  int hd = 1;
4670  CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
4671  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4672  for ( int isend=0; isend<yBlocks; ++isend ) {
4673  int jb = send_order[isend];
4674  int ny1 = block2;
4675  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4676  int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
4677  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4678  int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
4679  int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
4680  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4681  }
4682 
4683  CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
4684  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4685  for ( int isend=0; isend<yBlocks; ++isend ) {
4686  int jb = send_order[isend];
4687  int ny1 = block2;
4688  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4689  int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
4690  int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
4691  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4692  int compress_size = sizeof(float)*nx*ny1*nz*2;
4693  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4694  }
4695  }
4696 #endif
4697 };
4698 
4699 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
4700 public:
4701  PmeXPencil_SDAG_CODE
4702  PmeXPencil() { __sdag_init(); myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
4703  PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
4705  #ifdef NAMD_FFTW
4706  #ifdef NAMD_FFTW_3
4707  delete [] forward_plans;
4708  delete [] backward_plans;
4709  #endif
4710  #endif
4711  }
4712  void fft_init();
4713  void recv_trans(const PmeTransMsg *);
4714  void forward_fft();
4715  void pme_kspace();
4716  void backward_fft();
4717  void send_untrans();
4718  void send_subset_untrans(int fromIdx, int toIdx);
4720 #ifdef NAMD_FFTW
4721 #ifdef NAMD_FFTW_3
4722  fftwf_plan forward_plan, backward_plan;
4723 
4724  int numPlans;
4725  fftwf_plan *forward_plans, *backward_plans;
4726 #else
4728 #endif
4729 #endif
4730  int ny, nz;
4732  void evir_init();
4734 #if USE_PERSISTENT
4735  void setup_persistent() {
4736  int xBlocks = initdata.xBlocks;
4737  int block1 = initdata.grid.block1;
4738  int K1 = initdata.grid.K1;
4739  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4740  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
4741  for ( int isend=0; isend<xBlocks; ++isend ) {
4742  int ib = send_order[isend];
4743  int nx1 = block1;
4744  if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
4745  int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
4746  int size = sizeof(PmeUntransMsg) +
4747  sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4748  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4749  int compress_size = sizeof(float)*nx1*ny*nz*2;
4750  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4751  }
4752  }
4753 #endif
4754 
4755 };
4756 
4759  initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
4760 }
4761 
4763  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4764  Node *node = nd.ckLocalBranch();
4765  SimParameters *simParams = node->simParameters;
4766 
4767 #if USE_NODE_PAR_RECEIVE
4768  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
4769 #endif
4770 
4771  int K1 = initdata.grid.K1;
4772  int K2 = initdata.grid.K2;
4773  int K3 = initdata.grid.K3;
4774  int dim3 = initdata.grid.dim3;
4775  int block1 = initdata.grid.block1;
4776  int block2 = initdata.grid.block2;
4777 
4778  nx = block1;
4779  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4780  ny = block2;
4781  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
4782 
4783 #ifdef NAMD_FFTW
4785 
4786  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
4787  work = new float[dim3];
4788 
4790 
4791 #ifdef NAMD_FFTW_3
4792  /* need array of sizes for the how many */
4793 
4794  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4795  int sizeLines=nx*ny;
4796  int planLineSizes[1];
4797  planLineSizes[0]=K3;
4798  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
4799  int ndimHalf=ndim/2;
4800  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
4801  (float *) data, NULL, 1,
4802  ndim,
4803  (fftwf_complex *) data, NULL, 1,
4804  ndimHalf,
4805  fftwFlags);
4806 
4807  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
4808  (fftwf_complex *) data, NULL, 1,
4809  ndimHalf,
4810  (float *) data, NULL, 1,
4811  ndim,
4812  fftwFlags);
4813 #if CMK_SMP && USE_CKLOOP
4814  if(simParams->useCkLoop) {
4815  //How many FFT plans to be created? The grain-size issue!!.
4816  //Currently, I am choosing the min(nx, ny) to be coarse-grain
4817  numPlans = (nx<=ny?nx:ny);
4818  if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
4819  if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
4820  int howmany = sizeLines/numPlans;
4821  forward_plans = new fftwf_plan[numPlans];
4822  backward_plans = new fftwf_plan[numPlans];
4823  for(int i=0; i<numPlans; i++) {
4824  int dimStride = i*ndim*howmany;
4825  int dimHalfStride = i*ndimHalf*howmany;
4826  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
4827  ((float *)data)+dimStride, NULL, 1,
4828  ndim,
4829  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4830  ndimHalf,
4831  fftwFlags);
4832 
4833  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
4834  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4835  ndimHalf,
4836  ((float *)data)+dimStride, NULL, 1,
4837  ndim,
4838  fftwFlags);
4839  }
4840  }else
4841 #endif
4842  {
4843  forward_plans = NULL;
4844  backward_plans = NULL;
4845  }
4846 #else
4847  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
4848  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4849  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4850  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
4851  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4852  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4853 #endif
4854  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4855 #else
4856  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4857 #endif
4858 
4859 #if USE_NODE_PAR_RECEIVE
4860  evir = 0.;
4861  memset(data, 0, sizeof(float) * nx*ny*dim3);
4862 #endif
4863 }
4864 
4866  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4867  Node *node = nd.ckLocalBranch();
4868  SimParameters *simParams = node->simParameters;
4869 
4870 #if USE_NODE_PAR_RECEIVE
4871  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
4872 #endif
4873 
4874  int K1 = initdata.grid.K1;
4875  int K2 = initdata.grid.K2;
4876  int dim2 = initdata.grid.dim2;
4877  int dim3 = initdata.grid.dim3;
4878  int block1 = initdata.grid.block1;
4879  int block3 = initdata.grid.block3;
4880 
4881  nx = block1;
4882  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4883  nz = block3;
4884  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
4885 
4886 #ifdef NAMD_FFTW
4888 
4889  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
4890  work = new float[2*K2];
4891 
4893 
4894 #ifdef NAMD_FFTW_3
4895  /* need array of sizes for the dimensions */
4896  /* ideally this should be implementable as a single multidimensional
4897  * plan, but that has proven tricky to implement, so we maintain the
4898  * loop of 1d plan executions. */
4899  int sizeLines=nz;
4900  int planLineSizes[1];
4901  planLineSizes[0]=K2;
4902  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4903  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4904  (fftwf_complex *) data, NULL, sizeLines, 1,
4905  (fftwf_complex *) data, NULL, sizeLines, 1,
4906  FFTW_FORWARD,
4907  fftwFlags);
4908  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4909  (fftwf_complex *) data, NULL, sizeLines, 1,
4910  (fftwf_complex *) data, NULL, sizeLines, 1,
4911  FFTW_BACKWARD,
4912  fftwFlags);
4913  CkAssert(forward_plan != NULL);
4914  CkAssert(backward_plan != NULL);
4915 #else
4916  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
4917  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4918  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
4919  nz, (fftw_complex *) work, 1);
4920  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
4921  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4922  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
4923  nz, (fftw_complex *) work, 1);
4924 #endif
4925  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4926 #else
4927  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4928 #endif
4929 
4930 #if USE_NODE_PAR_RECEIVE
4931  evir = 0;
4932  CmiMemoryWriteFence();
4933 #endif
4934 }
4935 
4937 {
4938  if ( msg->hasData ) hasData = 1;
4939  needs_reply[msg->sourceNode] = msg->hasData;
4940  recv_trans(msg);
4941  int limsg;
4942  CmiMemoryAtomicFetchAndInc(imsg,limsg);
4943  if(limsg+1 == initdata.yBlocks)
4944  {
4945  if ( hasData ) {
4946  forward_fft();
4947  }
4948  send_trans();
4949  imsg=0;
4950  CmiMemoryWriteFence();
4951  }
4952 }
4953 
4955  delete msg;
4957 }
4958 
4960 {
4961  if ( msg ) {
4962  if ( ! hasData ) NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
4963  recv_untrans(msg);
4964  } else if ( hasData ) NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
4965  int limsg;
4966  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
4967  if(limsg+1 == initdata.yBlocks)
4968  {
4969  if ( hasData ) {
4970  backward_fft();
4971  }
4972  hasData=0;
4973  imsgb=0;
4974  CmiMemoryWriteFence();
4975  send_untrans();
4976  }
4977 }
4978 
4979 #define DEBUG_NODE_PAR_RECV 0
4980 
4982  // CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
4983  PmeXPencil *target=xPencilObj.get(msg->destElem);
4984 #if DEBUG_NODE_PAR_RECV
4985  if(target == NULL)
4986  CkAbort("xpencil in recvXTrans not found, debug registeration");
4987 #endif
4988  target->node_process_trans(msg);
4989  delete msg;
4990 }
4991 
4992 
4994  // CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
4995  PmeYPencil *target=yPencilObj.get(msg->destElem);
4996 #if DEBUG_NODE_PAR_RECV
4997  if(target == NULL)
4998  CkAbort("ypencil in recvYTrans not found, debug registeration");
4999 #endif
5000  target->node_process_trans(msg);
5001  delete msg;
5002  }
5004  // CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5005  PmeYPencil *target=yPencilObj.get(msg->destElem);
5006 #if DEBUG_NODE_PAR_RECV
5007  if(target == NULL)
5008  CkAbort("ypencil in recvYUntrans not found, debug registeration");
5009 #endif
5010  target->node_process_untrans(msg);
5011  delete msg;
5012  }
5014  //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5015  PmeZPencil *target=zPencilObj.get(msg->destElem);
5016 #if DEBUG_NODE_PAR_RECV
5017  if(target == NULL)
5018  CkAbort("zpencil in recvZUntrans not found, debug registeration");
5019 #endif
5020  target->node_process_untrans(msg);
5021  delete msg;
5022 }
5023 
5025  //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5026  PmeZPencil *target=zPencilObj.get(msg->destElem);
5027 #if DEBUG_NODE_PAR_RECV
5028  if(target == NULL){
5029  CkAbort("zpencil in recvZGrid not found, debug registeration");
5030  }
5031 #endif
5032  target->node_process_grid(msg); //msg is stored inside node_proces_grid
5033 }
5034 
5036  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
5037  Node *node = nd.ckLocalBranch();
5038  SimParameters *simParams = node->simParameters;
5039 #if USE_NODE_PAR_RECEIVE
5040  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
5041 #endif
5042 
5043  int K1 = initdata.grid.K1;
5044  int K2 = initdata.grid.K2;
5045  int dim3 = initdata.grid.dim3;
5046  int block2 = initdata.grid.block2;
5047  int block3 = initdata.grid.block3;
5048 
5049  ny = block2;
5050  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
5051  nz = block3;
5052  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
5053 
5054 #ifdef NAMD_FFTW
5056 
5057  data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
5058  work = new float[2*K1];
5059 
5061 
5062 #ifdef NAMD_FFTW_3
5063  /* need array of sizes for the how many */
5064  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
5065  int sizeLines=ny*nz;
5066  int planLineSizes[1];
5067  planLineSizes[0]=K1;
5068  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5069  (fftwf_complex *) data, NULL, sizeLines, 1,
5070  (fftwf_complex *) data, NULL, sizeLines, 1,
5071  FFTW_FORWARD,
5072  fftwFlags);
5073  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5074  (fftwf_complex *) data, NULL, sizeLines, 1,
5075  (fftwf_complex *) data, NULL, sizeLines, 1,
5076  FFTW_BACKWARD,
5077  fftwFlags);
5078 
5079 #if CMK_SMP && USE_CKLOOP
5080  if(simParams->useCkLoop) {
5081  //How many FFT plans to be created? The grain-size issue!!.
5082  //Currently, I am choosing the min(nx, ny) to be coarse-grain
5083  numPlans = (ny<=nz?ny:nz);
5084  // limit attempted parallelism due to false sharing
5085  //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
5086  //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
5087  if ( sizeLines/numPlans < 4 ) numPlans = 1;
5088  int howmany = sizeLines/numPlans;
5089  forward_plans = new fftwf_plan[numPlans];
5090  backward_plans = new fftwf_plan[numPlans];
5091  for(int i=0; i<numPlans; i++) {
5092  int curStride = i*howmany;
5093  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5094  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5095  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5096  FFTW_FORWARD,
5097  fftwFlags);
5098 
5099  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5100  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5101  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5102  FFTW_BACKWARD,
5103  fftwFlags);
5104  }
5105  }else
5106 #endif
5107  {
5108  forward_plans = NULL;
5109  backward_plans = NULL;
5110  }
5111 #else
5112  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
5113  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5114  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5115  ny*nz, (fftw_complex *) work, 1);
5116  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
5117  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5118  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5119  ny*nz, (fftw_complex *) work, 1);
5120 #endif
5121  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
5122 #else
5123  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
5124 #endif
5125 
5127  thisIndex.y*block2, thisIndex.y*block2 + ny,
5128  thisIndex.z*block3, thisIndex.z*block3 + nz);
5129 
5130 }
5131 
5132 // #define FFTCHECK // run a grid of integers through the fft
5133 // #define ZEROCHECK // check for suspicious zeros in fft
5134 
5136 
5137  int dim3 = initdata.grid.dim3;
5138  if ( imsg == 0 ) {
5139  lattice = msg->lattice;
5140  sequence = msg->sequence;
5141 #if ! USE_NODE_PAR_RECEIVE
5142  memset(data, 0, sizeof(float)*nx*ny*dim3);
5143 #endif
5144  }
5145 
5146  if ( ! msg->hasData ) return;
5147 
5148  int zlistlen = msg->zlistlen;
5149 #ifdef NAMD_KNL
5150  int * __restrict msg_zlist = msg->zlist;
5151  int * __restrict zlist = work_zlist.begin();
5152  __assume_aligned(zlist,64);
5153  for ( int k=0; k<zlistlen; ++k ) {
5154  zlist[k] = msg_zlist[k];
5155  }
5156 #else
5157  int * __restrict zlist = msg->zlist;
5158 #endif
5159  char * __restrict fmsg = msg->fgrid;
5160  float * __restrict qmsg = msg->qgrid;
5161  float * __restrict d = data;
5162  int numGrids = 1; // pencil FFT doesn't support multiple grids
5163  for ( int g=0; g<numGrids; ++g ) {
5164  for ( int i=0; i<nx; ++i ) {
5165  for ( int j=0; j<ny; ++j, d += dim3 ) {
5166  if( *(fmsg++) ) {
5167  #pragma ivdep
5168  for ( int k=0; k<zlistlen; ++k ) {
5169  d[zlist[k]] += *(qmsg++);
5170  }
5171  }
5172  }
5173  }
5174  }
5175 }
5176 
5177 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
5178 #ifdef NAMD_FFTW
5179 #ifdef NAMD_FFTW_3
5180  fftwf_plan *plans = (fftwf_plan *)param;
5181  for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
5182 #endif
5183 #endif
5184 }
5185 
5187  evir = 0.;
5188 #ifdef FFTCHECK
5189  int dim3 = initdata.grid.dim3;
5190  int K3 = initdata.grid.K3;
5191  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
5192  float *d = data;
5193  for ( int i=0; i<nx; ++i ) {
5194  for ( int j=0; j<ny; ++j, d += dim3 ) {
5195  for ( int k=0; k<dim3; ++k ) {
5196  d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
5197  }
5198  }
5199  }
5200 #endif
5201 #ifdef NAMD_FFTW
5202 #ifdef MANUAL_DEBUG_FFTW3
5203  dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5204 #endif
5205 #ifdef NAMD_FFTW_3
5206 #if CMK_SMP && USE_CKLOOP
5207  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5208  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5209  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5210  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5211  //transform the above loop
5212  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5213  return;
5214  }
5215 #endif
5216  fftwf_execute(forward_plan);
5217 #else
5218  rfftwnd_real_to_complex(forward_plan, nx*ny,
5219  data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
5220 #endif
5221 #ifdef MANUAL_DEBUG_FFTW3
5222  dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5223 #endif
5224 
5225 #endif
5226 #ifdef ZEROCHECK
5227  int dim3 = initdata.grid.dim3;
5228  int K3 = initdata.grid.K3;
5229  float *d = data;
5230  for ( int i=0; i<nx; ++i ) {
5231  for ( int j=0; j<ny; ++j, d += dim3 ) {
5232  for ( int k=0; k<dim3; ++k ) {
5233  if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
5234  thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
5235  }
5236  }
5237  }
5238 #endif
5239 }
5240 
5241 /* A single task for partitioned PmeZPencil::send_trans work */
5242 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5243  PmeZPencil *zpencil = (PmeZPencil *)param;
5244  zpencil->send_subset_trans(first, last);
5245 }
5246 
5247 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
5248  int zBlocks = initdata.zBlocks;
5249  int block3 = initdata.grid.block3;
5250  int dim3 = initdata.grid.dim3;
5251  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5252  int kb = send_order[isend];
5253  int nz = block3;
5254  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5255  int hd = ( hasData ? 1 : 0 );
5256  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5257  msg->lattice = lattice;
5258  msg->sourceNode = thisIndex.y;
5259  msg->hasData = hasData;
5260  msg->nx = ny;
5261  if ( hasData ) {
5262  float *md = msg->qgrid;
5263  const float *d = data;
5264  for ( int i=0; i<nx; ++i ) {
5265  for ( int j=0; j<ny; ++j, d += dim3 ) {
5266  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5267  *(md++) = d[2*k];
5268  *(md++) = d[2*k+1];
5269  }
5270  }
5271  }
5272  }
5273  msg->sequence = sequence;
5275 
5276  CmiEnableUrgentSend(1);
5277 #if USE_NODE_PAR_RECEIVE
5278  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5279 #if Y_PERSIST
5280  CmiUsePersistentHandle(&trans_handle[isend], 1);
5281 #endif
5282  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5283 #if Y_PERSIST
5284  CmiUsePersistentHandle(NULL, 0);
5285 #endif
5286 #else
5287 #if Y_PERSIST
5288  CmiUsePersistentHandle(&trans_handle[isend], 1);
5289 #endif
5290  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5291 #if Y_PERSIST
5292  CmiUsePersistentHandle(NULL, 0);
5293 #endif
5294 #endif
5295  CmiEnableUrgentSend(0);
5296  }
5297 }
5298 
5300 #if USE_PERSISTENT
5301  if (trans_handle == NULL) setup_persistent();
5302 #endif
5303 #if CMK_SMP && USE_CKLOOP
5304  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5305  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5306  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5313  //send_subset_trans(0, initdata.zBlocks-1);
5314  CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
5315  return;
5316  }
5317 #endif
5318  int zBlocks = initdata.zBlocks;
5319  int block3 = initdata.grid.block3;
5320  int dim3 = initdata.grid.dim3;
5321  for ( int isend=0; isend<zBlocks; ++isend ) {
5322  int kb = send_order[isend];
5323  int nz = block3;
5324  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5325  int hd = ( hasData ? 1 : 0 );
5326  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5327  msg->lattice = lattice;
5328  msg->sourceNode = thisIndex.y;
5329  msg->hasData = hasData;
5330  msg->nx = ny;
5331  if ( hasData ) {
5332  float *md = msg->qgrid;
5333  const float *d = data;
5334  for ( int i=0; i<nx; ++i ) {
5335  for ( int j=0; j<ny; ++j, d += dim3 ) {
5336  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5337  *(md++) = d[2*k];
5338  *(md++) = d[2*k+1];
5339  }
5340  }
5341  }
5342  }
5343  msg->sequence = sequence;
5345 
5346  CmiEnableUrgentSend(1);
5347 #if USE_NODE_PAR_RECEIVE
5348  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5349 #if Y_PERSIST
5350  CmiUsePersistentHandle(&trans_handle[isend], 1);
5351 #endif
5352  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5353 #if Y_PERSIST
5354  CmiUsePersistentHandle(NULL, 0);
5355 #endif
5356 #else
5357 #if Y_PERSIST
5358  CmiUsePersistentHandle(&trans_handle[isend], 1);
5359 #endif
5360  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5361 #if Y_PERSIST
5362  CmiUsePersistentHandle(NULL, 0);
5363 #endif
5364 #endif
5365  CmiEnableUrgentSend(0);
5366  }
5367 }
5368 
5370  if ( imsg == 0 ) {
5371  lattice = msg->lattice;
5372  sequence = msg->sequence;
5373  }
5374  int block2 = initdata.grid.block2;
5375  int K2 = initdata.grid.K2;
5376  int jb = msg->sourceNode;
5377  int ny = msg->nx;
5378  if ( msg->hasData ) {
5379  const float *md = msg->qgrid;
5380  float *d = data;
5381  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5382  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5383  for ( int k=0; k<nz; ++k ) {
5384 #ifdef ZEROCHECK
5385  if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
5386  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5387 #endif
5388  d[2*(j*nz+k)] = *(md++);
5389  d[2*(j*nz+k)+1] = *(md++);
5390  }
5391  }
5392  }
5393  } else {
5394  float *d = data;
5395  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5396  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5397  for ( int k=0; k<nz; ++k ) {
5398  d[2*(j*nz+k)] = 0;
5399  d[2*(j*nz+k)+1] = 0;
5400  }
5401  }
5402  }
5403  }
5404 }
5405 
5406 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
5407  PmeYPencil *ypencil = (PmeYPencil *)param;
5408  ypencil->forward_subset_fft(first, last);
5409 }
5410 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
5411 #ifdef NAMD_FFTW
5412 #ifdef NAMD_FFTW_3
5413  for(int i=fromIdx; i<=toIdx; i++){
5414  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5415  * nz * initdata.grid.K2,
5416  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5417  }
5418 #endif
5419 #endif
5420 }
5421 
5423  evir = 0.;
5424 #ifdef NAMD_FFTW
5425 #ifdef MANUAL_DEBUG_FFTW3
5426  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5427 #endif
5428 
5429 #ifdef NAMD_FFTW_3
5430 #if CMK_SMP && USE_CKLOOP
5431  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5432  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5433  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5434  CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
5435  return;
5436  }
5437 #endif
5438  //the above is a transformation of the following loop using CkLoop
5439  for ( int i=0; i<nx; ++i ) {
5440  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5441  * nz * initdata.grid.K2,
5442  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5443  }
5444 #else
5445  for ( int i=0; i<nx; ++i ) {
5446  fftw(forward_plan, nz,
5447  ((fftw_complex *) data) + i * nz * initdata.grid.K2,
5448  nz, 1, (fftw_complex *) work, 1, 0);
5449  }
5450 #endif
5451 #ifdef MANUAL_DEBUG_FFTW3
5452  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5453 #endif
5454 
5455 #endif
5456 }
5457 
5458 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5459  PmeYPencil *ypencil = (PmeYPencil *)param;
5460  ypencil->send_subset_trans(first, last);
5461 }
5462 
5463 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
5464  int yBlocks = initdata.yBlocks;
5465  int block2 = initdata.grid.block2;
5466  int K2 = initdata.grid.K2;
5467  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5468  int jb = send_order[isend];
5469  int ny = block2;
5470  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5471  int hd = ( hasData ? 1 : 0 );
5472  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5473  msg->lattice = lattice;
5474  msg->sourceNode = thisIndex.x;
5475  msg->hasData = hasData;
5476  msg->nx = nx;
5477  if ( hasData ) {
5478  float *md = msg->qgrid;
5479  const float *d = data;
5480  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5481  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5482  for ( int k=0; k<nz; ++k ) {
5483  *(md++) = d[2*(j*nz+k)];
5484  *(md++) = d[2*(j*nz+k)+1];
5485  #ifdef ZEROCHECK
5486  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5487  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5488  #endif
5489  }
5490  }
5491  }
5492  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5493  thisIndex.x, jb, thisIndex.z);
5494  }
5495  msg->sequence = sequence;
5497  CmiEnableUrgentSend(1);
5498 #if USE_NODE_PAR_RECEIVE
5499  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5500 #if X_PERSIST
5501  CmiUsePersistentHandle(&trans_handle[isend], 1);
5502 #endif
5503  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5504 #if X_PERSIST
5505  CmiUsePersistentHandle(NULL, 0);
5506 #endif
5507 #else
5508 #if X_PERSIST
5509  CmiUsePersistentHandle(&trans_handle[isend], 1);
5510 #endif
5511  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5512 #if X_PERSIST
5513  CmiUsePersistentHandle(NULL, 0);
5514 #endif
5515 #endif
5516  CmiEnableUrgentSend(0);
5517  }
5518 }
5519 
5521 #if USE_PERSISTENT
5522  if (trans_handle == NULL) setup_persistent();
5523 #endif
5524 #if CMK_SMP && USE_CKLOOP
5525  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5526  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5527  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5534  //send_subset_trans(0, initdata.yBlocks-1);
5535  CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
5536  return;
5537  }
5538 #endif
5539  int yBlocks = initdata.yBlocks;
5540  int block2 = initdata.grid.block2;
5541  int K2 = initdata.grid.K2;
5542  for ( int isend=0; isend<yBlocks; ++isend ) {
5543  int jb = send_order[isend];
5544  int ny = block2;
5545  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5546  int hd = ( hasData ? 1 : 0 );
5547  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5548  msg->lattice = lattice;
5549  msg->sourceNode = thisIndex.x;
5550  msg->hasData = hasData;
5551  msg->nx = nx;
5552  if ( hasData ) {
5553  float *md = msg->qgrid;
5554  const float *d = data;
5555  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5556  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5557  for ( int k=0; k<nz; ++k ) {
5558  *(md++) = d[2*(j*nz+k)];
5559  *(md++) = d[2*(j*nz+k)+1];
5560 #ifdef ZEROCHECK
5561  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5562  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5563 #endif
5564  }
5565  }
5566  }
5567  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5568  thisIndex.x, jb, thisIndex.z);
5569  }
5570  msg->sequence = sequence;
5572  CmiEnableUrgentSend(1);
5573 #if USE_NODE_PAR_RECEIVE
5574  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5575 #if X_PERSIST
5576  CmiUsePersistentHandle(&trans_handle[isend], 1);
5577 #endif
5578  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5579 #if X_PERSIST
5580  CmiUsePersistentHandle(NULL, 0);
5581 #endif
5582 #else
5583 #if X_PERSIST
5584  CmiUsePersistentHandle(&trans_handle[isend], 1);
5585 #endif
5586  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5587 #if X_PERSIST
5588  CmiUsePersistentHandle(NULL, 0);
5589 #endif
5590 
5591 #endif
5592  CmiEnableUrgentSend(0);
5593  }
5594 }
5595 
5597 {
5598  if(msg->hasData) hasData=1;
5599  needs_reply[msg->sourceNode] = msg->hasData;
5600  recv_trans(msg);
5601  int limsg;
5602  CmiMemoryAtomicFetchAndInc(imsg,limsg);
5603  if(limsg+1 == initdata.xBlocks)
5604  {
5605  if(hasData){
5606  forward_fft();
5607  pme_kspace();
5608  backward_fft();
5609  }
5610  send_untrans();
5611  imsg=0;
5612  CmiMemoryWriteFence();
5613  }
5614 }
5615 
5617  if ( imsg == 0 ) {
5618  lattice = msg->lattice;
5619  sequence = msg->sequence;
5620  }
5621  int block1 = initdata.grid.block1;
5622  int K1 = initdata.grid.K1;
5623  int ib = msg->sourceNode;
5624  int nx = msg->nx;
5625  if ( msg->hasData ) {
5626  const float *md = msg->qgrid;
5627  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5628  float *d = data + i*ny*nz*2;
5629  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5630  for ( int k=0; k<nz; ++k ) {
5631 #ifdef ZEROCHECK
5632  if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
5633  ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
5634 #endif
5635  d[2*k] = *(md++);
5636  d[2*k+1] = *(md++);
5637  }
5638  }
5639  }
5640  } else {
5641  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5642  float *d = data + i*ny*nz*2;
5643  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5644  for ( int k=0; k<nz; ++k ) {
5645  d[2*k] = 0;
5646  d[2*k+1] = 0;
5647  }
5648  }
5649  }
5650  }
5651 }
5652 
5654 #ifdef NAMD_FFTW
5655 
5656 #ifdef MANUAL_DEBUG_FFTW3
5657  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5658 #endif
5659 
5660 #ifdef NAMD_FFTW_3
5661 #if CMK_SMP && USE_CKLOOP
5662  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5663  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5664  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5665  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5666  //transform the above loop
5667  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5668  return;
5669  }
5670 #endif
5671  fftwf_execute(forward_plan);
5672 #else
5673  fftw(forward_plan, ny*nz,
5674  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5675 #endif
5676 #ifdef MANUAL_DEBUG_FFTW3
5677  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5678 #endif
5679 
5680 #endif
5681 }
5682 
5684 
5685  evir = 0.;
5686 
5687 #ifdef FFTCHECK
5688  return;
5689 #endif
5690 
5692 
5693  int useCkLoop = 0;
5694 #if CMK_SMP && USE_CKLOOP
5695  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
5696  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
5697  useCkLoop = 1;
5698  }
5699 #endif
5700 
5701  int numGrids = 1;
5702  for ( int g=0; g<numGrids; ++g ) {
5703  evir[0] = myKSpace->compute_energy(data+0*g,
5704  lattice, ewaldcof, &(evir[1]), useCkLoop);
5705  }
5706 
5707 #if USE_NODE_PAR_RECEIVE
5708  CmiMemoryWriteFence();
5709 #endif
5710 }
5711 
5713 #ifdef NAMD_FFTW
5714 #ifdef MANUAL_DEBUG_FFTW3
5715  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5716 #endif
5717 
5718 #ifdef NAMD_FFTW_3
5719 #if CMK_SMP && USE_CKLOOP
5720  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5721  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
5722  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5723  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
5724  //transform the above loop
5725  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5726  return;
5727  }
5728 #endif
5729  fftwf_execute(backward_plan);
5730 #else
5731  fftw(backward_plan, ny*nz,
5732  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5733 #endif
5734 #ifdef MANUAL_DEBUG_FFTW3
5735  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5736 #endif
5737 #endif
5738 }
5739 
5740 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
5741  PmeXPencil *xpencil = (PmeXPencil *)param;
5742  xpencil->send_subset_untrans(first, last);
5743 }
5744 
5745 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx){
5746