NAMD
ComputeMsm.C — multilevel summation method (MSM) electrostatics compute.
Go to the generated documentation of this file.
1 
7 #include "InfoStream.h"
8 #include "Node.h"
9 #include "PDB.h"
10 #include "PatchMap.h"
11 #include "PatchMap.inl"
12 #include "AtomMap.h"
13 #include "ComputeMsm.h"
14 #include "PatchMgr.h"
15 #include "Molecule.h"
16 #include "ReductionMgr.h"
17 #include "ComputeMgr.h"
18 #include "ComputeMgr.decl.h"
19 #include "Debug.h"
20 #include "SimParameters.h"
21 #include "WorkDistrib.h"
22 #include "Priorities.h"
23 #include "varsizemsg.h"
24 //#include "ckmulticast.h"
25 #include <stdio.h>
26 #include "MsmMap.h"
27 
28 // MSM (multilevel summation method)
29 // has O(N) algorithmic complexity
30 
31 // use multicast reduction of grids from sections of MsmGridCutoff
32 #define MSM_REDUCE_GRID
33 //#undef MSM_REDUCE_GRID
34 
35 // use the decomposition of grid cutoff to create more work units
36 #define MSM_GRID_CUTOFF_DECOMP
37 //#undef MSM_GRID_CUTOFF_DECOMP
38 
39 // skip over pairs of blocks that do not actually interact
40 #define MSM_SKIP_TOO_DISTANT_BLOCKS
41 //#undef MSM_SKIP_TOO_DISTANT_BLOCKS
42 
43 // skip over pairs of blocks whose overlap is beyond nonzero gc sphere
44 // this search is more expensive than MSM_SKIP_TOO_DISTANT_BLOCKS
45 // and does not eliminate many block pairs
46 #define MSM_SKIP_BEYOND_SPHERE
47 //#undef MSM_SKIP_BEYOND_SPHERE
48 
49 // node aware mapping of chare arrays
50 #define MSM_NODE_MAPPING
51 //#undef MSM_NODE_MAPPING
52 
53 #define MSM_NODE_MAPPING_STATS
54 #undef MSM_NODE_MAPPING_STATS
55 
56 // top of hierarchy calculates smaller blocks of charge to
57 // unfolded image blocks of potential, up to the desired block size,
58 // then sums the unfolded images of potential back into the
59 // actual potential block, thereby greatly reducing the number of
60 // block pairs that would otherwise be scheduled
61 #define MSM_FOLD_FACTOR
62 //#undef MSM_FOLD_FACTOR
63 
64 // report timings for compute routines
65 // for diagnostic purposes only
66 #define MSM_TIMING
67 #undef MSM_TIMING
68 
69 // report profiling for compute routines
70 // for diagnostic purposes only
71 #define MSM_PROFILING
72 #undef MSM_PROFILING
73 
74 // use fixed size grid message
75 // XXX probably does not work anymore
76 #define MSM_FIXED_SIZE_GRID_MSG
77 #undef MSM_FIXED_SIZE_GRID_MSG
78 
79 // turn off computation
80 // for diagnostic purposes only
81 //#define MSM_COMM_ONLY
82 
83 // print diagnostics for memory alignment (for grid cutoff calculation)
84 // for diagnostic purposes only
85 #define DEBUG_MEMORY_ALIGNMENT
86 #undef DEBUG_MEMORY_ALIGNMENT
87 
88 
89 //
90 // This is the main message that gets passed between compute chares.
91 // It is used to bundle blocks of charge (sendUp and send to MsmGridCutoff)
92 // and blocks of potential (sendAcross, sendDown, and sendPatch).
93 //
94 // Higher priority has a numerically lower value.
95 //
96 // The priorities are set as follows:
97 //
98 // sendUp priority = level+1
99 //
100 // (send to MsmGridCutoff) and sendAcross priority
101 // = nlevels + 2*(nlevels - level) - 1
102 //
103 // sendDown and sendPatch priority
104 // = nlevels + 2*(nlevels - level)
105 //
106 // This puts the priority on going up the hierarchy before going across
107 // and puts the priority on finishing the top levels and down before
108 // finishing the lower levels.
109 //
110 
// Message carrying one block of grid data (charges going up the MSM
// hierarchy, potentials coming down) between compute chares, together
// with its index range and a sequence number used for message priority.
// NOTE(review): this extraction is missing lines — the nextent_i/j/k
// member declarations used by put(), and the beginning of get() that
// restores the grid's range — verify against the original ComputeMsm.C.
111 class GridMsg : public CkMcastBaseMsg, public CMessage_GridMsg {
112  public:
113  char *gdata;
114  int idnum;
115  int nlower_i;
116  int nlower_j;
117  int nlower_k;
121  int nbytes;
122  int seqnum; // sequence number is used for message priority
123 
124  // put a grid into an allocated message to be sent
125  template <class T>
126  void put(const msm::Grid<T>& g, int id, int seq) {
127  idnum = id;
128  nlower_i = g.lower().i;
129  nlower_j = g.lower().j;
130  nlower_k = g.lower().k;
131  nextent_i = g.extent().i;
132  nextent_j = g.extent().j;
133  nextent_k = g.extent().k;
134  nbytes = g.data().len()*sizeof(T);
135  seqnum = seq;
136  memcpy(gdata, g.data().buffer(), nbytes);
137  }
138 
139  // get the grid from a received message
140  template <class T>
141  void get(msm::Grid<T>& g, int& id, int& seq) {
142  id = idnum;
// (extraction gap here: the lines that restore the grid's range from
// nlower/nextent appear to be missing)
145  seq = seqnum;
146  ASSERT(g.data().len()*sizeof(T) == nbytes);
147  memcpy(g.data().buffer(), gdata, nbytes);
148  }
149 };
150 
151 
// Message for distributing the per-level array of MsmBlock chare
// proxies; the proxies are copied as raw bytes into a fixed buffer.
// NOTE(review): the put()/get() method signatures are missing from this
// extraction (they take an msm array of CProxy_MsmBlock by the look of
// the bodies) — verify against the original ComputeMsm.C.
152 class MsmBlockProxyMsg : public CMessage_MsmBlockProxyMsg {
153  public:
154  enum { maxlevels = 32 };
155  char msmBlockProxyData[maxlevels*sizeof(CProxy_MsmBlock)];
156  int nlevels;
157 
158  // put an array into an allocated message to be sent
160  nlevels = a.len();
161  if (nlevels > maxlevels) {
162  NAMD_die("Exceeded maximum number of MSM levels\n");
163  }
164  memcpy(msmBlockProxyData, a.buffer(), nlevels*sizeof(CProxy_MsmBlock));
165  }
166 
167  // get the array from a received message
169  a.resize(nlevels);
170  memcpy(a.buffer(), msmBlockProxyData, nlevels*sizeof(CProxy_MsmBlock));
171  }
172 };
173 
174 
// C1 Hermite variant of MsmBlockProxyMsg: distributes the per-level
// array of MsmC1HermiteBlock chare proxies as raw bytes.
// NOTE(review): the put()/get() method signatures are missing from this
// extraction — verify against the original ComputeMsm.C.
175 class MsmC1HermiteBlockProxyMsg : public CMessage_MsmC1HermiteBlockProxyMsg {
176  public:
177  enum { maxlevels = 32 };
178  char msmBlockProxyData[maxlevels*sizeof(CProxy_MsmC1HermiteBlock)];
179  int nlevels;
180 
181  // put an array into an allocated message to be sent
183  nlevels = a.len();
184  if (nlevels > maxlevels) {
185  NAMD_die("Exceeded maximum number of MSM levels\n");
186  }
187  memcpy(msmBlockProxyData, a.buffer(),
188  nlevels*sizeof(CProxy_MsmC1HermiteBlock));
189  }
190 
191  // get the array from a received message
193  a.resize(nlevels);
194  memcpy(a.buffer(), msmBlockProxyData,
195  nlevels*sizeof(CProxy_MsmC1HermiteBlock));
196  }
197 };
198 
199 
200 class MsmGridCutoffProxyMsg : public CMessage_MsmGridCutoffProxyMsg {
201  public:
202  char msmGridCutoffProxyData[sizeof(CProxy_MsmGridCutoff)];
203 
204  // put proxy into an allocated message to be sent
205  void put(const CProxy_MsmGridCutoff *p) {
206  memcpy(msmGridCutoffProxyData, p, sizeof(CProxy_MsmGridCutoff));
207  }
208 
209  // get the proxy from a received message
210  void get(CProxy_MsmGridCutoff *p) {
211  memcpy(p, msmGridCutoffProxyData, sizeof(CProxy_MsmGridCutoff));
212  }
213 };
214 
215 
// C1 Hermite variant: carries a CProxy_MsmC1HermiteGridCutoff by value.
// NOTE(review): the class-name line ("class
// MsmC1HermiteGridCutoffProxyMsg :") is missing from this extraction —
// verify against the original ComputeMsm.C.
217  public CMessage_MsmC1HermiteGridCutoffProxyMsg
218 {
219  public:
220  char msmGridCutoffProxyData[sizeof(CProxy_MsmC1HermiteGridCutoff)];
221 
222  // put proxy into an allocated message to be sent
223  void put(const CProxy_MsmC1HermiteGridCutoff *p) {
224  memcpy(msmGridCutoffProxyData, p,
225  sizeof(CProxy_MsmC1HermiteGridCutoff));
226  }
227 
228  // get the proxy from a received message
229  void get(CProxy_MsmC1HermiteGridCutoff *p) {
230  memcpy(p, msmGridCutoffProxyData,
231  sizeof(CProxy_MsmC1HermiteGridCutoff));
232  }
233 };
234 
235 
// Initialization message for an MsmGridCutoff chare: pairs the charge
// block it reads from with the potential block it sends results to.
// NOTE(review): the constructor signature line is missing from this
// extraction (the initializer list takes an index i and a BlockSend b)
// — verify against the original ComputeMsm.C.
236 class MsmGridCutoffInitMsg : public CMessage_MsmGridCutoffInitMsg {
237  public:
238  msm::BlockIndex qhBlockIndex; // charge block index
239  msm::BlockSend ehBlockSend; // potential block sending address
241  : qhBlockIndex(i), ehBlockSend(b) { }
242 };
243 
244 
// Setup message delivering an element proxy of the MsmBlock chare
// array to members of a multicast section (CkMcastBaseMsg base).
// NOTE(review): the class-name line ("class MsmGridCutoffSetupMsg :")
// is missing from this extraction — verify against the original.
246  public CkMcastBaseMsg, public CMessage_MsmGridCutoffSetupMsg
247 {
248  public:
249  char msmBlockElementProxyData[sizeof(CProxyElement_MsmBlock)];
250 
251  // put proxy into an allocated message to be sent
252  void put(
253  const CProxyElement_MsmBlock *q //,
254  ) {
255  memcpy(msmBlockElementProxyData, q, sizeof(CProxyElement_MsmBlock));
256  }
257 
258  // get the proxy from a received message
259  void get(
260  CProxyElement_MsmBlock *q //,
261  ) {
262  memcpy(q, msmBlockElementProxyData, sizeof(CProxyElement_MsmBlock));
263  }
264 };
265 
266 
// C1 Hermite variant of the multicast setup message: delivers an
// element proxy of the MsmC1HermiteBlock chare array.
// NOTE(review): the class-name line ("class
// MsmC1HermiteGridCutoffSetupMsg :") is missing from this extraction —
// verify against the original ComputeMsm.C.
268  public CkMcastBaseMsg, public CMessage_MsmC1HermiteGridCutoffSetupMsg
269 {
270  public:
271  char msmBlockElementProxyData[sizeof(CProxyElement_MsmC1HermiteBlock)];
272 
273  // put proxy into an allocated message to be sent
274  void put(
275  const CProxyElement_MsmC1HermiteBlock *q //,
276  ) {
277  memcpy(msmBlockElementProxyData, q,
278  sizeof(CProxyElement_MsmC1HermiteBlock));
279  }
280 
281  // get the proxy from a received message
282  void get(
283  CProxyElement_MsmC1HermiteBlock *q //,
284  ) {
285  memcpy(q, msmBlockElementProxyData,
286  sizeof(CProxyElement_MsmC1HermiteBlock));
287  }
288 };
289 
290 
291 // Used only when MSM_TIMING is defined
// Used only when MSM_TIMING is defined.
// Reduction client: receives the summed per-stage timings from all
// contributors and prints a per-stage breakdown.
// NOTE(review): the timing-category enum (ANTERP, INTERP, RESTRICT,
// PROLONGATE, GRIDCUTOFF, COMM, MAX) and the constructor signature are
// missing from this extraction — verify against the original source.
292 class MsmTimer : public CBase_MsmTimer {
293  public:
295 
297  for (int i = 0; i < MAX; i++) timing[i] = 0;
298  }
// Reduction target: store the summed timings, then report them.
299  void done(double tm[], int n) {
300  for (int i = 0; i < MAX; i++) timing[i] = tm[i];
301  print();
302  }
303  void print() {
304  CkPrintf("MSM timings:\n");
305  CkPrintf(" anterpolation %8.6f sec\n", timing[ANTERP]);
306  CkPrintf(" interpolation %8.6f sec\n", timing[INTERP]);
307  CkPrintf(" restriction %8.6f sec\n", timing[RESTRICT]);
308  CkPrintf(" prolongation %8.6f sec\n", timing[PROLONGATE]);
309  CkPrintf(" grid cutoff %8.6f sec\n", timing[GRIDCUTOFF]);
310  CkPrintf(" communication %8.6f sec\n", timing[COMM]);
311  }
312 
313  double timing[MAX];
314 };
315 
316 
317 // Used only when MSM_PROFILING is defined
// Used only when MSM_PROFILING is defined.
// Reduction client: receives summed inner-loop execution counts
// (histogram indexed by trip count) and prints the distribution.
// NOTE(review): the constructor signature line is missing from this
// extraction — verify against the original ComputeMsm.C.
318 class MsmProfiler : public CBase_MsmProfiler {
319  public:
320  enum { MAX = MSM_MAX_BLOCK_SIZE+1 };
321 
322  for (int i = 0; i < MAX; i++) xloopcnt[i] = 0;
323  }
// Reduction target: store the summed counts, then report them.
325  void done(int lc[], int n) {
326  for (int i = 0; i < MAX; i++) xloopcnt[i] = lc[i];
327  print();
328  }
329  void print() {
330  int sum = 0;
331  for (int i = 0; i < MAX; i++) sum += xloopcnt[i];
332  CkPrintf("MSM profiling:\n");
333  CkPrintf(" total executions of inner loop: %d\n", sum);
334  for (int i = 0; i < MAX; i++) {
// NOTE(review): if every counter is zero this divides 0 by 0 in the
// percentage column — presumably never reached in practice; verify.
335  CkPrintf(" executing %d times: %d (%5.2f%%)\n",
336  i, xloopcnt[i], 100*double(xloopcnt[i])/sum);
337  }
338  }
339 
340  int xloopcnt[MAX];
341 };
342 
343 
// used with PriorityQueue
// when determining work mapped to node or PE
struct WorkIndex {
  float work;   // accumulated work estimate for this node/PE
  int index;    // node or PE index that the work is assigned to
  WorkIndex() : work(0), index(0) { }
  WorkIndex(float w, int i) : work(w), index(i) { }
  // Order by accumulated work (least-loaded first in a min-queue).
  // Fix: const-qualified so comparison works on const references;
  // return type kept as int for interface compatibility.
  int operator<=(const WorkIndex& wn) const {
    return (work <= wn.work);
  }
};
355 
356 
358 //
359 // ComputeMsmMgr
360 // chare group containing MSM parameters and constants;
361 // one chare object per PE
362 //
363 
364 class ComputeMsmMgr : public CBase_ComputeMsmMgr {
365  friend struct msm::PatchData;
366  friend class MsmBlock;
367  //friend class MsmGridCutoff;
368  friend class MsmBlockMap;
369  friend class MsmGridCutoffMap;
370 
371 public:
372  ComputeMsmMgr(); // entry
373  ~ComputeMsmMgr();
374 
375  void initialize(MsmInitMsg *); // entry with message
376  void initialize_create(); // entry no message
377 private:
378  void initialize2(); // split in two
379 public:
380 
381  void recvMsmBlockProxy(MsmBlockProxyMsg *); // entry with message
382  void recvMsmGridCutoffProxy(MsmGridCutoffProxyMsg *); // entry with message
383 
385  // entry with message
387  // entry with message
388 
389  void update(CkQdMsg *); // entry with message
390 
391  void compute(msm::Array<int>& patchIDList);
392  // called by local ComputeMsm object
393 
394  void addPotential(GridMsg *); // entry with message
395  void doneCompute(); // called by each local patch
396 
397 #ifdef MSM_TIMING
// Reset the per-stage timing accumulators for the next measurement.
398  void initTiming() {
399  for (int i = 0; i < MsmTimer::MAX; i++) msmTiming[i] = 0;
400  cntTiming = 0;
401  }
402  // every local object being timed should call this during initialization
403  void addTiming() {
404  numTiming++;
405  }
406  // object calls before being migrated
407  void subtractTiming() {
408  numTiming--;
409  }
// Once every timed local object has reported, contribute the summed
// timings to the MsmTimer reduction, then reset the accumulators.
410  void doneTiming() {
411  if (++cntTiming >= numTiming) {
412  CkCallback cb(CkReductionTarget(MsmTimer, done), msmTimer);
413  contribute(MsmTimer::MAX*sizeof(double), msmTiming,
414  CkReduction::sum_double, cb);
415  initTiming();
416  }
417  }
418 #endif
419 
420 #ifdef MSM_PROFILING
// Reset the inner-loop counters for the next round of profiling.
421  void initProfiling() {
422  for (int i = 0; i < MsmProfiler::MAX; i++) xLoopCnt[i] = 0;
423  cntProfiling = 0;
424  }
425  // every local object being profiled should call this during initialization
426  void addProfiling() {
427  numProfiling++;
428  }
429  // object calls before being migrated
430  void subtractProfiling() {
431  numProfiling--;
432  }
// Once every profiled local object has reported, contribute the
// counters to the MsmProfiler reduction, then reset.
433  void doneProfiling() {
434  if (++cntProfiling >= numProfiling) {
435  CkCallback cb(CkReductionTarget(MsmProfiler, done), msmProfiler);
436  contribute(MsmProfiler::MAX*sizeof(int), xLoopCnt,
437  CkReduction::sum_int, cb);
438  initProfiling(); // reset accumulators for next visit
439  }
440  }
441 #endif
442 
// Attach the local ComputeMsm object and link it back to this manager.
443  void setCompute(ComputeMsm *c) { msmCompute = c; c->setMgr(this); } // local
444 
446 
// Access the MSM decomposition map owned by this manager.
447  msm::Map& mapData() { return map; }
448 
// Number of grid levels in the MSM hierarchy.
449  int numLevels() const { return nlevels; }
450 
// sign(n): returns -1, 0, or +1 according to whether n is
// negative, zero, or positive.
static inline int sign(int n) {
  if (n > 0) return 1;
  if (n < 0) return -1;
  return 0;
}
455 
456 //private:
457  void setup_hgrid_1d(BigReal len, BigReal& hh, int& nn,
458  int& ia, int& ib, int isperiodic);
459  void setup_periodic_blocksize(int& bsize, int n);
460 
461  CProxy_ComputeMsmMgr msmProxy;
463 
466 
467  CProxy_MsmGridCutoff msmGridCutoff;
468  CProxy_MsmC1HermiteGridCutoff msmC1HermiteGridCutoff;
469  int numGridCutoff; // length of msmGridCutoff chare array
470 
472 
473  // find patch by patchID
474  // array is length number of patches, initialized to NULL
475  // allocate PatchData for only those patches on this PE
477 
478  // allocate subgrid used for receiving message data in addPotential()
479  // and sending on to PatchData::addPotential()
482 
483 #ifdef MSM_NODE_MAPPING
486  //msm::Array<int> nodecnt;
487  int blockFlatIndex(int level, int i, int j, int k) {
488  int n = 0;
489  for (int l = 0; l < level; l++) {
490  n += map.blockLevel[l].nn();
491  }
492  return (n + map.blockLevel[level].flatindex(i,j,k));
493  }
// Estimate the relative work of an MsmBlock as
//   scalingFactor * (block volume / full block volume)
//                 * (cutoff volume / full cutoff volume).
// NOTE(review): the method signature line is missing from this
// extraction (the body reads fields b.nrange and b.nrangeCutoff of
// some msm block-diagram type) — verify against the original source.
495  // XXX ratio of work for MsmBlock to MsmGridCutoff?
496  const float scalingFactor = 3;
497  const int volumeFullBlock = map.bsx[0] * map.bsy[0] * map.bsz[0];
498  msm::Ivec gn;
499  if (approx == C1HERMITE) {
500  gn = map.gc_c1hermite[0].extent();
501  }
502  else {
503  gn = map.gc[0].extent();
504  }
505  const int volumeFullCutoff = (map.bsx[0] + gn.i - 1) *
506  (map.bsy[0] + gn.j - 1) * (map.bsz[0] + gn.k - 1);
507  msm::Ivec n = b.nrange.extent();
508  int volumeBlock = n.i * n.j * n.k;
509  msm::Ivec nc = b.nrangeCutoff.extent();
510  int volumeCutoff = nc.i * nc.j * nc.k;
511  return( scalingFactor * (float(volumeBlock) / volumeFullBlock) *
512  (float(volumeCutoff) / volumeFullCutoff) );
513  }
514  float calcGcutWork(const msm::BlockSend& bs) {
515  const int volumeFullBlock = map.bsx[0] * map.bsy[0] * map.bsz[0];
516  msm::Ivec n = bs.nrange_wrap.extent();;
517  int volumeBlock = n.i * n.j * n.k;
518  return( float(volumeBlock) / volumeFullBlock );
519  }
520 #endif
521 
// Virial accumulation: gvsum collects per-grid-point virial factors,
// which are then reduced into the 6-component upper-triangle tensor
// virial[VXX..VZZ].
// NOTE(review): several method signatures in this section are missing
// from this extraction (the fragments below reset gvsum, close other
// methods, and loop over gvsum's index range) — verify against the
// original ComputeMsm.C.
522  // sum local virial factors
526  enum { VXX=0, VXY, VXZ, VYY, VYZ, VZZ, VMAX };
528 
530  gvsum.reset(0);
531  cntVirialContrib = 0;
532  }
535  }
538  }
541  // reduce all gvsum contributions into virial tensor
542  for (int n = 0; n < VMAX; n++) { virial[n] = 0; }
543  int ia = gvsum.ia();
544  int ib = gvsum.ib();
545  int ja = gvsum.ja();
546  int jb = gvsum.jb();
547  int ka = gvsum.ka();
548  int kb = gvsum.kb();
549  for (int k = ka; k <= kb; k++) {
550  for (int j = ja; j <= jb; j++) {
551  for (int i = ia; i <= ib; i++) {
552  Float cu = Float(i);
553  Float cv = Float(j);
554  Float cw = Float(k);
555  Float c = gvsum(i,j,k);
// (hufx..hwfz are presumably float components of the grid spacing
// vectors hu/hv/hw — TODO confirm against member declarations)
556  Float vx = cu*hufx + cv*hvfx + cw*hwfx;
557  Float vy = cu*hufy + cv*hvfy + cw*hwfy;
558  Float vz = cu*hufz + cv*hvfz + cw*hwfz;
559  virial[VXX] -= c * vx * vx;
560  virial[VXY] -= c * vx * vy;
561  virial[VXZ] -= c * vx * vz;
562  virial[VYY] -= c * vy * vy;
563  virial[VYZ] -= c * vy * vz;
564  virial[VZZ] -= c * vz * vz;
565  }
566  }
567  }
569  }
570  }
571 
572 #ifdef MSM_TIMING
573  CProxy_MsmTimer msmTimer;
574  double msmTiming[MsmTimer::MAX];
575  int numTiming; // total number of objects being timed
576  int cntTiming; // count the objects as they provide timing results
577  CkCallback *cbTiming;
578 #endif
579 
580 #ifdef MSM_PROFILING
581  CProxy_MsmProfiler msmProfiler;
582  int xLoopCnt[MsmProfiler::MAX];
583  int numProfiling; // total number of objects being profiled
584  int cntProfiling; // count the objects as they provide profiling results
585  CkCallback *cbProfiling;
586 #endif
587 
588  Vector c, u, v, w; // rescaled center and lattice vectors
589  Vector ru, rv, rw; // row vectors to transform to unit space
590  int ispu, ispv, ispw; // is periodic along u, v, w?
591 
592  Lattice lattice; // keep local copy of lattice
593  ScaledPosition smin; // keep min values for non-periodic dimensions
594  ScaledPosition smax; // keep max values for non-periodic dimensions
595  BigReal gridspacing; // preferred grid spacing
596  BigReal padding; // padding for non-periodic boundaries
597  BigReal gridScalingFactor; // scaling for Hermite interpolation
598  BigReal a; // cutoff distance
599  BigReal hxlen, hylen, hzlen; // first level grid spacings along basis vectors
600  BigReal hxlen_1, hylen_1, hzlen_1; // inverses of grid spacings
601  Vector hu, hv, hw; // first level grid spacing vectors
603  int nhx, nhy, nhz; // number of h spacings that cover cell
604  int approx; // ID for approximation
605  int split; // ID for splitting
606  int nlevels; // number of grid levels
607  int dispersion; // calculating dispersion forces?
608  BigReal gzero; // self energy factor from splitting
609 
610  Vector sglower; // lower corner of grid in scaled space
611  // corresponds to index (0,0,0)
612 
613  BigReal shx, shy, shz; // grid spacings in scaled space
615  Vector sx_shx; // row vector to transform interpolated force x
616  Vector sy_shy; // row vector to transform interpolated force y
617  Vector sz_shz; // row vector to transform interpolated force z
618  Float srx_x, srx_y, srx_z; // float version of sx_shx
619  Float sry_x, sry_y, sry_z; // float version of sy_shy
620  Float srz_x, srz_y, srz_z; // float version of sz_shz
621 
622  int s_edge;
623  int omega;
624 
627 
632 
633  enum {
634  // Approximation formulas with up to degree 9 polynomials.
636 
637  // Max stencil length for polynomial approximation.
639 
640  // Max stencil length when skipping zeros
641  // (almost half entries are zero for interpolating polynomials).
643 
644  // Number of scalar approximation formulaes
646  };
647 
648  // Degree of polynomial basis function Phi.
649  static const int PolyDegree[NUM_APPROX];
650 
651  // The stencil array lengths below.
652  static const int Nstencil[NUM_APPROX];
653 
654  // Index offsets from the stencil-centered grid element, to get
655  // to the correct contributing grid element.
657 
658  // The grid transfer stencils for the non-factored restriction and
659  // prolongation procedures.
661 
662  // Calculate the smoothing function and its derivative:
663  // g(R) and (d/dR)g(R), where R=r/a.
664  // Use double precision for calculating the MSM constant weights
665  // and coefficients. The resulting coefficents to be used in
666  // the repeatedly called algorithm are stored in single precision.
// Each case evaluates g and dg as nested (Horner-form) polynomials in
// (s-1).  The TAYLOR<k> cases appear to be the degree-k Taylor
// expansions of s^(-1/2) about s=1, and the TAYLOR<k>_DISP cases the
// corresponding expansions used for dispersion — coefficients are
// taken on trust from the original source; verify before altering.
667  static void splitting(BigReal& g, BigReal& dg, BigReal r_a, int _split) {
668  BigReal s = r_a * r_a; // s = (r/a)^2, assuming 0 <= s <= 1
669  switch (_split) {
670  case TAYLOR2:
671  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8));
672  dg = (2*r_a)*(-1./2 + (s-1)*(3./4));
673  break;
674  case TAYLOR3:
675  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16)));
676  dg = (2*r_a)*(-1./2 + (s-1)*(3./4 + (s-1)*(-15./16)));
677  break;
678  case TAYLOR4:
679  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
680  + (s-1)*(35./128))));
681  dg = (2*r_a)*(-1./2 + (s-1)*(3./4 + (s-1)*(-15./16
682  + (s-1)*(35./32))));
683  break;
684  case TAYLOR5:
685  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
686  + (s-1)*(35./128 + (s-1)*(-63./256)))));
687  dg = (2*r_a)*(-1./2 + (s-1)*(3./4 + (s-1)*(-15./16
688  + (s-1)*(35./32 + (s-1)*(-315./256)))));
689  break;
690  case TAYLOR6:
691  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
692  + (s-1)*(35./128 + (s-1)*(-63./256
693  + (s-1)*(231./1024))))));
694  dg = (2*r_a)*(-1./2 + (s-1)*(3./4 + (s-1)*(-15./16
695  + (s-1)*(35./32 + (s-1)*(-315./256
696  + (s-1)*(693./512))))));
697  break;
698  case TAYLOR7:
699  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
700  + (s-1)*(35./128 + (s-1)*(-63./256
701  + (s-1)*(231./1024 + (s-1)*(-429./2048)))))));
702  dg = (2*r_a)*(-1./2 + (s-1)*(3./4 + (s-1)*(-15./16
703  + (s-1)*(35./32 + (s-1)*(-315./256
704  + (s-1)*(693./512 + (s-1)*(-3003./2048)))))));
705  break;
706  case TAYLOR8:
707  g = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
708  + (s-1)*(35./128 + (s-1)*(-63./256
709  + (s-1)*(231./1024 + (s-1)*(-429./2048
710  + (s-1)*(6435./32768))))))));
711  dg = (2*r_a)*(-1./2 + (s-1)*(3./4 + (s-1)*(-15./16
712  + (s-1)*(35./32 + (s-1)*(-315./256
713  + (s-1)*(693./512 + (s-1)*(-3003./2048
714  + (s-1)*(6435./4096))))))));
715  break;
716  case TAYLOR2_DISP:
717  g = 1 + (s-1)*(-3 + (s-1)*(6));
718  dg = (2*r_a)*(-3 + (s-1)*(12));
719  break;
720  case TAYLOR3_DISP:
721  g = 1 + (s-1)*(-3 + (s-1)*(6 + (s-1)*(-10)));
722  dg = (2*r_a)*(-3 + (s-1)*(12 + (s-1)*(-30)));
723  break;
724  case TAYLOR4_DISP:
725  g = 1 + (s-1)*(-3 + (s-1)*(6 + (s-1)*(-10 + (s-1)*(15))));
726  dg = (2*r_a)*(-3 + (s-1)*(12 + (s-1)*(-30 + (s-1)*(60))));
727  break;
728  case TAYLOR5_DISP:
729  g = 1 + (s-1)*(-3 + (s-1)*(6 + (s-1)*(-10
730  + (s-1)*(15 + (s-1)*(-21)))));
731  dg = (2*r_a)*(-3 + (s-1)*(12 + (s-1)*(-30
732  + (s-1)*(60 + (s-1)*(-105)))));
733  break;
734  case TAYLOR6_DISP:
735  g = 1 + (s-1)*(-3 + (s-1)*(6 + (s-1)*(-10
736  + (s-1)*(15 + (s-1)*(-21 + (s-1)*(28))))));
737  dg = (2*r_a)*(-3 + (s-1)*(12 + (s-1)*(-30
738  + (s-1)*(60 + (s-1)*(-105 + (s-1)*(168))))));
739  break;
740  case TAYLOR7_DISP:
741  g = 1 + (s-1)*(-3 + (s-1)*(6 + (s-1)*(-10
742  + (s-1)*(15 + (s-1)*(-21 + (s-1)*(28
743  + (s-1)*(-36)))))));
744  dg = (2*r_a)*(-3 + (s-1)*(12 + (s-1)*(-30
745  + (s-1)*(60 + (s-1)*(-105 + (s-1)*(168
746  + (s-1)*(-252)))))));
747  break;
748  case TAYLOR8_DISP:
749  g = 1 + (s-1)*(-3 + (s-1)*(6 + (s-1)*(-10
750  + (s-1)*(15 + (s-1)*(-21 + (s-1)*(28
751  + (s-1)*(-36 + (s-1)*(45))))))));
752  dg = (2*r_a)*(-3 + (s-1)*(12 + (s-1)*(-30
753  + (s-1)*(60 + (s-1)*(-105 + (s-1)*(168
754  + (s-1)*(-252 + (s-1)*(360))))))));
755  break;
756  default:
757  NAMD_die("Unknown MSM splitting.");
758  } // switch
759  } // splitting()
760 
// Evaluate the 1-D interpolation basis functions phi[] at fractional
// offset t for the currently selected approximation (member approx).
// Each case decrements t between entries, so every phi[i] is evaluated
// at the offset relative to its own stencil point.  NONIC4 computes in
// double precision and narrows to Float at the end.
761  void stencil_1d(Float phi[], Float t) {
762  switch (approx) {
763  case CUBIC:
764  phi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t);
765  t--;
766  phi[1] = (1 - t) * (1 + t - 1.5f * t * t);
767  t--;
768  phi[2] = (1 + t) * (1 - t - 1.5f * t * t);
769  t--;
770  phi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t);
771  break;
772  case QUINTIC:
773  phi[0] = (1.f/24) * (1-t) * (2-t) * (3-t) * (3-t) * (4-t);
774  t--;
775  phi[1] = (1-t) * (2-t) * (3-t) * ((1.f/6)
776  + t * (0.375f - (5.f/24)*t));
777  t--;
778  phi[2] = (1-t*t) * (2-t) * (0.5f + t * (0.25f - (5.f/12)*t));
779  t--;
780  phi[3] = (1-t*t) * (2+t) * (0.5f - t * (0.25f + (5.f/12)*t));
781  t--;
782  phi[4] = (1+t) * (2+t) * (3+t) * ((1.f/6)
783  - t * (0.375f + (5.f/24)*t));
784  t--;
785  phi[5] = (1.f/24) * (1+t) * (2+t) * (3+t) * (3+t) * (4+t);
786  break;
787  case QUINTIC2:
788  phi[0] = (1.f/24) * (3-t) * (3-t) * (3-t) * (t-2) * (5*t-8);
789  t--;
790  phi[1] = (-1.f/24) * (2-t) * (t-1) * (-48+t*(153+t*(-114+t*25)));
791  t--;
792  phi[2] = (1.f/12) * (1-t) * (12+t*(12+t*(-3+t*(-38+t*25))));
793  t--;
794  phi[3] = (1.f/12) * (1+t) * (12+t*(-12+t*(-3+t*(38+t*25))));
795  t--;
796  phi[4] = (-1.f/24) * (2+t) * (t+1) * (48+t*(153+t*(114+t*25)));
797  t--;
798  phi[5] = (1.f/24) * (3+t) * (3+t) * (3+t) * (t+2) * (5*t+8);
799  break;
800  case SEPTIC:
801  phi[0] = (-1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-4)*(t-5)*(t-6);
802  t--;
803  phi[1] = (1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-5)*(-6+t*(-20+7*t));
804  t--;
805  phi[2] = (-1.f/240)*(t*t-1)*(t-2)*(t-3)*(t-4)*(-10+t*(-12+7*t));
806  t--;
807  phi[3] = (1.f/144)*(t*t-1)*(t*t-4)*(t-3)*(-12+t*(-4+7*t));
808  t--;
809  phi[4] = (-1.f/144)*(t*t-1)*(t*t-4)*(t+3)*(-12+t*(4+7*t));
810  t--;
811  phi[5] = (1.f/240)*(t*t-1)*(t+2)*(t+3)*(t+4)*(-10+t*(12+7*t));
812  t--;
813  phi[6] = (-1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(-6+t*(20+7*t));
814  t--;
815  phi[7] = (1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+4)*(t+5)*(t+6);
816  break;
817  case SEPTIC3:
818  phi[0] = (3632.f/5) + t*((-7456.f/5) + t*((58786.f/45) + t*(-633
819  + t*((26383.f/144) + t*((-22807.f/720) + t*((727.f/240)
820  + t*(-89.f/720)))))));
821  t--;
822  phi[1] = -440 + t*((25949.f/20) + t*((-117131.f/72) + t*((2247.f/2)
823  + t*((-66437.f/144) + t*((81109.f/720) + t*((-727.f/48)
824  + t*(623.f/720)))))));
825  t--;
826  phi[2] = (138.f/5) + t*((-8617.f/60) + t*((12873.f/40) + t*((-791.f/2)
827  + t*((4557.f/16) + t*((-9583.f/80) + t*((2181.f/80)
828  + t*(-623.f/240)))))));
829  t--;
830  phi[3] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((2569.f/144)
831  + t*((-727.f/48) + t*(623.f/144)))));
832  t--;
833  phi[4] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((-2569.f/144)
834  + t*((-727.f/48) + t*(-623.f/144)))));
835  t--;
836  phi[5] = (138.f/5) + t*((8617.f/60) + t*((12873.f/40) + t*((791.f/2)
837  + t*((4557.f/16) + t*((9583.f/80) + t*((2181.f/80)
838  + t*(623.f/240)))))));
839  t--;
840  phi[6] = -440 + t*((-25949.f/20) + t*((-117131.f/72) + t*((-2247.f/2)
841  + t*((-66437.f/144) + t*((-81109.f/720) + t*((-727.f/48)
842  + t*(-623.f/720)))))));
843  t--;
844  phi[7] = (3632.f/5) + t*((7456.f/5) + t*((58786.f/45) + t*(633
845  + t*((26383.f/144) + t*((22807.f/720) + t*((727.f/240)
846  + t*(89.f/720)))))));
847  break;
848  case NONIC:
849  phi[0] = (-1.f/40320)*(t-8)*(t-7)*(t-6)*(t-5)*(t-5)*(t-4)*(t-3)*
850  (t-2)*(t-1);
851  t--;
852  phi[1] = (1.f/40320)*(t-7)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*
853  (-8+t*(-35+9*t));
854  t--;
855  phi[2] = (-1.f/10080)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*
856  (-14+t*(-25+9*t));
857  t--;
858  phi[3] = (1.f/1440)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*
859  (-6+t*(-5+3*t));
860  t--;
861  phi[4] = (-1.f/2880)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*
862  (-20+t*(-5+9*t));
863  t--;
864  phi[5] = (1.f/2880)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*
865  (-20+t*(5+9*t));
866  t--;
867  phi[6] = (-1.f/1440)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*
868  (-6+t*(5+3*t));
869  t--;
870  phi[7] = (1.f/10080)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*
871  (-14+t*(25+9*t));
872  t--;
873  phi[8] = (-1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*(t+7)*
874  (-8+t*(35+9*t));
875  t--;
876  phi[9] = (1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+5)*(t+6)*
877  (t+7)*(t+8);
878  break;
879  case NONIC4:
880  { // begin grouping to define local variables
881  double Tphi[10];
882  double T=t;
883  Tphi[0] = 439375./7+T*(-64188125./504+T*(231125375./2016
884  +T*(-17306975./288+T*(7761805./384+T*(-2895587./640
885  +T*(129391./192+T*(-259715./4032+T*(28909./8064
886  +T*(-3569./40320)))))))));
887  T--;
888  Tphi[1] = -56375+T*(8314091./56+T*(-49901303./288+T*(3763529./32
889  +T*(-19648027./384+T*(9469163./640+T*(-545977./192
890  +T*(156927./448+T*(-28909./1152
891  +T*(3569./4480)))))))));
892  T--;
893  Tphi[2] = 68776./7+T*(-1038011./28+T*(31157515./504+T*(-956669./16
894  +T*(3548009./96+T*(-2422263./160+T*(197255./48
895  +T*(-19959./28+T*(144545./2016
896  +T*(-3569./1120)))))))));
897  T--;
898  Tphi[3] = -154+T*(12757./12+T*(-230123./72+T*(264481./48
899  +T*(-576499./96+T*(686147./160+T*(-96277./48
900  +T*(14221./24+T*(-28909./288+T*(3569./480)))))))));
901  T--;
902  Tphi[4] = 1+T*T*(-205./144+T*T*(91./192+T*(-6181./320
903  +T*(6337./96+T*(-2745./32+T*(28909./576
904  +T*(-3569./320)))))));
905  T--;
906  Tphi[5] = 1+T*T*(-205./144+T*T*(91./192+T*(6181./320
907  +T*(6337./96+T*(2745./32+T*(28909./576
908  +T*(3569./320)))))));
909  T--;
910  Tphi[6] = -154+T*(-12757./12+T*(-230123./72+T*(-264481./48
911  +T*(-576499./96+T*(-686147./160+T*(-96277./48
912  +T*(-14221./24+T*(-28909./288+T*(-3569./480)))))))));
913  T--;
914  Tphi[7] = 68776./7+T*(1038011./28+T*(31157515./504+T*(956669./16
915  +T*(3548009./96+T*(2422263./160+T*(197255./48
916  +T*(19959./28+T*(144545./2016+T*(3569./1120)))))))));
917  T--;
918  Tphi[8] = -56375+T*(-8314091./56+T*(-49901303./288+T*(-3763529./32
919  +T*(-19648027./384+T*(-9469163./640+T*(-545977./192
920  +T*(-156927./448+T*(-28909./1152
921  +T*(-3569./4480)))))))));
922  T--;
923  Tphi[9] = 439375./7+T*(64188125./504+T*(231125375./2016
924  +T*(17306975./288+T*(7761805./384+T*(2895587./640
925  +T*(129391./192+T*(259715./4032+T*(28909./8064
926  +T*(3569./40320)))))))));
927  for (int i=0; i < 10; i++) {
928  phi[i] = Float(Tphi[i]);
929  }
930  } // end grouping to define local variables
931  break;
932  default:
933  NAMD_die("Unknown MSM approximation.");
934  } // switch
935  } // stencil_1d()
936 
937  void d_stencil_1d(Float dphi[], Float phi[], Float t, Float h_1) {
938  switch (approx) {
939  case CUBIC:
940  phi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t);
941  dphi[0] = (1.5f * t - 2) * (2 - t) * h_1;
942  t--;
943  phi[1] = (1 - t) * (1 + t - 1.5f * t * t);
944  dphi[1] = (-5 + 4.5f * t) * t * h_1;
945  t--;
946  phi[2] = (1 + t) * (1 - t - 1.5f * t * t);
947  dphi[2] = (-5 - 4.5f * t) * t * h_1;
948  t--;
949  phi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t);
950  dphi[3] = (1.5f * t + 2) * (2 + t) * h_1;
951  break;
952  case QUINTIC:
953  phi[0] = (1.f/24) * (1-t) * (2-t) * (3-t) * (3-t) * (4-t);
954  dphi[0] = ((-1.f/24) * ((3-t) * (3-t) * (14 + t * (-14 + 3*t))
955  + 2 * (1-t) * (2-t) * (3-t) * (4-t))) * h_1;
956  t--;
957  phi[1] = (1-t) * (2-t) * (3-t) * ((1.f/6)
958  + t * (0.375f - (5.f/24)*t));
959  dphi[1] = (-((1.f/6) + t * (0.375f - (5.f/24)*t)) *
960  (11 + t * (-12 + 3*t)) + (1-t) * (2-t) * (3-t) *
961  (0.375f - (5.f/12)*t)) * h_1;
962  t--;
963  phi[2] = (1-t*t) * (2-t) * (0.5f + t * (0.25f - (5.f/12)*t));
964  dphi[2] = (-(0.5f + t * (0.25f - (5.f/12)*t)) * (1 + t * (4 - 3*t))
965  + (1-t*t) * (2-t) * (0.25f - (5.f/6)*t)) * h_1;
966  t--;
967  phi[3] = (1-t*t) * (2+t) * (0.5f - t * (0.25f + (5.f/12)*t));
968  dphi[3] = ((0.5f + t * (-0.25f - (5.f/12)*t)) * (1 + t * (-4 - 3*t))
969  - (1-t*t) * (2+t) * (0.25f + (5.f/6)*t)) * h_1;
970  t--;
971  phi[4] = (1+t) * (2+t) * (3+t) * ((1.f/6)
972  - t * (0.375f + (5.f/24)*t));
973  dphi[4] = (((1.f/6) + t * (-0.375f - (5.f/24)*t)) *
974  (11 + t * (12 + 3*t)) - (1+t) * (2+t) * (3+t) *
975  (0.375f + (5.f/12)*t)) * h_1;
976  t--;
977  phi[5] = (1.f/24) * (1+t) * (2+t) * (3+t) * (3+t) * (4+t);
978  dphi[5] = ((1.f/24) * ((3+t) * (3+t) * (14 + t * (14 + 3*t))
979  + 2 * (1+t) * (2+t) * (3+t) * (4+t))) * h_1;
980  break;
981  case QUINTIC2:
982  phi[0] = (1.f/24) * (3-t) * (3-t) * (3-t) * (t-2) * (5*t-8);
983  dphi[0] = ((1.f/24) * (3-t) * (3-t) * ((3-t)*(5*t-8)
984  - 3*(t-2)*(5*t-8) + 5*(t-2)*(3-t))) * h_1;
985  t--;
986  phi[1] = (-1.f/24) * (2-t) * (t-1) * (-48+t*(153+t*(-114+t*25)));
987  dphi[1] = ((-1.f/24) * ((2-t)*(-48+t*(153+t*(-114+t*25)))
988  - (t-1)* (-48+t*(153+t*(-114+t*25)))
989  + (2-t)*(t-1)*(153+t*(-228+t*75)))) * h_1;
990  t--;
991  phi[2] = (1.f/12) * (1-t) * (12+t*(12+t*(-3+t*(-38+t*25))));
992  dphi[2] = ((1.f/12) * (-(12+t*(12+t*(-3+t*(-38+t*25))))
993  + (1-t)*(12+t*(-6+t*(-114+t*100))))) * h_1;
994  t--;
995  phi[3] = (1.f/12) * (1+t) * (12+t*(-12+t*(-3+t*(38+t*25))));
996  dphi[3] = ((1.f/12) * ((12+t*(-12+t*(-3+t*(38+t*25))))
997  + (1+t)*(-12+t*(-6+t*(114+t*100))))) * h_1;
998  t--;
999  phi[4] = (-1.f/24) * (2+t) * (t+1) * (48+t*(153+t*(114+t*25)));
1000  dphi[4] = ((-1.f/24) * ((2+t)*(48+t*(153+t*(114+t*25)))
1001  + (t+1)* (48+t*(153+t*(114+t*25)))
1002  + (2+t)*(t+1)*(153+t*(228+t*75)))) * h_1;
1003  t--;
1004  phi[5] = (1.f/24) * (3+t) * (3+t) * (3+t) * (t+2) * (5*t+8);
1005  dphi[5] = ((1.f/24) * (3+t) * (3+t) * ((3+t)*(5*t+8)
1006  + 3*(t+2)*(5*t+8) + 5*(t+2)*(3+t))) * h_1;
1007  break;
1008  case SEPTIC:
1009  phi[0] = (-1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-4)*(t-5)*(t-6);
1010  dphi[0] = (-1.f/720)*(t-4)*(-1944+t*(3644+t*(-2512+t*(807
1011  +t*(-122+t*7))))) * h_1;
1012  t--;
1013  phi[1] = (1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-5)*(-6+t*(-20+7*t));
1014  dphi[1] = (1.f/720)*(756+t*(-9940+t*(17724+t*(-12740+t*(4445
1015  +t*(-750+t*49)))))) * h_1;
1016  t--;
1017  phi[2] = (-1.f/240)*(t*t-1)*(t-2)*(t-3)*(t-4)*(-10+t*(-12+7*t));
1018  dphi[2] = (-1.f/240)*(-28+t*(1260+t*(-756+t*(-1260+t*(1365
1019  +t*(-450+t*49)))))) * h_1;
1020  t--;
1021  phi[3] = (1.f/144)*(t*t-1)*(t*t-4)*(t-3)*(-12+t*(-4+7*t));
1022  dphi[3] = (1.f/144)*t*(-560+t*(84+t*(644+t*(-175
1023  +t*(-150+t*49))))) * h_1;
1024  t--;
1025  phi[4] = (-1.f/144)*(t*t-1)*(t*t-4)*(t+3)*(-12+t*(4+7*t));
1026  dphi[4] = (-1.f/144)*t*(560+t*(84+t*(-644+t*(-175
1027  +t*(150+t*49))))) * h_1;
1028  t--;
1029  phi[5] = (1.f/240)*(t*t-1)*(t+2)*(t+3)*(t+4)*(-10+t*(12+7*t));
1030  dphi[5] = (1.f/240)*(-28+t*(-1260+t*(-756+t*(1260+t*(1365
1031  +t*(450+t*49)))))) * h_1;
1032  t--;
1033  phi[6] = (-1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(-6+t*(20+7*t));
1034  dphi[6] = (-1.f/720)*(756+t*(9940+t*(17724+t*(12740+t*(4445
1035  +t*(750+t*49)))))) * h_1;
1036  t--;
1037  phi[7] = (1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+4)*(t+5)*(t+6);
1038  dphi[7] = (1.f/720)*(t+4)*(1944+t*(3644+t*(2512+t*(807
1039  +t*(122+t*7))))) * h_1;
1040  break;
1041  case SEPTIC3:
1042  phi[0] = (3632.f/5) + t*((-7456.f/5) + t*((58786.f/45) + t*(-633
1043  + t*((26383.f/144) + t*((-22807.f/720) + t*((727.f/240)
1044  + t*(-89.f/720)))))));
1045  dphi[0] = ((-7456.f/5) + t*((117572.f/45) + t*(-1899
1046  + t*((26383.f/36) + t*((-22807.f/144) + t*((727.f/40)
1047  + t*(-623.f/720))))))) * h_1;
1048  t--;
1049  phi[1] = -440 + t*((25949.f/20) + t*((-117131.f/72) + t*((2247.f/2)
1050  + t*((-66437.f/144) + t*((81109.f/720) + t*((-727.f/48)
1051  + t*(623.f/720)))))));
1052  dphi[1] = ((25949.f/20) + t*((-117131.f/36) + t*((6741.f/2)
1053  + t*((-66437.f/36) + t*((81109.f/144) + t*((-727.f/8)
1054  + t*(4361.f/720))))))) * h_1;
1055  t--;
1056  phi[2] = (138.f/5) + t*((-8617.f/60) + t*((12873.f/40) + t*((-791.f/2)
1057  + t*((4557.f/16) + t*((-9583.f/80) + t*((2181.f/80)
1058  + t*(-623.f/240)))))));
1059  dphi[2] = ((-8617.f/60) + t*((12873.f/20) + t*((-2373.f/2)
1060  + t*((4557.f/4) + t*((-9583.f/16) + t*((6543.f/40)
1061  + t*(-4361.f/240))))))) * h_1;
1062  t--;
1063  phi[3] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((2569.f/144)
1064  + t*((-727.f/48) + t*(623.f/144)))));
1065  dphi[3] = (t*((-49.f/18) + t*t*((-959.f/36) + t*((12845.f/144)
1066  + t*((-727.f/8) + t*(4361.f/144)))))) * h_1;
1067  t--;
1068  phi[4] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((-2569.f/144)
1069  + t*((-727.f/48) + t*(-623.f/144)))));
1070  dphi[4] = (t*((-49.f/18) + t*t*((-959.f/36) + t*((-12845.f/144)
1071  + t*((-727.f/8) + t*(-4361.f/144)))))) * h_1;
1072  t--;
1073  phi[5] = (138.f/5) + t*((8617.f/60) + t*((12873.f/40) + t*((791.f/2)
1074  + t*((4557.f/16) + t*((9583.f/80) + t*((2181.f/80)
1075  + t*(623.f/240)))))));
1076  dphi[5] = ((8617.f/60) + t*((12873.f/20) + t*((2373.f/2)
1077  + t*((4557.f/4) + t*((9583.f/16) + t*((6543.f/40)
1078  + t*(4361.f/240))))))) * h_1;
1079  t--;
1080  phi[6] = -440 + t*((-25949.f/20) + t*((-117131.f/72) + t*((-2247.f/2)
1081  + t*((-66437.f/144) + t*((-81109.f/720) + t*((-727.f/48)
1082  + t*(-623.f/720)))))));
1083  dphi[6] = ((-25949.f/20) + t*((-117131.f/36) + t*((-6741.f/2)
1084  + t*((-66437.f/36) + t*((-81109.f/144) + t*((-727.f/8)
1085  + t*(-4361.f/720))))))) * h_1;
1086  t--;
1087  phi[7] = (3632.f/5) + t*((7456.f/5) + t*((58786.f/45) + t*(633
1088  + t*((26383.f/144) + t*((22807.f/720) + t*((727.f/240)
1089  + t*(89.f/720)))))));
1090  dphi[7] = ((7456.f/5) + t*((117572.f/45) + t*(1899
1091  + t*((26383.f/36) + t*((22807.f/144) + t*((727.f/40)
1092  + t*(623.f/720))))))) * h_1;
1093  break;
1094  case NONIC:
1095  phi[0] = (-1.f/40320)*(t-8)*(t-7)*(t-6)*(t-5)*(t-5)*(t-4)*(t-3)*
1096  (t-2)*(t-1);
1097  dphi[0] = (-1.f/40320)*(t-5)*(-117648+t*(256552+t*(-221416
1098  +t*(99340+t*(-25261+t*(3667+t*(-283+t*9)))))))*h_1;
1099  t--;
1100  phi[1] = (1.f/40320)*(t-7)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*
1101  (-8+t*(-35+9*t));
1102  dphi[1] = (1.f/40320)*(71856+t*(-795368+t*(1569240+t*(-1357692
1103  +t*(634725+t*(-172116+t*(27090+t*(-2296+t*81))))))))*h_1;
1104  t--;
1105  phi[2] = (-1.f/10080)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*
1106  (-14+t*(-25+9*t));
1107  dphi[2] = (1.f/10080)*(3384+t*(-69080+t*(55026
1108  +t*(62580+t*(-99225+t*(51660+t*(-13104+t*(1640
1109  +t*(-81)))))))))*h_1;
1110  t--;
1111  phi[3] = (1.f/1440)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*
1112  (-6+t*(-5+3*t));
1113  dphi[3] = (1.f/1440)*(72+t*(-6344+t*(2070
1114  +t*(7644+t*(-4725+t*(-828+t*(1260+t*(-328+t*27))))))))*h_1;
1115  t--;
1116  phi[4] = (-1.f/2880)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*
1117  (-20+t*(-5+9*t));
1118  dphi[4] = (-1.f/2880)*t*(10792+t*(-972+t*(-12516
1119  +t*(2205+t*(3924+t*(-882+t*(-328+t*81)))))))*h_1;
1120  t--;
1121  phi[5] = (1.f/2880)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*
1122  (-20+t*(5+9*t));
1123  dphi[5] = (1.f/2880)*t*(-10792+t*(-972+t*(12516
1124  +t*(2205+t*(-3924+t*(-882+t*(328+t*81)))))))*h_1;
1125  t--;
1126  phi[6] = (-1.f/1440)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*
1127  (-6+t*(5+3*t));
1128  dphi[6] = (1.f/1440)*(-72+t*(-6344+t*(-2070
1129  +t*(7644+t*(4725+t*(-828+t*(-1260+t*(-328+t*(-27)))))))))*h_1;
1130  t--;
1131  phi[7] = (1.f/10080)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*
1132  (-14+t*(25+9*t));
1133  dphi[7] = (1.f/10080)*(-3384+t*(-69080+t*(-55026
1134  +t*(62580+t*(99225+t*(51660+t*(13104+t*(1640+t*81))))))))*h_1;
1135  t--;
1136  phi[8] = (-1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*(t+7)*
1137  (-8+t*(35+9*t));
1138  dphi[8] = (-1.f/40320)*(71856+t*(795368+t*(1569240
1139  +t*(1357692+t*(634725+t*(172116+t*(27090+t*(2296
1140  +t*81))))))))*h_1;
1141  t--;
1142  phi[9] = (1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+5)*(t+6)*
1143  (t+7)*(t+8);
1144  dphi[9] = (1.f/40320)*(t+5)*(117648+t*(256552+t*(221416
1145  +t*(99340+t*(25261+t*(3667+t*(283+t*9)))))))*h_1;
1146  break;
1147  case NONIC4:
1148  { // begin grouping to define local variables
1149  double Tphi[10], Tdphi[10];
1150  double T=t;
1151  Tphi[0] = 439375./7+T*(-64188125./504+T*(231125375./2016
1152  +T*(-17306975./288+T*(7761805./384+T*(-2895587./640
1153  +T*(129391./192+T*(-259715./4032+T*(28909./8064
1154  +T*(-3569./40320)))))))));
1155  Tdphi[0] = (-64188125./504+T*(231125375./1008
1156  +T*(-17306975./96+T*(7761805./96+T*(-2895587./128
1157  +T*(129391./32+T*(-259715./576+T*(28909./1008
1158  +T*(-3569./4480))))))))) * h_1;
1159  T--;
1160  Tphi[1] = -56375+T*(8314091./56+T*(-49901303./288+T*(3763529./32
1161  +T*(-19648027./384+T*(9469163./640+T*(-545977./192
1162  +T*(156927./448+T*(-28909./1152
1163  +T*(3569./4480)))))))));
1164  Tdphi[1] = (8314091./56+T*(-49901303./144+T*(11290587./32
1165  +T*(-19648027./96+T*(9469163./128+T*(-545977./32
1166  +T*(156927./64+T*(-28909./144
1167  +T*(32121./4480))))))))) * h_1;
1168  T--;
1169  Tphi[2] = 68776./7+T*(-1038011./28+T*(31157515./504+T*(-956669./16
1170  +T*(3548009./96+T*(-2422263./160+T*(197255./48
1171  +T*(-19959./28+T*(144545./2016
1172  +T*(-3569./1120)))))))));
1173  Tdphi[2] = (-1038011./28+T*(31157515./252+T*(-2870007./16
1174  +T*(3548009./24+T*(-2422263./32+T*(197255./8
1175  +T*(-19959./4+T*(144545./252
1176  +T*(-32121./1120))))))))) * h_1;
1177  T--;
1178  Tphi[3] = -154+T*(12757./12+T*(-230123./72+T*(264481./48
1179  +T*(-576499./96+T*(686147./160+T*(-96277./48
1180  +T*(14221./24+T*(-28909./288+T*(3569./480)))))))));
1181  Tdphi[3] = (12757./12+T*(-230123./36+T*(264481./16
1182  +T*(-576499./24+T*(686147./32+T*(-96277./8
1183  +T*(99547./24+T*(-28909./36
1184  +T*(10707./160))))))))) * h_1;
1185  T--;
1186  Tphi[4] = 1+T*T*(-205./144+T*T*(91./192+T*(-6181./320
1187  +T*(6337./96+T*(-2745./32+T*(28909./576
1188  +T*(-3569./320)))))));
1189  Tdphi[4] = T*(-205./72+T*T*(91./48+T*(-6181./64
1190  +T*(6337./16+T*(-19215./32+T*(28909./72
1191  +T*(-32121./320))))))) * h_1;
1192  T--;
1193  Tphi[5] = 1+T*T*(-205./144+T*T*(91./192+T*(6181./320
1194  +T*(6337./96+T*(2745./32+T*(28909./576
1195  +T*(3569./320)))))));
1196  Tdphi[5] = T*(-205./72+T*T*(91./48+T*(6181./64
1197  +T*(6337./16+T*(19215./32+T*(28909./72
1198  +T*(32121./320))))))) * h_1;
1199  T--;
1200  Tphi[6] = -154+T*(-12757./12+T*(-230123./72+T*(-264481./48
1201  +T*(-576499./96+T*(-686147./160+T*(-96277./48
1202  +T*(-14221./24+T*(-28909./288+T*(-3569./480)))))))));
1203  Tdphi[6] = (-12757./12+T*(-230123./36+T*(-264481./16
1204  +T*(-576499./24+T*(-686147./32+T*(-96277./8
1205  +T*(-99547./24+T*(-28909./36
1206  +T*(-10707./160))))))))) * h_1;
1207  T--;
1208  Tphi[7] = 68776./7+T*(1038011./28+T*(31157515./504+T*(956669./16
1209  +T*(3548009./96+T*(2422263./160+T*(197255./48
1210  +T*(19959./28+T*(144545./2016+T*(3569./1120)))))))));
1211  Tdphi[7] = (1038011./28+T*(31157515./252+T*(2870007./16
1212  +T*(3548009./24+T*(2422263./32+T*(197255./8
1213  +T*(19959./4+T*(144545./252
1214  +T*(32121./1120))))))))) * h_1;
1215  T--;
1216  Tphi[8] = -56375+T*(-8314091./56+T*(-49901303./288+T*(-3763529./32
1217  +T*(-19648027./384+T*(-9469163./640+T*(-545977./192
1218  +T*(-156927./448+T*(-28909./1152
1219  +T*(-3569./4480)))))))));
1220  Tdphi[8] = (-8314091./56+T*(-49901303./144+T*(-11290587./32
1221  +T*(-19648027./96+T*(-9469163./128+T*(-545977./32
1222  +T*(-156927./64+T*(-28909./144
1223  +T*(-32121./4480))))))))) * h_1;
1224  T--;
1225  Tphi[9] = 439375./7+T*(64188125./504+T*(231125375./2016
1226  +T*(17306975./288+T*(7761805./384+T*(2895587./640
1227  +T*(129391./192+T*(259715./4032+T*(28909./8064
1228  +T*(3569./40320)))))))));
1229  Tdphi[9] = (64188125./504+T*(231125375./1008
1230  +T*(17306975./96+T*(7761805./96+T*(2895587./128
1231  +T*(129391./32+T*(259715./576+T*(28909./1008
1232  +T*(3569./4480))))))))) * h_1;
1233  for (int i=0; i < 10; i++) {
1234  phi[i] = Float(Tphi[i]);
1235  dphi[i] = Float(Tdphi[i]);
1236  }
1237  } // end grouping to define local variables
1238  break;
1239  default:
1240  NAMD_die("Unknown MSM approximation.");
1241  } // switch
1242  } // d_stencil_1d()
1243 
1244  void stencil_1d_c1hermite(Float phi[], Float psi[], Float t, Float h) {
1245  phi[0] = (1 - t) * (1 - t) * (1 + 2*t);
1246  psi[0] = h * t * (1 - t) * (1 - t);
1247  t--;
1248  phi[1] = (1 + t) * (1 + t) * (1 - 2*t);
1249  psi[1] = h * t * (1 + t) * (1 + t);
1250  }
1251 
    Float dphi[], Float phi[], Float dpsi[], Float psi[],
    Float t, Float h, Float h_1) {
    // C1 Hermite basis together with first derivatives.
    // phi/psi are the same value- and derivative-interpolating cubics
    // as in stencil_1d_c1hermite; dphi/dpsi are their derivatives with
    // respect to the grid coordinate.  h_1 = 1/h converts d/dt into
    // d/dx for dphi; in dpsi the h factor inside psi cancels the 1/h,
    // leaving a dimensionless expression.
    // left node, fractional offset t in [0,1]
    phi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    dphi[0] = -6 * t * (1 - t) * h_1;
    psi[0] = h * t * (1 - t) * (1 - t);
    dpsi[0] = (1 - t) * (1 - 3*t);
    t--;  // shift to the right node: t becomes t-1 in [-1,0]
    phi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    dphi[1] = -6 * t * (1 + t) * h_1;
    psi[1] = h * t * (1 + t) * (1 + t);
    dpsi[1] = (1 + t) * (1 + 3*t);
  }
1265 
  // Compute the first n derivatives (d/ds)^k, k = 0..n-1, of the MSM
  // splitting function at s, storing them into pg[0..n-1].
  // Here s is the squared scaled distance (r/a)^2.  For s <= 1 the
  // smoothed part applies: each TAYLORp case below is the degree-p
  // Taylor expansion of s^(-1/2) about s = 1 (coefficients are the
  // binomial series of s^(-1/2)), differentiated term by term; the
  // "if (k == n) break" lines stop as soon as the requested number of
  // derivatives has been produced.  For s > 1 the splitting equals the
  // unsmoothed s^(-1/2), whose exact derivatives are generated
  // iteratively.  Any derivatives beyond those the polynomial supplies
  // are zero-filled at the end.
  static void ndsplitting(BigReal pg[], BigReal s, int n, int _split) {
    int k = 0;
    if (k == n) return;  // caller requested no derivatives
    if (s <= 1) {
      // compute derivatives of smoothed part
      switch (_split) {
        case TAYLOR2:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4);
          if (k == n) break;
          pg[k++] = 3./4;
          break;
        case TAYLOR3:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16)));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4 + (s-1)*(-15./16));
          if (k == n) break;
          pg[k++] = 3./4 + (s-1)*(-15./8);
          if (k == n) break;
          pg[k++] = -15./8;
          break;
        case TAYLOR4:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
                + (s-1)*(35./128))));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4 + (s-1)*(-15./16 + (s-1)*(35./32)));
          if (k == n) break;
          pg[k++] = 3./4 + (s-1)*(-15./8 + (s-1)*(105./32));
          if (k == n) break;
          pg[k++] = -15./8 + (s-1)*(105./16);
          if (k == n) break;
          pg[k++] = 105./16;
          break;
        case TAYLOR5:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
                + (s-1)*(35./128 + (s-1)*(-63./256)))));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4 + (s-1)*(-15./16 + (s-1)*(35./32
                + (s-1)*(-315./256))));
          if (k == n) break;
          pg[k++] = 3./4 + (s-1)*(-15./8 + (s-1)*(105./32 + (s-1)*(-315./64)));
          if (k == n) break;
          pg[k++] = -15./8 + (s-1)*(105./16 + (s-1)*(-945./64));
          if (k == n) break;
          pg[k++] = 105./16 + (s-1)*(-945./32);
          if (k == n) break;
          pg[k++] = -945./32;
          break;
        case TAYLOR6:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
                + (s-1)*(35./128 + (s-1)*(-63./256 + (s-1)*(231./1024))))));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4 + (s-1)*(-15./16 + (s-1)*(35./32
                + (s-1)*(-315./256 + (s-1)*(693./512)))));
          if (k == n) break;
          pg[k++] = 3./4 + (s-1)*(-15./8 + (s-1)*(105./32 + (s-1)*(-315./64
                + (s-1)*(3465./512))));
          if (k == n) break;
          pg[k++] = -15./8 + (s-1)*(105./16 + (s-1)*(-945./64
                + (s-1)*(3465./128)));
          if (k == n) break;
          pg[k++] = 105./16 + (s-1)*(-945./32 + (s-1)*(10395./128));
          if (k == n) break;
          pg[k++] = -945./32 + (s-1)*(10395./64);
          if (k == n) break;
          pg[k++] = 10395./64;
          break;
        case TAYLOR7:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
                + (s-1)*(35./128 + (s-1)*(-63./256
                + (s-1)*(231./1024 + (s-1)*(-429./2048)))))));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4 + (s-1)*(-15./16 + (s-1)*(35./32
                + (s-1)*(-315./256 + (s-1)*(693./512
                + (s-1)*(-3003./2048))))));
          if (k == n) break;
          pg[k++] = 3./4 + (s-1)*(-15./8 + (s-1)*(105./32 + (s-1)*(-315./64
                + (s-1)*(3465./512 + (s-1)*(-9009./1024)))));
          if (k == n) break;
          pg[k++] = -15./8 + (s-1)*(105./16 + (s-1)*(-945./64 + (s-1)*(3465./128
                + (s-1)*(-45045./1024))));
          if (k == n) break;
          pg[k++] = 105./16 + (s-1)*(-945./32 + (s-1)*(10395./128
                + (s-1)*(-45045./256)));
          if (k == n) break;
          pg[k++] = -945./32 + (s-1)*(10395./64 + (s-1)*(-135135./256));
          if (k == n) break;
          pg[k++] = 10395./64 + (s-1)*(-135135./128);
          if (k == n) break;
          pg[k++] = -135135./128;
          break;
        case TAYLOR8:
          pg[k++] = 1 + (s-1)*(-1./2 + (s-1)*(3./8 + (s-1)*(-5./16
                + (s-1)*(35./128 + (s-1)*(-63./256
                + (s-1)*(231./1024 + (s-1)*(-429./2048
                + (s-1)*(6435./32768))))))));
          if (k == n) break;
          pg[k++] = -1./2 + (s-1)*(3./4 + (s-1)*(-15./16 + (s-1)*(35./32
                + (s-1)*(-315./256 + (s-1)*(693./512
                + (s-1)*(-3003./2048 + (s-1)*(6435./4096)))))));
          if (k == n) break;
          pg[k++] = 3./4 + (s-1)*(-15./8 + (s-1)*(105./32 + (s-1)*(-315./64
                + (s-1)*(3465./512 + (s-1)*(-9009./1024
                + (s-1)*(45045./4096))))));
          if (k == n) break;
          pg[k++] = -15./8 + (s-1)*(105./16 + (s-1)*(-945./64 + (s-1)*(3465./128
                + (s-1)*(-45045./1024 + (s-1)*(135135./2048)))));
          if (k == n) break;
          pg[k++] = 105./16 + (s-1)*(-945./32 + (s-1)*(10395./128
                + (s-1)*(-45045./256 + (s-1)*(675675./2048))));
          if (k == n) break;
          pg[k++] = -945./32 + (s-1)*(10395./64 + (s-1)*(-135135./256
                + (s-1)*(675675./512)));
          if (k == n) break;
          pg[k++] = 10395./64 + (s-1)*(-135135./128 + (s-1)*(2027025./512));
          if (k == n) break;
          pg[k++] = -135135./128 + (s-1)*(2027025./256);
          if (k == n) break;
          pg[k++] = 2027025./256;
          break;
        default:
          NAMD_die("Unknown MSM splitting.");
      }
    } // if (s <= 1)
    else { // (s > 1)
      // compute derivatives of s^(-1/2)
      const BigReal s_1 = 1./s;
      BigReal s_p = sqrt(s_1);  // current power s^p, starting at s^(-1/2)
      BigReal p = -0.5;         // current exponent
      BigReal _c = 1;           // accumulated coefficient (-1/2)(-3/2)...
      pg[k++] = _c * s_p;
      while (k < n) {
        // (d/ds) c*s^p = (c*p) * s^(p-1)
        s_p *= s_1;
        _c *= p;
        p -= 1;
        pg[k++] = _c * s_p;
      }
    } // else (s > 1)
    // higher derivatives are zero
    while (k < n) pg[k++] = 0;
  } // ndsplitting()
1408 
1409 
  // Accumulate into the C1 Hermite grid-cutoff coefficient matrix the
  // contribution of the splitting evaluated at displacement rv, scaled
  // by _c.  The matrix couples the derivative components (D000..D111)
  // of charge and potential; entries are built by the chain rule applied
  // to g(s(r)) with s = |rv|^2 / _a^2, using the s-derivatives
  // p[k] = (d/ds)^k g supplied by ndsplitting().  Higher-order blocks
  // are skipped for lower-order Taylor splittings via the early returns.
  static void gc_c1hermite_elem_accum(C1Matrix& matrix, BigReal _c,
      Vector rv, BigReal _a, int _split) {
    const BigReal a_1 = 1./_a;
    const BigReal a_2 = a_1 * a_1;
    const BigReal s = (rv * rv) * a_2;
    const BigReal dx = -2 * rv.x * a_2; // ds/dx
    const BigReal dy = -2 * rv.y * a_2; // ds/dy
    const BigReal dz = -2 * rv.z * a_2; // ds/dz
    const BigReal dd = 2 * a_2; // d^2s/dx^2 = d^2s/dy^2 = d^2s/dz^2
    BigReal tmp;
    enum { nderiv = C1_VECTOR_SIZE-1 };
    BigReal p[nderiv];
    Float *g = matrix.melem;

    // multiply entire matrix by this coefficient
    _c = _c * a_1;

    // compute derivatives (d/ds)^k of splitting g(s), s=r^2
    ndsplitting(p, s, nderiv, _split);

    // weight 0
    tmp = _c * p[0];
    g[C1INDEX(D000,D000)] += tmp;

    // weight 1: one first derivative; antisymmetric between
    // charge-side and potential-side components
    tmp = _c * p[1] * dx;
    g[C1INDEX(D100,D000)] += tmp;
    g[C1INDEX(D000,D100)] -= tmp;

    tmp = _c * p[1] * dy;
    g[C1INDEX(D010,D000)] += tmp;
    g[C1INDEX(D000,D010)] -= tmp;

    tmp = _c * p[1] * dz;
    g[C1INDEX(D001,D000)] += tmp;
    g[C1INDEX(D000,D001)] -= tmp;

    // C1 splitting returns here
    // NOTE(review): unlike the C2..C5 cases below there is no explicit
    // early return for a C1 splitting; presumably C1 is never selected
    // with Hermite interpolation -- confirm against the splitting setup.

    // weight 2
    tmp = _c * p[2] * dx * dy;
    g[C1INDEX(D110,D000)] += tmp;
    g[C1INDEX(D000,D110)] += tmp;
    g[C1INDEX(D100,D010)] -= tmp;
    g[C1INDEX(D010,D100)] -= tmp;

    tmp = _c * p[2] * dx * dz;
    g[C1INDEX(D101,D000)] += tmp;
    g[C1INDEX(D000,D101)] += tmp;
    g[C1INDEX(D100,D001)] -= tmp;
    g[C1INDEX(D001,D100)] -= tmp;

    tmp = _c * p[2] * dy * dz;
    g[C1INDEX(D011,D000)] += tmp;
    g[C1INDEX(D000,D011)] += tmp;
    g[C1INDEX(D010,D001)] -= tmp;
    g[C1INDEX(D001,D010)] -= tmp;

    // pure second derivatives pick up the d^2s/dx^2 term p[1]*dd
    tmp = _c * (p[2] * dx*dx + p[1] * dd);
    g[C1INDEX(D100,D100)] -= tmp;
    tmp = _c * (p[2] * dy*dy + p[1] * dd);
    g[C1INDEX(D010,D010)] -= tmp;
    tmp = _c * (p[2] * dz*dz + p[1] * dd);
    g[C1INDEX(D001,D001)] -= tmp;

    // C2 splitting returns here
    if (_split == TAYLOR2) return;

    // weight 3
    tmp = _c * p[3] * dx * dy * dz;
    g[C1INDEX(D111,D000)] += tmp;
    g[C1INDEX(D110,D001)] -= tmp;
    g[C1INDEX(D101,D010)] -= tmp;
    g[C1INDEX(D011,D100)] -= tmp;
    g[C1INDEX(D100,D011)] += tmp;
    g[C1INDEX(D010,D101)] += tmp;
    g[C1INDEX(D001,D110)] += tmp;
    g[C1INDEX(D000,D111)] -= tmp;

    tmp = _c * (p[3] * dx*dx * dy + p[2] * dd * dy);
    g[C1INDEX(D110,D100)] -= tmp;
    g[C1INDEX(D100,D110)] += tmp;

    tmp = _c * (p[3] * dx*dx * dz + p[2] * dd * dz);
    g[C1INDEX(D101,D100)] -= tmp;
    g[C1INDEX(D100,D101)] += tmp;

    tmp = _c * (p[3] * dy*dy * dx + p[2] * dd * dx);
    g[C1INDEX(D110,D010)] -= tmp;
    g[C1INDEX(D010,D110)] += tmp;

    tmp = _c * (p[3] * dy*dy * dz + p[2] * dd * dz);
    g[C1INDEX(D011,D010)] -= tmp;
    g[C1INDEX(D010,D011)] += tmp;

    tmp = _c * (p[3] * dz*dz * dx + p[2] * dd * dx);
    g[C1INDEX(D101,D001)] -= tmp;
    g[C1INDEX(D001,D101)] += tmp;

    tmp = _c * (p[3] * dz*dz * dy + p[2] * dd * dy);
    g[C1INDEX(D011,D001)] -= tmp;
    g[C1INDEX(D001,D011)] += tmp;

    // C3 splitting returns here
    if (_split == TAYLOR3) return;

    // weight 4
    tmp = _c * (p[4] * dx*dx * dy * dz + p[3] * dd * dy * dz);
    g[C1INDEX(D111,D100)] -= tmp;
    g[C1INDEX(D100,D111)] -= tmp;
    g[C1INDEX(D110,D101)] += tmp;
    g[C1INDEX(D101,D110)] += tmp;

    tmp = _c * (p[4] * dy*dy * dx * dz + p[3] * dd * dx * dz);
    g[C1INDEX(D111,D010)] -= tmp;
    g[C1INDEX(D010,D111)] -= tmp;
    g[C1INDEX(D110,D011)] += tmp;
    g[C1INDEX(D011,D110)] += tmp;

    tmp = _c * (p[4] * dz*dz * dx * dy + p[3] * dd * dx * dy);
    g[C1INDEX(D111,D001)] -= tmp;
    g[C1INDEX(D001,D111)] -= tmp;
    g[C1INDEX(D101,D011)] += tmp;
    g[C1INDEX(D011,D101)] += tmp;

    // diagonal mixed fourth derivatives
    tmp = _c * (p[4] * dx*dx * dy*dy + p[3] * dx*dx * dd
        + p[3] * dd * dy*dy + p[2] * dd * dd);
    g[C1INDEX(D110,D110)] += tmp;
    tmp = _c * (p[4] * dx*dx * dz*dz + p[3] * dx*dx * dd
        + p[3] * dd * dz*dz + p[2] * dd * dd);
    g[C1INDEX(D101,D101)] += tmp;
    tmp = _c * (p[4] * dy*dy * dz*dz + p[3] * dy*dy * dd
        + p[3] * dd * dz*dz + p[2] * dd * dd);
    g[C1INDEX(D011,D011)] += tmp;

    // C4 splitting returns here
    if (_split == TAYLOR4) return;

    // weight 5
    tmp = _c * (p[5] * dx*dx * dy*dy * dz + p[4] * dx*dx * dd * dz
        + p[4] * dd * dy*dy * dz + p[3] * dd * dd * dz);
    g[C1INDEX(D111,D110)] += tmp;
    g[C1INDEX(D110,D111)] -= tmp;

    tmp = _c * (p[5] * dx*dx * dz*dz * dy + p[4] * dx*dx * dd * dy
        + p[4] * dd * dz*dz * dy + p[3] * dd * dd * dy);
    g[C1INDEX(D111,D101)] += tmp;
    g[C1INDEX(D101,D111)] -= tmp;

    tmp = _c * (p[5] * dy*dy * dz*dz * dx + p[4] * dy*dy * dd * dx
        + p[4] * dd * dz*dz * dx + p[3] * dd * dd * dx);
    g[C1INDEX(D111,D011)] += tmp;
    g[C1INDEX(D011,D111)] -= tmp;

    // C5 splitting returns here
    if (_split == TAYLOR5) return;

    // weight 6
    tmp = _c * (p[6] * dx*dx * dy*dy * dz*dz + p[5] * dx*dx * dy*dy * dd
        + p[5] * dx*dx * dd * dz*dz + p[5] * dd * dy*dy * dz*dz
        + p[4] * dx*dx * dd * dd + p[4] * dd * dy*dy * dd
        + p[4] * dd * dd * dz*dz + p[3] * dd * dd * dd);
    g[C1INDEX(D111,D111)] -= tmp;

    // calculate full matrix for C6 or higher splitting

  } // gc_c1hermite_elem_accum()
1577 
1578 
1579 }; // ComputeMsmMgr
1580 
1581 
1582 // Degree of polynomial basis function Phi.
1583 // For the purpose of finding the stencil width, Hermite interpolation
1584 // sets this value to 1.
1586  3, 5, 5, 7, 7, 9, 9, 1,
1587 };
1588 
1589 // The stencil array lengths below.
const int ComputeMsmMgr::Nstencil[NUM_APPROX] = {
  // ordered as: cubic, quintic C1, quintic C2, septic C1, septic C3,
  // nonic C1, nonic C4, C1 Hermite -- matching the per-approximation
  // stencil tables elsewhere in this file
  5, 7, 7, 9, 9, 11, 11, 3,
};
1593 
1594 // Index offsets from the stencil-centered grid element, to get
1595 // to the correct contributing grid element.
1596 const int
1598  // cubic
1599  {-3, -1, 0, 1, 3},
1600 
1601  // quintic C1
1602  {-5, -3, -1, 0, 1, 3, 5},
1603 
1604  // quintic C2 (same as quintic C1)
1605  {-5, -3, -1, 0, 1, 3, 5},
1606 
1607  // septic C1
1608  {-7, -5, -3, -1, 0, 1, 3, 5, 7},
1609 
1610  // septic C3 (same as septic C1)
1611  {-7, -5, -3, -1, 0, 1, 3, 5, 7},
1612 
1613  // nonic C1
1614  {-9, -7, -5, -3, -1, 0, 1, 3, 5, 7, 9},
1615 
1616  // nonic C4 (same as nonic C1)
1617  {-9, -7, -5, -3, -1, 0, 1, 3, 5, 7, 9},
1618 
1619  // C1 Hermite
1620  {-1, 0, 1},
1621 };
1622 
1623 // The grid transfer stencils for the non-factored restriction and
1624 // prolongation procedures.
1625 const Float
1627  // cubic
1628  {-1.f/16, 9.f/16, 1, 9.f/16, -1.f/16},
1629 
1630  // quintic C1
1631  {3.f/256, -25.f/256, 75.f/128, 1, 75.f/128, -25.f/256, 3.f/256},
1632 
1633  // quintic C2 (same as quintic C1)
1634  {3.f/256, -25.f/256, 75.f/128, 1, 75.f/128, -25.f/256, 3.f/256},
1635 
1636  // septic C1
1637  { -5.f/2048, 49.f/2048, -245.f/2048, 1225.f/2048, 1, 1225.f/2048,
1638  -245.f/2048, 49.f/2048, -5.f/2048 },
1639 
  // septic C3 (same as septic C1)
1641  { -5.f/2048, 49.f/2048, -245.f/2048, 1225.f/2048, 1, 1225.f/2048,
1642  -245.f/2048, 49.f/2048, -5.f/2048 },
1643 
1644  // nonic C1
1645  { 35.f/65536, -405.f/65536, 567.f/16384, -2205.f/16384,
1646  19845.f/32768, 1, 19845.f/32768, -2205.f/16384, 567.f/16384,
1647  -405.f/65536, 35.f/65536 },
1648 
1649  // nonic C4 (same as nonic C1)
1650  { 35.f/65536, -405.f/65536, 567.f/16384, -2205.f/16384,
1651  19845.f/32768, 1, 19845.f/32768, -2205.f/16384, 567.f/16384,
1652  -405.f/65536, 35.f/65536 },
1653 };
1654 
1655 
1656 // Designates PE assignment for static load balancing of
1657 // MsmBlock-related arrays
1658 class MsmBlockMap : public CkArrayMap {
1659  private:
1660  ComputeMsmMgr *mgrLocal;
1661  int *penum;
1662  int level;
1663  public:
1664  MsmBlockMap(int lvl) {
1665  mgrLocal = CProxy_ComputeMsmMgr::ckLocalBranch(
1666  CkpvAccess(BOCclass_group).computeMsmMgr);
1667 #ifdef MSM_NODE_MAPPING
1668  penum = mgrLocal->blockAssign.buffer();
1669 #else
1670  penum = 0;
1671 #endif
1672  level = lvl;
1673  }
1674  MsmBlockMap(CkMigrateMessage *m) { }
1675  int registerArray(CkArrayIndex& numElements, CkArrayID aid) {
1676  return 0;
1677  }
1678  int procNum(int /*arrayHdl*/, const CkArrayIndex &idx) {
1679  int *pn = (int *)idx.data();
1680 #ifdef MSM_NODE_MAPPING
1681  int n = mgrLocal->blockFlatIndex(level, pn[0], pn[1], pn[2]);
1682  return penum[n];
1683 #else
1684  return 0;
1685 #endif
1686  }
1687 };
1688 
1689 
1690 // Designates PE assignment for static load balancing of
1691 // MsmGridCutoff-related arrays
class MsmGridCutoffMap : public CkArrayMap {
  private:
    int *penum;  // flat per-chare PE assignment table (or 0 if unused)
  public:
    // NOTE(review): the constructor header appears to have been lost in
    // extraction here -- the statements below through the first closing
    // brace read as a constructor body; confirm against the original file.
    ComputeMsmMgr *mgrLocal = CProxy_ComputeMsmMgr::ckLocalBranch(
        CkpvAccess(BOCclass_group).computeMsmMgr);
#ifdef MSM_NODE_MAPPING
    penum = mgrLocal->gcutAssign.buffer();
#else
    penum = 0;
#endif
  }
  // Charm++ array-map hook; no per-array bookkeeping is needed.
  int registerArray(CkArrayIndex& numElements, CkArrayID aid) {
    return 0;
  }
  // Map the flattened 1D chare index to its statically assigned PE.
  int procNum(int /*arrayHdl*/, const CkArrayIndex &idx) {
#if 1
    int n = *((int *)idx.data());
#ifdef MSM_NODE_MAPPING
    return penum[n];  // precomputed by the manager's node-aware mapping
#else
    return 0;
#endif
#else
    return 0; // XXX to test load balancing
#endif
  }
};
1721 
1722 
1723 namespace msm {
1724 
1725  //
1726  // PatchData
1727  //
1728  // Performs anterpolation and interpolation algorithms.
1729  //
1730  // Surround each NAMD patch with enough grid points to perform
1731  // anterpolation and interpolation without having to do any
1732  // grid wrapping. This does not give a partitioning of the
1733  // MSM finest level grid --- rather, the edges of adjacent
1734  // PatchData grids will overlap or contain image points along
1735  // the periodic boundaries.
1736  //
1737 
1738  struct PatchData {
1751  //BigReal virial[3][3];
1753  int patchID;
1754  int sequence; // from Compute object for message priority
1755 
1756  AtomCoordArray& coordArray() { return coord; }
1757  ForceArray& forceArray() { return force; }
1758 
1759  PatchData(ComputeMsmMgr *pmgr, int pid);
1760  void init(int natoms);
1761 
1762  void anterpolation();
1763  void sendCharge();
1764  void addPotential(const Grid<Float>& epart);
1765  void interpolation();
1766 
1767  void anterpolationC1Hermite();
1768  void sendChargeC1Hermite();
1769  void addPotentialC1Hermite(const Grid<C1Vector>& epart);
1770  void interpolationC1Hermite();
1771  };
1772 
1773 } // namespace msm
1774 
1775 
1777 //
1778 // MsmGridCutoff
1779 //
1780 // Performs grid cutoff part of the computation.
1781 //
1782 // The grid cutoff part is the most computationally intensive part
1783 // of MSM. The templated MsmGridCutoffKernel class takes Vtype
1784 // for charge and potential data (generalizes to vector for Hermite
1785 // interpolation) and takes Mtype for the pre-computed grid coefficient
1786 // weights (generalizes to matrix for Hermite interpolation).
1787 //
1788 
1789 template <class Vtype, class Mtype>
1791  public:
1792  ComputeMsmMgr *mgrLocal; // for quick access to data
1794  msm::BlockIndex qhblockIndex; // source of charges
1795  msm::BlockSend ehblockSend; // destination for potentials
1796  int eia, eib, eja, ejb, eka, ekb, eni, enj, enk; // for "fold factor"
1797  int isfold; // for "fold factor"
1800  msm::Grid<Vtype> ehfold; // for "fold factor"
1805 
1806  MsmGridCutoffKernel() { init(); }
1807 
1808  void init() {
1809  isfold = 0;
1810  mgrLocal = CProxy_ComputeMsmMgr::ckLocalBranch(
1811  CkpvAccess(BOCclass_group).computeMsmMgr);
1812  map = &(mgrLocal->mapData());
1813  mgrLocal->addVirialContrib();
1814 #ifdef MSM_TIMING
1815  mgrLocal->addTiming();
1816 #endif
1817 #ifdef MSM_PROFILING
1818  mgrLocal->addProfiling();
1819 #endif
1820  }
1821 
1822 #ifdef MSM_MIGRATION
1823  void pup(PUP::er& p) {
1824 #ifdef MSM_TIMING
1825  mgrLocal->subtractTiming();
1826 #endif
1827 #ifdef MSM_PROFILING
1828  mgrLocal->subtractProfiling();
1829 #endif
1830  p | qhblockIndex;
1831  p | ehblockSend;
1832  p | eia, p | eib, p | eja, p | ejb, p | eka, p | ekb;
1833  p | eni, p | enj, p | enk;
1834  p | isfold;
1835  }
1836 #endif // MSM_MIGRATION
1837 
1839  qhblockIndex = bmsg->qhBlockIndex;
1840  ehblockSend = bmsg->ehBlockSend;
1841  delete bmsg;
1842 
1843  // set message priority
1844  priority = mgrLocal->nlevels
1845  + 2*(mgrLocal->nlevels - ehblockSend.nblock_wrap.level) - 1;
1846  // allocate qh buffer
1847  qh.init(map->blockLevel[qhblockIndex.level](qhblockIndex.n).nrange);
1848  // allocate eh buffer
1849  eh.init(ehblockSend.nrange);
1850  // preprocess "fold factor" if active for this level
1851  if (map->foldfactor[qhblockIndex.level].active) {
1852  // allocate ehfold buffer
1853  ehfold = eh;
1854  // set index range of potentials
1855  eia = eh.ia();
1856  eib = eh.ib();
1857  eja = eh.ja();
1858  ejb = eh.jb();
1859  eka = eh.ka();
1860  ekb = eh.kb();
1861  eni = eh.ni();
1862  enj = eh.nj();
1863  enk = eh.nk();
1864  if (map->blockLevel[qhblockIndex.level].nn() == 1) {
1865  if (map->ispx) { eia = qh.ia(); eib = qh.ib(); eni = qh.ni(); }
1866  if (map->ispy) { eja = qh.ja(); ejb = qh.jb(); enj = qh.nj(); }
1867  if (map->ispz) { eka = qh.ka(); ekb = qh.kb(); enk = qh.nk(); }
1868  }
1869  else {
1870  // find destination block index
1871  int level = qhblockIndex.level;
1873  ehblockSend.nrange_wrap.lower(), level);
1874  map->wrapBlockIndex(bn);
1875  if (map->ispx) {
1876  eia = bn.n.i * map->bsx[level];
1877  eib = eia + qh.ni() - 1;
1878  eni = qh.ni();
1879  }
1880  if (map->ispy) {
1881  eja = bn.n.j * map->bsy[level];
1882  ejb = eja + qh.nj() - 1;
1883  enj = qh.nj();
1884  }
1885  if (map->ispz) {
1886  eka = bn.n.k * map->bsz[level];
1887  ekb = eka + qh.nk() - 1;
1888  enk = qh.nk();
1889  }
1890  }
1891  isfold = 1;
1892  } // if fold factor
1893  } // setup()
1894 
1896  const msm::Grid<Mtype> *ptrgc,
1897  const msm::Grid<Mtype> *ptrgvc
1898  ) {
1899  pgc = ptrgc;
1900  pgvc = ptrgvc;
1901  } // setupWeights()
1902 
1903 
// Grid-cutoff work unit: receives one block of charges (qh), convolves it
// with the precomputed weight grid (*pgc) to produce one block of
// potentials (eh), then — if the fold factor is active — wraps the
// unfolded potential block back into the periodic grid.
// Timing/profiling sections are compiled in only under MSM_TIMING /
// MSM_PROFILING; the numeric kernel is skipped under MSM_COMM_ONLY.
 1904  void compute(GridMsg *gmsg) {
 1905 #ifdef MSM_TIMING
 1906  double startTime, stopTime;
 1907  startTime = CkWallTimer();
 1908 #endif
 1909  //
 1910  // receive block of charges
 1911  //
 1912  int pid;
 1913  // qh is resized only the first time, memory allocation persists
 1914  gmsg->get(qh, pid, sequence);
 1915  delete gmsg;
 1916 #ifdef MSM_TIMING
 1917  stopTime = CkWallTimer();
 1918  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
 1919 #endif
 1920 
 1921  //
 1922  // grid cutoff calculation
 1923  // this charge block -> this potential block
 1924  //
 1925 
 1926 #ifdef MSM_TIMING
 1927  startTime = stopTime;
 1928 #endif
 1929  // resets indexing on block
 1930  eh.init(ehblockSend.nrange); // (always have to re-init nrange for eh)
 1931  eh.reset(0);
 1932  // index range of weights
 1933  int gia = pgc->ia();
 1934  int gib = pgc->ib();
 1935  int gja = pgc->ja();
 1936  int gjb = pgc->jb();
 1937  int gka = pgc->ka();
 1938  int gkb = pgc->kb();
 1939  int gni = pgc->ni();
 1940  int gnj = pgc->nj();
 1941  // index range of charge grid
 1942  int qia = qh.ia();
 1943  int qib = qh.ib();
 1944  int qja = qh.ja();
 1945  int qjb = qh.jb();
 1946  int qka = qh.ka();
 1947  int qkb = qh.kb();
 1948  int qni = qh.ni();
 1949  int qnj = qh.nj();
 1950  // index range of potentials
 1951  int ia = eh.ia();
 1952  int ib = eh.ib();
 1953  int ja = eh.ja();
 1954  int jb = eh.jb();
 1955  int ka = eh.ka();
 1956  int kb = eh.kb();
 1957 
 1958  int index = 0;
 1959 
 1960  // access buffers directly
 1961  const Mtype *gcbuffer = pgc->data().buffer();
 1962  //const Mtype *gvcbuffer = pgvc->data().buffer();
 1963  const Vtype *qhbuffer = qh.data().buffer();
 1964  Vtype *ehbuffer = eh.data().buffer();
 1965  //Vtype *gvsumbuffer = mgrLocal->gvsum.data().buffer();
 1966 
 1967 #ifndef MSM_COMM_ONLY
// Triple loop over potential points; for each point the charge-grid index
// range [mia..mib] x [mja..mjb] x [mka..mkb] is clipped to the support of
// the weight stencil centered at (i,j,k).
 1968  // loop over potentials
 1969  for (int k = ka; k <= kb; k++) {
 1970  // clip charges to weights along k
 1971  int mka = ( qka >= gka + k ? qka : gka + k );
 1972  int mkb = ( qkb <= gkb + k ? qkb : gkb + k );
 1973 
 1974  for (int j = ja; j <= jb; j++) {
 1975  // clip charges to weights along j
 1976  int mja = ( qja >= gja + j ? qja : gja + j );
 1977  int mjb = ( qjb <= gjb + j ? qjb : gjb + j );
 1978 
 1979  for (int i = ia; i <= ib; i++) {
 1980  // clip charges to weights along i
 1981  int mia = ( qia >= gia + i ? qia : gia + i );
 1982  int mib = ( qib <= gib + i ? qib : gib + i );
 1983 
 1984  // accumulate sum to this eh point
 1985  Vtype ehsum = 0;
 1986 
// Two older formulations of the inner convolution are kept below under
// "#if 0" for reference; only the last (nn==8 specialization plus the
// variable-length fallback) is compiled.
 1987 #if 0
 1988  // loop over charge grid
 1989  for (int qk = mka; qk <= mkb; qk++) {
 1990  int qkoff = (qk - qka) * qnj;
 1991  int gkoff = ((qk-k) - gka) * gnj;
 1992 
 1993  for (int qj = mja; qj <= mjb; qj++) {
 1994  int qjkoff = (qkoff + qj - qja) * qni;
 1995  int gjkoff = (gkoff + (qj-j) - gja) * gni;
 1996 
 1997 // help the vectorizer make reasonable decisions
 1998 #if defined(__INTEL_COMPILER)
 1999 #pragma vector always
 2000 #endif
 2001  for (int qi = mia; qi <= mib; qi++) {
 2002  int qijkoff = qjkoff + qi - qia;
 2003  int gijkoff = gjkoff + (qi-i) - gia;
 2004 
 2005  ehsum += gcbuffer[gijkoff] * qhbuffer[qijkoff];
 2006  }
 2007  }
 2008  } // end loop over charge grid
 2009 #else
 2010 
 2011 #if 0
 2012  // loop over charge grid
 2013  int nn = mib - mia + 1;
 2014  for (int qk = mka; qk <= mkb; qk++) {
 2015  int qkoff = (qk - qka) * qnj;
 2016  int gkoff = ((qk-k) - gka) * gnj;
 2017 
 2018  for (int qj = mja; qj <= mjb; qj++) {
 2019  int qjkoff = (qkoff + qj - qja) * qni;
 2020  int gjkoff = (gkoff + (qj-j) - gja) * gni;
 2021 
 2022  const Float *qbuf = qhbuffer + (qjkoff - qia + mia);
 2023  const Float *gbuf = gcbuffer + (gjkoff - i - gia + mia);
 2024 #ifdef MSM_PROFILING
 2025  mgrLocal->xLoopCnt[nn]++;
 2026 #endif
 2027 // help the vectorizer make reasonable decisions
 2028 #if defined(__INTEL_COMPILER)
 2029 #pragma vector always
 2030 #endif
 2031  for (int ii = 0; ii < nn; ii++) {
 2032  ehsum += gbuf[ii] * qbuf[ii];
 2033  }
 2034  }
 2035  } // end loop over charge grid
 2036 #else
// Active formulation: hoist the (k,j,i)-dependent parts of the flat-array
// offsets out of the charge loops, and specialize the common case where
// the innermost run is exactly 8 points long.
 2037  // loop over charge grid
 2038  int nn = mib - mia + 1;
 2039  if (nn == 8) { // hard coded inner loop = 8
 2040  int qnji = qnj * qni;
 2041  int qkoff = -qka*qnji - qja*qni - qia + mia;
 2042  int gnji = gnj * gni;
 2043  int gkoff = (-k-gka)*gnji + (-j-gja)*gni - i - gia + mia;
 2044 
 2045  for (int qk = mka; qk <= mkb; qk++) {
 2046  int qjkoff = qkoff + qk*qnji;
 2047  int gjkoff = gkoff + qk*gnji;
 2048 
 2049  for (int qj = mja; qj <= mjb; qj++) {
 2050  const Vtype *qbuf = qhbuffer + (qjkoff + qj*qni);
 2051  const Mtype *gbuf = gcbuffer + (gjkoff + qj*gni);
 2052  //const Mtype *gvcbuf = gvcbuffer + (gjkoff + qj*gni);
 2053  //Vtype *gvsumbuf = gvsumbuffer + (gjkoff + qj*gni);
 2054 #ifdef MSM_PROFILING
 2055  mgrLocal->xLoopCnt[nn]++;
 2056 #endif
 2057 // help the vectorizer make reasonable decisions
 2058 #if defined(__INTEL_COMPILER)
 2059 #pragma vector always
 2060 #endif
 2061  for (int ii = 0; ii < 8; ii++) {
 2062  ehsum += gbuf[ii] * qbuf[ii];
 2063  //gvsumbuf[ii] += qbuf[ii] * qbuf[ii] * gvcbuf[ii];
 2064  }
 2065  }
 2066  } // end loop over charge grid
 2067  }
 2068  else { // variable length inner loop < 8
 2069  int qnji = qnj * qni;
 2070  int qkoff = -qka*qnji - qja*qni - qia + mia;
 2071  int gnji = gnj * gni;
 2072  int gkoff = (-k-gka)*gnji + (-j-gja)*gni - i - gia + mia;
 2073 
 2074  for (int qk = mka; qk <= mkb; qk++) {
 2075  int qjkoff = qkoff + qk*qnji;
 2076  int gjkoff = gkoff + qk*gnji;
 2077 
 2078  for (int qj = mja; qj <= mjb; qj++) {
 2079  const Vtype *qbuf = qhbuffer + (qjkoff + qj*qni);
 2080  const Mtype *gbuf = gcbuffer + (gjkoff + qj*gni);
 2081  //const Mtype *gvcbuf = gvcbuffer + (gjkoff + qj*gni);
 2082  //Vtype *gvsumbuf = gvsumbuffer + (gjkoff + qj*gni);
 2083 #ifdef MSM_PROFILING
 2084  mgrLocal->xLoopCnt[nn]++;
 2085 #endif
 2086 // help the vectorizer make reasonable decisions
 2087 #if defined(__INTEL_COMPILER)
 2088 #pragma vector always
 2089 #endif
 2090  for (int ii = 0; ii < nn; ii++) {
 2091  ehsum += gbuf[ii] * qbuf[ii];
 2092  //gvsumbuf[ii] += qbuf[ii] * qbuf[ii] * gvcbuf[ii];
 2093  }
 2094  }
 2095  } // end loop over charge grid
 2096  }
 2097 #endif // 0
 2098 
 2099 #endif // 0
 2100 
 2101  ehbuffer[index] = ehsum;
 2102  index++;
 2103  }
 2104  }
 2105  } // end loop over potentials
 2106 #endif // !MSM_COMM_ONLY
 2107 
 2108 #ifdef MSM_PROFILING
 2109  mgrLocal->doneProfiling();
 2110 #endif
 2111 
 2112  //
 2113  // send block of potentials
 2114  //
 2115 
 2116 #ifdef MSM_FOLD_FACTOR
 2117  // if "fold factor" is active for this level,
 2118  // need to sum unfolded potential grid back into periodic grid
 2119  if (isfold) {
 2120  // copy unfolded grid
 2121  ehfold = eh;
 2122  // reset eh indexing to correctly folded size
 2123  eh.set(eia, eni, eja, enj, eka, enk);
 2124  eh.reset(0);
 2125 #ifdef DEBUG_MSM_GRID
 2126  printf("level=%d ehfold: [%d..%d] x [%d..%d] x [%d..%d] "
 2127  "(%d x %d x %d)\n"
 2128  " eh: [%d..%d] x [%d..%d] x [%d..%d] "
 2129  "(%d x %d x %d)\n"
 2130  " eh lower: %d %d %d\n",
 2131  qhblockIndex.level,
 2132  ehfold.ia(), ehfold.ib(),
 2133  ehfold.ja(), ehfold.jb(),
 2134  ehfold.ka(), ehfold.kb(),
 2135  ehfold.ni(), ehfold.nj(), ehfold.nk(),
 2136  eh.ia(), eh.ib(),
 2137  eh.ja(), eh.jb(),
 2138  eh.ka(), eh.kb(),
 2139  eh.ni(), eh.nj(), eh.nk(),
 2140  ehblockSend.nrange_wrap.lower().i,
 2141  ehblockSend.nrange_wrap.lower().j,
 2142  ehblockSend.nrange_wrap.lower().k
 2143  );
 2144 #endif
 2145  const Vtype *ehfoldbuf = ehfold.data().buffer();
 2146  Vtype *ehbuf = eh.data().buffer();
 2147  // now we "fold" eh by calculating the
 2148  // wrap around sum of ehfold into correctly sized eh
 2149  int index = 0;
// Each unfolded index (i,j,k) is shifted by multiples of the periodic
// extent (eni,enj,enk) until it lands in [eia..eib] x [eja..ejb] x
// [eka..ekb], then its contribution is accumulated there.
 2150  for (int k = ka; k <= kb; k++) {
 2151  int kk = k;
 2152  if (kk < eka) do { kk += enk; } while (kk < eka);
 2153  else if (kk > ekb) do { kk -= enk; } while (kk > ekb);
 2154  int koff = (kk - eka) * enj;
 2155  for (int j = ja; j <= jb; j++) {
 2156  int jj = j;
 2157  if (jj < eja) do { jj += enj; } while (jj < eja);
 2158  else if (jj > ejb) do { jj -= enj; } while (jj > ejb);
 2159  int jkoff = (koff + (jj - eja)) * eni;
 2160  for (int i = ia; i <= ib; i++, index++) {
 2161  int ii = i;
 2162  if (ii < eia) do { ii += eni; } while (ii < eia);
 2163  else if (ii > eib) do { ii -= eni; } while (ii > eib);
 2164  int ijkoff = jkoff + (ii - eia);
 2165  ehbuf[ijkoff] += ehfoldbuf[index];
 2166  }
 2167  }
 2168  }
 2169  }
 2170  else {
 2171  // shift grid index range to its true (wrapped) values
 2172  eh.updateLower( ehblockSend.nrange_wrap.lower() );
 2173  }
 2174 #else // !MSM_FOLD_FACTOR
 2175  // shift grid index range to its true (wrapped) values
 2176  eh.updateLower( ehblockSend.nrange_wrap.lower() );
 2177 #endif // MSM_FOLD_FACTOR
 2178 
 2179 #ifdef MSM_TIMING
 2180  stopTime = CkWallTimer();
 2181  mgrLocal->msmTiming[MsmTimer::GRIDCUTOFF] += stopTime - startTime;
 2182 #endif
 2183  } // compute()
2184 
2185 };
2186 
2187 
2188 //
2189 // MsmGridCutoff wraps kernel template for approximations
2190 // that involve only function values (e.g., CUBIC, QUINTIC).
2191 // Elements of 1D chare array.
2192 //
// 1D chare array element wrapping MsmGridCutoffKernel<Float,Float> for
// approximations using function values only (e.g., CUBIC, QUINTIC).
// NOTE(review): the doc extraction dropped several original lines of this
// class (the "class MsmGridCutoff :" header at 2193, the ehfull member
// declaration at 2201, the default constructor around 2204, and the
// setupWeights()/pup member calls at 2217, 2229, 2234, 2239, 2241, 2278) —
// verify against the original ComputeMsm.C before relying on this listing.
 2194  public CBase_MsmGridCutoff,
 2195  public MsmGridCutoffKernel<Float,Float>
 2196 {
 2197  public:
 2198  CProxyElement_MsmBlock msmBlockElementProxy; // root of reduction
 2199  CkSectionInfo cookie; // need to save cookie for section reduction
 2200 #ifdef MSM_REDUCE_GRID
 2202 #endif // MSM_REDUCE_GRID
 2203 
 2205 
// Migration constructor: empty unless MSM_MIGRATION is enabled, in which
// case it re-runs init() and re-binds the weight grids from the map.
 2206  MsmGridCutoff(CkMigrateMessage *m)
 2207 #if ! defined(MSM_MIGRATION)
 2208  { }
 2209 #else // MSM_MIGRATION
 2210  : CBase_MsmGridCutoff(m) {
 2211 #ifdef DEBUG_MSM_MIGRATE
 2212  printf("MsmGridCutoff element %d migrated to processor %d\n",
 2213  thisIndex, CkMyPe());
 2214 #endif
 2215  init();
 2216  // access type dependent constants from map
 2218  &(map->gc[ehblockSend.nblock_wrap.level]),
 2219  &(map->gvc[ehblockSend.nblock_wrap.level])
 2220  );
 2221  }
 2222 
// Serialize/deserialize for migration; packs the superclass state.
 2223  virtual void pup(PUP::er& p) {
 2224 #ifdef DEBUG_MSM_MIGRATE
 2225  printf("MsmGridCutoff element %d pupped on processor %d\n",
 2226  thisIndex, CkMyPe());
 2227 #endif
 2228  CBase_MsmGridCutoff::pup(p); // pack our superclass
 2230  }
 2231 #endif // MSM_MIGRATION
 2232 
 2233  void init() {
 2235  }
 2236 
// One-time setup from the init message: binds weight grids for this
// element's level and (under MSM_REDUCE_GRID) sizes the full potential
// buffer used by the section reduction.
 2237  void setup(MsmGridCutoffInitMsg *bmsg) {
 2238  // base class consumes this init proxy message
 2240  // access type dependent constants from map
 2242  &(map->gc[ehblockSend.nblock_wrap.level]),
 2243  &(map->gvc[ehblockSend.nblock_wrap.level])
 2244  );
 2245 #ifdef MSM_REDUCE_GRID
 2246  // allocate full buffer space needed for section reduction
 2247  int level = ehblockSend.nblock_wrap.level;
 2248  int i = ehblockSend.nblock_wrap.n.i;
 2249  int j = ehblockSend.nblock_wrap.n.j;
 2250  int k = ehblockSend.nblock_wrap.n.k;
 2251  ehfull.init( map->blockLevel[level](i,j,k).nrange );
 2252 #endif // MSM_REDUCE_GRID
 2253 #ifdef DEBUG_MSM_GRID
 2254  printf("MsmGridCutoff[%d]: setup()"
 2255  " send to level=%d block=(%d,%d,%d)\n",
 2256  thisIndex, ehblockSend.nblock_wrap.level,
 2257  ehblockSend.nblock_wrap.n.i,
 2258  ehblockSend.nblock_wrap.n.j,
 2259  ehblockSend.nblock_wrap.n.k);
 2260 #endif
 2261  }
 2262 
// Stores the multicast-section cookie and the MsmBlock element proxy that
// serves as the root of the section reduction.
 2263  void setupSections(MsmGridCutoffSetupMsg *msg) {
 2264 #ifdef DEBUG_MSM_GRID
 2265  CkPrintf("MSM GRID CUTOFF %d setup section on PE %d\n",
 2266  thisIndex, CkMyPe());
 2267 #endif
 2268  CkGetSectionInfo(cookie, msg); // init the cookie
 2269  msg->get(&msmBlockElementProxy); // get proxy to MsmBlock
 2270  delete msg;
 2271  }
 2272 
// Entry method: runs the kernel on the incoming charge block, then
// delivers the resulting potential block either via section reduction
// (MSM_REDUCE_GRID) or as a point-to-point message to the owning MsmBlock.
 2273  void compute(GridMsg *gmsg) {
 2274 #ifdef DEBUG_MSM_GRID
 2275  printf("MsmGridCutoff %d: compute()\n", thisIndex);
 2276 #endif
 2277  // base class consumes this grid message
 2279 
 2280 #ifdef MSM_TIMING
 2281  double startTime, stopTime;
 2282  startTime = CkWallTimer();
 2283 #endif
 2284 #ifdef MSM_REDUCE_GRID
 2285 
 2286  // perform section reduction over potential grids
 2287  CProxy_CkMulticastMgr mcastProxy =
 2288  CkpvAccess(BOCclass_group).multicastMgr;
 2289  CkMulticastMgr *mcastPtr =
 2290  CProxy_CkMulticastMgr(mcastProxy).ckLocalBranch();
 2291  CkCallback cb(CkIndex_MsmBlock::sumReducedPotential(NULL),
 2292  msmBlockElementProxy);
 2293  // sum into "full" sized buffer needed for contribute
 2294  ehfull.reset(0);
 2295  ehfull += eh;
 2296  mcastPtr->contribute(
 2297  ehfull.nn() * sizeof(Float), ehfull.data().buffer(),
 2298  CkReduction::sum_float, cookie, cb);
 2299 
 2300 #else
 2301  // place eh into message
 2302  const msm::BlockIndex& bindex = ehblockSend.nblock_wrap;
 2303  int msgsz = eh.data().len() * sizeof(Float);
 2304  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
 2305  SET_PRIORITY(gm, sequence, priority);
 2306  gm->put(eh, bindex.level, sequence);
 2307  // lookup in ComputeMsmMgr proxy array by level
 2308  mgrLocal->msmBlock[bindex.level](
 2309  bindex.n.i, bindex.n.j, bindex.n.k).addPotential(gm);
 2310 
 2311 #endif // MSM_REDUCE_GRID
 2312 
 2313 #ifdef MSM_TIMING
 2314  stopTime = CkWallTimer();
 2315  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
 2316  mgrLocal->doneTiming();
 2317 #endif
 2318  } // compute()
 2319 
 2320 }; // MsmGridCutoff
2321 
2322 
2323 //
2324 // MsmC1HermiteGridCutoff wraps kernel template for
2325 // C1 Hermite approximation. Elements of 1D chare array.
2326 //
// 1D chare array element wrapping MsmGridCutoffKernel<C1Vector,C1Matrix>
// for the C1 Hermite approximation; mirrors MsmGridCutoff but with
// vector/matrix grid element types and no gvc weights (NULL passed).
// NOTE(review): as with MsmGridCutoff above, the doc extraction dropped
// several original lines (class header at 2327, member declarations and
// setupWeights()/pup calls at 2335, 2338, 2351, 2363, 2368, 2373, 2375,
// 2413) — verify against the original ComputeMsm.C.
 2328  public CBase_MsmC1HermiteGridCutoff,
 2329  public MsmGridCutoffKernel<C1Vector,C1Matrix>
 2330 {
 2331  public:
 2332  CProxyElement_MsmC1HermiteBlock msmBlockElementProxy; // root of reduction
 2333  CkSectionInfo cookie; // need to save cookie for section reduction
 2334 #ifdef MSM_REDUCE_GRID
 2336 #endif // MSM_REDUCE_GRID
 2337 
 2339 
// Migration constructor; re-initializes and re-binds Hermite weights
// under MSM_MIGRATION, empty otherwise.
 2340  MsmC1HermiteGridCutoff(CkMigrateMessage *m)
 2341 #if ! defined(MSM_MIGRATION)
 2342  { }
 2343 #else // MSM_MIGRATION
 2344  : CBase_MsmC1HermiteGridCutoff(m) {
 2345 #ifdef DEBUG_MSM_MIGRATE
 2346  printf("MsmC1HermiteGridCutoff element %d migrated to processor %d\n",
 2347  thisIndex, CkMyPe());
 2348 #endif
 2349  init();
 2350  // access type dependent constants from map
 2352  &(map->gc_c1hermite[ehblockSend.nblock_wrap.level]),
 2353  NULL
 2354  );
 2355  }
 2356 
// Serialize/deserialize for migration; packs the superclass state.
 2357  virtual void pup(PUP::er& p) {
 2358 #ifdef DEBUG_MSM_MIGRATE
 2359  printf("MsmC1HermiteGridCutoff element %d pupped on processor %d\n",
 2360  thisIndex, CkMyPe());
 2361 #endif
 2362  CBase_MsmC1HermiteGridCutoff::pup(p); // pack our superclass
 2364  }
 2365 #endif // MSM_MIGRATION
 2366 
 2367  void init() {
 2369  }
 2370 
// One-time setup: binds the C1 Hermite weight grid for this level and
// (under MSM_REDUCE_GRID) sizes the full reduction buffer.
 2371  void setup(MsmGridCutoffInitMsg *bmsg) {
 2372  // base class consumes this init proxy message
 2374  // access type dependent constants from map
 2376  &(map->gc_c1hermite[ehblockSend.nblock_wrap.level]),
 2377  NULL
 2378  );
 2379 #ifdef DEBUG_MSM_GRID
 2380  printf("MsmC1HermiteGridCutoff[%d]: setup()"
 2381  " send to level=%d block=(%d,%d,%d)\n",
 2382  thisIndex, ehblockSend.nblock_wrap.level,
 2383  ehblockSend.nblock_wrap.n.i,
 2384  ehblockSend.nblock_wrap.n.j,
 2385  ehblockSend.nblock_wrap.n.k);
 2386 #endif
 2387 #ifdef MSM_REDUCE_GRID
 2388  // allocate full buffer space needed for section reduction
 2389  int level = ehblockSend.nblock_wrap.level;
 2390  int i = ehblockSend.nblock_wrap.n.i;
 2391  int j = ehblockSend.nblock_wrap.n.j;
 2392  int k = ehblockSend.nblock_wrap.n.k;
 2393  ehfull.init( map->blockLevel[level](i,j,k).nrange );
 2394 #endif // MSM_REDUCE_GRID
 2395  }
 2396 
// Stores section cookie and the MsmC1HermiteBlock reduction-root proxy.
 2397  void setupSections(MsmC1HermiteGridCutoffSetupMsg *msg) {
 2398 #ifdef DEBUG_MSM_GRID
 2399  CkPrintf("MSM C1 HERMITE GRID CUTOFF %d setup section on PE %d\n",
 2400  thisIndex, CkMyPe());
 2401 #endif
 2402  CkGetSectionInfo(cookie, msg); // init the cookie
 2403  msg->get(&msmBlockElementProxy); // get proxy to MsmC1HermiteBlock
 2404  delete msg;
 2405  }
 2406 
// Entry method: delegates the numeric work to compute_specialized(),
// then delivers potentials via section reduction or direct message.
 2407  void compute(GridMsg *gmsg) {
 2408 #ifdef DEBUG_MSM_GRID
 2409  printf("MsmC1HermiteGridCutoff %d: compute()\n", thisIndex);
 2410 #endif
 2411 #if 0
 2412  // base class consumes this grid message
 2414 #else
 2415  compute_specialized(gmsg);
 2416 #endif
 2417 
 2418 #ifdef MSM_TIMING
 2419  double startTime, stopTime;
 2420  startTime = CkWallTimer();
 2421 #endif
 2422 #ifdef MSM_REDUCE_GRID
 2423 
 2424  // perform section reduction over potential grids
 2425  CProxy_CkMulticastMgr mcastProxy =
 2426  CkpvAccess(BOCclass_group).multicastMgr;
 2427  CkMulticastMgr *mcastPtr =
 2428  CProxy_CkMulticastMgr(mcastProxy).ckLocalBranch();
 2429  CkCallback cb(CkIndex_MsmC1HermiteBlock::sumReducedPotential(NULL),
 2430  msmBlockElementProxy);
 2431  // sum into "full" sized buffer needed for contribute
 2432  ehfull.reset(0);
 2433  ehfull += eh;
 2434  mcastPtr->contribute(
 2435  ehfull.nn() * sizeof(C1Vector), ehfull.data().buffer(),
 2436  CkReduction::sum_float, cookie, cb);
 2437 
 2438 #else
 2439  // place eh into message
 2440  const msm::BlockIndex& bindex = ehblockSend.nblock_wrap;
 2441  int msgsz = eh.data().len() * sizeof(C1Vector);
 2442  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
 2443  SET_PRIORITY(gm, sequence, priority);
 2444  gm->put(eh, bindex.level, sequence);
 2445  // lookup in ComputeMsmMgr proxy array by level
 2446  mgrLocal->msmC1HermiteBlock[bindex.level](
 2447  bindex.n.i, bindex.n.j, bindex.n.k).addPotential(gm);
 2448 
 2449 #endif // MSM_REDUCE_GRID
 2450 
 2451 #ifdef MSM_TIMING
 2452  stopTime = CkWallTimer();
 2453  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
 2454  mgrLocal->doneTiming();
 2455 #endif
 2456  } // compute()
 2457 
 2458  // try to improve performance of the major computational part
 2459  void compute_specialized(GridMsg *gmsg);
 2460 
 2461 }; // MsmC1HermiteGridCutoff
2462 
// Hand-specialized C1 Hermite grid-cutoff kernel: same structure as
// MsmGridCutoffKernel::compute(), but with an expanded C1Matrix * C1Vector
// multiply and a zero-matrix skip test in the innermost loop.
 2463 void MsmC1HermiteGridCutoff::compute_specialized(GridMsg *gmsg) {
 2464 #ifdef MSM_TIMING
 2465  double startTime, stopTime;
 2466  startTime = CkWallTimer();
 2467 #endif
 2468  //
 2469  // receive block of charges
 2470  //
 2471  int pid;
 2472  // qh is resized only the first time, memory allocation persists
 2473  gmsg->get(qh, pid, sequence);
 2474  delete gmsg;
 2475 #ifdef MSM_TIMING
 2476  stopTime = CkWallTimer();
 2477  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
 2478 #endif
 2479 
 2480  //
 2481  // grid cutoff calculation
 2482  // this charge block -> this potential block
 2483  //
 2484 
 2485 #ifdef MSM_TIMING
 2486  startTime = stopTime;
 2487 #endif
 2488  // resets indexing on block
 2489  eh.init(ehblockSend.nrange); // (always have to re-init nrange for eh)
 2490  eh.reset(0);
 2491  // index range of weights
 2492  int gia = pgc->ia();
 2493  int gib = pgc->ib();
 2494  int gja = pgc->ja();
 2495  int gjb = pgc->jb();
 2496  int gka = pgc->ka();
 2497  int gkb = pgc->kb();
 2498  int gni = pgc->ni();
 2499  int gnj = pgc->nj();
 2500  // index range of charge grid
 2501  int qia = qh.ia();
 2502  int qib = qh.ib();
 2503  int qja = qh.ja();
 2504  int qjb = qh.jb();
 2505  int qka = qh.ka();
 2506  int qkb = qh.kb();
 2507  int qni = qh.ni();
 2508  int qnj = qh.nj();
 2509  // index range of potentials
 2510  int ia = eh.ia();
 2511  int ib = eh.ib();
 2512  int ja = eh.ja();
 2513  int jb = eh.jb();
 2514  int ka = eh.ka();
 2515  int kb = eh.kb();
 2516 
 2517  int index = 0;
 2518 
 2519  // access buffers directly
 2520  const C1Matrix *gcbuffer = pgc->data().buffer();
 2521  const C1Vector *qhbuffer = qh.data().buffer();
 2522  C1Vector *ehbuffer = eh.data().buffer();
 2523 #ifdef DEBUG_MEMORY_ALIGNMENT
 2524  printf("gcbuffer mem: addr=%p div32=%lu mod32=%lu\n",
 2525  gcbuffer,
 2526  (unsigned long)(gcbuffer)/32,
 2527  (unsigned long)(gcbuffer)%32);
 2528  printf("qhbuffer mem: addr=%p div32=%lu mod32=%lu\n",
 2529  qhbuffer,
 2530  (unsigned long)(qhbuffer)/32,
 2531  (unsigned long)(qhbuffer)%32);
 2532  printf("ehbuffer mem: addr=%p div32=%lu mod32=%lu\n",
 2533  ehbuffer,
 2534  (unsigned long)(ehbuffer)/32,
 2535  (unsigned long)(ehbuffer)%32);
 2536 #endif
 2537 
 2538 #ifndef MSM_COMM_ONLY
// Loop over potential points; charge index ranges are clipped to the
// weight stencil support, same scheme as the generic kernel.
 2539  // loop over potentials
 2540  for (int k = ka; k <= kb; k++) {
 2541  // clip charges to weights along k
 2542  int mka = ( qka >= gka + k ? qka : gka + k );
 2543  int mkb = ( qkb <= gkb + k ? qkb : gkb + k );
 2544 
 2545  for (int j = ja; j <= jb; j++) {
 2546  // clip charges to weights along j
 2547  int mja = ( qja >= gja + j ? qja : gja + j );
 2548  int mjb = ( qjb <= gjb + j ? qjb : gjb + j );
 2549 
 2550  for (int i = ia; i <= ib; i++) {
 2551  // clip charges to weights along i
 2552  int mia = ( qia >= gia + i ? qia : gia + i );
 2553  int mib = ( qib <= gib + i ? qib : gib + i );
 2554 
 2555  // accumulate sum to this eh point
 2556  C1Vector ehsum = 0;
 2557 
 2558  // loop over charge grid
 2559  int nn = mib - mia + 1;
 2560 
 2561  {
 2562  int qnji = qnj * qni;
 2563  int qkoff = -qka*qnji - qja*qni - qia + mia;
 2564  int gnji = gnj * gni;
 2565  int gkoff = (-k-gka)*gnji + (-j-gja)*gni - i - gia + mia;
 2566 
 2567  for (int qk = mka; qk <= mkb; qk++) {
 2568  int qjkoff = qkoff + qk*qnji;
 2569  int gjkoff = gkoff + qk*gnji;
 2570 
 2571  for (int qj = mja; qj <= mjb; qj++) {
 2572  const C1Vector *qbuf = qhbuffer + (qjkoff + qj*qni);
 2573  const C1Matrix *gbuf = gcbuffer + (gjkoff + qj*gni);
 2574 #ifdef MSM_PROFILING
 2575  mgrLocal->xLoopCnt[nn]++;
 2576 #endif
 2577 // help the vectorizer make reasonable decisions
 2578 #if defined(__INTEL_COMPILER)
 2579 #pragma vector always
 2580 #endif
 2581  for (int ii = 0; ii < nn; ii++) {
 2582 
 2583 #if 0
 2584  ehsum += gbuf[ii] * qbuf[ii];
 2585 #else
 2586  // skip matvec when matrix is 0
 2587  // first matrix element tells us if this is the case
// NOTE(review): the zero test reinterprets the first float of C1Matrix as
// an int (a bitwise test for 0.0f) — this is type punning; presumably safe
// for the IEEE-754 representation used here, but verify on new platforms.
 2588  if ( *((int *)(gbuf)) != 0) {
 2589 
 2590  // expand matrix-vector multiply
 2591 #if defined(__INTEL_COMPILER)
 2592 #pragma vector always
 2593 #endif
 2594  for (int km=0, jm=0; jm < C1_VECTOR_SIZE; jm++) {
 2595  for (int im=0; im < C1_VECTOR_SIZE; im++, km++) {
 2596  ehsum.velem[jm] += gbuf->melem[km] * qbuf->velem[im];
 2597  }
 2598  }
 2599  } // if
 2600  gbuf++;
 2601  qbuf++;
 2602 #endif
 2603  }
 2604  }
 2605  } // end loop over charge grid
 2606 
 2607  }
 2608 
 2609  ehbuffer[index] = ehsum;
 2610  index++;
 2611  }
 2612  }
 2613  } // end loop over potentials
 2614 #endif // !MSM_COMM_ONLY
 2615 
 2616 #ifdef MSM_PROFILING
 2617  mgrLocal->doneProfiling();
 2618 #endif
 2619 
 2620  //
 2621  // send block of potentials
 2622  //
 2623 
 2624 #ifdef MSM_FOLD_FACTOR
 2625  // if "fold factor" is active for this level,
 2626  // need to sum unfolded potential grid back into periodic grid
 2627  if (isfold) {
 2628  // copy unfolded grid
 2629  ehfold = eh;
 2630  // reset eh indexing to correctly folded size
 2631  eh.set(eia, eni, eja, enj, eka, enk);
 2632  eh.reset(0);
 2633 #ifdef DEBUG_MSM_GRID
 2634  printf("level=%d ehfold: [%d..%d] x [%d..%d] x [%d..%d] "
 2635  "(%d x %d x %d)\n"
 2636  " eh: [%d..%d] x [%d..%d] x [%d..%d] "
 2637  "(%d x %d x %d)\n"
 2638  " eh lower: %d %d %d\n",
 2639  qhblockIndex.level,
 2640  ehfold.ia(), ehfold.ib(),
 2641  ehfold.ja(), ehfold.jb(),
 2642  ehfold.ka(), ehfold.kb(),
 2643  ehfold.ni(), ehfold.nj(), ehfold.nk(),
 2644  eh.ia(), eh.ib(),
 2645  eh.ja(), eh.jb(),
 2646  eh.ka(), eh.kb(),
 2647  eh.ni(), eh.nj(), eh.nk(),
 2648  ehblockSend.nrange_wrap.lower().i,
 2649  ehblockSend.nrange_wrap.lower().j,
 2650  ehblockSend.nrange_wrap.lower().k
 2651  );
 2652 #endif
 2653  const C1Vector *ehfoldbuf = ehfold.data().buffer();
 2654  C1Vector *ehbuf = eh.data().buffer();
 2655  // now we "fold" eh by calculating the
 2656  // wrap around sum of ehfold into correctly sized eh
 2657  int index = 0;
// Wrap each unfolded index into the periodic range and accumulate.
 2658  for (int k = ka; k <= kb; k++) {
 2659  int kk = k;
 2660  if (kk < eka) do { kk += enk; } while (kk < eka);
 2661  else if (kk > ekb) do { kk -= enk; } while (kk > ekb);
 2662  int koff = (kk - eka) * enj;
 2663  for (int j = ja; j <= jb; j++) {
 2664  int jj = j;
 2665  if (jj < eja) do { jj += enj; } while (jj < eja);
 2666  else if (jj > ejb) do { jj -= enj; } while (jj > ejb);
 2667  int jkoff = (koff + (jj - eja)) * eni;
 2668  for (int i = ia; i <= ib; i++, index++) {
 2669  int ii = i;
 2670  if (ii < eia) do { ii += eni; } while (ii < eia);
 2671  else if (ii > eib) do { ii -= eni; } while (ii > eib);
 2672  int ijkoff = jkoff + (ii - eia);
 2673  ehbuf[ijkoff] += ehfoldbuf[index];
 2674  }
 2675  }
 2676  }
 2677  }
 2678  else {
 2679  // shift grid index range to its true (wrapped) values
 2680  eh.updateLower( ehblockSend.nrange_wrap.lower() );
 2681  }
 2682 #else // !MSM_FOLD_FACTOR
 2683  // shift grid index range to its true (wrapped) values
 2684  eh.updateLower( ehblockSend.nrange_wrap.lower() );
 2685 #endif // MSM_FOLD_FACTOR
 2686 
 2687 #ifdef MSM_TIMING
 2688  stopTime = CkWallTimer();
 2689  mgrLocal->msmTiming[MsmTimer::GRIDCUTOFF] += stopTime - startTime;
 2690 #endif
 2691 } // MsmC1HermiteGridCutoff::compute_specialized()
2692 
2693 // MsmGridCutoff
2694 //
2696 
2697 
2699 //
2700 // MsmBlock
2701 //
2702 // Performs restriction and prolongation.
2703 //
2704 // Each level of the MSM grid hierarchy is partitioned into MsmBlocks,
2705 // holding both charge and potential grid blocks.
2706 //
2707 // The MsmBlockKernel provides templated routines for the MSM
2708 // restriction and prolongation algorithms. Overall is very small
2709 // part of computational work (less than 2% total for C1 Hermite,
2710 // less than 4% total for cubic).
2711 // XXX Could be made faster with factored restriction and prolongation
2712 // algorithms --- especially important for higher order or for
2713 // generalizing to coarser grid spacing that is not 2h.
2714 // XXX Haven't yet determined factorization for C1 Hermite.
2715 //
2716 // The classes that inherit from MsmBlockKernel provide
2717 // 3D chare array elements for each level with significant management:
2718 // - receive and sum charges from below
2719 // (either PatchData or lower level MsmBlock)
2720 // - calculate restriction to 2h grid
2721 // - send up (if not on highest level)
2722 // - section broadcast to MsmGridCutoff
2723 // - receive and sum potentials from above and from
2724 // section reduction of MsmGridCutoff
2725 // - calculate prolongation to (1/2)h grid and send down,
2726 // OR send to PatchData
2727 //
2728 // XXX Grid cutoff calculation below is now replaced with
2729 // MsmGridCutoff to provide enough parallel work units.
2730 //
2731 
// Templated base holding per-block state and the restriction/prolongation
// (and optionally grid-cutoff) kernels shared by MsmBlock and
// MsmC1HermiteBlock.
// NOTE(review): the doc extraction dropped the class header (original
// 2733) and several member declarations (originals 2737-2740, 2745-2751,
// 2753, 2757, 2774 — e.g. map, bd, blockIndex, grids qh/eh, stencil
// pointers, counters, and the setupStencils overload header) — verify
// against the original ComputeMsm.C.
 2732 template <class Vtype, class Mtype>
 2734  public:
 2735  CProxy_ComputeMsmMgr mgrProxy;
 2736  ComputeMsmMgr *mgrLocal; // for quick access to data
 2741 #ifndef MSM_GRID_CUTOFF_DECOMP
 2742  const msm::Grid<Mtype> *gcWeights;
 2743  msm::Grid<Vtype> ehCutoff;
 2744 #endif
 2752 
 2754 
 2755  int sequence; // from incoming message for message priority
 2756 
 2758  MsmBlockKernel(CkMigrateMessage *m) { }
 2759 
 2760  void init();
 2761 
// Bind stencil (and, without decomposition, grid-cutoff weight) pointers.
 2762 #ifndef MSM_GRID_CUTOFF_DECOMP
 2763  void setupStencils(
 2764  const msm::Grid<Mtype> *res,
 2765  const msm::Grid<Mtype> *pro,
 2766  const msm::Grid<Mtype> *gc
 2767  )
 2768  {
 2769  resStencil = res;
 2770  proStencil = pro;
 2771  gcWeights = gc;
 2772  }
 2773 #else
 2775  const msm::Grid<Mtype> *res,
 2776  const msm::Grid<Mtype> *pro
 2777  )
 2778  {
 2779  resStencil = res;
 2780  proStencil = pro;
 2781  }
 2782 #endif
 2783 
 2784  void restrictionKernel();
 2785 #ifndef MSM_GRID_CUTOFF_DECOMP
 2786  void gridCutoffKernel();
 2787 #endif
 2788  void prolongationKernel();
 2789 
 2790 }; // class MsmBlockKernel<Vtype,Mtype>
2791 
// Constructor: caches the manager proxy/local branch and the map, then
// sizes all per-block grids from this block's decomposition entry (bd).
// NOTE(review): the constructor signature line (original 2793) was
// dropped by the doc extraction — verify against the original source.
 2792 template <class Vtype, class Mtype>
 2794  blockIndex = bindex;
 2795  mgrProxy = CProxy_ComputeMsmMgr(CkpvAccess(BOCclass_group).computeMsmMgr);
 2796  mgrLocal = CProxy_ComputeMsmMgr::ckLocalBranch(
 2797  CkpvAccess(BOCclass_group).computeMsmMgr);
 2798  map = &(mgrLocal->mapData());
 2799  bd = &(map->blockLevel[blockIndex.level](blockIndex.n));
 2800  qh.init( bd->nrange );
 2801  eh.init( bd->nrange );
 2802 #ifndef MSM_GRID_CUTOFF_DECOMP
 2803  ehCutoff.init( bd->nrangeCutoff );
 2804 #endif
 2805  qhRestricted.init( bd->nrangeRestricted );
 2806  ehProlongated.init( bd->nrangeProlongated );
 2807 #ifdef DEBUG_MSM_GRID
 2808  printf("MsmBlockKernel level=%d, n=%d %d %d: constructor\n",
 2809  blockIndex.level, blockIndex.n.i, blockIndex.n.j, blockIndex.n.k);
 2810 #endif
 2811 #ifdef MSM_TIMING
 2812  mgrLocal->addTiming();
 2813 #endif
 2814  init();
 2815 } // MsmBlockKernel<Vtype,Mtype>::MsmBlockKernel()
2816 
2817 
// Zeroes all per-block grids and receive counters, preparing the block
// for the next compute cycle.
// NOTE(review): the signature line (original 2819) was dropped by the
// doc extraction — verify against the original source.
 2818 template <class Vtype, class Mtype>
 2820  qh.reset(0);
 2821  eh.reset(0);
 2822 #ifndef MSM_GRID_CUTOFF_DECOMP
 2823  ehCutoff.reset(0);
 2824 #endif
 2825  qhRestricted.reset(0);
 2826  ehProlongated.reset(0);
 2827  cntRecvsCharge = 0;
 2828  cntRecvsPotential = 0;
 2829 } // MsmBlockKernel<Vtype,Mtype>::init()
2830 
2831 
// Restriction: approximates the h-grid charge block qh onto the coarser
// 2h grid qhRestricted using the per-approximation stencil *resStencil.
// Stencil contributions falling outside the h-grid block are skipped.
// NOTE(review): the signature line (original 2833) was dropped by the
// doc extraction — verify against the original source.
 2832 template <class Vtype, class Mtype>
 2834 {
 2835 #ifdef DEBUG_MSM_GRID
 2836  printf("MsmBlockKernel level=%d, id=%d %d %d: restriction\n",
 2837  blockIndex.level, blockIndex.n.i, blockIndex.n.j, blockIndex.n.k);
 2838 #endif
 2839 
 2840 #ifdef MSM_TIMING
 2841  double startTime, stopTime;
 2842  startTime = CkWallTimer();
 2843 #endif
 2844 
 2845 #ifndef MSM_COMM_ONLY
 2846  // stencil data for approximating charge on restricted grid
 2847  const int approx = mgrLocal->approx;
 2848  const int nstencil = ComputeMsmMgr::Nstencil[approx];
 2849  const int *offset = ComputeMsmMgr::IndexOffset[approx];
 2850  const msm::Grid<Mtype>& res = *resStencil;
 2851 
 2852  // index range for h grid charges
 2853  int ia1 = qh.ia();
 2854  int ib1 = qh.ib();
 2855  int ja1 = qh.ja();
 2856  int jb1 = qh.jb();
 2857  int ka1 = qh.ka();
 2858  int kb1 = qh.kb();
 2859 
 2860  // index range for restricted (2h) grid charges
 2861  int ia2 = qhRestricted.ia();
 2862  int ib2 = qhRestricted.ib();
 2863  int ja2 = qhRestricted.ja();
 2864  int jb2 = qhRestricted.jb();
 2865  int ka2 = qhRestricted.ka();
 2866  int kb2 = qhRestricted.kb();
 2867 
 2868  // reset grid
 2869  qhRestricted.reset(0);
 2870 
// Each coarse point (i2,j2,k2) maps to fine point (2*i2,2*j2,2*k2) and
// gathers weighted fine-grid charges from the surrounding stencil.
 2871  // loop over restricted (2h) grid
 2872  for (int k2 = ka2; k2 <= kb2; k2++) {
 2873  int k1 = 2 * k2;
 2874  for (int j2 = ja2; j2 <= jb2; j2++) {
 2875  int j1 = 2 * j2;
 2876  for (int i2 = ia2; i2 <= ib2; i2++) {
 2877  int i1 = 2 * i2;
 2878 
 2879  // loop over stencils on h grid
 2880  Vtype& q2hsum = qhRestricted(i2,j2,k2);
 2881 
 2882  for (int k = 0; k < nstencil; k++) {
 2883  int kn = k1 + offset[k];
 2884  if (kn < ka1) continue;
 2885  else if (kn > kb1) break;
 2886 
 2887  for (int j = 0; j < nstencil; j++) {
 2888  int jn = j1 + offset[j];
 2889  if (jn < ja1) continue;
 2890  else if (jn > jb1) break;
 2891 
 2892  for (int i = 0; i < nstencil; i++) {
 2893  int in = i1 + offset[i];
 2894  if (in < ia1) continue;
 2895  else if (in > ib1) break;
 2896 
 2897  q2hsum += res(i,j,k) * qh(in,jn,kn);
 2898  }
 2899  }
 2900  } // end loop over stencils on h grid
 2901 
 2902  }
 2903  }
 2904  } // end loop over restricted (2h) grid
 2905 #else
 2906  qhRestricted.reset(0);
 2907 #endif // !MSM_COMM_ONLY
 2908 
 2909 #ifdef MSM_TIMING
 2910  stopTime = CkWallTimer();
 2911  mgrLocal->msmTiming[MsmTimer::RESTRICT] += stopTime - startTime;
 2912 #endif
 2913 } // MsmBlockKernel<Vtype,Mtype>::restrictionKernel()
2914 
2915 
// Non-decomposed grid-cutoff kernel: convolves this block's charge grid
// qh with the level's weight grid *gcWeights into ehCutoff. Compiled only
// when MSM_GRID_CUTOFF_DECOMP is off (the decomposed path uses the
// MsmGridCutoff chare array instead).
// NOTE(review): the signature line (original 2918) was dropped by the
// doc extraction — verify against the original source.
 2916 #ifndef MSM_GRID_CUTOFF_DECOMP
 2917 template <class Vtype, class Mtype>
 2919 {
 2920 #ifdef DEBUG_MSM_GRID
 2921  printf("MsmBlockKernel level=%d, id=%d %d %d: grid cutoff\n",
 2922  blockIndex.level, blockIndex.n.i, blockIndex.n.j, blockIndex.n.k);
 2923 #endif
 2924 #ifdef MSM_TIMING
 2925  double startTime, stopTime;
 2926  startTime = CkWallTimer();
 2927 #endif
 2928 #ifndef MSM_COMM_ONLY
 2929  // need grid of weights for this level
 2930  msm::Grid<Mtype>& gc = *gcWeights;
 2931  // index range of weights
 2932  int gia = gc.ia();
 2933  int gib = gc.ib();
 2934  int gja = gc.ja();
 2935  int gjb = gc.jb();
 2936  int gka = gc.ka();
 2937  int gkb = gc.kb();
 2938  // index range of charge grid
 2939  int qia = qh.ia();
 2940  int qib = qh.ib();
 2941  int qja = qh.ja();
 2942  int qjb = qh.jb();
 2943  int qka = qh.ka();
 2944  int qkb = qh.kb();
 2945  // index range of potentials
 2946  int ia = ehCutoff.ia();
 2947  int ib = ehCutoff.ib();
 2948  int ja = ehCutoff.ja();
 2949  int jb = ehCutoff.jb();
 2950  int ka = ehCutoff.ka();
 2951  int kb = ehCutoff.kb();
 2952  // reset grid
 2953  ehCutoff.reset(0);
// Straightforward (non-flattened) version of the convolution: for each
// potential point, sum weight*charge over the clipped charge range.
 2954  // loop over potentials
 2955  for (int k = ka; k <= kb; k++) {
 2956  for (int j = ja; j <= jb; j++) {
 2957  for (int i = ia; i <= ib; i++) {
 2958  // clip charges to weights
 2959  int mia = ( qia >= gia + i ? qia : gia + i );
 2960  int mib = ( qib <= gib + i ? qib : gib + i );
 2961  int mja = ( qja >= gja + j ? qja : gja + j );
 2962  int mjb = ( qjb <= gjb + j ? qjb : gjb + j );
 2963  int mka = ( qka >= gka + k ? qka : gka + k );
 2964  int mkb = ( qkb <= gkb + k ? qkb : gkb + k );
 2965  // accumulate sum to this eh point
 2966  Vtype& ehsum = ehCutoff(i,j,k);
 2967  // loop over smaller charge grid
 2968  for (int qk = mka; qk <= mkb; qk++) {
 2969  for (int qj = mja; qj <= mjb; qj++) {
 2970  for (int qi = mia; qi <= mib; qi++) {
 2971  ehsum += gc(qi-i, qj-j, qk-k) * qh(qi,qj,qk);
 2972  }
 2973  }
 2974  } // end loop over smaller charge grid
 2975 
 2976  }
 2977  }
 2978  } // end loop over potentials
 2979 #else
 2980  ehCutoff.reset(0);
 2981 #endif // !MSM_COMM_ONLY
 2982 #ifdef MSM_TIMING
 2983  stopTime = CkWallTimer();
 2984  mgrLocal->msmTiming[MsmTimer::GRIDCUTOFF] += stopTime - startTime;
 2985 #endif
 2986 } // MsmBlockKernel<Vtype,Mtype>::gridCutoffKernel()
 2987 #endif // MSM_GRID_CUTOFF_DECOMP
2988 
2989 
// Prolongation: spreads the 2h-grid potential block eh onto the finer
// h grid ehProlongated using the per-approximation stencil *proStencil
// (the adjoint of the restriction operation above).
// NOTE(review): the signature line (original 2991) was dropped by the
// doc extraction — verify against the original source.
 2990 template <class Vtype, class Mtype>
 2992 {
 2993 #ifdef DEBUG_MSM_GRID
 2994  printf("MsmBlockKernel level=%d, id=%d %d %d: prolongation\n",
 2995  blockIndex.level, blockIndex.n.i, blockIndex.n.j, blockIndex.n.k);
 2996 #endif
 2997 
 2998 #ifdef MSM_TIMING
 2999  double startTime, stopTime;
 3000  startTime = CkWallTimer();
 3001 #endif
 3002 #ifndef MSM_COMM_ONLY
 3003  // stencil data for approximating potential on prolongated grid
 3004  const int approx = mgrLocal->approx;
 3005  const int nstencil = ComputeMsmMgr::Nstencil[approx];
 3006  const int *offset = ComputeMsmMgr::IndexOffset[approx];
 3007  const msm::Grid<Mtype>& pro = *proStencil;
 3008 
 3009  // index range for prolongated h grid potentials
 3010  int ia1 = ehProlongated.ia();
 3011  int ib1 = ehProlongated.ib();
 3012  int ja1 = ehProlongated.ja();
 3013  int jb1 = ehProlongated.jb();
 3014  int ka1 = ehProlongated.ka();
 3015  int kb1 = ehProlongated.kb();
 3016 
 3017  // index range for 2h grid potentials
 3018  int ia2 = eh.ia();
 3019  int ib2 = eh.ib();
 3020  int ja2 = eh.ja();
 3021  int jb2 = eh.jb();
 3022  int ka2 = eh.ka();
 3023  int kb2 = eh.kb();
 3024 
// Each coarse point (i2,j2,k2) scatters its weighted potential to the
// stencil of fine-grid points around (2*i2,2*j2,2*k2); out-of-block
// stencil points are skipped.
 3025  // loop over 2h grid
 3026  for (int k2 = ka2; k2 <= kb2; k2++) {
 3027  int k1 = 2 * k2;
 3028  for (int j2 = ja2; j2 <= jb2; j2++) {
 3029  int j1 = 2 * j2;
 3030  for (int i2 = ia2; i2 <= ib2; i2++) {
 3031  int i1 = 2 * i2;
 3032 
 3033  // loop over stencils on prolongated h grid
 3034  for (int k = 0; k < nstencil; k++) {
 3035  int kn = k1 + offset[k];
 3036  if (kn < ka1) continue;
 3037  else if (kn > kb1) break;
 3038 
 3039  for (int j = 0; j < nstencil; j++) {
 3040  int jn = j1 + offset[j];
 3041  if (jn < ja1) continue;
 3042  else if (jn > jb1) break;
 3043 
 3044  for (int i = 0; i < nstencil; i++) {
 3045  int in = i1 + offset[i];
 3046  if (in < ia1) continue;
 3047  else if (in > ib1) break;
 3048 
 3049  ehProlongated(in,jn,kn) += pro(i,j,k) * eh(i2,j2,k2);
 3050  }
 3051  }
 3052  } // end loop over stencils on prolongated h grid
 3053 
 3054  }
 3055  }
 3056  } // end loop over 2h grid
 3057 #else
 3058  ehProlongated.reset(0);
 3059 #endif // !MSM_COMM_ONLY
 3060 #ifdef MSM_TIMING
 3061  stopTime = CkWallTimer();
 3062  mgrLocal->msmTiming[MsmTimer::PROLONGATE] += stopTime - startTime;
 3063 #endif
 3064 } // MsmBlockKernel<Vtype,Mtype>::prolongationKernel()
3065 
3066 
3067 //
3068 // MsmBlock handles grids of function values only
3069 // (for cubic, quintic, etc., approximation)
3070 //
// Charm++ 3D chare array element (one per MSM grid block per level).
// Drives the per-block MSM pipeline: addCharge -> restriction/gridCutoff
// -> addPotential -> prolongation/sendPatch.
3071 class MsmBlock :
3072  public CBase_MsmBlock,
3073  public MsmBlockKernel<Float,Float>
3074 {
3075  public:
  // multicast section proxies over the 1D MsmGridCutoff array:
  // broadcast = elements this block sends charge to,
  // reduction = elements whose potentials are reduced back to this block
3076  CProxySection_MsmGridCutoff msmGridCutoffBroadcast;
3077  CProxySection_MsmGridCutoff msmGridCutoffReduction;
3078 
  // NOTE(review): the extracted listing dropped original line 3080 of the
  // constructor initializer list (presumably the MsmBlockKernel base
  // initializer taking the msm::BlockIndex built below) -- confirm
  // against the repository source.
3079  MsmBlock(int level) :
3081  msm::BlockIndex(level,
3082  msm::Ivec(thisIndex.x, thisIndex.y, thisIndex.z))
3083  )
3084  {
3085 #ifndef MSM_GRID_CUTOFF_DECOMP
  // without decomposition this block also needs the grid-cutoff weights
3086  setupStencils(&(map->grespro), &(map->grespro), &(map->gc[level]));
3087 #else
3088  setupStencils(&(map->grespro), &(map->grespro));
3089 #endif
3090  }
  // migration constructor required for chare array elements
3091  MsmBlock(CkMigrateMessage *m) : MsmBlockKernel<Float,Float>(m) { }
3092 
3093  void setupSections();
3094 
  // reduction client: receives the multicast-reduced potential grid,
  // copies it into a full-sized grid message, and feeds addPotential()
3095  void sumReducedPotential(CkReductionMsg *msg) {
3096 #ifdef MSM_TIMING
3097  double startTime, stopTime;
3098  startTime = CkWallTimer();
3099 #endif
3100  msm::Grid<Float> ehfull;
3101  ehfull.init( msm::IndexRange(eh) );
3102  memcpy(ehfull.data().buffer(), msg->getData(), msg->getSize());
3103  delete msg;
3104  int priority = mgrLocal->nlevels
3105  + 2*(mgrLocal->nlevels - blockIndex.level)-1;
3106  int msgsz = ehfull.data().len() * sizeof(Float);
3107  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3108  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3109  gm->put(ehfull, blockIndex.level, sequence); // send my level
3110 #ifdef MSM_TIMING
3111  stopTime = CkWallTimer();
3112  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3113 #endif
3114  addPotential(gm);
3115  }
3116 
3117  void addCharge(GridMsg *); // entry
3118 
  // restrict charge to the coarser level, then pass it up the hierarchy
3119  void restriction() {
3120  restrictionKernel();
3121  sendUpCharge();
3122  }
3123  void sendUpCharge();
3124  void gridCutoff();
3125 #ifndef MSM_GRID_CUTOFF_DECOMP
3126  void sendAcrossPotential();
3127 #endif
3128 
3129  void addPotential(GridMsg *); // entry
3130 
  // interpolate potential to the finer level, then pass it down
3131  void prolongation() {
3132  prolongationKernel();
3133  sendDownPotential();
3134  }
3135  void sendDownPotential();
3136  void sendPatch();
3137 }; // class MsmBlock
3138 
3139 
// MsmBlock::setupSections()
// NOTE(review): the extracted listing dropped the hyperlinked signature
// line (original line 3140) and original line 3184 (which presumably
// allocates the setup message `msg` used below) -- confirm against the
// repository source.
// Builds the two CkMulticast sections over the MsmGridCutoff 1D array:
// a broadcast section (targets listed in bd->indexGridCutoff) used to
// fan out this block's charges, and a reduction section (sources listed
// in bd->recvGridCutoff) that reduces potentials back to this block.
3141 {
3142 #ifdef DEBUG_MSM_GRID
3143  CkPrintf("LEVEL %d MSM BLOCK (%d,%d,%d): "
3144  "creating broadcast section on PE %d\n",
3145  blockIndex.level, thisIndex.x, thisIndex.y, thisIndex.z, CkMyPe());
3146 #endif
3147  CkVec<CkArrayIndex1D> elems;
3148  for (int n = 0; n < bd->indexGridCutoff.len(); n++) {
3149  elems.push_back(CkArrayIndex1D( bd->indexGridCutoff[n] ));
3150  }
3151  msmGridCutoffBroadcast = CProxySection_MsmGridCutoff::ckNew(
3152  mgrLocal->msmGridCutoff, elems.getVec(), elems.size()
3153  );
  // delegate section communication to the CkMulticast library manager
3154  CProxy_CkMulticastMgr mcastProxy = CkpvAccess(BOCclass_group).multicastMgr;
3155  CkMulticastMgr *mcastPtr = CProxy_CkMulticastMgr(mcastProxy).ckLocalBranch();
3156  msmGridCutoffBroadcast.ckSectionDelegate(mcastPtr);
3157 
3158 #ifdef DEBUG_MSM_GRID
3159  char s[1024];
3160  sprintf(s, "LEVEL %d MSM BLOCK (%d,%d,%d): "
3161  "creating reduction section on PE %d\n",
3162  blockIndex.level, thisIndex.x, thisIndex.y, thisIndex.z, CkMyPe());
3163 #endif
3164  CkVec<CkArrayIndex1D> elems2;
3165 #ifdef DEBUG_MSM_GRID
3166  strcat(s, "receiving from MsmGridCutoff ID:");
3167 #endif
3168  for (int n = 0; n < bd->recvGridCutoff.len(); n++) {
3169 #ifdef DEBUG_MSM_GRID
3170  char t[20];
3171  sprintf(t, " %d", bd->recvGridCutoff[n]);
3172  strcat(s, t);
3173 #endif
3174  elems2.push_back(CkArrayIndex1D( bd->recvGridCutoff[n] ));
3175  }
3176 #ifdef DEBUG_MSM_GRID
3177  strcat(s, "\n");
3178  CkPrintf(s);
3179 #endif
3180  msmGridCutoffReduction = CProxySection_MsmGridCutoff::ckNew(
3181  mgrLocal->msmGridCutoff, elems2.getVec(), elems2.size()
3182  );
3183  msmGridCutoffReduction.ckSectionDelegate(mcastPtr);
  // tell every section member which block element receives the reduction
3185  CProxyElement_MsmBlock thisElementProxy = thisProxy(thisIndex);
3186  msg->put(&thisElementProxy);
3187 
3188  msmGridCutoffReduction.setupSections(msg); // broadcast to entire section
3189 
3190  /* XXX alternatively, setup default reduction client
3191  *
3192  mcastPtr->setReductionClient(msmGridCutoffReduction,
3193  new CkCallback(CkIndex_MsmBlock::myReductionEntry(NULL),
3194  thisElementProxy));
3195  *
3196  */
3197 }
3198 
3199 
// MsmBlock::addCharge() [entry]
// NOTE(review): signature line (original 3200) dropped by the extraction;
// identity from the trailing brace comment.
// Accumulates an incoming charge subgrid into qh.  Once all expected
// charge contributions (bd->numRecvsCharge) have arrived, restricts the
// charge to the next-coarser level (unless already at the top) and then
// starts this level's grid-cutoff computation.
3201 {
3202 #ifdef MSM_TIMING
3203  double startTime, stopTime;
3204  startTime = CkWallTimer();
3205 #endif
3206  int pid;
3207  gm->get(subgrid, pid, sequence);
3208  delete gm;
3209  qh += subgrid;
3210 #ifdef MSM_TIMING
3211  stopTime = CkWallTimer();
3212  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3213 #endif
3214  if (++cntRecvsCharge == bd->numRecvsCharge) {
3215  int nlevels = mgrLocal->numLevels();
3216  if (blockIndex.level < nlevels-1) {
3217  restriction();
3218  }
3219  gridCutoff();
3220  }
3221 } // MsmBlock::addCharge()
3222 
3223 
// MsmBlock::sendUpCharge()
// NOTE(review): signature line (original 3224) dropped by the extraction;
// identity from the trailing brace comment.
// Carves the restricted charge grid (qhRestricted) into the subgrids
// listed in bd->sendUp and sends each, wrapped for periodicity, to the
// owning block on the next-coarser level via its addCharge entry.
3225 {
3226 #ifdef MSM_TIMING
3227  double startTime, stopTime;
3228  startTime = CkWallTimer();
3229 #endif
3230  int lnext = blockIndex.level + 1;
3231  // buffer portions of grid to send to Blocks on next level
3232  for (int n = 0; n < bd->sendUp.len(); n++) {
3233  // initialize the proper subgrid indexing range
3234  subgrid.init( bd->sendUp[n].nrange );
3235  // extract the values from the larger grid into the subgrid
3236  qhRestricted.extract(subgrid);
3237  // translate the subgrid indexing range to match the MSM block
3238  subgrid.updateLower( bd->sendUp[n].nrange_wrap.lower() );
3239  // add the subgrid charges into the block
3240  msm::BlockIndex& bindex = bd->sendUp[n].nblock_wrap;
3241  ASSERT(bindex.level == lnext);
3242  // place subgrid into message
3243  // SET MESSAGE PRIORITY
3244  int msgsz = subgrid.nn() * sizeof(Float);
3245  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3246  SET_PRIORITY(gm, sequence, MSM_PRIORITY + lnext);
3247  gm->put(subgrid, blockIndex.level, sequence); // send my level
3248  // lookup in ComputeMsmMgr proxy array by level
3249  mgrLocal->msmBlock[lnext](
3250  bindex.n.i, bindex.n.j, bindex.n.k).addCharge(gm);
3251  } // for
3252 #ifdef MSM_TIMING
3253  stopTime = CkWallTimer();
3254  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3255 #endif
3256 } // MsmBlock::sendUpCharge()
3257 
3258 
// MsmBlock::gridCutoff()
// NOTE(review): signature line (original 3259) dropped by the extraction;
// identity from the trailing brace comment.
// Starts the grid-cutoff (short-range lattice) part for this block's
// charges.  Without MSM_GRID_CUTOFF_DECOMP the kernel runs locally;
// with it, the charge grid qh is sent to the MsmGridCutoff compute
// elements -- either one message each (disabled "#if 0" path) or a
// single multicast over the broadcast section.
3260 {
3261 #ifdef DEBUG_MSM_GRID
3262  printf("MsmBlock level=%d, id=%d %d %d: grid cutoff\n",
3263  blockIndex.level, blockIndex.n.i, blockIndex.n.j, blockIndex.n.k);
3264 #endif
3265 #ifndef MSM_GRID_CUTOFF_DECOMP
3266  gridCutoffKernel();
3267  sendAcrossPotential();
3268 #else // MSM_GRID_CUTOFF_DECOMP
3269 
3270  // send charge block to MsmGridCutoff compute objects
3271 #ifdef MSM_TIMING
3272  double startTime, stopTime;
3273  startTime = CkWallTimer();
3274 #endif
3275  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level)-1;
3276  int msgsz = qh.data().len() * sizeof(Float);
3277  int len = bd->indexGridCutoff.len();
3278 
3279 #if 0
3280  // send charge message to each MsmGridCutoff compute element in list
3281  for (int n = 0; n < len; n++) {
3282 #ifdef MSM_TIMING
3283  startTime = CkWallTimer();
3284 #endif
3285  int index = bd->indexGridCutoff[n];
3286  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3287  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3288  gm->put(qh, blockIndex.level, sequence); // send my level
3289 #ifdef MSM_TIMING
3290  stopTime = CkWallTimer();
3291  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3292 #endif
3293  mgrLocal->msmGridCutoff[index].compute(gm);
3294  }
3295 #else
3296 
3297  // broadcast charge message to section
3298  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3299  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3300  gm->put(qh, blockIndex.level, sequence); // send my level
3301  msmGridCutoffBroadcast.compute(gm);
3302 #ifdef MSM_TIMING
3303  stopTime = CkWallTimer();
3304  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3305 #endif
3306 
3307 #endif // 0
3308 
3309 #endif // MSM_GRID_CUTOFF_DECOMP
3310 
3311 } // MsmBlock::gridCutoff()
3312 
3313 
3314 #ifndef MSM_GRID_CUTOFF_DECOMP
// Compiled only when grid-cutoff work is NOT decomposed into the
// MsmGridCutoff chare array: distributes the locally computed cutoff
// potential (ehCutoff) as subgrids to sibling blocks on the SAME level
// via their addPotential entry, wrapping indices for periodicity.
3315 void MsmBlock::sendAcrossPotential()
3316 {
3317 #ifdef MSM_TIMING
3318  double startTime, stopTime;
3319  startTime = CkWallTimer();
3320 #endif
3321  int lnext = blockIndex.level;
3322  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level)-1;
3323  // buffer portions of grid to send to Blocks on this level
3324  for (int n = 0; n < bd->sendAcross.len(); n++) {
3325  // initialize the proper subgrid indexing range
3326  subgrid.init( bd->sendAcross[n].nrange );
3327  // extract the values from the larger grid into the subgrid
3328  ehCutoff.extract(subgrid);
3329  // translate the subgrid indexing range to match the MSM block
3330  subgrid.updateLower( bd->sendAcross[n].nrange_wrap.lower() );
3331  // add the subgrid charges into the block
3332  msm::BlockIndex& bindex = bd->sendAcross[n].nblock_wrap;
3333  ASSERT(bindex.level == lnext);
3334  // place subgrid into message
3335  int msgsz = subgrid.nn() * sizeof(Float);
3336  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3337  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3338  gm->put(subgrid, blockIndex.level, sequence); // send my level
3339  // lookup in ComputeMsmMgr proxy array by level
3340  mgrLocal->msmBlock[lnext](
3341  bindex.n.i, bindex.n.j, bindex.n.k).addPotential(gm);
3342  } // for
3343 #ifdef MSM_TIMING
3344  stopTime = CkWallTimer();
3345  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3346 #endif
3347 } // MsmBlock::sendAcrossPotential()
3348 #endif
3349 
3350 
// MsmBlock::addPotential() [entry]
// NOTE(review): signature line (original 3351) dropped by the extraction;
// identity from the trailing brace comment.
// Accumulates an incoming potential subgrid into eh.  Once all expected
// potential contributions (bd->numRecvsPotential) have arrived, either
// prolongates down to the finer level, or (at level 0) sends the
// finished potential to the patches.
3352 {
3353 #ifdef MSM_TIMING
3354  double startTime, stopTime;
3355  startTime = CkWallTimer();
3356 #endif
3357  int pid;
3358  int pseq;
3359  gm->get(subgrid, pid, pseq); // receive sender's level
3360  delete gm;
3361  eh += subgrid;
3362 #ifdef MSM_TIMING
3363  stopTime = CkWallTimer();
3364  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3365 #endif
3366  if (++cntRecvsPotential == bd->numRecvsPotential) {
3367  if (blockIndex.level > 0) {
3368  prolongation();
3369  }
3370  else {
3371  sendPatch();
3372  }
3373  }
3374 } // MsmBlock::addPotential()
3375 
3376 
// MsmBlock::sendDownPotential()
// NOTE(review): signature line (original 3377) dropped by the extraction;
// identity from the trailing brace comment.
// Carves the prolongated potential grid (ehProlongated) into the
// subgrids listed in bd->sendDown and sends each to the owning block on
// the next-finer level via addPotential.  Afterwards re-initializes this
// block for the next timestep.
3378 {
3379 #ifdef MSM_TIMING
3380  double startTime, stopTime;
3381  startTime = CkWallTimer();
3382 #endif
3383  int lnext = blockIndex.level - 1;
3384  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level);
3385  // buffer portions of grid to send to Blocks on next level
3386  for (int n = 0; n < bd->sendDown.len(); n++) {
3387  // initialize the proper subgrid indexing range
3388  subgrid.init( bd->sendDown[n].nrange );
3389  // extract the values from the larger grid into the subgrid
3390  ehProlongated.extract(subgrid);
3391  // translate the subgrid indexing range to match the MSM block
3392  subgrid.updateLower( bd->sendDown[n].nrange_wrap.lower() );
3393  // add the subgrid charges into the block
3394  msm::BlockIndex& bindex = bd->sendDown[n].nblock_wrap;
3395  ASSERT(bindex.level == lnext);
3396  // place subgrid into message
3397  int msgsz = subgrid.nn() * sizeof(Float);
3398  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3399  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3400  gm->put(subgrid, blockIndex.level, sequence); // send my level
3401  // lookup in ComputeMsmMgr proxy array by level
3402  mgrLocal->msmBlock[lnext](
3403  bindex.n.i, bindex.n.j, bindex.n.k).addPotential(gm);
3404  } // for
3405 #ifdef MSM_TIMING
3406  stopTime = CkWallTimer();
3407  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3408  mgrLocal->doneTiming();
3409 #endif
3410  init(); // reinitialize for next computation
3411 } // MsmBlock::sendDownPotential()
3412 
3413 
// MsmBlock::sendPatch()
// NOTE(review): signature line (original 3414) dropped by the extraction;
// identity from the trailing brace comment.
// Level-0 endpoint of the downward pass: carves the finished potential
// grid eh into per-patch subgrids (bd->sendPatch), tags each message
// with its patch ID, and sends it to the ComputeMsmMgr on the PE that
// owns that patch.  Afterwards re-initializes for the next timestep.
3415 {
3416 #ifdef MSM_TIMING
3417  double startTime, stopTime;
3418  startTime = CkWallTimer();
3419 #endif
3420  int lnext = blockIndex.level;
3421  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level);
3422  ASSERT(lnext == 0);
3423  // buffer portions of grid to send to Blocks on next level
3424  for (int n = 0; n < bd->sendPatch.len(); n++) {
3425  // initialize the proper subgrid indexing range
3426  subgrid.init( bd->sendPatch[n].nrange );
3427  // extract the values from the larger grid into the subgrid
3428  eh.extract(subgrid);
3429  // translate the subgrid indexing range to match the MSM block
3430  subgrid.updateLower( bd->sendPatch[n].nrange_unwrap.lower() );
3431  // add the subgrid charges into the block, need its patch ID
3432  int pid = bd->sendPatch[n].patchID;
3433  // place subgrid into message
3434  int msgsz = subgrid.nn() * sizeof(Float);
3435  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3436  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3437  gm->put(subgrid, pid, sequence); // send patch ID
3438  // lookup which PE has this patch
3439  PatchMap *pm = PatchMap::Object();
3440  int pe = pm->node(pid);
3441  mgrProxy[pe].addPotential(gm);
3442  }
3443 #ifdef MSM_TIMING
3444  stopTime = CkWallTimer();
3445  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3446  mgrLocal->doneTiming();
3447 #endif
3448  init(); // reinitialize for next computation
3449 } // MsmBlock::sendPatch()
3450 
3451 
3452 //
3453 // MsmC1HermiteBlock handles grids of vector elements
3454 // for C1 Hermite approximation
3455 //
// Same per-block pipeline as MsmBlock, but grids hold C1Vector values
// and stencils are C1Matrix.  NOTE(review): the extracted listing
// dropped the class declaration line (original 3456) and parts of both
// constructors (original lines 3465 and 3484/3485) -- confirm against
// the repository source.
3457  public CBase_MsmC1HermiteBlock,
3458  public MsmBlockKernel<C1Vector,C1Matrix>
3459 {
3460  public:
  // multicast section proxies over the C1 Hermite grid-cutoff array
3461  CProxySection_MsmC1HermiteGridCutoff msmGridCutoffBroadcast;
3462  CProxySection_MsmC1HermiteGridCutoff msmGridCutoffReduction;
3463 
3464  MsmC1HermiteBlock(int level) :
3466  msm::BlockIndex(level,
3467  msm::Ivec(thisIndex.x, thisIndex.y, thisIndex.z))
3468  )
3469  {
  // top level has no restriction stencil, first level no prolongation
3470  int isfirstlevel = (level == 0);
3471  int istoplevel = (level == map->gridrange.len()-1);
3472  const msm::Grid<C1Matrix> *res =
3473  (istoplevel ? NULL : &(map->gres_c1hermite[level]));
3474  const msm::Grid<C1Matrix> *pro =
3475  (isfirstlevel ? NULL : &(map->gpro_c1hermite[level-1]));
3476 #ifndef MSM_GRID_CUTOFF_DECOMP
3477  const msm::Grid<C1Matrix> *gc = &(map->gc_c1hermite[level]);
3478  setupStencils(res, pro, gc);
3479 #else
3480  setupStencils(res, pro);
3481 #endif
3482  }
3483  MsmC1HermiteBlock(CkMigrateMessage *m) :
3485 
3486  void setupSections();
3487 
  // reduction client: receives the multicast-reduced potential grid,
  // copies it into a full-sized grid message, and feeds addPotential()
3488  void sumReducedPotential(CkReductionMsg *msg) {
3489 #ifdef MSM_TIMING
3490  double startTime, stopTime;
3491  startTime = CkWallTimer();
3492 #endif
3493  msm::Grid<C1Vector> ehfull;
3494  ehfull.init( msm::IndexRange(eh) );
3495  memcpy(ehfull.data().buffer(), msg->getData(), msg->getSize());
3496  delete msg;
3497  int priority = mgrLocal->nlevels
3498  + 2*(mgrLocal->nlevels - blockIndex.level)-1;
3499  int msgsz = ehfull.data().len() * sizeof(C1Vector);
3500  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3501  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3502  gm->put(ehfull, blockIndex.level, sequence); // send my level
3503 #ifdef MSM_TIMING
3504  stopTime = CkWallTimer();
3505  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3506 #endif
3507  addPotential(gm);
3508  }
3509 
3510  void addCharge(GridMsg *); // entry
3511 
  // restrict charge to the coarser level, then pass it up the hierarchy
3512  void restriction() {
3513  restrictionKernel();
3514  sendUpCharge();
3515  }
3516  void sendUpCharge();
3517  void gridCutoff();
3518 #ifndef MSM_GRID_CUTOFF_DECOMP
3519  void sendAcrossPotential();
3520 #endif
3521 
3522  void addPotential(GridMsg *); // entry
3523 
  // interpolate potential to the finer level, then pass it down
3524  void prolongation() {
3525  prolongationKernel();
3526  sendDownPotential();
3527  }
3528  void sendDownPotential();
3529  void sendPatch();
3530 }; // class MsmC1HermiteBlock
3531 
3532 
// MsmC1HermiteBlock::setupSections()
// NOTE(review): the extracted listing dropped the hyperlinked signature
// line (original line 3533) and original line 3577 (which presumably
// allocates the setup message `msg` used below) -- confirm against the
// repository source.
// C1 Hermite counterpart of MsmBlock::setupSections(): builds broadcast
// and reduction multicast sections over the MsmC1HermiteGridCutoff
// array and delegates them to the CkMulticast manager.
3534 {
3535 #ifdef DEBUG_MSM_GRID
3536  CkPrintf("LEVEL %d MSM C1 HERMITE BLOCK (%d,%d,%d): "
3537  "creating broadcast section on PE %d\n",
3538  blockIndex.level, thisIndex.x, thisIndex.y, thisIndex.z, CkMyPe());
3539 #endif
3540  CkVec<CkArrayIndex1D> elems;
3541  for (int n = 0; n < bd->indexGridCutoff.len(); n++) {
3542  elems.push_back(CkArrayIndex1D( bd->indexGridCutoff[n] ));
3543  }
3544  msmGridCutoffBroadcast = CProxySection_MsmC1HermiteGridCutoff::ckNew(
3545  mgrLocal->msmC1HermiteGridCutoff, elems.getVec(), elems.size()
3546  );
  // delegate section communication to the CkMulticast library manager
3547  CProxy_CkMulticastMgr mcastProxy = CkpvAccess(BOCclass_group).multicastMgr;
3548  CkMulticastMgr *mcastPtr = CProxy_CkMulticastMgr(mcastProxy).ckLocalBranch();
3549  msmGridCutoffBroadcast.ckSectionDelegate(mcastPtr);
3550 
3551 #ifdef DEBUG_MSM_GRID
3552  char s[1024];
3553  sprintf(s, "LEVEL %d MSM C1 HERMITE BLOCK (%d,%d,%d): "
3554  "creating reduction section on PE %d\n",
3555  blockIndex.level, thisIndex.x, thisIndex.y, thisIndex.z, CkMyPe());
3556 #endif
3557  CkVec<CkArrayIndex1D> elems2;
3558 #ifdef DEBUG_MSM_GRID
3559  strcat(s, "receiving from MsmC1HermiteGridCutoff ID:");
3560 #endif
3561  for (int n = 0; n < bd->recvGridCutoff.len(); n++) {
3562 #ifdef DEBUG_MSM_GRID
3563  char t[20];
3564  sprintf(t, " %d", bd->recvGridCutoff[n]);
3565  strcat(s, t);
3566 #endif
3567  elems2.push_back(CkArrayIndex1D( bd->recvGridCutoff[n] ));
3568  }
3569 #ifdef DEBUG_MSM_GRID
3570  strcat(s, "\n");
3571  CkPrintf(s);
3572 #endif
3573  msmGridCutoffReduction = CProxySection_MsmC1HermiteGridCutoff::ckNew(
3574  mgrLocal->msmC1HermiteGridCutoff, elems2.getVec(), elems2.size()
3575  );
3576  msmGridCutoffReduction.ckSectionDelegate(mcastPtr);
  // tell every section member which block element receives the reduction
3578  CProxyElement_MsmC1HermiteBlock thisElementProxy = thisProxy(thisIndex);
3579  msg->put(&thisElementProxy);
3580 
3581  msmGridCutoffReduction.setupSections(msg); // broadcast to entire section
3582 
3583  /* XXX alternatively, setup default reduction client
3584  *
3585  mcastPtr->setReductionClient(msmGridCutoffReduction,
3586  new CkCallback(CkIndex_MsmC1HermiteBlock::myReductionEntry(NULL),
3587  thisElementProxy));
3588  *
3589  */
3590 } // MsmC1HermiteBlock::setupSections()
3591 
3592 
// MsmC1HermiteBlock::addCharge() [entry]
// NOTE(review): signature line (original 3593) dropped by the extraction;
// identity from the trailing brace comment.
// Accumulates an incoming charge subgrid into qh; when the last expected
// contribution arrives, restricts upward (if not top level) and starts
// the grid-cutoff computation.  Mirrors MsmBlock::addCharge().
3594 {
3595 #ifdef MSM_TIMING
3596  double startTime, stopTime;
3597  startTime = CkWallTimer();
3598 #endif
3599  int pid;
3600  gm->get(subgrid, pid, sequence);
3601  delete gm;
3602  qh += subgrid;
3603 #ifdef MSM_TIMING
3604  stopTime = CkWallTimer();
3605  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3606 #endif
3607  if (++cntRecvsCharge == bd->numRecvsCharge) {
3608  int nlevels = mgrLocal->numLevels();
3609  if (blockIndex.level < nlevels-1) {
3610  restriction();
3611  }
3612  gridCutoff();
3613  }
3614 } // MsmC1HermiteBlock::addCharge()
3615 
3616 
// MsmC1HermiteBlock::sendUpCharge()
// NOTE(review): signature line (original 3617) dropped by the extraction;
// identity from the trailing brace comment.
// Sends restricted-charge subgrids to next-coarser-level blocks; mirrors
// MsmBlock::sendUpCharge() with C1Vector-sized message payloads.
3618 {
3619 #ifdef MSM_TIMING
3620  double startTime, stopTime;
3621  startTime = CkWallTimer();
3622 #endif
3623  int lnext = blockIndex.level + 1;
3624  // buffer portions of grid to send to Blocks on next level
3625  for (int n = 0; n < bd->sendUp.len(); n++) {
3626  // initialize the proper subgrid indexing range
3627  subgrid.init( bd->sendUp[n].nrange );
3628  // extract the values from the larger grid into the subgrid
3629  qhRestricted.extract(subgrid);
3630  // translate the subgrid indexing range to match the MSM block
3631  subgrid.updateLower( bd->sendUp[n].nrange_wrap.lower() );
3632  // add the subgrid charges into the block
3633  msm::BlockIndex& bindex = bd->sendUp[n].nblock_wrap;
3634  ASSERT(bindex.level == lnext);
3635  // place subgrid into message
3636  // SET MESSAGE PRIORITY
3637  int msgsz = subgrid.nn() * sizeof(C1Vector);
3638  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3639  SET_PRIORITY(gm, sequence, MSM_PRIORITY + lnext);
3640  gm->put(subgrid, blockIndex.level, sequence); // send my level
3641  // lookup in ComputeMsmMgr proxy array by level
3642  mgrLocal->msmC1HermiteBlock[lnext](
3643  bindex.n.i, bindex.n.j, bindex.n.k).addCharge(gm);
3644  } // for
3645 #ifdef MSM_TIMING
3646  stopTime = CkWallTimer();
3647  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3648 #endif
3649 } // MsmC1HermiteBlock::sendUpCharge()
3650 
3651 
// MsmC1HermiteBlock::gridCutoff()
// NOTE(review): signature line (original 3652) dropped by the extraction;
// identity from the trailing brace comment.
// Starts the grid-cutoff part for this block; mirrors
// MsmBlock::gridCutoff(), broadcasting qh over the C1 Hermite
// grid-cutoff section (the per-element send loop is disabled "#if 0").
3653 {
3654 #ifdef DEBUG_MSM_GRID
3655  printf("MsmC1HermiteBlock level=%d, id=%d %d %d: grid cutoff\n",
3656  blockIndex.level, blockIndex.n.i, blockIndex.n.j, blockIndex.n.k);
3657 #endif
3658 #ifndef MSM_GRID_CUTOFF_DECOMP
3659  gridCutoffKernel();
3660  sendAcrossPotential();
3661 #else // MSM_GRID_CUTOFF_DECOMP
3662 
3663  // send charge block to MsmGridCutoff compute objects
3664 #ifdef MSM_TIMING
3665  double startTime, stopTime;
3666  startTime = CkWallTimer();
3667 #endif
3668  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level)-1;
3669  int msgsz = qh.data().len() * sizeof(C1Vector);
3670  int len = bd->indexGridCutoff.len();
3671 
3672 #if 0
3673  // send charge message to each MsmGridCutoff compute element in list
3674  for (int n = 0; n < len; n++) {
3675 #ifdef MSM_TIMING
3676  startTime = CkWallTimer();
3677 #endif
3678  int index = bd->indexGridCutoff[n];
3679  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3680  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3681  gm->put(qh, blockIndex.level, sequence); // send my level
3682 #ifdef MSM_TIMING
3683  stopTime = CkWallTimer();
3684  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3685 #endif
3686  mgrLocal->msmGridCutoff[index].compute(gm);
3687  }
3688 #else
3689 
3690  // broadcast charge message to section
3691  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3692  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3693  gm->put(qh, blockIndex.level, sequence); // send my level
3694  msmGridCutoffBroadcast.compute(gm);
3695 #ifdef MSM_TIMING
3696  stopTime = CkWallTimer();
3697  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3698 #endif
3699 
3700 #endif // 0
3701 
3702 #endif // MSM_GRID_CUTOFF_DECOMP
3703 
3704 } // MsmC1HermiteBlock::gridCutoff()
3705 
3706 
3707 #ifndef MSM_GRID_CUTOFF_DECOMP
// Compiled only when grid-cutoff work is NOT decomposed: distributes the
// locally computed cutoff potential (ehCutoff) as subgrids to sibling
// blocks on the SAME level; mirrors MsmBlock::sendAcrossPotential().
3708 void MsmC1HermiteBlock::sendAcrossPotential()
3709 {
3710 #ifdef MSM_TIMING
3711  double startTime, stopTime;
3712  startTime = CkWallTimer();
3713 #endif
3714  int lnext = blockIndex.level;
3715  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level)-1;
3716  // buffer portions of grid to send to Blocks on this level
3717  for (int n = 0; n < bd->sendAcross.len(); n++) {
3718  // initialize the proper subgrid indexing range
3719  subgrid.init( bd->sendAcross[n].nrange );
3720  // extract the values from the larger grid into the subgrid
3721  ehCutoff.extract(subgrid);
3722  // translate the subgrid indexing range to match the MSM block
3723  subgrid.updateLower( bd->sendAcross[n].nrange_wrap.lower() );
3724  // add the subgrid charges into the block
3725  msm::BlockIndex& bindex = bd->sendAcross[n].nblock_wrap;
3726  ASSERT(bindex.level == lnext);
3727  // place subgrid into message
3728  int msgsz = subgrid.nn() * sizeof(C1Vector);
3729  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3730  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3731  gm->put(subgrid, blockIndex.level, sequence); // send my level
3732  // lookup in ComputeMsmMgr proxy array by level
3733  mgrLocal->msmC1HermiteBlock[lnext](
3734  bindex.n.i, bindex.n.j, bindex.n.k).addPotential(gm);
3735  } // for
3736 #ifdef MSM_TIMING
3737  stopTime = CkWallTimer();
3738  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3739 #endif
3740 } // MsmC1HermiteBlock::sendAcrossPotential()
3741 #endif
3742 
3743 
// MsmC1HermiteBlock::addPotential() [entry]
// NOTE(review): signature line (original 3744) dropped by the extraction;
// identity from the trailing brace comment.
// Accumulates an incoming potential subgrid into eh; once all expected
// contributions have arrived, prolongates downward or (at level 0)
// sends results to patches.  Mirrors MsmBlock::addPotential().
3745 {
3746 #ifdef MSM_TIMING
3747  double startTime, stopTime;
3748  startTime = CkWallTimer();
3749 #endif
3750  int pid;
3751  int pseq;
3752  gm->get(subgrid, pid, pseq); // receive sender's level
3753  delete gm;
3754  eh += subgrid;
3755 #ifdef MSM_TIMING
3756  stopTime = CkWallTimer();
3757  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3758 #endif
3759  if (++cntRecvsPotential == bd->numRecvsPotential) {
3760  if (blockIndex.level > 0) {
3761  prolongation();
3762  }
3763  else {
3764  sendPatch();
3765  }
3766  }
3767 } // MsmC1HermiteBlock::addPotential()
3768 
3769 
// MsmC1HermiteBlock::sendDownPotential()
// NOTE(review): signature line (original 3770) dropped by the extraction;
// identity from the trailing brace comment.
// Sends prolongated-potential subgrids to next-finer-level blocks, then
// re-initializes for the next timestep; mirrors
// MsmBlock::sendDownPotential().
3771 {
3772 #ifdef MSM_TIMING
3773  double startTime, stopTime;
3774  startTime = CkWallTimer();
3775 #endif
3776  int lnext = blockIndex.level - 1;
3777  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level);
3778  // buffer portions of grid to send to Blocks on next level
3779  for (int n = 0; n < bd->sendDown.len(); n++) {
3780  // initialize the proper subgrid indexing range
3781  subgrid.init( bd->sendDown[n].nrange );
3782  // extract the values from the larger grid into the subgrid
3783  ehProlongated.extract(subgrid);
3784  // translate the subgrid indexing range to match the MSM block
3785  subgrid.updateLower( bd->sendDown[n].nrange_wrap.lower() );
3786  // add the subgrid charges into the block
3787  msm::BlockIndex& bindex = bd->sendDown[n].nblock_wrap;
3788  ASSERT(bindex.level == lnext);
3789  // place subgrid into message
3790  int msgsz = subgrid.nn() * sizeof(C1Vector);
3791  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3792  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3793  gm->put(subgrid, blockIndex.level, sequence); // send my level
3794  // lookup in ComputeMsmMgr proxy array by level
3795  mgrLocal->msmC1HermiteBlock[lnext](
3796  bindex.n.i, bindex.n.j, bindex.n.k).addPotential(gm);
3797  } // for
3798 #ifdef MSM_TIMING
3799  stopTime = CkWallTimer();
3800  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3801  mgrLocal->doneTiming();
3802 #endif
3803  init(); // reinitialize for next computation
3804 } // MsmC1HermiteBlock::sendDownPotential()
3805 
3806 
// MsmC1HermiteBlock::sendPatch()
// NOTE(review): signature line (original 3807) dropped by the extraction;
// identity from the trailing brace comment.
// Level-0 endpoint of the downward pass: sends per-patch potential
// subgrids (tagged with patch ID) to the ComputeMsmMgr on each patch's
// PE, then re-initializes; mirrors MsmBlock::sendPatch().
3808 {
3809 #ifdef MSM_TIMING
3810  double startTime, stopTime;
3811  startTime = CkWallTimer();
3812 #endif
3813  int lnext = blockIndex.level;
3814  int priority = mgrLocal->nlevels + 2*(mgrLocal->nlevels - blockIndex.level);
3815  ASSERT(lnext == 0);
3816  // buffer portions of grid to send to Blocks on next level
3817  for (int n = 0; n < bd->sendPatch.len(); n++) {
3818  // initialize the proper subgrid indexing range
3819  subgrid.init( bd->sendPatch[n].nrange );
3820  // extract the values from the larger grid into the subgrid
3821  eh.extract(subgrid);
3822  // translate the subgrid indexing range to match the MSM block
3823  subgrid.updateLower( bd->sendPatch[n].nrange_unwrap.lower() );
3824  // add the subgrid charges into the block, need its patch ID
3825  int pid = bd->sendPatch[n].patchID;
3826  // place subgrid into message
3827  int msgsz = subgrid.nn() * sizeof(C1Vector);
3828  GridMsg *gm = new(msgsz, sizeof(int)) GridMsg;
3829  SET_PRIORITY(gm, sequence, MSM_PRIORITY + priority);
3830  gm->put(subgrid, pid, sequence); // send patch ID
3831  // lookup which PE has this patch
3832  PatchMap *pm = PatchMap::Object();
3833  int pe = pm->node(pid);
3834  mgrProxy[pe].addPotential(gm);
3835  }
3836 #ifdef MSM_TIMING
3837  stopTime = CkWallTimer();
3838  mgrLocal->msmTiming[MsmTimer::COMM] += stopTime - startTime;
3839  mgrLocal->doneTiming();
3840 #endif
3841  init(); // reinitialize for next computation
3842 } // MsmC1HermiteBlock::sendPatch()
3843 
3844 
3845 // MsmBlock
3846 //
3848 
3849 
// ComputeMsmMgr constructor (group chare, one branch per PE).
// NOTE(review): the extracted listing dropped the hyperlinked
// constructor signature line (original 3850); only the initializer list
// is visible.  Registers this group in the Charm++ BOC table and, on
// PE 0 only, creates the optional timing/profiling chares.
3851  msmProxy(thisgroup), msmCompute(0)
3852 {
3853 #ifdef DEBUG_MSM_VERBOSE
3854  printf("ComputeMsmMgr: (constructor) PE %d\n", CkMyPe());
3855 #endif
3856  CkpvAccess(BOCclass_group).computeMsmMgr = thisgroup;
3857 
3858 #ifdef MSM_TIMING
3859  if (CkMyPe() == 0) {
3860  msmTimer = CProxy_MsmTimer::ckNew();
3861  }
3862  initTiming();
3863 #endif
3864 #ifdef MSM_PROFILING
3865  if (CkMyPe() == 0) {
3866  msmProfiler = CProxy_MsmProfiler::ckNew();
3867  }
3868  initProfiling();
3869 #endif
3870 }
3871 
// ComputeMsmMgr destructor.
// NOTE(review): signature line (original 3872) dropped by the
// extraction.  Intentionally frees nothing here (see the existing
// question comment below) -- verify no resources leak at shutdown.
3873 {
3874 #ifdef DEBUG_MSM_VERBOSE
3875  printf("ComputeMsmMgr: (destructor) PE %d\n", CkMyPe());
3876 #endif
3877  // free memory?
3878 }
3879 
3880 
3881 //
3882 // Given basis vector length "len" (and with user grid spacing)
3883 // If using periodic boundary conditions along this basis vector,
3884 // h is calculated to be close to desired grid spacing such that
3885 // nn = 2^k or 3*2^k. For non-periodic boundaries, we can set h
3886 // to the desired grid spacing, and set ia and ib to pad 1/2 the
3887 // interpolating stencil width.
3888 //
3890  int& ia, int& ib, int isperiodic)
3891 {
3892  ASSERT(gridspacing > 0);
3893  if (isperiodic) {
3894  const BigReal hmin = (4./5) * gridspacing;
3895  const BigReal hmax = 1.5 * hmin;
3896  hh = len;
3897  nn = 1; // start with one grid point across length
3898  while (hh >= hmax) {
3899  hh *= 0.5; // halve spacing and double grid points
3900  nn <<= 1;
3901  }
3902  if (hh < hmin) {
3903  if (nn < 4) {
3904  NAMD_die("Basis vector is too short or MSM grid spacing is too large");
3905  }
3906  hh *= (4./3); // scale hh by 4/3 and nn by 3/4
3907  nn >>= 2;
3908  nn *= 3;
3909  }
3910  // now we have: hmin <= h < hmax,
3911  // where nn is a power of 2 times no more than one power of 3
3912  ia = 0;
3913  ib = nn-1;
3914  }
3915  else {
3916  hh = gridspacing;
3917  // Instead of "nn = (int) ceil(len / hh);"
3918  // len is divisible by hh, up to roundoff error, so round to closest nn
3919  nn = (int) floor(len/hh + 0.5);
3920  ia = -s_edge;
3921  ib = nn + s_edge;
3922  }
3923 } // ComputeMsmMgr::setup_hgrid_1d()
3924 
3925 
3926 // make sure that block sizes divide evenly into periodic dimensions
3927 // call only for periodic dimensions
// NOTE(review): the signature line (original 3928) was dropped by the
// extraction; from the body, this takes the grid size n and a block
// size by reference (bsize) and shrinks bsize to the largest divisor of
// n (of form 2^k or 3*2^k) not exceeding the requested size.
3929 {
3930  if (n % bsize != 0) {
3931  // n is either 2^k or 3*2^k
3932  int newbsize = 1;
3933  if (n % 3 == 0) newbsize = 3;
3934  while (newbsize < bsize && newbsize < n) newbsize *= 2;
  // the loop may overshoot the requested size by one doubling; back off
3935  if (bsize < newbsize) newbsize /= 2;
3936  if (n % newbsize != 0) {
3937  NAMD_die("MSM grid size for periodic dimensions must be "
3938  "a power of 2 times at most one power of 3");
3939  }
3940  bsize = newbsize;
3941  }
3942  return;
3943 }
3944 
3945 
3946 //
3947 // This is the major routine that sets everything up for MSM based on
3948 // 1. cell basis vectors and/or max and min coordinates plus padding
3949 // 2. cutoff and MSM-related parameters from SimParameter
3950 // Includes determining grid spacings along periodic dimensions,
3951 // determining grid dimensions and number of levels for system,
3952 // then calculating all needed coefficients for grid cutoff part
3953 // and grid transfer parts (restriction and prolongation).
3954 //
3955 // Then sets up Map for parallel decomposition based on
3956 // MSM block size parameters from SimParameter.
3957 //
3958 // Then determines chare array element placement of MsmBlock and
3959 // MsmGridCutoff arrays based on number of PEs and number of nodes.
3960 //
3961 // Then allocates (on PE 0) MsmBlock (3D chare arrays, one per level)
3962 // and MsmGridCutoff (one 1D chare array for all block-block interactions)
3963 // and then broadcasts array proxies across group.
3964 //
3966 {
3967 #ifdef DEBUG_MSM_VERBOSE
3968  printf("ComputeMsmMgr: initialize() PE %d\n", CkMyPe());
3969 #endif
3970 
3971  smin = msg->smin;
3972  smax = msg->smax;
3973  delete msg;
3974 
3975 #if 0
3976  printf("PE%d: initializing MSM\n", CkMyPe());
3977 #endif
3978 
3980 
3981  // get required sim params, check validity
3982  lattice = simParams->lattice;
3983 
3984  // set user-defined extent of system
3985  Vector rmin(simParams->MSMxmin, simParams->MSMymin, simParams->MSMzmin);
3986  Vector rmax(simParams->MSMxmax, simParams->MSMymax, simParams->MSMzmax);
3987  Vector sdmin = lattice.scale(rmin);
3988  Vector sdmax = lattice.scale(rmax);
3989  // swap coordinates between min and max to correct for possible rotation
3990  if (sdmin.x > sdmax.x) { double t=sdmin.x; sdmin.x=sdmax.x; sdmax.x=t; }
3991  if (sdmin.y > sdmax.y) { double t=sdmin.y; sdmin.y=sdmax.y; sdmax.y=t; }
3992  if (sdmin.z > sdmax.z) { double t=sdmin.z; sdmin.z=sdmax.z; sdmax.z=t; }
3993  // extend smin, smax by user-defined extent, where appropriate
3994  if ( ! lattice.a_p() && (sdmin.x != 0 || sdmax.x != 0)) {
3995  if (sdmin.x < smin.x) {
3996  smin.x = sdmin.x;
3997  if (CkMyPe() == 0) {
3998  iout << iINFO << "MSM extending minimum X to "
3999  << simParams->MSMxmin << " A\n" << endi;
4000  }
4001  }
4002  if (sdmax.x > smax.x) {
4003  smax.x = sdmax.x;
4004  if (CkMyPe() == 0) {
4005  iout << iINFO << "MSM extending maximum X to "
4006  << simParams->MSMxmax << " A\n" << endi;
4007  }
4008  }
4009  }
4010  if ( ! lattice.b_p() && (sdmin.y != 0 || sdmax.y != 0)) {
4011  if (sdmin.y < smin.y) {
4012  smin.y = sdmin.y;
4013  if (CkMyPe() == 0) {
4014  iout << iINFO << "MSM extending minimum Y to "
4015  << simParams->MSMymin << " A\n" << endi;
4016  }
4017  }
4018  if (sdmax.y > smax.y) {
4019  smax.y = sdmax.y;
4020  if (CkMyPe() == 0) {
4021  iout << iINFO << "MSM extending maximum Y to "
4022  << simParams->MSMymax << " A\n" << endi;
4023  }
4024  }
4025  }
4026  if ( ! lattice.c_p() && (sdmin.z != 0 || sdmax.z != 0)) {
4027  if (sdmin.z < smin.z) {
4028  smin.z = sdmin.z;
4029  if (CkMyPe() == 0) {
4030  iout << iINFO << "MSM extending minimum Z to "
4031  << simParams->MSMzmin << " A\n" << endi;
4032  }
4033  }
4034  if (sdmax.z > smax.z) {
4035  smax.z = sdmax.z;
4036  if (CkMyPe() == 0) {
4037  iout << iINFO << "MSM extending maximum Z to "
4038  << simParams->MSMzmax << " A\n" << endi;
4039  }
4040  }
4041  }
4042 
4043 #ifdef DEBUG_MSM_VERBOSE
4044  printf("smin = %g %g %g smax = %g %g %g\n",
4045  smin.x, smin.y, smin.z, smax.x, smax.y, smax.z);
4046 #endif
4047 
4048  approx = simParams->MSMApprox;
4049  if (approx < 0 || approx >= NUM_APPROX) {
4050  NAMD_die("MSM: unknown approximation requested (MSMApprox)");
4051  }
4052 
4053  split = simParams->MSMSplit;
4054  if (split < 0 || split >= NUM_SPLIT) {
4055  NAMD_die("MSM: unknown splitting requested (MSMSplit)");
4056  }
4057 
4058  if (CkMyPe() == 0) {
4059  const char *approx_str, *split_str;
4060  switch (approx) {
4061  case CUBIC: approx_str = "C1 cubic"; break;
4062  case QUINTIC: approx_str = "C1 quintic"; break;
4063  case QUINTIC2: approx_str = "C2 quintic"; break;
4064  case SEPTIC: approx_str = "C1 septic"; break;
4065  case SEPTIC3: approx_str = "C3 septic"; break;
4066  case NONIC: approx_str = "C1 nonic"; break;
4067  case NONIC4: approx_str = "C4 nonic"; break;
4068  case C1HERMITE: approx_str = "C1 Hermite"; break;
4069  default: approx_str = "unknown"; break;
4070  }
4071  switch (split) {
4072  case TAYLOR2: split_str = "C2 Taylor"; break;
4073  case TAYLOR3: split_str = "C3 Taylor"; break;
4074  case TAYLOR4: split_str = "C4 Taylor"; break;
4075  case TAYLOR5: split_str = "C5 Taylor"; break;
4076  case TAYLOR6: split_str = "C6 Taylor"; break;
4077  case TAYLOR7: split_str = "C7 Taylor"; break;
4078  case TAYLOR8: split_str = "C8 Taylor"; break;
4079  default: split_str = "unknown"; break;
4080  }
4081  iout << iINFO << "MSM using "
4082  << approx_str << " interpolation\n";
4083  iout << iINFO << "MSM using "
4084  << split_str << " splitting function\n";
4085  }
4086 
4087  a = simParams->cutoff;
4088 
4089  if (approx == C1HERMITE) {
4090  gridScalingFactor = 2;
4091  }
4092  else {
4093  gridScalingFactor = 1;
4094  }
4095 
4097  if (gridspacing <= 0) {
4098  NAMD_die("MSM: grid spacing must be greater than 0 (MSMGridSpacing)");
4099  }
4100  else if (gridspacing >= a) {
4101  NAMD_die("MSM: grid spacing must be less than cutoff (MSMGridSpacing)");
4102  }
4103 
4104  padding = gridScalingFactor * simParams->MSMPadding;
4105  if (padding < 0) {
4106  NAMD_die("MSM: padding must be non-negative (MSMPadding)");
4107  }
4108 
4109  // set maximum number of levels (default 0 adapts levels to system)
4110  nlevels = simParams->MSMLevels;
4111 
4112  // XXX dispersion unused for now
4113  dispersion = 0;
4114  if ( ! dispersion && split >= TAYLOR2_DISP) {
4115  NAMD_die("MSM: requested splitting for long-range dispersion "
4116  "(not implemented)");
4117  }
4118 
4119  // set block sizes for grid decomposition
4120  int bsx = simParams->MSMBlockSizeX / int(gridScalingFactor);
4121  int bsy = simParams->MSMBlockSizeY / int(gridScalingFactor);
4122  int bsz = simParams->MSMBlockSizeZ / int(gridScalingFactor);
4123  if (bsx <= 0 || bsy <= 0 || bsz <= 0) {
4124  NAMD_die("MSM: invalid block size requested (MSMBlockSize[XYZ])");
4125  }
4126 #ifdef MSM_FIXED_SIZE_GRID_MSG
4127  else if (bsx * bsy * bsz > MSM_MAX_BLOCK_VOLUME) {
4128  NAMD_die("MSM: requested block size (MSMBlockSize[XYZ]) too big");
4129  }
4130 #endif
4131  if (CkMyPe() == 0) {
4132  iout << iINFO << "MSM block size decomposition along X is "
4133  << bsx << " grid points\n";
4134  iout << iINFO << "MSM block size decomposition along Y is "
4135  << bsy << " grid points\n";
4136  iout << iINFO << "MSM block size decomposition along Z is "
4137  << bsz << " grid points\n";
4138  }
4139 
4140  s_edge = (PolyDegree[approx] - 1) / 2; // stencil edge size
4141  omega = 2 * PolyDegree[approx]; // smallest non-periodic grid length
4142 
4143  BigReal xlen, ylen, zlen;
4144  Vector sgmin, sgmax; // grid min and max, in scaled coordinates
4145  int ispx = lattice.a_p();
4146  int ispy = lattice.b_p();
4147  int ispz = lattice.c_p();
4148  int ispany = (ispx || ispy || ispz); // is there any periodicity?
4149 
4150  if (ispx) { // periodic along basis vector
4151  xlen = lattice.a().length();
4152  sgmax.x = 0.5;
4153  sgmin.x = -0.5;
4154  }
4155  else { // non-periodic
4156  sgmax.x = smax.x + padding; // pad the edges
4157  sgmin.x = smin.x - padding;
4158  ASSERT(gridspacing > 0);
4159  // restrict center to be on a grid point
4160  BigReal mupper = ceil(sgmax.x / (2*gridspacing));
4161  BigReal mlower = floor(sgmin.x / (2*gridspacing));
4162  sgmax.x = 2*gridspacing*mupper;
4163  sgmin.x = 2*gridspacing*mlower;
4164  xlen = sgmax.x - sgmin.x;
4165  }
4166 #ifdef DEBUG_MSM_VERBOSE
4167  printf("xlen = %g sgmin.x = %g sgmax.x = %g\n", xlen, sgmin.x, sgmax.x);
4168 #endif
4169 
4170  if (ispy) { // periodic along basis vector
4171  ylen = lattice.b().length();
4172  sgmax.y = 0.5;
4173  sgmin.y = -0.5;
4174  }
4175  else { // non-periodic
4176  sgmax.y = smax.y + padding; // pad the edges
4177  sgmin.y = smin.y - padding;
4178  ASSERT(gridspacing > 0);
4179  // restrict center to be on a grid point
4180  BigReal mupper = ceil(sgmax.y / (2*gridspacing));
4181  BigReal mlower = floor(sgmin.y / (2*gridspacing));
4182  sgmax.y = 2*gridspacing*mupper;
4183  sgmin.y = 2*gridspacing*mlower;
4184  ylen = sgmax.y - sgmin.y;
4185  }
4186 #ifdef DEBUG_MSM_VERBOSE
4187  printf("ylen = %g sgmin.y = %g sgmax.y = %g\n", ylen, sgmin.y, sgmax.y);
4188 #endif
4189 
4190  if (ispz) { // periodic along basis vector
4191  zlen = lattice.c().length();
4192  sgmax.z = 0.5;
4193  sgmin.z = -0.5;
4194  }
4195  else { // non-periodic
4196  sgmax.z = smax.z + padding; // pad the edges
4197  sgmin.z = smin.z - padding;
4198  ASSERT(gridspacing > 0);
4199  // restrict center to be on a grid point
4200  BigReal mupper = ceil(sgmax.z / (2*gridspacing));
4201  BigReal mlower = floor(sgmin.z / (2*gridspacing));
4202  sgmax.z = 2*gridspacing*mupper;
4203  sgmin.z = 2*gridspacing*mlower;
4204  zlen = sgmax.z - sgmin.z;
4205  }
4206 #ifdef DEBUG_MSM_VERBOSE
4207  printf("zlen = %g sgmin.z = %g sgmax.z = %g\n", zlen, sgmin.z, sgmax.z);
4208 #endif
4209  sglower = sgmin;
4210 
4211  int ia, ib, ja, jb, ka, kb;
4212  setup_hgrid_1d(xlen, hxlen, nhx, ia, ib, ispx);
4213  setup_hgrid_1d(ylen, hylen, nhy, ja, jb, ispy);
4214  setup_hgrid_1d(zlen, hzlen, nhz, ka, kb, ispz);
4215  hxlen_1 = 1 / hxlen;
4216  hylen_1 = 1 / hylen;
4217  hzlen_1 = 1 / hzlen;
4218  if (CkMyPe() == 0) {
4219  if (ispx || ispy || ispz) {
4220  iout << iINFO << "MSM grid spacing along X is "<< hxlen << " A\n";
4221  iout << iINFO << "MSM grid spacing along Y is "<< hylen << " A\n";
4222  iout << iINFO << "MSM grid spacing along Z is "<< hzlen << " A\n";
4223  }
4224  else {
4225  iout << iINFO << "MSM grid spacing is " << gridspacing << " A\n";
4226  }
4227  if ( ! ispx || ! ispy || ! ispz ) {
4228  iout << iINFO<<"MSM non-periodic padding is "<< padding << " A\n";
4229  }
4230  }
4231 
4232  int ni = ib - ia + 1;
4233  int nj = jb - ja + 1;
4234  int nk = kb - ka + 1;
4235  int n;
4236 
4237 #if 0
4238  // reserve temp space for factored grid transfer operation
4239  n = (nk > omega ? nk : omega); // row along z-dimension
4240  lzd.resize(n);
4241  n *= (nj > omega ? nj : omega); // plane along yz-dimensions
4242  lyzd.resize(n);
4243 #endif
4244 
4245  int lastnelems = 1;
4246  int smallestnbox = 1;
4247  int isclamped = 0;
4248  int maxlevels = nlevels; // user-defined number of levels
4249 
4250 #ifdef DEBUG_MSM_VERBOSE
4251  printf("maxlevels = %d\n", maxlevels);
4252 #endif
4253  if (nlevels <= 0) { // instead we set number of levels
4254  n = ni;
4255  if (n < nj) n = nj;
4256  if (n < nk) n = nk;
4257  for (maxlevels = 1; n > 0; n >>= 1) maxlevels++;
4258  if (ispany == 0) { // no periodicity
4259  // use rule of thumb 3/4 diameter of grid cutoff sphere
4260  int ngci = (int) ceil(3*a / hxlen) - 1;
4261  int ngcj = (int) ceil(3*a / hylen) - 1;
4262  int ngck = (int) ceil(3*a / hzlen) - 1;
4263  int omega3 = omega * omega * omega;
4264  int nhalf = (int) sqrt((double)ni * nj * nk);
4265  lastnelems = (nhalf > omega3 ? nhalf : omega3);
4266  smallestnbox = ngci * ngcj * ngck; // smaller grids don't reduce work
4267  isclamped = 1;
4268  }
4269  }
4270 #ifdef DEBUG_MSM_VERBOSE
4271  printf("maxlevels = %d\n", maxlevels);
4272 #endif
4273 
4274  // allocate space for storing grid dimensions for each level
4275  map.gridrange.resize(maxlevels);
4276 
4277  // set periodicity flags
4278  map.ispx = ispx;
4279  map.ispy = ispy;
4280  map.ispz = ispz;
4281 
4282  int level = 0;
4283  int done = 0;
4284  int alldone = 0;
4285  do {
4286  map.gridrange[level].setbounds(ia, ib, ja, jb, ka, kb);
4287 
4288  // Msm index?
4289 
4290  if (++level == nlevels) done |= 0x07; // user limit on levels
4291 
4292  if (isclamped) {
4293  int nelems = ni * nj * nk;
4294  if (nelems <= lastnelems) done |= 0x07;
4295  if (nelems <= smallestnbox) done |= 0x07;
4296  }
4297 
4298  alldone = (done == 0x07); // make sure all dimensions are done
4299 
4300  if (ispx) {
4301  ni >>= 1;
4302  ib = ni-1;
4303  if (ni & 1) done |= 0x07; // == 3 or 1
4304  else if (ni == 2) done |= 0x01; // can do one more
4305  }
4306  else {
4307  ia = -((-ia+1)/2) - s_edge;
4308  ib = (ib+1)/2 + s_edge;
4309  ni = ib - ia + 1;
4310  if (ni <= omega) done |= 0x01; // can do more restrictions
4311  }
4312 
4313  if (ispy) {
4314  nj >>= 1;
4315  jb = nj-1;
4316  if (nj & 1) done |= 0x07; // == 3 or 1
4317  else if (nj == 2) done |= 0x02; // can do one more
4318  }
4319  else {
4320  ja = -((-ja+1)/2) - s_edge;
4321  jb = (jb+1)/2 + s_edge;
4322  nj = jb - ja + 1;
4323  if (nj <= omega) done |= 0x02; // can do more restrictions
4324  }
4325 
4326  if (ispz) {
4327  nk >>= 1;
4328  kb = nk-1;
4329  if (nk & 1) done |= 0x07; // == 3 or 1
4330  else if (nk == 2) done |= 0x04; // can do one more
4331  }
4332  else {
4333  ka = -((-ka+1)/2) - s_edge;
4334  kb = (kb+1)/2 + s_edge;
4335  nk = kb - ka + 1;
4336  if (nk <= omega) done |= 0x04; // can do more restrictions
4337  }
4338  } while ( ! alldone );
4339  nlevels = level;
4340 
4341  // for periodic boundaries, don't visit top level (all 0)
4342  // toplevel visited only for all nonperiodic boundaries
4343  int toplevel = (ispany ? nlevels : nlevels - 1);
4344 
4345  // resize down to the actual number of levels (does not change alloc)
4346  map.gridrange.resize(nlevels);
4347 
4348  // print out some information about MSM
4349  if (CkMyPe() == 0) {
4350  iout << iINFO << "MSM using " << nlevels << " levels\n";
4351  for (n = 0; n < nlevels; n++) {
4352  char s[100];
4353  snprintf(s, sizeof(s), " level %d: "
4354  "[%d..%d] x [%d..%d] x [%d..%d]\n", n,
4355  map.gridrange[n].ia(), map.gridrange[n].ib(),
4356  map.gridrange[n].ja(), map.gridrange[n].jb(),
4357  map.gridrange[n].ka(), map.gridrange[n].kb());
4358  iout << iINFO << s;
4359  }
4360  iout << endi;
4361  }
4362 
4363  // find grid spacing basis vectors
4364  hu = hxlen * lattice.a().unit();
4365  hv = hylen * lattice.b().unit();
4366  hw = hzlen * lattice.c().unit();
4367  hufx = Float(hu.x);
4368  hufy = Float(hu.y);
4369  hufz = Float(hu.z);
4370  hvfx = Float(hv.x);
4371  hvfy = Float(hv.y);
4372  hvfz = Float(hv.z);
4373  hwfx = Float(hw.x);
4374  hwfy = Float(hw.y);
4375  hwfz = Float(hw.z);
4376 
4377  ru = lattice.a_r();
4378  rv = lattice.b_r();
4379  rw = lattice.c_r();
4380 
4381  // determine grid spacings in scaled space
4382  shx = ru * hu;
4383  shy = rv * hv;
4384  shz = rw * hw;
4385  shx_1 = 1 / shx;
4386  shy_1 = 1 / shy;
4387  shz_1 = 1 / shz;
4388 
4389  // row vectors to transform interpolated force back to real space
4390  // XXX Is not needed.
4391  sx_shx = shx_1 * Vector(ru.x, rv.x, rw.x);
4392  sy_shy = shy_1 * Vector(ru.y, rv.y, rw.y);
4393  sz_shz = shz_1 * Vector(ru.z, rv.z, rw.z);
4394  srx_x = Float(sx_shx.x);
4395  srx_y = Float(sx_shx.y);
4396  srx_z = Float(sx_shx.z);
4397  sry_x = Float(sy_shy.x);
4398  sry_y = Float(sy_shy.y);
4399  sry_z = Float(sy_shy.z);
4400  srz_x = Float(sz_shz.x);
4401  srz_y = Float(sz_shz.y);
4402  srz_z = Float(sz_shz.z);
4403 
4404  Vector pu = cross(hv, hw);
4405  BigReal s = (hu * pu) / (pu * pu);
4406  pu *= s; // pu is orthogonal projection of hu onto hv CROSS hw
4407 
4408  Vector pv = cross(hw, hu);
4409  s = (hv * pv) / (pv * pv);
4410  pv *= s; // pv is orthogonal projection of hv onto hw CROSS hu
4411 
4412  Vector pw = cross(hu, hv);
4413  s = (hw * pw) / (pw * pw);
4414  pw *= s; // pw is orthogonal projection of hw onto hu CROSS hv
4415 
4416  // radii for parallelepiped of weights enclosing grid cutoff sphere
4417  ni = (int) ceil(2*a / pu.length() ) - 1;
4418  nj = (int) ceil(2*a / pv.length() ) - 1;
4419  nk = (int) ceil(2*a / pw.length() ) - 1;
4420 
4421  Float scaling = 1;
4422  Float scaling_factor = 0.5f;
4423  BigReal a_1 = 1/a;
4424  BigReal a_p = a_1;
4425  if (dispersion) {
4426  a_p = a_p * a_p * a_p; // = 1/a^3
4427  a_p = a_p * a_p; // = 1/a^6
4428  scaling_factor = 1.f/64; // = 1/2^6
4429  }
4430  int i, j, k;
4431  if (approx < C1HERMITE) {
4432  // resize gc and gvc constants for number of levels
4433  map.gc.resize(nlevels);
4434  map.gvc.resize(nlevels);
4435 
4436  for (level = 0; level < toplevel; level++) {
4437  map.gc[level].setbounds(-ni, ni, -nj, nj, -nk, nk);
4438  map.gvc[level].setbounds(-ni, ni, -nj, nj, -nk, nk);
4439 
4440  for (k = -nk; k <= nk; k++) {
4441  for (j = -nj; j <= nj; j++) {
4442  for (i = -ni; i <= ni; i++) {
4443  if (level == 0) {
4444  BigReal s, t, gs=0, gt=0, g=0, dgs=0, dgt=0, dg=0;
4445  BigReal vlen = (i*hu + j*hv + k*hw).length();
4446  s = vlen * a_1;
4447  t = 0.5 * s;
4448  if (t >= 1) {
4449  g = 0;
4450  dg = 0;
4451  }
4452  else {
4453  splitting(gt, dgt, t, split);
4454  if (s >= 1) {
4455  BigReal s_1 = 1/s;
4456  if (dispersion) {
4457  gs = s_1 * s_1 * s_1; // = 1/s^3
4458  gs = gs * gs; // = 1/s^6
4459  dgs = -6 * gs * s_1;
4460  }
4461  else {
4462  gs = s_1;
4463  dgs = -gs * s_1;
4464  }
4465  }
4466  else {
4467  splitting(gs, dgs, s, split);
4468  }
4469  g = (gs - scaling_factor * gt) * a_p;
4470  BigReal c=0;
4471  if (i || j || k) {
4472  c = a_p * a_1 / vlen;
4473  }
4474  dg = 0.5 * (dgs - 0.5*scaling_factor * dgt) * c;
4475 
4476  // Msm index?
4477 
4478  }
4479  map.gc[0](i,j,k) = Float(g);
4480  map.gvc[0](i,j,k) = Float(dg);
4481  } // if level 0
4482  else {
4483  map.gc[level](i,j,k) = scaling * map.gc[0](i,j,k);
4484  map.gvc[level](i,j,k) = scaling * map.gvc[0](i,j,k);
4485  }
4486 
4487  } // for i
4488  } // for j
4489  } // for k
4490  scaling *= scaling_factor;
4491 
4492  } // for level
4493 
4494  // for summed virial factors
4495  gvsum.setbounds(-ni, ni, -nj, nj, -nk, nk);
4496  // make sure final virial sum is initialized to 0
4497  for (i = 0; i < VMAX; i++) { virial[i] = 0; }
4498 
4499  if (toplevel < nlevels) {
4500  // nonperiodic along all basis vector directions
4501  // calculate top level weights where all grid points
4502  // interact with each other
4503  ni = map.gridrange[toplevel].ni();
4504  nj = map.gridrange[toplevel].nj();
4505  nk = map.gridrange[toplevel].nk();
4506  map.gc[toplevel].setbounds(-ni, ni, -nj, nj, -nk, nk);
4507 
4508  // Msm index?
4509 
4510  for (k = -nk; k <= nk; k++) {
4511  for (j = -nj; j <= nj; j++) {
4512  for (i = -ni; i <= ni; i++) {
4513  BigReal s, gs, d;
4514  BigReal vlen = (i*hu + j*hv + k*hw).length();
4515  s = vlen * a_1;
4516  if (s >= 1) {
4517  BigReal s_1 = 1/s;
4518  if (dispersion) {
4519  gs = s_1 * s_1 * s_1; // = 1/s^3
4520  gs = gs * gs; // = 1/s^6
4521  }
4522  else {
4523  gs = s_1;
4524  }
4525  }
4526  else {
4527  splitting(gs, d, s, split);
4528  }
4529  map.gc[toplevel](i,j,k) = scaling * Float(gs * a_p);
4530  } // for i
4531  } // for j
4532  } // for k
4533  } // if toplevel
4534 
4535  // generate grespro stencil
4536  const int nstencil = Nstencil[approx];
4537  const Float *phi = PhiStencil[approx];
4538  map.grespro.set(0, nstencil, 0, nstencil, 0, nstencil);
4539  for (k = 0; k < nstencil; k++) {
4540  for (j = 0; j < nstencil; j++) {
4541  for (i = 0; i < nstencil; i++) {
4542  map.grespro(i,j,k) = phi[i] * phi[j] * phi[k];
4543  }
4544  }
4545  }
4546 
4547  } // end if approx < C1HERMITE
4548  else {
4549  // C1HERMITE
4550  // resize gc_c1hermite constants for number of levels
4551  map.gc_c1hermite.resize(nlevels);
4552  scaling = 1;
4553 
4554  for (level = 0; level < toplevel; level++) {
4555 
4556  Vector hmu = scaling * hu;
4557  Vector hmv = scaling * hv;
4558  Vector hmw = scaling * hw;
4559  BigReal am = scaling * a;
4560 
4561  map.gc_c1hermite[level].setbounds(-ni, ni, -nj, nj, -nk, nk);
4562 
4563  for (k = -nk; k <= nk; k++) {
4564  for (j = -nj; j <= nj; j++) {
4565  for (i = -ni; i <= ni; i++) {
4566  C1Matrix& m = map.gc_c1hermite[level](i,j,k);
4567  Vector rv = i*hmu + j*hmv + k*hmw;
4568  BigReal r2 = rv * rv;
4569  m.set(0);
4570  if (r2 < 4*am*am) {
4571  // accumulate D( g_{a}(0,r) ) term for this level
4572  gc_c1hermite_elem_accum(m, 1, rv, am, split);
4573  // accumulate D( -g_{2a}(0,r) ) term for this level
4574  gc_c1hermite_elem_accum(m, -1, rv, 2*am, split);
4575  } // if within cutoff
4576  }
4577  }
4578  } // end loop over gc_c1hermite elements for this level
4579  scaling *= 2; // double grid spacing and cutoff at each iteration
4580 
4581  } // end loop over levels
4582 
4583  if (toplevel < nlevels) {
4584  Vector hmu = scaling * hu;
4585  Vector hmv = scaling * hv;
4586  Vector hmw = scaling * hw;
4587  BigReal am = scaling * a;
4588 
4589  // nonperiodic along all basis vector directions
4590  // calculate top level weights where all grid points
4591  // interact with each other
4592  ni = map.gridrange[toplevel].ni();
4593  nj = map.gridrange[toplevel].nj();
4594  nk = map.gridrange[toplevel].nk();
4595  map.gc_c1hermite[toplevel].setbounds(-ni, ni, -nj, nj, -nk, nk);
4596 
4597  for (k = -nk; k <= nk; k++) {
4598  for (j = -nj; j <= nj; j++) {
4599  for (i = -ni; i <= ni; i++) {
4600  C1Matrix& m = map.gc_c1hermite[level](i,j,k);
4601  Vector rv = i*hmu + j*hmv + k*hmw;
4602  m.set(0);
4603  // accumulate D( g_{a}(0,r) ) term for this level
4604  gc_c1hermite_elem_accum(m, 1, rv, am, split);
4605  }
4606  }
4607  } // end loop over gc_c1hermite elements for top level
4608 
4609  } // end if top level
4610 
4611  // C1 Hermite restriction and prolongation stencils
4612  map.gres_c1hermite.resize(nlevels-1);
4613  map.gpro_c1hermite.resize(nlevels-1);
4614 
4615  enum {
4616  ND = 3, // stencil diameter
4617  NR = ND/2 // stencil radius
4618  };
4619 
4620  // the master basis functions PHI0 and PHI1 for the 3-point stencil
4621  // and their derivatives DPHI0 and DPHI1
4622  const double PHI0[ND] = { 0.5, 1, 0.5 };
4623  const double DPHI0[ND] = { 1.5, 0, -1.5 };
4624  const double PHI1[ND] = { -0.125, 0, 0.125 };
4625  const double DPHI1[ND] = { -0.25, 1, -0.25 };
4626 
4627  // for intermediate calculations
4628  double xphi0_base_array[ND];
4629  double dxphi0_base_array[ND];
4630  double yphi0_base_array[ND];
4631  double dyphi0_base_array[ND];
4632  double zphi0_base_array[ND];
4633  double dzphi0_base_array[ND];
4634  double xphi1_base_array[ND];
4635  double dxphi1_base_array[ND];
4636  double yphi1_base_array[ND];
4637  double dyphi1_base_array[ND];
4638  double zphi1_base_array[ND];
4639  double dzphi1_base_array[ND];
4640  // will point to center of stencil arrays
4641  double *xphi0, *dxphi0, *xphi1, *dxphi1;
4642  double *yphi0, *dyphi0, *yphi1, *dyphi1;
4643  double *zphi0, *dzphi0, *zphi1, *dzphi1;
4644 
4645  for (n = 0; n < ND; n++) {
4646  xphi0_base_array[n] = PHI0[n];
4647  dxphi0_base_array[n] = hxlen_1 * DPHI0[n]; // scale by grid spacing
4648  xphi1_base_array[n] = hxlen * PHI1[n]; // scale by grid spacing
4649  dxphi1_base_array[n] = DPHI1[n];
4650  yphi0_base_array[n] = PHI0[n];
4651  dyphi0_base_array[n] = hylen_1 * DPHI0[n]; // scale by grid spacing
4652  yphi1_base_array[n] = hylen * PHI1[n]; // scale by grid spacing
4653  dyphi1_base_array[n] = DPHI1[n];
4654  zphi0_base_array[n] = PHI0[n];
4655  dzphi0_base_array[n] = hzlen_1 * DPHI0[n]; // scale by grid spacing
4656  zphi1_base_array[n] = hzlen * PHI1[n]; // scale by grid spacing
4657  dzphi1_base_array[n] = DPHI1[n];
4658  }
4659  xphi0 = xphi0_base_array + NR; // point into center of arrays
4660  dxphi0 = dxphi0_base_array + NR;
4661  xphi1 = xphi1_base_array + NR;
4662  dxphi1 = dxphi1_base_array + NR;
4663  yphi0 = yphi0_base_array + NR;
4664  dyphi0 = dyphi0_base_array + NR;
4665  yphi1 = yphi1_base_array + NR;
4666  dyphi1 = dyphi1_base_array + NR;
4667  zphi0 = zphi0_base_array + NR;
4668  dzphi0 = dzphi0_base_array + NR;
4669  zphi1 = zphi1_base_array + NR;
4670  dzphi1 = dzphi1_base_array + NR;
4671 
4672  for (level = 0; level < nlevels-1; level++) {
4673  // allocate space for restriction and prolongation stencils
4674  map.gres_c1hermite[level].set(0, ND, 0, ND, 0, ND);
4675  map.gpro_c1hermite[level].set(0, ND, 0, ND, 0, ND);
4676 
4677  // scale up to next level grid spacing
4678  //
4679  // have to determine for each dimension whether or not
4680  // a periodic grid spacing has increased
4681  // (equivalent to if there are fewer grid points)
4682  for (n = -NR; n <= NR; n++) {
4683  if ( ! ispx ||
4684  map.gridrange[level+1].ni() < map.gridrange[level].ni() ) {
4685  dxphi0[n] *= 0.5;
4686  xphi1[n] *= 2;
4687  }
4688  if ( ! ispy ||
4689  map.gridrange[level+1].nj() < map.gridrange[level].nj() ) {
4690  dyphi0[n] *= 0.5;
4691  yphi1[n] *= 2;
4692  }
4693  if ( ! ispz ||
4694  map.gridrange[level+1].nk() < map.gridrange[level].nk() ) {
4695  dzphi0[n] *= 0.5;
4696  zphi1[n] *= 2;
4697  }
4698  }
4699 
4700  // loop over restriction stencil matrices
4701  // calculate from partial derivatives
4702  for (k = -NR; k <= NR; k++) {
4703  for (j = -NR; j <= NR; j++) {
4704  for (i = -NR; i <= NR; i++) {
4705  Float *t = map.gres_c1hermite[level](i+NR,j+NR,k+NR).melem;
4706 
4707  t[C1INDEX(D000,D000)] = xphi0[i] * yphi0[j] * zphi0[k];
4708  t[C1INDEX(D000,D100)] = dxphi0[i] * yphi0[j] * zphi0[k];
4709  t[C1INDEX(D000,D010)] = xphi0[i] * dyphi0[j] * zphi0[k];
4710  t[C1INDEX(D000,D001)] = xphi0[i] * yphi0[j] * dzphi0[k];
4711  t[C1INDEX(D000,D110)] = dxphi0[i] * dyphi0[j] * zphi0[k];
4712  t[C1INDEX(D000,D101)] = dxphi0[i] * yphi0[j] * dzphi0[k];
4713  t[C1INDEX(D000,D011)] = xphi0[i] * dyphi0[j] * dzphi0[k];
4714  t[C1INDEX(D000,D111)] = dxphi0[i] * dyphi0[j] * dzphi0[k];
4715 
4716  t[C1INDEX(D100,D000)] = xphi1[i] * yphi0[j] * zphi0[k];
4717  t[C1INDEX(D100,D100)] = dxphi1[i] * yphi0[j] * zphi0[k];
4718  t[C1INDEX(D100,D010)] = xphi1[i] * dyphi0[j] * zphi0[k];
4719  t[C1INDEX(D100,D001)] = xphi1[i] * yphi0[j] * dzphi0[k];
4720  t[C1INDEX(D100,D110)] = dxphi1[i] * dyphi0[j] * zphi0[k];
4721  t[C1INDEX(D100,D101)] = dxphi1[i] * yphi0[j] * dzphi0[k];
4722  t[C1INDEX(D100,D011)] = xphi1[i] * dyphi0[j] * dzphi0[k];
4723  t[C1INDEX(D100,D111)] = dxphi1[i] * dyphi0[j] * dzphi0[k];
4724 
4725  t[C1INDEX(D010,D000)] = xphi0[i] * yphi1[j] * zphi0[k];
4726  t[C1INDEX(D010,D100)] = dxphi0[i] * yphi1[j] * zphi0[k];
4727  t[C1INDEX(D010,D010)] = xphi0[i] * dyphi1[j] * zphi0[k];
4728  t[C1INDEX(D010,D001)] = xphi0[i] * yphi1[j] * dzphi0[k];
4729  t[C1INDEX(D010,D110)] = dxphi0[i] * dyphi1[j] * zphi0[k];
4730  t[C1INDEX(D010,D101)] = dxphi0[i] * yphi1[j] * dzphi0[k];
4731  t[C1INDEX(D010,D011)] = xphi0[i] * dyphi1[j] * dzphi0[k];
4732  t[C1INDEX(D010,D111)] = dxphi0[i] * dyphi1[j] * dzphi0[k];
4733 
4734  t[C1INDEX(D001,D000)] = xphi0[i] * yphi0[j] * zphi1[k];
4735  t[C1INDEX(D001,D100)] = dxphi0[i] * yphi0[j] * zphi1[k];
4736  t[C1INDEX(D001,D010)] = xphi0[i] * dyphi0[j] * zphi1[k];
4737  t[C1INDEX(D001,D001)] = xphi0[i] * yphi0[j] * dzphi1[k];
4738  t[C1INDEX(D001,D110)] = dxphi0[i] * dyphi0[j] * zphi1[k];
4739  t[C1INDEX(D001,D101)] = dxphi0[i] * yphi0[j] * dzphi1[k];
4740  t[C1INDEX(D001,D011)] = xphi0[i] * dyphi0[j] * dzphi1[k];
4741  t[C1INDEX(D001,D111)] = dxphi0[i] * dyphi0[j] * dzphi1[k];
4742 
4743  t[C1INDEX(D110,D000)] = xphi1[i] * yphi1[j] * zphi0[k];
4744  t[C1INDEX(D110,D100)] = dxphi1[i] * yphi1[j] * zphi0[k];
4745  t[C1INDEX(D110,D010)] = xphi1[i] * dyphi1[j] * zphi0[k];
4746  t[C1INDEX(D110,D001)] = xphi1[i] * yphi1[j] * dzphi0[k];
4747  t[C1INDEX(D110,D110)] = dxphi1[i] * dyphi1[j] * zphi0[k];
4748  t[C1INDEX(D110,D101)] = dxphi1[i] * yphi1[j] * dzphi0[k];
4749  t[C1INDEX(D110,D011)] = xphi1[i] * dyphi1[j] * dzphi0[k];
4750  t[C1INDEX(D110,D111)] = dxphi1[i] * dyphi1[j] * dzphi0[k];
4751 
4752  t[C1INDEX(D101,D000)] = xphi1[i] * yphi0[j] * zphi1[k];
4753  t[C1INDEX(D101,D100)] = dxphi1[i] * yphi0[j] * zphi1[k];
4754  t[C1INDEX(D101,D010)] = xphi1[i] * dyphi0[j] * zphi1[k];
4755  t[C1INDEX(D101,D001)] = xphi1[i] * yphi0[j] * dzphi1[k];
4756  t[C1INDEX(D101,D110)] = dxphi1[i] * dyphi0[j] * zphi1[k];
4757  t[C1INDEX(D101,D101)] = dxphi1[i] * yphi0[j] * dzphi1[k];
4758  t[C1INDEX(D101,D011)] = xphi1[i] * dyphi0[j] * dzphi1[k];
4759  t[C1INDEX(D101,D111)] = dxphi1[i] * dyphi0[j] * dzphi1[k];
4760 
4761  t[C1INDEX(D011,D000)] = xphi0[i] * yphi1[j] * zphi1[k];
4762  t[C1INDEX(D011,D100)] = dxphi0[i] * yphi1[j] * zphi1[k];
4763  t[C1INDEX(D011,D010)] = xphi0[i] * dyphi1[j] * zphi1[k];
4764  t[C1INDEX(D011,D001)] = xphi0[i] * yphi1[j] * dzphi1[k];
4765  t[C1INDEX(D011,D110)] = dxphi0[i] * dyphi1[j] * zphi1[k];
4766  t[C1INDEX(D011,D101)] = dxphi0[i] * yphi1[j] * dzphi1[k];
4767  t[C1INDEX(D011,D011)] = xphi0[i] * dyphi1[j] * dzphi1[k];
4768  t[C1INDEX(D011,D111)] = dxphi0[i] * dyphi1[j] * dzphi1[k];
4769 
4770  t[C1INDEX(D111,D000)] = xphi1[i] * yphi1[j] * zphi1[k];
4771  t[C1INDEX(D111,D100)] = dxphi1[i] * yphi1[j] * zphi1[k];
4772  t[C1INDEX(D111,D010)] = xphi1[i] * dyphi1[j] * zphi1[k];
4773  t[C1INDEX(D111,D001)] = xphi1[i] * yphi1[j] * dzphi1[k];
4774  t[C1INDEX(D111,D110)] = dxphi1[i] * dyphi1[j] * zphi1[k];
4775  t[C1INDEX(D111,D101)] = dxphi1[i] * yphi1[j] * dzphi1[k];
4776  t[C1INDEX(D111,D011)] = xphi1[i] * dyphi1[j] * dzphi1[k];
4777  t[C1INDEX(D111,D111)] = dxphi1[i] * dyphi1[j] * dzphi1[k];
4778  }
4779  }
4780  } // end loops over restriction stencil matrices
4781 
4782  // loop over prolongation stencil matrices
4783  // prolongation stencil matrices are the transpose of restriction
4784  for (k = -NR; k <= NR; k++) {
4785  for (j = -NR; j <= NR; j++) {
4786  for (i = -NR; i <= NR; i++) {
4787  Float *t = map.gres_c1hermite[level](i+NR,j+NR,k+NR).melem;
4788  Float *tt = map.gpro_c1hermite[level](i+NR,j+NR,k+NR).melem;
4789 
4790  tt[C1INDEX(D000,D000)] = t[C1INDEX(D000,D000)];
4791  tt[C1INDEX(D000,D100)] = t[C1INDEX(D100,D000)];
4792  tt[C1INDEX(D000,D010)] = t[C1INDEX(D010,D000)];
4793  tt[C1INDEX(D000,D001)] = t[C1INDEX(D001,D000)];
4794  tt[C1INDEX(D000,D110)] = t[C1INDEX(D110,D000)];
4795  tt[C1INDEX(D000,D101)] = t[C1INDEX(D101,D000)];
4796  tt[C1INDEX(D000,D011)] = t[C1INDEX(D011,D000)];
4797  tt[C1INDEX(D000,D111)] = t[C1INDEX(D111,D000)];
4798 
4799  tt[C1INDEX(D100,D000)] = t[C1INDEX(D000,D100)];
4800  tt[C1INDEX(D100,D100)] = t[C1INDEX(D100,D100)];
4801  tt[C1INDEX(D100,D010)] = t[C1INDEX(D010,D100)];
4802  tt[C1INDEX(D100,D001)] = t[C1INDEX(D001,D100)];
4803  tt[C1INDEX(D100,D110)] = t[C1INDEX(D110,D100)];
4804  tt[C1INDEX(D100,D101)] = t[C1INDEX(D101,D100)];
4805  tt[C1INDEX(D100,D011)] = t[C1INDEX(D011,D100)];
4806  tt[C1INDEX(D100,D111)] = t[C1INDEX(D111,D100)];
4807 
4808  tt[C1INDEX(D010,D000)] = t[C1INDEX(D000,D010)];
4809  tt[C1INDEX(D010,D100)] = t[C1INDEX(D100,D010)];
4810  tt[C1INDEX(D010,D010)] = t[C1INDEX(D010,D010)];
4811  tt[C1INDEX(D010,D001)] = t[C1INDEX(D001,D010)];
4812  tt[C1INDEX(D010,D110)] = t[C1INDEX(D110,D010)];
4813  tt[C1INDEX(D010,D101)] = t[C1INDEX(D101,D010)];
4814  tt[C1INDEX(D010,D011)] = t[C1INDEX(D011,D010)];
4815  tt[C1INDEX(D010,D111)] = t[C1INDEX(D111,D010)];
4816 
4817  tt[C1INDEX(D001,D000)] = t[C1INDEX(D000,D001)];
4818  tt[C1INDEX(D001,D100)] = t[C1INDEX(D100,D001)];
4819  tt[C1INDEX(D001,D010)] = t[C1INDEX(D010,D001)];
4820  tt[C1INDEX(D001,D001)] = t[C1INDEX(D001,D001)];
4821  tt[C1INDEX(D001,D110)] = t[C1INDEX(D110,D001)];
4822  tt[C1INDEX(D001,D101)] = t[C1INDEX(D101,D001)];
4823  tt[C1INDEX(D001,D011)] = t[C1INDEX(D011,D001)];
4824  tt[C1INDEX(D001,D111)] = t[C1INDEX(D111,D001)];
4825 
4826  tt[C1INDEX(D110,D000)] = t[C1INDEX(D000,D110)];
4827  tt[C1INDEX(D110,D100)] = t[C1INDEX(D100,D110)];
4828  tt[C1INDEX(D110,D010)] = t[C1INDEX(D010,D110)];
4829  tt[C1INDEX(D110,D001)] = t[C1INDEX(D001,D110)];
4830  tt[C1INDEX(D110,D110)] = t[C1INDEX(D110,D110)];
4831  tt[C1INDEX(D110,D101)] = t[C1INDEX(D101,D110)];
4832  tt[C1INDEX(D110,D011)] = t[C1INDEX(D011,D110)];
4833  tt[C1INDEX(D110,D111)] = t[C1INDEX(D111,D110)];
4834 
4835  tt[C1INDEX(D101,D000)] = t[C1INDEX(D000,D101)];
4836  tt[C1INDEX(D101,D100)] = t[C1INDEX(D100,D101)];
4837  tt[C1INDEX(D101,D010)] = t[C1INDEX(D010,D101)];
4838  tt[C1INDEX(D101,D001)] = t[C1INDEX(D001,D101)];
4839  tt[C1INDEX(D101,D110)] = t[C1INDEX(D110,D101)];
4840  tt[C1INDEX(D101,D101)] = t[C1INDEX(D101,D101)];
4841  tt[C1INDEX(D101,D011)] = t[C1INDEX(D011,D101)];
4842  tt[C1INDEX(D101,D111)] = t[C1INDEX(D111,D101)];
4843 
4844  tt[C1INDEX(D011,D000)] = t[C1INDEX(D000,D011)];
4845  tt[C1INDEX(D011,D100)] = t[C1INDEX(D100,D011)];
4846  tt[C1INDEX(D011,D010)] = t[C1INDEX(D010,D011)];
4847  tt[C1INDEX(D011,D001)] = t[C1INDEX(D001,D011)];
4848  tt[C1INDEX(D011,D110)] = t[C1INDEX(D110,D011)];
4849  tt[C1INDEX(D011,D101)] = t[C1INDEX(D101,D011)];
4850  tt[C1INDEX(D011,D011)] = t[C1INDEX(D011,D011)];
4851  tt[C1INDEX(D011,D111)] = t[C1INDEX(D111,D011)];
4852 
4853  tt[C1INDEX(D111,D000)] = t[C1INDEX(D000,D111)];
4854  tt[C1INDEX(D111,D100)] = t[C1INDEX(D100,D111)];
4855  tt[C1INDEX(D111,D010)] = t[C1INDEX(D010,D111)];
4856  tt[C1INDEX(D111,D001)] = t[C1INDEX(D001,D111)];
4857  tt[C1INDEX(D111,D110)] = t[C1INDEX(D110,D111)];
4858  tt[C1INDEX(D111,D101)] = t[C1INDEX(D101,D111)];
4859  tt[C1INDEX(D111,D011)] = t[C1INDEX(D011,D111)];
4860  tt[C1INDEX(D111,D111)] = t[C1INDEX(D111,D111)];
4861  }
4862  }
4863  } // end loops over prolongation stencil matrices
4864 
4865  } // end loop over levels
4866 
4867  } // end if C1HERMITE
4868 
4869  // calculate self energy factor for splitting
4870  BigReal gs=0, d=0;
4871  splitting(gs, d, 0, split);
4872  gzero = gs * a_p;
4873 
4874  if (CkMyPe() == 0) {
4875  iout << iINFO << "MSM finished calculating stencils\n" << endi;
4876  }
4877 
4878  // allocate map for patches
4879  PatchMap *pm = PatchMap::Object();
4880  int numpatches = pm->numPatches();
4881  map.patchList.resize(numpatches);
4882 #ifdef DEBUG_MSM_VERBOSE
4883  printf("numPatches = %d\n", numpatches);
4884 #endif
4885 
4886  // allocate map for blocks for each grid level
4887  map.blockLevel.resize(nlevels);
4888  map.bsx.resize(nlevels);
4889  map.bsy.resize(nlevels);
4890  map.bsz.resize(nlevels);
4891 #ifdef MSM_FOLD_FACTOR
4892  map.foldfactor.resize(nlevels);
4893 #endif
4894  for (level = 0; level < nlevels; level++) {
4895  msm::IndexRange& g = map.gridrange[level];
4897  int gia = g.ia();
4898  int gni = g.ni();
4899  int gja = g.ja();
4900  int gnj = g.nj();
4901  int gka = g.ka();
4902  int gnk = g.nk();
4903  map.bsx[level] = bsx;
4904  map.bsy[level] = bsy;
4905  map.bsz[level] = bsz;
4906  if (/* map.bsx[level] < gni ||
4907  map.bsy[level] < gnj ||
4908  map.bsz[level] < gnk */ 1) {
4909  // make sure that block sizes divide evenly into periodic dimensions
4910  if (ispx) setup_periodic_blocksize(map.bsx[level], gni);
4911  if (ispy) setup_periodic_blocksize(map.bsy[level], gnj);
4912  if (ispz) setup_periodic_blocksize(map.bsz[level], gnk);
4913 #ifdef MSM_DEBUG_VERBOSE
4914  if (CkMyPe() == 0) {
4915  printf("level = %d\n map.bs* = %d %d %d gn* = %d %d %d\n",
4916  level, map.bsx[level], map.bsy[level], map.bsz[level],gni,gnj,gnk);
4917  }
4918 #endif
4919  // subdivide grid into multiple blocks
4920  // == ceil(gni / bsx), etc.
4921  int bni = (gni / map.bsx[level]) + (gni % map.bsx[level] != 0);
4922  int bnj = (gnj / map.bsy[level]) + (gnj % map.bsy[level] != 0);
4923  int bnk = (gnk / map.bsz[level]) + (gnk % map.bsz[level] != 0);
4924 #ifdef MSM_FOLD_FACTOR
4925  if (/* level > 2 && */ (bni == 1 || bnj == 1 || bnk == 1)) {
4926  map.foldfactor[level].set(bsx / gni, bsy / gnj, bsz / gnk);
4927 #if 0
4928  if (CkMyPe() == 0) {
4929  printf("Setting MSM FoldFactor level %d: %d %d %d\n",
4930  level, bsx / gni, bsy / gnj, bsz / gnk);
4931  }
4932 #endif
4933  }
4934 #endif
4935  b.set(0, bni, 0, bnj, 0, bnk);
4936  for (k = 0; k < bnk; k++) {
4937  for (j = 0; j < bnj; j++) {
4938  for (i = 0; i < bni; i++) {
4939  b(i,j,k).reset();
4940  int ia = gia + i*map.bsx[level];
4941  int ib = ia + map.bsx[level] - 1;
4942  int ja = gja + j*map.bsy[level];
4943  int jb = ja + map.bsy[level] - 1;
4944  int ka = gka + k*map.bsz[level];
4945  int kb = ka + map.bsz[level] - 1;
4946  if (ib >= gia + gni) ib = gia + gni - 1;
4947  if (jb >= gja + gnj) jb = gja + gnj - 1;
4948  if (kb >= gka + gnk) kb = gka + gnk - 1;
4949  b(i,j,k).nrange.setbounds(ia, ib, ja, jb, ka, kb);
4950  }
4951  }
4952  }
4953  }
4954  /*
4955  else {
4956  // make entire grid into single block
4957  b.set(0, 1, 0, 1, 0, 1);
4958  b(0,0,0).reset();
4959  b(0,0,0).nrange.set(gia, gni, gja, gnj, gka, gnk);
4960  // set correct block dimensions
4961  map.bsx[level] = gni;
4962  map.bsy[level] = gnj;
4963  map.bsz[level] = gnk;
4964  }
4965  */
4966  }
4967  //CkExit();
4968 #ifdef DEBUG_MSM_VERBOSE
4969  printf("Done allocating map for grid levels\n");
4970  printf("Grid level decomposition:\n");
4971  for (level = 0; level < nlevels; level++) {
4973  int bia = b.ia();
4974  int bib = b.ib();
4975  int bja = b.ja();
4976  int bjb = b.jb();
4977  int bka = b.ka();
4978  int bkb = b.kb();
4979  for (k = bka; k <= bkb; k++) {
4980  for (j = bja; j <= bjb; j++) {
4981  for (i = bia; i <= bib; i++) {
4982  int ia = b(i,j,k).nrange.ia();
4983  int ib = b(i,j,k).nrange.ib();
4984  int ja = b(i,j,k).nrange.ja();
4985  int jb = b(i,j,k).nrange.jb();
4986  int ka = b(i,j,k).nrange.ka();
4987  int kb = b(i,j,k).nrange.kb();
4988  printf("level=%d id=%d %d %d [%d..%d] x [%d..%d] x [%d..%d]"
4989  " --> %d\n",
4990  level, i, j, k, ia, ib, ja, jb, ka, kb,
4991  b(i,j,k).nrange.nn());
4992  }
4993  }
4994  }
4995  }
4996 #endif
4997  if (CkMyPe() == 0) {
4998  iout << iINFO << "MSM finished creating map for grid levels\n" << endi;
4999  }
5000 
5001  initialize2();
5002 } // ComputeMsmMgr::initialize()
5003 
5004 
5005 void ComputeMsmMgr::initialize2()
5006 {
5008  PatchMap *pm = PatchMap::Object();
5009  int numpatches = pm->numPatches();
5010  int i, j, k, n, level;
5011 
5012  // initialize grid of PatchDiagram
5013  // a = cutoff
5014  BigReal sysdima = lattice.a_r().unit() * lattice.a();
5015  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
5016  BigReal sysdimc = lattice.c_r().unit() * lattice.c();
5017  BigReal patchdim = simParams->patchDimension;
5018  BigReal xmargin = 0.5 * (patchdim - a) / sysdima;
5019  BigReal ymargin = 0.5 * (patchdim - a) / sysdimb;
5020  BigReal zmargin = 0.5 * (patchdim - a) / sysdimc;
5021 #if 0
5022  // set min and max grid indices for patch covering
5023  // for non-periodic boundaries they conform to grid
5024  // periodic permits wrapping, so set to min/max for int
5025  int ia_min = (lattice.a_p() ? INT_MIN : map.gridrange[0].ia());
5026  int ib_max = (lattice.a_p() ? INT_MAX : map.gridrange[0].ib());
5027  int ja_min = (lattice.b_p() ? INT_MIN : map.gridrange[0].ja());
5028  int jb_max = (lattice.b_p() ? INT_MAX : map.gridrange[0].jb());
5029  int ka_min = (lattice.c_p() ? INT_MIN : map.gridrange[0].ka());
5030  int kb_max = (lattice.c_p() ? INT_MAX : map.gridrange[0].kb());
5031 #endif
5032  int pid;
5033  for (pid = 0; pid < numpatches; pid++) {
5034  // shortcut reference to this patch diagram
5035  msm::PatchDiagram& p = map.patchList[pid];
5036  p.reset();
5037  // find extent of patch including margin
5038  BigReal xmin = pm->min_a(pid) - xmargin;
5039  BigReal xmax = pm->max_a(pid) + xmargin;
5040  BigReal ymin = pm->min_b(pid) - ymargin;
5041  BigReal ymax = pm->max_b(pid) + ymargin;
5042  BigReal zmin = pm->min_c(pid) - zmargin;
5043  BigReal zmax = pm->max_c(pid) + zmargin;
5044  // find grid point covering of patch plus outer edge stencil
5045  int ia = int(floor((xmin - sglower.x) * shx_1)) - s_edge;
5046  int ib = int(floor((xmax - sglower.x) * shx_1)) + 1 + s_edge;
5047  int ja = int(floor((ymin - sglower.y) * shy_1)) - s_edge;
5048  int jb = int(floor((ymax - sglower.y) * shy_1)) + 1 + s_edge;
5049  int ka = int(floor((zmin - sglower.z) * shz_1)) - s_edge;
5050  int kb = int(floor((zmax - sglower.z) * shz_1)) + 1 + s_edge;
5051  // for edge patches along non-periodic boundaries
5052  // clamp subgrid to full grid boundaries
5053  if ( ! lattice.a_p() ) { // non-periodic along lattice basis vector a
5054  int mi = pm->index_a(pid);
5055  if (mi == 0) ia = map.gridrange[0].ia();
5056  if (mi == pm->gridsize_a()-1) ib = map.gridrange[0].ib();
5057  }
5058  if ( ! lattice.b_p() ) { // non-periodic along lattice basis vector b
5059  int mj = pm->index_b(pid);
5060  if (mj == 0) ja = map.gridrange[0].ja();
5061  if (mj == pm->gridsize_b()-1) jb = map.gridrange[0].jb();
5062  }
5063  if ( ! lattice.c_p() ) { // non-periodic along lattice basis vector c
5064  int mk = pm->index_c(pid);
5065  if (mk == 0) ka = map.gridrange[0].ka();
5066  if (mk == pm->gridsize_c()-1) kb = map.gridrange[0].kb();
5067  }
5068 #if 0
5069  // truncate subgrid covering to grid min/max
5070  // so that subgrid does not extend beyond full grid
5071  // works for both periodic and non-periodic boundary conditions
5072  if (ia < ia_min) ia = ia_min;
5073  if (ib > ib_max) ib = ib_max;
5074  if (ja < ja_min) ja = ja_min;
5075  if (jb > jb_max) jb = jb_max;
5076  if (ka < ka_min) ka = ka_min;
5077  if (kb > kb_max) kb = kb_max;
5078  // check for edge patch and extend subgrid to grid min/max
5079  // so that subgrid fully covers up to the edge of full grid
5080  int mi = pm->index_a(pid);
5081  int mj = pm->index_b(pid);
5082  int mk = pm->index_c(pid);
5083  int npi = pm->gridsize_a();
5084  int npj = pm->gridsize_b();
5085  int npk = pm->gridsize_c();
5086  if (mi == 0) ia = ia_min;
5087  if (mi == npi-1) ib = ib_max;
5088  if (mj == 0) ja = ja_min;
5089  if (mj == npj-1) jb = jb_max;
5090  if (mk == 0) ka = ka_min;
5091  if (mk == npk-1) kb = kb_max;
5092 #endif
5093 #if 0
5094  printf("patch %d: grid covering: [%d..%d] x [%d..%d] x [%d..%d]\n",
5095  pid, ia, ib, ja, jb, ka, kb);
5096  fflush(stdout);
5097 #endif
5098  // set the index range for this patch's surrounding grid points
5099  p.nrange.setbounds(ia,ib,ja,jb,ka,kb);
5100  // find lower and upper blocks of MSM h-grid
5101  msm::BlockIndex blower = map.blockOfGridIndex(msm::Ivec(ia,ja,ka),0);
5102  msm::BlockIndex bupper = map.blockOfGridIndex(msm::Ivec(ib,jb,kb),0);
5103  int maxarrlen = (bupper.n.i - blower.n.i + 1) *
5104  (bupper.n.j - blower.n.j + 1) * (bupper.n.k - blower.n.k + 1);
5105  p.send.setmax(maxarrlen); // allocate space for send array
5106  // loop over the blocks
5107 #if 0
5108  printf("blower: level=%d n=%d %d %d bupper: level=%d n=%d %d %d\n",
5109  blower.level, blower.n.i, blower.n.j, blower.n.k,
5110  bupper.level, bupper.n.i, bupper.n.j, bupper.n.k);
5111  fflush(stdout);
5112 #endif
5113  for (int kk = blower.n.k; kk <= bupper.n.k; kk++) {
5114  for (int jj = blower.n.j; jj <= bupper.n.j; jj++) {
5115  for (int ii = blower.n.i; ii <= bupper.n.i; ii++) {
5116 #if 0
5117  printf("ii=%d jj=%d kk=%d\n", ii, jj, kk);
5118  fflush(stdout);
5119 #endif
5120  // determine actual block and range to send to
5121  msm::BlockSend bs;
5122  bs.nblock.n = msm::Ivec(ii,jj,kk);
5123  bs.nblock.level = 0;
5125  map.wrapBlockSend(bs); // determine wrapping to true block index
5126  p.send.append(bs); // append this block to the send array
5127  // increment counter for receive block
5128  map.blockLevel[0](bs.nblock_wrap.n).numRecvsCharge++;
5129  // initialize patch send back from this block
5130  msm::PatchSend ps;
5131  ps.nrange = bs.nrange_wrap;
5132  ps.nrange_unwrap = bs.nrange;
5133  ps.patchID = pid;
5134  map.blockLevel[0](bs.nblock_wrap.n).sendPatch.append(ps);
5135  // increment number of receives back to this patch
5136  p.numRecvs++;
5137  }
5138  }
5139  }
5140  // number of receives should be same as number of sends
5141  ASSERT(p.numRecvs == p.send.len() );
5142  }
5143 #ifdef DEBUG_MSM_VERBOSE
5144 if (CkMyPe() == 0) {
5145  printf("Done allocating map for patches\n");
5146  printf("Patch level decomposition:\n");
5147  for (pid = 0; pid < numpatches; pid++) {
5148  msm::PatchDiagram& p = map.patchList[pid];
5149  int ia = p.nrange.ia();
5150  int ib = p.nrange.ib();
5151  int ja = p.nrange.ja();
5152  int jb = p.nrange.jb();
5153  int ka = p.nrange.ka();
5154  int kb = p.nrange.kb();
5155  printf("patch id=%d [%d..%d] x [%d..%d] x [%d..%d]\n",
5156  pid, ia, ib, ja, jb, ka, kb);
5157  }
5158 }
5159 #endif
5160  if (CkMyPe() == 0) {
5161  iout << iINFO << "MSM finished creating map for patches\n" << endi;
5162  }
5163 
5164  // initialize grid of BlockDiagram for each level
5165  int polydeg = PolyDegree[approx];
5166  numGridCutoff = 0;
5167  for (level = 0; level < nlevels; level++) {
5169  int bni = b.ni();
5170  int bnj = b.nj();
5171  int bnk = b.nk();
5172 #ifdef MSM_SKIP_BEYOND_SPHERE
5173  int gia, gib, gja, gjb, gka, gkb;
5174  if (approx == C1HERMITE) {
5175  gia = map.gc_c1hermite[level].ia();
5176  gib = map.gc_c1hermite[level].ib();
5177  gja = map.gc_c1hermite[level].ja();
5178  gjb = map.gc_c1hermite[level].jb();
5179  gka = map.gc_c1hermite[level].ka();
5180  gkb = map.gc_c1hermite[level].kb();
5181  }
5182  else {
5183  gia = map.gc[level].ia();
5184  gib = map.gc[level].ib();
5185  gja = map.gc[level].ja();
5186  gjb = map.gc[level].jb();
5187  gka = map.gc[level].ka();
5188  gkb = map.gc[level].kb();
5189  }
5190 #endif
5191 #ifdef MSM_SKIP_TOO_DISTANT_BLOCKS
5192  int bsx = map.bsx[level];
5193  int bsy = map.bsy[level];
5194  int bsz = map.bsz[level];
5195 #endif
5196 #ifdef MSM_FOLD_FACTOR
5197  if (map.foldfactor[level].active) {
5198  bsx *= map.foldfactor[level].numrep.i;
5199  bsy *= map.foldfactor[level].numrep.j;
5200  bsz *= map.foldfactor[level].numrep.k;
5201  }
5202 #endif
5203  for (k = 0; k < bnk; k++) {
5204  for (j = 0; j < bnj; j++) {
5205  for (i = 0; i < bni; i++) {
5206 
5207  // Grid cutoff calculation, sendAcross
5208  int ia = b(i,j,k).nrange.ia();
5209  int ib = b(i,j,k).nrange.ib();
5210  int ja = b(i,j,k).nrange.ja();
5211  int jb = b(i,j,k).nrange.jb();
5212  int ka = b(i,j,k).nrange.ka();
5213  int kb = b(i,j,k).nrange.kb();
5214  if (approx == C1HERMITE) {
5215  ia += map.gc_c1hermite[level].ia();
5216  ib += map.gc_c1hermite[level].ib();
5217  ja += map.gc_c1hermite[level].ja();
5218  jb += map.gc_c1hermite[level].jb();
5219  ka += map.gc_c1hermite[level].ka();
5220  kb += map.gc_c1hermite[level].kb();
5221  }
5222  else {
5223  ia += map.gc[level].ia();
5224  ib += map.gc[level].ib();
5225  ja += map.gc[level].ja();
5226  jb += map.gc[level].jb();
5227  ka += map.gc[level].ka();
5228  kb += map.gc[level].kb();
5229  }
5230  msm::Ivec na = map.clipIndexToLevel(msm::Ivec(ia,ja,ka), level);
5231  msm::Ivec nb = map.clipIndexToLevel(msm::Ivec(ib,jb,kb), level);
5232  b(i,j,k).nrangeCutoff.setbounds(na.i, nb.i, na.j, nb.j, na.k, nb.k);
5233  // determine sendAcross blocks
5234 #ifdef MSM_FOLD_FACTOR
5235  msm::BlockIndex blower = map.blockOfGridIndexFold(na, level);
5236  msm::BlockIndex bupper = map.blockOfGridIndexFold(nb, level);
5237 #else
5238  msm::BlockIndex blower = map.blockOfGridIndex(na, level);
5239  msm::BlockIndex bupper = map.blockOfGridIndex(nb, level);
5240 #endif
5241  int maxarrlen = (bupper.n.i - blower.n.i + 1) *
5242  (bupper.n.j - blower.n.j + 1) * (bupper.n.k - blower.n.k + 1);
5243  b(i,j,k).sendAcross.setmax(maxarrlen); // allocate send array
5244  b(i,j,k).indexGridCutoff.setmax(maxarrlen); // alloc indexing
5245  b(i,j,k).recvGridCutoff.setmax(maxarrlen); // alloc indexing
5246  // loop over sendAcross blocks
5247  int ii, jj, kk;
5248 #if 0
5249  {
5250  msm::IndexRange& bn = b(i,j,k).nrange;
5251  printf("ME %4d [%d..%d] x [%d..%d] x [%d..%d]\n",
5252  bn.nn(),
5253  bn.ia(), bn.ib(),
5254  bn.ja(), bn.jb(),
5255  bn.ka(), bn.kb());
5256  }
5257 #endif
5258  for (kk = blower.n.k; kk <= bupper.n.k; kk++) {
5259  for (jj = blower.n.j; jj <= bupper.n.j; jj++) {
5260  for (ii = blower.n.i; ii <= bupper.n.i; ii++) {
5261 #ifdef MSM_SKIP_TOO_DISTANT_BLOCKS
5262  // make sure that block (ii,jj,kk) interacts with (i,j,k)
5263  int si = sign(ii-i);
5264  int sj = sign(jj-j);
5265  int sk = sign(kk-k);
5266  int di = (ii-i)*bsx + si*(1-bsx);
5267  int dj = (jj-j)*bsy + sj*(1-bsy);
5268  int dk = (kk-k)*bsz + sk*(1-bsz);
5269  Vector d = di*hu + dj*hv + dk*hw;
5270  if (d.length2() >= 4*a*a) continue;
5271 #endif
5272  // determine actual block and range to send to
5273  msm::BlockSend bs;
5274  bs.nblock.n = msm::Ivec(ii,jj,kk);
5275  bs.nblock.level = level;
5276 #ifdef MSM_FOLD_FACTOR
5278  b(i,j,k).nrangeCutoff);
5279  map.wrapBlockSendFold(bs); // wrap to true block index
5280 #else
5282  b(i,j,k).nrangeCutoff);
5283  map.wrapBlockSend(bs); // wrap to true block index
5284 #endif
5285 #ifdef MSM_SKIP_BEYOND_SPHERE
5286 #if 0
5287