NAMD
NamdCentLB.C
Go to the documentation of this file.
1 /*****************************************************************************
2  * $Source: /home/cvs/namd/cvsroot/namd2/src/NamdCentLB.C,v $
3  * $Author: jim $
4  * $Date: 2017/03/30 20:06:17 $
5  * $Revision: 1.125 $
6  *****************************************************************************/
7 
8 #if !defined(WIN32) || defined(__CYGWIN__)
9 #include <unistd.h>
10 #endif
11 #include <fcntl.h>
12 
13 #include "InfoStream.h"
14 #include "NamdCentLB.h"
15 #include "NamdCentLB.def.h"
16 #include "Node.h"
17 #include "PatchMap.h"
18 #include "ComputeMap.h"
19 #include "LdbCoordinator.h"
20 
21 // #define DUMP_LDBDATA 1
22 // #define LOAD_LDBDATA 1
23 
24 double *cpuloads = NULL;
25 
27  // CkPrintf("[%d] creating NamdCentLB %d\n",CkMyPe(),loadbalancer);
28  int seqno = LdbInfra::Object()->getLoadbalancerTicket();
29  loadbalancer = CProxy_NamdCentLB::ckNew(CkLBOptions(seqno));
30  // CkPrintf("[%d] created NamdCentLB %d\n",CkMyPe(),loadbalancer);
31  if (CkMyRank() == 0 && cpuloads == NULL) {
32  cpuloads = new double[CkNumPes()];
33  CmiAssert(cpuloads != NULL);
34  for (int i=0; i<CkNumPes(); i++) cpuloads[i] = 0.0;
35  }
36 }
37 
39  return new NamdCentLB((CkMigrateMessage*)NULL);
40 }
41 
45 NamdCentLB::NamdCentLB(CkMigrateMessage *msg): CentralLB(msg) {
46  processorArray = 0;
47  patchArray = 0;
48  computeArray = 0;
49 }
50 
51 NamdCentLB::NamdCentLB(const CkLBOptions& opt): CentralLB(opt)
52 {
53  // if (CkMyPe()==0)
54  // CkPrintf("[%d] NamdCentLB created\n",CkMyPe());
55  processorArray = 0;
56  patchArray = 0;
57  computeArray = 0;
58 }
59 
60 /*
61 NamdCentLB::~NamdCentLB()
62 {
63  delete [] processorArray;
64  delete [] patchArray;
65  delete [] computeArray;
66 }
67 */
68 
69 bool NamdCentLB::QueryBalanceNow(int _step)
70 {
71  // CkPrintf("[%d] Balancing on step %d\n",CkMyPe(),_step);
72  if ( LdbCoordinator::Object()->takingLdbData ) {
73  return true;
74  } else {
75  return false;
76  }
77 }
78 
79 bool NamdCentLB::QueryDumpData()
80 {
81 #if 0
82  if (LdbCoordinator::Object()->ldbCycleNum == 1) return true;
83  if (LdbCoordinator::Object()->ldbCycleNum == 2) return true;
84 #endif
85  return false;
86 }
87 
88 CLBMigrateMsg* NamdCentLB::Strategy(LDStats* stats)
89 {
90  // CkPrintf("LDB: All statistics received at %f, %f\n",
91  // CmiTimer(),CmiWallTimer());
92 
93  int numProcessors = stats->nprocs();
95  ComputeMap *computeMap = ComputeMap::Object();
96  const int numComputes = computeMap->numComputes();
98 
99  // these sizes should never change
100  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
101  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
102  if ( ! computeArray ) computeArray = new computeInfo[numComputes];
103 
104  int nMoveableComputes = buildData(stats);
105 
106 #if LDB_DEBUG
107 #define DUMP_LDBDATA 1
108 #define LOAD_LDBDATA 1
109 #endif
110 
111 #if DUMP_LDBDATA
112  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
113 #elif LOAD_LDBDATA
114  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
115  // CkExit();
116 #endif
117 
118  double averageLoad = 0.;
119  double avgCompute = 0.;
120  if ( nMoveableComputes ) {
121  int i;
122  double total = 0.;
123  double maxCompute = 0.;
124  int maxi = 0;
125  for (i=0; i<nMoveableComputes; i++) {
126  double load = computeArray[i].load;
127  total += load;
128  if ( load > maxCompute ) { maxCompute = load; maxi = i; }
129  }
130  avgCompute = total / nMoveableComputes;
131 
132  int P = stats->nprocs();
133  int numPesAvailable = 0;
134  for (i=0; i<P; i++) {
135  if (processorArray[i].available) {
136  ++numPesAvailable;
137  total += processorArray[i].backgroundLoad;
138  }
139  }
140  if (numPesAvailable == 0)
141  NAMD_die("No processors available for load balancing!\n");
142 
143  averageLoad = total/numPesAvailable;
144  CkPrintf("LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
145  LdbIdField(computeArray[maxi].handle.id, 0),
146  maxCompute, 100. * maxCompute / averageLoad, averageLoad);
147  CkPrintf("LDB: Average compute %f is %.1f%% of average load %f\n",
148  avgCompute, 100. * avgCompute / averageLoad, averageLoad);
149  }
150 
151  if ( step() == 1 ) {
152  // compute splitting only
153  // partitions are stored as char but mostly limited by
154  // high load noise at low outer-loop iteration counts
155  int maxParts = 10;
156 #ifdef NAMD_CUDA
157 //split LCPO compute very small, else CUDA compute is delayed
158  if (simParams->LCPOOn) {
159  maxParts = 20;
160  }
161 #endif
162  int totalAddedParts = 0;
163  double maxCompute = averageLoad / 10.;
164  if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
165  if ( simParams->ldbRelativeGrainsize > 0. ) {
166  maxCompute = averageLoad * simParams->ldbRelativeGrainsize;
167  }
168  CkPrintf("LDB: Partitioning computes with target load %f\n", maxCompute);
169  double maxUnsplit = 0.;
170  for (int i=0; i<nMoveableComputes; i++) {
171  computeArray[i].processor = computeArray[i].oldProcessor;
172  const int cid = LdbIdField(computeArray[i].handle.id, 0);
173  const double load = computeArray[i].load;
174  if ( computeMap->numPartitions(cid) == 0 ) {
175  if ( load > maxUnsplit ) maxUnsplit = load;
176  continue;
177  }
178  int nparts = (int) ceil(load / maxCompute);
179  if ( nparts > maxParts ) nparts = maxParts;
180  if ( nparts < 1 ) nparts = 1;
181  if ( 0 && nparts > 1 ) {
182  CkPrintf("LDB: Partitioning compute %d with load %f by %d\n",
183  cid, load, nparts);
184  }
185  computeMap->setNewNumPartitions(cid,nparts);
186  totalAddedParts += nparts - 1;
187  }
188  CkPrintf("LDB: Increased migratable compute count from %d to %d\n",
189  nMoveableComputes,nMoveableComputes+totalAddedParts);
190  CkPrintf("LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
191  } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
192  if (step() < 4)
193  TorusLB(computeArray, patchArray, processorArray,
194  nMoveableComputes, numPatches, numProcessors);
195  else
196  RefineTorusLB(computeArray, patchArray, processorArray,
197  nMoveableComputes, numPatches, numProcessors, 1);
198  } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
199  TorusLB(computeArray, patchArray, processorArray,
200  nMoveableComputes, numPatches, numProcessors);
201  } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
202  RefineTorusLB(computeArray, patchArray, processorArray,
203  nMoveableComputes, numPatches, numProcessors, 1);
204  } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
205  if (step() < 4)
206  Alg7(computeArray, patchArray, processorArray,
207  nMoveableComputes, numPatches, numProcessors);
208  else
209  RefineOnly(computeArray, patchArray, processorArray,
210  nMoveableComputes, numPatches, numProcessors);
211  }
212 
213 #if LDB_DEBUG && USE_TOPOMAP
214  TopoManager tmgr;
215  int pe1, pe2, pe3, hops=0;
216  /* This is double counting the hops
217  for(int i=0; i<nMoveableComputes; i++)
218  {
219  pe1 = computeArray[i].processor;
220  pe2 = patchArray[computeArray[i].patch1].processor;
221  pe3 = patchArray[computeArray[i].patch2].processor;
222  hops += tmgr.getHopsBetweenRanks(pe1, pe2);
223  if(computeArray[i].patch1 != computeArray[i].patch2)
224  hops += tmgr.getHopsBetweenRanks(pe1, pe3);
225  }*/
226  for (int i=0; i<numPatches; i++) {
227  //int num = patchArray[i].proxiesOn.numElements();
228  pe1 = patchArray[i].processor;
229  Iterator nextProc;
230  processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
231  while (p) {
232  pe2 = p->Id;
233  hops += tmgr.getHopsBetweenRanks(pe1, pe2);
234  p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
235  }
236  }
237  CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
238 #endif
239 
240 #if DUMP_LDBDATA
241  dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
242 #elif LOAD_LDBDATA
243  dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
244  // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
245  // CkExit();
246 #endif
247 
248  // For error checking:
249  // Count up computes, to see if somebody doesn't have any computes
250  int i;
251 #if 0
252  int* computeCount = new int[numProcessors];
253  for(i=0; i<numProcessors; i++)
254  computeCount[i]=0;
255  for(i=0; i<nMoveableComputes; i++)
256  computeCount[computeArray[i].processor]++;
257  for(i=0; i<numProcessors; i++) {
258  if (computeCount[i]==0)
259  iout << iINFO <<"Warning: Processor " << i
260  << " has NO moveable computes.\n" << endi;
261  }
262  delete [] computeCount;
263 #endif
264 
265  CkVec<MigrateInfo *> migrateInfo;
266  for(i=0;i<nMoveableComputes;i++) {
267  if (computeArray[i].processor != computeArray[i].oldProcessor) {
268  // CkPrintf("[%d] Obj %d migrating from %d to %d\n",
269  // CkMyPe(),computeArray[i].handle.id.id[0],
270  // computeArray[i].processor,computeArray[i].oldProcessor);
271  MigrateInfo *migrateMe = new MigrateInfo;
272  migrateMe->obj = computeArray[i].handle;
273  migrateMe->from_pe = computeArray[i].oldProcessor;
274  migrateMe->to_pe = computeArray[i].processor;
275  migrateInfo.insertAtEnd(migrateMe);
276 
277  // sneak in updates to ComputeMap
278  computeMap->setNewNode(LdbIdField(computeArray[i].handle.id, 0),
279  computeArray[i].processor);
280  }
281  }
282 
283  int migrate_count=migrateInfo.length();
284  // CkPrintf("NamdCentLB migrating %d elements\n",migrate_count);
285  CLBMigrateMsg* msg = new(migrate_count,CkNumPes(),CkNumPes(),0) CLBMigrateMsg;
286 
287  msg->n_moves = migrate_count;
288  for(i=0; i < migrate_count; i++) {
289  MigrateInfo* item = migrateInfo[i];
290  msg->moves[i] = *item;
291  delete item;
292  migrateInfo[i] = 0;
293  }
294 
295  for (i=0; i<numProcessors; i++) {
296  cpuloads[i] = processorArray[i].load;
297  }
298 
299  delete [] processorArray;
300  delete [] patchArray;
301  delete [] computeArray;
302 
303  processorArray = NULL;
304  patchArray = NULL;
305  computeArray = NULL;
306 
307  return msg;
308 };
309 
310 #ifndef WIN32
311 
312 void NamdCentLB::dumpDataASCII(char *file, int numProcessors,
313  int numPatches, int numComputes)
314 {
315  char filename[128];
316  sprintf(filename, "%s.%d", file, step());
317  FILE* fp = fopen(filename,"w");
318  if (fp == NULL){
319  perror("dumpLDStatsASCII");
320  return;
321  }
322  CkPrintf("***** DUMP data to file: %s ***** \n", filename);
323  fprintf(fp,"%d %d %d\n",numProcessors,numPatches,numComputes);
324 
325  int i;
326  for(i=0;i<numProcessors;i++) {
327  processorInfo* p = processorArray + i;
328  fprintf(fp,"%d %e %e %e %e\n",p->Id,p->load,p->backgroundLoad,p->computeLoad,p->idleTime);
329  }
330 
331  for(i=0;i < numPatches; i++) {
332  patchInfo* p = patchArray + i;
333  fprintf(fp,"%d %e %d %d\n",p->Id,p->load,p->processor,p->numAtoms);
334  }
335 
336  for(i=0; i < numComputes; i++) {
337  computeInfo* c = computeArray + i;
338  fprintf(fp,"%d %e %d %d %d %d",c->Id,c->load,c->patch1,c->patch2,
339  c->processor,c->oldProcessor);
340  fprintf(fp, "\n");
341  }
342 
343  // dump patchSet
344  for (i=0; i< numProcessors; i++) {
345  int num = processorArray[i].proxies.numElements();
346  fprintf(fp, "%d %d: ", i, num);
347  Iterator nextProxy;
348  patchInfo *p = (patchInfo *)processorArray[i].proxies.
349  iterator((Iterator *)&nextProxy);
350  while (p) {
351  fprintf(fp, "%d ", p->Id);
352  p = (patchInfo *)processorArray[i].proxies.
353  next((Iterator*)&nextProxy);
354  }
355  fprintf(fp, "\n");
356  }
357  // dump proxiesOn
358  for (i=0; i<numPatches; i++) {
359  int num = patchArray[i].proxiesOn.numElements();
360  fprintf(fp, "%d %d: ", i, num);
361  Iterator nextProc;
362  processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.
363  iterator((Iterator *)&nextProc);
364  while (p) {
365  fprintf(fp, "%d ", p->Id);
366  p = (processorInfo *)patchArray[i].proxiesOn.
367  next((Iterator*)&nextProc);
368  }
369  fprintf(fp, "\n");
370  }
371 
372  fclose(fp);
373  //CkExit();
374 }
375 
376 void NamdCentLB::loadDataASCII(char *file, int &numProcessors,
377  int &numPatches, int &numComputes)
378 {
379  char filename[128];
380  //sprintf(filename, "%s.%d", file, step());
381  sprintf(filename, "%s", file);
382 
383  CkPrintf("***** Load ascii data from file: %s ***** \n", filename);
384 
385  FILE* fp = fopen(filename, "r");
386  if (fp == NULL){
387  perror("loadDataASCII");
388  return;
389  }
390 
391  fscanf(fp,"%d %d %d",&numProcessors,&numPatches,&numComputes);
392 
393  printf("numProcs: %d numPatches: %d numComputes: %d\n", numProcessors,numPatches, numComputes);
394 
395  delete [] processorArray;
396  delete [] patchArray;
397  delete [] computeArray;
398  processorArray = new processorInfo[numProcessors];
399  patchArray = new patchInfo[numPatches];
400  computeArray = new computeInfo[numComputes];
401 
402  int i;
403  for(i=0;i<numProcessors;i++) {
404  processorInfo* p = processorArray + i;
405  fscanf(fp,"%d %le %le %le", &p->Id, &p->load, &p->backgroundLoad, &p->computeLoad);
406  fscanf(fp,"%le\n", &p->idleTime);
407  if (p->Id != i) CmiAbort("Reading processorArray error!");
408 // p->backgroundLoad = 0.0;
409  }
410 
411  for(i=0;i < numPatches; i++) {
412  patchInfo* p = patchArray + i;
413  fscanf(fp,"%d %le %d %d\n",&p->Id,&p->load,&p->processor,&p->numAtoms);
414  if (p->Id != i || p->processor > numProcessors || p->processor < 0)
415  CmiAbort("Reading patchArray error!");
416  }
417 
418  for(i=0; i < numComputes; i++) {
419  computeInfo* c = computeArray + i;
420  fscanf(fp,"%d %le %d %d %d %d",&c->Id,&c->load,&c->patch1,&c->patch2,
421  &c->processor,&c->oldProcessor);
422 
423  if (c->patch1 < 0 || c->patch1 > numPatches || c->patch2 < 0 || c->patch2 > numPatches)
424  CmiAbort("Reading computeArray error!");
425  // printf("%d %e %d %d %d %d\n", c->Id,c->load,c->patch1,c->patch2,c->processor,c->oldProcessor);
426  }
427 
428  // dump patchSet
429  for (i=0; i< numProcessors; i++) {
430  int num, curp;
431  fscanf(fp,"%d %d: ",&curp, &num);
432  if(curp != i)
433  CmiAbort("Reading patchsSet error!");
434  for (int j=0; j<num; j++) {
435  int id;
436  fscanf(fp,"%d",&id);
437  processorArray[i].proxies.unchecked_insert(&patchArray[id]);
438  }
439  }
440  // dump proxiesOn
441  for (i=0; i<numPatches; i++) {
442  int num, curp;
443  fscanf(fp,"%d %d: ",&curp, &num);
444  if(curp != i)
445  CmiAbort("Reading proxiesOn error!");
446  for (int j=0; j<num; j++) {
447  int id;
448  fscanf(fp,"%d",&id);
449  patchArray[i].proxiesOn.insert(&processorArray[id]);
450  }
451  }
452 
453  fclose(fp);
454 }
455 #endif
456 
457 extern int isPmeProcessor(int);
458 #ifdef MEM_OPT_VERSION
459 extern int isOutputProcessor(int);
460 #endif
461 #if defined(NAMD_MIC)
462 extern int isMICProcessor(int);
463 #endif
464 
465 int NamdCentLB::buildData(LDStats* stats)
466 {
467  int n_pes = stats->nprocs();
468 
469  PatchMap* patchMap = PatchMap::Object();
470  ComputeMap* computeMap = ComputeMap::Object();
472 
473  BigReal bgfactor = simParams->ldbBackgroundScaling;
474  BigReal pmebgfactor = simParams->ldbPMEBackgroundScaling;
475  BigReal homebgfactor = simParams->ldbHomeBackgroundScaling;
476  int pmeOn = simParams->PMEOn;
477  int unLoadPme = simParams->ldbUnloadPME;
478  int pmeBarrier = simParams->PMEBarrier;
479  int unLoadZero = simParams->ldbUnloadZero;
480  int unLoadOne = simParams->ldbUnloadOne;
481  int unLoadIO= simParams->ldbUnloadOutputPEs;
482  int i;
483  for (i=0; i<n_pes; ++i) {
484  processorArray[i].Id = i;
485  processorArray[i].available = true;
486  if ( pmeOn && isPmeProcessor(i) ) {
487  processorArray[i].backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
488  } else if (patchMap->numPatchesOnNode(i) > 0) {
489  processorArray[i].backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
490  } else {
491  processorArray[i].backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
492  }
493  processorArray[i].idleTime = stats->procs[i].idletime;
494  processorArray[i].load = processorArray[i].computeLoad = 0.0;
495  }
496 
497 /* *********** this code is defunct *****************
498 #if 0
499  double bgfactor = 1.0 + 1.0 * CkNumPes()/1000.0;
500  if ( bgfactor > 2.0 ) bgfactor = 2.0;
501  iout << iINFO << "Scaling background load by " << bgfactor << ".\n" << endi;
502  int i;
503  for (i=0; i<n_pes; i++) {
504  processorArray[i].Id = i;
505  processorArray[i].backgroundLoad = bgfactor * stats[i].bg_walltime;
506  }
507 
508  double bg_weight = 0.7;
509 
510  int i;
511  for (i=0; i<n_pes; i++) {
512  processorArray[i].Id = i;
513  if (patchMap->numPatchesOnNode(i) > 0)
514  processorArray[i].backgroundLoad = bg_weight * stats->procs[i].bg_walltime;
515  else
516  processorArray[i].backgroundLoad = stats[i].bg_walltime;
517  }
518 
519  //Modification to reduce the coputeload on PME processors
520  const SimParameters* simParams = Node::Object()->simParameters;
521 
522  // CkPrintf("BACKGROUND LOAD\n");
523  if(simParams->PMEOn) {
524  double bgfactor = 1.0 + 1.0 * CkNumPes()/1000.0;
525  if ( bgfactor > 2.0 ) bgfactor = 2.0;
526  for (i=0; i<n_pes; i++) {
527  // CkPrintf("BG[%d] = %5.5lf,", i, processorArray[i].backgroundLoad);
528  if(isPmeProcessor(i)) {
529  processorArray[i].backgroundLoad *= bgfactor;
530  }
531  // CkPrintf("%5.5lf; ", processorArray[i].backgroundLoad);
532  }
533  }
534  // CkPrintf("\n");
535 #endif
536 *********** end of defunct code *********** */
537 
538  if (unLoadZero) processorArray[0].available = false;
539  if (unLoadOne) processorArray[1].available = false;
540 
541  // if all pes are Pme, disable this flag
542  if (pmeOn && unLoadPme) {
543  for (i=0; i<n_pes; i++) {
544  if (!isPmeProcessor(i)) break;
545  }
546  if (i == n_pes) {
547  iout << iINFO << "Turned off unLoadPme flag!\n" << endi;
548  unLoadPme = 0;
549  }
550  }
551 
552  if (pmeOn && unLoadPme) {
553  for (i=0; i<n_pes; i++) {
554  if ((pmeBarrier && i==0) || isPmeProcessor(i))
555  processorArray[i].available = false;
556  }
557  }
558  // if all pes are output, disable this flag
559 #ifdef MEM_OPT_VERSION
560 
561  if (unLoadIO) {
562  if (simParams->numoutputprocs == n_pes) {
563  iout << iINFO << "Turned off unLoadIO flag!\n" << endi;
564  unLoadIO = 0;
565  }
566  }
567  if (unLoadIO){
568  iout << iINFO << "Testing for output processors!\n" << endi;
569  for (i=0; i<n_pes; i++) {
570  if (isOutputProcessor(stats->procs[i].pe))
571  {
572  // iout << iINFO << "Removed output PE "<< stats->procs[i].pe <<" from available list!\n" << endi;
573  processorArray[i].available = false;
574  }
575  else
576  {
577  // iout << iINFO << "Nonoutput PE "<< stats->procs[i].pe <<" is in available list!\n" << endi;
578  }
579  }
580  }
581 #endif
582 
583  // Unload PEs driving MIC devices, if need be
584  #if defined(NAMD_MIC)
585  if (simParams->mic_unloadMICPEs != 0) {
586  for (i = 0; i < n_pes; i++) {
587  if (isMICProcessor(i) != 0) { processorArray[i].available = false; }
588  }
589  }
590  #endif
591 
592  int nMoveableComputes=0;
593  int nProxies = 0; // total number of estimated proxies
594  int nIdleComputes = 0;
595 
596  int j;
597  for (j=0; j < stats->n_objs; j++) {
598  const LDObjData &this_obj = stats->objData[j];
599  int frompe = stats->from_proc[j];
600 
601  // filter out non-NAMD managed objects (like PME array)
602  if (this_obj.omID().id.idx != 1) {
603  // CkPrintf("non-NAMD object %d on pe %d with walltime %lf\n",
604  // this_obj.id().id[0], stats->from_proc[j], this_obj.wallTime);
605  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
606  continue;
607  }
608 
609  if (LdbIdField(this_obj.id(), 1) == PATCH_TYPE) { // Its a patch
610  const int pid = LdbIdField(this_obj.id(), 0);
611  int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
612 
613  patchArray[pid].Id = pid;
614  patchArray[pid].numAtoms = 0;
615  patchArray[pid].processor = stats->from_proc[j];
616  const int numProxies =
617 #if USE_TOPOMAP
618  requiredProxiesOnProcGrid(pid,neighborNodes);
619 #else
620  requiredProxies(pid, neighborNodes);
621 #endif
622 
623  nProxies += numProxies;
624 
625  for (int k=0; k<numProxies; k++) {
626  processorArray[neighborNodes[k]].proxies.unchecked_insert(&patchArray[pid]);
627  patchArray[pid].proxiesOn.unchecked_insert(&processorArray[neighborNodes[k]]);
628  }
629  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
630  } else if (LdbIdField(this_obj.id(), 1) == BONDED_TYPE) { // Its a bonded compute
631  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
632  } else if (this_obj.migratable) { // Its a compute
633  if ( this_obj.wallTime == 0. ) { // don't migrate idle computes
634  ++nIdleComputes;
635  } else {
636  const int cid = LdbIdField(this_obj.id(), 0);
637  const int p0 = computeMap->pid(cid,0);
638 
639  // For self-interactions, just return the same pid twice
640  int p1;
641  if (computeMap->numPids(cid) > 1)
642  p1 = computeMap->pid(cid,1);
643  else p1 = p0;
644  computeArray[nMoveableComputes].Id = cid;
645  computeArray[nMoveableComputes].oldProcessor = stats->from_proc[j];
646  processorArray[stats->from_proc[j]].computeLoad += this_obj.wallTime;
647  computeArray[nMoveableComputes].processor = -1;
648  computeArray[nMoveableComputes].patch1 = p0;
649  computeArray[nMoveableComputes].patch2 = p1;
650  computeArray[nMoveableComputes].handle = this_obj.handle;
651  computeArray[nMoveableComputes].load = this_obj.wallTime;
652  nMoveableComputes++;
653  }
654  } else {
655  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
656  }
657  }
658 
659  if ( nIdleComputes )
660  CkPrintf("LDB: %d computes have load of zero\n", nIdleComputes);
661 
662 /* *********** this code is defunct *****************
663 #if 0
664  int averageProxy = nProxies / n_pes;
665  CkPrintf("total proxies: %d, avervage: %d\n", nProxies, averageProxy);
666  for (i=0; i<n_pes; i++) {
667  // too many proxies on this node, weight the background load
668  int proxies = processorArray[i].proxies.numElements();
669  if (proxies > averageProxy) {
670  double factor = 1.0*(proxies-averageProxy)/nProxies;
671  processorArray[i].backgroundLoad *= (1.0 + factor);
672  CkPrintf("On [%d]: too many proxies: %d, increased bg load by %f\n", i, nProxies, factor);
673  }
674  }
675 #endif
676 *********** end of defunct code *********** */
677 
678  for (i=0; i<n_pes; i++) {
679  processorArray[i].load = processorArray[i].backgroundLoad + processorArray[i].computeLoad;
680  }
681  stats->clear();
682  return nMoveableComputes;
683 }
684 
685 // Figure out which proxies we will definitely create on other
686 // nodes, without regard for non-bonded computes. This code is swiped
687 // from ProxyMgr, and changes there probable need to be propagated here.
688 
689 int NamdCentLB::requiredProxies(PatchID id, int neighborNodes[])
690 {
691  PatchMap* patchMap = PatchMap::Object();
692  int myNode = patchMap->node(id);
693  int nProxyNodes = 0;
694 
695 #define IF_NEW_NODE \
696  int j; \
697  for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
698  if ( j == nProxyNodes )
699 
701  neighbors[0] = id;
702  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
703  for ( int i = 0; i < numNeighbors; ++i ) {
704  const int proxyNode = patchMap->basenode(neighbors[i]);
705  if ( proxyNode != myNode ) {
706  IF_NEW_NODE {
707  neighborNodes[nProxyNodes] = proxyNode;
708  nProxyNodes++;
709  }
710  }
711  }
712 
713  // Distribute initial default proxies across empty processors.
714  // This shouldn't be necessary, but may constrain the load balancer
715  // and avoid placing too many proxies on a single processor. -JCP
716 
717  // This code needs to be turned off when the creation of ST is
718  // shifted to the load balancers -ASB
719 
720 #if 1
721  int numPes = CkNumPes();
722  int numPatches = patchMap->numPatches();
723  int emptyNodes = numPes - numPatches;
724  if ( emptyNodes > numPatches ) {
725  int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
726  int maxNodesPerPatch = PatchMap::MaxOneAway + PatchMap::MaxTwoAway;
727  if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
728  int proxyNode = (myNode + 1) % numPes;
729  while ( nProxyNodes < nodesPerPatch &&
730  ! patchMap->numPatchesOnNode(proxyNode) ) {
731  if ( proxyNode != myNode ) {
732  IF_NEW_NODE {
733  neighborNodes[nProxyNodes] = proxyNode;
734  nProxyNodes++;
735  }
736  }
737  proxyNode = (proxyNode + 1) % numPes;
738  }
739  proxyNode = (myNode - 1 + numPes) % numPes;
740  while ( nProxyNodes < nodesPerPatch &&
741  ! patchMap->numPatchesOnNode(proxyNode) ) {
742  if ( proxyNode != myNode ) {
743  IF_NEW_NODE {
744  neighborNodes[nProxyNodes] = proxyNode;
745  nProxyNodes++;
746  }
747  }
748  proxyNode = (proxyNode - 1 + numPes) % numPes;
749  }
750  proxyNode = (myNode + 1) % numPes;
751  int count = 0;
752  while ( nProxyNodes < nodesPerPatch ) {
753  if ( ! patchMap->numPatchesOnNode(proxyNode) && proxyNode != myNode ) {
754  IF_NEW_NODE {
755  neighborNodes[nProxyNodes] = proxyNode;
756  nProxyNodes++;
757  }
758  }
759  proxyNode = (proxyNode + 1) % numPes;
760  count ++; if (count == numPes) break; // we looped all
761  }
762  } else {
763  int proxyNode = myNode - 1;
764  if ( proxyNode >= 0 && ! patchMap->numPatchesOnNode(proxyNode) ) {
765  if ( proxyNode != myNode ) {
766  IF_NEW_NODE {
767  neighborNodes[nProxyNodes] = proxyNode;
768  nProxyNodes++;
769  }
770  }
771  }
772  proxyNode = myNode + 1;
773  if ( proxyNode < numPes && ! patchMap->numPatchesOnNode(proxyNode) ) {
774  if ( proxyNode != myNode ) {
775  IF_NEW_NODE {
776  neighborNodes[nProxyNodes] = proxyNode;
777  nProxyNodes++;
778  }
779  }
780  }
781  }
782 #endif
783 
784  return nProxyNodes;
785 }
786 
787 #if USE_TOPOMAP
788 // Figure out which proxies we will definitely create on other nodes,
789 // without regard for non-bonded computes. This code is swiped from
790 // ProxyMgr, and changes there probable need to be propagated here.
791 // The proxies are placed on nearby processors on the 3d-grid along
792 // the X, Y, Z and T dimensions
793 
794 int NamdCentLB::requiredProxiesOnProcGrid(PatchID id, int neighborNodes[])
795 {
796  enum proxyHere { No, Yes };
797  int numPes = CkNumPes();
798  proxyHere *proxyNodes = new proxyHere[numPes];
799  int nProxyNodes;
800  int i, j, k, l;
801 
802  int xsize = 0, ysize = 0, zsize = 0, tsize = 0;
803  int my_x = 0, my_y = 0, my_z = 0, my_t = 0;
804 
805  PatchMap* patchMap = PatchMap::Object();
806  int myNode = patchMap->node(id);
807 
808  TopoManager tmgr;
809  xsize = tmgr.getDimNX();
810  ysize = tmgr.getDimNY();
811  zsize = tmgr.getDimNZ();
812  tsize = tmgr.getDimNT();
813 
814  tmgr.rankToCoordinates(myNode, my_x, my_y, my_z, my_t);
815 
816  if(xsize * ysize * zsize * tsize != CkNumPes()) {
817  delete [] proxyNodes;
818  return requiredProxies(id, neighborNodes);
819  }
820 
821  // Note all home patches.
822  for ( i = 0; i < numPes; ++i )
823  {
824  proxyNodes[i] = No;
825  }
826  nProxyNodes = 0;
827 
828  // Check all two-away neighbors.
829  // This is really just one-away neighbors, since
830  // two-away always returns zero: RKB
832 
833  // Assign a proxy to all your neighbors. But dont increment counter
834  // because these have to be there anyway.
835  neighbors[0] = id;
836  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
837 
838  // Small Flag chooses between different loadbalancing schemes.
839  // Small Flag == true, patches are close to each other
840  // false, patches are far from each other
841  bool smallFlag = false;
842  double pnodes = CkNumPes();
843  pnodes *= 0.25;
844  smallFlag = (patchMap->numPatches() > pnodes )?1:0;
845 
846  //If there are lot of patches its likely they will all be neighbors,
847  //so all we need to do is to place proxies on downstream patches.
848  //if (smallFlag) {
849  for ( i = 1; i < numNeighbors; ++i )
850  {
851  int proxyNode = patchMap->basenode(neighbors[i]);
852 
853  if (proxyNode != myNode)
854  if (proxyNodes[proxyNode] == No)
855  {
856  proxyNodes[proxyNode] = Yes;
857  neighborNodes[nProxyNodes] = proxyNode;
858  nProxyNodes++;
859  }
860  }
861  //}
862 
863  if (step() > 2) {
864  delete [] proxyNodes;
865  return nProxyNodes;
866  }
867 
868  // Place numPesPerPatch proxies on the 3d torus neighbors of a processor
869 
870  int numPatches = patchMap->numPatches();
871  int emptyNodes = numPes - numPatches;
872  //if ( emptyNodes > numPatches ) {
873 
874  int nodesPerPatch = nProxyNodes + 4 * (emptyNodes-1) / numPatches + 1;
875  int proxyNode = 0 ;
876  int proxy_x=0, proxy_y=0, proxy_z=0;
877 
878  //Choose from the 26 neighbors of mynode.
879  //CkAssert(nodesPerPatch - nProxyNodes <= 26);
880  //Too few patches otherwise, try twoaway?
881 
882  for(k=-1; k<= 1; k++) {
883  proxy_z = (my_z + k + zsize) % zsize;
884  for(j=-1; j <= 1; j++) {
885  proxy_y = (my_y + j + ysize) % ysize;
886  for(i = -1; i <= 1; i++) {
887  proxy_x = (my_x + i + xsize) % xsize;
888  for(l = 0; l < tsize; l++) {
889  if(i == 0 && j == 0 && k == 0 && l == 0)
890  continue;
891 
892  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
893 
894  if((! patchMap->numPatchesOnNode(proxyNode) || !smallFlag) &&
895  proxyNodes[proxyNode] == No) {
896  proxyNodes[proxyNode] = Yes;
897  neighborNodes[nProxyNodes] = proxyNode;
898  nProxyNodes++;
899  }
900 
901  if(nProxyNodes >= nodesPerPatch ||
902  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
903  break;
904  } // end for
905 
906  if(nProxyNodes >= nodesPerPatch ||
907  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
908  break;
909  } // end for
910 
911  if(nProxyNodes >= nodesPerPatch ||
912  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
913  break;
914  } // end for
915 
916  if(nProxyNodes >= nodesPerPatch ||
917  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
918  break;
919  } // end for
920 
921 #if 1
922  if(!smallFlag) {
923  for(k=-2; k<= 2; k+=2) {
924  proxy_z = (my_z + k + zsize) % zsize;
925  for(j=-2; j <= 2; j+=2) {
926  proxy_y = (my_y + j + ysize) % ysize;
927  for(i = -2; i <= 2; i+=2) {
928  proxy_x = (my_x + i + xsize) % xsize;
929  for(l = 0; l < tsize; l++) {
930  if(i == 0 && j == 0 && k == 0 && l == 0)
931  continue;
932 
933  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
934 
935  if((! patchMap->numPatchesOnNode(proxyNode) || !smallFlag) &&
936  proxyNodes[proxyNode] == No) {
937  proxyNodes[proxyNode] = Yes;
938  neighborNodes[nProxyNodes] = proxyNode;
939  nProxyNodes++;
940  }
941 
942  if(nProxyNodes >= nodesPerPatch ||
943  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
944  break;
945  } // end for
946 
947  if(nProxyNodes >= nodesPerPatch ||
948  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
949  break;
950  } // end for
951 
952  if(nProxyNodes >= nodesPerPatch ||
953  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
954  break;
955  } // end for
956 
957  if(nProxyNodes >= nodesPerPatch ||
958  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
959  break;
960  } // end for
961  }
962 
963 #else
964  #if 0
965  const SimParameters* params = Node::Object()->simParameters;
966 
967  if(!smallFlag) {
968  //Add two-away proxies
969  if(patchMap->numaway_a() == 2) {
970  proxy_y = (my_y + 2) % ysize;
971  proxy_x = my_x % xsize;
972  proxy_z = my_z % zsize;
973 
974  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
975  if(proxyNodes[proxyNode] == No) {
976  proxyNodes[proxyNode] = Yes;
977  neighborNodes[nProxyNodes] = proxyNode;
978  nProxyNodes++;
979  }
980 
981  proxy_y = (my_y - 2 + ysize) % ysize;
982  proxy_x = my_x % xsize;
983  proxy_z = my_z % zsize;
984 
985  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
986  if(proxyNodes[proxyNode] == No) {
987  proxyNodes[proxyNode] = Yes;
988  neighborNodes[nProxyNodes] = proxyNode;
989  nProxyNodes++;
990  }
991  }
992 
993  //Add two away proxies
994  if(patchMap->numaway_b() == 2) {
995  proxy_y = my_y % ysize;
996  proxy_x = my_x % xsize;
997  proxy_z = (my_z + 2) % zsize;
998 
999  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1000  if(proxyNodes[proxyNode] == No) {
1001  proxyNodes[proxyNode] = Yes;
1002  neighborNodes[nProxyNodes] = proxyNode;
1003  nProxyNodes++;
1004  }
1005 
1006  proxy_y = my_y % ysize;
1007  proxy_x = my_x % xsize;
1008  proxy_z = (my_z - 2 + zsize) % zsize;
1009 
1010  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1011  if(proxyNodes[proxyNode] == No) {
1012  proxyNodes[proxyNode] = Yes;
1013  neighborNodes[nProxyNodes] = proxyNode;
1014  nProxyNodes++;
1015  }
1016  }
1017 
1018  //Add two away proxies
1019  if(patchMap->numaway_c() == 2) {
1020  proxy_y = my_y % ysize;
1021  proxy_x = (my_x + 2) % xsize;
1022  proxy_z = my_z % zsize;
1023 
1024  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1025  if(proxyNodes[proxyNode] == No) {
1026  proxyNodes[proxyNode] = Yes;
1027  neighborNodes[nProxyNodes] = proxyNode;
1028  nProxyNodes++;
1029  }
1030 
1031  proxy_y = my_y % ysize;
1032  proxy_x = (my_x - 2 + xsize) % xsize;
1033  proxy_z = my_z % zsize;
1034 
1035  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1036  if(proxyNodes[proxyNode] == No) {
1037  proxyNodes[proxyNode] = Yes;
1038  neighborNodes[nProxyNodes] = proxyNode;
1039  nProxyNodes++;
1040  }
1041  }
1042  }
1043  #endif
1044 #endif
1045 
1046  // CkPrintf("Returning %d proxies\n", nProxyNodes);
1047 
1048  delete [] proxyNodes;
1049  return nProxyNodes;
1050 }
1051 
1052 #endif
static Node * Object()
Definition: Node.h:86
BlockLoad::TempStorage load
int patch1
Definition: elements.h:23
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:107
#define IF_NEW_NODE
represents bonded compute
NamdCentLB * AllocateNamdCentLB()
Definition: NamdCentLB.C:38
Definition: Alg7.h:13
void setNewNumPartitions(ComputeID cid, char numPartitions)
Definition: ComputeMap.h:144
BigReal ldbRelativeGrainsize
int numComputes(void)
Definition: ComputeMap.h:101
static PatchMap * Object()
Definition: PatchMap.h:27
double * cpuloads
Definition: NamdCentLB.C:24
int numElements()
Definition: Set.C:144
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
SimParameters * simParameters
Definition: Node.h:178
LargeIRSet proxies
Definition: elements.h:46
#define LDBSTRAT_REFINEONLY
Definition: SimParameters.h:67
Bool ldbUnloadOutputPEs
int Id
Definition: elements.h:16
CLBMigrateMsg * Strategy(LDStats *stats)
Definition: NamdCentLB.C:88
int isMICProcessor(int pe)
Definition: ComputeMgr.C:1881
int processor
Definition: elements.h:24
#define iout
Definition: InfoStream.h:87
int oldProcessor
Definition: elements.h:25
static double averageLoad
Definition: ProxyMgr.C:696
int numaway_b(void) const
Definition: PatchMap.h:69
void insert(InfoRecord *)
Definition: Set.C:49
int basenode(int pid) const
Definition: PatchMap.h:117
#define LDBSTRAT_DEFAULT
Definition: SimParameters.h:65
int isPmeProcessor(int)
Definition: ComputePme.C:598
BigReal ldbHomeBackgroundScaling
static Units next(Units u)
Definition: ParseOptions.C:48
#define LDBSTRAT_OLD
Definition: SimParameters.h:68
double idleTime
Definition: elements.h:40
int patch2
Definition: elements.h:23
void CreateNamdCentLB()
Definition: NamdCentLB.C:26
int numPartitions(ComputeID cid)
Definition: ComputeMap.C:135
int PatchID
Definition: NamdTypes.h:182
void setNewNode(ComputeID cid, NodeID node)
Definition: ComputeMap.h:120
const int & LdbIdField(const LdbId &id, const int index)
int numAtoms
Definition: elements.h:32
void NAMD_die(const char *err_msg)
Definition: common.C:83
static LdbCoordinator * Object()
BigReal ldbBackgroundScaling
represents a patch
double load
Definition: elements.h:15
Definition: Set.h:19
#define LDBSTRAT_COMPREHENSIVE
Definition: SimParameters.h:66
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714
LDObjHandle handle
Definition: elements.h:26
#define simParams
Definition: Output.C:127
int numPatches(void) const
Definition: PatchMap.h:59
int node(int pid) const
Definition: PatchMap.h:114
IRSet proxiesOn
Definition: elements.h:33
static ComputeMap * Object()
Definition: ComputeMap.h:89
BigReal ldbPMEBackgroundScaling
double computeLoad
Definition: elements.h:41
int numPids(ComputeID cid)
Definition: ComputeMap.C:103
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
void unchecked_insert(InfoRecord *)
Definition: Set.C:32
int numaway_c(void) const
Definition: PatchMap.h:70
NamdCentLB(const CkLBOptions &opt)
Definition: NamdCentLB.C:51
int processor
Definition: elements.h:31
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:109
infostream & endi(infostream &s)
Definition: InfoStream.C:38
int isOutputProcessor(int pe)
int numaway_a(void) const
Definition: PatchMap.h:68
double backgroundLoad
Definition: elements.h:39
bool available
Definition: elements.h:44
double BigReal
Definition: common.h:112