NAMD
NamdHybridLB.C
Go to the documentation of this file.
1 /*****************************************************************************
2  * $Source: /home/cvs/namd/cvsroot/namd2/src/NamdHybridLB.C,v $
3  * $Author: jim $
4  * $Date: 2017/03/30 20:06:17 $
5  * $Revision: 1.40 $
6  *****************************************************************************/
7 
8 #if !defined(WIN32) || defined(__CYGWIN__)
9 #include <unistd.h>
10 #endif
11 #include <fcntl.h>
12 
13 #include "InfoStream.h"
14 #include "NamdHybridLB.h"
15 #include "Node.h"
16 #include "PatchMap.h"
17 #include "ComputeMap.h"
18 #include "LdbCoordinator.h"
19 
// Message that each level-1 group sends to PE 0 (entry method splitComputes)
// with the group's load summary and its candidate computes for partitioning.
// NOTE(review): code elsewhere in this file also reads maxComputeId,
// nMoveableComputes and numPesAvailable from this message; their
// declarations are not visible in this listing - confirm against the
// original source.
20 class SplitComputesMsg : public CMessage_SplitComputesMsg {
21 public:
22  double maxUnsplit;  // largest load among computes whose numPartitions() is 0
23  double averageLoad; // sender's total load divided by its available PEs
24  double avgCompute;  // sender's mean load per moveable compute
25  double maxCompute;  // sender's largest moveable-compute load
29  int n;              // number of entries in cid[] and load[]
30  int *cid;           // compute ids of the split candidates
31  float *load;        // load of each candidate, parallel to cid[]
32 };
33 
34 #include "NamdHybridLB.def.h"
35 
36 // #define DUMP_LDBDATA 1
37 // #define LOAD_LDBDATA 1
38 
39 extern int isPmeProcessor(int);
40 #ifdef MEM_OPT_VERSION
41 extern int isOutputProcessor(int);
42 #endif
43 // Load array defined in NamdCentLB.C
44 extern double *cpuloads;
45 
50  int seqno = LdbInfra::Object()->getLoadbalancerTicket();
51  CProxy_NamdHybridLB::ckNew(CkLBOptions(seqno));
52 
53  // creating an array to store the loads of all processors
54  // to be used with proxy spanning tree
55  if (CkMyPe() == 0 && cpuloads == NULL) {
56  cpuloads = new double[CkNumPes()];
57  CmiAssert(cpuloads != NULL);
58  for (int i=0; i<CkNumPes(); i++) cpuloads[i] = 0.0;
59  }
60 }
61 
// Constructs the hybrid balancer: replaces the tree built by HybridBaseLB
// with a two-level tree when all PEs fit in one group (hybridGroupSize) or a
// three-level tree otherwise, allocates the central and dummy balancers, and
// resets the bookkeeping members.
// NOTE(review): simParams is used below but its declaration is not visible
// in this listing (one listing line is missing after the `delete tree`
// statement) - confirm against the original source.
65 NamdHybridLB::NamdHybridLB(const CkLBOptions& opt): HybridBaseLB(opt)
66 {
67  // setting the name
68  lbname = (char *)"NamdHybridLB";
69 
70  delete tree; // delete the tree built from the base class
72  if (CkNumPes() <= simParams->hybridGroupSize) {
73  tree = new TwoLevelTree; // similar to centralized load balancing
74  }
75  else {
76  tree = new ThreeLevelTree(simParams->hybridGroupSize);
77  initTree();
78  // can only do shrink strategy on levels > 1
79  statsStrategy = SHRINK_NULL;
80  }
81 
82  // initializing thisProxy
83  thisProxy = CProxy_NamdHybridLB(thisgroup);
84 
85  // initializing the central LB
86  centralLB = AllocateNamdCentLB();
87 
88  // initializing the dummy LB
89  dummyLB = AllocateNamdDummyLB();
90 
91  // assigning initial values to variables
92  from_procs = NULL;
93  computeArray = NULL;
94  patchArray = NULL;
95  processorArray = NULL;
96  updateCount = 0;
97  splitCount = 0;
98  splitComputesMsgs = 0;
99  updateFlag = false;
100  collectFlag = false;
101 
102 }
103 
109 bool NamdHybridLB::QueryBalanceNow(int _step){
110  if ( LdbCoordinator::Object()->takingLdbData ) {
111  return true;
112  } else {
113  return false;
114  }
115 }
116 
117 bool NamdHybridLB::QueryDumpData() {
118 #if 0
119  if (LdbCoordinator::Object()->ldbCycleNum == 1) return true;
120  if (LdbCoordinator::Object()->ldbCycleNum == 2) return true;
121 #endif
122  return false;
123 }
124 
125 #if 0
126 
// Compiled out by the enclosing #if 0. Placeholder vector strategy: prints a
// notice and returns a freshly allocated message that requests no moves and
// is tagged with the current tree level. The stats argument is not read.
131 LBVectorMigrateMsg* NamdHybridLB::VectorStrategy(LDStats* stats){
132  CkPrintf("[%d] Using Vector Strategy to balance the load\n",CkMyPe());
133  LBVectorMigrateMsg* msg = new(0,0) LBVectorMigrateMsg;
134  msg->n_moves = 0; // empty move list
135  msg->level = currentLevel;
136  return msg;
137 }
138 #endif
139 
140 /*
141  * Runs the load balancing strategy
142  */
143 CLBMigrateMsg* NamdHybridLB::Strategy(LDStats* stats)
144 {
145  int i;
146  // CkPrintf("[%d] NamdHybridLB at Strategy\n",CkMyPe());
147 
148  // calling the centralLB for level 1
149  if(currentLevel == 1){
150  LevelData *lData = levelData[currentLevel];
151  CLBMigrateMsg *msg;
152  LocalLBInfoMsg *newMsg;
153  msg = GrpLevelStrategy(stats);
154 
155  // creating a new message to send to its parent
156  newMsg = new(msg->n_moves,endPE-startPE+1) LocalLBInfoMsg;
157  newMsg->n_moves = msg->n_moves;
158  newMsg->startPE = startPE;
159  newMsg->endPE = endPE;
160  for(i=0; i<msg->n_moves; i++){
161  newMsg->moves[i] = msg->moves[i];
162  }
163  for(i=0; i<endPE-startPE+1; i++){
164  newMsg->cpuloads[i] = peLoads[i];
165  }
166  delete [] peLoads;
167  thisProxy[0].UpdateLocalLBInfo(newMsg);
168  return msg;
169  }else{
170  dummyLB->work(stats);
171  return createMigrateMsg(stats);
172  }
173 }
174 
179  int children;
180  int i;
181 
182  // getting the number of children
183  children = tree->numNodes(currentLevel);
184  // CkPrintf("[%d] Updating compute map, total %d\n",CkMyPe(),siblings);
185 
186  // getting the compute map to insert the changes coming from the children
187  ComputeMap *computeMap = ComputeMap::Object();
188 
189  // traversing the set of moves in msg
190  for(i=0; i<msg->n_moves; i++){
191  if (msg->moves[i].to_pe != -1)
192  computeMap->setNewNode(LdbIdField(msg->moves[i].obj.id, 0),msg->moves[i].to_pe);
193  }
194 
195  // CODING
196  // updating cpuloads array
197  for(i=msg->startPE; i<=msg->endPE; i++){
198  cpuloads[i] = msg->cpuloads[i-msg->startPE];
199  }
200 
201  // checking if all children have sent the update
202  updateCount++;
203  if(updateCount == children){
204  updateCount = 0;
205  updateFlag = true;
206  // CkPrintf("[%d] UPDATE READY\n",CkMyPe());
207  }
208 
209  // checking if the collect info is ready
210  if(updateFlag && collectFlag){
211  updateFlag = false;
212  collectFlag = false;
213  thisProxy[parent_backup].CollectInfo(loc_backup, n_backup, fromlevel_backup);
214  }
215 
216  delete msg;
217 }
218 
220  const int children = tree->numNodes(1);
221 
222  if ( ! splitComputesMsgs ) {
223  splitComputesMsgs = new SplitComputesMsg*[children];
224  }
225 
226  splitComputesMsgs[splitCount] = msg;
227 
228  if ( ++splitCount == children ) {
229  splitCount = 0;
230 
232  ComputeMap *computeMap = ComputeMap::Object();
233 
234  double maxUnsplit = 0.;
235  double averageLoad = 0.;
236  double avgCompute = 0.;
237  double maxCompute = 0.;
238  int maxComputeId = -1;
239  int nMoveableComputes = 0;
240  int numPesAvailable = 0;
241  int nToSplit = 0;
242 
243  for ( int j=0; j < children; ++j ) {
244  SplitComputesMsg *msg = splitComputesMsgs[j];
245  if ( msg->maxUnsplit > maxUnsplit ) { maxUnsplit = msg->maxUnsplit; }
246  if ( msg->maxCompute > maxCompute ) { maxCompute = msg->maxCompute; maxComputeId = msg->maxComputeId; }
247  averageLoad += msg->averageLoad * msg->numPesAvailable;
248  numPesAvailable += msg->numPesAvailable;
249  avgCompute += msg->avgCompute * msg->nMoveableComputes;
250  nMoveableComputes += msg->nMoveableComputes;
251  nToSplit += msg->n;
252  }
253 
254  averageLoad /= numPesAvailable;
255  if ( nMoveableComputes ) avgCompute /= nMoveableComputes; else avgCompute = 0.;
256 
257  CkPrintf("LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
258  maxComputeId, maxCompute, 100. * maxCompute / averageLoad, averageLoad);
259  CkPrintf("LDB: Average compute %f is %.1f%% of average load %f\n",
260  avgCompute, 100. * avgCompute / averageLoad, averageLoad);
261 
262  if ( ! nToSplit ) {
263  for ( int j=0; j < children; ++j ) {
264  delete splitComputesMsgs[j];
265  }
266  } else {
267  // partitions are stored as char but mostly limited by
268  // high load noise at low outer-loop iteration counts
269  int maxParts = 10;
270 #ifdef NAMD_CUDA
271 //split LCPO compute very small, else CUDA compute is delayed
272  if (simParams->LCPOOn) {
273  maxParts = 20;
274  }
275 #endif
276  int totalAddedParts = 0;
277  maxCompute = averageLoad / 10.;
278  if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
279  if ( simParams->ldbRelativeGrainsize > 0. ) {
280  maxCompute = averageLoad * simParams->ldbRelativeGrainsize;
281  }
282  CkPrintf("LDB: Partitioning computes with target load %f\n", maxCompute);
283 
284  for ( int j=0; j < children; ++j ) {
285  SplitComputesMsg *msg = splitComputesMsgs[j];
286  for (int i=0; i < msg->n; ++i) {
287  int nparts = (int) ceil(msg->load[i] / maxCompute);
288  if ( nparts > maxParts ) nparts = maxParts;
289  if ( nparts < 1 ) nparts = 1;
290  computeMap->setNewNumPartitions(msg->cid[i],nparts);
291  totalAddedParts += nparts - 1;
292  }
293  delete msg;
294  }
295 
296  CkPrintf("LDB: Increased migratable compute count from %d to %d\n",
297  nMoveableComputes,nMoveableComputes+totalAddedParts);
298  CkPrintf("LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
299  }
300  }
301 }
302 
303 
// Runs the NAMD balancing strategy for one level-1 group.
// Steps visible in this function:
//   1. allocate the work arrays on demand and fill them via buildData();
//   2. compute the group's average/maximum compute load and average PE load;
//   3. send a SplitComputesMsg with those figures (and, on step() == 1, the
//      list of computes with a non-zero partition count) to PE 0;
//   4. on step() == 1 do nothing more than the split request, otherwise run
//      the strategy selected by simParams->ldbStrategy;
//   5. turn every changed placement into a MigrateInfo, update ComputeMap,
//      record per-PE loads in peLoads/startPE/endPE and free the work arrays.
// Returns the migration message built by createMigrateMsg().
// NOTE(review): numPatches and simParams are used below but their
// declarations are not visible in this listing (listing lines are missing
// near the top of the function) - confirm against the original source.
308 CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
309  int numProcessors = stats->nprocs(); // number of processors at group level
311  ComputeMap *computeMap = ComputeMap::Object();
312  const int numComputes = computeMap->numComputes();
313  const int numGroupComputes = stats->n_migrateobjs;
315 
316  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
317  // these data structures are global and need to be distributed
318  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
319  if ( ! computeArray ) computeArray = new computeInfo[numGroupComputes];
320  if ( ! from_procs ) from_procs = new int[numGroupComputes];
321 
322  int nMoveableComputes = buildData(stats);
323  CmiAssert(nMoveableComputes <= numGroupComputes);
324 
325 
326 #if LDB_DEBUG
327 #define DUMP_LDBDATA 1
328 #define LOAD_LDBDATA 1
329 #endif
330 
331 #if DUMP_LDBDATA
332  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
333 #elif LOAD_LDBDATA
334  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
335  // CkExit();
336 #endif
337 
338  double averageLoad = 0.;
339  double avgCompute;
340  double maxCompute;
341  int maxComputeId;
342  int numPesAvailable;
343  {
344  int i;
345  double total = 0.;
346  maxCompute = 0.;
347  int maxi = -1;
348  for (i=0; i<nMoveableComputes; i++) {
349  double load = computeArray[i].load;
350  total += load;
351  if ( load > maxCompute ) { maxCompute = load; maxi = i; }
352  }
// NOTE(review): there is no guard for nMoveableComputes == 0; in that case
// this is 0.0/0 and avgCompute becomes NaN - confirm whether a group can
// reach this point without moveable computes.
353  avgCompute = total / nMoveableComputes;
354  maxComputeId = maxi < 0 ? -1 : LdbIdField(computeArray[maxi].handle.id, 0);
355 
// total keeps the compute loads summed above and now also accumulates the
// background load of every available PE.
356  int P = stats->nprocs();
357  numPesAvailable = 0;
358  for (i=0; i<P; i++) {
359  if (processorArray[i].available) {
360  ++numPesAvailable;
361  total += processorArray[i].backgroundLoad;
362  }
363  }
364  if (numPesAvailable == 0)
365  NAMD_die("No processors available for load balancing!\n");
366 
367  averageLoad = total/numPesAvailable;
368  }
369 
370  int i_split = 0;
371  double maxUnsplit = 0.;
372 
// First pass (only on step() == 1): count the computes with a non-zero
// partition count and track the largest load among the others.
373  if ( step() == 1 ) {
374  for (int i=0; i<nMoveableComputes; i++) {
375  const int cid = LdbIdField(computeArray[i].handle.id, 0);
376  if ( computeMap->numPartitions(cid) == 0 ) {
377  const double load = computeArray[i].load;
378  if ( load > maxUnsplit ) maxUnsplit = load;
379  continue;
380  }
381  ++i_split;
382  }
383  }
384 
385  {
// The message is sized for i_split entries in both cid[] and load[].
386  SplitComputesMsg *msg = new(i_split,i_split) SplitComputesMsg;
387  msg->maxUnsplit = maxUnsplit;
388  msg->averageLoad = averageLoad;
389  msg->avgCompute = avgCompute;
390  msg->maxCompute = maxCompute;
391  msg->maxComputeId = maxComputeId;
392  msg->nMoveableComputes = nMoveableComputes;
393  msg->numPesAvailable = numPesAvailable;
394  msg->n = i_split;
395 
// Second pass (only on step() == 1): keep every compute on its old
// processor and record the split candidates in the message.
396  if ( step() == 1 ) {
397  i_split = 0;
398  for (int i=0; i<nMoveableComputes; i++) {
399  computeArray[i].processor = computeArray[i].oldProcessor;
400  const int cid = LdbIdField(computeArray[i].handle.id, 0);
401  if ( computeMap->numPartitions(cid) == 0 ) {
402  continue;
403  }
404  msg->cid[i_split] = cid;
405  msg->load[i_split] = computeArray[i].load;
406  ++i_split;
407  }
408  }
409 
410  thisProxy[0].splitComputes(msg);
411  }
412 
413  if ( step() == 1 ) {
414  // compute splitting only
415  } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
416  if (step() < 4)
417  TorusLB(computeArray, patchArray, processorArray,
418  nMoveableComputes, numPatches, numProcessors);
419  else
420  RefineTorusLB(computeArray, patchArray, processorArray,
421  nMoveableComputes, numPatches, numProcessors, 1);
422  } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
423  TorusLB(computeArray, patchArray, processorArray,
424  nMoveableComputes, numPatches, numProcessors);
425  } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
426  RefineTorusLB(computeArray, patchArray, processorArray,
427  nMoveableComputes, numPatches, numProcessors, 1);
428  } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
// NOTE(review): the code after NAMD_die() is presumably unreachable
// (assuming NAMD_die does not return) - confirm.
429  NAMD_die("Old load balancer strategy is not compatible with hybrid balancer.");
430  if (step() < 4)
431  Alg7(computeArray, patchArray, processorArray,
432  nMoveableComputes, numPatches, numProcessors);
433  else
434  RefineOnly(computeArray, patchArray, processorArray,
435  nMoveableComputes, numPatches, numProcessors);
436  }
437 
438 #if LDB_DEBUG && USE_TOPOMAP
439  TopoManager tmgr;
440  int pe1, pe2, pe3, hops=0;
441  /* This is double counting the hops
442  for(int i=0; i<nMoveableComputes; i++)
443  {
444  pe1 = computeArray[i].processor;
445  pe2 = patchArray[computeArray[i].patch1].processor;
446  pe3 = patchArray[computeArray[i].patch2].processor;
447  hops += tmgr.getHopsBetweenRanks(pe1, pe2);
448  if(computeArray[i].patch1 != computeArray[i].patch2)
449  hops += tmgr.getHopsBetweenRanks(pe1, pe3);
450  }*/
451  for (int i=0; i<numPatches; i++) {
452  //int num = patchArray[i].proxiesOn.numElements();
453  pe1 = patchArray[i].processor;
454  Iterator nextProc;
455  processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
456  while (p) {
457  pe2 = p->Id;
458  hops += tmgr.getHopsBetweenRanks(pe1, pe2);
459  p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
460  }
461  }
462  CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
463 #endif
464 
465 #if DUMP_LDBDATA
466  dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
467 #elif LOAD_LDBDATA
468  dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
469  // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
470  // CkExit();
471 #endif
472 
473  // For error checking:
474  // Count up computes, to see if somebody doesn't have any computes
475  int i;
476 #if 0
477  int* computeCount = new int[numProcessors];
478  for(i=0; i<numProcessors; i++)
479  computeCount[i]=0;
480  for(i=0; i<nMoveableComputes; i++)
481  computeCount[computeArray[i].processor]++;
482  for(i=0; i<numProcessors; i++) {
483  if (computeCount[i]==0)
484  iout << iINFO <<"Warning: Processor " << i
485  << " has NO moveable computes.\n" << endi;
486  }
487  delete [] computeCount;
488 #endif
489 
// Emit one MigrateInfo per compute whose assigned processor differs from the
// absolute PE derived from from_procs[i] (a group-relative index).
490  CkVec<MigrateInfo *> migrateInfo;
491  for(i=0;i<nMoveableComputes;i++) {
492  if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
493  /* CkPrintf("[%d] Obj %d migrating from %d (%d) to %d\n",
494  CkMyPe(),computeArray[i].handle.id.id[0],
495  from_procs[i], computeArray[i].oldProcessor, computeArray[i].processor); */
496  MigrateInfo *migrateMe = new MigrateInfo;
497  migrateMe->obj = computeArray[i].handle;
498  //migrateMe->from_pe = computeArray[i].oldProcessor;
// from_procs[i] == numProcessors marks a compute that came from outside this
// group; it is reported with from_pe == -1.
499  int frompe = from_procs[i];
500  if (frompe == numProcessors)
501  frompe = -1;
502  else
503  frompe = frompe + stats->procs[0].pe;
504  migrateMe->from_pe = frompe;
505  migrateMe->to_pe = computeArray[i].processor;
506  if (frompe == -1) {
507  // don't know yet which processor this compute belongs to, but
508  // inform receiver
509  LDObjData obj;
510  obj.handle = computeArray[i].handle;
511  thisProxy[computeArray[i].processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
512  }
513  migrateInfo.insertAtEnd(migrateMe);
514 
515  // sneak in updates to ComputeMap
516  //ERASE CkPrintf("%d setting %d to processor %d\n",CkMyPe(),computeArray[i].handle.id.id[0],computeArray[i].processor);
517  computeMap->setNewNode(LdbIdField(computeArray[i].handle.id, 0),
518  computeArray[i].processor);
519  }
520  }
521  // CkPrintf("LOAD BALANCING READY %d\n",CkMyPe());
522 
523  LBMigrateMsg* msg;
524  msg = createMigrateMsg(migrateInfo, numProcessors);
525 
// peLoads is allocated here and released by Strategy() after it has copied
// the values into the LocalLBInfoMsg.
526  peLoads = new double [numProcessors];
527  startPE = processorArray[0].Id;
528  endPE = processorArray[numProcessors-1].Id;
529  // CkPrintf("[%d] numProcessors=%d, %d to %d\n",CkMyPe(),numProcessors,processorArray[0].Id,processorArray[numProcessors-1].Id);
530  for (i=0; i<numProcessors; i++) {
531  peLoads[i] = processorArray[i].load;
532  }
533 
534 
// Free the work arrays and reset the pointers so the next call reallocates
// them.
535  delete [] from_procs;
536  delete [] processorArray;
537  delete [] patchArray;
538  delete [] computeArray;
539 
540  from_procs = NULL;
541  processorArray = NULL;
542  patchArray = NULL;
543  computeArray = NULL;
544 
545  return msg;
546 
547 }
548 
549 void NamdHybridLB::dumpDataASCII(char *file, int numProcessors,
550  int numPatches, int numComputes)
551 {
552  char filename[128];
553  sprintf(filename, "%s_%d.%d", file, CkMyPe(), step());
554  FILE* fp = fopen(filename,"w");
555  if (fp == NULL){
556  perror("dumpLDStatsASCII");
557  return;
558  }
559  // CkPrintf("***** DUMP data to file: %s ***** \n", filename);
560  fprintf(fp,"%d %d %d\n",numProcessors,numPatches,numComputes);
561 
562  int i;
563  for(i=0;i<numProcessors;i++) {
564  processorInfo* p = processorArray + i;
565  fprintf(fp,"%d %e %e %e %e\n",p->Id,p->load,p->backgroundLoad,p->computeLoad,p->idleTime);
566  }
567 
568  for(i=0;i < numPatches; i++) {
569  patchInfo* p = patchArray + i;
570  fprintf(fp,"%d %e %d %d\n",p->Id,p->load,p->processor,p->numAtoms);
571  }
572 
573  for(i=0; i < numComputes; i++) {
574  computeInfo* c = computeArray + i;
575  fprintf(fp,"%d %e %d %d %d %d",c->Id,c->load,c->patch1,c->patch2,
576  c->processor,c->oldProcessor);
577  fprintf(fp, "\n");
578  }
579 
580  // dump patchSet
581  for (i=0; i< numProcessors; i++) {
582  int num = processorArray[i].proxies.numElements();
583  fprintf(fp, "%d %d: ", i, num);
584  Iterator nextProxy;
585  patchInfo *p = (patchInfo *)processorArray[i].proxies.
586  iterator((Iterator *)&nextProxy);
587  while (p) {
588  fprintf(fp, "%d ", p->Id);
589  p = (patchInfo *)processorArray[i].proxies.
590  next((Iterator*)&nextProxy);
591  }
592  fprintf(fp, "\n");
593  }
594  // dump proxiesOn
595  for (i=0; i<numPatches; i++) {
596  int num = patchArray[i].proxiesOn.numElements();
597  fprintf(fp, "%d %d: ", i, num);
598  Iterator nextProc;
599  processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.
600  iterator((Iterator *)&nextProc);
601  while (p) {
602  fprintf(fp, "%d ", p->Id);
603  p = (processorInfo *)patchArray[i].proxiesOn.
604  next((Iterator*)&nextProc);
605  }
606  fprintf(fp, "\n");
607  }
608 
609  fclose(fp);
610  //CkExit();
611 }
612 
613 
617 int NamdHybridLB::buildData(LDStats* stats) {
618  int n_pes = stats->nprocs();
619 
620  PatchMap* patchMap = PatchMap::Object();
621  ComputeMap* computeMap = ComputeMap::Object();
622  const SimParameters* simParams = Node::Object()->simParameters;
623 
624  BigReal bgfactor = simParams->ldbBackgroundScaling;
625  BigReal pmebgfactor = simParams->ldbPMEBackgroundScaling;
626  BigReal homebgfactor = simParams->ldbHomeBackgroundScaling;
627  int pmeOn = simParams->PMEOn;
628  int unLoadPme = simParams->ldbUnloadPME;
629  int pmeBarrier = simParams->PMEBarrier;
630  int unLoadZero = simParams->ldbUnloadZero;
631  int unLoadOne = simParams->ldbUnloadOne;
632  int unLoadIO= simParams->ldbUnloadOutputPEs;
633  // traversing the list of processors and getting their load information
634  int i, pe_no;
635  for (i=0; i<n_pes; ++i) {
636  pe_no = stats->procs[i].pe;
637 
638  // BACKUP processorArray[i].Id = i;
639  processorArray[i].Id = pe_no; // absolute pe number
640  processorArray[i].available = true;
641  // BACKUP if ( pmeOn && isPmeProcessor(i) )
642  if ( pmeOn && isPmeProcessor(pe_no) ) {
643  processorArray[i].backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
644  // BACKUP } else if (patchMap->numPatchesOnNode(i) > 0) {
645  } else if (patchMap->numPatchesOnNode(pe_no) > 0) {
646  processorArray[i].backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
647  } else {
648  processorArray[i].backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
649  }
650  processorArray[i].idleTime = stats->procs[i].idletime;
651  processorArray[i].load = processorArray[i].computeLoad = 0.0;
652  }
653 
654  // If I am group zero, then offload processor 0 and 1 in my group
655  if(stats->procs[0].pe == 0) {
656  if(unLoadZero) processorArray[0].available = false;
657  if(unLoadOne) processorArray[1].available = false;
658  }
659 
660  // if all pes are Pme, disable this flag
661  if (pmeOn && unLoadPme) {
662  for (i=0; i<n_pes; i++) {
663  if(!isPmeProcessor(stats->procs[i].pe)) break;
664  }
665  if (i == n_pes) {
666  iout << iINFO << "Turned off unLoadPme flag!\n" << endi;
667  unLoadPme = 0;
668  }
669  }
670 
671  if (pmeOn && unLoadPme) {
672  for (i=0; i<n_pes; i++) {
673  if ((pmeBarrier && i==0) || isPmeProcessor(stats->procs[i].pe))
674  processorArray[i].available = false;
675  }
676  }
677 
678  // if all pes are output, disable this flag
679 #ifdef MEM_OPT_VERSION
680  if (unLoadIO) {
681  if (simParams->numoutputprocs == n_pes) {
682  iout << iINFO << "Turned off unLoadIO flag!\n" << endi;
683  unLoadIO = 0;
684  }
685  }
686  if (unLoadIO){
687  for (i=0; i<n_pes; i++) {
688  if (isOutputProcessor(stats->procs[i].pe))
689  processorArray[i].available = false;
690  }
691  }
692 #endif
693 
694  // need to go over all patches to get all required proxies
695  int numPatches = patchMap->numPatches();
696  int totalLocalProxies = 0;
697  int totalProxies = 0;
698  for ( int pid=0; pid<numPatches; ++pid ) {
699  int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
700 
701  patchArray[pid].Id = pid;
702  patchArray[pid].numAtoms = 0;
703  patchArray[pid].processor = patchMap->node(pid);
704 
705  const int numProxies =
706 #if 0 // USE_TOPOMAP - this function needs to be there for the hybrid case
707  requiredProxiesOnProcGrid(pid,neighborNodes);
708 #else
709  requiredProxies(pid, neighborNodes);
710 #endif
711 
712  int numLocalProxies = 0;
713  for (int k=0; k<numProxies; k++) {
714  if( (neighborNodes[k] >= stats->procs[0].pe) && (neighborNodes[k] <= stats->procs[n_pes-1].pe) ){
715  ++numLocalProxies;
716  int index = neighborNodes[k] - stats->procs[0].pe;
717  processorArray[index].proxies.unchecked_insert(&patchArray[pid]);
718  patchArray[pid].proxiesOn.unchecked_insert(&processorArray[index]);
719  }
720  }
721 #if 0
722  if ( numLocalProxies ) {
723  CkPrintf("LDB Pe %d patch %d has %d local of %d total proxies\n",
724  CkMyPe(), pid, numLocalProxies, numProxies);
725  }
726 #endif
727  totalLocalProxies += numLocalProxies;
728  totalProxies += numProxies;
729  }
730 #if 0
731  CkPrintf("LDB Pe %d has %d local of %d total proxies\n",
732  CkMyPe(), totalLocalProxies, totalProxies);
733 #endif
734 
735  int nMoveableComputes=0;
736  int index;
737 
738  int j;
739 
740  // this loop goes over only the objects in this group
741  for(j=0; j < stats->n_objs; j++) {
742  const LDObjData &this_obj = stats->objData[j];
743  int frompe = stats->from_proc[j];
744 
745  // filter out non-NAMD managed objects (like PME array)
746  if (this_obj.omID().id.idx != 1) {
747  // CmiAssert(frompe>=0 && frompe<n_pes);
748  // CkPrintf("non-NAMD object %d on pe %d with walltime %lf\n",
749  // this_obj.id().id[0], frompe + stats->procs[0].pe, this_obj.wallTime);
750  processorArray[frompe].backgroundLoad += this_obj.wallTime;
751  continue;
752  }
753 
754  if (LdbIdField(this_obj.id(), 1) == PATCH_TYPE) { // Its a patch
755  // handled above to get required proxies from all patches
756  processorArray[frompe].backgroundLoad += this_obj.wallTime;
757  } else if (LdbIdField(this_obj.id(), 1) == BONDED_TYPE) { // Its a bonded compute
758  processorArray[frompe].backgroundLoad += this_obj.wallTime;
759  } else if (this_obj.migratable && this_obj.wallTime != 0.) { // Its a compute
760 
761  const int cid = LdbIdField(this_obj.id(), 0);
762  const int p0 = computeMap->pid(cid,0);
763 
764  // For self-interactions, just return the same pid twice
765  int p1;
766  if (computeMap->numPids(cid) > 1)
767  p1 = computeMap->pid(cid,1);
768  else p1 = p0;
769  computeArray[nMoveableComputes].Id = cid;
770  //BACKUP computeArray[nMoveableComputes].oldProcessor = stats->from_proc[j];
771  if (frompe >= n_pes) { // from outside
772 CkPrintf("assigning random old processor...this looks broken\n");
773  computeArray[nMoveableComputes].oldProcessor = CrnRand()%n_pes + stats->procs[0].pe; // random
774  }
775  else {
776  computeArray[nMoveableComputes].oldProcessor = frompe + stats->procs[0].pe;
777  }
778  from_procs[nMoveableComputes] = frompe;
779 
780  //BACKUP2 index = stats->from_proc[j] - stats->procs[0].pe;
781  //BACKUP processorArray[stats->from_proc[j]].computeLoad += this_obj.wallTime;
782  int index = computeArray[nMoveableComputes].oldProcessor - stats->procs[0].pe;
783  processorArray[index].computeLoad += this_obj.wallTime;
784  computeArray[nMoveableComputes].processor = -1;
785  computeArray[nMoveableComputes].patch1 = p0;
786  computeArray[nMoveableComputes].patch2 = p1;
787  computeArray[nMoveableComputes].handle = this_obj.handle;
788  computeArray[nMoveableComputes].load = this_obj.wallTime;
789  nMoveableComputes++;
790  }
791  }
792 
793  for (i=0; i<n_pes; i++) {
794  processorArray[i].load = processorArray[i].backgroundLoad + processorArray[i].computeLoad;
795  }
796  stats->clear();
797  return nMoveableComputes;
798 }
799 
800 
// Collects into neighborNodes[] the distinct PEs, other than the PE that
// owns patch `id`, on which that patch needs a proxy, and returns how many
// entries were written.
// First the base nodes of the patch and of its downstream neighbors are
// added; then, in the "#if 1" section, PEs without patches near the owner
// are added as default proxy locations.
// NOTE(review): the `neighbors` array is used below but its declaration is
// not visible in this listing (listing lines are missing right after the
// IF_NEW_NODE macro) - confirm against the original source.
801 int NamdHybridLB::requiredProxies(PatchID id, int neighborNodes[])
802 {
803  PatchMap* patchMap = PatchMap::Object();
804  int myNode = patchMap->node(id);
805  int nProxyNodes = 0;
806 
// IF_NEW_NODE scans neighborNodes[0..nProxyNodes) for proxyNode and guards
// the following statement so it runs only when proxyNode is not yet listed.
// It declares `int j`, so it can be used once per scope.
807 #define IF_NEW_NODE \
808  int j; \
809  for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
810  if ( j == nProxyNodes )
811 
813  neighbors[0] = id;
814  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
815  for ( int i = 0; i < numNeighbors; ++i ) {
816  const int proxyNode = patchMap->basenode(neighbors[i]);
817  if ( proxyNode != myNode ) {
818  IF_NEW_NODE {
819  neighborNodes[nProxyNodes] = proxyNode;
820  nProxyNodes++;
821  }
822  }
823  }
824 
825  // Distribute initial default proxies across empty processors.
826  // This shouldn't be necessary, but may constrain the load balancer
827  // and avoid placing too many proxies on a single processor. -JCP
828 
829  // This code needs to be turned off when the creation of ST is
830  // shifted to the load balancers -ASB
831 
832 #if 1
833  int numNodes = CkNumPes();
834  int numPatches = patchMap->numPatches();
835  int emptyNodes = numNodes - numPatches;
836  if ( emptyNodes > numPatches ) {
// Many more PEs than patches: give each patch a share of the empty PEs,
// capped at the capacity implied by MaxOneAway + MaxTwoAway.
837  int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
838  int maxNodesPerPatch = PatchMap::MaxOneAway + PatchMap::MaxTwoAway;
839  if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
// Walk upward from the owner over consecutive PEs that hold no patches.
840  int proxyNode = (myNode + 1) % numNodes;
841  while ( nProxyNodes < nodesPerPatch &&
842  ! patchMap->numPatchesOnNode(proxyNode) ) {
843  if ( proxyNode != myNode ) {
844  IF_NEW_NODE {
845  neighborNodes[nProxyNodes] = proxyNode;
846  nProxyNodes++;
847  }
848  }
849  proxyNode = (proxyNode + 1) % numNodes;
850  }
// Same walk, downward from the owner.
851  proxyNode = (myNode - 1 + numNodes) % numNodes;
852  while ( nProxyNodes < nodesPerPatch &&
853  ! patchMap->numPatchesOnNode(proxyNode) ) {
854  if ( proxyNode != myNode ) {
855  IF_NEW_NODE {
856  neighborNodes[nProxyNodes] = proxyNode;
857  nProxyNodes++;
858  }
859  }
860  proxyNode = (proxyNode - 1 + numNodes) % numNodes;
861  }
// Final sweep over all PEs (at most one full loop), skipping PEs that hold
// patches instead of stopping at them.
862  proxyNode = (myNode + 1) % numNodes;
863  int count = 0;
864  while ( nProxyNodes < nodesPerPatch ) {
865  if ( ! patchMap->numPatchesOnNode(proxyNode) && proxyNode != myNode ) {
866  IF_NEW_NODE {
867  neighborNodes[nProxyNodes] = proxyNode;
868  nProxyNodes++;
869  }
870  }
871  proxyNode = (proxyNode + 1) % numNodes;
872  count ++; if (count == numNodes) break; // we looped all
873  }
874  } else {
// Otherwise consider only the two PEs adjacent to the owner, without
// wrapping around, and add each one that holds no patches.
875  int proxyNode = myNode - 1;
876  if ( proxyNode >= 0 && ! patchMap->numPatchesOnNode(proxyNode) ) {
877  if ( proxyNode != myNode ) {
878  IF_NEW_NODE {
879  neighborNodes[nProxyNodes] = proxyNode;
880  nProxyNodes++;
881  }
882  }
883  }
884  proxyNode = myNode + 1;
885  if ( proxyNode < numNodes && ! patchMap->numPatchesOnNode(proxyNode) ) {
886  if ( proxyNode != myNode ) {
887  IF_NEW_NODE {
888  neighborNodes[nProxyNodes] = proxyNode;
889  nProxyNodes++;
890  }
891  }
892  }
893  }
894 #endif
895 
896  return nProxyNodes;
897 }
898 
899 #if 0
// Compiled out by the enclosing #if 0. NAMD-specific variant of CollectInfo:
// merges the object locations reported by a child into the matched and
// unmatched lists of the next level and, once all children have reported,
// either forwards the unmatched entries to the parent or (at the root)
// propagates the matched list down the tree.
// On PE 0 the forward to the parent is deferred until updateFlag is set;
// the arguments are then saved in the *_backup members instead.
// NOTE(review): the unconditional CmiAssert(0) below presumably aborts the
// call when assertions are enabled - confirm before re-enabling this code.
900 void NamdHybridLB::CollectInfo(Location *loc, int n, int fromlevel)
901 {
902  int atlevel = fromlevel + 1;
903  LevelData *lData = levelData[atlevel];
904  lData->info_recved++;
905 
906  CkVec<Location> &matchedObjs = lData->matchedObjs;
907 CmiAssert(0);
908 
909  // sort into matched and unmatched list
910  std::map<LDObjKey, int> &unmatchedObjs = lData->unmatchedObjs;
911  for (int i=0; i<n; i++) {
912  std::map<LDObjKey, int>::iterator iter = unmatchedObjs.find(loc[i].key);
913  if (iter != unmatchedObjs.end()) {
914  CmiAssert(iter->second != -1 || loc[i].loc != -1);
915  if (loc[i].loc == -1) loc[i].loc = iter->second;
916  matchedObjs.push_back(loc[i]);
917  unmatchedObjs.erase(iter);
918  }
919  else
920  unmatchedObjs[loc[i].key] = loc[i].loc;
921  }
922 
923 // DEBUGF(("[%d] level %d has %d unmatched and %d matched. \n", CkMyPe(), atlevel, unmatchedObjs.size(), matchedObjs.size()));
924 
925  if (lData->info_recved == lData->nChildren) {
926  lData->info_recved = 0;
927  if (_lb_args.debug() > 1)
928  CkPrintf("[%d] CollectInfo at level %d started at %f\n",
929  CkMyPe(), atlevel, CkWallTimer());
930  if (lData->parent != -1) {
931 
932  // NAMD specific
933  CkVec<Location> unmatchedbuf;
934  for(std::map<LDObjKey, int>::const_iterator it = unmatchedObjs.begin(); it != unmatchedObjs.end(); ++it){
935  unmatchedbuf.push_back(Location(it->first, it->second));
936  }
937  // checking if update of ComputeMap is ready before calling parent
938  if(CkMyPe() == 0){
939  if(updateFlag){
940  updateFlag = false;
941  collectFlag = false;
942  thisProxy[lData->parent].CollectInfo(unmatchedbuf.getVec(), unmatchedbuf.size(), atlevel);
943  }else{
944  CkPrintf("[%d] COMPUTEMAP UPDATE NOT READY\n",CkMyPe());
945  collectFlag = true;
946  parent_backup = lData->parent;
// NOTE(review): loc_backup keeps the pointer returned by the local
// unmatchedbuf, which goes out of scope at the end of this block - verify
// the lifetime before re-enabling this code.
947  loc_backup = unmatchedbuf.getVec();
948  n_backup = unmatchedbuf.size();
949  fromlevel_backup = atlevel;
950  }
951  }else{
952  // send only unmatched ones up the tree
953  thisProxy[lData->parent].CollectInfo(unmatchedbuf.getVec(), unmatchedbuf.size(), atlevel);
954  }
955 
956  }
957  else { // root
958  // we should have all answers now
959  CmiAssert(unmatchedObjs.size() == 0);
960  // start send match list down
961  thisProxy.PropagateInfo(matchedObjs.getVec(), matchedObjs.size(), atlevel, lData->nChildren, lData->children);
962  lData->statsData->clear();
963  }
964  }
965 }
966 #endif
static Node * Object()
Definition: Node.h:86
BlockLoad::TempStorage load
void UpdateLocalLBInfo(LocalLBInfoMsg *msg)
Definition: NamdHybridLB.C:178
NamdDummyLB * AllocateNamdDummyLB()
Definition: NamdDummyLB.C:20
int patch1
Definition: elements.h:23
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:107
represents bonded compute
NamdCentLB * AllocateNamdCentLB()
Definition: NamdCentLB.C:38
Definition: Alg7.h:13
void setNewNumPartitions(ComputeID cid, char numPartitions)
Definition: ComputeMap.h:144
BigReal ldbRelativeGrainsize
int numComputes(void)
Definition: ComputeMap.h:101
static PatchMap * Object()
Definition: PatchMap.h:27
double * cpuloads
Definition: NamdCentLB.C:24
int numElements()
Definition: Set.C:144
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
SimParameters * simParameters
Definition: Node.h:178
LargeIRSet proxies
Definition: elements.h:46
#define LDBSTRAT_REFINEONLY
Definition: SimParameters.h:67
Bool ldbUnloadOutputPEs
int Id
Definition: elements.h:16
int processor
Definition: elements.h:24
#define iout
Definition: InfoStream.h:87
int oldProcessor
Definition: elements.h:25
static double averageLoad
Definition: ProxyMgr.C:696
int basenode(int pid) const
Definition: PatchMap.h:117
NamdHybridLB(const CkLBOptions &opt)
Default constructor.
Definition: NamdHybridLB.C:65
#define LDBSTRAT_DEFAULT
Definition: SimParameters.h:65
int isPmeProcessor(int)
Definition: ComputePme.C:598
BigReal ldbHomeBackgroundScaling
static Units next(Units u)
Definition: ParseOptions.C:48
double * cpuloads
Definition: NamdHybridLB.h:37
void splitComputes(SplitComputesMsg *)
Definition: NamdHybridLB.C:219
void CreateNamdHybridLB()
Definition: NamdHybridLB.C:49
#define LDBSTRAT_OLD
Definition: SimParameters.h:68
double idleTime
Definition: elements.h:40
int patch2
Definition: elements.h:23
int numPartitions(ComputeID cid)
Definition: ComputeMap.C:135
#define IF_NEW_NODE
int PatchID
Definition: NamdTypes.h:182
void setNewNode(ComputeID cid, NodeID node)
Definition: ComputeMap.h:120
const int & LdbIdField(const LdbId &id, const int index)
int numAtoms
Definition: elements.h:32
void NAMD_die(const char *err_msg)
Definition: common.C:83
static LdbCoordinator * Object()
BigReal ldbBackgroundScaling
represents a patch
double load
Definition: elements.h:15
Definition: Set.h:19
MigrateInfo * moves
Definition: NamdHybridLB.h:36
#define LDBSTRAT_COMPREHENSIVE
Definition: SimParameters.h:66
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714
LDObjHandle handle
Definition: elements.h:26
#define simParams
Definition: Output.C:127
int numPatches(void) const
Definition: PatchMap.h:59
int node(int pid) const
Definition: PatchMap.h:114
void work(LDStats *stats)
Definition: NamdDummyLB.C:44
IRSet proxiesOn
Definition: elements.h:33
static ComputeMap * Object()
Definition: ComputeMap.h:89
BigReal ldbPMEBackgroundScaling
double computeLoad
Definition: elements.h:41
int numPids(ComputeID cid)
Definition: ComputeMap.C:103
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
void unchecked_insert(InfoRecord *)
Definition: Set.C:32
int processor
Definition: elements.h:31
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:109
infostream & endi(infostream &s)
Definition: InfoStream.C:38
int isOutputProcessor(int pe)
double backgroundLoad
Definition: elements.h:39
bool available
Definition: elements.h:44
double BigReal
Definition: common.h:112