8 #if !defined(WIN32) || defined(__CYGWIN__)
15 #include "NamdCentLB.def.h"
// Fragments of the balancer creation/allocation code; the enclosing function
// signatures are not visible in this excerpt.
// Obtain a ticket from LdbInfra and hand it, wrapped in CkLBOptions, to
// CProxy_NamdCentLB::ckNew; the result is stored in `loadbalancer`.
28 int seqno = LdbInfra::Object()->getLoadbalancerTicket();
29 loadbalancer = CProxy_NamdCentLB::ckNew(CkLBOptions(seqno));
// Runs only on rank 0 and only while cpuloads is still NULL; the allocation
// that presumably sits between these lines is not shown.
31 if (CkMyRank() == 0 &&
cpuloads == NULL) {
// Zero one cpuloads slot per PE (CkNumPes() slots).
34 for (
int i=0; i<CkNumPes(); i++)
cpuloads[i] = 0.0;
// Construct a NamdCentLB from a null CkMigrateMessage pointer.
39 return new NamdCentLB((CkMigrateMessage*)NULL);
// Only the signature is visible in this excerpt: returns bool and takes an int
// `_step` (presumably the current step number -- confirm against the body).
69 bool NamdCentLB::QueryBalanceNow(
int _step)
// Only the signature is visible in this excerpt: no parameters, returns bool.
79 bool NamdCentLB::QueryDumpData()
// Body fragments of the central balancing pass (presumably
// NamdCentLB::Strategy(LDStats *stats); the signature is outside this excerpt).
// Visible steps: size and build the work arrays from `stats`, report compute
// load statistics, run one of the balancing algorithms, then pack every
// changed placement into a CLBMigrateMsg and release the work arrays.
93 int numProcessors = stats->nprocs();
// Allocate the processor and compute arrays only when they do not exist yet.
100 if ( ! processorArray ) processorArray =
new processorInfo[numProcessors];
102 if ( ! computeArray ) computeArray =
new computeInfo[numComputes];
// buildData fills the arrays and returns the number of moveable computes.
104 int nMoveableComputes = buildData(stats);
// NOTE(review): both macros are defined to 1 here; any preprocessor guard
// around these lines is not visible -- confirm they are not enabled
// unconditionally in production builds.
107 #define DUMP_LDBDATA 1
108 #define LOAD_LDBDATA 1
// Debug dump / reload of the balancer input as ASCII files.
112 dumpDataASCII(
"ldbd_before", numProcessors, numPatches, nMoveableComputes);
114 loadDataASCII(
"ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
// Average and largest load over the moveable computes; skipped entirely when
// there are none, which leaves avgCompute at 0.
119 double avgCompute = 0.;
120 if ( nMoveableComputes ) {
123 double maxCompute = 0.;
125 for (i=0; i<nMoveableComputes; i++) {
128 if ( load > maxCompute ) { maxCompute =
load; maxi = i; }
130 avgCompute = total / nMoveableComputes;
// Count the processors flagged `available`; with none, abort through NAMD_die
// (this also protects the division below from a zero divisor).
132 int P = stats->nprocs();
133 int numPesAvailable = 0;
134 for (i=0; i<P; i++) {
135 if (processorArray[i].available) {
140 if (numPesAvailable == 0)
141 NAMD_die(
"No processors available for load balancing!\n");
// averageLoad is `total` divided by the number of available PEs; the two
// messages report the largest and the average compute relative to it.
143 averageLoad = total/numPesAvailable;
144 CkPrintf(
"LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
146 maxCompute, 100. * maxCompute / averageLoad, averageLoad);
147 CkPrintf(
"LDB: Average compute %f is %.1f%% of average load %f\n",
148 avgCompute, 100. * avgCompute / averageLoad, averageLoad);
// Partitioning pass. The target load per compute is a tenth of averageLoad,
// raised to twice the average compute when that is larger.
162 int totalAddedParts = 0;
163 double maxCompute = averageLoad / 10.;
164 if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
168 CkPrintf(
"LDB: Partitioning computes with target load %f\n", maxCompute);
169 double maxUnsplit = 0.;
170 for (
int i=0; i<nMoveableComputes; i++) {
172 const int cid =
LdbIdField(computeArray[i].handle.id, 0);
173 const double load = computeArray[i].
load;
// Track the largest load among computes reaching this line (the guarding
// condition, if any, is on a line not shown).
175 if ( load > maxUnsplit ) maxUnsplit =
load;
// Number of parts needed to bring this compute down to the target load,
// clamped to the range [1, maxParts].
178 int nparts = (int) ceil(load / maxCompute);
179 if ( nparts > maxParts ) nparts = maxParts;
180 if ( nparts < 1 ) nparts = 1;
// The leading `0 &&` keeps this diagnostic permanently disabled.
181 if ( 0 && nparts > 1 ) {
182 CkPrintf(
"LDB: Partitioning compute %d with load %f by %d\n",
186 totalAddedParts += nparts - 1;
188 CkPrintf(
"LDB: Increased migratable compute count from %d to %d\n",
189 nMoveableComputes,nMoveableComputes+totalAddedParts);
190 CkPrintf(
"LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
// Run the balancing algorithm. Which of the calls below executes depends on
// conditions that are not visible in this excerpt.
193 TorusLB(computeArray, patchArray, processorArray,
194 nMoveableComputes, numPatches, numProcessors);
197 nMoveableComputes, numPatches, numProcessors, 1);
199 TorusLB(computeArray, patchArray, processorArray,
200 nMoveableComputes, numPatches, numProcessors);
203 nMoveableComputes, numPatches, numProcessors, 1);
206 Alg7(computeArray, patchArray, processorArray,
207 nMoveableComputes, numPatches, numProcessors);
209 RefineOnly(computeArray, patchArray, processorArray,
210 nMoveableComputes, numPatches, numProcessors);
// Debug-only hop accounting, compiled in when both LDB_DEBUG and USE_TOPOMAP
// evaluate to non-zero.
213 #if LDB_DEBUG && USE_TOPOMAP
215 int pe1, pe2, pe3, hops=0;
233 hops += tmgr.getHopsBetweenRanks(pe1, pe2);
237 CkPrintf(
"Load Balancing: Number of Hops: %d\n", hops);
// Debug dumps of the balancer output.
241 dumpDataASCII(
"ldbd_after", numProcessors, numPatches, nMoveableComputes);
243 dumpDataASCII(
"ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
// Tally moveable computes per destination processor and warn about every
// processor that received none.
252 int* computeCount =
new int[numProcessors];
253 for(i=0; i<numProcessors; i++)
255 for(i=0; i<nMoveableComputes; i++)
256 computeCount[computeArray[i].processor]++;
257 for(i=0; i<numProcessors; i++) {
258 if (computeCount[i]==0)
259 iout <<
iINFO <<
"Warning: Processor " << i
260 <<
" has NO moveable computes.\n" <<
endi;
262 delete [] computeCount;
// Collect one heap-allocated MigrateInfo for each compute whose new processor
// differs from its old one.
265 CkVec<MigrateInfo *> migrateInfo;
266 for(i=0;i<nMoveableComputes;i++) {
267 if (computeArray[i].processor != computeArray[i].oldProcessor) {
271 MigrateInfo *migrateMe =
new MigrateInfo;
272 migrateMe->obj = computeArray[i].
handle;
274 migrateMe->to_pe = computeArray[i].
processor;
275 migrateInfo.insertAtEnd(migrateMe);
// Copy the collected moves by value into a freshly allocated CLBMigrateMsg.
// NOTE(review): the release of each `item` is not visible here -- confirm the
// MigrateInfo objects are deleted after the copy.
283 int migrate_count=migrateInfo.length();
285 CLBMigrateMsg* msg =
new(migrate_count,CkNumPes(),CkNumPes(),0) CLBMigrateMsg;
287 msg->n_moves = migrate_count;
288 for(i=0; i < migrate_count; i++) {
289 MigrateInfo* item = migrateInfo[i];
290 msg->moves[i] = *item;
295 for (i=0; i<numProcessors; i++) {
// Release the work arrays. processorArray is reset to NULL, so the
// `! processorArray` test at the top allocates again on the next pass; the
// matching resets for the other two arrays are not visible in this excerpt.
299 delete [] processorArray;
300 delete [] patchArray;
301 delete [] computeArray;
303 processorArray = NULL;
// Writes load-balancer data as text to a file named "<file>.<step()>".
// The first line holds the three counts (processors, patches, computes); the
// per-processor, per-compute and per-processor id-list sections follow.
// Only part of the parameter list and body is visible in this excerpt.
312 void NamdCentLB::dumpDataASCII(
char *file,
int numProcessors,
// NOTE(review): sprintf is unbounded and the size of `filename` is not visible
// here -- confirm the buffer can hold `file` plus the ".<step>" suffix.
316 sprintf(filename,
"%s.%d", file, step());
317 FILE* fp = fopen(filename,
"w");
// Failure report via perror; the condition guarding it is on a line not shown.
319 perror(
"dumpLDStatsASCII");
322 CkPrintf(
"***** DUMP data to file: %s ***** \n", filename);
// Header line: the three array sizes.
323 fprintf(fp,
"%d %d %d\n",numProcessors,numPatches,numComputes);
326 for(i=0;i<numProcessors;i++) {
336 for(i=0; i < numComputes; i++) {
// For each processor, two id lists are written, each introduced by
// "<processor index> <count>: ". The second list is read from
// processorArray[i].proxies; the source of the first is not visible here.
344 for (i=0; i< numProcessors; i++) {
346 fprintf(fp,
"%d %d: ", i, num);
351 fprintf(fp,
"%d ", p->
Id);
352 p = (
patchInfo *)processorArray[i].proxies.
360 fprintf(fp,
"%d %d: ", i, num);
365 fprintf(fp,
"%d ", p->
Id);
// Reads load-balancer data back from the text file `file`.
// numProcessors, numPatches and numComputes are passed by reference and are
// overwritten with the three counts found on the file's first line.
// Malformed sections abort through CmiAbort.
376 void NamdCentLB::loadDataASCII(
char *file,
int &numProcessors,
377 int &numPatches,
int &numComputes)
// NOTE(review): sprintf is unbounded and the size of `filename` is not visible
// here -- confirm the buffer can hold `file`.
381 sprintf(filename,
"%s", file);
383 CkPrintf(
"***** Load ascii data from file: %s ***** \n", filename);
385 FILE* fp = fopen(filename,
"r");
// Failure report via perror; the condition guarding it is on a line not shown.
387 perror(
"loadDataASCII");
// NOTE(review): the fscanf return value is not checked on the visible lines,
// so a short or malformed header would leave the counts unchanged.
391 fscanf(fp,
"%d %d %d",&numProcessors,&numPatches,&numComputes);
393 printf(
"numProcs: %d numPatches: %d numComputes: %d\n", numProcessors,numPatches, numComputes);
// Free the existing arrays; the reallocation that presumably follows is not
// shown in this excerpt.
395 delete [] processorArray;
396 delete [] patchArray;
397 delete [] computeArray;
403 for(i=0;i<numProcessors;i++) {
// Abort when a record's Id does not equal the loop index.
407 if (p->
Id != i) CmiAbort(
"Reading processorArray error!");
415 CmiAbort(
"Reading patchArray error!");
418 for(i=0; i < numComputes; i++) {
424 CmiAbort(
"Reading computeArray error!");
// Per processor, two lists follow, each introduced by "<index> <count>: " and
// read element by element in the inner loops.
429 for (i=0; i< numProcessors; i++) {
431 fscanf(fp,
"%d %d: ",&curp, &num);
433 CmiAbort(
"Reading patchsSet error!");
434 for (
int j=0; j<num; j++) {
443 fscanf(fp,
"%d %d: ",&curp, &num);
445 CmiAbort(
"Reading proxiesOn error!");
446 for (
int j=0; j<num; j++) {
458 #ifdef MEM_OPT_VERSION
461 #if defined(NAMD_MIC)
// Fills processorArray, patchArray and computeArray from the load statistics
// in `stats`.
// Returns the number of moveable computes written to computeArray.
465 int NamdCentLB::buildData(LDStats* stats)
467 int n_pes = stats->nprocs();
476 int pmeOn = simParams->
PMEOn;
// Per-processor setup. Background load is bg_walltime scaled by one of three
// factors (pmebgfactor, homebgfactor, bgfactor); the conditions choosing
// between them are not visible in this excerpt.
483 for (i=0; i<n_pes; ++i) {
484 processorArray[i].
Id = i;
487 processorArray[i].
backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
489 processorArray[i].
backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
491 processorArray[i].
backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
493 processorArray[i].
idleTime = stats->procs[i].idletime;
// Optionally take PE 0 and/or PE 1 out of the pool.
// NOTE(review): index 1 is written with no visible n_pes > 1 check -- confirm
// unLoadOne cannot be set on a single-PE run.
538 if (unLoadZero) processorArray[0].
available =
false;
539 if (unLoadOne) processorArray[1].
available =
false;
// With PME on and unLoadPme set, further processors are marked unavailable;
// the per-PE test selecting them is on lines not shown.
542 if (pmeOn && unLoadPme) {
543 for (i=0; i<n_pes; i++) {
552 if (pmeOn && unLoadPme) {
553 for (i=0; i<n_pes; i++) {
555 processorArray[i].available =
false;
559 #ifdef MEM_OPT_VERSION
569 for (i=0; i<n_pes; i++) {
584 #if defined(NAMD_MIC)
586 for (i = 0; i < n_pes; i++) {
592 int nMoveableComputes=0;
594 int nIdleComputes = 0;
// Walk every object in the statistics database.
// NOTE(review): `frompe` holds stats->from_proc[j], yet the visible lines below
// keep re-reading stats->from_proc[j] directly.
597 for (j=0; j < stats->n_objs; j++) {
598 const LDObjData &this_obj = stats->objData[j];
599 int frompe = stats->from_proc[j];
// Objects whose object-manager index is not 1 appear to be charged to their
// source processor as background load.
602 if (this_obj.omID().id.idx != 1) {
605 processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
// Patch record: id and current processor.
613 patchArray[pid].
Id = pid;
615 patchArray[pid].
processor = stats->from_proc[j];
// The proxy count comes from requiredProxiesOnProcGrid or requiredProxies; the
// preprocessor or conditional selecting one of them is not visible here.
616 const int numProxies =
618 requiredProxiesOnProcGrid(pid,neighborNodes);
620 requiredProxies(pid, neighborNodes);
623 nProxies += numProxies;
625 for (
int k=0; k<numProxies; k++) {
629 processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
631 processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
632 }
else if (this_obj.migratable) {
// Migratable object: recorded as a moveable compute. Zero-load computes are
// presumably counted in nIdleComputes (the increment is not visible).
633 if ( this_obj.wallTime == 0. ) {
// First patch of the compute; a second patch is looked up only when the
// compute map reports more than one patch id for it.
637 const int p0 = computeMap->
pid(cid,0);
641 if (computeMap->
numPids(cid) > 1)
642 p1 = computeMap->
pid(cid,1);
644 computeArray[nMoveableComputes].
Id = cid;
645 computeArray[nMoveableComputes].
oldProcessor = stats->from_proc[j];
// The compute's wall time is added to its source processor's computeLoad.
646 processorArray[stats->from_proc[j]].
computeLoad += this_obj.wallTime;
// processor is set to -1, presumably meaning "no destination chosen yet".
647 computeArray[nMoveableComputes].
processor = -1;
648 computeArray[nMoveableComputes].
patch1 = p0;
649 computeArray[nMoveableComputes].
patch2 = p1;
650 computeArray[nMoveableComputes].
handle = this_obj.handle;
651 computeArray[nMoveableComputes].
load = this_obj.wallTime;
// Presumably the fall-through case: the object's wall time becomes background
// load on its source processor (the enclosing branch is not visible).
655 processorArray[stats->from_proc[j]].
backgroundLoad += this_obj.wallTime;
660 CkPrintf(
"LDB: %d computes have load of zero\n", nIdleComputes);
678 for (i=0; i<n_pes; i++) {
682 return nMoveableComputes;
// Collects into neighborNodes[] the nodes that should hold a proxy of patch
// `id`, counting them in nProxyNodes.
// The return statement is not visible in this excerpt; presumably the count of
// entries written is returned -- confirm.
689 int NamdCentLB::requiredProxies(
PatchID id,
int neighborNodes[])
692 int myNode = patchMap->
node(
id);
// IF_NEW_NODE: linear scan of neighborNodes[0..nProxyNodes) for proxyNode; the
// trailing `if` is true only when proxyNode was not found, and guards whatever
// statement follows the macro.
695 #define IF_NEW_NODE \
697 for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
698 if ( j == nProxyNodes )
// Base nodes of the neighbouring patches, skipping the patch's own node.
703 for (
int i = 0; i < numNeighbors; ++i ) {
704 const int proxyNode = patchMap->
basenode(neighbors[i]);
705 if ( proxyNode != myNode ) {
707 neighborNodes[nProxyNodes] = proxyNode;
721 int numPes = CkNumPes();
// When empty nodes outnumber patches, extra proxies are spread onto them.
// The per-patch budget is nProxyNodes + 1 + (emptyNodes-1)/numPatches, capped
// at maxNodesPerPatch.
724 if ( emptyNodes > numPatches ) {
725 int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
727 if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
// First scan: upward from myNode + 1, wrapping modulo numPes.
728 int proxyNode = (myNode + 1) % numPes;
729 while ( nProxyNodes < nodesPerPatch &&
731 if ( proxyNode != myNode ) {
733 neighborNodes[nProxyNodes] = proxyNode;
737 proxyNode = (proxyNode + 1) % numPes;
// Second scan: downward from myNode - 1, wrapping modulo numPes.
739 proxyNode = (myNode - 1 + numPes) % numPes;
740 while ( nProxyNodes < nodesPerPatch &&
742 if ( proxyNode != myNode ) {
744 neighborNodes[nProxyNodes] = proxyNode;
748 proxyNode = (proxyNode - 1 + numPes) % numPes;
// Final upward scan, bounded by `count` so it stops after one full lap over
// all numPes ranks even when the budget was not reached.
750 proxyNode = (myNode + 1) % numPes;
752 while ( nProxyNodes < nodesPerPatch ) {
755 neighborNodes[nProxyNodes] = proxyNode;
759 proxyNode = (proxyNode + 1) % numPes;
760 count ++;
if (count == numPes)
break;
// Presumably the alternative branch: only the two adjacent ranks myNode - 1
// and myNode + 1 are candidates. The upper one is accepted only when it is a
// valid rank with no patches on it; the matching test for the lower one is on
// a line not shown.
763 int proxyNode = myNode - 1;
765 if ( proxyNode != myNode ) {
767 neighborNodes[nProxyNodes] = proxyNode;
772 proxyNode = myNode + 1;
773 if ( proxyNode < numPes && ! patchMap->numPatchesOnNode(proxyNode) ) {
774 if ( proxyNode != myNode ) {
776 neighborNodes[nProxyNodes] = proxyNode;
// Variant of requiredProxies that picks extra proxy nodes by their coordinates
// in the processor grid reported by `tmgr`.
// Fills neighborNodes[] and counts entries in nProxyNodes; the return
// statement is not visible in this excerpt.
// Falls back to requiredProxies() when the grid dimensions do not account for
// every PE.
794 int NamdCentLB::requiredProxiesOnProcGrid(
PatchID id,
int neighborNodes[])
796 enum proxyHere { No, Yes };
797 int numPes = CkNumPes();
// One flag per PE marking ranks already chosen as proxy nodes. The array is
// heap-allocated and is released at each `delete [] proxyNodes` below.
798 proxyHere *proxyNodes =
new proxyHere[numPes];
802 int xsize = 0, ysize = 0, zsize = 0, tsize = 0;
803 int my_x = 0, my_y = 0, my_z = 0, my_t = 0;
806 int myNode = patchMap->
node(
id);
// Grid extents and this patch's node coordinates, as reported by tmgr.
809 xsize = tmgr.getDimNX();
810 ysize = tmgr.getDimNY();
811 zsize = tmgr.getDimNZ();
812 tsize = tmgr.getDimNT();
814 tmgr.rankToCoordinates(myNode, my_x, my_y, my_z, my_t);
// If the product of the grid extents is not the PE count, free the flag array
// and defer to the grid-agnostic requiredProxies.
816 if(xsize * ysize * zsize * tsize != CkNumPes()) {
817 delete [] proxyNodes;
818 return requiredProxies(
id, neighborNodes);
822 for ( i = 0; i < numPes; ++i )
841 bool smallFlag =
false;
842 double pnodes = CkNumPes();
// smallFlag is set when there are more patches than PEs.
844 smallFlag = (patchMap->
numPatches() > pnodes )?1:0;
// Base nodes of the neighbouring patches, each recorded at most once.
// NOTE(review): this loop starts at i = 1 while the corresponding loop in
// requiredProxies starts at 0 -- confirm neighbors[0] is skipped on purpose.
849 for ( i = 1; i < numNeighbors; ++i )
851 int proxyNode = patchMap->
basenode(neighbors[i]);
853 if (proxyNode != myNode)
854 if (proxyNodes[proxyNode] == No)
856 proxyNodes[proxyNode] = Yes;
857 neighborNodes[nProxyNodes] = proxyNode;
864 delete [] proxyNodes;
// Proxy budget. Integer arithmetic: 4 * (emptyNodes-1) is divided by
// numPatches with truncation before nProxyNodes and 1 are added.
874 int nodesPerPatch = nProxyNodes + 4 * (emptyNodes-1) / numPatches + 1;
876 int proxy_x=0, proxy_y=0, proxy_z=0;
// First sweep: offsets -1..1 on each of x, y, z (wrapped modulo the extent)
// and every t index. The zero offset with l == 0 is singled out; the action
// taken for it is on a line not shown.
882 for(k=-1; k<= 1; k++) {
883 proxy_z = (my_z + k + zsize) % zsize;
884 for(j=-1; j <= 1; j++) {
885 proxy_y = (my_y + j + ysize) % ysize;
886 for(i = -1; i <= 1; i++) {
887 proxy_x = (my_x + i + xsize) % xsize;
888 for(l = 0; l < tsize; l++) {
889 if(i == 0 && j == 0 && k == 0 && l == 0)
892 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
895 proxyNodes[proxyNode] == No) {
896 proxyNodes[proxyNode] = Yes;
897 neighborNodes[nProxyNodes] = proxyNode;
// Budget checks after each loop level; the rest of each condition is on lines
// not shown.
901 if(nProxyNodes >= nodesPerPatch ||
906 if(nProxyNodes >= nodesPerPatch ||
911 if(nProxyNodes >= nodesPerPatch ||
916 if(nProxyNodes >= nodesPerPatch ||
// Second sweep: same structure with offsets -2, 0, +2 on each axis.
923 for(k=-2; k<= 2; k+=2) {
924 proxy_z = (my_z + k + zsize) % zsize;
925 for(j=-2; j <= 2; j+=2) {
926 proxy_y = (my_y + j + ysize) % ysize;
927 for(i = -2; i <= 2; i+=2) {
928 proxy_x = (my_x + i + xsize) % xsize;
929 for(l = 0; l < tsize; l++) {
930 if(i == 0 && j == 0 && k == 0 && l == 0)
933 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
936 proxyNodes[proxyNode] == No) {
937 proxyNodes[proxyNode] = Yes;
938 neighborNodes[nProxyNodes] = proxyNode;
942 if(nProxyNodes >= nodesPerPatch ||
947 if(nProxyNodes >= nodesPerPatch ||
952 if(nProxyNodes >= nodesPerPatch ||
957 if(nProxyNodes >= nodesPerPatch ||
// Individual +2 / -2 neighbours along y, then z, then x, each added only if
// not already flagged.
// NOTE(review): these calls pass three coordinates to coordinatesToRank while
// the sweeps above pass four (including the t index) -- confirm the
// three-argument overload is the intended one.
970 proxy_y = (my_y + 2) % ysize;
971 proxy_x = my_x % xsize;
972 proxy_z = my_z % zsize;
974 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
975 if(proxyNodes[proxyNode] == No) {
976 proxyNodes[proxyNode] = Yes;
977 neighborNodes[nProxyNodes] = proxyNode;
981 proxy_y = (my_y - 2 + ysize) % ysize;
982 proxy_x = my_x % xsize;
983 proxy_z = my_z % zsize;
985 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
986 if(proxyNodes[proxyNode] == No) {
987 proxyNodes[proxyNode] = Yes;
988 neighborNodes[nProxyNodes] = proxyNode;
995 proxy_y = my_y % ysize;
996 proxy_x = my_x % xsize;
997 proxy_z = (my_z + 2) % zsize;
999 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1000 if(proxyNodes[proxyNode] == No) {
1001 proxyNodes[proxyNode] = Yes;
1002 neighborNodes[nProxyNodes] = proxyNode;
1006 proxy_y = my_y % ysize;
1007 proxy_x = my_x % xsize;
1008 proxy_z = (my_z - 2 + zsize) % zsize;
1010 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1011 if(proxyNodes[proxyNode] == No) {
1012 proxyNodes[proxyNode] = Yes;
1013 neighborNodes[nProxyNodes] = proxyNode;
1020 proxy_y = my_y % ysize;
1021 proxy_x = (my_x + 2) % xsize;
1022 proxy_z = my_z % zsize;
1024 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1025 if(proxyNodes[proxyNode] == No) {
1026 proxyNodes[proxyNode] = Yes;
1027 neighborNodes[nProxyNodes] = proxyNode;
1031 proxy_y = my_y % ysize;
1032 proxy_x = (my_x - 2 + xsize) % xsize;
1033 proxy_z = my_z % zsize;
1035 proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1036 if(proxyNodes[proxyNode] == No) {
1037 proxyNodes[proxyNode] = Yes;
1038 neighborNodes[nProxyNodes] = proxyNode;
// Release the per-PE flag array before leaving.
1048 delete [] proxyNodes;
BlockLoad::TempStorage load
std::ostream & iINFO(std::ostream &s)
represents bonded compute
NamdCentLB * AllocateNamdCentLB()
void setNewNumPartitions(ComputeID cid, char numPartitions)
BigReal ldbRelativeGrainsize
static PatchMap * Object()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
SimParameters * simParameters
#define LDBSTRAT_REFINEONLY
CLBMigrateMsg * Strategy(LDStats *stats)
int isMICProcessor(int pe)
static double averageLoad
int numaway_b(void) const
void insert(InfoRecord *)
int basenode(int pid) const
BigReal ldbHomeBackgroundScaling
static Units next(Units u)
int numPartitions(ComputeID cid)
void setNewNode(ComputeID cid, NodeID node)
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
static LdbCoordinator * Object()
BigReal ldbBackgroundScaling
#define LDBSTRAT_COMPREHENSIVE
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
int numPatches(void) const
static ComputeMap * Object()
BigReal ldbPMEBackgroundScaling
int numPids(ComputeID cid)
int numPatchesOnNode(int node)
void unchecked_insert(InfoRecord *)
int numaway_c(void) const
NamdCentLB(const CkLBOptions &opt)
int pid(ComputeID cid, int i)
infostream & endi(infostream &s)
int isOutputProcessor(int pe)
int numaway_a(void) const