#include "WorkDistrib.decl.h"
#include "main.decl.h"
#include "TopoManager.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define __thread __declspec(thread)

#define MIN_DEBUG_LEVEL 2

#ifdef MEM_OPT_VERSION
  randtopo = CmiGetArgFlag(argv, "+randtopo");
  if ( CkMyPe() >= CkNumPes() ) return;
#if CCD_COND_FN_EXISTS
  CcdCallOnCondition(CcdTOPOLOGY_AVAIL, (CcdCondFn) build_ordering, (void*)0);
#else
  CcdCallOnCondition(CcdTOPOLOGY_AVAIL, (CcdVoidFn) build_ordering, (void*)0);
#endif
  CkpvAccess(BOCclass_group).workDistrib = thisgroup;
  patchMapArrived = false;
  computeMapArrived = false;

#if CMK_SMP
#define MACHINE_PROGRESS
#else
#define MACHINE_PROGRESS { traceUserEvent(eventMachineProgress); CmiMachineProgressImpl(); }
  if ( CkMyNodeSize() > 1 ) NAMD_bug("CkMyNodeSize() > 1 for non-smp build");
#endif
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  return (a & c) - (b & c);
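// Note: this orders integers by their bit-reversed binary values: the
// scan finds the lowest bit in which a and b differ, and that bit alone
// decides the sign of the result. Sorting ranks this way interleaves
// them, e.g. ranks 0..7 come out as 0, 4, 2, 6, 1, 5, 3, 7, spreading
// consecutive work far apart in rank space.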
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
    if ( c < 0 ) return true;
    if ( c > 0 ) return false;
    if ( c < 0 ) return true;
    if ( c > 0 ) return false;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  const int numPhys = CmiNumPhysicalNodes();
  const int numNode = CmiNumNodes();
  const int numPe = CmiNumPes();
  for ( int ph=0; ph<numPhys; ++ph ) {
    CmiGetPesOnPhysicalNode(ph, &pes, &npes);
    for ( int i=0; i<npes; ++i, ++k ) {
    numNodeInPhys[ph] = 0;
    for ( int i=0, j=0; i<npes; i += CmiNodeSize(CmiNodeOf(pes[i])), ++j ) {
      rankInPhysOfNode[CmiNodeOf(pes[i])] = j;
      numNodeInPhys[ph] += 1;
  if ( ! CkMyNode() ) {
    iout << iWARN << "RANDOMIZING PHYSICAL NODE ORDERING\n" << endi;
  for ( int j=0; j<numPhys; ++j ) {
    randPhysOrder[j] = j;
  for ( int j=0, k=0; j<numPhys; ++j ) {
    const int ph = randPhysOrder[j];
    CmiGetPesOnPhysicalNode(ph, &pes, &npes);
    for ( int i=0; i<npes; ++i, ++k ) {
  for ( int i=0; i<numPe; ++i ) {
  for ( int i=0; i<numPe; ++i ) {
  if ( 0 && CmiMyNode() == 0 ) for ( int i=0; i<numPe; ++i ) {
    CkPrintf("order %5d %5d %5d %5d %5d\n", i,
static void recursive_bisect_coord(
    int x_begin, int x_end, int y_begin, int y_end,
    int *pe_begin, ScaledPosition *coord,
    int *result, int ydim)
{
  int x_len = x_end - x_begin;
  int y_len = y_end - y_begin;
  if ( x_len == 1 && y_len == 1 ) {
    // only one pe and one grid cell remain; make the assignment
    if ( 0 ) CkPrintf("pme %5d %5d on pe %5d at %f %f\n", x_begin, y_begin, *pe_begin,
                      coord[*pe_begin].x, coord[*pe_begin].y);
    result[x_begin*ydim + y_begin] = *pe_begin;
  int *pe_end = pe_begin + x_len * y_len;
  if ( x_len >= y_len ) {
    int x_split = x_begin + x_len / 2;
    int* pe_split = pe_begin + (x_split - x_begin) * y_len;
    int y_split = y_begin + y_len / 2;
    int* pe_split = pe_begin + (y_split - y_begin) * x_len;
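// Note: recursive_bisect_coord() halves the longer side of the grid
// rectangle and gives each half the matching share of the pe range
// (pe_split), with the pes presumably sorted by the corresponding
// coordinate in the elided lines; recursion ends at a 1x1 cell, where
// the single remaining pe is written into result[]. The net effect is
// that neighboring PME grid slots land on spatially nearby pes.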
  int numpes = CkNumPes();
  for ( int i=0; i<numpes; ++i ) {
  for ( int i=0, npatches=patchMap->numPatches(); i<npatches; ++i ) {
    int pe = patchMap->node(i);
    sumPos[pe] += patchMap->center(i);
  const int npmepes = xdim*ydim;
  for ( int i=0; i<npmepes; ++i ) {
    int pe = sortpes[i] = pmepes[i];
    int node = CkNodeOf(pe);
    int nsize = CkNodeSize(node);
    int pe2 = CkNodeFirst(node);
    for ( int j=0; j<nsize; ++j, ++pe2 ) {
    int node = CmiPhysicalNodeID(pe);
    CmiGetPesOnPhysicalNode(node, &nlist, &nsize);
    for ( int j=0; j<nsize; ++j ) {
    avgPos[pe] = sum / cnt;
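// Note: a plausible reading of this fragment is that sumPos accumulates
// patch centers per pe, the node and physical-node loops fold each pe's
// neighbors into a shared sum, and avgPos[pe] becomes the average patch
// position associated with pe, so the PME pes can then be matched to
// grid slots by proximity via recursive_bisect_coord() above.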
  saveComputeMapReturnEP = ep;
  saveComputeMapReturnChareID = chareID;
  CProxy_WorkDistrib(thisgroup).recvComputeMapChanges(mapMsg);
    for (i=0; i<nc; i++) {
      int data = computeMap->newNode(i);
    for (i=0; i<nc; i++) {
  } else if ( ! CkMyRank() ) {
    if ( i != nc ) NAMD_bug("WorkDistrib::recvComputeMapChanges check 1 failed\n");
    for (i=0; i<nc; i++) {
    if ( i != nc ) NAMD_bug("WorkDistrib::recvComputeMapChanges check 2 failed\n");
    for (i=0; i<nc; i++) {
    if ( i != nc ) NAMD_bug("WorkDistrib::recvComputeMapChanges check 3 failed\n");
  CkCallback cb(CkIndex_WorkDistrib::doneSaveComputeMap(NULL), 0, thisgroup);
  contribute(0, NULL, CkReduction::random, cb);
  CkSendMsgBranch(saveComputeMapReturnEP, CkAllocMsg(0,0,0), 0, saveComputeMapReturnChareID);
#ifdef MEM_OPT_VERSION

void WorkDistrib::fillAtomListForOnePatch(int pid, FullAtomList &alist) {
                         0.5*(patchMap->min_b(pid)+patchMap->max_b(pid)),
                         0.5*(patchMap->min_c(pid)+patchMap->max_c(pid)));
  int n = alist.size();
  for ( int j=0; j < n; j++ )
      if ( a[j].migrationGroupSize ) {
        if ( a[j].migrationGroupSize != a[j].hydrogenGroupSize ) {
          // walk the hydrogen groups that make up this migration group
          for ( int k=a[j].hydrogenGroupSize; k<mgs;
                k+=a[j+k].hydrogenGroupSize ) {
          pos = lattice.nearest(pos,center,&mother_transform);
        a[j].position = lattice.nearest(a[j].position, center, &(a[j].transform));
      } else if ((a[j].status & DrudeAtom)!=0) {
    for ( int j=0; j < n; j+=size ) {
        NAMD_bug("Mother atom with hydrogenGroupSize of 0!");
      for ( int k = 0; k < size; ++k ) {
        allfixed = ( allfixed && (a[j+k].atomFixed) );
      for ( int k = 0; k < size; ++k ) {
    int numAtomsInPatch = n;
    int numFixedAtomsInPatch = 0;
    int numAtomsInFixedGroupsInPatch = 0;
    for ( int j=0; j < n; j++ ) {
      numFixedAtomsInPatch += ( a[j].atomFixed ? 1 : 0 );
      numAtomsInFixedGroupsInPatch += ( a[j].groupFixed ? 1 : 0 );
    iout << "PATCH_DETAILS:"
         << " on proc " << CkMyPe()
         << " patch " << patchId
         << " atoms " << numAtomsInPatch
         << " fixed_atoms " << numFixedAtomsInPatch
         << " fixed_groups " << numAtomsInFixedGroupsInPatch
  int totalAtoms = inAtoms.size();
  for ( i=0; i<totalAtoms; i++ )
    Real atomMs=inAtoms[i].mass;
      kbToverM = sqrt(kbT * 1.0 / atomMs);
    for (randnum=0.0, j=0; j<12; j++)
    {
      randnum += vel_random.uniform();
    }
    randnum -= 6.0;
    inAtoms[i].velocity.x = randnum*kbToverM;
    for (randnum=0.0, j=0; j<12; j++)
    {
      randnum += vel_random.uniform();
    }
    randnum -= 6.0;
    inAtoms[i].velocity.y = randnum*kbToverM;
    for (randnum=0.0, j=0; j<12; j++)
    {
      randnum += vel_random.uniform();
    }
    randnum -= 6.0;
    inAtoms[i].velocity.z = randnum*kbToverM;
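// Note: summing 12 independent uniform(0,1) samples and subtracting 6
// (the Irwin-Hall construction) gives an approximately standard normal
// deviate with zero mean and unit variance; scaling by
// kbToverM = sqrt(kB*T/m) then yields a Maxwell-Boltzmann velocity
// component for the atom.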
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();
    read_binary_file((std::string(basename)+".coor").c_str(), positions, numAtoms);
    read_binary_file((std::string(basename)+".vel").c_str(), velocities, numAtoms);
    PDB coorpdb((std::string(basename)+".coor").c_str());
      NAMD_die("Incorrect atom count in coordinate pdb file");
    velocities_from_PDB((std::string(basename)+".vel").c_str(), velocities, numAtoms);
  if (current == NULL) {
      velocities_from_PDB(current->data, velocities, numAtoms);
      velocities_from_binfile(current->data, velocities, numAtoms);
    random_velocities(params->initialTemp, molecule, velocities, numAtoms);
  remove_com_motion(velocities, molecule, numAtoms);
  for ( i=0; i < numAtoms; i++ ) {
    if ( ! h.isMP ) continue;
  for ( i=0; i < sortAtoms.size(); i++ ) {
      sortAtoms.size(),numAtoms,
  for ( int pid = 0; pid < numPatches; ++pid ) {
    int iend = breaks[pid];
    for ( ; i<iend; ++i ) {
      for ( int k=0; k<mgs; ++k ) {
    CkPrintf("patch %d (%d %d %d) has %d atoms\n",
  for ( i=0; i < numAtoms; i++ )
  delete [] velocities;
    int n = atoms[i].size();
        if ( a[j].migrationGroupSize ) {
          if ( a[j].migrationGroupSize != a[j].hydrogenGroupSize ) {
            for ( int k=a[j].hydrogenGroupSize; k<mgs;
                  k+=a[j+k].hydrogenGroupSize ) {
            pos = lattice.nearest(pos,center,&mother_transform);
          a[j].position = lattice.nearest(a[j].position, center, &(a[j].transform));
    if ( alchOn || lesOn || pairInteractionOn || pressureProfileTypes) {
      int size, allfixed, k;
      for ( j=0; j < n; j+=size ) {
          NAMD_bug("Mother atom with hydrogenGroupSize of 0!");
        for ( k = 0; k < size; ++k ) {
          allfixed = ( allfixed && (a[j+k].atomFixed) );
        for ( k = 0; k < size; ++k ) {
    int numAtomsInPatch = n;
    int numFixedAtomsInPatch = 0;
    int numAtomsInFixedGroupsInPatch = 0;
    for ( j=0; j < n; j++ ) {
      numFixedAtomsInPatch += ( a[j].atomFixed ? 1 : 0 );
      numAtomsInFixedGroupsInPatch += ( a[j].groupFixed ? 1 : 0 );
    iout << "PATCH_DETAILS:"
         << " patch " << patchId
         << " atoms " << numAtomsInPatch
         << " fixed_atoms " << numFixedAtomsInPatch
         << " fixed_groups " << numAtomsInFixedGroupsInPatch
  CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();
#ifdef MEM_OPT_VERSION
    int numAtoms = atoms[i].size();
    if ( numAtoms > maxAtoms ) { maxAtoms = numAtoms; maxPatch = i; }
  iout << iINFO << "LARGEST PATCH (" << maxPatch <<
    ") HAS " << maxAtoms << " ATOMS\n" << endi;
  DebugM(3,"Created " << i << " patches so far.\n");
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();
    if (patchMap->node(i) != node->myid() )
      DebugM(3,"patchMgr->movePatch(" << i << "," << patchMap->node(i) << ")\n");
  CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();
  if ( CkNumPes() == 1 ) {
    patchMapArrived = true;
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
#ifdef NODEAWARE_PROXY_SPANNINGTREE
        || CkNumPes() > CkNumNodes()
        ) && ( CkNumNodes() > 1
#ifdef NODEAWARE_PROXY_SPANNINGTREE
  if ( CkNumPes() > CkNumNodes() && CkNumNodes() > 1
  CProxy_WorkDistrib workProxy(thisgroup);
  workProxy[0].savePatchMap(mapMsg);
  if ( CkMyRank() ) patchMapArrived = true;
  if ( patchMapArrived && CkMyPe() ) {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
#ifdef NODEAWARE_PROXY_SPANNINGTREE
        || CkNumPes() > CkNumNodes()
        ) && ( CkNumNodes() > 1
#ifdef NODEAWARE_PROXY_SPANNINGTREE
  if ( CkNumPes() > CkNumNodes() && CkNumNodes() > 1
  if ( patchMapArrived ) {
    if ( CkMyRank() + 1 < CkNodeSize(CkMyNode()) ) {
      ((CProxy_WorkDistrib(thisgroup))[CkMyPe()+1]).savePatchMap(msg);
    patchMapArrived = true;
  int self = CkMyNode();
  int range_begin = 0;
  int range_end = CkNumNodes();
  while ( self != range_begin ) {
    ++range_begin;
    int split = range_begin + ( range_end - range_begin ) / 2;
    if ( self < split ) { range_end = split; }
    else { range_begin = split; }
  }
  int send_near = self + 1;
  int send_far = send_near + ( range_end - send_near ) / 2;
  if ( send_far < range_end ) pids[npid++] = CkNodeFirst(send_far);
  if ( send_near < send_far ) pids[npid++] = CkNodeFirst(send_near);
  pids[npid++] = CkMyPe();
  CProxy_WorkDistrib(thisgroup).savePatchMap(msg,npid,pids);
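// Note: this implements a binary broadcast tree over nodes: the while
// loop re-traces the recursive halving of [0, CkNumNodes()) to find the
// interval this node owns, and the message is forwarded to the first pe
// of the near and far halves of that interval, giving log-depth fan-out
// instead of a serial send from node 0.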
  if ( CkMyRank() ) return;
  if ( CkNumNodes() == 1 ) {
    computeMapArrived = true;
  } else if ( ! CkMyRank() ) {
    computeMapArrived = true;
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
#ifndef MEM_OPT_VERSION
  double maxNumPatches = 1.e9;
  DebugM(3,"Mapping patches\n");
  if ( lattice.a_p() && lattice.b_p() && lattice.c_p() ) {
    xmin = 0.;  xmax = 0.;
    printf("+++ center=%.4f %.4f %.4f\n",
    printf("+++ xmin=%.4f xmax=%.4f\n", xmin.x, xmax.x);
    printf("+++ ymin=%.4f ymax=%.4f\n", xmin.y, xmax.y);
    printf("+++ zmin=%.4f zmax=%.4f\n", xmin.z, xmax.z);
  iout << iINFO << "ORIGINAL ATOMS MINMAX IS " << xmin << " " << xmax << "\n" << endi;
  double frac = ( (double)totalAtoms - 10000. ) / (double)totalAtoms;
  if ( frac < 0.9 ) { frac = 0.9; }
  iout << iINFO << "ADJUSTED ATOMS MINMAX IS " << xmin << " " << xmax << "\n" << endi;
  origin_shift = lattice.a_r() * lattice.origin();
  xmin.x -= origin_shift;
  xmax.x -= origin_shift;
  origin_shift = lattice.b_r() * lattice.origin();
  xmin.y -= origin_shift;
  xmax.y -= origin_shift;
  origin_shift = lattice.c_r() * lattice.origin();
  xmin.z -= origin_shift;
  xmax.z -= origin_shift;
  if (params->LCPOOn && patchSize < 32.4) {
    if ( twoAwayX > 0 || twoAwayY > 0 || twoAwayZ > 0 ) {
      iout << iWARN << "Ignoring twoAway[XYZ] due to LCPO SASA implementation.\n" << endi;
    twoAwayX = twoAwayY = twoAwayZ = 0;
  if ( twoAwayX > 0 ) maxNumPatches = 1.e9;
  if ( twoAwayY > 0 ) maxNumPatches = 1.e9;
  if ( twoAwayZ > 0 ) maxNumPatches = 1.e9;
    iout << iINFO << "LIMITING NUMBER OF PATCHES TO " << maxNumPatches << "\n" << endi;
  int numpes = CkNumPes();
  delete [] patchMap->nPatchesOnNode;
  patchMap->nPatchesOnNode = new int[numpes];
  memset(patchMap->nPatchesOnNode, 0, numpes*sizeof(int));
#if defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC)
                      twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
  if ( numPatches < numpes && twoAwayX < 0 ) {
                      twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
  if ( numPatches < numpes && twoAwayY < 0 ) {
                      twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
  if ( numPatches < numpes && twoAwayZ < 0 ) {
                      twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
  if ( numPatches < numpes ) {
#if defined(NAMD_MIC)
    NAMD_die("MIC-enabled NAMD requires at least one patch per thread.");
  if ( numPatches % numpes && numPatches <= 1.4 * numpes ) {
    int exactFit = numPatches - numPatches % numpes;
    int newNumPatches = patchMap->sizeGrid(
                      twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
    if ( newNumPatches == exactFit ) {
      iout << iINFO << "REDUCING NUMBER OF PATCHES TO IMPROVE LOAD BALANCE\n" << endi;
      maxNumPatches = exactFit;
  patchMap->makePatches(xmin,xmax,lattice,patchSize,maxNumPatches,
                      twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
  int availPes = numpes;
#ifdef MEM_OPT_VERSION
  int numPatches = patchMap->sizeGrid(
                      twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
  if ( ( numPatches > (0.3*availPes) || numPatches > maxNumPatches
       ) && twoAwayZ < 0 ) {
                      twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
  if ( ( numPatches > (0.6*availPes) || numPatches > maxNumPatches
       ) && twoAwayY < 0 ) {
                      twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
  if ( ( numPatches > availPes || numPatches > maxNumPatches
       ) && twoAwayX < 0 ) {
                      twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
  if ( numPatches > availPes && numPatches <= (1.4*availPes) && availPes <= maxNumPatches ) {
    int newNumPatches = patchMap->sizeGrid(
                      twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
    if ( newNumPatches <= availPes && numPatches <= (1.4*newNumPatches) ) {
      iout << iINFO << "REDUCING NUMBER OF PATCHES TO IMPROVE LOAD BALANCE\n" << endi;
      maxNumPatches = availPes;
  patchMap->makePatches(xmin,xmax,lattice,patchSize,maxNumPatches,
                      twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
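// Note: in this branch the ternaries treat the unset value -1 as true,
// so sizing starts from two-away splitting in all three dimensions and
// the cascade above appears to disable two-away z, then y, then x once
// the patch count exceeds 0.3x, 0.6x, and 1.0x the available pes (or
// the patch limit); finally, a count within 40% above the pe count is
// reduced to at most one patch per pe to improve load balance.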
#if (CMK_BLUEGENEP | CMK_BLUEGENEL) && USE_TOPOMAP
  int numPes = tmgr.getDimNX() * tmgr.getDimNY() * tmgr.getDimNZ();
  if (numPes > patchMap->numPatches() && (assignPatchesTopoGridRecBisection() > 0)) {
    CkPrintf("Blue Gene/L topology partitioner finished successfully\n");
  assignPatchesSpaceFillingCurve();
  int *nAtoms = new int[nNodes];
  for ( i=0; i < nNodes; i++ )
#ifdef MEM_OPT_VERSION
    numAtoms += patchMap->numAtoms(i);
    nAtoms[patchMap->node(i)] += patchMap->numAtoms(i);
    if (patchMap->patch(i)) {
  if ( numAtoms != Node::Object()->molecule->numAtoms ) {
    for ( i=0; i < nNodes; i++ )
      iout << iINFO << nAtoms[i] << " atoms assigned to node " << i << "\n" << endi;
    NAMD_die("Incorrect atom count in WorkDistrib::assignNodeToPatch\n");
void WorkDistrib::assignPatchesToLowestLoadNode()
{
  int assignedNode = 0;
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  int *load = new int[ncpus];
  int *assignedNodes = new int[patchMap->numPatches()];
  for ( int i=0; i<ncpus; i++ ) {
  CkPrintf("assignPatchesToLowestLoadNode\n");
  int defaultNode = 0;
  for ( pid=0; pid < patchMap->numPatches(); pid++ ) {
    assignedNode = defaultNode;
    for ( int i=assignedNode + 1; i < ncpus; i++ ) {
      if (load[i] < load[assignedNode]) assignedNode = i;
    assignedNodes[pid] = assignedNode;
#ifdef MEM_OPT_VERSION
    load[assignedNode] += patchMap->numAtoms(pid) + 1;
  sortNodesAndAssign(assignedNodes);
  delete[] assignedNodes;
void WorkDistrib::assignPatchesBitReversal()
{
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  if ( ncpus <= npatches )
    NAMD_bug("WorkDistrib::assignPatchesBitReversal called improperly");
  for ( int i = 1; i < ncpus; ++i ) {
  sortNodesAndAssign(seq.begin());
  if ( ncpus > 2*npatches ) sortNodesAndAssign(seq.begin()+npatches, 1);
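// Note: assignPatchesBitReversal() appears to fill seq with pes in
// bit-reversed order (see compare_bit_reversed above) and assign the
// patches to the first npatches entries, so spatially consecutive
// patches land on well-separated pes; when pes outnumber patches by
// more than 2x, the next npatches entries are used as base nodes for
// proxies (the second sortNodesAndAssign call with baseNodes = 1).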
    float a1 = ((float)a_total)/((float)npatches);
    float b1 = ((float)b_total)/((float)npatches);
    float c1 = ((float)c_total)/((float)npatches);
    return ((a1 == a2) && (b1 == b2) && (c1 == c2));

    float a1 = ((float)a_total)/((float)npatches);
    float b1 = ((float)b_total)/((float)npatches);
    float c1 = ((float)c_total)/((float)npatches);
    return ( (a1 < a2) || ((a1 == a2) && (b1 < b2)) ||
             ((a1 == a2) && (b1 == b2) && (c1 < c2)) );
void WorkDistrib::sortNodesAndAssign(int *assignedNode, int baseNodes) {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  for ( i=0; i < nnodes; ++i ) {
    allnodes[i].node = i;
  for ( pid=0; pid<npatches; ++pid ) {
    allnodes[assignedNode[pid]].npatches++;
    allnodes[assignedNode[pid]].a_total += patchMap->index_a(pid);
    allnodes[assignedNode[pid]].b_total += patchMap->index_b(pid);
    allnodes[assignedNode[pid]].c_total += patchMap->index_c(pid);
  usednodes.resize(0);
  for ( i=0; i < nnodes; ++i ) {
    if ( allnodes[i].npatches ) usednodes.add(allnodes[i]);
  for ( i=0; i < nnodes; ++i ) {
    if ( allnodes[pe].npatches ) allnodes[usednodes[i2++].node].node = pe;
  for ( pid=0; pid<npatches; ++pid ) {
    if ( ! baseNodes ) {
      patchMap->assignNode(pid, allnodes[assignedNode[pid]].node);
void WorkDistrib::assignPatchesRoundRobin()
{
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  int *assignedNode = new int[patchMap->numPatches()];
  for ( pid=0; pid < patchMap->numPatches(); pid++ ) {
    assignedNode[pid] = pid % ncpus;
  sortNodesAndAssign(assignedNode);
  delete [] assignedNode;
void WorkDistrib::assignPatchesRecursiveBisection()
{
  int *assignedNode = new int[patchMap->numPatches()];
  int usedNodes = numNodes;
  int unusedNodes = 0;
  CkPrintf("assignPatchesRecursiveBisection\n");
  unusedNodes = numNodes - usedNodes;
  if ( recBisec.partition(assignedNode) ) {
    if ( unusedNodes != 0 ) {
      for ( int i=0; i<patchMap->numPatches(); ++i ) {
        assignedNode[i] += unusedNodes;
    sortNodesAndAssign(assignedNode);
    delete [] assignedNode;
    delete [] assignedNode;
      << "WorkDistrib: Recursive bisection fails, "
      << "invoking space-filling curve algorithm\n";
    assignPatchesSpaceFillingCurve();
  return CmiGetFirstPeOnPhysicalNode(CmiPhysicalNodeID(pe));
    int na = tmgr.getDimNA();
    int nb = tmgr.getDimNB();
    int nc = tmgr.getDimNC();
    int nd = tmgr.getDimND();
    int ne = tmgr.getDimNE();
    int na = tmgr.getDimNX();
    int nb = tmgr.getDimNY();
    int nc = tmgr.getDimNZ();
    for ( int i=0; i<na; ++i ) { a_flags[i] = 0; }
    for ( int i=0; i<nb; ++i ) { b_flags[i] = 0; }
    for ( int i=0; i<nc; ++i ) { c_flags[i] = 0; }
    for ( int i=0; i<nd; ++i ) { d_flags[i] = 0; }
    for ( int i=0; i<ne; ++i ) { e_flags[i] = 0; }
    int npes = CkNumPes();
    for ( int pe=0; pe<npes; ++pe ) {
      tmgr.rankToCoordinates(fixpe(pe),a,b,c,d,e,t);
      tmgr.rankToCoordinates(fixpe(pe),a,b,c,t);
      if ( a < 0 || a >= na ) NAMD_bug("inconsistent torus topology!");
      if ( b < 0 || b >= nb ) NAMD_bug("inconsistent torus topology!");
      if ( c < 0 || c >= nc ) NAMD_bug("inconsistent torus topology!");
      if ( d < 0 || d >= nd ) NAMD_bug("inconsistent torus topology!");
      if ( e < 0 || e >= ne ) NAMD_bug("inconsistent torus topology!");
    iout << iINFO << "TORUS A SIZE " << na << " USING";
    for ( int i=0; i<na; ++i ) { if ( a_flags[i] ) iout << " " << i; }
    iout << iINFO << "TORUS B SIZE " << nb << " USING";
    for ( int i=0; i<nb; ++i ) { if ( b_flags[i] ) iout << " " << i; }
    iout << iINFO << "TORUS C SIZE " << nc << " USING";
    for ( int i=0; i<nc; ++i ) { if ( c_flags[i] ) iout << " " << i; }
    iout << iINFO << "TORUS D SIZE " << nd << " USING";
    for ( int i=0; i<nd; ++i ) { if ( d_flags[i] ) iout << " " << i; }
    iout << iINFO << "TORUS E SIZE " << ne << " USING";
    for ( int i=0; i<ne; ++i ) { if ( e_flags[i] ) iout << " " << i; }
    if ( tmgr.absA(na) == 0 )
    if ( tmgr.absX(na) == 0 )
      for ( int i=0, gaplen=0, gapstart=0; i<2*na; ++i ) {
        if ( a_flags[i%na] ) gapstart = i+1;
        else if ( i - gapstart >= gaplen ) {
          a_rot = 2*na-i-1; gaplen = i - gapstart;
    if ( tmgr.absB(nb) == 0 )
    if ( tmgr.absY(nb) == 0 )
      for ( int i=0, gaplen=0, gapstart=0; i<2*nb; ++i ) {
        if ( b_flags[i%nb] ) gapstart = i+1;
        else if ( i - gapstart >= gaplen ) {
          b_rot = 2*nb-i-1; gaplen = i - gapstart;
    if ( tmgr.absC(nc) == 0 )
    if ( tmgr.absZ(nc) == 0 )
      for ( int i=0, gaplen=0, gapstart=0; i<2*nc; ++i ) {
        if ( c_flags[i%nc] ) gapstart = i+1;
        else if ( i - gapstart >= gaplen ) {
          c_rot = 2*nc-i-1; gaplen = i - gapstart;
    if ( tmgr.absD(nd) == 0 )
      for ( int i=0, gaplen=0, gapstart=0; i<2*nd; ++i ) {
        if ( d_flags[i%nd] ) gapstart = i+1;
        else if ( i - gapstart >= gaplen ) {
          d_rot = 2*nd-i-1; gaplen = i - gapstart;
    if ( tmgr.absE(ne) == 0 )
      for ( int i=0, gaplen=0, gapstart=0; i<2*ne; ++i ) {
        if ( e_flags[i%ne] ) gapstart = i+1;
        else if ( i - gapstart >= gaplen ) {
          e_rot = 2*ne-i-1; gaplen = i - gapstart;
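// Note: each loop above scans the doubled index range 0..2n-1 for the
// longest run of unused coordinates in that torus dimension (a gap that
// may wrap around the ring); the resulting *_rot rotation shifts the
// coordinates so the allocated region becomes contiguous, making the
// min/max extents computed below meaningful. The abs*(n) == 0 guards
// appear to restrict this to dimensions that actually wrap.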
    int a_min=na, a_max=-1;
    int b_min=nb, b_max=-1;
    int c_min=nc, c_max=-1;
    int d_min=nd, d_max=-1;
    int e_min=ne, e_max=-1;
    for ( int pe=0; pe<npes; ++pe ) {
      tmgr.rankToCoordinates(fixpe(pe),a,b,c,d,e,t);
      tmgr.rankToCoordinates(fixpe(pe),a,b,c,t);
      if ( a < a_min ) a_min = a;
      if ( b < b_min ) b_min = b;
      if ( c < c_min ) c_min = c;
      if ( d < d_min ) d_min = d;
      if ( e < e_min ) e_min = e;
      if ( a > a_max ) a_max = a;
      if ( b > b_max ) b_max = b;
      if ( c > c_max ) c_max = c;
      if ( d > d_max ) d_max = d;
      if ( e > e_max ) e_max = e;
    int a_len = a_max - a_min + 1;
    int b_len = b_max - b_min + 1;
    int c_len = c_max - c_min + 1;
    int d_len = d_max - d_min + 1;
    int e_len = e_max - e_min + 1;
    lensort[0] = (a_len << 3) + 0;
    lensort[1] = (b_len << 3) + 1;
    lensort[2] = (c_len << 3) + 2;
    lensort[3] = (d_len << 3) + 3;
    lensort[4] = (e_len << 3) + 4;
    for ( int i=0; i<5; ++i ) { if ( (lensort[i] & 7) == 0 ) a_dim = 4-i; }
    for ( int i=0; i<5; ++i ) { if ( (lensort[i] & 7) == 1 ) b_dim = 4-i; }
    for ( int i=0; i<5; ++i ) { if ( (lensort[i] & 7) == 2 ) c_dim = 4-i; }
    for ( int i=0; i<5; ++i ) { if ( (lensort[i] & 7) == 3 ) d_dim = 4-i; }
    for ( int i=0; i<5; ++i ) { if ( (lensort[i] & 7) == 4 ) e_dim = 4-i; }
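// Note: each lensort entry packs a dimension's extent into the high
// bits and the dimension's identity into the low three bits, so sorting
// lensort (in elided code) orders the dimensions by extent; the loops
// above then recover each dimension's rank, with 4-i assigning rank 0
// to the longest dimension under an ascending sort.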
    if ( a_len >= b_len && a_len >= c_len ) {
      if ( b_len >= c_len ) {
    } else if ( b_len >= a_len && b_len >= c_len ) {
      if ( a_len >= c_len ) {
      if ( a_len >= b_len ) {
    iout << iINFO << "TORUS MINIMAL MESH SIZE IS " << a_len << " BY " << b_len << " BY " << c_len
         << " BY " << d_len << " BY " << e_len
      tmgr.rankToCoordinates(fixpe(pe),a,b,c,d,e,t);
      tmgr.rankToCoordinates(fixpe(pe),a,b,c,t);
    int crds1[3], crds2[3];
    for ( int i=0; i<3; ++i ) {
      if ( crds1[d] != crds2[d] ) return ( crds1[d] < crds2[d] );
    return ( index[pe1] < index[pe2] );
    if ( node_begin == node_end ) return node_begin;
    int tmins[3], tmaxs[3], tlens[3], sortdims[3];
    coords(*node_begin, tmins);
    coords(*node_begin, tmaxs);
    for ( int *peitr = node_begin; peitr != node_end; ++peitr ) {
      for ( int i=0; i<3; ++i ) {
        if ( tvals[i] < tmins[i] ) tmins[i] = tvals[i];
        if ( tvals[i] > tmaxs[i] ) tmaxs[i] = tvals[i];
    for ( int i=0; i<3; ++i ) {
      tlens[i] = tmaxs[i] - tmins[i];
    sortdims[0] = splitdim;
    for ( int i=0, j=0; i<3; ++i ) {
      if ( i != splitdim ) sortdims[++j] = i;
    if ( tlens[sortdims[1]] < tlens[sortdims[2]] ) {
      int tmp = sortdims[1];
      sortdims[1] = sortdims[2];
    int *nodes = node_begin;
    int nnodes = node_end - node_begin;
    int c_split = coord(nodes[0],splitdim);
    for ( int i=0; i<nnodes; ++i ) {
      if ( coord(nodes[i],splitdim) != c_split ) {
        int mid = (nnodes+1)/2;
        if ( abs(i-mid) < abs(i_split-mid) ) {
          c_split = coord(i,splitdim);
    for ( int i=0; i<nnodes; ++i ) {
      if ( ! CmiPeOnSamePhysicalNode(nodes[i_split],nodes[i]) ) {
        int mid = (nnodes+1)/2;
        if ( abs(i-mid) < abs(i_split-mid) ) i_split = i;
    return ( node_begin + i_split );
    if ( a1 < a2 ) return true;
    if ( a1 > a2 ) return false;
    int dir = ( (a1 & 1) ? -1 : 1 );
    if ( b1 * dir < b2 * dir ) return true;
    if ( b1 * dir > b2 * dir ) return false;
    dir *= ( (b1 & 1) ? -1 : 1 );
    if ( c1 * dir < c2 * dir ) return true;
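// Note: this comparison implements a serpentine (boustrophedon) walk of
// the patch grid: the primary index increases monotonically while the
// comparison direction for the next index flips with the parity of the
// previous one, so consecutive patches in the resulting order are
// always grid neighbors, which is what recursive_bisect_with_curve()
// relies on below.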
    if ( a1 < a2 ) return true;
    if ( a1 > a2 ) return false;
    int dir = ( (a1 & 1) ? -1 : 1 );
    if ( b1 * dir < b2 * dir ) return true;
    if ( b1 * dir > b2 * dir ) return false;
    dir *= ( (b1 & 1) ? -1 : 1 );
    if ( c1 * dir < c2 * dir ) return true;

    if ( a1 < a2 ) return true;
    if ( a1 > a2 ) return false;
    int dir = ( (a1 & 1) ? -1 : 1 );
    if ( b1 * dir < b2 * dir ) return true;
    if ( b1 * dir > b2 * dir ) return false;
    dir *= ( (b1 & 1) ? -1 : 1 );
    if ( c1 * dir < c2 * dir ) return true;
static void recursive_bisect_with_curve(
    int *patch_begin, int *patch_end,
    int *node_begin, int *node_end,
    double *patchLoads,
    double *sortedLoads,
    int *assignedNode,
    TopoManagerWrapper &tmgr)
{
  int *patches = patch_begin;
  int npatches = patch_end - patch_begin;
  int *nodes = node_begin;
  int nnodes = node_end - node_begin;
  double totalRawLoad = 0;
  for ( int i=0; i<npatches; ++i ) {
#ifdef MEM_OPT_VERSION
    double load = patchMap->numAtoms(pid) + emptyPatchLoad;
    patchLoads[pid] = load;
    sortedLoads[i] = load;
    totalRawLoad += load;
  std::sort(sortedLoads,sortedLoads+npatches);
  double maxPatchLoad = 1;
  for ( int i=0; i<npatches; ++i ) {
    double load = sortedLoads[i];
    double total = sumLoad + (npatches-i) * load;
    if ( nnodes * load > total ) break;
    maxPatchLoad = load;
  double totalLoad = 0;
  for ( int i=0; i<npatches; ++i ) {
    if ( patchLoads[pid] > maxPatchLoad ) patchLoads[pid] = maxPatchLoad;
    totalLoad += patchLoads[pid];
  if ( nnodes * maxPatchLoad > totalLoad )
    NAMD_bug("algorithm failure in WorkDistrib recursive_bisect_with_curve()");
  int a_len, b_len, c_len;
  int a_min, b_min, c_min;
  a_min = patchMap->index_a(patches[0]);
  b_min = patchMap->index_b(patches[0]);
  c_min = patchMap->index_c(patches[0]);
  for ( int i=1; i<npatches; ++i ) {
    int a = patchMap->index_a(patches[i]);
    int b = patchMap->index_b(patches[i]);
    int c = patchMap->index_c(patches[i]);
    if ( a < a_min ) a_min = a;
    if ( b < b_min ) b_min = b;
    if ( c < c_min ) c_min = c;
    if ( a > a_max ) a_max = a;
    if ( b > b_max ) b_max = b;
    if ( c > c_max ) c_max = c;
  a_len = a_max - a_min;
  b_len = b_max - b_min;
  c_len = c_max - c_min;
  int *node_split = node_begin;
  if ( a_len >= b_len && a_len >= c_len ) {
  } else if ( b_len >= a_len && b_len >= c_len ) {
  } else if ( c_len >= a_len && c_len >= b_len ) {
  if ( node_split == node_begin ) {
    for ( int i=0; i<nnodes; ++i ) {
      if ( ! CmiPeOnSamePhysicalNode(nodes[i_split],nodes[i]) ) {
        int mid = (nnodes+1)/2;
        if ( abs(i-mid) < abs(i_split-mid) ) i_split = i;
    node_split = node_begin + i_split;
  bool final_patch_sort = false;
  if ( node_split == node_begin ) {
       nnodes == CmiNumPesOnPhysicalNode(CmiPhysicalNodeID(*node_begin)) ) {
      tmgr.coords(*node_begin, crds);
      CkPrintf("WorkDistrib: physnode %5d pe %5d node %5d at %5d %5d %5d from %5d %5d %5d has %5d patches %5d x %5d x %5d load %7f pes %5d\n",
               CmiPhysicalNodeID(*node_begin), *node_begin,
               CkNodeOf(*node_begin), crds[0], crds[1], crds[2],
               a_min, b_min, c_min, npatches,
               a_len+1, b_len+1, c_len+1, totalRawLoad, nnodes);
    final_patch_sort = true;
    for ( int i=0; i<nnodes; ++i ) {
      if ( CmiNodeOf(nodes[i_split]) != CmiNodeOf(nodes[i]) ) {
        int mid = (nnodes+1)/2;
        if ( abs(i-mid) < abs(i_split-mid) ) i_split = i;
    node_split = node_begin + i_split;
  if ( node_split == node_begin ) {
       nnodes == CmiNodeSize(CmiNodeOf(*node_begin)) ) {
      tmgr.coords(*node_begin, crds);
      CkPrintf("WorkDistrib: node %5d pe %5d has %5d patches %5d x %5d x %5d load %7f pes %5d\n",
               CmiNodeOf(*node_begin), *node_begin, npatches,
               a_len+1, b_len+1, c_len+1, totalRawLoad, nnodes);
    node_split = node_begin + nnodes/2;
  if ( nnodes == 1 ) {
    int *node = node_begin;
    for ( int i=0; i < npatches; ++i ) {
      int pid = patches[i];
      assignedNode[pid] = *node;
      sumLoad += patchLoads[pid];
      if ( 0 ) CkPrintf("assign %5d node %5d patch %5d %5d %5d load %7f total %7f\n",
                        patchLoads[pid], sumLoad);
  if ( final_patch_sort ) {
  } else if ( a_len >= b_len && a_len >= c_len ) {
    if ( 0 ) CkPrintf("sort a\n");
  } else if ( b_len >= a_len && b_len >= c_len ) {
    if ( 0 ) CkPrintf("sort b\n");
  } else if ( c_len >= a_len && c_len >= b_len ) {
    if ( 0 ) CkPrintf("sort c\n");
  int *node = node_begin;
  for ( patch_split = patch_begin;
        patch_split != patch_end && node != node_split;
        ++patch_split ) {
    sumLoad += patchLoads[*patch_split];
    double targetLoad = totalLoad *
      ((double)(node-node_begin+1) / (double)nnodes);
    if ( 0 ) CkPrintf("test %5d node %5d patch %5d %5d %5d load %7f target %7f\n",
                      patch_split - patch_begin, *node,
                      patchMap->index_a(*patch_split),
                      patchMap->index_b(*patch_split),
                      patchMap->index_c(*patch_split),
                      sumLoad, targetLoad);
    double extra = ( patch_split+1 != patch_end ? 0.5 * patchLoads[*(patch_split+1)] : 0 );
    if ( node+1 < node_end && sumLoad + extra >= targetLoad ) { ++node; }
  }
  double targetLoad = totalLoad *
    ((double)(node_split-node_begin) / (double)nnodes);
  if ( 0 ) CkPrintf("split node %5d/%5d patch %5d/%5d load %7f target %7f\n",
                    node_split-node_begin, nnodes,
                    patch_split-patch_begin, npatches,
                    sumLoad, targetLoad);
  recursive_bisect_with_curve(
      patch_begin, patch_split, node_begin, node_split,
      patchLoads, sortedLoads, assignedNode, tmgr);
  recursive_bisect_with_curve(
      patch_split, patch_end, node_split, node_end,
      patchLoads, sortedLoads, assignedNode, tmgr);
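// Note: taken together, recursive_bisect_with_curve() sorts the patches
// along a space-filling curve, splits the pe range at a physical-node
// or process boundary near its midpoint, walks the curve accumulating
// load until the proportional target for the first half is met, and
// recurses on both halves; the recursion bottoms out at nnodes == 1,
// where all remaining patches are assigned to that pe.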
void WorkDistrib::assignPatchesSpaceFillingCurve()
{
    NAMD_die("simulateInitialMapping not supported by assignPatchesSpaceFillingCurve()");
    patchOrdering[i] = i;
  nodeOrdering.resize(0);
  for ( int i=0; i<numNodes; ++i ) {
    if ( pe == 0 ) continue;
    if ( pe == 1 ) continue;
#ifdef MEM_OPT_VERSION
    nodeOrdering.add(pe);
    if ( 0 ) CkPrintf("using pe %5d\n", pe);
  int *node_begin = nodeOrdering.begin();
  int *node_end = nodeOrdering.end();
  std::sort(node_begin, node_end, pe_sortop_compact());
  int *basenode_begin = node_begin;
  int *basenode_end = node_end;
    basenode_begin = node_end;
    std::sort(basenode_begin, basenode_end, pe_sortop_compact());
    iout << iWARN << "IGNORING TORUS TOPOLOGY DURING PATCH PLACEMENT\n" << endi;
  recursive_bisect_with_curve(
      patchOrdering.begin(), patchOrdering.end(),
      node_begin, node_end,
      patchLoads.begin(), sortedLoads.begin(), assignedNode, tmgr);
  std::sort(node_begin, node_end, pe_sortop_compact());
  int samenodecount = 0;
    int node = assignedNode[pid];
    int nodeidx = std::lower_bound(node_begin, node_end, node,
                                   pe_sortop_compact()) - node_begin;
    int basenode = basenode_begin[nodeidx];
    if ( CmiPeOnSamePhysicalNode(node,basenode) ) ++samenodecount;
  iout << iINFO << "Placed " << (samenodecount*100./numPatches)
       << "% of base nodes on same physical node as patch\n" << endi;
  delete [] assignedNode;
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  DebugM(3,"Mapping computes\n");
    mapComputeHomePatches(computeDPMTAType);
    NAMD_die("This binary does not include DPMTA (FMA).");
    mapComputeHomePatches(computeDPMEType);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  DebugM(2,"adding ComputeGlobal\n");
#ifdef CHARM_HAS_MSA
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    mapComputeNode(computeBondedCUDAType);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  mapComputeNonbonded();
void WorkDistrib::mapComputeHomeTuples(ComputeType type)
{
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  char *isBaseNode = new char[numNodes];
  memset(isBaseNode,0,numNodes*sizeof(char));
    isBaseNode[patchMap->basenode(j)] = 1;
  for ( int i=0; i<numNodes; i++ ) {
    if ( isBaseNode[i] ) {
  delete [] isBaseNode;
void WorkDistrib::mapComputeHomePatches(ComputeType type)
{
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  for ( int i=0; i<numNodes; i++ ) {

void WorkDistrib::mapComputePatch(ComputeType type)
{
    computeMap->newPid(cid,i);

void WorkDistrib::mapComputeNode(ComputeType type)
{
  int ncpus = CkNumPes();
  for ( int i=0; i<ncpus; i++ ) {
void WorkDistrib::mapComputeNonbonded(void)
{
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  int ncpus = CkNumPes();
  int nodesize = CkMyNodeSize();
  double partScaling = 1.0;
    partScaling = ((double)ncpus) / ((double)patchMap->numPatches());
    int numPartitions = 1;
#ifdef MEM_OPT_VERSION
      int64 numFixed = patchMap->numFixedAtoms(i);
      int64 numAtoms = patchMap->numAtoms(i);
      numPartitions = (int) ( partScaling * ( 0.5 +
        (numAtoms*numAtoms-numFixed*numFixed) / (double)(2*divide*divide) ) );
    if (numPartitions < 1) numPartitions = 1;
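// Note: the partition count for a ComputeNonbondedSelf is proportional
// to the number of interacting atom pairs in the patch, i.e.
// (numAtoms^2 - numFixed^2)/2 scaled by 1/divide^2 and by partScaling
// when pes outnumber patches; fixed-fixed pairs are excluded because
// they generate no force work.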
    DebugM(4,"Mapping " << numPartitions << " ComputeNonbondedSelf objects for patch " << i << "\n");
      computeMap->newPid(cid,i);
  for ( int p1=0; p1 <patchMap->numPatches(); p1++ )
    for ( j=0; j<numNeighbors; j++ )
      int p2 = oneAway[j];
      int dsp = oneAwayDownstream[j];
      int numPartitions = 1;
#ifdef MEM_OPT_VERSION
      int64 numAtoms1 = patchMap->numAtoms(p1);
      int64 numAtoms2 = patchMap->numAtoms(p2);
      int64 numFixed1 = patchMap->numFixedAtoms(p1);
      int64 numFixed2 = patchMap->numFixedAtoms(p2);
      const int t2 = oneAwayTrans[j];
      const int ia1 = patchMap->index_a(p1);
      const int ib1 = patchMap->index_b(p1);
      const int ic1 = patchMap->index_c(p1);
      if ( abs(ia2-ia1) > nax ||
           abs(ib2-ib1) > nay ||
           abs(ic2-ic1) > naz )
        NAMD_bug("Bad patch distance in WorkDistrib::mapComputeNonbonded");
      if ( ia1 == ia2 ) --distance;
      else if ( ia1 == ia2 + nax - 1 ) --distance;
      else if ( ia1 + nax - 1 == ia2 ) --distance;
      if ( ib1 == ib2 ) --distance;
      else if ( ib1 == ib2 + nay - 1 ) --distance;
      else if ( ib1 + nay - 1 == ib2 ) --distance;
      if ( ic1 == ic2 ) --distance;
      else if ( ic1 == ic2 + naz - 1 ) --distance;
      else if ( ic1 + naz - 1 == ic2 ) --distance;
      if ( distance == 0 ) {
      } else if (distance == 1) {
        numPartitions = (int) ( partScaling * ( 0.5 +
          (numAtoms1*numAtoms2-numFixed1*numFixed2)/(double)(divide*divide) ) );
      if ( numPartitions < 1 ) numPartitions = 1;
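// Note: pair computes are partitioned analogously using the cross pair
// count numAtoms1*numAtoms2 - numFixed1*numFixed2; the decrements above
// classify the pair's adjacency per dimension (the nax/nay/naz branches
// appear to handle periodic wrap-around), and the formula shown applies
// to the distance == 1 case.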
      for ( int partition=0; partition < numPartitions; partition++ )
        computeMap->newPid(cid,p1);
        computeMap->newPid(cid,p2,oneAwayTrans[j]);
        patchMap->newCid(p1,cid);
        patchMap->newCid(p2,cid);
void WorkDistrib::mapComputeLCPO(void) {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  int ncpus = CkNumPes();
  int nodesize = CkMyNodeSize();
  const int maxPatches = 8;
  int numPatchesInOctet;
  PatchID patchesInOctet[maxPatches];
  int oneAwayTrans[maxPatches];
  int numPartitions = 1;
  for ( int partition=0; partition < numPartitions; partition++ ) {
    for ( int p = 0; p < numPatchesInOctet; p++ ) {
      computeMap->newPid(cid, patchesInOctet[p], oneAwayTrans[p]);
    for ( int p = 0; p < numPatchesInOctet; p++ ) {
      patchMap->newCid(patchesInOctet[p],cid);
    NAMD_bug("compute->sequence() < 0 in WorkDistrib::messageEnqueueWork");
  int type = compute->type();
  int cid = compute->cid;
  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
    wdProxy[CkMyPe()].enqueueExcls(msg);
    wdProxy[CkMyPe()].enqueueBonds(msg);
    wdProxy[CkMyPe()].enqueueAngles(msg);
    wdProxy[CkMyPe()].enqueueDihedrals(msg);
    wdProxy[CkMyPe()].enqueueImpropers(msg);
    wdProxy[CkMyPe()].enqueueThole(msg);
    wdProxy[CkMyPe()].enqueueAniso(msg);
    wdProxy[CkMyPe()].enqueueCrossterms(msg);
    wdProxy[CkMyPe()].enqueueGromacsPair(msg);
  case computeLCPOType:
    wdProxy[CkMyPe()].enqueueLCPO(msg);
    switch ( seq % 2 ) {
      switch ( gbisPhase ) {
        wdProxy[CkMyPe()].enqueueSelfA1(msg);
        wdProxy[CkMyPe()].enqueueSelfA2(msg);
        wdProxy[CkMyPe()].enqueueSelfA3(msg);
      switch ( gbisPhase ) {
        wdProxy[CkMyPe()].enqueueSelfB1(msg);
        wdProxy[CkMyPe()].enqueueSelfB2(msg);
        wdProxy[CkMyPe()].enqueueSelfB3(msg);
      NAMD_bug("WorkDistrib::messageEnqueueSelf case statement error!");
    switch ( seq % 2 ) {
      switch ( gbisPhase ) {
        wdProxy[CkMyPe()].enqueueWorkA1(msg);
        wdProxy[CkMyPe()].enqueueWorkA2(msg);
        wdProxy[CkMyPe()].enqueueWorkA3(msg);
      switch ( gbisPhase ) {
        wdProxy[CkMyPe()].enqueueWorkB1(msg);
        wdProxy[CkMyPe()].enqueueWorkB2(msg);
        wdProxy[CkMyPe()].enqueueWorkB3(msg);
      wdProxy[CkMyPe()].enqueueWorkC(msg);
      NAMD_bug("WorkDistrib::messageEnqueueWork case statement error!");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    switch ( gbisPhase ) {
      wdProxy[CkMyPe()].enqueueCUDA(msg);
      wdProxy[CkMyPe()].enqueueCUDAP2(msg);
      wdProxy[CkMyPe()].enqueueCUDAP3(msg);
    wdProxy[CkMyPe()].enqueueMIC(msg);
    wdProxy[CkMyPe()].enqueuePme(msg);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    wdProxy[CkMyPe()].enqueuePme(msg);
    wdProxy[CkMyPe()].enqueueWork(msg);
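// Note: rather than invoking doWork() directly, each compute's
// LocalWorkMsg is resent through the local WorkDistrib branch,
// wdProxy[CkMyPe()], so the work re-enters the Charm++ scheduler as a
// prioritized message; the per-type entry methods apparently exist so
// each compute class gets its own entry point (useful for tracing and
// priority assignment).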
    NAMD_bug("compute->sequence() < 0 in WorkDistrib::messageEnqueueWork");
  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  switch ( gbisPhase ) {
    wdProxy[CkMyPe()].finishCUDA(msg);
    wdProxy[CkMyPe()].finishCUDAP2(msg);
    wdProxy[CkMyPe()].finishCUDAP3(msg);

    NAMD_bug("compute->sequence() < 0 in WorkDistrib::messageFinishMIC");
  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
  wdProxy[CkMyPe()].finishMIC(msg);
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("\nWorkDistrib LocalWorkMsg recycling failed! Check enqueueGromacsPair from WorkDistrib.C\n");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
void WorkDistrib::velocities_from_PDB(const char *filename,
                                      Vector *v, int totalAtoms)
{
  v_pdb = new PDB(filename);
  if ( v_pdb == NULL )
    NAMD_die("memory allocation failed in Node::velocities_from_PDB");
    sprintf(err_msg, "FOUND %d COORDINATES IN VELOCITY PDB!!",
  for (i=0; i<totalAtoms; i++)
void WorkDistrib::velocities_from_binfile(const char *fname, Vector *vels, int n)

void WorkDistrib::random_velocities(BigReal Temp, Molecule *structure,
                                    Vector *v, int totalAtoms)
  for (i=0; i<totalAtoms; i++)
    if (structure->atommass(i) <= 0.) {
      kbToverM = sqrt(kbT *
        ( lesOn && structure->get_fep_type(i) ? tempFactor : 1.0 ) /
    for (randnum=0.0, j=0; j<12; j++)
    {
      randnum += vel_random.uniform();
    }
    randnum -= 6.0;
    v[i].x = randnum*kbToverM;
    for (randnum=0.0, j=0; j<12; j++)
    {
      randnum += vel_random.uniform();
    }
    randnum -= 6.0;
    v[i].y = randnum*kbToverM;
    for (randnum=0.0, j=0; j<12; j++)
    {
      randnum += vel_random.uniform();
    }
    randnum -= 6.0;
    v[i].z = randnum*kbToverM;
  if ( simParams->drudeOn )
    for (i=0; i<totalAtoms; i++) {
void WorkDistrib::remove_com_motion(Vector *vel, Molecule *structure, int n)
{
    mv += mass * vel[i];
  iout << iINFO << "REMOVING COM VELOCITY "
  for (i=0; i<n; i++) { vel[i] -= mv; }
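// Note: mv accumulates the total momentum sum over m_i * v_i; after the
// elided division by the total mass it is the center-of-mass velocity,
// and subtracting it from every atom leaves zero net momentum while
// preserving all relative motion.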
int WorkDistrib::assignPatchesTopoGridRecBisection() {
  int *assignedNode = new int[patchMap->numPatches()];
  int usedNodes = numNodes;
  CkPrintf("assignPatchesTopoGridRecBisection\n");
  int xsize = 0, ysize = 0, zsize = 0;
  xsize = tmgr.getDimNX();
  ysize = tmgr.getDimNY();
  zsize = tmgr.getDimNZ();
  int rc = recBisec.partitionProcGrid(xsize, ysize, zsize, assignedNode);
  delete [] assignedNode;
#if defined(NAMD_MIC)
extern void mic_hostDeviceLDB();
extern void mic_contributeHostDeviceLDB(int idLen, int *id);
extern void mic_setDeviceLDBParams(int dt, int hs, int sp1, int pp1, int pp2);

#if defined(NAMD_MIC)
  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
  wdProxy.initHostDeviceLDB();

#if defined(NAMD_MIC)
  mic_hostDeviceLDB();

#if defined(NAMD_MIC)
  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
  wdProxy[0].contributeHostDeviceLDB(peSetLen, peSet);

#if defined(NAMD_MIC)
  mic_contributeHostDeviceLDB(peSetLen, peSet);

#if defined(NAMD_MIC)
  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
  wdProxy.setDeviceLDBParams(dt, hs, sp1, pp1, pp2);

#if defined(NAMD_MIC)
  mic_setDeviceLDBParams(dt, hs, sp1, pp1, pp2);

#include "WorkDistrib.def.h"