26 #include "WorkDistrib.decl.h"
30 #include "main.decl.h"
49 #include "TopoManager.h"
55 #define __thread __declspec(thread)
61 #define MIN_DEBUG_LEVEL 2
63 #ifdef MEM_OPT_VERSION
91 randtopo = CmiGetArgFlag(argv,
"+randtopo");
92 if ( CkMyPe() >= CkNumPes() )
return;
93 CcdCallOnCondition(CcdTOPOLOGY_AVAIL, (CcdVoidFn)
build_ordering, (
void*)0);
103 CkpvAccess(BOCclass_group).workDistrib = thisgroup;
104 patchMapArrived =
false;
105 computeMapArrived =
false;
108 #define MACHINE_PROGRESS
110 #define MACHINE_PROGRESS { traceUserEvent(eventMachineProgress); CmiMachineProgressImpl(); }
111 if ( CkMyNodeSize() > 1 )
NAMD_bug(
"CkMyNodeSize() > 1 for non-smp build");
123 if ( d )
while ( ! (d & c) ) {
126 return (a & c) - (b & c);
132 if ( d )
while ( ! (d & c) ) {
143 if ( c < 0 )
return true;
144 if ( c > 0 )
return false;
147 if ( c < 0 )
return true;
148 if ( c > 0 )
return false;
182 const int numPhys = CmiNumPhysicalNodes();
183 const int numNode = CmiNumNodes();
184 const int numPe = CmiNumPes();
194 for (
int ph=0; ph<numPhys; ++ph ) {
196 CmiGetPesOnPhysicalNode(ph, &pes, &npes);
197 for (
int i=0; i<npes; ++i, ++k ) {
200 numNodeInPhys[ph] = 0;
201 for (
int i=0, j=0; i<npes; i += CmiNodeSize(CmiNodeOf(pes[i])), ++j ) {
202 rankInPhysOfNode[CmiNodeOf(pes[i])] = j;
203 numNodeInPhys[ph] += 1;
208 if ( ! CkMyNode() ) {
209 iout <<
iWARN <<
"RANDOMIZING PHYSICAL NODE ORDERING\n" <<
endi;
212 for (
int j=0; j<numPhys; ++j ) {
213 randPhysOrder[j] = j;
216 for (
int j=0, k=0; j<numPhys; ++j ) {
217 const int ph = randPhysOrder[j];
219 CmiGetPesOnPhysicalNode(ph, &pes, &npes);
220 for (
int i=0; i<npes; ++i, ++k ) {
226 for (
int i=0; i<numPe; ++i ) {
232 for (
int i=0; i<numPe; ++i ) {
237 if ( 0 && CmiMyNode() == 0 )
for (
int i=0; i<numPe; ++i ) {
238 CkPrintf(
"order %5d %5d %5d %5d %5d\n", i,
269 int x_begin,
int x_end,
int y_begin,
int y_end,
271 int *result,
int ydim
273 int x_len = x_end - x_begin;
274 int y_len = y_end - y_begin;
275 if ( x_len == 1 && y_len == 1 ) {
277 if ( 0 ) CkPrintf(
"pme %5d %5d on pe %5d at %f %f\n", x_begin, y_begin, *pe_begin,
278 coord[*pe_begin].
x, coord[*pe_begin].
y);
279 result[x_begin*ydim + y_begin] = *pe_begin;
282 int *pe_end = pe_begin + x_len * y_len;
283 if ( x_len >= y_len ) {
285 int x_split = x_begin + x_len / 2;
286 int* pe_split = pe_begin + (x_split - x_begin) * y_len;
292 int y_split = y_begin + y_len / 2;
293 int* pe_split = pe_begin + (y_split - y_begin) * x_len;
301 int numpes = CkNumPes();
305 for (
int i=0; i<numpes; ++i ) {
311 for (
int i=0, npatches=patchMap->
numPatches(); i<npatches; ++i ) {
312 int pe = patchMap->
node(i);
314 sumPos[pe] += patchMap->
center(i);
316 const int npmepes = xdim*ydim;
318 for (
int i=0; i<npmepes; ++i ) {
319 int pe = sortpes[i] = pmepes[i];
324 int node = CkNodeOf(pe);
325 int nsize = CkNodeSize(node);
326 int pe2 = CkNodeFirst(node);
327 for (
int j=0; j<nsize; ++j, ++pe2 ) {
334 int node = CmiPhysicalNodeID(pe);
336 CmiGetPesOnPhysicalNode(node, &nlist, &nsize);
337 for (
int j=0; j<nsize; ++j ) {
344 avgPos[pe] = sum / cnt;
354 saveComputeMapReturnEP = ep;
355 saveComputeMapReturnChareID = chareID;
358 CProxy_WorkDistrib(thisgroup).recvComputeMapChanges(mapMsg);
383 for (i=0; i<nc; i++) {
384 int data = computeMap->
newNode(i);
388 for (i=0; i<nc; i++) {
396 }
else if ( ! CkMyRank() ) {
400 if ( i != nc )
NAMD_bug(
"WorkDistrib::recvComputeMapChanges check 1 failed\n");
401 for (i=0; i<nc; i++) {
407 if ( i != nc )
NAMD_bug(
"WorkDistrib::recvComputeMapChanges check 2 failed\n");
408 for (i=0; i<nc; i++) {
414 if ( i != nc )
NAMD_bug(
"WorkDistrib::recvComputeMapChanges check 3 failed\n");
419 CkCallback cb(CkIndex_WorkDistrib::doneSaveComputeMap(NULL), 0, thisgroup);
420 contribute(0, NULL, CkReduction::random, cb);
426 CkSendMsgBranch(saveComputeMapReturnEP, CkAllocMsg(0,0,0), 0, saveComputeMapReturnChareID);
429 #ifdef MEM_OPT_VERSION
434 void WorkDistrib::fillAtomListForOnePatch(
int pid,
FullAtomList &alist){
438 0.5*(patchMap->
min_b(pid)+patchMap->
max_b(pid)),
439 0.5*(patchMap->
min_c(pid)+patchMap->
max_c(pid)));
441 int n = alist.
size();
457 for(
int j=0; j < n; j++)
464 if ( a[j].migrationGroupSize ) {
465 if ( a[j].migrationGroupSize != a[j].hydrogenGroupSize ) {
470 for (
int k=a[j].hydrogenGroupSize; k<mgs;
478 pos = lattice.
nearest(pos,center,&mother_transform);
482 a[j].
position = lattice.
nearest(a[j].position, center, &(a[j].transform));
511 }
else if ((a[j].status &
DrudeAtom)!=0) {
520 for(
int j=0; j < n; j+=size) {
523 NAMD_bug(
"Mother atom with hydrogenGroupSize of 0!");
526 for (
int k = 0; k < size; ++k ) {
527 allfixed = ( allfixed && (a[j+k].
atomFixed) );
529 for (
int k = 0; k < size; ++k ) {
536 int numAtomsInPatch = n;
537 int numFixedAtomsInPatch = 0;
538 int numAtomsInFixedGroupsInPatch = 0;
539 for(
int j=0; j < n; j++) {
540 numFixedAtomsInPatch += ( a[j].
atomFixed ? 1 : 0 );
541 numAtomsInFixedGroupsInPatch += ( a[j].
groupFixed ? 1 : 0 );
543 iout <<
"PATCH_DETAILS:"
544 <<
" on proc " << CkMyPe()
545 <<
" patch " << patchId
546 <<
" atoms " << numAtomsInPatch
547 <<
" fixed_atoms " << numFixedAtomsInPatch
548 <<
" fixed_groups " << numAtomsInFixedGroupsInPatch
568 int totalAtoms = inAtoms.
size();
569 for(i=0;i<totalAtoms;i++)
571 Real atomMs=inAtoms[i].mass;
583 kbToverM = sqrt(kbT * 1.0 / atomMs);
585 for (randnum=0.0, j=0; j<12; j++)
587 randnum += vel_random.uniform();
592 inAtoms[i].velocity.x = randnum*kbToverM;
594 for (randnum=0.0, j=0; j<12; j++)
596 randnum += vel_random.uniform();
601 inAtoms[i].velocity.y = randnum*kbToverM;
603 for (randnum=0.0, j=0; j<12; j++)
605 randnum += vel_random.uniform();
610 inAtoms[i].velocity.z = randnum*kbToverM;
622 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
623 Node *node = nd.ckLocalBranch();
625 CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
626 PatchMgr *patchMgr = pm.ckLocalBranch();
639 read_binary_file((std::string(basename)+
".coor").c_str(), positions, numAtoms);
640 read_binary_file((std::string(basename)+
".vel").c_str(), velocities, numAtoms);
642 PDB coorpdb((std::string(basename)+
".coor").c_str());
644 NAMD_die(
"Incorrect atom count in coordinate pdb file");
647 velocities_from_PDB((std::string(basename)+
".vel").c_str(), velocities, numAtoms);
658 if (current == NULL) {
664 velocities_from_PDB(current->
data, velocities, numAtoms);
667 velocities_from_binfile(current->
data, velocities, numAtoms);
672 random_velocities(params->
initialTemp, molecule, velocities, numAtoms);
678 remove_com_motion(velocities, molecule, numAtoms);
687 for ( i=0; i < numAtoms; i++ ) {
689 if ( ! h.
isMP )
continue;
697 for ( i=0; i < sortAtoms.
size(); i++ ) {
702 sortAtoms.
size(),numAtoms,
708 for (
int pid = 0; pid <
numPatches; ++pid ) {
709 int iend = breaks[pid];
710 for ( ; i<iend; ++i ) {
719 for (
int k=0; k<mgs; ++k ) {
739 CkPrintf(
"patch %d (%d %d %d) has %d atoms\n",
749 for(i=0; i < numAtoms; i++)
778 delete [] velocities;
786 int n = atoms[i].
size();
808 if ( a[j].migrationGroupSize ) {
809 if ( a[j].migrationGroupSize != a[j].hydrogenGroupSize ) {
813 for (
int k=a[j].hydrogenGroupSize; k<mgs;
820 pos = lattice.
nearest(pos,center,&mother_transform);
825 a[j].position, center, &(a[j].transform));
839 if ( alchOn || lesOn || pairInteractionOn || pressureProfileTypes) {
849 int size, allfixed, k;
850 for(j=0; j < n; j+=size) {
853 NAMD_bug(
"Mother atom with hydrogenGroupSize of 0!");
856 for ( k = 0; k < size; ++k ) {
857 allfixed = ( allfixed && (a[j+k].
atomFixed) );
859 for ( k = 0; k < size; ++k ) {
866 int numAtomsInPatch = n;
867 int numFixedAtomsInPatch = 0;
868 int numAtomsInFixedGroupsInPatch = 0;
869 for(j=0; j < n; j++) {
870 numFixedAtomsInPatch += ( a[j].
atomFixed ? 1 : 0 );
871 numAtomsInFixedGroupsInPatch += ( a[j].
groupFixed ? 1 : 0 );
873 iout <<
"PATCH_DETAILS:"
874 <<
" patch " << patchId
875 <<
" atoms " << numAtomsInPatch
876 <<
" fixed_atoms " << numFixedAtomsInPatch
877 <<
" fixed_groups " << numAtomsInFixedGroupsInPatch
893 CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
894 PatchMgr *patchMgr = pm.ckLocalBranch();
900 #ifdef MEM_OPT_VERSION
911 int numAtoms = atoms[i].
size();
912 if ( numAtoms > maxAtoms ) { maxAtoms = numAtoms; maxPatch = i; }
914 iout <<
iINFO <<
"LARGEST PATCH (" << maxPatch <<
915 ") HAS " << maxAtoms <<
" ATOMS\n" <<
endi;
921 DebugM(3,
"Created " << i <<
" patches so far.\n");
932 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
933 Node *node = nd.ckLocalBranch();
934 CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
935 PatchMgr *patchMgr = pm.ckLocalBranch();
942 if (patchMap->
node(i) != node->
myid() )
944 DebugM(3,
"patchMgr->movePatch("
945 << i <<
"," << patchMap->
node(i) <<
")\n");
955 CProxy_PatchMgr pm(CkpvAccess(BOCclass_group).patchMgr);
956 PatchMgr *patchMgr = pm.ckLocalBranch();
980 if ( CkNumPes() == 1 ) {
981 patchMapArrived =
true;
986 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
987 Node *node = nd.ckLocalBranch();
990 #ifdef NODEAWARE_PROXY_SPANNINGTREE
991 || CkNumPes() > CkNumNodes()
992 ) && ( CkNumNodes() > 1
997 #ifdef NODEAWARE_PROXY_SPANNINGTREE
998 if ( CkNumPes() > CkNumNodes() && CkNumNodes() > 1
1009 CProxy_WorkDistrib workProxy(thisgroup);
1010 workProxy[0].savePatchMap(mapMsg);
1022 if ( CkMyRank() ) patchMapArrived =
true;
1024 if ( patchMapArrived && CkMyPe() ) {
1028 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
1029 Node *node = nd.ckLocalBranch();
1032 #ifdef NODEAWARE_PROXY_SPANNINGTREE
1033 || CkNumPes() > CkNumNodes()
1034 ) && ( CkNumNodes() > 1
1039 #ifdef NODEAWARE_PROXY_SPANNINGTREE
1040 if ( CkNumPes() > CkNumNodes() && CkNumNodes() > 1
1046 if ( patchMapArrived ) {
1047 if ( CkMyRank() + 1 < CkNodeSize(CkMyNode()) ) {
1048 ((CProxy_WorkDistrib(thisgroup))[CkMyPe()+1]).
savePatchMap(msg);
1055 patchMapArrived =
true;
1057 int self = CkMyNode();
1058 int range_begin = 0;
1059 int range_end = CkNumNodes();
1060 while (
self != range_begin ) {
1062 int split = range_begin + ( range_end - range_begin ) / 2;
1063 if (
self < split ) { range_end =
split; }
1064 else { range_begin =
split; }
1066 int send_near =
self + 1;
1067 int send_far = send_near + ( range_end - send_near ) / 2;
1071 if ( send_far < range_end ) pids[npid++] = CkNodeFirst(send_far);
1072 if ( send_near < send_far ) pids[npid++] = CkNodeFirst(send_near);
1073 pids[npid++] = CkMyPe();
1074 CProxy_WorkDistrib(thisgroup).savePatchMap(msg,npid,pids);
1080 if ( CkMyRank() )
return;
1082 if ( CkNumNodes() == 1 ) {
1083 computeMapArrived =
true;
1093 }
else if ( ! CkMyRank() ) {
1099 computeMapArrived =
true;
1108 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
1109 Node *node = nd.ckLocalBranch();
1115 #ifndef MEM_OPT_VERSION
1123 double maxNumPatches = 1.e9;
1127 DebugM(3,
"Mapping patches\n");
1128 if ( lattice.
a_p() && lattice.
b_p() && lattice.
c_p() ) {
1129 xmin = 0.; xmax = 0.;
1141 printf(
"+++ center=%.4f %.4f %.4f\n",
1143 printf(
"+++ xmin=%.4f xmax=%.4f\n", xmin.
x, xmax.
x);
1144 printf(
"+++ ymin=%.4f ymax=%.4f\n", xmin.
y, xmax.
y);
1145 printf(
"+++ zmin=%.4f zmax=%.4f\n", xmin.
z, xmax.
z);
1157 iout <<
iINFO <<
"ORIGINAL ATOMS MINMAX IS " << xmin <<
" " << xmax <<
"\n" <<
endi;
1158 double frac = ( (double)totalAtoms - 10000. ) / (double)totalAtoms;
1159 if ( frac < 0.9 ) { frac = 0.9; }
1162 iout <<
iINFO <<
"ADJUSTED ATOMS MINMAX IS " << xmin <<
" " << xmax <<
"\n" <<
endi;
1167 origin_shift = lattice.
a_r() * lattice.
origin();
1168 xmin.
x -= origin_shift;
1169 xmax.
x -= origin_shift;
1170 origin_shift = lattice.
b_r() * lattice.
origin();
1171 xmin.
y -= origin_shift;
1172 xmax.
y -= origin_shift;
1173 origin_shift = lattice.
c_r() * lattice.
origin();
1174 xmin.
z -= origin_shift;
1175 xmax.
z -= origin_shift;
1184 if (params->
LCPOOn && patchSize < 32.4) {
1185 if ( twoAwayX > 0 || twoAwayY > 0 || twoAwayZ > 0 ) {
1186 iout <<
iWARN <<
"Ignoring twoAway[XYZ] due to LCPO SASA implementation.\n" <<
endi;
1188 twoAwayX = twoAwayY = twoAwayZ = 0;
1192 if ( twoAwayX > 0 ) maxNumPatches = 1.e9;
1193 if ( twoAwayY > 0 ) maxNumPatches = 1.e9;
1194 if ( twoAwayZ > 0 ) maxNumPatches = 1.e9;
1197 iout <<
iINFO <<
"LIMITING NUMBER OF PATCHES TO " <<
1198 maxNumPatches <<
"\n" <<
endi;
1201 int numpes = CkNumPes();
1205 delete [] patchMap->nPatchesOnNode;
1206 patchMap->nPatchesOnNode =
new int[numpes];
1207 memset(patchMap->nPatchesOnNode, 0, numpes*
sizeof(
int));
1210 #if defined(NAMD_CUDA) || defined(NAMD_MIC)
1215 twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
1216 if ( numPatches < numpes && twoAwayX < 0 ) {
1220 twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
1222 if ( numPatches < numpes && twoAwayY < 0 ) {
1226 twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
1228 if ( numPatches < numpes && twoAwayZ < 0 ) {
1232 twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
1234 if ( numPatches < numpes ) {
1235 #if defined(NAMD_MIC)
1236 NAMD_die(
"MIC-enabled NAMD requires at least one patch per thread.");
1239 if ( numPatches % numpes && numPatches <= 1.4 * numpes ) {
1240 int exactFit = numPatches - numPatches % numpes;
1241 int newNumPatches = patchMap->
sizeGrid(
1243 twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
1244 if ( newNumPatches == exactFit ) {
1245 iout <<
iINFO <<
"REDUCING NUMBER OF PATCHES TO IMPROVE LOAD BALANCE\n" <<
endi;
1246 maxNumPatches = exactFit;
1250 patchMap->
makePatches(xmin,xmax,lattice,patchSize,maxNumPatches,
1252 twoAwayX>0 ? 2 : 1, twoAwayY>0 ? 2 : 1, twoAwayZ>0 ? 2 : 1);
1256 int availPes = numpes;
1262 #ifdef MEM_OPT_VERSION
1275 int numPatches = patchMap->
sizeGrid(
1277 twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
1278 if ( ( numPatches > (0.3*availPes) || numPatches > maxNumPatches
1279 ) && twoAwayZ < 0 ) {
1283 twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
1285 if ( ( numPatches > (0.6*availPes) || numPatches > maxNumPatches
1286 ) && twoAwayY < 0 ) {
1290 twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
1292 if ( ( numPatches > availPes || numPatches > maxNumPatches
1293 ) && twoAwayX < 0 ) {
1297 twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
1299 if ( numPatches > availPes && numPatches <= (1.4*availPes) && availPes <= maxNumPatches ) {
1300 int newNumPatches = patchMap->
sizeGrid(
1302 twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
1303 if ( newNumPatches <= availPes && numPatches <= (1.4*newNumPatches) ) {
1304 iout <<
iINFO <<
"REDUCING NUMBER OF PATCHES TO IMPROVE LOAD BALANCE\n" <<
endi;
1305 maxNumPatches = availPes;
1309 patchMap->
makePatches(xmin,xmax,lattice,patchSize,maxNumPatches,
1311 twoAwayX ? 2 : 1, twoAwayY ? 2 : 1, twoAwayZ ? 2 : 1);
1328 #if (CMK_BLUEGENEP | CMK_BLUEGENEL) && USE_TOPOMAP
1330 int numPes = tmgr.getDimNX() * tmgr.getDimNY() * tmgr.getDimNZ();
1331 if (numPes > patchMap->
numPatches() && (assignPatchesTopoGridRecBisection() > 0)) {
1332 CkPrintf (
"Blue Gene/L topology partitioner finished successfully \n");
1336 assignPatchesSpaceFillingCurve();
1338 int *nAtoms =
new int[nNodes];
1341 for(i=0; i < nNodes; i++)
1351 #ifdef MEM_OPT_VERSION
1352 numAtoms += patchMap->numAtoms(i);
1353 nAtoms[patchMap->
node(i)] += patchMap->numAtoms(i);
1355 if (patchMap->
patch(i)) {
1362 if ( numAtoms !=
Node::Object()->molecule->numAtoms ) {
1363 for(i=0; i < nNodes; i++)
1364 iout <<
iINFO << nAtoms[i] <<
" atoms assigned to node " << i <<
"\n" <<
endi;
1366 NAMD_die(
"Incorrect atom count in WorkDistrib::assignNodeToPatch\n");
1408 void WorkDistrib::assignPatchesToLowestLoadNode()
1411 int assignedNode = 0;
1413 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
1414 Node *node = nd.ckLocalBranch();
1421 int *
load =
new int[ncpus];
1422 int *assignedNodes =
new int[patchMap->
numPatches()];
1423 for (
int i=0; i<ncpus; i++) {
1426 CkPrintf(
"assignPatchesToLowestLoadNode\n");
1427 int defaultNode = 0;
1434 for(pid=0; pid < patchMap->
numPatches(); pid++) {
1435 assignedNode = defaultNode;
1436 for (
int i=assignedNode + 1; i < ncpus; i++) {
1437 if (load[i] < load[assignedNode]) assignedNode = i;
1439 assignedNodes[pid] = assignedNode;
1440 #ifdef MEM_OPT_VERSION
1441 load[assignedNode] += patchMap->numAtoms(pid) + 1;
1448 sortNodesAndAssign(assignedNodes);
1449 delete[] assignedNodes;
1453 void WorkDistrib::assignPatchesBitReversal()
1457 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
1458 Node *node = nd.ckLocalBranch();
1466 if ( ncpus <= npatches )
1467 NAMD_bug(
"WorkDistrib::assignPatchesBitReversal called improperly");
1471 for (
int i = 1; i < ncpus; ++i ) {
1476 sortNodesAndAssign(seq.begin());
1477 if ( ncpus > 2*npatches ) sortNodesAndAssign(seq.begin()+npatches, 1);
1489 float a1 = ((float)
a_total)/((float)npatches);
1491 float b1 = ((float)
b_total)/((float)npatches);
1493 float c1 = ((float)
c_total)/((float)npatches);
1495 return ((a1 == a2) && (b1 == b2) && (c1 == c2));
1498 float a1 = ((float)
a_total)/((float)npatches);
1500 float b1 = ((float)
b_total)/((float)npatches);
1502 float c1 = ((float)
c_total)/((float)npatches);
1504 return ( (a1 < a2) || ((a1 == a2) && (b1 < b2)) ||
1505 ((a1 == a2) && (b1 == b2) && (c1 < c2)) );
1509 void WorkDistrib::sortNodesAndAssign(
int *assignedNode,
int baseNodes) {
1515 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
1516 Node *node = nd.ckLocalBranch();
1524 for ( i=0; i < nnodes; ++i ) {
1525 allnodes[i].node = i;
1527 for ( pid=0; pid<npatches; ++pid ) {
1529 allnodes[assignedNode[pid]].npatches++;
1530 allnodes[assignedNode[pid]].a_total += patchMap->
index_a(pid);
1531 allnodes[assignedNode[pid]].b_total += patchMap->
index_b(pid);
1532 allnodes[assignedNode[pid]].c_total += patchMap->
index_c(pid);
1535 usednodes.resize(0);
1536 for ( i=0; i < nnodes; ++i ) {
1537 if ( allnodes[i].npatches ) usednodes.add(allnodes[i]);
1541 for ( i=0; i < nnodes; ++i ) {
1543 if ( allnodes[pe].npatches ) allnodes[usednodes[i2++].node].node = pe;
1546 for ( pid=0; pid<npatches; ++pid ) {
1548 if ( ! baseNodes ) {
1549 patchMap->
assignNode(pid, allnodes[assignedNode[pid]].node);
1556 void WorkDistrib::assignPatchesRoundRobin()
1560 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
1561 Node *node = nd.ckLocalBranch();
1567 int *assignedNode =
new int[patchMap->
numPatches()];
1569 for(pid=0; pid < patchMap->
numPatches(); pid++) {
1570 assignedNode[pid] = pid % ncpus;
1573 sortNodesAndAssign(assignedNode);
1574 delete [] assignedNode;
1578 void WorkDistrib::assignPatchesRecursiveBisection()
1581 int *assignedNode =
new int[patchMap->
numPatches()];
1588 int usedNodes = numNodes;
1589 int unusedNodes = 0;
1590 CkPrintf(
"assignPatchesRecursiveBisection\n");
1596 unusedNodes = numNodes - usedNodes;
1598 if ( recBisec.partition(assignedNode) ) {
1599 if ( unusedNodes !=0 ) {
1600 for (
int i=0; i<patchMap->
numPatches(); ++i ) {
1601 assignedNode[i] += unusedNodes;
1604 sortNodesAndAssign(assignedNode);
1605 delete [] assignedNode;
1610 delete [] assignedNode;
1613 <<
"WorkDistrib: Recursive bisection fails, "
1614 <<
"invoking space-filling curve algorithm\n";
1615 assignPatchesSpaceFillingCurve();
1626 return CmiGetFirstPeOnPhysicalNode(CmiPhysicalNodeID(pe));
1630 int na=
tmgr.getDimNA();
1631 int nb=
tmgr.getDimNB();
1632 int nc=
tmgr.getDimNC();
1633 int nd=
tmgr.getDimND();
1634 int ne=
tmgr.getDimNE();
1636 int na=
tmgr.getDimNX();
1637 int nb=
tmgr.getDimNY();
1638 int nc=
tmgr.getDimNZ();
1647 for (
int i=0; i<na; ++i ) { a_flags[i] = 0; }
1648 for (
int i=0; i<nb; ++i ) { b_flags[i] = 0; }
1649 for (
int i=0; i<nc; ++i ) { c_flags[i] = 0; }
1650 for (
int i=0; i<nd; ++i ) { d_flags[i] = 0; }
1651 for (
int i=0; i<ne; ++i ) { e_flags[i] = 0; }
1652 int npes = CkNumPes();
1653 for (
int pe=0; pe<npes; ++pe ) {
1656 tmgr.rankToCoordinates(
fixpe(pe),a,b,c,d,e,t);
1658 tmgr.rankToCoordinates(
fixpe(pe),a,b,c,t);
1661 if ( a < 0 || a >= na )
NAMD_bug(
"inconsistent torus topology!");
1662 if ( b < 0 || b >= nb )
NAMD_bug(
"inconsistent torus topology!");
1663 if ( c < 0 || c >= nc )
NAMD_bug(
"inconsistent torus topology!");
1664 if ( d < 0 || d >= nd )
NAMD_bug(
"inconsistent torus topology!");
1665 if ( e < 0 || e >= ne )
NAMD_bug(
"inconsistent torus topology!");
1672 iout <<
iINFO <<
"TORUS A SIZE " << na <<
" USING";
1673 for (
int i=0; i<na; ++i ) {
if ( a_flags[i] )
iout <<
" " << i; }
1675 iout <<
iINFO <<
"TORUS B SIZE " << nb <<
" USING";
1676 for (
int i=0; i<nb; ++i ) {
if ( b_flags[i] )
iout <<
" " << i; }
1678 iout <<
iINFO <<
"TORUS C SIZE " << nc <<
" USING";
1679 for (
int i=0; i<nc; ++i ) {
if ( c_flags[i] )
iout <<
" " << i; }
1682 iout <<
iINFO <<
"TORUS D SIZE " << nd <<
" USING";
1683 for (
int i=0; i<nd; ++i ) {
if ( d_flags[i] )
iout <<
" " << i; }
1685 iout <<
iINFO <<
"TORUS E SIZE " << ne <<
" USING";
1686 for (
int i=0; i<ne; ++i ) {
if ( e_flags[i] )
iout <<
" " << i; }
1693 if (
tmgr.absA(na) == 0 )
1695 if (
tmgr.absX(na) == 0 )
1697 for (
int i=0, gaplen=0, gapstart=0; i<2*na; ++i ) {
1698 if ( a_flags[i%na] ) gapstart = i+1;
1699 else if ( i - gapstart >= gaplen ) {
1700 a_rot = 2*na-i-1; gaplen = i - gapstart;
1704 if (
tmgr.absB(nb) == 0 )
1706 if (
tmgr.absY(nb) == 0 )
1708 for (
int i=0, gaplen=0, gapstart=0; i<2*nb; ++i ) {
1709 if ( b_flags[i%nb] ) gapstart = i+1;
1710 else if ( i - gapstart >= gaplen ) {
1711 b_rot = 2*nb-i-1; gaplen = i - gapstart;
1715 if (
tmgr.absC(nc) == 0 )
1717 if (
tmgr.absZ(nc) == 0 )
1719 for (
int i=0, gaplen=0, gapstart=0; i<2*nc; ++i ) {
1720 if ( c_flags[i%nc] ) gapstart = i+1;
1721 else if ( i - gapstart >= gaplen ) {
1722 c_rot = 2*nc-i-1; gaplen = i - gapstart;
1726 if (
tmgr.absD(nd) == 0 )
1727 for (
int i=0, gaplen=0, gapstart=0; i<2*nd; ++i ) {
1728 if ( d_flags[i%nd] ) gapstart = i+1;
1729 else if ( i - gapstart >= gaplen ) {
1730 d_rot = 2*nd-i-1; gaplen = i - gapstart;
1733 if (
tmgr.absE(ne) == 0 )
1734 for (
int i=0, gaplen=0, gapstart=0; i<2*ne; ++i ) {
1735 if ( e_flags[i%ne] ) gapstart = i+1;
1736 else if ( i - gapstart >= gaplen ) {
1737 e_rot = 2*ne-i-1; gaplen = i - gapstart;
1742 int a_min=na, a_max=-1;
1743 int b_min=nb, b_max=-1;
1744 int c_min=nc, c_max=-1;
1745 int d_min=nd, d_max=-1;
1746 int e_min=ne, e_max=-1;
1747 for (
int pe=0; pe<npes; ++pe ) {
1750 tmgr.rankToCoordinates(
fixpe(pe),a,b,c,d,e,t);
1752 tmgr.rankToCoordinates(
fixpe(pe),a,b,c,t);
1760 if ( a < a_min ) a_min = a;
1761 if ( b < b_min ) b_min = b;
1762 if ( c < c_min ) c_min = c;
1763 if ( d < d_min ) d_min = d;
1764 if ( e < e_min ) e_min = e;
1765 if ( a > a_max ) a_max = a;
1766 if ( b > b_max ) b_max = b;
1767 if ( c > c_max ) c_max = c;
1768 if ( d > d_max ) d_max = d;
1769 if ( e > e_max ) e_max = e;
1771 int a_len = a_max - a_min + 1;
1772 int b_len = b_max - b_min + 1;
1773 int c_len = c_max - c_min + 1;
1774 int d_len = d_max - d_min + 1;
1775 int e_len = e_max - e_min + 1;
1777 lensort[0] = (a_len << 3) + 0;
1778 lensort[1] = (b_len << 3) + 1;
1779 lensort[2] = (c_len << 3) + 2;
1780 lensort[3] = (d_len << 3) + 3;
1781 lensort[4] = (e_len << 3) + 4;
1785 for (
int i=0; i<5; ++i ) {
if ( (lensort[i] & 7) == 0 )
a_dim = 4-i; }
1786 for (
int i=0; i<5; ++i ) {
if ( (lensort[i] & 7) == 1 )
b_dim = 4-i; }
1787 for (
int i=0; i<5; ++i ) {
if ( (lensort[i] & 7) == 2 )
c_dim = 4-i; }
1788 for (
int i=0; i<5; ++i ) {
if ( (lensort[i] & 7) == 3 )
d_dim = 4-i; }
1789 for (
int i=0; i<5; ++i ) {
if ( (lensort[i] & 7) == 4 )
e_dim = 4-i; }
1791 if ( a_len >= b_len && a_len >= c_len ) {
1793 if ( b_len >= c_len ) {
1798 }
else if ( b_len >= a_len && b_len >= c_len ) {
1800 if ( a_len >= c_len ) {
1807 if ( a_len >= b_len ) {
1814 iout <<
iINFO <<
"TORUS MINIMAL MESH SIZE IS " << a_len <<
" BY " << b_len <<
" BY " << c_len
1816 <<
" BY " << d_len <<
" BY " << e_len
1824 tmgr.rankToCoordinates(
fixpe(pe),a,b,c,d,e,t);
1826 tmgr.rankToCoordinates(
fixpe(pe),a,b,c,t);
1845 int crds1[3], crds2[3];
1848 for (
int i=0; i<3; ++i ) {
1850 if ( crds1[d] != crds2[d] )
return ( crds1[d] < crds2[d] );
1853 return ( index[pe1] < index[pe2] );
1857 if ( node_begin == node_end )
return node_begin;
1858 int tmins[3], tmaxs[3], tlens[3], sortdims[3];
1859 coords(*node_begin, tmins);
1860 coords(*node_begin, tmaxs);
1861 for (
int *peitr = node_begin; peitr != node_end; ++peitr ) {
1864 for (
int i=0; i<3; ++i ) {
1865 if ( tvals[i] < tmins[i] ) tmins[i] = tvals[i];
1866 if ( tvals[i] > tmaxs[i] ) tmaxs[i] = tvals[i];
1869 for (
int i=0; i<3; ++i ) {
1870 tlens[i] = tmaxs[i] - tmins[i];
1872 sortdims[0] = splitdim;
1873 for (
int i=0, j=0; i<3; ++i ) {
1874 if ( i != splitdim ) sortdims[++j] = i;
1876 if ( tlens[sortdims[1]] < tlens[sortdims[2]] ) {
1877 int tmp = sortdims[1];
1878 sortdims[1] = sortdims[2];
1882 int *nodes = node_begin;
1883 int nnodes = node_end - node_begin;
1886 int c_split =
coord(nodes[0],splitdim);
1887 for (
int i=0; i<nnodes; ++i ) {
1888 if (
coord(nodes[i],splitdim) != c_split ) {
1889 int mid = (nnodes+1)/2;
1890 if ( abs(i-mid) < abs(i_split-mid) ) {
1892 c_split =
coord(i,splitdim);
1898 for (
int i=0; i<nnodes; ++i ) {
1899 if ( ! CmiPeOnSamePhysicalNode(nodes[i_split],nodes[i]) ) {
1900 int mid = (nnodes+1)/2;
1901 if ( abs(i-mid) < abs(i_split-mid) ) i_split = i;
1905 return ( node_begin + i_split );
1915 if ( a1 < a2 )
return true;
1916 if ( a1 > a2 )
return false;
1917 int dir = ( (a1 & 1) ? -1 : 1 );
1920 if ( b1 * dir < b2 * dir )
return true;
1921 if ( b1 * dir > b2 * dir )
return false;
1922 dir *= ( (b1 & 1) ? -1 : 1 );
1925 if ( c1 * dir < c2 * dir )
return true;
1936 if ( a1 < a2 )
return true;
1937 if ( a1 > a2 )
return false;
1938 int dir = ( (a1 & 1) ? -1 : 1 );
1941 if ( b1 * dir < b2 * dir )
return true;
1942 if ( b1 * dir > b2 * dir )
return false;
1943 dir *= ( (b1 & 1) ? -1 : 1 );
1946 if ( c1 * dir < c2 * dir )
return true;
1957 if ( a1 < a2 )
return true;
1958 if ( a1 > a2 )
return false;
1959 int dir = ( (a1 & 1) ? -1 : 1 );
1962 if ( b1 * dir < b2 * dir )
return true;
1963 if ( b1 * dir > b2 * dir )
return false;
1964 dir *= ( (b1 & 1) ? -1 : 1 );
1967 if ( c1 * dir < c2 * dir )
return true;
1973 int *patch_begin,
int *patch_end,
1974 int *node_begin,
int *node_end,
1976 double *sortedLoads,
1983 int *patches = patch_begin;
1984 int npatches = patch_end - patch_begin;
1985 int *nodes = node_begin;
1986 int nnodes = node_end - node_begin;
1989 double totalRawLoad = 0;
1990 for (
int i=0; i<npatches; ++i ) {
1992 #ifdef MEM_OPT_VERSION
1993 double load = patchMap->numAtoms(pid) + 10;
1997 patchLoads[pid] =
load;
1998 sortedLoads[i] =
load;
1999 totalRawLoad +=
load;
2001 std::sort(sortedLoads,sortedLoads+npatches);
2005 double maxPatchLoad = 1;
2006 for (
int i=0; i<npatches; ++i ) {
2007 double load = sortedLoads[i];
2008 double total = sumLoad + (npatches-i) * load;
2009 if ( nnodes * load > total )
break;
2011 maxPatchLoad =
load;
2013 double totalLoad = 0;
2014 for (
int i=0; i<npatches; ++i ) {
2016 if ( patchLoads[pid] > maxPatchLoad ) patchLoads[pid] = maxPatchLoad;
2017 totalLoad += patchLoads[pid];
2019 if ( nnodes * maxPatchLoad > totalLoad )
2020 NAMD_bug(
"algorithm failure in WorkDistrib recursive_bisect_with_curve()");
2022 int a_len, b_len, c_len;
2023 int a_min, b_min, c_min;
2025 a_min = patchMap->
index_a(patches[0]);
2026 b_min = patchMap->
index_b(patches[0]);
2027 c_min = patchMap->
index_c(patches[0]);
2031 for (
int i=1; i<npatches; ++i ) {
2032 int a = patchMap->
index_a(patches[i]);
2033 int b = patchMap->
index_b(patches[i]);
2034 int c = patchMap->
index_c(patches[i]);
2035 if ( a < a_min ) a_min = a;
2036 if ( b < b_min ) b_min = b;
2037 if ( c < c_min ) c_min = c;
2038 if ( a > a_max ) a_max = a;
2039 if ( b > b_max ) b_max = b;
2040 if ( c > c_max ) c_max = c;
2042 a_len = a_max - a_min;
2043 b_len = b_max - b_min;
2044 c_len = c_max - c_min;
2047 int *node_split = node_begin;
2050 if ( a_len >= b_len && a_len >= c_len ) {
2052 }
else if ( b_len >= a_len && b_len >= c_len ) {
2054 }
else if ( c_len >= a_len && c_len >= b_len ) {
2058 if ( node_split == node_begin ) {
2063 for (
int i=0; i<nnodes; ++i ) {
2064 if ( ! CmiPeOnSamePhysicalNode(nodes[i_split],nodes[i]) ) {
2065 int mid = (nnodes+1)/2;
2066 if ( abs(i-mid) < abs(i_split-mid) ) i_split = i;
2070 node_split = node_begin + i_split;
2073 bool final_patch_sort =
false;
2075 if ( node_split == node_begin ) {
2077 nnodes == CmiNumPesOnPhysicalNode(CmiPhysicalNodeID(*node_begin)) ) {
2079 tmgr.
coords(*node_begin, crds);
2080 CkPrintf(
"WorkDistrib: physnode %5d pe %5d node %5d at %5d %5d %5d from %5d %5d %5d has %5d patches %5d x %5d x %5d load %7f pes %5d\n",
2081 CmiPhysicalNodeID(*node_begin), *node_begin,
2082 CkNodeOf(*node_begin), crds[0], crds[1], crds[2],
2083 a_min, b_min, c_min, npatches,
2084 a_len+1, b_len+1, c_len+1, totalRawLoad, nnodes);
2088 final_patch_sort =
true;
2092 for (
int i=0; i<nnodes; ++i ) {
2093 if ( CmiNodeOf(nodes[i_split]) != CmiNodeOf(nodes[i]) ) {
2094 int mid = (nnodes+1)/2;
2095 if ( abs(i-mid) < abs(i_split-mid) ) i_split = i;
2099 node_split = node_begin + i_split;
2102 if ( node_split == node_begin ) {
2104 nnodes == CmiNodeSize(CmiNodeOf(*node_begin)) ) {
2106 tmgr.
coords(*node_begin, crds);
2107 CkPrintf(
"WorkDistrib: node %5d pe %5d has %5d patches %5d x %5d x %5d load %7f pes %5d\n",
2108 CmiNodeOf(*node_begin), *node_begin, npatches,
2109 a_len+1, b_len+1, c_len+1, totalRawLoad, nnodes);
2113 node_split = node_begin + nnodes/2;
2116 if ( nnodes == 1 ) {
2118 int *node = node_begin;
2120 for (
int i=0; i < npatches; ++i ) {
2121 int pid = patches[i];
2122 assignedNode[pid] = *node;
2123 sumLoad += patchLoads[pid];
2124 if ( 0 ) CkPrintf(
"assign %5d node %5d patch %5d %5d %5d load %7f total %7f\n",
2129 patchLoads[pid], sumLoad);
2135 if ( final_patch_sort ) {
2138 }
else if ( a_len >= b_len && a_len >= c_len ) {
2139 if ( 0 ) CkPrintf(
"sort a\n");
2141 }
else if ( b_len >= a_len && b_len >= c_len ) {
2142 if ( 0 ) CkPrintf(
"sort b\n");
2144 }
else if ( c_len >= a_len && c_len >= b_len ) {
2145 if ( 0 ) CkPrintf(
"sort c\n");
2151 int *node = node_begin;
2153 for ( patch_split = patch_begin;
2154 patch_split != patch_end && node != node_split;
2156 sumLoad += patchLoads[*patch_split];
2157 double targetLoad = totalLoad *
2158 ((double)(node-node_begin+1) / (double)nnodes);
2159 if ( 0 ) CkPrintf(
"test %5d node %5d patch %5d %5d %5d load %7f target %7f\n",
2160 patch_split - patch_begin, *node,
2161 patchMap->
index_a(*patch_split),
2162 patchMap->
index_b(*patch_split),
2163 patchMap->
index_c(*patch_split),
2164 sumLoad, targetLoad);
2165 double extra = ( patch_split+1 != patch_end ? 0.5 * patchLoads[*(patch_split+1)] : 0 );
2166 if ( node+1 < node_end && sumLoad + extra >= targetLoad ) { ++node; }
2168 double targetLoad = totalLoad *
2169 ((double)(node_split-node_begin) / (double)nnodes);
2170 if ( 0 ) CkPrintf(
"split node %5d/%5d patch %5d/%5d load %7f target %7f\n",
2171 node_split-node_begin, nnodes,
2172 patch_split-patch_begin, npatches,
2173 sumLoad, targetLoad);
2178 patch_begin, patch_split, node_begin, node_split,
2179 patchLoads, sortedLoads, assignedNode, tmgr);
2181 patch_split, patch_end, node_split, node_end,
2182 patchLoads, sortedLoads, assignedNode, tmgr);
2186 void WorkDistrib::assignPatchesSpaceFillingCurve()
2197 NAMD_die(
"simulateInitialMapping not supported by assignPatchesSpaceFillingCurve()");
2203 patchOrdering[i] = i;
2207 nodeOrdering.resize(0);
2208 for (
int i=0; i<numNodes; ++i ) {
2211 if ( pe == 0 )
continue;
2213 if ( pe == 1 )
continue;
2216 #ifdef MEM_OPT_VERSION
2221 nodeOrdering.add(pe);
2222 if ( 0 ) CkPrintf(
"using pe %5d\n", pe);
2225 int *node_begin = nodeOrdering.begin();
2226 int *node_end = nodeOrdering.end();
2230 std::sort(node_begin, node_end, pe_sortop_compact());
2232 int *basenode_begin = node_begin;
2233 int *basenode_end = node_end;
2235 basenode_begin = node_end;
2237 std::sort(basenode_begin, basenode_end, pe_sortop_compact());
2241 iout <<
iWARN <<
"IGNORING TORUS TOPOLOGY DURING PATCH PLACEMENT\n" <<
endi;
2245 patchOrdering.begin(), patchOrdering.end(),
2246 node_begin, node_end,
2247 patchLoads.begin(), sortedLoads.begin(), assignedNode, tmgr);
2249 std::sort(node_begin, node_end, pe_sortop_compact());
2251 int samenodecount = 0;
2254 int node = assignedNode[pid];
2256 int nodeidx = std::lower_bound(node_begin, node_end, node,
2257 pe_sortop_compact()) - node_begin;
2258 int basenode = basenode_begin[nodeidx];
2260 if ( CmiPeOnSamePhysicalNode(node,basenode) ) ++samenodecount;
2263 iout <<
iINFO <<
"Placed " << (samenodecount*100./
numPatches) <<
"% of base nodes on same physical node as patch\n" <<
endi;
2265 delete [] assignedNode;
2273 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
2274 Node *node = nd.ckLocalBranch();
2276 DebugM(3,
"Mapping computes\n");
2285 mapComputeHomePatches(computeDPMTAType);
2287 NAMD_die(
"This binary does not include DPMTA (FMA).");
2292 mapComputeHomePatches(computeDPMEType);
2313 DebugM(2,
"adding ComputeGlobal\n");
2328 #ifdef CHARM_HAS_MSA
2342 mapComputeNode(computeBondedCUDAType);
2361 mapComputeNonbonded();
2432 void WorkDistrib::mapComputeHomeTuples(
ComputeType type)
2436 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
2437 Node *node = nd.ckLocalBranch();
2445 char *isBaseNode =
new char[numNodes];
2446 memset(isBaseNode,0,numNodes*
sizeof(
char));
2450 isBaseNode[patchMap->
basenode(j)] = 1;
2453 for(
int i=0; i<numNodes; i++) {
2454 if ( isBaseNode[i] ) {
2459 delete [] isBaseNode;
2463 void WorkDistrib::mapComputeHomePatches(
ComputeType type)
2467 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
2468 Node *node = nd.ckLocalBranch();
2476 for(
int i=0; i<numNodes; i++) {
2484 void WorkDistrib::mapComputePatch(
ComputeType type)
2495 computeMap->
newPid(cid,i);
2502 void WorkDistrib::mapComputeNode(
ComputeType type)
2510 int ncpus = CkNumPes();
2516 for(
int i=0; i<ncpus; i++) {
2523 void WorkDistrib::mapComputeNonbonded(
void)
2531 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
2532 Node *node = nd.ckLocalBranch();
2534 int ncpus = CkNumPes();
2535 int nodesize = CkMyNodeSize();
2549 double partScaling = 1.0;
2551 partScaling = ((double)ncpus) / ((double)patchMap->
numPatches());
2557 int numPartitions = 1;
2560 #ifdef MEM_OPT_VERSION
2561 int64 numFixed = patchMap->numFixedAtoms(i);
2562 int64 numAtoms = patchMap->numAtoms(i);
2570 numPartitions = (int) ( partScaling * ( 0.5 +
2571 (numAtoms*numAtoms-numFixed*numFixed) / (
double)(2*divide*divide) ) );
2573 if (numPartitions < 1) numPartitions = 1;
2577 DebugM(4,
"Mapping " << numPartitions <<
" ComputeNonbondedSelf objects for patch " << i <<
"\n");
2592 computeMap->
newPid(cid,i);
2597 for(
int p1=0; p1 <patchMap->
numPatches(); p1++)
2601 for(j=0;j<numNeighbors;j++)
2603 int p2 = oneAway[j];
2604 int dsp = oneAwayDownstream[j];
2606 int numPartitions = 1;
2609 #ifdef MEM_OPT_VERSION
2610 int64 numAtoms1 = patchMap->numAtoms(p1);
2611 int64 numAtoms2 = patchMap->numAtoms(p2);
2612 int64 numFixed1 = patchMap->numFixedAtoms(p1);
2613 int64 numFixed2 = patchMap->numFixedAtoms(p2);
2622 const int t2 = oneAwayTrans[j];
2629 const int ia1 = patchMap->
index_a(p1);
2631 const int ib1 = patchMap->
index_b(p1);
2633 const int ic1 = patchMap->
index_c(p1);
2636 if ( abs(ia2-ia1) > nax ||
2637 abs(ib2-ib1) > nay ||
2638 abs(ic2-ic1) > naz )
2639 NAMD_bug(
"Bad patch distance in WorkDistrib::mapComputeNonbonded");
2642 if ( ia1 == ia2 ) --distance;
2643 else if ( ia1 == ia2 + nax - 1 ) --distance;
2644 else if ( ia1 + nax - 1 == ia2 ) --distance;
2645 if ( ib1 == ib2 ) --distance;
2646 else if ( ib1 == ib2 + nay - 1 ) --distance;
2647 else if ( ib1 + nay - 1 == ib2 ) --distance;
2648 if ( ic1 == ic2 ) --distance;
2649 else if ( ic1 == ic2 + naz - 1 ) --distance;
2650 else if ( ic1 + naz - 1 == ic2 ) --distance;
2652 if ( distance == 0 ) {
2654 }
else if (distance == 1) {
2660 numPartitions = (int) ( partScaling * ( 0.5 +
2661 (numAtoms1*numAtoms2-numFixed1*numFixed2)/(
double)(divide*divide) ) );
2663 if ( numPartitions < 1 ) numPartitions = 1;
2669 for(
int partition=0; partition < numPartitions; partition++)
2673 computeMap->
newPid(cid,p1);
2674 computeMap->
newPid(cid,p2,oneAwayTrans[j]);
2675 patchMap->
newCid(p1,cid);
2676 patchMap->
newCid(p2,cid);
2683 void WorkDistrib::mapComputeLCPO(
void) {
2688 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
2689 Node *node = nd.ckLocalBranch();
2691 int ncpus = CkNumPes();
2692 int nodesize = CkMyNodeSize();
2693 const int maxPatches = 8;
2695 int numPatchesInOctet;
2696 PatchID patchesInOctet[maxPatches];
2697 int oneAwayTrans[maxPatches];
2700 int numPartitions = 1;
2710 for(
int partition=0; partition < numPartitions; partition++) {
2716 for (
int p = 0; p < numPatchesInOctet; p++) {
2717 computeMap->
newPid(cid, patchesInOctet[p], oneAwayTrans[p]);
2719 for (
int p = 0; p < numPatchesInOctet; p++) {
2720 patchMap->
newCid(patchesInOctet[p],cid);
2733 NAMD_bug(
"compute->sequence() < 0 in WorkDistrib::messageEnqueueWork");
2739 int type = compute->
type();
2740 int cid = compute->
cid;
2742 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
2746 wdProxy[CkMyPe()].enqueueExcls(msg);
2750 wdProxy[CkMyPe()].enqueueBonds(msg);
2754 wdProxy[CkMyPe()].enqueueAngles(msg);
2758 wdProxy[CkMyPe()].enqueueDihedrals(msg);
2762 wdProxy[CkMyPe()].enqueueImpropers(msg);
2766 wdProxy[CkMyPe()].enqueueThole(msg);
2770 wdProxy[CkMyPe()].enqueueAniso(msg);
2774 wdProxy[CkMyPe()].enqueueCrossterms(msg);
2779 wdProxy[CkMyPe()].enqueueGromacsPair(msg);
2782 case computeLCPOType:
2783 wdProxy[CkMyPe()].enqueueLCPO(msg);
2786 switch ( seq % 2 ) {
2789 switch ( gbisPhase ) {
2791 wdProxy[CkMyPe()].enqueueSelfA1(msg);
2794 wdProxy[CkMyPe()].enqueueSelfA2(msg);
2797 wdProxy[CkMyPe()].enqueueSelfA3(msg);
2803 switch ( gbisPhase ) {
2805 wdProxy[CkMyPe()].enqueueSelfB1(msg);
2808 wdProxy[CkMyPe()].enqueueSelfB2(msg);
2811 wdProxy[CkMyPe()].enqueueSelfB3(msg);
2816 NAMD_bug(
"WorkDistrib::messageEnqueueSelf case statement error!");
2820 switch ( seq % 2 ) {
2823 switch ( gbisPhase ) {
2825 wdProxy[CkMyPe()].enqueueWorkA1(msg);
2828 wdProxy[CkMyPe()].enqueueWorkA2(msg);
2831 wdProxy[CkMyPe()].enqueueWorkA3(msg);
2837 switch ( gbisPhase ) {
2839 wdProxy[CkMyPe()].enqueueWorkB1(msg);
2842 wdProxy[CkMyPe()].enqueueWorkB2(msg);
2845 wdProxy[CkMyPe()].enqueueWorkB3(msg);
2850 wdProxy[CkMyPe()].enqueueWorkC(msg);
2853 NAMD_bug(
"WorkDistrib::messageEnqueueWork case statement error!");
2861 switch ( gbisPhase ) {
2863 wdProxy[CkMyPe()].enqueueCUDA(msg);
2866 wdProxy[CkMyPe()].enqueueCUDAP2(msg);
2869 wdProxy[CkMyPe()].enqueueCUDAP3(msg);
2878 wdProxy[CkMyPe()].enqueueMIC(msg);
2883 wdProxy[CkMyPe()].enqueuePme(msg);
2887 wdProxy[CkMyPe()].enqueuePme(msg);
2891 wdProxy[CkMyPe()].enqueueWork(msg);
2902 NAMD_bug(
"compute->sequence() < 0 in WorkDistrib::messageEnqueueWork");
2908 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
2912 switch ( gbisPhase ) {
2914 wdProxy[CkMyPe()].finishCUDA(msg);
2917 wdProxy[CkMyPe()].finishCUDAP2(msg);
2920 wdProxy[CkMyPe()].finishCUDAP3(msg);
2935 NAMD_bug(
"compute->sequence() < 0 in WorkDistrib::messageFinishMIC");
2941 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
2944 wdProxy[CkMyPe()].finishMIC(msg);
2953 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2959 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2965 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2971 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2977 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2983 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2989 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
2995 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3001 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3009 NAMD_bug(
"\nWorkDistrib LocalWorkMsg recycling failed! Check enqueueGromacsPair from WorkDistrib.C\n");
3016 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3022 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3027 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3032 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3037 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3043 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3048 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3053 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3059 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3064 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3069 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3075 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3080 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3085 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3093 NAMD_bug(
"WorkDistrib LocalWorkMsg recycling failed!");
3148 void WorkDistrib::velocities_from_PDB(
const char *filename,
3149 Vector *v,
int totalAtoms)
3155 v_pdb =
new PDB(filename);
3156 if ( v_pdb == NULL )
3158 NAMD_die(
"memory allocation failed in Node::velocities_from_PDB");
3167 sprintf(err_msg,
"FOUND %d COORDINATES IN VELOCITY PDB!!",
3177 for (i=0; i<totalAtoms; i++)
3202 void WorkDistrib::velocities_from_binfile(
const char *fname,
Vector *vels,
int n)
3223 Vector *v,
int totalAtoms)
3240 for (i=0; i<totalAtoms; i++)
3242 if (structure->
atommass(i) <= 0.) {
3245 kbToverM = sqrt(kbT *
3246 ( lesOn && structure->
get_fep_type(i) ? tempFactor : 1.0 ) /
3259 for (randnum=0.0, j=0; j<12; j++)
3261 randnum += vel_random.uniform();
3266 v[i].
x = randnum*kbToverM;
3268 for (randnum=0.0, j=0; j<12; j++)
3270 randnum += vel_random.uniform();
3275 v[i].
y = randnum*kbToverM;
3277 for (randnum=0.0, j=0; j<12; j++)
3279 randnum += vel_random.uniform();
3284 v[i].
z = randnum*kbToverM;
3287 if ( simParams->
drudeOn )
for (i=0; i<totalAtoms; i++) {
3306 void WorkDistrib::remove_com_motion(
Vector *vel,
Molecule *structure,
int n)
3316 mv += mass * vel[i];
3322 iout <<
iINFO <<
"REMOVING COM VELOCITY "
3325 for (i=0; i<n; i++) { vel[i] -= mv; }
3334 int WorkDistrib::assignPatchesTopoGridRecBisection() {
3337 int *assignedNode =
new int[patchMap->
numPatches()];
3344 int usedNodes = numNodes;
3345 CkPrintf(
"assignPatchesTopoGridRecBisection\n");
3353 int xsize = 0, ysize = 0, zsize = 0;
3357 xsize = tmgr.getDimNX();
3358 ysize = tmgr.getDimNY();
3359 zsize = tmgr.getDimNZ();
3362 int rc = recBisec.partitionProcGrid(xsize, ysize, zsize, assignedNode);
3364 delete [] assignedNode;
3371 #if defined(NAMD_MIC)
3372 extern void mic_hostDeviceLDB();
3373 extern void mic_contributeHostDeviceLDB(
int idLen,
int *
id);
3374 extern void mic_setDeviceLDBParams(
int dt,
int hs,
int sp1,
int pp1,
int pp2);
3378 #if defined(NAMD_MIC)
3379 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
3380 wdProxy.initHostDeviceLDB();
3385 #if defined(NAMD_MIC)
3386 mic_hostDeviceLDB();
3391 #if defined(NAMD_MIC)
3392 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
3393 wdProxy[0].contributeHostDeviceLDB(peSetLen, peSet);
3398 #if defined(NAMD_MIC)
3399 mic_contributeHostDeviceLDB(peSetLen, peSet);
3404 #if defined(NAMD_MIC)
3405 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
3406 wdProxy.setDeviceLDBParams(dt, hs, sp1, pp1, pp2);
3411 #if defined(NAMD_MIC)
3412 mic_setDeviceLDBParams(dt, hs, sp1, pp1, pp2);
3417 #include "WorkDistrib.def.h"
static int offset_b(int i)
void setDeviceLDBParams(int dt, int hs, int sp1, int pp1, int pp2)
BlockLoad::TempStorage load
void enqueueMIC(LocalWorkMsg *msg)
Real langevin_param(int atomnum) const
std::ostream & iINFO(std::ostream &s)
static void sortPmePes(int *pmepes, int xdim, int ydim)
Bool simulateInitialMapping
static void messageFinishMIC(Compute *)
int isSendSpanningTreeUnset()
patch_sortop_curve_b(PatchMap *m)
void enqueueAngles(LocalWorkMsg *msg)
static void messageFinishCUDA(Compute *)
PatchID assignToPatch(Position p, const Lattice &l)
void setNewNumPartitions(ComputeID cid, char numPartitions)
static int offset_c(int i)
static void recursive_bisect_with_curve(int *patch_begin, int *patch_end, int *node_begin, int *node_end, double *patchLoads, double *sortedLoads, int *assignedNode, TopoManagerWrapper &tmgr)
void saveComputeMap(const char *fname)
static ProxyMgr * Object()
unsigned int hydrogenGroupSize
static int * peCompactOrdering
void finishCUDAP3(LocalWorkMsg *msg)
void enqueueCrossterms(LocalWorkMsg *msg)
static void partition(int *order, const FullAtom *atoms, int begin, int end)
int isRecvSpanningTreeUnset()
void enqueuePme(LocalWorkMsg *msg)
int gridsize_c(void) const
static PatchMap * Object()
int operator==(const nodesort &o) const
void enqueueWorkA3(LocalWorkMsg *msg)
void enqueueWork(LocalWorkMsg *msg)
BigReal max_c(int pid) const
void enqueueGromacsPair(LocalWorkMsg *msg)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void enqueueSelfA1(LocalWorkMsg *msg)
void finishCUDAP2(LocalWorkMsg *msg)
BigReal min_a(int pid) const
static void send_contributeHostDeviceLDB(int peSetLen, int *peSet)
SimParameters * simParameters
void loadComputeMap(const char *fname)
int index_a(int pid) const
static __thread atom * atoms
void createHomePatch(PatchID pid, FullAtomList &a)
void sendAtoms(PatchID pid, FullAtomList &a)
void enqueueExcls(LocalWorkMsg *msg)
void enqueueBonds(LocalWorkMsg *msg)
void enqueueAniso(LocalWorkMsg *msg)
void enqueueSelfB1(LocalWorkMsg *msg)
Real rigid_bond_length(int atomnum) const
void enqueueWorkB1(LocalWorkMsg *msg)
static void messageEnqueueWork(Compute *)
static void peOrderingReady()
std::ostream & iWARN(std::ostream &s)
bool operator()(int pe1, int pe2) const
MIStream * get(char &data)
int operator<(const nodesort &o) const
int sizeGrid(ScaledPosition xmin, ScaledPosition xmax, const Lattice &lattice, BigReal patchSize, double maxNumPatches, int staticAtomAssignment, int asplit, int bsplit, int csplit)
ComputeID storeCompute(int node, int maxPids, ComputeType type, int partition=-1, int numPartitions=0)
int numaway_b(void) const
Patch * patch(PatchID pid)
void enqueueSelfA3(LocalWorkMsg *msg)
char computeMapFilename[NAMD_FILENAME_BUFFER_SIZE]
Position nearest(Position data, ScaledPosition ref) const
int basenode(int pid) const
bool operator()(int a, int b) const
void movePatch(PatchID, NodeID)
LocalWorkMsg *const localWorkMsg
BigReal min_b(int pid) const
void recvComputeMapChanges(ComputeMapChangeMsg *)
char newNumPartitions(ComputeID cid)
bool operator()(int p1, int p2) const
void reorder(Elem *a, int n)
HydrogenGroup hydrogenGroup
void enqueueCUDA(LocalWorkMsg *msg)
void sendComputeMap(void)
void enqueueWorkB2(LocalWorkMsg *msg)
void enqueueCUDAP2(LocalWorkMsg *msg)
void assignBaseNode(PatchID, NodeID)
static void recursive_bisect_coord(int x_begin, int x_end, int y_begin, int y_end, int *pe_begin, ScaledPosition *coord, int *result, int ydim)
void newCid(int pid, int cid)
void enqueueSelfB3(LocalWorkMsg *msg)
int coord(int pe, int dim)
TopoManagerWrapper & tmgr
void enqueueWorkC(LocalWorkMsg *msg)
pe_sortop_bit_reversed(int *r)
int gridsize_a(void) const
int compare_bit_reversed(int a, int b)
void reinitAtoms(const char *basename=0)
void enqueueThole(LocalWorkMsg *msg)
void enqueueWorkA2(LocalWorkMsg *msg)
void createHomePatches(void)
void NAMD_bug(const char *err_msg)
static int offset_a(int i)
void enqueueImpropers(LocalWorkMsg *msg)
static int eventMachineProgress
void enqueueLCPO(LocalWorkMsg *msg)
int oneOrTwoAwayNeighbors(int pid, PatchID *neighbor_ids, PatchID *downstream_ids=0, int *transform_ids=0)
Bool staticAtomAssignment
pe_sortop_coord_y(ScaledPosition *s)
Bool replicaUniformPatchGrids
int index_b(int pid) const
bool operator()(int a, int b) const
void finishCUDA(LocalWorkMsg *msg)
void setNewNode(ComputeID cid, NodeID node)
virtual void finishPatch(int)
void NAMD_die(const char *err_msg)
void enqueueCUDAP3(LocalWorkMsg *msg)
static int * peDiffuseOrderingIndex
void enqueueWorkA1(LocalWorkMsg *msg)
Bool pressureProfileEwaldOn
std::vector< std::string > split(const std::string &text, std::string delimiter)
static int * peDiffuseOrdering
void makePatches(ScaledPosition xmin, ScaledPosition xmax, const Lattice &lattice, BigReal patchSize, double maxNumPatches, int staticAtomAssignment, int replicaUniformPatchGrids, int lcpo, int asplit, int bsplit, int csplit)
unsigned char get_fep_type(int anum) const
BigReal max_b(int pid) const
int index_c(int pid) const
static int peOrderingInit
void find_extremes(const Lattice &, BigReal frac=1.0)
void saveComputeMapChanges(int, CkGroupID)
int add(const Elem &elem)
void finishCUDAPatch(FinishWorkMsg *msg)
BigReal max_a(int pid) const
void savePatchMap(PatchMapMsg *msg)
static int * peCompactOrderingIndex
ScaledPosition center(int pid) const
BlockRadixSort::TempStorage sort
void topo_getargs(char **)
static void buildNodeAwarePeOrdering(void)
patch_sortop_curve_a(PatchMap *m)
int pressureProfileAtomTypes
int atomsInMigrationGroup
void newPid(ComputeID cid, int pid, int trans=13)
static void send_setDeviceLDBParams(int dt, int hs, int sp1, int pp1, int pp2)
Index atomvdwtype(int anum) const
int numPatches(void) const
Position apply_transform(Position data, const Transform &t) const
bool operator()(int p1, int p2) const
void enqueueSelfA2(LocalWorkMsg *msg)
static ComputeMap * Object()
static void build_ordering(void *)
Real atomcharge(int anum) const
__thread DeviceCUDA * deviceCUDA
void distributeHomePatches(void)
StringList * find(const char *name) const
void assignNode(PatchID, NodeID)
bool operator()(int a, int b) const
bool less_than_bit_reversed(int a, int b)
patch_sortop_curve_c(PatchMap *m)
void enqueueSelfB2(LocalWorkMsg *msg)
int numPatchesOnNode(int node)
MOStream * put(char data)
Real atommass(int anum) const
static void send_initHostDeviceLDB()
int numaway_c(void) const
FullAtomList * createAtomLists(const char *basename=0)
unsigned int nonbondedGroupSize
BigReal min_c(int pid) const
infostream & endi(infostream &s)
#define SET_PRIORITY(MSG, SEQ, PRIO)
pe_sortop_coord_x(ScaledPosition *s)
void enqueueDihedrals(LocalWorkMsg *msg)
Bool is_atom_fixed(int atomnum) const
void finishMIC(LocalWorkMsg *msg)
void contributeHostDeviceLDB(int peSetLen, int *peSet)
void pack(char *buf, int size)
int isOutputProcessor(int pe)
void doneSaveComputeMap(CkReductionMsg *)
void unpack(MIStream *msg)
int numaway_a(void) const
void get_all_positions(Vector *)
pe_sortop_topo(TopoManagerWrapper &t, int *d)
bool operator()(int p1, int p2) const
void coords(int pe, int *crds)
void enqueueWorkB3(LocalWorkMsg *msg)
int gridsize_b(void) const
Bool noPatchesOnOutputPEs
int * sortAndSplit(int *node_begin, int *node_end, int splitdim)
void get_extremes(ScaledPosition &xmin, ScaledPosition &xmax) const
void sortAtomsForPatches(int *order, int *breaks, const FullAtom *atoms, int nmgrps, int natoms, int ni, int nj, int nk)
void assignNodeToPatch(void)
int getPatchesInOctet(int pid, PatchID *pids, int *transform_ids=0)
NodeID newNode(ComputeID cid)