6 #include <cuda_runtime.h>
31 #define __thread __declspec(thread)
36 extern __thread cudaStream_t stream;
37 extern __thread cudaStream_t stream2;
43 if ((err = cudaGetLastError()) != cudaSuccess) {
45 gethostname(host, 128); host[127] = 0;
46 char devstr[128] = "";
48 if ( cudaGetDevice(&devnum) == cudaSuccess ) {
49 sprintf(devstr, " device %d", devnum);
51 cudaDeviceProp deviceProp;
52 if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
53 sprintf(devstr, " device %d pci %x:%x:%x", devnum,
54 deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
57 sprintf(errmsg, "CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
63 if ( a == b ) return 0;
64 for ( int bit = 1; bit; bit *= 2 ) {
65 if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
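// A minimal standalone sketch (names not from the NAMD sources) of the
// bit-reversed ordering used by the comparator above: comparing two
// non-negative ints from the least significant differing bit upward is
// the same as comparing their bit-reversed values.
static bool less_bitreverse(int a, int b) {
  if ( a == b ) return false;
  for ( int bit = 1; bit; bit *= 2 ) {
    if ( (a & bit) != (b & bit) ) return (a & bit) < (b & bit);
  }
  return false;
}
// Sorting {0,1,2,3} with this comparator yields {0,2,1,3}: neighbors in the
// input end up far apart, which is the point of the bit-reversal ordering.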
89 int tsize = (((dim+16+31)/32)*32)-16;
90 if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
94 for ( int i=0; i<dim; ++i, row += tsize ) {
95 for ( int j=0; j<dim; ++j ) {
105 if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
106 CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
118 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
122 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
126 union { double f; int32 i[2]; } byte_order_test;
127 byte_order_test.f = 1.0;
128 int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
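// A hedged sketch (assuming IEEE-754 doubles; helper names are hypothetical)
// of the trick above: the high 32 bits of a double hold the sign, the 11-bit
// biased exponent and the top 20 mantissa bits, so shifting that word right
// by 14 keeps the exponent plus 6 mantissa bits -- 64 table slots per power
// of two.  Storing 1.0 in the union shows which 32-bit word is the high one.
#include <cstdint>
#include <cstring>

static int high_word_index() {
  union { double f; int32_t i[2]; } probe;
  probe.f = 1.0;               // biased exponent of 1.0 is 1023, so the high word is nonzero
  return probe.i[0] ? 0 : 1;   // big-endian: word 0 is high; little-endian: word 1
}

static int exponent_bucket(double r2, int r2_delta_exp) {
  int32_t w[2];
  std::memcpy(w, &r2, sizeof w);
  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
  return (w[high_word_index()] >> 14) + r2_delta_expc;
}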
131 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
132 int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;
156 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
158 BigReal ener = table_a + diffa *
159 ( ( table_d * diffa + table_c ) * diffa + table_b);
172 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
174 BigReal ener = table_a + diffa *
175 ( ( table_d * diffa + table_c ) * diffa + table_b);
188 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
189 t[i].y = 2. * -1. * grad;
190 BigReal ener = table_a + diffa *
191 ( ( table_d * diffa + table_c ) * diffa + table_b);
192 et[i].y = -1. * ener;
204 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
206 BigReal ener = table_a + diffa *
207 ( ( table_d * diffa + table_c ) * diffa + table_b);
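// The lines repeated above evaluate a cubic spline segment and its derivative
// in Horner form.  A minimal sketch with hypothetical names: for
// p(x) = a + b*x + c*x^2 + d*x^3 the energy is p(diffa) and the gradient is
// p'(diffa) = b + 2*c*diffa + 3*d*diffa^2.
struct SplineCoeffs { double a, b, c, d; };

static double spline_energy(const SplineCoeffs &s, double diffa) {
  return s.a + diffa * ( ( s.d * diffa + s.c ) * diffa + s.b );
}

static double spline_gradient(const SplineCoeffs &s, double diffa) {
  return ( 3. * s.d * diffa + 2. * s.c ) * diffa + s.b;
}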
237 if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
238 CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
244 return ( li[1] < lj[1] );
258 #ifdef MEM_OPT_VERSION
259 int natoms = mol->exclSigPoolSize;
273 for ( int i=0; i<natoms; ++i ) {
276 #ifdef MEM_OPT_VERSION
279 for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
283 int n = mol_list[0] + 1;
284 for ( int j=1; j<n; ++j ) {
285 curList.add(mol_list[j] - i);
291 for ( j=0; j<unique_lists.size(); ++j ) {
292 if ( n != unique_lists[j][0] ) continue;
294 for ( k=0; k<n; ++k ) {
295 if ( unique_lists[j][k+3] != curList[k] ) break;
299 if ( j == unique_lists.size() ) {
303 maxdiff = -1 * curList[0];
304 if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
306 for ( int k=0; k<n; ++k ) {
307 list[k+3] = curList[k];
309 unique_lists.add(list);
311 listsByAtom[i] = unique_lists[j];
315 long int totalbits = 0;
316 int nlists = unique_lists.size();
317 for ( int j=0; j<nlists; ++j ) {
318 int32 *list = unique_lists[j];
319 int maxdiff = list[1];
320 list[2] = totalbits + maxdiff;
321 totalbits += 2*maxdiff + 1;
323 for ( int i=0; i<natoms; ++i ) {
327 delete [] listsByAtom;
329 if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
332 long int bytesneeded = totalbits / 8;
333 if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
334 CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
335 unique_lists.size(), bytesneeded);
339 if ( bytesneeded > bytesavail ) {
341 sprintf(errmsg, "Found %d unique exclusion lists needing %ld bytes "
342 "but only %ld bytes can be addressed with 32-bit int.",
343 unique_lists.size(), bytesneeded, bytesavail);
348 #define SET_EXCL(EXCL,BASE,DIFF) \
349 (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
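// A small sketch (hypothetical helpers, not the NAMD macro itself) of what
// SET_EXCL does: the exclusion flags live in a flat array of 32-bit words,
// so flag number n = BASE+DIFF is word n>>5 (n/32), bit n&31 (n%32).
static inline void set_bit(unsigned int *bits, int base, int diff) {
  const int n = base + diff;          // base is chosen so n is never negative
  bits[n >> 5] |= (1u << (n & 31));
}

static inline bool test_bit(const unsigned int *bits, int base, int diff) {
  const int n = base + diff;
  return (bits[n >> 5] >> (n & 31)) & 1u;
}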
351 unsigned int *exclusion_bits = new unsigned int[totalbits/32];
352 memset(exclusion_bits, 0, totalbits/8);
355 for ( int i=0; i<unique_lists.size(); ++i ) {
356 base += unique_lists[i][1];
357 if ( unique_lists[i][2] != (int32)base ) {
358 NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
360 int n = unique_lists[i][0];
361 for ( int j=0; j<n; ++j ) {
362 SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
364 base += unique_lists[i][1] + 1;
369 delete [] exclusion_bits;
381 cr.pid[0] = pid; cr.pid[1] = pid;
399 cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
405 offset.x += (t1%3-1) - (t2%3-1);
406 offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
407 offset.z += (t1/9-1) - (t2/9-1);
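// A minimal sketch (hypothetical helper) of the decoding above: a transform
// index t in 0..26 stores one periodic-image shift per axis as base-3 digits,
// with digit values 0,1,2 standing for -1,0,+1.
struct ImageShift { int x, y, z; };

static ImageShift decode_transform(int t) {
  ImageShift s;
  s.x = t % 3 - 1;        // least significant base-3 digit
  s.y = (t / 3) % 3 - 1;  // middle digit
  s.z = t / 9 - 1;        // most significant digit
  return s;
}
// The code above accumulates decode_transform(t1) minus decode_transform(t2),
// i.e. the relative image shift between the two patches of a pair.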
419 NAMD_bug("unregister_compute unimplemented");
436 CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
439 if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
440 if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
441 if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
453 NAMD_die("pressure profile not supported in CUDA");
492 NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
508 NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
517 NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
521 #if CUDA_VERSION >= 5050
522 int leastPriority, greatestPriority;
523 cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
525 if ( leastPriority != greatestPriority ) {
526 if ( CkMyNode() == 0 ) {
528 CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
531 greatestPriority = leastPriority;
533 if (params->usePMECUDA) greatestPriority = leastPriority;
534 cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
535 cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
539 cudaStreamCreate(&stream);
542 cudaDeviceProp deviceProp;
543 cudaGetDeviceProperties(&deviceProp, dev);
545 if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
546 if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
561 cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
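// A hedged, self-contained sketch (not NAMD's code; the check() helper is
// hypothetical) of the stream and event setup shown above: query the stream
// priority range, give the remote-work stream the higher priority, and create
// a timing-disabled event for completion polling.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

static void check(cudaError_t err, const char *what) {
  if ( err != cudaSuccess ) {
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
    exit(1);
  }
}

int main() {
  int leastPriority, greatestPriority;
  check(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority),
        "get stream priority range");
  cudaStream_t stream, stream2;
  // Numerically lower values mean higher priority in CUDA.
  check(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority),
        "create stream");
  check(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, leastPriority),
        "create stream2");
  cudaEvent_t start_calc;
  check(cudaEventCreateWithFlags(&start_calc, cudaEventDisableTiming),
        "create start_calc event");
  // ... enqueue copies and kernels on the two streams ...
  cudaEventDestroy(start_calc);
  cudaStreamDestroy(stream2);
  cudaStreamDestroy(stream);
  return 0;
}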
584 } else if ( CkNumNodes() < 2 ) {
617 for ( int i=0; i<npatches; ++i ) {
620 if ( pr.hostPe == CkMyPe() ) {
644 if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
652 if ( ppi != ppj ) return ppi < ppj;
659 int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
660 int numPesOnNodeSharingDevice = 0;
661 int masterIndex = -1;
664 if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
665 if ( CkNodeOf(pe) == CkMyNode() ) {
666 pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
679 for ( int i=0; i<npatches; ++i ) {
682 if ( CkNodeOf(homePe) == CkMyNode() ) {
683 homePatchByRank[CkRankOf(homePe)].add(pid);
686 for ( int i=0; i<CkMyNodeSize(); ++i ) {
688 std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
689 int masterBoost = ( CkMyRank() == i ? 2 : 0 );
690 for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
691 int pid = homePatchByRank[i][j];
692 patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
697 int *count = new int[npatches];
698 memset(count, 0, sizeof(int)*npatches);
699 int *pcount = new int[numPesOnNodeSharingDevice];
700 memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
701 int *rankpcount = new int[CkMyNodeSize()];
702 memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
703 char *table = new char[npatches*numPesOnNodeSharingDevice];
704 memset(table, 0, npatches*numPesOnNodeSharingDevice);
706 int unassignedpatches = npatches;
709 for ( int i=0; i<npatches; ++i ) {
714 unassignedpatches = 0;
715 pcount[masterIndex] = npatches;
719 for ( int i=0; i<npatches; ++i ) {
723 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
724 int pe = pesOnNodeSharingDevice[j];
725 if ( pe == homePe ) {
726 pr.hostPe = pe; --unassignedpatches;
730 table[i*numPesOnNodeSharingDevice+j] = 1;
733 if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
734 pr.hostPe = homePe; --unassignedpatches;
735 rankpcount[CkRankOf(homePe)] += 1;
740 for ( int i=0; i<npatches; ++i ) {
743 if ( pr.hostPe != -1 ) continue;
746 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
747 if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
751 pr.hostPe = pesOnNodeSharingDevice[lastj];
756 while ( unassignedpatches ) {
758 for ( i=0; i<npatches; ++i ) {
759 if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
762 if ( pr.hostPe != -1 ) continue;
763 pr.hostPe = pesOnNodeSharingDevice[assignj];
765 pcount[assignj] += 1;
766 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
769 if ( i<npatches ) continue;
770 for ( i=0; i<npatches; ++i ) {
773 if ( pr.hostPe != -1 ) continue;
774 if ( count[i] ) continue;
775 pr.hostPe = pesOnNodeSharingDevice[assignj];
777 pcount[assignj] += 1;
778 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
781 if ( i<npatches ) continue;
782 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
785 for ( int i=0; i<npatches; ++i ) {
794 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
795 int pe = pesOnNodeSharingDevice[j];
796 int rank = pe - CkNodeFirst(CkMyNode());
799 if ( pe == CkMyPe() ) continue;
800 if ( ! pcount[j] && ! rankpcount[rank] ) continue;
801 rankpcount[rank] = 0;
806 for ( int j=0; j<CkMyNodeSize(); ++j ) {
807 int pe = CkNodeFirst(CkMyNode()) + j;
810 if ( ! rankpcount[j] ) continue;
811 if ( pe == CkMyPe() ) continue;
818 delete [] pesOnNodeSharingDevice;
821 delete [] rankpcount;
871 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
876 #define GBISP(...) CkPrintf(__VA_ARGS__);
881 #define count_limit 1000000
951 sprintf(errmsg, "cuda_check_progress polled %d times over %f s on step %d",
966 if ( err == cudaSuccess ) {
975 } else if ( err != cudaErrorNotReady ) {
977 sprintf(errmsg, "in cuda_check_remote_progress after polling %d times over %f s on step %d",
983 sprintf(errmsg, "cuda_check_remote_progress polled %d times over %f s on step %d",
988 NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
999 if ( err == cudaSuccess ) {
1005 } else if ( err != cudaErrorNotReady ) {
1007 sprintf(errmsg, "in cuda_check_local_progress after polling %d times over %f s on step %d",
1013 sprintf(errmsg, "cuda_check_local_progress polled %d times over %f s on step %d",
1018 NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
1074 if ( rpri != rprj ) return rpri > rprj;
1079 if ( ppi != ppj ) return ppi < ppj;
1118 GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1138 GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1145 GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
1149 GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
1153 GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
1156 GBISP("opened GBIS boxes");
1160 if ( master == this ) return 0;
1173 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
1189 if ( master == this && kernel_launch_state > 2 ) {
1223 1.2f, cudaHostAllocMapped);
1226 force_ready_queue[k] = -1;
1231 1.2f, cudaHostAllocMapped);
1235 1.2f, cudaHostAllocMapped);
1239 1.2f, cudaHostAllocMapped);
1281 for ( int i=0; i<nrc; ++i ) {
1284 for ( int i=0; i<nlc; ++i ) {
1289 patch_pair_num.resize(npatches);
1290 for ( int i=0; i<npatches; ++i ) {
1292 patch_pair_num[i] = 0;
1296 patch_pairs.resize(ncomputes);
1297 for ( int i=0; i<ncomputes; ++i ) {
1301 patch_pair_num[lp1]++;
1302 if (lp1 != lp2) patch_pair_num[lp2]++;
1303 patch_pair &pp = patch_pairs[i];
1309 for ( int i=0; i<ncomputes; ++i ) {
1313 patch_pair &pp = patch_pairs[i];
1314 pp.patch1_ind = lp1;
1315 pp.patch2_ind = lp2;
1316 pp.patch1_num_pairs = patch_pair_num[lp1];
1317 pp.patch2_num_pairs = patch_pair_num[lp2];
1320 if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
1321 CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1328 int len = patch_pairs.size();
1331 if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
1336 for ( int i=0; i<len; ++i ) {
1339 if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
1341 new_patch_pairs[dest] = patch_pairs[boi];
1343 if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
1345 patch_pairs.swap(new_patch_pairs);
1350 for ( i=0; i<npatches; ++i ) {
1357 int nfreeatoms = natoms;
1360 for ( int j=0; j<natoms; ++j ) {
1361 if ( aExt[j].atomFixed ) --nfreeatoms;
1367 istart += 16 - (natoms & 15);
1389 int exclmask_start = 0;
1391 for ( int i=0; i<ncomputes; ++i ) {
1395 patch_pair &pp = patch_pairs[i];
1402 pp.plist_start = bfstart;
1404 int size1 = (pp.patch1_size-1)/WARPSIZE+1;
1405 int size2 = (pp.patch2_size-1)/WARPSIZE+1;
1406 pp.plist_size = (size1*size2-1)/32+1;
1407 bfstart += pp.plist_size;
1408 pp.exclmask_start = exclmask_start;
1409 exclmask_start += size1*size2;
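// The sizing above is ceiling division: each patch is split into WARPSIZE-atom
// tiles, every tile pair gets one pairlist bit (packed 32 per unsigned int)
// and one exclusion-mask slot.  A sketch assuming WARPSIZE == 32 and positive
// patch sizes (helper names are hypothetical):
static inline int ceil_div(int n, int d) { return (n - 1) / d + 1; }

static int plist_words(int patch1_size, int patch2_size, int warpsize = 32) {
  const int tiles1 = ceil_div(patch1_size, warpsize);
  const int tiles2 = ceil_div(patch2_size, warpsize);
  return ceil_div(tiles1 * tiles2, 32);   // one bit per tile pair, 32 bits per word
}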
1421 float maxAtomMovement = 0.;
1422 float maxPatchTolerance = 0.;
1428 if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1431 if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1440 for ( int k=0; k<n; ++k ) {
1442 ap[k].vdw_type = a[j].vdwType;
1444 ap[k].index = aExt[j].id;
1445 #ifdef MEM_OPT_VERSION
1448 #else // ! MEM_OPT_VERSION
1451 #endif // MEM_OPT_VERSION
1458 memcpy(ap, ac, sizeof(atom)*n);
1462 atom *ap = atoms + start;
1463 for ( int k=0; k<n; ++k ) {
1465 ap[k].position.x = a[j].position.x - center.x;
1466 ap[k].position.y = a[j].position.y - center.y;
1467 ap[k].position.z = a[j].position.z - center.z;
1468 ap[k].charge = charge_scaling * a[j].charge;
1507 lata.x = lattice.a().x;
1508 lata.y = lattice.a().y;
1509 lata.z = lattice.a().z;
1510 latb.x = lattice.b().x;
1511 latb.y = lattice.b().y;
1512 latb.z = lattice.b().z;
1513 latc.x = lattice.c().x;
1514 latc.y = lattice.c().y;
1515 latc.z = lattice.c().z;
1518 for ( int ic=0; ic<ncomputes; ++ic ) {
1519 patch_pair &pp = patch_pairs[ic];
1520 atom *a1 = atoms + pp.patch1_atom_start;
1521 int n1 = pp.patch1_size;
1522 atom *a2 = atoms + pp.patch2_atom_start;
1523 int n2 = pp.patch2_size;
1524 float offx = pp.offset.x * lata.x
1525 + pp.offset.y * latb.x
1526 + pp.offset.z * latc.x;
1527 float offy = pp.offset.x * lata.y
1528 + pp.offset.y * latb.y
1529 + pp.offset.z * latc.y;
1530 float offz = pp.offset.x * lata.z
1531 + pp.offset.y * latb.z
1532 + pp.offset.z * latc.z;
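// A small sketch of the conversion above: the pair's image offset is stored in
// lattice coordinates, so its Cartesian displacement is
// offset.x*a + offset.y*b + offset.z*c with a, b, c the unit-cell vectors
// loaded into lata, latb, latc earlier.  (Struct and helper are hypothetical.)
struct Vec3f { float x, y, z; };

static Vec3f image_displacement(Vec3f off, Vec3f a, Vec3f b, Vec3f c) {
  Vec3f d;
  d.x = off.x * a.x + off.y * b.x + off.z * c.x;
  d.y = off.x * a.y + off.y * b.y + off.z * c.y;
  d.z = off.x * a.z + off.y * b.z + off.z * c.z;
  return d;
}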
1534 int atoms_tried = 0;
1535 int blocks_tried = 0;
1537 int blocks_used = 0;
1538 for ( int ii=0; ii<n1; ii+=32 ) {
1539 for ( int jj=0; jj<n2; jj+=16 ) {
1541 for ( int j=jj; j<jj+16 && j<n2; ++j ) {
1543 for ( int i=ii; i<ii+32 && i<n1; ++i ) {
1544 float dx = offx + a1[i].position.x - a2[j].position.x;
1545 float dy = offy + a1[i].position.y - a2[j].position.y;
1546 float dz = offz + a1[i].position.z - a2[j].position.z;
1547 float r2 = dx*dx + dy*dy + dz*dz;
1548 if ( r2 < cutoff2 ) atom_used = 1;
1551 if ( atom_used ) { block_used = 1; ++atoms_used; }
1554 if ( block_used ) { ++blocks_used; }
1557 CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
1558 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
1559 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
1571 GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
1577 for ( int k=0; k<pr.numAtoms; ++k ) {
1579 intRad0[k] = pr.intRad[2*j+0];
1580 intRadS[k] = pr.intRad[2*j+1];
1585 for ( int k=0; k<pr.numAtoms; ++k ) {
1591 for ( int k=0; k<pr.numAtoms; ++k ) {
1600 kernel_launch_state = 1;
1634 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
1635 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
1656 if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
1657 walltime = CkWallTimer();
1661 switch ( kernel_launch_state ) {
1665 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
1677 if ( simParams->GBISOn) {
1687 if ( simParams->GBISOn) {
1725 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
1743 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
1764 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
1798 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1799 cuda_check_local_progress\n", CkMyPe())
1803 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1804 cuda_check_local_calc\n", CkMyPe())
1814 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
1830 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
1849 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
1854 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
1874 int start = pr.localStart;
1876 int nfree = pr.numAtoms;
1885 for ( int k=0; k<nfree; ++k ) {
1893 f_slow[j].x += af_slow[k].x;
1894 f_slow[j].y += af_slow[k].y;
1895 f_slow[j].z += af_slow[k].z;
1911 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
1923 for ( int i=0; i<patches.size(); ++i ) {
1929 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
1940 if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
1941 CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
1952 for ( int k=0; k<pr.numAtoms; ++k ) {
1954 pr.psiSum[j] += psiSumMaster[k];
1956 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
1962 for ( int k=0; k<pr.numAtoms; ++k ) {
1964 pr.dEdaSum[j] += dEdaSumMaster[k];
1966 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
1970 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
1978 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
1990 virial_slow *= (-1./6.);
1998 ( localHostedPatches.size() || master == this ) ) {
1999 GBISP("not finished, call again\n");
2004 GBISP("finished\n");
2020 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
2023 GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
2039 for ( int i=0; i<patch_len; ++i ) {
2043 for ( int i=0; i<order_len; ++i ) {
2048 for ( int i=0; i<patch_len; ++i ) {
2049 iout << "patch_last " << i << " " << plast[i] << "\n";
2074 nexcluded += ((int *)virials)[16*i+12];
2088 Tensor virial_slow_tensor;
2116 float upload_ms, remote_calc_ms;
2117 float local_calc_ms, total_ms;
2119 cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
2129 CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
2130 CkMyPe(), upload_ms, remote_calc_ms,
2131 local_calc_ms, total_ms);
2136 CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",