NAMD
Classes | Public Member Functions | Public Attributes | Static Public Attributes | Friends | List of all members
ComputePmeMgr Class Reference
Inheritance diagram for ComputePmeMgr:
ComputePmeUtil

Classes

struct  cuda_submit_charges_args
 

Public Member Functions

 ComputePmeMgr ()
 
 ~ComputePmeMgr ()
 
void initialize (CkQdMsg *)
 
void initialize_pencils (CkQdMsg *)
 
void activate_pencils (CkQdMsg *)
 
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
 
void initialize_computes ()
 
void sendData (Lattice &, int sequence)
 
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
 
void sendPencils (Lattice &, int sequence)
 
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
 
void recvGrid (PmeGridMsg *)
 
void gridCalc1 (void)
 
void sendTransBarrier (void)
 
void sendTransSubset (int first, int last)
 
void sendTrans (void)
 
void fwdSharedTrans (PmeTransMsg *)
 
void recvSharedTrans (PmeSharedTransMsg *)
 
void sendDataHelper (int)
 
void sendPencilsHelper (int)
 
void recvTrans (PmeTransMsg *)
 
void procTrans (PmeTransMsg *)
 
void gridCalc2 (void)
 
void gridCalc2R (void)
 
void fwdSharedUntrans (PmeUntransMsg *)
 
void recvSharedUntrans (PmeSharedUntransMsg *)
 
void sendUntrans (void)
 
void sendUntransSubset (int first, int last)
 
void recvUntrans (PmeUntransMsg *)
 
void procUntrans (PmeUntransMsg *)
 
void gridCalc3 (void)
 
void sendUngrid (void)
 
void sendUngridSubset (int first, int last)
 
void recvUngrid (PmeGridMsg *)
 
void recvAck (PmeAckMsg *)
 
void copyResults (PmeGridMsg *)
 
void copyPencils (PmeGridMsg *)
 
void ungridCalc (void)
 
void recvRecipEvir (PmeEvirMsg *)
 
void addRecipEvirClient (void)
 
void submitReductions ()
 
void chargeGridSubmitted (Lattice &lattice, int sequence)
 
void cuda_submit_charges (Lattice &lattice, int sequence)
 
void sendChargeGridReady ()
 
void pollChargeGridReady ()
 
void pollForcesReady ()
 
void recvChargeGridReady ()
 
void chargeGridReady (Lattice &lattice, int sequence)
 
- Public Member Functions inherited from ComputePmeUtil
 ComputePmeUtil ()
 
 ~ComputePmeUtil ()
 

Public Attributes

Lattice * sendDataHelper_lattice
 
int sendDataHelper_sequence
 
int sendDataHelper_sourcepe
 
int sendDataHelper_errors
 
CmiNodeLock pmemgr_lock
 
float * a_data_host
 
float * a_data_dev
 
float * f_data_host
 
float * f_data_dev
 
int cuda_atoms_count
 
int cuda_atoms_alloc
 
cudaEvent_t end_charges
 
cudaEvent_t * end_forces
 
int forces_count
 
int forces_done_count
 
double charges_time
 
double forces_time
 
int check_charges_count
 
int check_forces_count
 
int master_pe
 
int this_pe
 
int chargeGridSubmittedCount
 
Lattice * saved_lattice
 
int saved_sequence
 
ResizeArray< ComputePme * > pmeComputes
 

Static Public Attributes

static CmiNodeLock fftw_plan_lock
 
static CmiNodeLock cuda_lock
 
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
 
static bool cuda_busy
 
- Static Public Attributes inherited from ComputePmeUtil
static int numGrids
 
static Bool alchOn
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchDecouple
 
static BigReal alchElecLambdaStart
 
static Bool lesOn
 
static int lesFactor
 
static Bool pairOn
 
static Bool selfOn
 

Friends

class ComputePme
 
class NodePmeMgr
 

Additional Inherited Members

- Static Public Member Functions inherited from ComputePmeUtil
static void select (void)
 

Detailed Description

Definition at line 355 of file ComputePme.C.

Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr ( )

Definition at line 710 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

710  : pmeProxy(thisgroup),
711  pmeProxyDir(thisgroup) {
712 
713  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
714  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
715  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
716 
717  pmeNodeProxy.ckLocalBranch()->initialize();
718 
719  if ( CmiMyRank() == 0 ) {
720  fftw_plan_lock = CmiCreateLock();
721  }
722  pmemgr_lock = CmiCreateLock();
723 
724  myKSpace = 0;
725  kgrid = 0;
726  work = 0;
727  grid_count = 0;
728  trans_count = 0;
729  untrans_count = 0;
730  ungrid_count = 0;
731  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
732  useBarrier = 0;
733  sendTransBarrier_received = 0;
734  usePencils = 0;
735 
736 #ifdef NAMD_CUDA
737  // offload has not been set so this happens on every run
738  if ( CmiMyRank() == 0 ) {
739  cuda_lock = CmiCreateLock();
740  }
741 
742 #if CUDA_VERSION >= 5050
743  int leastPriority, greatestPriority;
744  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
745  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
746  //if ( CkMyNode() == 0 ) {
747  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
748  //}
749 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
750 #else
751 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
752 #endif
753 
754  stream = 0;
755  for ( int i=0; i<NUM_STREAMS; ++i ) {
756 #if 1
757  CUDA_STREAM_CREATE(&streams[i]);
758  cuda_errcheck("cudaStreamCreate");
759 #else
760  streams[i] = 0; // XXXX Testing!!!
761 #endif
762  }
763 
764  this_pe = CkMyPe();
765 
766  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
767  end_forces = 0;
769  check_forces_count = 0;
771 
772  cuda_atoms_count = 0;
773  cuda_atoms_alloc = 0;
774 
775  f_data_mgr_alloc = 0;
776  f_data_mgr_host = 0;
777  f_data_mgr_dev = 0;
778  afn_host = 0;
779  afn_dev = 0;
780 
781 #define CUDA_EVENT_ID_PME_CHARGES 80
782 #define CUDA_EVENT_ID_PME_FORCES 81
783 #define CUDA_EVENT_ID_PME_TICK 82
784 #define CUDA_EVENT_ID_PME_COPY 83
785 #define CUDA_EVENT_ID_PME_KERNEL 84
786  if ( 0 == CkMyPe() ) {
787  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
788  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
789  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
790  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
791  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
792  }
793 #endif
794  recipEvirCount = 0;
795  recipEvirClients = 0;
796  recipEvirPe = -999;
797 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:414
cudaEvent_t end_charges
Definition: ComputePme.C:426
#define CUDA_STREAM_CREATE(X)
#define CUDA_EVENT_ID_PME_COPY
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:415
int check_charges_count
Definition: ComputePme.C:432
int cuda_atoms_alloc
Definition: ComputePme.C:423
#define CUDA_EVENT_ID_PME_FORCES
#define CUDA_EVENT_ID_PME_TICK
int chargeGridSubmittedCount
Definition: ComputePme.C:444
int cuda_atoms_count
Definition: ComputePme.C:422
void cuda_errcheck(const char *msg)
cudaEvent_t * end_forces
Definition: ComputePme.C:427
#define CUDA_EVENT_ID_PME_KERNEL
#define CUDA_EVENT_ID_PME_CHARGES
#define NUM_STREAMS
Definition: ComputePme.C:514
int check_forces_count
Definition: ComputePme.C:433
static CmiNodeLock cuda_lock
Definition: ComputePme.C:424
ComputePmeMgr::~ComputePmeMgr ( )

Definition at line 1794 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

1794  {
1795 
1796  if ( CmiMyRank() == 0 ) {
1797  CmiDestroyLock(fftw_plan_lock);
1798  }
1799  CmiDestroyLock(pmemgr_lock);
1800 
1801  delete myKSpace;
1802  delete [] localInfo;
1803  delete [] gridNodeInfo;
1804  delete [] transNodeInfo;
1805  delete [] gridPeMap;
1806  delete [] transPeMap;
1807  delete [] recipPeDest;
1808  delete [] gridPeOrder;
1809  delete [] gridNodeOrder;
1810  delete [] transNodeOrder;
1811  delete [] qgrid;
1812  if ( kgrid != qgrid ) delete [] kgrid;
1813  delete [] work;
1814  delete [] gridmsg_reuse;
1815 
1816  if ( ! offload ) {
1817  for (int i=0; i<q_count; ++i) {
1818  delete [] q_list[i];
1819  }
1820  delete [] q_list;
1821  delete [] fz_arr;
1822  }
1823  delete [] f_arr;
1824  delete [] q_arr;
1825 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:414
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:415

Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *  msg)

Definition at line 1788 of file ComputePme.C.

1788  {
1789  if ( ! usePencils ) return;
1790  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1791 }
void ComputePmeMgr::addRecipEvirClient ( void  )

Definition at line 3019 of file ComputePme.C.

3019  {
3020  ++recipEvirClients;
3021 }
void ComputePmeMgr::chargeGridReady ( Lattice &  lattice,
int  sequence 
)

Definition at line 3548 of file ComputePme.C.

References PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< T >::size().

Referenced by ComputePme::doWork(), and recvChargeGridReady().

3548  {
3549 
3550 #ifdef NAMD_CUDA
3551  if ( offload ) {
3552  int errcount = 0;
3553  int q_stride = myGrid.K3+myGrid.order-1;
3554  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3555  f_arr[j] = ffz_host[j];
3556  if ( ffz_host[j] & ~1 ) ++errcount;
3557  }
3558  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3559  }
3560 #endif
3561  recipEvirCount = recipEvirClients;
3562  ungridForcesCount = pmeComputes.size();
3563 
3564  for (int j=0; j<myGrid.order-1; ++j) {
3565  fz_arr[j] |= fz_arr[myGrid.K3+j];
3566  }
3567 
3568  if ( usePencils ) {
3569  sendPencils(lattice,sequence);
3570  } else {
3571  sendData(lattice,sequence);
3572  }
3573 }
void sendPencils(Lattice &, int sequence)
Definition: ComputePme.C:3731
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void sendData(Lattice &, int sequence)
Definition: ComputePme.C:3958
int K3
Definition: PmeBase.h:18
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:454
int size(void) const
Definition: ResizeArray.h:127
void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice,
int  sequence 
)

Definition at line 3484 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and SimParameters::useCUDA2.

Referenced by cuda_submit_charges().

3484  {
3485  saved_lattice = &lattice;
3486  saved_sequence = sequence;
3487 
3488  // cudaDeviceSynchronize(); // XXXX TESTING
3489  //int q_stride = myGrid.K3+myGrid.order-1;
3490  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3491  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3492  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3493  // }
3494  //}
3495  //CmiLock(cuda_lock);
3496 
3497  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3498  double before = CmiWallTimer();
3499  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3500  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3501  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3502  cudaMemcpyDeviceToHost, streams[stream]);
3503  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3504  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3505  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3506  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3507  //CmiUnlock(cuda_lock);
3508  // cudaDeviceSynchronize(); // XXXX TESTING
3509  // cuda_errcheck("after memcpy grid to host");
3510 
3512  if ( ! simParams->useCUDA2 ) {
3513  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
3514  cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
3515  }
3516 
3517  pmeProxy[master_pe].pollChargeGridReady();
3518  }
3519 }
static Node * Object()
Definition: Node.h:86
Lattice * saved_lattice
Definition: ComputePme.C:447
cudaEvent_t end_charges
Definition: ComputePme.C:426
static __thread ComputeMgr * computeMgr
SimParameters * simParameters
Definition: Node.h:178
#define CUDA_EVENT_ID_PME_COPY
int getMasterPe()
Definition: DeviceCUDA.h:100
int chargeGridSubmittedCount
Definition: ComputePme.C:444
#define simParams
Definition: Output.C:127
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
int saved_sequence
Definition: ComputePme.C:448
void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg)

Definition at line 3794 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3794  {
3795 
3796  int K1 = myGrid.K1;
3797  int K2 = myGrid.K2;
3798  int dim2 = myGrid.dim2;
3799  int dim3 = myGrid.dim3;
3800  int block1 = myGrid.block1;
3801  int block2 = myGrid.block2;
3802 
3803  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3804  int ib = msg->sourceNode / yBlocks;
3805  int jb = msg->sourceNode % yBlocks;
3806 
3807  int ibegin = ib*block1;
3808  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3809  int jbegin = jb*block2;
3810  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3811 
3812  int zlistlen = msg->zlistlen;
3813  int *zlist = msg->zlist;
3814  float *qmsg = msg->qgrid;
3815  int g;
3816  for ( g=0; g<numGrids; ++g ) {
3817  char *f = f_arr + g*fsize;
3818  float **q = q_arr + g*fsize;
3819  for ( int i=ibegin; i<iend; ++i ) {
3820  for ( int j=jbegin; j<jend; ++j ) {
3821  if( f[i*dim2+j] ) {
3822  f[i*dim2+j] = 0;
3823  for ( int k=0; k<zlistlen; ++k ) {
3824  q[i*dim2+j][zlist[k]] = *(qmsg++);
3825  }
3826  for (int h=0; h<myGrid.order-1; ++h) {
3827  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3828  }
3829  }
3830  }
3831  }
3832  }
3833 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
int block2
Definition: PmeBase.h:21
int sourceNode
Definition: ComputePme.C:115
int order
Definition: PmeBase.h:20
float * qgrid
Definition: ComputePme.C:124
int * zlist
Definition: ComputePme.C:122
int K3
Definition: PmeBase.h:18
int zlistlen
Definition: ComputePme.C:121
void ComputePmeMgr::copyResults ( PmeGridMsg *  msg)

Definition at line 3986 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3986  {
3987 
3988  int zdim = myGrid.dim3;
3989  int flen = msg->len;
3990  int fstart = msg->start;
3991  int zlistlen = msg->zlistlen;
3992  int *zlist = msg->zlist;
3993  float *qmsg = msg->qgrid;
3994  int g;
3995  for ( g=0; g<numGrids; ++g ) {
3996  char *f = msg->fgrid + g*flen;
3997  float **q = q_arr + fstart + g*fsize;
3998  for ( int i=0; i<flen; ++i ) {
3999  if ( f[i] ) {
4000  f[i] = 0;
4001  for ( int k=0; k<zlistlen; ++k ) {
4002  q[i][zlist[k]] = *(qmsg++);
4003  }
4004  for (int h=0; h<myGrid.order-1; ++h) {
4005  q[i][myGrid.K3+h] = q[i][h];
4006  }
4007  }
4008  }
4009  }
4010 }
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
int order
Definition: PmeBase.h:20
float * qgrid
Definition: ComputePme.C:124
int * zlist
Definition: ComputePme.C:122
int K3
Definition: PmeBase.h:18
int zlistlen
Definition: ComputePme.C:121
char * fgrid
Definition: ComputePme.C:123
void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice,
int  sequence 
)

Definition at line 3429 of file ComputePme.C.

References a_data_dev, a_data_host, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

Referenced by ComputePme::doWork().

3429  {
3430 
3431  int n = cuda_atoms_count;
3432  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3433  cuda_atoms_count = 0;
3434 
3435  const double before = CmiWallTimer();
3436  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3437  cudaMemcpyHostToDevice, streams[stream]);
3438  const double after = CmiWallTimer();
3439 
3440  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3441 
3442  cuda_pme_charges(
3443  bspline_coeffs_dev,
3444  q_arr_dev, ffz_dev, ffz_dev + fsize,
3445  a_data_dev, n,
3446  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3447  streams[stream]);
3448  const double after2 = CmiWallTimer();
3449 
3450  chargeGridSubmitted(lattice,sequence); // must be inside lock
3451 
3452  masterPmeMgr->charges_time = before;
3453  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3454  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3455 }
float * a_data_dev
Definition: ComputePme.C:419
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
#define CUDA_EVENT_ID_PME_COPY
int order
Definition: PmeBase.h:20
int K3
Definition: PmeBase.h:18
int cuda_atoms_count
Definition: ComputePme.C:422
#define CUDA_EVENT_ID_PME_KERNEL
double charges_time
Definition: ComputePme.C:430
void chargeGridSubmitted(Lattice &lattice, int sequence)
Definition: ComputePme.C:3484
float * a_data_host
Definition: ComputePme.C:418
void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg)

Definition at line 2014 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by sendTransSubset().

2014  {
2015  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2016  int pe = transNodeInfo[myTransNode].pe_start;
2017  int npe = transNodeInfo[myTransNode].npe;
2018  CmiNodeLock lock = CmiCreateLock();
2019  int *count = new int; *count = npe;
2020  for (int i=0; i<npe; ++i, ++pe) {
2023  shmsg->msg = msg;
2024  shmsg->count = count;
2025  shmsg->lock = lock;
2026  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2027  }
2028 }
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
void recvSharedTrans(PmeSharedTransMsg *)
Definition: ComputePme.C:2030
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg)

Definition at line 2270 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by sendUntransSubset().

2270  {
2271  int pe = gridNodeInfo[myGridNode].pe_start;
2272  int npe = gridNodeInfo[myGridNode].npe;
2273  CmiNodeLock lock = CmiCreateLock();
2274  int *count = new int; *count = npe;
2275  for (int i=0; i<npe; ++i, ++pe) {
2277  shmsg->msg = msg;
2278  shmsg->count = count;
2279  shmsg->lock = lock;
2280  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2281  }
2282 }
CmiNodeLock lock
Definition: ComputePme.C:162
PmeUntransMsg * msg
Definition: ComputePme.C:160
void ComputePmeMgr::gridCalc1 ( void  )

Definition at line 1906 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

1906  {
1907  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1908 
1909 #ifdef NAMD_FFTW
1910  for ( int g=0; g<numGrids; ++g ) {
1911 #ifdef NAMD_FFTW_3
1912  fftwf_execute(forward_plan_yz[g]);
1913 #else
1914  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1915  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1916 #endif
1917 
1918  }
1919 #endif
1920 
1921  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1922 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
void ComputePmeMgr::gridCalc2 ( void  )

Definition at line 2082 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

2082  {
2083  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2084 
2085 #if CMK_BLUEGENEL
2086  CmiNetworkProgressAfter (0);
2087 #endif
2088 
2089  int zdim = myGrid.dim3;
2090  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2091  int ny = localInfo[myTransPe].ny_after_transpose;
2092 
2093  for ( int g=0; g<numGrids; ++g ) {
2094  // finish forward FFT (x dimension)
2095 #ifdef NAMD_FFTW
2096 #ifdef NAMD_FFTW_3
2097  fftwf_execute(forward_plan_x[g]);
2098 #else
2099  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2100  ny * zdim / 2, 1, work, 1, 0);
2101 #endif
2102 #endif
2103  }
2104 
2105 #ifdef OPENATOM_VERSION
2106  if ( ! simParams -> openatomOn ) {
2107 #endif // OPENATOM_VERSION
2108  gridCalc2R();
2109 #ifdef OPENATOM_VERSION
2110  } else {
2111  gridCalc2Moa();
2112  }
2113 #endif // OPENATOM_VERSION
2114 }
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
int ny_after_transpose
Definition: ComputePme.C:233
#define simParams
Definition: Output.C:127
void gridCalc2R(void)
Definition: ComputePme.C:2142
void ComputePmeMgr::gridCalc2R ( void  )

Definition at line 2142 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

2142  {
2143 
2144  int useCkLoop = 0;
2145 #if CMK_SMP && USE_CKLOOP
2146  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2147  && CkNumPes() >= 2 * numTransPes ) {
2148  useCkLoop = 1;
2149  }
2150 #endif
2151 
2152  int zdim = myGrid.dim3;
2153  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2154  int ny = localInfo[myTransPe].ny_after_transpose;
2155 
2156  for ( int g=0; g<numGrids; ++g ) {
2157  // reciprocal space portion of PME
2159  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2160  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2161  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2162 
2163  // start backward FFT (x dimension)
2164 
2165 #ifdef NAMD_FFTW
2166 #ifdef NAMD_FFTW_3
2167  fftwf_execute(backward_plan_x[g]);
2168 #else
2169  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2170  ny * zdim / 2, 1, work, 1, 0);
2171 #endif
2172 #endif
2173  }
2174 
2175  pmeProxyDir[CkMyPe()].sendUntrans();
2176 }
static Node * Object()
Definition: Node.h:86
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
double compute_energy(float q_arr[], const Lattice &lattice, double ewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:321
#define CKLOOP_CTRL_PME_KSPACE
Definition: SimParameters.h:96
int ny_after_transpose
Definition: ComputePme.C:233
double BigReal
Definition: common.h:114
void ComputePmeMgr::gridCalc3 ( void  )

Definition at line 2344 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

2344  {
2345  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2346 
2347  // finish backward FFT
2348 #ifdef NAMD_FFTW
2349  for ( int g=0; g<numGrids; ++g ) {
2350 #ifdef NAMD_FFTW_3
2351  fftwf_execute(backward_plan_yz[g]);
2352 #else
2353  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2354  (fftw_complex *) (qgrid + qgrid_size * g),
2355  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2356 #endif
2357  }
2358 
2359 #endif
2360 
2361  pmeProxyDir[CkMyPe()].sendUngrid();
2362 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
void ComputePmeMgr::initialize ( CkQdMsg *  msg)

Definition at line 862 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), SimParameters::cutoff, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, SimParameters::FFTWEstimate, SimParameters::FFTWPatient, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, SimParameters::lattice, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, SimParameters::patchDimension, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, SimParameters::PMEBarrier, SimParameters::PMEGridSizeX, SimParameters::PMEGridSizeY, SimParameters::PMEGridSizeZ, SimParameters::PMEInterpOrder, SimParameters::PMEMinPoints, SimParameters::PMEMinSlices, PmePencilInitMsgData::pmeNodeProxy, SimParameters::PMEOffload, SimParameters::PMEPencils, SimParameters::PMEPencilsX, SimParameters::PMEPencilsXLayout, SimParameters::PMEPencilsY, SimParameters::PMEPencilsYLayout, SimParameters::PMEPencilsZ, SimParameters::PMEProcessors, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Type >::sort(), sort, WorkDistrib::sortPmePes(), Vector::unit(), x, LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, y, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, z, PmePencilInitMsgData::zBlocks, 
PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

862  {
      // Startup stage driven by a quiescence message (consumed immediately).
      // Allocates per-PE/per-node bookkeeping, chooses between pencil and
      // slab PME decompositions, selects and orders the participating PEs,
      // and (slab mode) allocates the charge/k-space grids and FFTW plans.
863  delete msg;
864 
865  localInfo = new LocalPmeInfo[CkNumPes()];
866  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
867  transNodeInfo = new NodePmeInfo[CkNumNodes()];
868  gridPeMap = new int[CkNumPes()];
869  transPeMap = new int[CkNumPes()];
870  recipPeDest = new int[CkNumPes()];
871  gridPeOrder = new int[CkNumPes()];
872  gridNodeOrder = new int[CkNumNodes()];
873  transNodeOrder = new int[CkNumNodes()];
874 
      // pencilPMEProcessors is shared per process; only rank 0 allocates it.
875  if (CkMyRank() == 0) {
876  pencilPMEProcessors = new char [CkNumPes()];
877  memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
878  }
879 
881  PatchMap *patchMap = PatchMap::Object();
882 
      // CUDA offload sanity checks: one device per process and CC >= 2.0.
883  offload = simParams->PMEOffload;
884 #ifdef NAMD_CUDA
885  if ( offload && ! deviceCUDA->one_device_per_node() ) {
886  NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
887  }
888  if ( offload ) {
889  int dev;
890  cudaGetDevice(&dev);
891  cuda_errcheck("in cudaGetDevice");
892  if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
893  cudaDeviceProp deviceProp;
894  cudaGetDeviceProperties(&deviceProp, dev);
895  cuda_errcheck("in cudaGetDeviceProperties");
896  if ( deviceProp.major < 2 )
897  NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
898  }
899 #endif
900 
901  alchLambda = -1.; // illegal value to catch if not updated
902  alchLambda2 = -1.;
903  useBarrier = simParams->PMEBarrier;
904 
      // Choose decomposition: pencils require a single grid; PMEPencils > 0
      // forces pencils, PMEPencils == 0 forces slabs, otherwise the
      // heuristic below compares available pencil vs. slab parallelism.
905  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
906  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
907  else {
908  int nrps = simParams->PMEProcessors;
909  if ( nrps <= 0 ) nrps = CkNumPes();
910  if ( nrps > CkNumPes() ) nrps = CkNumPes();
911  int dimx = simParams->PMEGridSizeX;
912  int dimy = simParams->PMEGridSizeY;
913  int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
914  if ( maxslabs > nrps ) maxslabs = nrps;
915  int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
916  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
917  if ( maxpencils > nrps ) maxpencils = nrps;
918  if ( maxpencils > 3 * maxslabs ) usePencils = 1;
919  else usePencils = 0;
920  }
921 
      // Pencil mode: pick xBlocks x yBlocks x zBlocks pencil counts, then
      // round each so every pencil holds an integral block of grid lines.
922  if ( usePencils ) {
923  int nrps = simParams->PMEProcessors;
924  if ( nrps <= 0 ) nrps = CkNumPes();
925  if ( nrps > CkNumPes() ) nrps = CkNumPes();
926  if ( simParams->PMEPencils > 1 &&
927  simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
928  xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
929  } else {
930  int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
931  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
932  if ( nb2 > nrps ) nb2 = nrps;
933  if ( nb2 < 1 ) nb2 = 1;
934  int nb = (int) sqrt((float)nb2);
935  if ( nb < 1 ) nb = 1;
936  xBlocks = zBlocks = nb;
937  yBlocks = nb2 / nb;
938  }
939 
940  if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
941  if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
942  if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
943 
944  int dimx = simParams->PMEGridSizeX;
945  int bx = 1 + ( dimx - 1 ) / xBlocks;
946  xBlocks = 1 + ( dimx - 1 ) / bx;
947 
948  int dimy = simParams->PMEGridSizeY;
949  int by = 1 + ( dimy - 1 ) / yBlocks;
950  yBlocks = 1 + ( dimy - 1 ) / by;
951 
952  int dimz = simParams->PMEGridSizeZ / 2 + 1; // complex
953  int bz = 1 + ( dimz - 1 ) / zBlocks;
954  zBlocks = 1 + ( dimz - 1 ) / bz;
955 
      // Each pencil array (z, y, x) must not need more elements than PEs.
956  if ( xBlocks * yBlocks > CkNumPes() ) {
957  NAMD_die("PME pencils xBlocks * yBlocks > numPes");
958  }
959  if ( xBlocks * zBlocks > CkNumPes() ) {
960  NAMD_die("PME pencils xBlocks * zBlocks > numPes");
961  }
962  if ( yBlocks * zBlocks > CkNumPes() ) {
963  NAMD_die("PME pencils yBlocks * zBlocks > numPes");
964  }
965 
966  if ( ! CkMyPe() ) {
967  iout << iINFO << "PME using " << xBlocks << " x " <<
968  yBlocks << " x " << zBlocks <<
969  " pencil grid for FFT and reciprocal sum.\n" << endi;
970  }
971  } else { // usePencils
972 
973  { // decide how many pes to use for reciprocal sum
974 
975  // rules based on work available
976  int minslices = simParams->PMEMinSlices;
977  int dimx = simParams->PMEGridSizeX;
978  int nrpx = ( dimx + minslices - 1 ) / minslices;
979  int dimy = simParams->PMEGridSizeY;
980  int nrpy = ( dimy + minslices - 1 ) / minslices;
981 
982  // rules based on processors available
983  int nrpp = CkNumPes();
984  // if ( nrpp > 32 ) nrpp = 32; // cap to limit messages
985  if ( nrpp < nrpx ) nrpx = nrpp;
986  if ( nrpp < nrpy ) nrpy = nrpp;
987 
988  // user override
989  int nrps = simParams->PMEProcessors;
990  if ( nrps > CkNumPes() ) nrps = CkNumPes();
991  if ( nrps > 0 ) nrpx = nrps;
992  if ( nrps > 0 ) nrpy = nrps;
993 
994  // make sure there aren't any totally empty processors
995  int bx = ( dimx + nrpx - 1 ) / nrpx;
996  nrpx = ( dimx + bx - 1 ) / bx;
997  int by = ( dimy + nrpy - 1 ) / nrpy;
998  nrpy = ( dimy + by - 1 ) / by;
999  if ( bx != ( dimx + nrpx - 1 ) / nrpx )
1000  NAMD_bug("Error in selecting number of PME processors.");
1001  if ( by != ( dimy + nrpy - 1 ) / nrpy )
1002  NAMD_bug("Error in selecting number of PME processors.");
1003 
1004  numGridPes = nrpx;
1005  numTransPes = nrpy;
1006  }
1007  if ( ! CkMyPe() ) {
1008  iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
1009  " processors for FFT and reciprocal sum.\n" << endi;
1010  }
1011 
1012  int sum_npes = numTransPes + numGridPes;
1013  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
1014 
1015 #if 0 // USE_TOPOMAP
1016  /* This code is being disabled permanently for slab PME on Blue Gene machines */
1017  PatchMap * pmap = PatchMap::Object();
1018 
1019  int patch_pes = pmap->numNodesWithPatches();
1020  TopoManager tmgr;
1021  if(tmgr.hasMultipleProcsPerNode())
1022  patch_pes *= 2;
1023 
1024  bool done = false;
1025  if(CkNumPes() > 2*sum_npes + patch_pes) {
1026  done = generateBGLORBPmePeList(transPeMap, numTransPes);
1027  done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
1028  }
1029  else
1030  if(CkNumPes() > 2 *max_npes + patch_pes) {
1031  done = generateBGLORBPmePeList(transPeMap, max_npes);
1032  gridPeMap = transPeMap;
1033  }
1034 
1035  if (!done)
1036 #endif
1037  {
1038  //generatePmePeList(transPeMap, max_npes);
1039  //gridPeMap = transPeMap;
1040  generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
1041  }
1042 
1043  if ( ! CkMyPe() ) {
1044  iout << iINFO << "PME GRID LOCATIONS:";
1045  int i;
1046  for ( i=0; i<numGridPes && i<10; ++i ) {
1047  iout << " " << gridPeMap[i];
1048  }
1049  if ( i < numGridPes ) iout << " ...";
1050  iout << "\n" << endi;
1051  iout << iINFO << "PME TRANS LOCATIONS:";
1052  for ( i=0; i<numTransPes && i<10; ++i ) {
1053  iout << " " << transPeMap[i];
1054  }
1055  if ( i < numTransPes ) iout << " ...";
1056  iout << "\n" << endi;
1057  }
1058 
1059  // sort based on nodes and physical nodes
1060  std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
1061 
      // Locate this PE in the grid/trans maps and build per-node summaries
      // (real_node, pe_start, npe) by scanning runs of PEs on the same node.
1062  myGridPe = -1;
1063  myGridNode = -1;
1064  int i = 0;
1065  int node = -1;
1066  int real_node = -1;
1067  for ( i=0; i<numGridPes; ++i ) {
1068  if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
1069  if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
1070  int real_node_i = CkNodeOf(gridPeMap[i]);
1071  if ( real_node_i == real_node ) {
1072  gridNodeInfo[node].npe += 1;
1073  } else {
1074  real_node = real_node_i;
1075  ++node;
1076  gridNodeInfo[node].real_node = real_node;
1077  gridNodeInfo[node].pe_start = i;
1078  gridNodeInfo[node].npe = 1;
1079  }
1080  if ( CkMyNode() == real_node_i ) myGridNode = node;
1081  }
1082  numGridNodes = node + 1;
1083  myTransPe = -1;
1084  myTransNode = -1;
1085  node = -1;
1086  real_node = -1;
1087  for ( i=0; i<numTransPes; ++i ) {
1088  if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
1089  if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
1090  int real_node_i = CkNodeOf(transPeMap[i]);
1091  if ( real_node_i == real_node ) {
1092  transNodeInfo[node].npe += 1;
1093  } else {
1094  real_node = real_node_i;
1095  ++node;
1096  transNodeInfo[node].real_node = real_node;
1097  transNodeInfo[node].pe_start = i;
1098  transNodeInfo[node].npe = 1;
1099  }
1100  if ( CkMyNode() == real_node_i ) myTransNode = node;
1101  }
1102  numTransNodes = node + 1;
1103 
1104  if ( ! CkMyPe() ) {
1105  iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
1106  << numTransNodes << " TRANS NODES\n" << endi;
1107  }
1108 
1109  { // generate random orderings for grid and trans messages
      // Each ordering is a permutation; when this PE participates it is
      // placed last ("self last") and only the rest is shuffled.
1110  int i;
1111  for ( i = 0; i < numGridPes; ++i ) {
1112  gridPeOrder[i] = i;
1113  }
1114  Random rand(CkMyPe());
1115  if ( myGridPe < 0 ) {
1116  rand.reorder(gridPeOrder,numGridPes);
1117  } else { // self last
1118  gridPeOrder[myGridPe] = numGridPes-1;
1119  gridPeOrder[numGridPes-1] = myGridPe;
1120  rand.reorder(gridPeOrder,numGridPes-1);
1121  }
1122  for ( i = 0; i < numGridNodes; ++i ) {
1123  gridNodeOrder[i] = i;
1124  }
1125  if ( myGridNode < 0 ) {
1126  rand.reorder(gridNodeOrder,numGridNodes);
1127  } else { // self last
1128  gridNodeOrder[myGridNode] = numGridNodes-1;
1129  gridNodeOrder[numGridNodes-1] = myGridNode;
1130  rand.reorder(gridNodeOrder,numGridNodes-1);
1131  }
1132  for ( i = 0; i < numTransNodes; ++i ) {
1133  transNodeOrder[i] = i;
1134  }
1135  if ( myTransNode < 0 ) {
1136  rand.reorder(transNodeOrder,numTransNodes);
1137  } else { // self last
1138  transNodeOrder[myTransNode] = numTransNodes-1;
1139  transNodeOrder[numTransNodes-1] = myTransNode;
1140  rand.reorder(transNodeOrder,numTransNodes-1);
1141  }
1142  }
1143 
1144  } // ! usePencils
1145 
      // Grid geometry: dim3 is padded for the in-place real-to-complex FFT
      // (2*(K3/2+1) reals per z column).
1146  myGrid.K1 = simParams->PMEGridSizeX;
1147  myGrid.K2 = simParams->PMEGridSizeY;
1148  myGrid.K3 = simParams->PMEGridSizeZ;
1149  myGrid.order = simParams->PMEInterpOrder;
1150  myGrid.dim2 = myGrid.K2;
1151  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
1152 
1153  if ( ! usePencils ) {
1154  myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
1155  myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
1156  myGrid.block3 = myGrid.dim3 / 2; // complex
1157  }
1158 
1159  if ( usePencils ) {
1160  myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
1161  myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
1162  myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks; // complex
1163 
1164 
      // Assign PEs to the three pencil arrays, preferring PEs without
      // patches (and keeping PE 0 for the special cases below).
1165  int pe = 0;
1166  int x,y,z;
1167 
1168  SortableResizeArray<int> zprocs(xBlocks*yBlocks);
1169  SortableResizeArray<int> yprocs(xBlocks*zBlocks);
1170  SortableResizeArray<int> xprocs(yBlocks*zBlocks);
1171 
1172  // decide which pes to use by bit reversal and patch use
1173  int i;
1174  int ncpus = CkNumPes();
1175  SortableResizeArray<int> patches, nopatches, pmeprocs;
1176  PatchMap *pmap = PatchMap::Object();
1177  for ( int icpu=0; icpu<ncpus; ++icpu ) {
1178  int ri = WorkDistrib::peDiffuseOrdering[icpu];
1179  if ( ri ) { // keep 0 for special case
1180  // pretend pe 1 has patches to avoid placing extra PME load on node
1181  if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
1182  else nopatches.add(ri);
1183  }
1184  }
1185 
1186 #if USE_RANDOM_TOPO
1187  Random rand(CkMyPe());
1188  int *tmp = new int[patches.size()];
1189  int nn = patches.size();
1190  for (i=0;i<nn;i++) tmp[i] = patches[i];
1191  rand.reorder(tmp, nn);
1192  patches.resize(0);
1193  for (i=0;i<nn;i++) patches.add(tmp[i]);
1194  delete [] tmp;
1195  tmp = new int[nopatches.size()];
1196  nn = nopatches.size();
1197  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
1198  rand.reorder(tmp, nn);
1199  nopatches.resize(0);
1200  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
1201  delete [] tmp;
1202 #endif
1203 
1204  // only use zero if it eliminates overloading or has patches
1205  int useZero = 0;
1206  int npens = xBlocks*yBlocks;
1207  if ( npens % ncpus == 0 ) useZero = 1;
1208  if ( npens == nopatches.size() + 1 ) useZero = 1;
1209  npens += xBlocks*zBlocks;
1210  if ( npens % ncpus == 0 ) useZero = 1;
1211  if ( npens == nopatches.size() + 1 ) useZero = 1;
1212  npens += yBlocks*zBlocks;
1213  if ( npens % ncpus == 0 ) useZero = 1;
1214  if ( npens == nopatches.size() + 1 ) useZero = 1;
1215 
1216  // add nopatches then patches in reversed order
1217  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
1218  if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1219  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
1220  if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1221 
      // Deal PEs round-robin to z, then y, then x pencils; if a pencil set
      // wrapped around to reuse its first PE, swap in PE 0 to spread load.
1222  int npes = pmeprocs.size();
1223  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
1224  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
1225 #if !USE_RANDOM_TOPO
1226  zprocs.sort();
1227 #endif
1228  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
1229  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
1230 #if !USE_RANDOM_TOPO
1231  yprocs.sort();
1232 #endif
1233  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
1234  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
1235 #if !USE_RANDOM_TOPO
1236  xprocs.sort();
1237 #endif
1238 
1239 #if USE_TOPO_SFC
1240  CmiLock(tmgr_lock);
1241  //{
1242  TopoManager tmgr;
1243  int xdim = tmgr.getDimNX();
1244  int ydim = tmgr.getDimNY();
1245  int zdim = tmgr.getDimNZ();
1246  int xdim1 = find_level_grid(xdim);
1247  int ydim1 = find_level_grid(ydim);
1248  int zdim1 = find_level_grid(zdim);
1249  if(CkMyPe() == 0)
1250  printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
1251 
1252  vector<Coord> result;
1253  SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
1254  sort_sfc(xprocs, tmgr, result);
1255  sort_sfc(yprocs, tmgr, result);
1256  sort_sfc(zprocs, tmgr, result);
1257  //}
1258  CmiUnlock(tmgr_lock);
1259 #endif
1260 
1261 
      // Report pencil placements (first 10 per array) and mark every pencil
      // PE in the process-wide pencilPMEProcessors flag array (rank 0 only).
1262  if(CkMyPe() == 0){
1263  iout << iINFO << "PME Z PENCIL LOCATIONS:";
1264  for ( i=0; i<zprocs.size() && i<10; ++i ) {
1265 #if USE_TOPO_SFC
1266  int x,y,z,t;
1267  tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
1268  iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
1269 #else
1270  iout << " " << zprocs[i];
1271 #endif
1272  }
1273  if ( i < zprocs.size() ) iout << " ...";
1274  iout << "\n" << endi;
1275  }
1276 
1277  if (CkMyRank() == 0) {
1278  for (pe=0, x = 0; x < xBlocks; ++x)
1279  for (y = 0; y < yBlocks; ++y, ++pe ) {
1280  pencilPMEProcessors[zprocs[pe]] = 1;
1281  }
1282  }
1283 
1284  if(CkMyPe() == 0){
1285  iout << iINFO << "PME Y PENCIL LOCATIONS:";
1286  for ( i=0; i<yprocs.size() && i<10; ++i ) {
1287 #if USE_TOPO_SFC
1288  int x,y,z,t;
1289  tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
1290  iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
1291 #else
1292  iout << " " << yprocs[i];
1293 #endif
1294  }
1295  if ( i < yprocs.size() ) iout << " ...";
1296  iout << "\n" << endi;
1297  }
1298 
1299  if (CkMyRank() == 0) {
1300  for (pe=0, z = 0; z < zBlocks; ++z )
1301  for (x = 0; x < xBlocks; ++x, ++pe ) {
1302  pencilPMEProcessors[yprocs[pe]] = 1;
1303  }
1304  }
1305 
1306  if(CkMyPe() == 0){
1307  iout << iINFO << "PME X PENCIL LOCATIONS:";
1308  for ( i=0; i<xprocs.size() && i<10; ++i ) {
1309 #if USE_TOPO_SFC
1310  int x,y,z,t;
1311  tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
1312  iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
1313 #else
1314  iout << " " << xprocs[i];
1315 #endif
1316  }
1317  if ( i < xprocs.size() ) iout << " ...";
1318  iout << "\n" << endi;
1319  }
1320 
1321  if (CkMyRank() == 0) {
1322  for (pe=0, y = 0; y < yBlocks; ++y )
1323  for (z = 0; z < zBlocks; ++z, ++pe ) {
1324  pencilPMEProcessors[xprocs[pe]] = 1;
1325  }
1326  }
1327 
1328 
1329  // creating the pencil arrays
      // PE 0 alone creates the three Charm++ chare arrays (with static
      // insertion, no migration) and broadcasts their proxies + init data.
1330  if ( CkMyPe() == 0 ){
1331 #if !USE_RANDOM_TOPO
1332  // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
1333  WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
1334  std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
1335  std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
1336 #endif
1337 #if 1
1338  CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
1339  CProxy_PmePencilMap ym;
1340  if ( simParams->PMEPencilsYLayout )
1341  ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
1342  else
1343  ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
1344  CProxy_PmePencilMap xm;
1345  if ( simParams->PMEPencilsXLayout )
1346  xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
1347  else
1348  xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
1349  pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
1350  CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
1351  CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
1352  CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
1353  zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
1354  yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
1355  xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
1356  zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
1357  yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
1358  xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
1359 #else
1360  zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
1361  yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
1362  xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);
1363 
1364  for (pe=0, x = 0; x < xBlocks; ++x)
1365  for (y = 0; y < yBlocks; ++y, ++pe ) {
1366  zPencil(x,y,0).insert(zprocs[pe]);
1367  }
1368  zPencil.doneInserting();
1369 
1370  for (pe=0, x = 0; x < xBlocks; ++x)
1371  for (z = 0; z < zBlocks; ++z, ++pe ) {
1372  yPencil(x,0,z).insert(yprocs[pe]);
1373  }
1374  yPencil.doneInserting();
1375 
1376 
1377  for (pe=0, y = 0; y < yBlocks; ++y )
1378  for (z = 0; z < zBlocks; ++z, ++pe ) {
1379  xPencil(0,y,z).insert(xprocs[pe]);
1380  }
1381  xPencil.doneInserting();
1382 #endif
1383 
1384  pmeProxy.recvArrays(xPencil,yPencil,zPencil);
1385  PmePencilInitMsgData msgdata;
1386  msgdata.grid = myGrid;
1387  msgdata.xBlocks = xBlocks;
1388  msgdata.yBlocks = yBlocks;
1389  msgdata.zBlocks = zBlocks;
1390  msgdata.xPencil = xPencil;
1391  msgdata.yPencil = yPencil;
1392  msgdata.zPencil = zPencil;
1393  msgdata.pmeProxy = pmeProxyDir;
1394  msgdata.pmeNodeProxy = pmeNodeProxy;
1395  msgdata.xm = xm;
1396  msgdata.ym = ym;
1397  msgdata.zm = zm;
1398  xPencil.init(new PmePencilInitMsg(msgdata));
1399  yPencil.init(new PmePencilInitMsg(msgdata));
1400  zPencil.init(new PmePencilInitMsg(msgdata));
1401  }
1402 
1403  return; // continue in initialize_pencils() at next startup stage
1404  }
1405 
1406 
      // Slab mode from here on: record each grid PE's x-slab and each trans
      // PE's y-slab extents (clamped at the grid edge).
1407  int pe;
1408  int nx = 0;
1409  for ( pe = 0; pe < numGridPes; ++pe ) {
1410  localInfo[pe].x_start = nx;
1411  nx += myGrid.block1;
1412  if ( nx > myGrid.K1 ) nx = myGrid.K1;
1413  localInfo[pe].nx = nx - localInfo[pe].x_start;
1414  }
1415  int ny = 0;
1416  for ( pe = 0; pe < numTransPes; ++pe ) {
1417  localInfo[pe].y_start_after_transpose = ny;
1418  ny += myGrid.block2;
1419  if ( ny > myGrid.K2 ) ny = myGrid.K2;
1420  localInfo[pe].ny_after_transpose =
1421  ny - localInfo[pe].y_start_after_transpose;
1422  }
1423 
1424  { // decide how many pes this node exchanges charges with
1425 
1426  PatchMap *patchMap = PatchMap::Object();
1427  Lattice lattice = simParams->lattice;
1428  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1429  BigReal cutoff = simParams->cutoff;
1430  BigReal patchdim = simParams->patchDimension;
1431  int numPatches = patchMap->numPatches();
1432  int numNodes = CkNumPes();
1433  int *source_flags = new int[numNodes];
1434  int node;
1435  for ( node=0; node<numNodes; ++node ) {
1436  source_flags[node] = 0;
1437  recipPeDest[node] = 0;
1438  }
1439 
1440  // // make sure that we don't get ahead of ourselves on this node
1441  // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
1442  // source_flags[CkMyPe()] = 1;
1443  // recipPeDest[myRecipPe] = 1;
1444  // }
1445 
      // For every patch, compute the wrapped range of x grid lines it can
      // touch (spline order and patch margin included) and flag senders
      // to this grid PE and destinations for this PE's own patches.
1446  for ( int pid=0; pid < numPatches; ++pid ) {
1447  int pnode = patchMap->node(pid);
1448 #ifdef NAMD_CUDA
1449  if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
1450 #endif
1451  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1452  BigReal minx = patchMap->min_a(pid);
1453  BigReal maxx = patchMap->max_a(pid);
1454  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1455  // min1 (max1) is smallest (largest) grid line for this patch
1456  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1457  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1458  for ( int i=min1; i<=max1; ++i ) {
1459  int ix = i;
1460  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1461  while ( ix < 0 ) ix += myGrid.K1;
1462  // set source_flags[pnode] if this patch sends to our node
1463  if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
1464  ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
1465  source_flags[pnode] = 1;
1466  }
1467  // set dest_flags[] for node that our patch sends to
1468 #ifdef NAMD_CUDA
1469  if ( offload ) {
1470  if ( pnode == CkNodeFirst(CkMyNode()) ) {
1471  recipPeDest[ix / myGrid.block1] = 1;
1472  }
1473  } else
1474 #endif
1475  if ( pnode == CkMyPe() ) {
1476  recipPeDest[ix / myGrid.block1] = 1;
1477  }
1478  }
1479  }
1480 
1481  int numSourcesSamePhysicalNode = 0;
1482  numSources = 0;
1483  numDestRecipPes = 0;
1484  for ( node=0; node<numNodes; ++node ) {
1485  if ( source_flags[node] ) ++numSources;
1486  if ( recipPeDest[node] ) ++numDestRecipPes;
1487  if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
1488  }
1489 
1490 #if 0
1491  if ( numSources ) {
1492  CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
1493  CkMyPe(), numSourcesSamePhysicalNode, numSources);
1494  iout << iINFO << "PME " << CkMyPe() << " sources:";
1495  for ( node=0; node<numNodes; ++node ) {
1496  if ( source_flags[node] ) iout << " " << node;
1497  }
1498  iout << "\n" << endi;
1499  }
1500 #endif
1501 
1502  delete [] source_flags;
1503 
1504  // CkPrintf("PME on node %d has %d sources and %d destinations\n",
1505  // CkMyPe(), numSources, numDestRecipPes);
1506 
1507  } // decide how many pes this node exchanges charges with (end)
1508 
1509  ungrid_count = numDestRecipPes;
1510 
1511  sendTransBarrier_received = 0;
1512 
1513  if ( myGridPe < 0 && myTransPe < 0 ) return;
1514  // the following only for nodes doing reciprocal sum
1515 
1516  if ( myTransPe >= 0 ) {
1517  recipEvirPe = findRecipEvirPe();
1518  pmeProxy[recipEvirPe].addRecipEvirClient();
1519  }
1520 
1521  if ( myTransPe >= 0 ) {
1522  int k2_start = localInfo[myTransPe].y_start_after_transpose;
1523  int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
1524  #ifdef OPENATOM_VERSION
1525  if ( simParams->openatomOn ) {
1526  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
1527  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
1528  } else {
1529  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1530  }
1531  #else // OPENATOM_VERSION
1532  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1533  #endif // OPENATOM_VERSION
1534  }
1535 
      // Allocate the charge grid (sized for whichever of the grid or trans
      // layouts is larger); kgrid aliases qgrid when one PE does both jobs.
1536  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
1537  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
1538  if ( local_size < local_size_2 ) local_size = local_size_2;
1539  qgrid = new float[local_size*numGrids];
1540  if ( numGridPes > 1 || numTransPes > 1 ) {
1541  kgrid = new float[local_size*numGrids];
1542  } else {
1543  kgrid = qgrid;
1544  }
1545  qgrid_size = local_size;
1546 
1547  if ( myGridPe >= 0 ) {
1548  qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
1549  qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
1550  fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
1551  fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
1552  }
1553 
      // Create FFTW plans (FFTW3 advanced interface, or the legacy FFTW2
      // API); fftw_plan_lock serializes planner calls within this process.
1554  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
1555 #ifdef NAMD_FFTW
1556  CmiLock(fftw_plan_lock);
1557 #ifdef NAMD_FFTW_3
1558  work = new fftwf_complex[n[0]];
1559  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
1560  if ( myGridPe >= 0 ) {
1561  forward_plan_yz=new fftwf_plan[numGrids];
1562  backward_plan_yz=new fftwf_plan[numGrids];
1563  }
1564  if ( myTransPe >= 0 ) {
1565  forward_plan_x=new fftwf_plan[numGrids];
1566  backward_plan_x=new fftwf_plan[numGrids];
1567  }
1568  /* need one plan per grid */
1569  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1570  if ( myGridPe >= 0 ) {
1571  for( int g=0; g<numGrids; g++)
1572  {
1573  forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
1574  localInfo[myGridPe].nx,
1575  qgrid + qgrid_size * g,
1576  NULL,
1577  1,
1578  myGrid.dim2 * myGrid.dim3,
1579  (fftwf_complex *)
1580  (qgrid + qgrid_size * g),
1581  NULL,
1582  1,
1583  myGrid.dim2 * (myGrid.dim3/2),
1584  fftwFlags);
1585  }
1586  }
1587  int zdim = myGrid.dim3;
1588  int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
1589  if ( ! CkMyPe() ) iout << " 2..." << endi;
1590  if ( myTransPe >= 0 ) {
1591  for( int g=0; g<numGrids; g++)
1592  {
1593 
1594  forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1595  (fftwf_complex *)
1596  (kgrid+qgrid_size*g),
1597  NULL,
1598  xStride,
1599  1,
1600  (fftwf_complex *)
1601  (kgrid+qgrid_size*g),
1602  NULL,
1603  xStride,
1604  1,
1605  FFTW_FORWARD,fftwFlags);
1606 
1607  }
1608  }
1609  if ( ! CkMyPe() ) iout << " 3..." << endi;
1610  if ( myTransPe >= 0 ) {
1611  for( int g=0; g<numGrids; g++)
1612  {
1613  backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1614  (fftwf_complex *)
1615  (kgrid+qgrid_size*g),
1616  NULL,
1617  xStride,
1618  1,
1619  (fftwf_complex *)
1620  (kgrid+qgrid_size*g),
1621  NULL,
1622  xStride,
1623  1,
1624  FFTW_BACKWARD, fftwFlags);
1625 
1626  }
1627  }
1628  if ( ! CkMyPe() ) iout << " 4..." << endi;
1629  if ( myGridPe >= 0 ) {
1630  for( int g=0; g<numGrids; g++)
1631  {
1632  backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
1633  localInfo[myGridPe].nx,
1634  (fftwf_complex *)
1635  (qgrid + qgrid_size * g),
1636  NULL,
1637  1,
1638  myGrid.dim2*(myGrid.dim3/2),
1639  qgrid + qgrid_size * g,
1640  NULL,
1641  1,
1642  myGrid.dim2 * myGrid.dim3,
1643  fftwFlags);
1644  }
1645  }
1646  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1647 
1648 #else
1649  work = new fftw_complex[n[0]];
1650 
1651  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1652  if ( myGridPe >= 0 ) {
1653  forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
1654  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1655  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1656  }
1657  if ( ! CkMyPe() ) iout << " 2..." << endi;
1658  if ( myTransPe >= 0 ) {
1659  forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
1660  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1661  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1662  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1663  }
1664  if ( ! CkMyPe() ) iout << " 3..." << endi;
1665  if ( myTransPe >= 0 ) {
1666  backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
1667  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1668  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1669  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1670  }
1671  if ( ! CkMyPe() ) iout << " 4..." << endi;
1672  if ( myGridPe >= 0 ) {
1673  backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
1674  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1675  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1676  }
1677  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1678 #endif
1679  CmiUnlock(fftw_plan_lock);
1680 #else
1681  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
1682 #endif
1683 
      // Counters consumed as contributions arrive; the charge grid starts
      // zeroed for accumulation.
1684  if ( myGridPe >= 0 && numSources == 0 )
1685  NAMD_bug("PME grid elements exist without sources.");
1686  grid_count = numSources;
1687  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
1688  trans_count = numGridPes;
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:19
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:414
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:107
static void sortPmePes(int *pmepes, int xdim, int ydim)
Definition: WorkDistrib.C:300
int numNodesWithPatches(void)
Definition: PatchMap.h:61
Vector a_r() const
Definition: Lattice.h:268
int dim3
Definition: PmeBase.h:19
CProxy_ComputePmeMgr pmeProxy
Definition: ComputePme.C:217
static PatchMap * Object()
Definition: PatchMap.h:27
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
BigReal min_a(int pid) const
Definition: PatchMap.h:91
int K2
Definition: PmeBase.h:18
CProxy_PmeZPencil zPencil
Definition: ComputePme.C:216
SimParameters * simParameters
Definition: Node.h:178
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
CProxy_PmeYPencil yPencil
Definition: ComputePme.C:215
CProxy_PmePencilMap zm
Definition: ComputePme.C:221
CProxy_PmePencilMap xm
Definition: ComputePme.C:219
if(ComputeNonbondedUtil::goMethod==2)
CProxy_NodePmeMgr pmeNodeProxy
Definition: ComputePme.C:218
#define iout
Definition: InfoStream.h:87
int block2
Definition: PmeBase.h:21
Definition: Random.h:37
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int block3
Definition: PmeBase.h:21
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes)
Definition: ComputePme.C:292
gridSize z
void NAMD_die(const char *err_msg)
Definition: common.C:83
static int findRecipEvirPe()
Definition: ComputePme.C:241
static int * peDiffuseOrdering
Definition: WorkDistrib.h:115
int ny_after_transpose
Definition: ComputePme.C:233
int getDeviceID()
Definition: DeviceCUDA.h:107
int add(const Elem &elem)
Definition: ResizeArray.h:97
BigReal max_a(int pid) const
Definition: PatchMap.h:92
BlockRadixSort::TempStorage sort
long long int64
Definition: common.h:34
#define simParams
Definition: Output.C:127
int K3
Definition: PmeBase.h:18
int numPatches(void) const
Definition: PatchMap.h:59
void resize(int i)
Definition: ResizeArray.h:84
int node(int pid) const
Definition: PatchMap.h:114
void cuda_errcheck(const char *msg)
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
BigReal patchDimension
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
gridSize y
CProxy_PmePencilMap ym
Definition: ComputePme.C:220
int size(void) const
Definition: ResizeArray.h:127
infostream & endi(infostream &s)
Definition: InfoStream.C:38
gridSize x
bool one_device_per_node()
Definition: DeviceCUDA.C:398
char * pencilPMEProcessors
Definition: ComputePme.C:107
Vector a() const
Definition: Lattice.h:252
Vector unit(void) const
Definition: Vector.h:182
double BigReal
Definition: common.h:114
CProxy_PmeXPencil xPencil
Definition: ComputePme.C:214
int y_start_after_transpose
Definition: ComputePme.C:233
void ComputePmeMgr::initialize_computes ( )

Definition at line 2720 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, master_pe, NAMD_bug(), ComputePmeUtil::numGrids, PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, ReductionMgr::willSubmit(), and XCOPY.

// ComputePmeMgr::initialize_computes — per-PE setup of PME charge-grid storage.
// Resets work counters, elects a per-node master PE for CUDA offload, and
// allocates/initializes the q/f grid arrays (host-only path) or the merged
// pinned-host + device buffers (offload path, first PE on node only).
// NOTE(review): quoted listing; original source lines 2726-2728 (reduction
// setup via ReductionMgr::willSubmit, per the reference list above) and the
// condition half of line 2740 were hyperlinks elided by the page extractor.
2720  {
2721 
// Reset bookkeeping counters for this timestep cycle.
2722  noWorkCount = 0;
2723  doWorkCount = 0;
2724  ungridForcesCount = 0;
2725 
2727 
2729 
2730  strayChargeErrors = 0;
2731 
2732 #ifdef NAMD_CUDA
// Elect master_pe: prefer a PE on this node that owns no patches, while
// avoiding deviceCUDA's master PE. Line 2740 (start of the compound
// condition continued on 2741) is missing from this quotation.
2733  PatchMap *patchMap = PatchMap::Object();
2734  int pe = master_pe = CkNodeFirst(CkMyNode());
2735  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2736  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2737  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2738  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2739  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2741  && pe != deviceCUDA->getMasterPe() ) {
2742  master_pe = pe;
2743  }
2744  }
2745  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2746  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2747  }
2748 
2749  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
// cudaFirst is true only for the first PE on the node to get here
// (guarded by cuda_lock; the matching CmiUnlock is at line 2928 below).
2750  bool cudaFirst = 1;
2751  if ( offload ) {
2752  CmiLock(cuda_lock);
2753  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2754  }
2755 
2756  if ( cudaFirst ) {
2757  nodePmeMgr->master_pe = master_pe;
2758  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2759  }
2760 #endif
2761 
// Grid sizes: qsize = full charge grid, fsize = one x-y slab layer.
2762  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2763  fsize = myGrid.K1 * myGrid.dim2;
2764  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2765 #ifdef NAMD_CUDA
2766  if ( ! offload )
2767 #endif
// Host-only path: per-column pointer arrays, zero-initialized.
2768  {
2769  q_arr = new float*[fsize*numGrids];
2770  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2771  q_list = new float*[fsize*numGrids];
2772  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2773  q_count = 0;
2774  }
2775 
2776 #ifdef NAMD_CUDA
2777  if ( cudaFirst || ! offload ) {
2778 #endif
2779  f_arr = new char[fsize*numGrids];
2780  // memset to non-zero value has race condition on BlueGene/Q
2781  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
// Initialize all flags to 2 ("inactive"); selected cells are cleared to 0 below.
2782  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2783 
2784  for ( int g=0; g<numGrids; ++g ) {
2785  char *f = f_arr + g*fsize;
2786  if ( usePencils ) {
// Pencil decomposition: clear flags for every (i,j) cell covered by an
// active pencil block.
2787  int K1 = myGrid.K1;
2788  int K2 = myGrid.K2;
2789  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2790  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2791  int dim2 = myGrid.dim2;
2792  for (int ap=0; ap<numPencilsActive; ++ap) {
2793  int ib = activePencils[ap].i;
2794  int jb = activePencils[ap].j;
2795  int ibegin = ib*block1;
2796  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2797  int jbegin = jb*block2;
2798  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2799  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2800  for ( int i=ibegin; i<iend; ++i ) {
2801  for ( int j=jbegin; j<jend; ++j ) {
2802  f[i*dim2+j] = 0;
2803  }
2804  }
2805  }
2806  } else {
// Slab decomposition: clear the flag range destined for each grid PE.
2807  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2808  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2809  for (int pe=0; pe<numGridPes; pe++) {
2810  if ( ! recipPeDest[pe] ) continue;
2811  int start = pe * bsize;
2812  int len = bsize;
2813  if ( start >= qsize ) { start = 0; len = 0; }
2814  if ( start + len > qsize ) { len = qsize - start; }
2815  int zdim = myGrid.dim3;
2816  int fstart = start / zdim;
2817  int flen = len / zdim;
2818  memset(f + fstart, 0, flen*sizeof(char));
2819  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2820  }
2821  }
2822  }
2823 #ifdef NAMD_CUDA
2824  }
2825  if ( offload ) {
2826  cudaSetDevice(deviceCUDA->getDeviceID());
2827  if ( cudaFirst ) {
2828 
// Count active (flag==0) columns so device storage can be sized to fit.
2829  int f_alloc_count = 0;
2830  for ( int n=fsize, i=0; i<n; ++i ) {
2831  if ( f_arr[i] == 0 ) {
2832  ++f_alloc_count;
2833  }
2834  }
2835  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2836 
2837  q_arr = new float*[fsize*numGrids];
2838  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2839 
// Host-side staging for the device column-pointer tables.
2840  float **q_arr_dev_host = new float*[fsize];
2841  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2842 
2843  float **v_arr_dev_host = new float*[fsize];
2844  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2845 
// Each column is padded to K3+order-1 floats (wraparound for B-spline order).
2846  int q_stride = myGrid.K3+myGrid.order-1;
2847  q_data_size = f_alloc_count * q_stride * sizeof(float);
2848  ffz_size = (fsize + q_stride) * sizeof(int);
2849 
2850  // tack ffz onto end of q_data to allow merged transfer
2851  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2852  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2853  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2854  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2855  cudaMalloc((void**) &v_data_dev, q_data_size);
2856  cuda_errcheck("malloc grid data for pme");
2857  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2858  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2859  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2860  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2861  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2862 
// Second pass: hand out consecutive column slots to active columns;
// inactive columns get null pointers.
2863  f_alloc_count = 0;
2864  for ( int n=fsize, i=0; i<n; ++i ) {
2865  if ( f_arr[i] == 0 ) {
2866  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2867  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2868  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2869  ++f_alloc_count;
2870  } else {
2871  q_arr[i] = 0;
2872  q_arr_dev_host[i] = 0;
2873  v_arr_dev_host[i] = 0;
2874  }
2875  }
2876 
2877  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2878  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2879  delete [] q_arr_dev_host;
2880  delete [] v_arr_dev_host;
// Offload path replaces the multi-grid f_arr with a single slab plus
// the z-flag tail (fz_arr aliases the end of f_arr, not a separate alloc).
2881  delete [] f_arr;
2882  f_arr = new char[fsize + q_stride];
2883  fz_arr = f_arr + fsize;
2884  memset(f_arr, 0, fsize + q_stride);
2885  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2886 
2887  cuda_errcheck("initialize grid data for pme");
2888 
2889  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2890  cuda_errcheck("initialize bspline coefficients for pme");
2891 
// First PE publishes shared device state to the node master ...
2892 #define XCOPY(X) masterPmeMgr->X = X;
2893  XCOPY(bspline_coeffs_dev)
2894  XCOPY(bspline_dcoeffs_dev)
2895  XCOPY(q_arr)
2896  XCOPY(q_arr_dev)
2897  XCOPY(v_arr_dev)
2898  XCOPY(q_data_size)
2899  XCOPY(q_data_host)
2900  XCOPY(q_data_dev)
2901  XCOPY(v_data_dev)
2902  XCOPY(ffz_size)
2903  XCOPY(ffz_host)
2904  XCOPY(ffz_dev)
2905  XCOPY(f_arr)
2906  XCOPY(fz_arr)
2907 #undef XCOPY
2908  //CkPrintf("pe %d init first\n", CkMyPe());
2909  } else { // cudaFirst
2910  //CkPrintf("pe %d init later\n", CkMyPe());
// ... and later PEs copy that shared state from the master instead.
2911 #define XCOPY(X) X = masterPmeMgr->X;
2912  XCOPY(bspline_coeffs_dev)
2913  XCOPY(bspline_dcoeffs_dev)
2914  XCOPY(q_arr)
2915  XCOPY(q_arr_dev)
2916  XCOPY(v_arr_dev)
2917  XCOPY(q_data_size)
2918  XCOPY(q_data_host)
2919  XCOPY(q_data_dev)
2920  XCOPY(v_data_dev)
2921  XCOPY(ffz_size)
2922  XCOPY(ffz_host)
2923  XCOPY(ffz_dev)
2924  XCOPY(f_arr)
2925  XCOPY(fz_arr)
2926 #undef XCOPY
2927  } // cudaFirst
2928  CmiUnlock(cuda_lock);
2929  } else // offload
2930 #endif // NAMD_CUDA
2931  {
2932  fz_arr = new char[myGrid.K3+myGrid.order-1];
2933  }
2934 
2935 #if 0 && USE_PERSISTENT
2936  recvGrid_handle = NULL;
2937 #endif
2938 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:18
SimParameters * simParameters
Definition: Node.h:178
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:365
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:278
int order
Definition: PmeBase.h:20
int getMasterPe()
Definition: DeviceCUDA.h:100
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int chargeGridSubmittedCount
Definition: ComputePme.C:444
#define XCOPY(X)
int getDeviceID()
Definition: DeviceCUDA.h:107
#define simParams
Definition: Output.C:127
int K3
Definition: PmeBase.h:18
void cuda_errcheck(const char *msg)
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
int i
Definition: ComputePme.C:343
void cuda_init_bspline_coeffs(float **c, float **dc, int order)
int j
Definition: ComputePme.C:343
static CmiNodeLock cuda_lock
Definition: ComputePme.C:424
void ComputePmeMgr::initialize_pencils ( CkQdMsg *  msg)

Definition at line 1693 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, SimParameters::cutoff, deviceCUDA, DeviceCUDA::getMasterPe(), PmeGrid::K1, PmeGrid::K2, SimParameters::lattice, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, SimParameters::patchDimension, SimParameters::PMESendOrder, Random::reorder(), Node::simParameters, simParams, sort, and Vector::unit().

// ComputePmeMgr::initialize_pencils — determine which (x,y) pencils this PE
// (or node, when offloading) sends charges to, based on the grid footprint
// of its patches plus the cutoff/patch-dimension margin, then register with
// those z-pencils and order them for sending.
// NOTE(review): quoted listing; original lines 1696-1698 (simParams setup,
// a hyperlink in the page) were elided by the extractor, so simParams below
// is assumed to be bound there — confirm against ComputePme.C.
1693  {
1694  delete msg;
1695  if ( ! usePencils ) return;
1696 
1698 
1699  PatchMap *patchMap = PatchMap::Object();
// sysdima/sysdimb: system extents along the a/b reciprocal-unit directions,
// used to convert the patch margin to fractional coordinates.
1700  Lattice lattice = simParams->lattice;
1701  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1702  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1703  BigReal cutoff = simParams->cutoff;
1704  BigReal patchdim = simParams->patchDimension;
1705  int numPatches = patchMap->numPatches();
1706 
1707  pencilActive = new char[xBlocks*yBlocks];
1708  for ( int i=0; i<xBlocks; ++i ) {
1709  for ( int j=0; j<yBlocks; ++j ) {
1710  pencilActive[i*yBlocks+j] = 0;
1711  }
1712  }
1713 
1714  for ( int pid=0; pid < numPatches; ++pid ) {
1715  int pnode = patchMap->node(pid);
1716 #ifdef NAMD_CUDA
// Offload: one PE handles all patches on the node; otherwise only own patches.
1717  if ( offload ) {
1718  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1719  } else
1720 #endif
1721  if ( pnode != CkMyPe() ) continue;
1722 
1723  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1724  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1725 
// min1/max1 and min2/max2 bracket the grid lines this patch's charges can
// touch, widened by the (patchdim - cutoff)/2 migration margin and the
// B-spline order footprint.
1726  BigReal minx = patchMap->min_a(pid);
1727  BigReal maxx = patchMap->max_a(pid);
1728  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1729  // min1 (max1) is smallest (largest) grid line for this patch
1730  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1731  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1732 
1733  BigReal miny = patchMap->min_b(pid);
1734  BigReal maxy = patchMap->max_b(pid);
1735  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1736  // min2 (max2) is smallest (largest) grid line for this patch
1737  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1738  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1739 
// Mark every pencil block whose (wrapped) grid cell range intersects the patch.
1740  for ( int i=min1; i<=max1; ++i ) {
1741  int ix = i;
1742  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1743  while ( ix < 0 ) ix += myGrid.K1;
1744  for ( int j=min2; j<=max2; ++j ) {
1745  int jy = j;
1746  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1747  while ( jy < 0 ) jy += myGrid.K2;
1748  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1749  }
1750  }
1751  }
1752 
// Count active pencils and announce ourselves to each active z-pencil
// (only the CUDA master PE does the announcement when offloading).
1753  numPencilsActive = 0;
1754  for ( int i=0; i<xBlocks; ++i ) {
1755  for ( int j=0; j<yBlocks; ++j ) {
1756  if ( pencilActive[i*yBlocks+j] ) {
1757  ++numPencilsActive;
1758 #ifdef NAMD_CUDA
1759  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1760 #endif
1761  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1762  }
1763  }
1764  }
1765  activePencils = new ijpair[numPencilsActive];
1766  numPencilsActive = 0;
1767  for ( int i=0; i<xBlocks; ++i ) {
1768  for ( int j=0; j<yBlocks; ++j ) {
1769  if ( pencilActive[i*yBlocks+j] ) {
1770  activePencils[numPencilsActive++] = ijpair(i,j);
1771  }
1772  }
1773  }
// Send order: deterministic bit-reversed sort, or per-PE random shuffle.
1774  if ( simParams->PMESendOrder ) {
1775  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1776  } else {
1777  Random rand(CkMyPe());
1778  rand.reorder(activePencils,numPencilsActive);
1779  }
1780  //if ( numPencilsActive ) {
1781  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1782  //}
1783 
1784  ungrid_count = numPencilsActive;
1785 }
static Node * Object()
Definition: Node.h:86
Vector a_r() const
Definition: Lattice.h:268
static PatchMap * Object()
Definition: PatchMap.h:27
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t cudaTextureObject_t const int const float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float4 *__restrict__ float4 *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
BigReal min_a(int pid) const
Definition: PatchMap.h:91
int K2
Definition: PmeBase.h:18
SimParameters * simParameters
Definition: Node.h:178
int K1
Definition: PmeBase.h:18
int block1
Definition: PmeBase.h:21
int block2
Definition: PmeBase.h:21
Vector b_r() const
Definition: Lattice.h:269
BigReal min_b(int pid) const
Definition: PatchMap.h:93
Definition: Random.h:37
int order
Definition: PmeBase.h:20
int getMasterPe()
Definition: DeviceCUDA.h:100
BigReal max_b(int pid) const
Definition: PatchMap.h:94
BigReal max_a(int pid) const
Definition: PatchMap.h:92
BlockRadixSort::TempStorage sort
#define simParams
Definition: Output.C:127
int numPatches(void) const
Definition: PatchMap.h:59
int node(int pid) const
Definition: PatchMap.h:114
Vector b() const
Definition: Lattice.h:253
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
BigReal patchDimension
Vector a() const
Definition: Lattice.h:252
Vector unit(void) const
Definition: Vector.h:182
double BigReal
Definition: common.h:114
void ComputePmeMgr::pollChargeGridReady ( )

Definition at line 3535 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

// ComputePmeMgr::pollChargeGridReady — CUDA build: reset Charm++ condition
// callbacks, then (per the reference list) arm a CUDA_POLL on
// cuda_check_pme_charges; non-CUDA build: fatal error.
// NOTE(review): original line 3538 (the CUDA_POLL call, a hyperlink in the
// page) was elided by the extractor.
3535  {
3536 #ifdef NAMD_CUDA
3537  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
3539 #else
3540  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3541 #endif
3542 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2470
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void cuda_check_pme_charges(void *arg, double walltime)
Definition: ComputePme.C:3457
void ComputePmeMgr::pollForcesReady ( )

Definition at line 2656 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

// ComputePmeMgr::pollForcesReady — CUDA build: reset Charm++ condition
// callbacks, then (per the reference list) arm a CUDA_POLL on
// cuda_check_pme_forces; non-CUDA build: fatal error.
// NOTE(review): original line 2659 (the CUDA_POLL call, a hyperlink in the
// page) was elided by the extractor.
2656  {
2657 #ifdef NAMD_CUDA
2658  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
2660 #else
2661  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2662 #endif
2663 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2470
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void cuda_check_pme_forces(void *arg, double walltime)
Definition: ComputePme.C:2477
void ComputePmeMgr::procTrans ( PmeTransMsg msg)

Definition at line 2048 of file ComputePme.C.

References PmeGrid::dim3, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

// ComputePmeMgr::procTrans — absorb one transposed-grid message into kgrid.
// On the first message of the set (trans_count still == numGridPes) it
// captures the lattice and sequence number; when the last message arrives
// (trans_count reaches 0) it triggers gridCalc2 on this PE.
2048  {
2049  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2050  if ( trans_count == numGridPes ) {
2051  lattice = msg->lattice;
2052  grid_sequence = msg->sequence;
2053  }
2054 
// msg->nx == 0 means the sender had no data for us; still counts toward
// completion below.
2055  if ( msg->nx ) {
2056  int zdim = myGrid.dim3;
2057  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2058  int first_pe = nodeInfo.pe_start;
2059  int last_pe = first_pe+nodeInfo.npe-1;
// y_skip: offset of this PE's y-range within the node-wide message payload;
// ny_msg: total y-extent carried by the message for the whole node.
2060  int y_skip = localInfo[myTransPe].y_start_after_transpose
2061  - localInfo[first_pe].y_start_after_transpose;
2062  int ny_msg = localInfo[last_pe].y_start_after_transpose
2063  + localInfo[last_pe].ny_after_transpose
2064  - localInfo[first_pe].y_start_after_transpose;
2065  int ny = localInfo[myTransPe].ny_after_transpose;
2066  int x_start = msg->x_start;
2067  int nx = msg->nx;
2068  for ( int g=0; g<numGrids; ++g ) {
2069  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2070  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2071  nx*ny*zdim*sizeof(float));
2072  }
2073  }
2074 
2075  --trans_count;
2076 
2077  if ( trans_count == 0 ) {
2078  pmeProxyDir[CkMyPe()].gridCalc2();
2079  }
2080 }
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:137
int ny_after_transpose
Definition: ComputePme.C:233
Lattice lattice
Definition: ComputePme.C:134
int y_start_after_transpose
Definition: ComputePme.C:233
void ComputePmeMgr::procUntrans ( PmeUntransMsg msg)

Definition at line 2302 of file ComputePme.C.

References PmeGrid::dim3, PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, x, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

// ComputePmeMgr::procUntrans — absorb one untransposed-grid message back into
// qgrid, copying ny z-columns per x row (strided by K2*dim3 per row). When
// the last expected message arrives (untrans_count reaches 0) it triggers
// gridCalc3 on this PE.
2302  {
2303  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2304 
2305 #if CMK_BLUEGENEL
2306  CmiNetworkProgressAfter (0);
2307 #endif
2308 
2309  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2310  int first_pe = nodeInfo.pe_start;
2311  int g;
2312 
// msg->ny == 0 means no payload for us; still counts toward completion below.
2313  if ( msg->ny ) {
2314  int zdim = myGrid.dim3;
2315  int last_pe = first_pe+nodeInfo.npe-1;
// x_skip: offset of this PE's x-range within the node-wide message payload;
// nx_msg: total x-extent carried by the message for the whole node.
2316  int x_skip = localInfo[myGridPe].x_start
2317  - localInfo[first_pe].x_start;
2318  int nx_msg = localInfo[last_pe].x_start
2319  + localInfo[last_pe].nx
2320  - localInfo[first_pe].x_start;
2321  int nx = localInfo[myGridPe].nx;
2322  int y_start = msg->y_start;
2323  int ny = msg->ny;
// slicelen: stride between consecutive x rows in qgrid; cpylen: contiguous
// bytes (ny columns of zdim floats) copied per row.
2324  int slicelen = myGrid.K2 * zdim;
2325  int cpylen = ny * zdim;
2326  for ( g=0; g<numGrids; ++g ) {
2327  float *q = qgrid + qgrid_size * g + y_start * zdim;
2328  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2329  for ( int x = 0; x < nx; ++x ) {
2330  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2331  q += slicelen;
2332  qmsg += cpylen;
2333  }
2334  }
2335  }
2336 
2337  --untrans_count;
2338 
2339  if ( untrans_count == 0 ) {
2340  pmeProxyDir[CkMyPe()].gridCalc3();
2341  }
2342 }
float * qgrid
Definition: ComputePme.C:154
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
gridSize x
void ComputePmeMgr::recvAck ( PmeAckMsg msg)

Definition at line 2444 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

// ComputePmeMgr::recvAck — count down outstanding ungrid acknowledgments and
// fire ungridCalc when all have arrived. Called with msg==0 from recvUngrid.
// In the CUDA offload path ungrid_count is shared node-wide, so the decrement
// is protected by cuda_lock and ungridCalc is sent to the node's master_pe.
2444  {
2445  if ( msg ) delete msg;
2446 #ifdef NAMD_CUDA
2447  if ( offload ) {
2448  CmiLock(cuda_lock);
2449  if ( ungrid_count == 0 ) {
// NOTE(review): the message names recvUngrid although this check lives in
// recvAck — presumably kept for historical reasons; confirm before changing.
2450  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2451  }
// Capture the decremented value inside the lock so the uc==0 test below is
// race-free once the lock is released.
2452  int uc = --ungrid_count;
2453  CmiUnlock(cuda_lock);
2454 
2455  if ( uc == 0 ) {
2456  pmeProxyDir[master_pe].ungridCalc();
2457  }
2458  return;
2459  }
2460 #endif
2461  --ungrid_count;
2462 
2463  if ( ungrid_count == 0 ) {
2464  pmeProxyDir[CkMyPe()].ungridCalc();
2465  }
2466 }
void NAMD_bug(const char *err_msg)
Definition: common.C:123
static CmiNodeLock cuda_lock
Definition: ComputePme.C:424
void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  x,
CProxy_PmeYPencil  y,
CProxy_PmeZPencil  z 
)

Definition at line 800 of file ComputePme.C.

References x, y, and z.

// ComputePmeMgr::recvArrays — store the three pencil array proxies; rank 0
// of the node also mirrors them into the node-group NodePmeMgr branch.
801  {
802  xPencil = x; yPencil = y; zPencil = z;
803 
804  if(CmiMyRank()==0)
805  {
806  pmeNodeProxy.ckLocalBranch()->xPencil=x;
807  pmeNodeProxy.ckLocalBranch()->yPencil=y;
808  pmeNodeProxy.ckLocalBranch()->zPencil=z;
809  }
810 }
gridSize z
gridSize y
gridSize x
void ComputePmeMgr::recvChargeGridReady ( )

Definition at line 3544 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

// ComputePmeMgr::recvChargeGridReady — per the reference list, forwards to
// chargeGridReady(saved_lattice, saved_sequence).
// NOTE(review): original line 3545 (the chargeGridReady call, a hyperlink in
// the page) was elided by the extractor.
3544  {
3546 }
Lattice * saved_lattice
Definition: ComputePme.C:447
void chargeGridReady(Lattice &lattice, int sequence)
Definition: ComputePme.C:3548
int saved_sequence
Definition: ComputePme.C:448
void ComputePmeMgr::recvGrid ( PmeGridMsg msg)

Definition at line 1827 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

// ComputePmeMgr::recvGrid — accumulate one sender's sparse charge-grid
// contribution into qgrid. The message carries, per grid g, a flag byte per
// x-y column (fgrid) and, for each flagged column, zlistlen values at the z
// offsets in zlist. The message object is parked in gridmsg_reuse for reuse;
// when the last expected message arrives, gridCalc1 is triggered (and the
// trans barrier on PE 0 when useBarrier is set).
1827  {
1828  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1829  if ( grid_count == 0 ) {
1830  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1831  }
// First message of the set: capture lattice and sequence number.
1832  if ( grid_count == numSources ) {
1833  lattice = msg->lattice;
1834  grid_sequence = msg->sequence;
1835  }
1836 
1837  int zdim = myGrid.dim3;
1838  int zlistlen = msg->zlistlen;
1839  int *zlist = msg->zlist;
1840  float *qmsg = msg->qgrid;
1841  for ( int g=0; g<numGrids; ++g ) {
1842  char *f = msg->fgrid + fgrid_len * g;
1843  float *q = qgrid + qgrid_size * g;
1844  for ( int i=0; i<fgrid_len; ++i ) {
1845  if ( f[i] ) {
// qmsg advances only through flagged columns: the payload is packed.
1846  for ( int k=0; k<zlistlen; ++k ) {
1847  q[zlist[k]] += *(qmsg++);
1848  }
1849  }
1850  q += zdim;
1851  }
1852  }
1853 
1854  gridmsg_reuse[numSources-grid_count] = msg;
1855  --grid_count;
1856 
1857  if ( grid_count == 0 ) {
1858  pmeProxyDir[CkMyPe()].gridCalc1();
1859  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1860  }
1861 }
int dim3
Definition: PmeBase.h:19
int sequence
Definition: ComputePme.C:116
static int numGrids
Definition: ComputePme.h:32
Lattice lattice
Definition: ComputePme.C:118
void NAMD_bug(const char *err_msg)
Definition: common.C:123
float * qgrid
Definition: ComputePme.C:124
int * zlist
Definition: ComputePme.C:122
int zlistlen
Definition: ComputePme.C:121
char * fgrid
Definition: ComputePme.C:123
void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg msg)

Definition at line 3023 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), ComputePmeUtil::numGrids, pmeComputes, ResizeArray< T >::size(), and submitReductions().

// ComputePmeMgr::recvRecipEvir — accumulate reciprocal-space energy/virial
// from a PME pencil/grid message into the per-grid evir totals, then submit
// reductions once all evir messages AND all ungrid forces are accounted for.
3023  {
3024  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3025  for ( int g=0; g<numGrids; ++g ) {
3026  evir[g] += msg->evir[g];
3027  }
3028  delete msg;
3029  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
// Submit only when this was the last evir message and no forces are pending.
3030  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3031 }
static int numGrids
Definition: ComputePme.h:32
PmeReduction * evir
Definition: ComputePme.C:167
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void submitReductions()
Definition: ComputePme.C:4208
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:454
int size(void) const
Definition: ResizeArray.h:127
void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg msg)

Definition at line 2030 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

// ComputePmeMgr::recvSharedTrans — process a node-shared transpose message.
// The underlying PmeTransMsg is shared by several PEs; a lock-protected
// reference count decides which PE frees the shared message and lock.
2030  {
2031  procTrans(msg->msg);
2032  CmiLock(msg->lock);
2033  int count = --(*msg->count);
2034  CmiUnlock(msg->lock);
// Last user cleans up the shared state; the wrapper is always ours to delete.
2035  if ( count == 0 ) {
2036  CmiDestroyLock(msg->lock);
2037  delete msg->count;
2038  delete msg->msg;
2039  }
2040  delete msg;
2041 }
PmeTransMsg * msg
Definition: ComputePme.C:143
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2048
CmiNodeLock lock
Definition: ComputePme.C:145
void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg msg)

Definition at line 2284 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

// ComputePmeMgr::recvSharedUntrans — process a node-shared untranspose
// message. Mirrors recvSharedTrans: lock-protected refcount, last user frees
// the shared PmeUntransMsg and lock; the wrapper is always deleted here.
2284  {
2285  procUntrans(msg->msg);
2286  CmiLock(msg->lock);
2287  int count = --(*msg->count);
2288  CmiUnlock(msg->lock);
2289  if ( count == 0 ) {
2290  CmiDestroyLock(msg->lock);
2291  delete msg->count;
2292  delete msg->msg;
2293  }
2294  delete msg;
2295 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2302
CmiNodeLock lock
Definition: ComputePme.C:162
PmeUntransMsg * msg
Definition: ComputePme.C:160
void ComputePmeMgr::recvTrans ( PmeTransMsg msg)

Definition at line 2043 of file ComputePme.C.

References procTrans().

// ComputePmeMgr::recvTrans — unshared transpose message: process and free.
2043  {
2044  procTrans(msg);
2045  delete msg;
2046 }
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2048
void ComputePmeMgr::recvUngrid ( PmeGridMsg msg)

Definition at line 2429 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

// ComputePmeMgr::recvUngrid — receive potential-grid results, copy them into
// local storage (pencil or slab layout), and acknowledge via recvAck(0).
// The ungrid_count sanity check is skipped when offloading because that
// counter is node-shared and is checked under cuda_lock inside recvAck.
2429  {
2430  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2431 #ifdef NAMD_CUDA
2432  if ( ! offload ) // would need lock
2433 #endif
2434  if ( ungrid_count == 0 ) {
2435  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2436  }
2437 
2438  if ( usePencils ) copyPencils(msg);
2439  else copyResults(msg);
2440  delete msg;
2441  recvAck(0);
2442 }
void recvAck(PmeAckMsg *)
Definition: ComputePme.C:2444
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void copyPencils(PmeGridMsg *)
Definition: ComputePme.C:3794
void copyResults(PmeGridMsg *)
Definition: ComputePme.C:3986
void ComputePmeMgr::recvUntrans ( PmeUntransMsg msg)

Definition at line 2297 of file ComputePme.C.

References procUntrans().

// ComputePmeMgr::recvUntrans — unshared untranspose message: process and free.
2297  {
2298  procUntrans(msg);
2299  delete msg;
2300 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2302
void ComputePmeMgr::sendChargeGridReady ( )

Definition at line 3521 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, pmeComputes, and ResizeArray< T >::size().

Referenced by cuda_check_pme_charges().

// ComputePmeMgr::sendChargeGridReady — after the CUDA charge grid is ready,
// prime every manager on this node that owns computes (force/evir counters)
// and bump the master's submission count per such manager, then notify the
// master PE to proceed via recvChargeGridReady.
3521  {
3522  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3523  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3524  int cs = mgr->pmeComputes.size();
3525  if ( cs ) {
3526  mgr->ungridForcesCount = cs;
3527  mgr->recipEvirCount = mgr->recipEvirClients;
3528  masterPmeMgr->chargeGridSubmittedCount++;
3529  }
3530  }
3531  pmeProxy[master_pe].recvChargeGridReady();
3532 }
int chargeGridSubmittedCount
Definition: ComputePme.C:444
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:454
int size(void) const
Definition: ResizeArray.h:127
void ComputePmeMgr::sendData ( Lattice lattice,
int  sequence 
)

Definition at line 3958 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

// ComputePmeMgr::sendData — send this PE's charge grid to the grid PEs (slab
// decomposition). Stashes the call parameters in sendDataHelper_* members
// because the offload path performs the sends asynchronously via
// sendDataHelper messages; the non-offload path sends everything inline
// through sendDataPart.
3958  {
3959 
3960  sendDataHelper_lattice = &lattice;
3961  sendDataHelper_sequence = sequence;
3962  sendDataHelper_sourcepe = CkMyPe();
// Hand off accumulated stray-charge errors and reset the counter.
3963  sendDataHelper_errors = strayChargeErrors;
3964  strayChargeErrors = 0;
3965 
3966 #ifdef NAMD_CUDA
3967  if ( offload ) {
3968  for ( int i=0; i < numGridPes; ++i ) {
3969  int pe = gridPeOrder[i]; // different order
// Skip PEs we don't feed, unless errors must be reported anyway.
3970  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
3971 #if CMK_MULTICORE
3972  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3973  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
3974 #else
3975  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
3976 #endif
3977  }
3978  } else
3979 #endif
3980  {
3981  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
3982  }
3983 
3984 }
int sendDataHelper_sequence
Definition: ComputePme.C:371
int sendDataHelper_sourcepe
Definition: ComputePme.C:372
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:370
int sendDataHelper_errors
Definition: ComputePme.C:373
void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors)
Definition: ComputePme.C:3836
void ComputePmeMgr::sendDataHelper ( int  iter)

Definition at line 3945 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

// ComputePmeMgr::sendDataHelper — thin forwarder to the node-group
// NodePmeMgr::sendDataHelper for the iter-th grid PE.
3945  {
3946  nodePmeMgr->sendDataHelper(iter);
3947 }
void sendDataHelper(int)
Definition: ComputePme.C:3949
void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice lattice,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3836 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

3836  {
3837 
3838  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3839 
3840  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3841 
3842  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3843  for (int j=first; j<=last; j++) {
3844  int pe = gridPeOrder[j]; // different order
3845  if ( ! recipPeDest[pe] && ! errors ) continue;
3846  int start = pe * bsize;
3847  int len = bsize;
3848  if ( start >= qsize ) { start = 0; len = 0; }
3849  if ( start + len > qsize ) { len = qsize - start; }
3850  int zdim = myGrid.dim3;
3851  int fstart = start / zdim;
3852  int flen = len / zdim;
3853  int fcount = 0;
3854  int i;
3855 
3856  int g;
3857  for ( g=0; g<numGrids; ++g ) {
3858  char *f = f_arr + fstart + g*fsize;
3859 #ifdef NAMD_CUDA
3860  if ( offload ) {
3861  int errcount = 0;
3862  for ( i=0; i<flen; ++i ) {
3863  f[i] = ffz_host[fstart+i];
3864  fcount += f[i];
3865  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3866  }
3867  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3868  } else
3869 #endif
3870  for ( i=0; i<flen; ++i ) {
3871  fcount += f[i];
3872  }
3873  if ( ! recipPeDest[pe] ) {
3874  int errfound = 0;
3875  for ( i=0; i<flen; ++i ) {
3876  if ( f[i] == 3 ) {
3877  errfound = 1;
3878  break;
3879  }
3880  }
3881  if ( errfound ) {
3882  iout << iERROR << "Stray PME grid charges detected: "
3883  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3884  int iz = -1;
3885  for ( i=0; i<flen; ++i ) {
3886  if ( f[i] == 3 ) {
3887  f[i] = 2;
3888  int jz = (i+fstart)/myGrid.K2;
3889  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3890  }
3891  }
3892  iout << "\n" << endi;
3893  }
3894  }
3895  }
3896 
3897 #ifdef NETWORK_PROGRESS
3898  CmiNetworkProgress();
3899 #endif
3900 
3901  if ( ! recipPeDest[pe] ) continue;
3902 
3903  int zlistlen = 0;
3904  for ( i=0; i<myGrid.K3; ++i ) {
3905  if ( fz_arr[i] ) ++zlistlen;
3906  }
3907 
3908  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3909  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3910 
3911  msg->sourceNode = sourcepe;
3912  msg->lattice = lattice;
3913  msg->start = fstart;
3914  msg->len = flen;
3915  msg->zlistlen = zlistlen;
3916  int *zlist = msg->zlist;
3917  zlistlen = 0;
3918  for ( i=0; i<myGrid.K3; ++i ) {
3919  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3920  }
3921  float *qmsg = msg->qgrid;
3922  for ( g=0; g<numGrids; ++g ) {
3923  char *f = f_arr + fstart + g*fsize;
3924  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
3925  float **q = q_arr + fstart + g*fsize;
3926  for ( i=0; i<flen; ++i ) {
3927  if ( f[i] ) {
3928  for (int h=0; h<myGrid.order-1; ++h) {
3929  q[i][h] += q[i][myGrid.K3+h];
3930  }
3931  for ( int k=0; k<zlistlen; ++k ) {
3932  *(qmsg++) = q[i][zlist[k]];
3933  }
3934  }
3935  }
3936  }
3937 
3938  msg->sequence = compute_sequence;
3939  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3940  pmeProxy[gridPeMap[pe]].recvGrid(msg);
3941  }
3942 
3943 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
int sequence
Definition: ComputePme.C:116
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
if(ComputeNonbondedUtil::goMethod==2)
Lattice lattice
Definition: ComputePme.C:118
#define iout
Definition: InfoStream.h:87
void recvGrid(PmeGridMsg *)
Definition: ComputePme.C:1827
int sourceNode
Definition: ComputePme.C:115
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:123
float * qgrid
Definition: ComputePme.C:124
int * zlist
Definition: ComputePme.C:122
int K3
Definition: PmeBase.h:18
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:109
int zlistlen
Definition: ComputePme.C:121
infostream & endi(infostream &s)
Definition: InfoStream.C:38
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:123
void ComputePmeMgr::sendPencils ( Lattice lattice,
int  sequence 
)

Definition at line 3731 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), ijpair::i, iERROR(), iout, ijpair::j, PmeGrid::K1, PmeGrid::K2, ComputePmeUtil::numGrids, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

3731  {
3732 
3733  sendDataHelper_lattice = &lattice;
3734  sendDataHelper_sequence = sequence;
3735  sendDataHelper_sourcepe = CkMyPe();
3736 
3737 #ifdef NAMD_CUDA
3738  if ( offload ) {
3739  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3740 #if CMK_MULTICORE
3741  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3742  int ib = activePencils[ap].i;
3743  int jb = activePencils[ap].j;
3744  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3745  pmeProxy[destproc].sendPencilsHelper(ap);
3746 #else
3747  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3748 #endif
3749  }
3750  } else
3751 #endif
3752  {
3753  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3754  }
3755 
3756  if ( strayChargeErrors ) {
3757  strayChargeErrors = 0;
3758  iout << iERROR << "Stray PME grid charges detected: "
3759  << CkMyPe() << " sending to (x,y)";
3760  int K1 = myGrid.K1;
3761  int K2 = myGrid.K2;
3762  int dim2 = myGrid.dim2;
3763  int block1 = myGrid.block1;
3764  int block2 = myGrid.block2;
3765  for (int ib=0; ib<xBlocks; ++ib) {
3766  for (int jb=0; jb<yBlocks; ++jb) {
3767  int ibegin = ib*block1;
3768  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3769  int jbegin = jb*block2;
3770  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3771  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3772 
3773  for ( int g=0; g<numGrids; ++g ) {
3774  char *f = f_arr + g*fsize;
3775  if ( ! pencilActive[ib*yBlocks+jb] ) {
3776  for ( int i=ibegin; i<iend; ++i ) {
3777  for ( int j=jbegin; j<jend; ++j ) {
3778  if ( f[i*dim2+j] == 3 ) {
3779  f[i*dim2+j] = 2;
3780  iout << " (" << i << "," << j << ")";
3781  }
3782  }
3783  }
3784  }
3785  }
3786  }
3787  }
3788  iout << "\n" << endi;
3789  }
3790 
3791 }
int dim2
Definition: PmeBase.h:19
CProxy_PmePencilMap zm
Definition: ComputePme.C:634
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
#define iout
Definition: InfoStream.h:87
int block2
Definition: PmeBase.h:21
int sendDataHelper_sequence
Definition: ComputePme.C:371
int sendDataHelper_sourcepe
Definition: ComputePme.C:372
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:370
int i
Definition: ComputePme.C:343
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:109
infostream & endi(infostream &s)
Definition: InfoStream.C:38
int j
Definition: ComputePme.C:343
void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe)
Definition: ComputePme.C:3576
void ComputePmeMgr::sendPencilsHelper ( int  iter)

Definition at line 3718 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

3718  {
3719  nodePmeMgr->sendPencilsHelper(iter);
3720 }
void sendPencilsHelper(int)
Definition: ComputePme.C:3722
void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice lattice,
int  sequence,
int  sourcepe 
)

Definition at line 3576 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

3576  {
3577 
3578  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3579 
3580 #if 0 && USE_PERSISTENT
3581  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3582 #endif
3583  int K1 = myGrid.K1;
3584  int K2 = myGrid.K2;
3585  int dim2 = myGrid.dim2;
3586  int dim3 = myGrid.dim3;
3587  int block1 = myGrid.block1;
3588  int block2 = myGrid.block2;
3589 
3590  // int savedMessages = 0;
3591  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3592 
3593  for (int ap=first; ap<=last; ++ap) {
3594  int ib = activePencils[ap].i;
3595  int jb = activePencils[ap].j;
3596  int ibegin = ib*block1;
3597  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3598  int jbegin = jb*block2;
3599  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3600  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3601 
3602  int fcount = 0;
3603  for ( int g=0; g<numGrids; ++g ) {
3604  char *f = f_arr + g*fsize;
3605 #ifdef NAMD_CUDA
3606  if ( offload ) {
3607  int errcount = 0;
3608  for ( int i=ibegin; i<iend; ++i ) {
3609  for ( int j=jbegin; j<jend; ++j ) {
3610  int k = i*dim2+j;
3611  f[k] = ffz_host[k];
3612  fcount += f[k];
3613  if ( ffz_host[k] & ~1 ) ++errcount;
3614  }
3615  }
3616  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3617  } else
3618 #endif
3619  for ( int i=ibegin; i<iend; ++i ) {
3620  for ( int j=jbegin; j<jend; ++j ) {
3621  fcount += f[i*dim2+j];
3622  }
3623  }
3624  }
3625 
3626 #ifdef NETWORK_PROGRESS
3627  CmiNetworkProgress();
3628 #endif
3629 
3630  if ( ! pencilActive[ib*yBlocks+jb] )
3631  NAMD_bug("PME activePencils list inconsistent");
3632 
3633  int zlistlen = 0;
3634  for ( int i=0; i<myGrid.K3; ++i ) {
3635  if ( fz_arr[i] ) ++zlistlen;
3636  }
3637 
3638  int hd = ( fcount? 1 : 0 ); // has data?
3639  // if ( ! hd ) ++savedMessages;
3640 
3641 
3642  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3643  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3644  msg->sourceNode = sourcepe;
3645  msg->hasData = hd;
3646  msg->lattice = lattice;
3647  if ( hd ) {
3648 #if 0
3649  msg->start = fstart;
3650  msg->len = flen;
3651 #else
3652  msg->start = -1; // obsolete?
3653  msg->len = -1; // obsolete?
3654 #endif
3655  msg->zlistlen = zlistlen;
3656  int *zlist = msg->zlist;
3657  zlistlen = 0;
3658  for ( int i=0; i<myGrid.K3; ++i ) {
3659  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3660  }
3661  char *fmsg = msg->fgrid;
3662  float *qmsg = msg->qgrid;
3663  for ( int g=0; g<numGrids; ++g ) {
3664  char *f = f_arr + g*fsize;
3665  float **q = q_arr + g*fsize;
3666  for ( int i=ibegin; i<iend; ++i ) {
3667  for ( int j=jbegin; j<jend; ++j ) {
3668  *(fmsg++) = f[i*dim2+j];
3669  if( f[i*dim2+j] ) {
3670  for (int h=0; h<myGrid.order-1; ++h) {
3671  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3672  }
3673  for ( int k=0; k<zlistlen; ++k ) {
3674  *(qmsg++) = q[i*dim2+j][zlist[k]];
3675  }
3676  }
3677  }
3678  }
3679  }
3680  }
3681 
3682  msg->sequence = compute_sequence;
3683  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3684  CmiEnableUrgentSend(1);
3685 #if USE_NODE_PAR_RECEIVE
3686  msg->destElem=CkArrayIndex3D(ib,jb,0);
3687  CProxy_PmePencilMap lzm = npMgr->zm;
3688  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3689  int destnode = CmiNodeOf(destproc);
3690 
3691 #if 0
3692  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3693 #endif
3694  pmeNodeProxy[destnode].recvZGrid(msg);
3695 #if 0
3696  CmiUsePersistentHandle(NULL, 0);
3697 #endif
3698 #else
3699 #if 0
3700  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3701 #endif
3702  zPencil(ib,jb,0).recvGrid(msg);
3703 #if 0
3704  CmiUsePersistentHandle(NULL, 0);
3705 #endif
3706 #endif
3707  CmiEnableUrgentSend(0);
3708  }
3709 
3710 
3711  // if ( savedMessages ) {
3712  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3713  // }
3714 
3715 }
int dim2
Definition: PmeBase.h:19
CProxy_PmePencilMap zm
Definition: ComputePme.C:634
int dim3
Definition: PmeBase.h:19
int sequence
Definition: ComputePme.C:116
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
Lattice lattice
Definition: ComputePme.C:118
int block2
Definition: PmeBase.h:21
int sourceNode
Definition: ComputePme.C:115
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:123
CkArrayIndex3D destElem
Definition: ComputePme.C:125
float * qgrid
Definition: ComputePme.C:124
int * zlist
Definition: ComputePme.C:122
int K3
Definition: PmeBase.h:18
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
int i
Definition: ComputePme.C:343
int zlistlen
Definition: ComputePme.C:121
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:123
int j
Definition: ComputePme.C:343
void ComputePmeMgr::sendTrans ( void  )

Definition at line 1939 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

1939  {
1940 
1941  untrans_count = numTransPes;
1942 
1943 #if CMK_SMP && USE_CKLOOP
1944  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1945  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1946  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1947  } else
1948 #endif
1949  {
1950  sendTransSubset(0, numTransNodes-1);
1951  }
1952 
1953 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:178
static void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:1934
#define CKLOOP_CTRL_PME_SENDTRANS
Definition: SimParameters.h:95
void sendTransSubset(int first, int last)
Definition: ComputePme.C:1955
void ComputePmeMgr::sendTransBarrier ( void  )

Definition at line 1924 of file ComputePme.C.

1924  {
1925  sendTransBarrier_received += 1;
1926  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1927  if ( sendTransBarrier_received < numGridPes ) return;
1928  sendTransBarrier_received = 0;
1929  for ( int i=0; i<numGridPes; ++i ) {
1930  pmeProxyDir[gridPeMap[i]].sendTrans();
1931  }
1932 }
void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1955 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), PmeGrid::K2, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, x, PmeTransMsg::x_start, LocalPmeInfo::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendTrans(), and sendTrans().

1955  {
1956  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1957 
1958  // send data for transpose
1959  int zdim = myGrid.dim3;
1960  int nx = localInfo[myGridPe].nx;
1961  int x_start = localInfo[myGridPe].x_start;
1962  int slicelen = myGrid.K2 * zdim;
1963 
1964  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1965 
1966 #if CMK_BLUEGENEL
1967  CmiNetworkProgressAfter (0);
1968 #endif
1969 
1970  for (int j=first; j<=last; j++) {
1971  int node = transNodeOrder[j]; // different order on each node
1972  int pe = transNodeInfo[node].pe_start;
1973  int npe = transNodeInfo[node].npe;
1974  int totlen = 0;
1975  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
1976  LocalPmeInfo &li = localInfo[pe];
1977  int cpylen = li.ny_after_transpose * zdim;
1978  totlen += cpylen;
1979  }
 1980  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
 1981  PRIORITY_SIZE) PmeTransMsg;
 1982  newmsg->sourceNode = myGridPe;
1983  newmsg->lattice = lattice;
1984  newmsg->x_start = x_start;
1985  newmsg->nx = nx;
1986  for ( int g=0; g<numGrids; ++g ) {
1987  float *qmsg = newmsg->qgrid + nx * totlen * g;
1988  pe = transNodeInfo[node].pe_start;
1989  for (int i=0; i<npe; ++i, ++pe) {
1990  LocalPmeInfo &li = localInfo[pe];
1991  int cpylen = li.ny_after_transpose * zdim;
1992  if ( node == myTransNode ) {
1993  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
1994  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
1995  }
1996  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
1997  for ( int x = 0; x < nx; ++x ) {
1998  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
1999  q += slicelen;
2000  qmsg += cpylen;
2001  }
2002  }
2003  }
2004  newmsg->sequence = grid_sequence;
2005  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2006  if ( node == myTransNode ) newmsg->nx = 0;
2007  if ( npe > 1 ) {
2008  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2009  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2010  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2011  }
2012 }
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
if(ComputeNonbondedUtil::goMethod==2)
float * qgrid
Definition: ComputePme.C:137
void fwdSharedTrans(PmeTransMsg *)
Definition: ComputePme.C:2014
#define PRIORITY_SIZE
Definition: Priorities.h:13
int sourceNode
Definition: ComputePme.C:131
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
int ny_after_transpose
Definition: ComputePme.C:233
Lattice lattice
Definition: ComputePme.C:134
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
gridSize x
int y_start_after_transpose
Definition: ComputePme.C:233
void ComputePmeMgr::sendUngrid ( void  )

Definition at line 2369 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

2369  {
2370 
2371 #if CMK_SMP && USE_CKLOOP
2372  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2373  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2374  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2375  } else
2376 #endif
2377  {
2378  sendUngridSubset(0, numSources-1);
2379  }
2380 
2381  grid_count = numSources;
2382  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2383 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:178
static int numGrids
Definition: ComputePme.h:32
static void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2364
#define CKLOOP_CTRL_PME_SENDUNTRANS
Definition: SimParameters.h:98
void sendUngridSubset(int first, int last)
Definition: ComputePme.C:2385
void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2385 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::len, ComputePmeUtil::numGrids, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, PmeGridMsg::qgrid, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

2385  {
2386 
2387 #ifdef NAMD_CUDA
2388  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2389 #else
2390  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2391 #endif
2392 
2393  for ( int j=first; j<=last; ++j ) {
2394  // int msglen = qgrid_len;
2395  PmeGridMsg *newmsg = gridmsg_reuse[j];
2396  int pe = newmsg->sourceNode;
2397  int zdim = myGrid.dim3;
2398  int flen = newmsg->len;
2399  int fstart = newmsg->start;
2400  int zlistlen = newmsg->zlistlen;
2401  int *zlist = newmsg->zlist;
2402  float *qmsg = newmsg->qgrid;
2403  for ( int g=0; g<numGrids; ++g ) {
2404  char *f = newmsg->fgrid + fgrid_len * g;
2405  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2406  for ( int i=0; i<flen; ++i ) {
2407  if ( f[i] ) {
2408  for ( int k=0; k<zlistlen; ++k ) {
2409  *(qmsg++) = q[zlist[k]];
2410  }
2411  }
2412  q += zdim;
2413  }
2414  }
2415  newmsg->sourceNode = myGridPe;
2416 
2417  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2418  CmiEnableUrgentSend(1);
2419 #ifdef NAMD_CUDA
2420  if ( offload ) {
2421  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2422  } else
2423 #endif
2424  pmeProxyDir[pe].recvUngrid(newmsg);
2425  CmiEnableUrgentSend(0);
2426  }
2427 }
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
#define PME_OFFLOAD_UNGRID_PRIORITY
Definition: Priorities.h:42
int sourceNode
Definition: ComputePme.C:115
float * qgrid
Definition: ComputePme.C:124
int * zlist
Definition: ComputePme.C:122
int zlistlen
Definition: ComputePme.C:121
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:123
void ComputePmeMgr::sendUntrans ( void  )

Definition at line 2183 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, ComputePmeUtil::numGrids, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

2183  {
2184 
2185  trans_count = numGridPes;
2186 
2187  { // send energy and virial
2188  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2189  for ( int g=0; g<numGrids; ++g ) {
2190  newmsg->evir[g] = recip_evir2[g];
2191  }
2192  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2193  CmiEnableUrgentSend(1);
2194  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2195  CmiEnableUrgentSend(0);
2196  }
2197 
2198 #if CMK_SMP && USE_CKLOOP
2199  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2200  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2201  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2202  } else
2203 #endif
2204  {
2205  sendUntransSubset(0, numGridNodes-1);
2206  }
2207 
2208 }
static Node * Object()
Definition: Node.h:86
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
SimParameters * simParameters
Definition: Node.h:178
static int numGrids
Definition: ComputePme.h:32
void sendUntransSubset(int first, int last)
Definition: ComputePme.C:2210
PmeReduction * evir
Definition: ComputePme.C:167
#define CKLOOP_CTRL_PME_SENDUNTRANS
Definition: SimParameters.h:98
#define PRIORITY_SIZE
Definition: Priorities.h:13
void recvRecipEvir(PmeEvirMsg *)
Definition: ComputePme.C:3023
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
static void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2178
void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2210 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, x, LocalPmeInfo::x_start, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

2210  {
2211 
2212  int zdim = myGrid.dim3;
2213  int y_start = localInfo[myTransPe].y_start_after_transpose;
2214  int ny = localInfo[myTransPe].ny_after_transpose;
2215  int slicelen = myGrid.K2 * zdim;
2216 
2217  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2218 
2219 #if CMK_BLUEGENEL
2220  CmiNetworkProgressAfter (0);
2221 #endif
2222 
2223  // send data for reverse transpose
2224  for (int j=first; j<=last; j++) {
2225  int node = gridNodeOrder[j]; // different order on each node
2226  int pe = gridNodeInfo[node].pe_start;
2227  int npe = gridNodeInfo[node].npe;
2228  int totlen = 0;
2229  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2230  LocalPmeInfo &li = localInfo[pe];
2231  int cpylen = li.nx * zdim;
2232  totlen += cpylen;
2233  }
2234  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2235  newmsg->sourceNode = myTransPe;
2236  newmsg->y_start = y_start;
2237  newmsg->ny = ny;
2238  for ( int g=0; g<numGrids; ++g ) {
2239  float *qmsg = newmsg->qgrid + ny * totlen * g;
2240  pe = gridNodeInfo[node].pe_start;
2241  for (int i=0; i<npe; ++i, ++pe) {
2242  LocalPmeInfo &li = localInfo[pe];
2243  if ( node == myGridNode ) {
2244  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2245  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2246  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2247  int cpylen = ny * zdim;
2248  for ( int x = 0; x < li.nx; ++x ) {
2249  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2250  q += cpylen;
2251  qmsg += slicelen;
2252  }
2253  } else {
2254  CmiMemcpy((void*)qmsg,
2255  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2256  li.nx*ny*zdim*sizeof(float));
2257  qmsg += li.nx*ny*zdim;
2258  }
2259  }
2260  }
2261  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2262  if ( node == myGridNode ) newmsg->ny = 0;
2263  if ( npe > 1 ) {
2264  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2265  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2266  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2267  }
2268 }
float * qgrid
Definition: ComputePme.C:154
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
if(ComputeNonbondedUtil::goMethod==2)
#define PRIORITY_SIZE
Definition: Priorities.h:13
void fwdSharedUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2270
int ny_after_transpose
Definition: ComputePme.C:233
#define PME_UNTRANS_PRIORITY
Definition: Priorities.h:33
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
gridSize x
int y_start_after_transpose
Definition: ComputePme.C:233
void ComputePmeMgr::submitReductions ( )

Definition at line 4208 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchFepOn, ComputePmeUtil::alchOn, ComputePmeUtil::alchThermIntOn, SimParameters::getElecLambda(), SubmitReduction::item(), ComputePmeUtil::lesFactor, ComputePmeUtil::lesOn, WorkDistrib::messageEnqueueWork(), NAMD_bug(), ComputePmeUtil::numGrids, Node::Object(), ComputePmeUtil::pairOn, REDUCTION_ELECT_ENERGY_PME_TI_1, REDUCTION_ELECT_ENERGY_PME_TI_2, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_STRAY_CHARGE_ERRORS, ResizeArray< T >::resize(), Node::simParameters, ResizeArray< T >::size(), and SubmitReduction::submit().

Referenced by ComputePme::doWork(), and recvRecipEvir().

4208  {
 4209 
 4210  SimParameters *simParams = Node::Object()->simParameters;
 4211 
4212  for ( int g=0; g<numGrids; ++g ) {
4213  float scale = 1.;
4214  if (alchOn) {
4215  BigReal elecLambdaUp, elecLambdaDown;
4216  // alchLambda set on each step in ComputePme::ungridForces()
4217  if ( alchLambda < 0 || alchLambda > 1 ) {
4218  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4219  }
4220  elecLambdaUp = simParams->getElecLambda(alchLambda);
4221  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4222  if ( g == 0 ) scale = elecLambdaUp;
4223  else if ( g == 1 ) scale = elecLambdaDown;
4224  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4225  if (alchDecouple) {
4226  if ( g == 2 ) scale = 1-elecLambdaUp;
4227  else if ( g == 3 ) scale = 1-elecLambdaDown;
4228  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4229  }
4230  } else if ( lesOn ) {
4231  scale = 1.0 / lesFactor;
4232  } else if ( pairOn ) {
4233  scale = ( g == 0 ? 1. : -1. );
4234  }
4235  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4236  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4237  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4238  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4239  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4240  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4241  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4242  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4243  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4244  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4245 
4246  float scale2 = 0.;
4247 
4248  // why is this declared/defined again here?
4249  SimParameters *simParams = Node::Object()->simParameters;
4250 
4251  if (alchFepOn) {
4252  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4253  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4254  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4255  if ( g == 0 ) scale2 = elecLambda2Up;
4256  else if ( g == 1 ) scale2 = elecLambda2Down;
4257  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4258  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4259  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4260  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4261  }
4262  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4263 
4264  if (alchThermIntOn) {
4265 
4266  // no decoupling:
4267  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4268  // (interactions between all atoms [partition 0 OR partition 1],
4269  // minus all [within partition 0])
4270  // U = elecLambdaUp * (U[0] - U[2])
4271  // dU/dl = U[0] - U[2];
4272 
4273  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4274  // (interactions between all atoms [partition 0 OR partition 2],
4275  // minus all [within partition 0])
4276  // U = elecLambdaDown * (U[1] - U[2])
4277  // dU/dl = U[1] - U[2];
4278 
4279  // alchDecouple:
4280  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4281  // (interactions between all atoms [partition 0 OR partition 1]
4282  // minus all [within partition 1] minus all [within partition 0]
4283  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4284  // dU/dl = U[0] - U[2] - U[4];
4285 
4286  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4287  // (interactions between all atoms [partition 0 OR partition 2]
4288  // minus all [within partition 2] minus all [within partition 0]
4289  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4290  // dU/dl = U[1] - U[3] - U[4];
4291 
4292 
4293  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4294  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4295  if (!alchDecouple) {
4296  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4297  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4298  }
4299  else { // alchDecouple
4300  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4301  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4302  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4303  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4304  }
4305  }
4306  }
4307 
4308  alchLambda = -1.; // illegal value to catch if not updated
4309 
4310  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4311  reduction->submit();
4312 
4313  for ( int i=0; i<heldComputes.size(); ++i ) {
4314  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4315  }
4316  heldComputes.resize(0);
4317 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:178
BigReal & item(int i)
Definition: ReductionMgr.h:312
static int numGrids
Definition: ComputePme.h:32
static Bool alchOn
Definition: ComputePme.h:33
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2727
void NAMD_bug(const char *err_msg)
Definition: common.C:123
static Bool alchDecouple
Definition: ComputePme.h:36
static int lesFactor
Definition: ComputePme.h:39
#define simParams
Definition: Output.C:127
static Bool pairOn
Definition: ComputePme.h:40
static Bool lesOn
Definition: ComputePme.h:38
void resize(int i)
Definition: ResizeArray.h:84
void submit(void)
Definition: ReductionMgr.h:323
int size(void) const
Definition: ResizeArray.h:127
static Bool alchFepOn
Definition: ComputePme.h:34
BigReal getElecLambda(const BigReal)
double BigReal
Definition: common.h:114
static Bool alchThermIntOn
Definition: ComputePme.h:35
void ComputePmeMgr::ungridCalc ( void  )

Definition at line 2519 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), PmeGrid::order, pmeComputes, ResizeArray< T >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

// ComputePmeMgr::ungridCalc -- interpolate per-atom forces from the PME
// potential grid for every ComputePme attached to this manager.
// CUDA offload path: the node's master manager uploads the potential grid
// to the device and triggers the other per-rank managers on this node; each
// manager then launches cuda_pme_forces kernels in batches of EVENT_STRIDE
// computes, recording one event per batch that pollForcesReady() later polls
// (via forces_count / forces_done_count, armed at the bottom of this
// function). Non-offload path: enqueues each compute's ungrid work.
2519  {
2520  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2521 
2522  ungridForcesCount = pmeComputes.size();
2523 
2524 #ifdef NAMD_CUDA
2525  if ( offload ) {
2526  //CmiLock(cuda_lock);
2527  cudaSetDevice(deviceCUDA->getDeviceID());
2528 
      // Only the master manager copies the potential grid host->device; it
      // then fans ungridCalc() out to every other rank's manager on the node.
2529  if ( this == masterPmeMgr ) {
2530  double before = CmiWallTimer();
      // NOTE(review): this copy deliberately uses the default stream (0)
      // rather than streams[stream]; the comment below suggests a workaround.
2531  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2532  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2533  // try to make the unspecified launch failures go away
2534  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2535  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2536  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2537 
2538  const int myrank = CkMyRank();
2539  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2540  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2541  nodePmeMgr->mgrObjects[i]->ungridCalc();
2542  }
2543  }
      // The master itself may own no computes; nothing more to do then.
2544  if ( ! pmeComputes.size() ) return;
2545  }
2546 
      // Lazily create one completion event per EVENT_STRIDE computes
      // (timing disabled -- events are only polled for completion).
2547  if ( ! end_forces ) {
2548  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2549  end_forces = new cudaEvent_t[n];
2550  for ( int i=0; i<n; ++i ) {
2551  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2552  }
2553  }
2554 
      // afn_host/afn_dev hold, per compute, a triple of device pointers:
      // atom data, force buffer, and force buffer offset by n (filled below).
2555  const int pcsz = pmeComputes.size();
2556  if ( ! afn_host ) {
2557  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2558  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2559  cuda_errcheck("malloc params for pme");
2560  }
      // Total atom count across local computes sizes the pooled force buffers.
2561  int totn = 0;
2562  for ( int i=0; i<pcsz; ++i ) {
2563  int n = pmeComputes[i]->numGridAtoms[0];
2564  totn += n;
2565  }
      // Grow the pooled allocation with 20% + 100 atoms of slack so small
      // fluctuations in atom counts do not force a realloc every step.
2566  if ( totn > f_data_mgr_alloc ) {
2567  if ( f_data_mgr_alloc ) {
2568  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2569  cudaFree(f_data_mgr_dev);
2570  cudaFreeHost(f_data_mgr_host);
2571  }
2572  f_data_mgr_alloc = 1.2 * (totn + 100);
2573  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2574  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2575  cuda_errcheck("malloc forces for pme");
2576  }
2577  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
      // Carve sequential per-compute slices out of the pooled buffers and
      // record the device-pointer triples the force kernel consumes.
2578  float *f_dev = f_data_mgr_dev;
2579  float *f_host = f_data_mgr_host;
2580  for ( int i=0; i<pcsz; ++i ) {
2581  int n = pmeComputes[i]->numGridAtoms[0];
2582  pmeComputes[i]->f_data_dev = f_dev;
2583  pmeComputes[i]->f_data_host = f_host;
2584  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2585  afn_host[3*i+1] = f_dev;
2586  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2587  f_dev += 3*n;
2588  f_host += 3*n;
2589  }
2590  //CmiLock(cuda_lock);
2591  double before = CmiWallTimer();
2592  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2593  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2594  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
      // Order all subsequent work on this stream after the master's
      // potential-grid upload (recorded on the default stream above).
2595  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2596  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2597  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2598 
      // One kernel launch per batch of EVENT_STRIDE computes: dimy is the
      // batch height, maxn the largest per-compute atom count in the batch,
      // subtotn the batch's total atoms (used for the single copy-back).
2599  for ( int i=0; i<pcsz; ++i ) {
2600  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2601  if ( i%EVENT_STRIDE == 0 ) {
2602  int dimy = pcsz - i;
2603  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2604  int maxn = 0;
2605  int subtotn = 0;
2606  for ( int j=0; j<dimy; ++j ) {
2607  int n = pmeComputes[i+j]->numGridAtoms[0];
2608  subtotn += n;
2609  if ( n > maxn ) maxn = n;
2610  }
2611  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2612  before = CmiWallTimer();
2613  cuda_pme_forces(
2614  bspline_coeffs_dev,
2615  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2616  pmeComputes[i]->a_data_dev,
2617  pmeComputes[i]->f_data_dev,
2618  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2619  streams[stream]);
2620  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2621  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2622  before = CmiWallTimer();
      // The whole batch comes back in one transfer: per-compute slices are
      // contiguous because they were carved sequentially from the pool above.
2623  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2624  cudaMemcpyDeviceToHost, streams[stream]);
2625  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2626  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2627  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2628  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2629  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2630  }
2631  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2632  }
2633  //CmiUnlock(cuda_lock);
2634  } else
2635 #endif // NAMD_CUDA
2636  {
2637  for ( int i=0; i<pmeComputes.size(); ++i ) {
      // NOTE(review): this rendering skips source line 2638 -- per the
      // reference list above it presumably enqueues each compute via
      // WorkDistrib::messageEnqueueWork; verify against ComputePme.C.
2639  // pmeComputes[i]->ungridForces();
2640  }
2641  }
2642  // submitReductions(); // must follow all ungridForces()
2643 
2644 #ifdef NAMD_CUDA
      // Arm the polling state consumed by pollForcesReady() on this PE:
      // timestamp, number of outstanding force sets, and a cleared done count.
2645  if ( offload ) {
2646  forces_time = CmiWallTimer();
2647  forces_count = ungridForcesCount;
2648  forces_done_count = 0;
2649  pmeProxy[this_pe].pollForcesReady();
2650  }
2651 #endif
2652 
      // Re-arm the countdown of incoming grid messages for the next cycle
      // (pencil decomposition vs. slab destinations).
2653  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2654 }
double forces_time
Definition: ComputePme.C:431
float * a_data_dev
Definition: ComputePme.C:419
#define EVENT_STRIDE
Definition: ComputePme.C:2471
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
#define CUDA_EVENT_ID_PME_COPY
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2727
float * f_data_host
Definition: ComputePme.C:420
int order
Definition: PmeBase.h:20
#define CUDA_EVENT_ID_PME_TICK
float * f_data_dev
Definition: ComputePme.C:421
void ungridCalc(void)
Definition: ComputePme.C:2519
int getDeviceID()
Definition: DeviceCUDA.h:107
int K3
Definition: PmeBase.h:18
void cuda_errcheck(const char *msg)
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:454
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:18
cudaEvent_t * end_forces
Definition: ComputePme.C:427
#define CUDA_EVENT_ID_PME_KERNEL
int size(void) const
Definition: ResizeArray.h:127
int forces_done_count
Definition: ComputePme.C:429

Friends And Related Function Documentation

friend class ComputePme
friend

Definition at line 357 of file ComputePme.C.

friend class NodePmeMgr
friend

Definition at line 358 of file ComputePme.C.

Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 419 of file ComputePme.C.

Referenced by cuda_submit_charges(), ComputePme::doWork(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 418 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ComputePme::doWork().

int ComputePmeMgr::chargeGridSubmittedCount
double ComputePmeMgr::charges_time

Definition at line 430 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 432 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 433 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 423 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ComputePme::doWork().

int ComputePmeMgr::cuda_atoms_count
bool ComputePmeMgr::cuda_busy
static

Definition at line 442 of file ComputePme.C.

Referenced by ComputePme::doWork().

CmiNodeLock ComputePmeMgr::cuda_lock
static

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), ComputePme::doWork(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque
static

Definition at line 441 of file ComputePme.C.

Referenced by ComputePme::doWork().

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 426 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 427 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 421 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 420 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock
static
int ComputePmeMgr::forces_count

Definition at line 428 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 429 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 431 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe
ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes
CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 415 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 447 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence
int ComputePmeMgr::sendDataHelper_errors

Definition at line 373 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice
int ComputePmeMgr::sendDataHelper_sequence
int ComputePmeMgr::sendDataHelper_sourcepe
int ComputePmeMgr::this_pe

Definition at line 435 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file: