NAMD
CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded:
Inherits Compute and ComputeNonbondedUtil.

Classes

struct  ComputeRecord
 
struct  PatchRecord
 

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 
 ~CudaComputeNonbonded ()
 
void registerComputeSelf (ComputeID cid, PatchID pid)
 
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
 
void assignPatches (ComputeMgr *computeMgrIn)
 
virtual void initialize ()
 
virtual void atomUpdate ()
 
virtual int noWork ()
 
virtual void doWork ()
 
void launchWork ()
 
void finishReductions ()
 
void unregisterBoxesOnPe ()
 
void assignPatchesOnPe ()
 
void openBoxesOnPe ()
 
void skipPatchesOnPe ()
 
void finishPatchesOnPe ()
 
void finishPatchOnPe (int i)
 
void messageEnqueueWork ()
 
virtual void patchReady (PatchID, int doneMigration, int seq)
 
virtual void gbisP2PatchReady (PatchID, int seq)
 
virtual void gbisP3PatchReady (PatchID, int seq)
 
- Public Member Functions inherited from Compute
 Compute (ComputeID)
 
int type ()
 
virtual ~Compute ()
 
void setNumPatches (int n)
 
int getNumPatches ()
 
int sequence (void)
 
int priority (void)
 
int getGBISPhase (void)
 
- Public Member Functions inherited from ComputeNonbondedUtil
 ComputeNonbondedUtil ()
 
 ~ComputeNonbondedUtil ()
 
void calcGBIS (nonbonded *params, GBISParamStruct *gbisParams)
 

Additional Inherited Members

- Public Types inherited from ComputeNonbondedUtil
enum  {
  exclChecksumIndex, pairlistWarningIndex, electEnergyIndex, fullElectEnergyIndex,
  vdwEnergyIndex, goNativeEnergyIndex, goNonnativeEnergyIndex, groLJEnergyIndex,
  groGaussEnergyIndex, electEnergyIndex_s, fullElectEnergyIndex_s, vdwEnergyIndex_s,
  electEnergyIndex_ti_1, fullElectEnergyIndex_ti_1, vdwEnergyIndex_ti_1, electEnergyIndex_ti_2,
  fullElectEnergyIndex_ti_2, vdwEnergyIndex_ti_2, TENSOR =(virialIndex), TENSOR =(fullElectVirialIndex),
  VECTOR =(pairVDWForceIndex), VECTOR =(pairElectForceIndex), reductionDataSize
}
 
- Static Public Member Functions inherited from ComputeNonbondedUtil
static void select (void)
 
static void submitReductionData (BigReal *, SubmitReduction *)
 
static void submitPressureProfileData (BigReal *, SubmitReduction *)
 
static BigReal square (const BigReal &x, const BigReal &y, const BigReal &z)
 
static void calc_error (nonbonded *)
 
static void calc_pair (nonbonded *)
 
static void calc_pair_energy (nonbonded *)
 
static void calc_pair_fullelect (nonbonded *)
 
static void calc_pair_energy_fullelect (nonbonded *)
 
static void calc_pair_merge_fullelect (nonbonded *)
 
static void calc_pair_energy_merge_fullelect (nonbonded *)
 
static void calc_pair_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_slow_fullelect (nonbonded *)
 
static void calc_self (nonbonded *)
 
static void calc_self_energy (nonbonded *)
 
static void calc_self_fullelect (nonbonded *)
 
static void calc_self_energy_fullelect (nonbonded *)
 
static void calc_self_merge_fullelect (nonbonded *)
 
static void calc_self_energy_merge_fullelect (nonbonded *)
 
static void calc_self_slow_fullelect (nonbonded *)
 
static void calc_self_energy_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_fep (nonbonded *)
 
static void calc_pair_energy_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_self_energy_fep (nonbonded *)
 
static void calc_self_energy_fullelect_fep (nonbonded *)
 
static void calc_self_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_self_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_ti (nonbonded *)
 
static void calc_pair_ti (nonbonded *)
 
static void calc_pair_energy_fullelect_ti (nonbonded *)
 
static void calc_pair_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_slow_fullelect_ti (nonbonded *)
 
static void calc_self_energy_ti (nonbonded *)
 
static void calc_self_ti (nonbonded *)
 
static void calc_self_energy_fullelect_ti (nonbonded *)
 
static void calc_self_fullelect_ti (nonbonded *)
 
static void calc_self_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_self_merge_fullelect_ti (nonbonded *)
 
static void calc_self_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_self_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_les (nonbonded *)
 
static void calc_pair_energy_les (nonbonded *)
 
static void calc_pair_fullelect_les (nonbonded *)
 
static void calc_pair_energy_fullelect_les (nonbonded *)
 
static void calc_pair_merge_fullelect_les (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_les (nonbonded *)
 
static void calc_pair_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_les (nonbonded *)
 
static void calc_self_les (nonbonded *)
 
static void calc_self_energy_les (nonbonded *)
 
static void calc_self_fullelect_les (nonbonded *)
 
static void calc_self_energy_fullelect_les (nonbonded *)
 
static void calc_self_merge_fullelect_les (nonbonded *)
 
static void calc_self_energy_merge_fullelect_les (nonbonded *)
 
static void calc_self_slow_fullelect_les (nonbonded *)
 
static void calc_self_energy_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_int (nonbonded *)
 
static void calc_pair_energy_fullelect_int (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_int (nonbonded *)
 
static void calc_self_energy_int (nonbonded *)
 
static void calc_self_energy_fullelect_int (nonbonded *)
 
static void calc_self_energy_merge_fullelect_int (nonbonded *)
 
static void calc_pair_pprof (nonbonded *)
 
static void calc_pair_energy_pprof (nonbonded *)
 
static void calc_pair_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_fullelect_pprof (nonbonded *)
 
static void calc_pair_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_pprof (nonbonded *)
 
static void calc_self_energy_pprof (nonbonded *)
 
static void calc_self_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_fullelect_pprof (nonbonded *)
 
static void calc_self_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_tabener (nonbonded *)
 
static void calc_pair_energy_tabener (nonbonded *)
 
static void calc_pair_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_fullelect_tabener (nonbonded *)
 
static void calc_pair_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_tabener (nonbonded *)
 
static void calc_self_energy_tabener (nonbonded *)
 
static void calc_self_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_fullelect_tabener (nonbonded *)
 
static void calc_self_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_go (nonbonded *)
 
static void calc_pair_energy_go (nonbonded *)
 
static void calc_pair_fullelect_go (nonbonded *)
 
static void calc_pair_energy_fullelect_go (nonbonded *)
 
static void calc_pair_merge_fullelect_go (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_go (nonbonded *)
 
static void calc_pair_slow_fullelect_go (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_go (nonbonded *)
 
static void calc_self_go (nonbonded *)
 
static void calc_self_energy_go (nonbonded *)
 
static void calc_self_fullelect_go (nonbonded *)
 
static void calc_self_energy_fullelect_go (nonbonded *)
 
static void calc_self_merge_fullelect_go (nonbonded *)
 
static void calc_self_energy_merge_fullelect_go (nonbonded *)
 
static void calc_self_slow_fullelect_go (nonbonded *)
 
static void calc_self_energy_slow_fullelect_go (nonbonded *)
 
- Public Attributes inherited from Compute
const ComputeID cid
 
LDObjHandle ldObjHandle
 
LocalWorkMsg *const localWorkMsg
 
- Static Public Attributes inherited from ComputeNonbondedUtil
static void(* calcPair )(nonbonded *)
 
static void(* calcPairEnergy )(nonbonded *)
 
static void(* calcSelf )(nonbonded *)
 
static void(* calcSelfEnergy )(nonbonded *)
 
static void(* calcFullPair )(nonbonded *)
 
static void(* calcFullPairEnergy )(nonbonded *)
 
static void(* calcFullSelf )(nonbonded *)
 
static void(* calcFullSelfEnergy )(nonbonded *)
 
static void(* calcMergePair )(nonbonded *)
 
static void(* calcMergePairEnergy )(nonbonded *)
 
static void(* calcMergeSelf )(nonbonded *)
 
static void(* calcMergeSelfEnergy )(nonbonded *)
 
static void(* calcSlowPair )(nonbonded *)
 
static void(* calcSlowPairEnergy )(nonbonded *)
 
static void(* calcSlowSelf )(nonbonded *)
 
static void(* calcSlowSelfEnergy )(nonbonded *)
 
static Bool commOnly
 
static Bool fixedAtomsOn
 
static Bool qmForcesOn
 
static BigReal cutoff
 
static BigReal cutoff2
 
static float cutoff2_f
 
static BigReal dielectric_1
 
static const LJTable * ljTable = 0
 
static const Molecule * mol
 
static BigReal r2_delta
 
static BigReal r2_delta_1
 
static int rowsize
 
static int columnsize
 
static int r2_delta_exp
 
static BigReal * table_alloc = 0
 
static BigReal * table_ener = 0
 
static BigReal * table_short
 
static BigReal * table_noshort
 
static BigReal * fast_table
 
static BigReal * scor_table
 
static BigReal * slow_table
 
static BigReal * corr_table
 
static BigReal * full_table
 
static BigReal * vdwa_table
 
static BigReal * vdwb_table
 
static BigReal * r2_table
 
static int table_length
 
static BigReal scaling
 
static BigReal scale14
 
static BigReal switchOn
 
static BigReal switchOn_1
 
static BigReal switchOn2
 
static BigReal v_vdwa
 
static BigReal v_vdwb
 
static BigReal k_vdwa
 
static BigReal k_vdwb
 
static BigReal cutoff_3
 
static BigReal cutoff_6
 
static float v_vdwa_f
 
static float v_vdwb_f
 
static float k_vdwa_f
 
static float k_vdwb_f
 
static float cutoff_3_f
 
static float cutoff_6_f
 
static float switchOn_f
 
static float A6_f
 
static float B6_f
 
static float C6_f
 
static float A12_f
 
static float B12_f
 
static float C12_f
 
static BigReal c0
 
static BigReal c1
 
static BigReal c3
 
static BigReal c5
 
static BigReal c6
 
static BigReal c7
 
static BigReal c8
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchWCAOn
 
static BigReal alchVdwShiftCoeff
 
static Bool vdwForceSwitching
 
static Bool alchDecouple
 
static Bool lesOn
 
static int lesFactor
 
static BigReal lesScaling
 
static BigReal * lambda_table = 0
 
static Bool pairInteractionOn
 
static Bool pairInteractionSelf
 
static Bool pressureProfileOn
 
static int pressureProfileSlabs
 
static int pressureProfileAtomTypes
 
static BigReal pressureProfileThickness
 
static BigReal pressureProfileMin
 
static Bool accelMDOn
 
static Bool drudeNbthole
 
static BigReal ewaldcof
 
static BigReal pi_ewaldcof
 
static int vdw_switch_mode
 
static Bool goGroPair
 
static Bool goForcesOn
 
static int goMethod
 
- Protected Member Functions inherited from Compute
void enqueueWork ()
 
- Protected Attributes inherited from Compute
int computeType
 
int basePriority
 
int gbisPhase
 
int gbisPhasePriority [3]
 

Detailed Description

Definition at line 21 of file CudaComputeNonbonded.h.
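Example (not part of the NAMD sources). CudaComputeNonbonded collects all CUDA nonbonded self- and pair-computes for one device and is driven through the Compute interface once per time step. The standalone C++ sketch below mirrors the call order implied by the member functions on this page; CudaComputeNonbondedSketch and its printed steps are hypothetical stand-ins, not NAMD code:

#include <cstdio>

// Hypothetical skeleton of the per-step driver protocol: noWork() only
// forwards the work to the master PE, doWork() opens the patch boxes,
// launchWork() runs the CUDA kernels, finishReductions() submits the
// energy/virial reductions.
struct CudaComputeNonbondedSketch {
  int noWork() { std::puts("enqueue doWork on masterPe"); return 1; }
  void doWork() { std::puts("open boxes on all PEs, then launch on masterPe"); }
  void launchWork() { std::puts("launch kernels, set force-done callback"); }
  void finishReductions() { std::puts("submit virials and energies"); }
};

int main() {
  CudaComputeNonbondedSketch c;
  // One time step, as orchestrated by the Charm++ runtime in real NAMD:
  if (c.noWork()) { c.doWork(); c.launchWork(); c.finishReductions(); }
  return 0;
}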

Constructor & Destructor Documentation

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 36 of file CudaComputeNonbonded.C.

References cudaCheck, SimParameters::GBISOn, Compute::gbisPhase, NAMD_die(), Node::Object(), SimParameters::pressureProfileOn, Node::simParameters, and simParams.

37  :
38 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
39 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
40 
41  cudaCheck(cudaSetDevice(deviceID));
42 
43  exclusionsByAtom = NULL;
44 
45  vdwTypes = NULL;
46  vdwTypesSize = 0;
47 
48  exclIndexMaxDiff = NULL;
49  exclIndexMaxDiffSize = 0;
50 
51  atomIndex = NULL;
52  atomIndexSize = 0;
53 
54  atomStorageSize = 0;
55 
56  // Atom and charge storage
57  atoms = NULL;
58  atomsSize = 0;
59 
60  // Force storage
61  h_forces = NULL;
62  h_forcesSize = 0;
63  h_forcesSlow = NULL;
64  h_forcesSlowSize = 0;
65 
66  d_forces = NULL;
67  d_forcesSize = 0;
68  d_forcesSlow = NULL;
69  d_forcesSlowSize = 0;
70 
71  // GBIS
72  intRad0H = NULL;
73  intRad0HSize = 0;
74  intRadSH = NULL;
75  intRadSHSize = 0;
76  psiSumH = NULL;
77  psiSumHSize = 0;
78  bornRadH = NULL;
79  bornRadHSize = 0;
80  dEdaSumH = NULL;
81  dEdaSumHSize = 0;
82  dHdrPrefixH = NULL;
83  dHdrPrefixHSize = 0;
84  maxShmemPerBlock = 0;
85  cudaPatches = NULL;
86 
87  atomsChangedIn = true;
88  atomsChanged = true;
89  computesChanged = true;
90 
91  forceDoneEventRecord = false;
92 
 93  SimParameters *simParams = Node::Object()->simParameters;
 94  if (simParams->pressureProfileOn) {
95  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
96  }
97 
98  if (simParams->GBISOn) gbisPhase = 3;
99 
100  doSkip = false;
101 }
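Example (not part of the NAMD sources). Note how the constructor pairs every host/device buffer pointer with a zeroed size: buffers are allocated lazily and only ever grown (see reallocateArrays() in doWork()). A minimal sketch of that grow-only idiom, assuming plain malloc/free in place of NAMD's pinned-memory allocate_host/deallocate_host helpers; reallocate_buffer is a hypothetical name:

#include <cstdio>
#include <cstdlib>

// Grow-only buffer: reallocate only when the required element count
// exceeds the recorded capacity, mirroring the NULL/0 initializations above.
template <typename T>
void reallocate_buffer(T** ptr, int* capacity, int required) {
  if (required <= *capacity) return;   // still fits, keep current storage
  std::free(*ptr);                     // NAMD would use pinned host memory here
  *ptr = static_cast<T*>(std::malloc(required * sizeof(T)));
  *capacity = required;
}

int main() {
  float* h_forces = nullptr;  // mirrors "h_forces = NULL; h_forcesSize = 0;"
  int h_forcesSize = 0;
  reallocate_buffer(&h_forces, &h_forcesSize, 1024);  // first step: allocate
  reallocate_buffer(&h_forces, &h_forcesSize, 512);   // smaller request: no-op
  std::printf("capacity = %d\n", h_forcesSize);       // prints: capacity = 1024
  std::free(h_forces);
  return 0;
}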
CudaComputeNonbonded::~CudaComputeNonbonded ( )

Definition at line 106 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

106  {
107  cudaCheck(cudaSetDevice(deviceID));
108  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
109  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
110  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
111  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
112  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
113  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
114  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
115  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
116 
117  // GBIS
118  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
119  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
120  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
121  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
122  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
123  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
124 
125  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
126 
127  if (patches.size() > 0) {
128  deallocate_host<VirialEnergy>(&h_virialEnergy);
129  deallocate_device<VirialEnergy>(&d_virialEnergy);
130  cudaCheck(cudaStreamDestroy(stream));
131  cudaCheck(cudaEventDestroy(forceDoneEvent));
132  CmiDestroyLock(lock);
133  delete reduction;
134  }
135 
136  // NOTE: unregistering happens in [sync] -entry method
137  computeMgr->sendUnregisterBoxesOnPe(pes, this);
138 
139 }

Member Function Documentation

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn)

Definition at line 363 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), NAMD_bug(), PatchMap::Object(), ComputePmeCUDAMgr::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), Compute::setNumPatches(), and sort.

Referenced by ComputeMgr::createComputes().

363  {
364  // Remove duplicate patches
365  std::sort(patches.begin(), patches.end());
366  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
367  patches.erase(last, patches.end());
368  // Set number of patches
369  setNumPatches(patches.size());
370  masterPe = CkMyPe();
371  computeMgr = computeMgrIn;
372  // Start patch counter
373  patchesCounter = getNumPatches();
374  // Patch ID map
375  std::map<PatchID, int> pidMap;
376 #if 1
377  //-------------------------------------------------------
378  // Copied in from ComputeNonbondedCUDA::assignPatches()
379  //-------------------------------------------------------
380 
381  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
382  int numPesOnNodeSharingDevice = 0;
383  int masterIndex = -1;
384  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
385  int pe = deviceCUDA->getPesSharingDevice(i);
386  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
387  if ( CkNodeOf(pe) == CkMyNode() ) {
388  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
389  }
390  }
391 
392  std::vector<int> count(patches.size(), 0);
393  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
394  std::vector<int> rankpcount(CkMyNodeSize(), 0);
395  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
396 
397  PatchMap* patchMap = PatchMap::Object();
398 
399  int unassignedpatches = patches.size();
400 
401  for (int i=0;i < patches.size(); ++i) {
402  patches[i].pe = -1;
403  }
404 
405  // assign if home pe and build table of natural proxies
406  for (int i=0;i < patches.size(); ++i) {
407  int pid = patches[i].patchID;
408  // homePe = PE where the patch currently resides
409  int homePe = patchMap->node(pid);
410  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
411  int pe = pesOnNodeSharingDevice[j];
412  // If homePe is sharing this device, assign this patch to homePe
413  if ( pe == homePe ) {
414  patches[i].pe = pe;
415  --unassignedpatches;
416  pcount[j] += 1;
417  }
418  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
419  table[i*numPesOnNodeSharingDevice + j] = 1;
420  }
421  }
422  // Assign this patch to homePe, if it resides on the same node
423  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
424  patches[i].pe = homePe;
425  --unassignedpatches;
426  rankpcount[CkRankOf(homePe)] += 1;
427  }
428  }
429  // assign if only one pe has a required proxy
430  for (int i=0; i < patches.size(); ++i) {
431  int pid = patches[i].patchID;
432  if ( patches[i].pe != -1 ) continue;
433  int c = 0;
434  int lastj;
435  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
436  if ( table[i*numPesOnNodeSharingDevice + j] ) {
437  ++c;
438  lastj = j;
439  }
440  }
441  count[i] = c;
442  if ( c == 1 ) {
443  patches[i].pe = pesOnNodeSharingDevice[lastj];
444  --unassignedpatches;
445  pcount[lastj] += 1;
446  }
447  }
448  int assignj = 0;
449  while ( unassignedpatches ) {
450  int i;
451  for (i=0;i < patches.size(); ++i) {
452  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
453  int pid = patches[i].patchID;
454  // patch_record &pr = patchRecords[pid];
455  if ( patches[i].pe != -1 ) continue;
456  patches[i].pe = pesOnNodeSharingDevice[assignj];
457  --unassignedpatches;
458  pcount[assignj] += 1;
459  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
460  break;
461  }
462  if (i < patches.size() ) continue; // start search again
463  for ( i=0;i < patches.size(); ++i ) {
464  int pid = patches[i].patchID;
465  // patch_record &pr = patchRecords[pid];
466  if ( patches[i].pe != -1 ) continue;
467  if ( count[i] ) continue;
468  patches[i].pe = pesOnNodeSharingDevice[assignj];
469  --unassignedpatches;
470  pcount[assignj] += 1;
471  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
472  break;
473  }
474  if ( i < patches.size() ) continue; // start search again
475  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
476  }
477 
478  // For each rank, list of patches
479  rankPatches.resize(CkMyNodeSize());
480  for (int i=0; i < patches.size(); ++i) {
481  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
482  pidMap[patches[i].patchID] = i;
483  }
484 
485  // for ( int i=0; i < patches.size(); ++i ) {
486  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
487  // }
488 
489 /*
490  slavePes = new int[CkMyNodeSize()];
491  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
492  numSlaves = 0;
493  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
494  int pe = pesOnNodeSharingDevice[j];
495  int rank = pe - CkNodeFirst(CkMyNode());
496  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
497  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
498  if ( pe == CkMyPe() ) continue;
499  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
500  rankpcount[rank] = 0; // skip in rank loop below
501  slavePes[numSlaves] = pe;
502  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
503  ++numSlaves;
504  }
505  for ( int j=0; j<CkMyNodeSize(); ++j ) {
506  int pe = CkNodeFirst(CkMyNode()) + j;
507  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
508  // CkMyPe(),j,pe,rankpcount[j]);
509  if ( ! rankpcount[j] ) continue;
510  if ( pe == CkMyPe() ) continue;
511  slavePes[numSlaves] = pe;
512  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
513  ++numSlaves;
514  }
515 */
516 
517 #else
518  // For each rank, list of patches
519  rankPatches.resize(CkMyNodeSize());
520  // For each rank, list of home patch IDs
521  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
522  for (int i=0;i < CkMyNodeSize();i++) {
523  int pe = CkNodeFirst(CkMyNode()) + i;
524  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
525  }
526  std::vector<int> proxyPatchPes;
527  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
528  //--------------------------------------------------------
529  // Build a list of PEs to avoid
530  std::vector<int> pesToAvoid;
531 #if 0
532  // Avoid other GPUs' master PEs
533  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
534  int pe = deviceCUDA->getMasterPeForDeviceID(i);
535  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
536  }
537  // Avoid PEs that are involved in PME
538  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
539  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
540  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
541  }
542  // Set counters of avoidable PEs to high numbers
543  for (int i=0;i < pesToAvoid.size();i++) {
544  int pe = pesToAvoid[i];
545  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
546  }
547 #endif
548  // Avoid master Pe somewhat
549  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
550  //--------------------------------------------------------
551  for (int i=0;i < patches.size();i++) {
552  PatchID pid = patches[i].patchID;
553  int pe = findHomePatchPe(rankHomePatchIDs, pid);
554  if (pe == -1) {
555  // Patch not present on this node => try finding a ProxyPatch
556  findProxyPatchPes(proxyPatchPes, pid);
557  if (proxyPatchPes.size() == 0) {
558  // No ProxyPatch => create one on rank that has the least ProxyPatches
559  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
560  pe = CkNodeFirst(CkMyNode()) + rank;
561  peProxyPatchCounter[rank]++;
562  } else {
563  // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
564  // this is done by finding the entry with the minimum peProxyPatchCounter value
565  // Find minimum among proxyPatchPes, i.e., find the minimum among
566  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
567  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
568  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
569  // - proxyPatchPes.begin();
570  // pe = proxyPatchPes[pppi];
571  int minCounter = (1 << 30);
572  for (int j=0;j < proxyPatchPes.size();j++) {
573  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
574  pe = proxyPatchPes[j];
575  minCounter = peProxyPatchCounter[CkRankOf(pe)];
576  }
577  }
578  if (pe == -1)
579  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
580  peProxyPatchCounter[CkRankOf(pe)]++;
581  }
582  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
583  // Found home patch on this node, but it's on PE that should be avoided => find a new one
584  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
585  pe = CkNodeFirst(CkMyNode()) + rank;
586  peProxyPatchCounter[rank]++;
587  }
588  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
589  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
590  rankPatches[CkRankOf(pe)].push_back(i);
591  pidMap[pid] = i;
592  }
593 
594  delete [] rankHomePatchIDs;
595 #endif
596  // Setup computes using pidMap
597  for (int i=0;i < computes.size();i++) {
598  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
599  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
600  }
601  for (int i=0;i < CkMyNodeSize();i++) {
602  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
603  }
604  computeMgr->sendAssignPatchesOnPe(pes, this);
605 }
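Example (not part of the NAMD sources). The proxy-patch fallback in the #else branch above repeatedly assigns a patch to the node-local rank holding the fewest proxies; the sketch below isolates that std::min_element selection with made-up counter values. The (1 << 20) pre-load, as in the source, makes PEs to avoid effectively ineligible:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Per-rank proxy-patch counters; rank 2 is pre-loaded to be avoided.
  std::vector<int> peProxyPatchCounter = {2, 0, (1 << 20), 1};
  int rank = static_cast<int>(std::min_element(peProxyPatchCounter.begin(),
                                               peProxyPatchCounter.end()) -
                              peProxyPatchCounter.begin());
  peProxyPatchCounter[rank]++;  // charge the chosen rank with the new proxy
  std::printf("chosen rank = %d\n", rank);  // prints: chosen rank = 1
  return 0;
}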
void CudaComputeNonbonded::assignPatchesOnPe ( )

Definition at line 301 of file CudaComputeNonbonded.C.

References ResizeArray< T >::add(), NAMD_bug(), PatchMap::node(), PatchMap::Object(), ResizeArray< T >::size(), and sort.

Referenced by ComputeMgr::recvAssignPatchesOnPe().

301  {
302  if (rankPatches[CkMyRank()].size() == 0)
303  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
304 
305  // calculate priority rank of local home patch within pe
306  {
307  PatchMap* patchMap = PatchMap::Object();
308  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
309  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
310  int i = rankPatches[CkMyRank()][k];
311  int pid = patches[i].patchID;
312  int homePe = patchMap->node(pid);
313  if ( CkNodeOf(homePe) == CkMyNode() ) {
314  int2 pid_index;
315  pid_index.x = pid;
316  pid_index.y = i;
317  homePatchByRank[CkRankOf(homePe)].add(pid_index);
318  }
319  }
320  for ( int i=0; i<CkMyNodeSize(); ++i ) {
321  pid_sortop_reverse_priority so;
322  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
323  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
324  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
325  int index = homePatchByRank[i][j].y;
326  patches[index].reversePriorityRankInPe = j + masterBoost;
327  }
328  }
329  }
330 
331  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
332  assignPatch(rankPatches[CkMyRank()][i]);
333  }
334 }
void CudaComputeNonbonded::atomUpdate ( void  )
virtual

Reimplemented from Compute.

Definition at line 644 of file CudaComputeNonbonded.C.

644  {
645  atomsChangedIn = true;
646 }
void CudaComputeNonbonded::doWork ( void  )
virtual

Reimplemented from Compute.

Definition at line 921 of file CudaComputeNonbonded.C.

References Flags::doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, Flags::doVirial, SimParameters::GBISOn, Compute::gbisPhase, NAMD_bug(), Node::Object(), ComputeMgr::sendOpenBoxesOnPe(), and Node::simParameters.

921  {
922  if (CkMyPe() != masterPe)
923  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
924 
925  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
926  // atomsChangedIn can be set to true by any Pe
927  // atomsChanged can only be set by masterPe
928  // This use of two variables makes sure we don't have a race condition
929  atomsChanged = atomsChangedIn;
930  atomsChangedIn = false;
931 
932  SimParameters *simParams = Node::Object()->simParameters;
933 
934  if (patches.size() == 0) return; // No work to do
935 
936  // Take the flags from the first patch on this Pe
937  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
938  Flags &flags = patches[0].patch->flags;
939 
940  doSlow = flags.doFullElectrostatics;
941  doEnergy = flags.doEnergy;
942  doVirial = flags.doVirial;
943 
944  if (flags.doNonbonded) {
945 
946  if (simParams->GBISOn) {
947  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
948  }
949 
950  if (!simParams->GBISOn || gbisPhase == 1) {
951  if ( computesChanged ) {
952  updateComputes();
953  }
954  if (atomsChanged) {
955  // Re-calculate patch atom numbers and storage
956  updatePatches();
957  reSortDone = false;
958  }
959  reallocateArrays();
960  }
961 
962  // Open boxes on Pes and launch work to masterPe
963  computeMgr->sendOpenBoxesOnPe(pes, this);
964 
965  } else {
966  // No work to do, skip
967  skip();
968  }
969 
970 }
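Example (not part of the NAMD sources). The atomsChangedIn / atomsChanged comment above describes a two-flag handoff: any PE may raise the incoming flag, but only masterPe consumes and resets it. The sketch below shows the pattern with std::atomic providing the ordering that Charm++ message scheduling provides in NAMD:

#include <atomic>
#include <cstdio>

std::atomic<bool> atomsChangedIn{false};  // may be set by any PE (atomUpdate)
bool atomsChanged = false;                // read and reset by masterPe only

void atomUpdate() { atomsChangedIn = true; }

void doWorkOnMaster() {
  // Take the incoming flag and reset it in one step.
  atomsChanged = atomsChangedIn.exchange(false);
  if (atomsChanged) std::puts("rebuild patch and atom storage this step");
}

int main() {
  atomUpdate();
  doWorkOnMaster();
  return 0;
}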
void CudaComputeNonbonded::finishPatchesOnPe ( )

Definition at line 1371 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchesOnPe().

1371  {
1372  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1373 }
void CudaComputeNonbonded::finishPatchOnPe ( int  i)

Definition at line 1378 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

1378  {
1379  std::vector<int> v(1, i);
1380  finishSetOfPatchesOnPe(v);
1381 }
void CudaComputeNonbonded::finishReductions ( )

Definition at line 1212 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energyVdw, SimParameters::GBISOn, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), Node::Object(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, Node::simParameters, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

1212  {
1213 
1214  if (CkMyPe() != masterPe)
1215  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1216 
1217  // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1218 
1219  if (!doSkip) {
1220 
1221  if (doStreaming && (doVirial || doEnergy)) {
1222  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1223  if (!forceDoneEventRecord)
1224  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1225  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1226  forceDoneEventRecord = false;
1227  }
1228 
1229  if (doVirial) {
1230  Tensor virialTensor;
1231  virialTensor.xx = h_virialEnergy->virial[0];
1232  virialTensor.xy = h_virialEnergy->virial[1];
1233  virialTensor.xz = h_virialEnergy->virial[2];
1234  virialTensor.yx = h_virialEnergy->virial[3];
1235  virialTensor.yy = h_virialEnergy->virial[4];
1236  virialTensor.yz = h_virialEnergy->virial[5];
1237  virialTensor.zx = h_virialEnergy->virial[6];
1238  virialTensor.zy = h_virialEnergy->virial[7];
1239  virialTensor.zz = h_virialEnergy->virial[8];
1240  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1241  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1242  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1243  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1244  if (doSlow) {
1245  Tensor virialTensor;
1246  virialTensor.xx = h_virialEnergy->virialSlow[0];
1247  virialTensor.xy = h_virialEnergy->virialSlow[1];
1248  virialTensor.xz = h_virialEnergy->virialSlow[2];
1249  virialTensor.yx = h_virialEnergy->virialSlow[3];
1250  virialTensor.yy = h_virialEnergy->virialSlow[4];
1251  virialTensor.yz = h_virialEnergy->virialSlow[5];
1252  virialTensor.zx = h_virialEnergy->virialSlow[6];
1253  virialTensor.zy = h_virialEnergy->virialSlow[7];
1254  virialTensor.zz = h_virialEnergy->virialSlow[8];
1255  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1256  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1257  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1258  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1259  }
1260  }
1261  if (doEnergy) {
1262  // if (doSlow)
1263  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1264  SimParameters *simParams = Node::Object()->simParameters;
1265  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1266  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1267  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1268  if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1269  // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
1270  }
1271 
1272  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1273  }
1274  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1275  reduction->submit();
1276 
1277  // Reset flags
1278  doSkip = false;
1279  computesChanged = false;
1280 }
void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 244 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

244  {
245  CmiLock(lock);
246  Compute::gbisP2PatchReady(pid, seq);
247  CmiUnlock(lock);
248 }
void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 250 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

250  {
251  CmiLock(lock);
252  Compute::gbisP3PatchReady(pid, seq);
253  CmiUnlock(lock);
254 }
void CudaComputeNonbonded::initialize ( void  )
virtual

Reimplemented from Compute.

Definition at line 607 of file CudaComputeNonbonded.C.

References cudaCheck, ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

607  {
608  if (patches.size() > 0) {
609  // Allocate CUDA version of patches
610  cudaCheck(cudaSetDevice(deviceID));
611  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
612 
613  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
614  allocate_device<VirialEnergy>(&d_virialEnergy, 1);
615 
616  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
617  */
618  cudaDeviceProp props;
619  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); // Gets properties of device 'deviceID'
620  maxShmemPerBlock = props.sharedMemPerBlock;
621 
622 #if CUDA_VERSION >= 5050
623  int leastPriority, greatestPriority;
624  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
625  int priority = (doStreaming) ? leastPriority : greatestPriority;
626  // int priority = greatestPriority;
627  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
628 #else
629  cudaCheck(cudaStreamCreate(&stream));
630 #endif
631  cudaCheck(cudaEventCreate(&forceDoneEvent));
632 
633  buildExclusions();
634 
635  lock = CmiCreateLock();
636 
637  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
638  }
639 }
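Example (not part of the NAMD sources). A standalone version of the prioritized-stream setup above: streaming mode takes the least-urgent priority so the force stream yields to other device work, otherwise the most-urgent priority is used. Same CUDA runtime calls, minus NAMD's cudaCheck wrapper:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  bool doStreaming = true;  // assumption for this sketch
  int priority = doStreaming ? leastPriority : greatestPriority;
  cudaStream_t stream;
  cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
  std::printf("priority range [%d, %d], using %d\n",
              leastPriority, greatestPriority, priority);
  cudaStreamDestroy(stream);
  return 0;
}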
void CudaComputeNonbonded::launchWork ( )

Definition at line 972 of file CudaComputeNonbonded.C.

References CudaComputeNonbonded::PatchRecord::atomStart, cudaCheck, ComputeNonbondedUtil::cutoff, SimParameters::GBISOn, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), CudaComputeNonbondedKernel::getPatchReadyQueue(), SubmitReduction::item(), NAMD_bug(), CudaComputeNonbonded::PatchRecord::numAtoms, Node::Object(), CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, Flags::savePairlists, Node::simParameters, and Flags::usePairlists.

Referenced by ComputeMgr::recvLaunchWork().

972  {
973  if (CkMyPe() != masterPe)
974  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
975 
976  beforeForceCompute = CkWallTimer();
977 
978  cudaCheck(cudaSetDevice(deviceID));
979  SimParameters *simParams = Node::Object()->simParameters;
980 
981  //execute only during GBIS phase 1, or if not using GBIS
982  if (!simParams->GBISOn || gbisPhase == 1) {
983 
984  if ( atomsChanged || computesChanged ) {
985  // Invalidate pair lists
986  pairlistsValid = false;
987  pairlistTolerance = 0.0f;
988  }
989 
990  // Get maximum atom movement and patch tolerance
991  float maxAtomMovement = 0.0f;
992  float maxPatchTolerance = 0.0f;
993  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
994  // Update pair-list cutoff
995  Flags &flags = patches[0].patch->flags;
996  savePairlists = false;
997  usePairlists = false;
998  if ( flags.savePairlists ) {
999  savePairlists = true;
1000  usePairlists = true;
1001  } else if ( flags.usePairlists ) {
1002  if ( ! pairlistsValid ||
1003  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1004  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1005  } else {
1006  usePairlists = true;
1007  }
1008  }
1009  if ( ! usePairlists ) {
1010  pairlistsValid = false;
1011  }
1012  float plcutoff = cutoff;
1013  if ( savePairlists ) {
1014  pairlistsValid = true;
1015  pairlistTolerance = 2. * maxPatchTolerance;
1016  plcutoff += pairlistTolerance;
1017  }
1018  plcutoff2 = plcutoff * plcutoff;
1019 
1020  // if (atomsChanged)
1021  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1022  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1023 
1024  } // if (!simParams->GBISOn || gbisPhase == 1)
1025 
1026  // Calculate PME & VdW forces
1027  if (!simParams->GBISOn || gbisPhase == 1) {
1028  doForce();
1029  if (doStreaming) {
1030  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1031  patchReadyQueueLen = tileListKernel.getNumPatches();
1032  patchReadyQueueNext = 0;
1033  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1034  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1035  int* emptyPatches = tileListKernel.getEmptyPatches();
1036  for (int i=0;i < numEmptyPatches;i++) {
1037  PatchRecord &pr = patches[emptyPatches[i]];
1038  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1039  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1040  patchReadyQueue[i] = emptyPatches[i];
1041  }
1042  if (patchReadyQueueLen != patches.size())
1043  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1044  }
1045  }
1046 
1047  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1048  // before calling doGBISphase1().
1049  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1050  // In this code path doGBISphase1() is called in forceDone()
1051  forceDoneSetCallback();
1052  return;
1053  }
1054 
1055  // GBIS Phases
1056  if (simParams->GBISOn) {
1057  if (gbisPhase == 1) {
1058  doGBISphase1();
1059  } else if (gbisPhase == 2) {
1060  doGBISphase2();
1061  } else if (gbisPhase == 3) {
1062  doGBISphase3();
1063  }
1064  }
1065 
1066  // Copy forces to host
1067  if (!simParams->GBISOn || gbisPhase == 3) {
1068  if (!doStreaming) {
1069  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1070  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1071  }
1072  }
1073 
1074  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1075  // For GBIS, energies are ready after phase 2
1076  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1077  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1078  d_forces, d_forcesSlow, d_virialEnergy, stream);
1079  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1080  }
1081 
1082  // Setup call back
1083  forceDoneSetCallback();
1084 }
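Example (not part of the NAMD sources). Worked numbers for the pairlist-cutoff arithmetic above: when lists are saved, the cutoff is padded by 2 * maxPatchTolerance, and a saved list remains usable while 2 * maxAtomMovement does not exceed that tolerance. All values below are illustrative:

#include <cstdio>

int main() {
  const float cutoff = 12.0f;            // nonbonded cutoff (illustrative)
  const float maxPatchTolerance = 1.0f;  // largest per-patch movement bound
  float pairlistTolerance = 2.0f * maxPatchTolerance;
  float plcutoff = cutoff + pairlistTolerance;   // 14
  float plcutoff2 = plcutoff * plcutoff;         // kernels compare r^2
  std::printf("plcutoff = %g, plcutoff2 = %g\n", plcutoff, plcutoff2);

  // On a later step, reuse the list only if atoms have moved little enough:
  float maxAtomMovement = 0.8f;
  bool usePairlists = (2.0f * maxAtomMovement <= pairlistTolerance);
  std::printf("usePairlists = %d\n", usePairlists);  // prints: 1
  return 0;
}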
void CudaComputeNonbonded::messageEnqueueWork ( )

Definition at line 861 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

861  {
862  if (masterPe != CkMyPe())
863  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
864  WorkDistrib::messageEnqueueWork(this);
865 }
int CudaComputeNonbonded::noWork ( )
virtual

Reimplemented from Compute.

Definition at line 886 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

886  {
887  // Simply enqueue doWork on masterPe and return "no work"
888  computeMgr->sendMessageEnqueueWork(masterPe, this);
889  return 1;
890 }
void CudaComputeNonbonded::openBoxesOnPe ( )

Definition at line 867 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendLaunchWork().

Referenced by ComputeMgr::recvOpenBoxesOnPe().

867  {
868  if (rankPatches[CkMyRank()].size() == 0)
869  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
870  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
871  openBox(rankPatches[CkMyRank()][i]);
872  }
873  bool done = false;
874  CmiLock(lock);
875  patchesCounter -= rankPatches[CkMyRank()].size();
876  if (patchesCounter == 0) {
877  patchesCounter = getNumPatches();
878  done = true;
879  }
880  CmiUnlock(lock);
881  if (done) {
882  computeMgr->sendLaunchWork(masterPe, this);
883  }
884 }
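Example (not part of the NAMD sources). openBoxesOnPe() and skipPatchesOnPe() share a node-level countdown: each rank subtracts its patch count under the lock, and the rank that reaches zero re-arms the counter and sends the continuation to masterPe. A sketch with std::mutex standing in for CmiLock; decrementAndCheck is a hypothetical name:

#include <cstdio>
#include <mutex>

const int numPatches = 8;   // total patches handled by this compute
std::mutex lock;
int patchesCounter = numPatches;

// Returns true only for the rank that drives the counter to zero.
bool decrementAndCheck(int myPatchCount) {
  std::lock_guard<std::mutex> guard(lock);
  patchesCounter -= myPatchCount;
  if (patchesCounter == 0) {
    patchesCounter = numPatches;  // re-arm for the next step
    return true;
  }
  return false;
}

int main() {
  // e.g. three ranks reporting 3, 3, and 2 patches, in order:
  bool a = decrementAndCheck(3);
  bool b = decrementAndCheck(3);
  bool c = decrementAndCheck(2);
  std::printf("%d %d %d\n", a, b, c);  // prints: 0 0 1 (last rank triggers send)
  return 0;
}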
void CudaComputeNonbonded::patchReady ( PatchID  pid,
int  doneMigration,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 232 of file CudaComputeNonbonded.C.

References NAMD_bug(), and Compute::patchReady().

232  {
233  if (doneMigration) {
234  int i = findPid(pid);
235  if (i == -1)
236  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
237  updatePatch(i);
238  }
239  CmiLock(lock);
240  Compute::patchReady(pid, doneMigration, seq);
241  CmiUnlock(lock);
242 }
void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID *  pid,
int *  trans 
)

Definition at line 173 of file CudaComputeNonbonded.C.

References PatchMap::center(), PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

173  {
174  computesChanged = true;
175  addPatch(pid[0]);
176  addPatch(pid[1]);
177  PatchMap* patchMap = PatchMap::Object();
178  int t1 = trans[0];
179  int t2 = trans[1];
180  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
181  offset.x += (t1%3-1) - (t2%3-1);
182  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
183  offset.z += (t1/9-1) - (t2/9-1);
184  addCompute(cid, pid[0], pid[1], offset);
185 }
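Example (not part of the NAMD sources). The trans indices decoded above pack a periodic-image shift of -1, 0, or +1 per axis into one base-3 value, t = (x+1) + 3*(y+1) + 9*(z+1). The sketch below inverts that packing; decodeTrans is a hypothetical helper matching the (t%3-1), ((t/3)%3-1), (t/9-1) arithmetic in the listing:

#include <cstdio>

// Inverse of t = (x+1) + 3*(y+1) + 9*(z+1).
void decodeTrans(int t, int& dx, int& dy, int& dz) {
  dx = t % 3 - 1;
  dy = (t / 3) % 3 - 1;
  dz = t / 9 - 1;
}

int main() {
  int dx, dy, dz;
  decodeTrans(13, dx, dy, dz);            // 13 is the identity transform
  std::printf("%d %d %d\n", dx, dy, dz);  // prints: 0 0 0
  decodeTrans(26, dx, dy, dz);
  std::printf("%d %d %d\n", dx, dy, dz);  // prints: 1 1 1
  return 0;
}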
void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 163 of file CudaComputeNonbonded.C.

163  {
164  computesChanged = true;
165  addPatch(pid);
166  addCompute(cid, pid, pid, 0.);
167 }
void CudaComputeNonbonded::skipPatchesOnPe ( )

Definition at line 686 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

686  {
687  if (rankPatches[CkMyRank()].size() == 0)
688  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
689  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
690  skipPatch(rankPatches[CkMyRank()][i]);
691  }
692  bool done = false;
693  CmiLock(lock);
694  patchesCounter -= rankPatches[CkMyRank()].size();
695  if (patchesCounter == 0) {
696  patchesCounter = getNumPatches();
697  done = true;
698  }
699  CmiUnlock(lock);
700  if (done) {
701  // Reduction must be done on masterPe
702  computeMgr->sendFinishReductions(masterPe, this);
703  }
704 }
void CudaComputeNonbonded::unregisterBoxesOnPe ( )

Definition at line 151 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

151  {
152  if (rankPatches[CkMyRank()].size() == 0)
153  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
154  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
155  unregisterBox(rankPatches[CkMyRank()][i]);
156  }
157 }

The documentation for this class was generated from the following files:
CudaComputeNonbonded.h
CudaComputeNonbonded.C