NAMD
Public Member Functions | List of all members
DeviceCUDA Class Reference

#include <DeviceCUDA.h>

Public Member Functions

 DeviceCUDA ()
 
 ~DeviceCUDA ()
 
void initialize ()
 
int getDeviceCount ()
 
int getNumDevice ()
 
bool device_shared_with_pe (int pe)
 
bool one_device_per_node ()
 
int getNoStreaming ()
 
int getNoMergeGrids ()
 
int getMergeGrids ()
 
void setMergeGrids (const int val)
 
bool getSharedGpu ()
 
int getNextPeSharingGpu ()
 
int getMasterPe ()
 
int getNumPesSharingDevice ()
 
int getPesSharingDevice (const int i)
 
int getGpuIsMine ()
 
void setGpuIsMine (const int val)
 
int getDeviceID ()
 
int getDeviceIDbyRank (int rank)
 
int getDeviceIDforPe (int pe)
 
int getMasterPeForDeviceID (int deviceID)
 
int getMaxNumThreads ()
 
int getMaxNumBlocks ()
 

Detailed Description

The DeviceCUDA class manages CUDA device discovery and assignment for a NAMD run: it counts the available devices, filters out unusable ones (prohibited compute mode, compute capability below 3.0, no host-memory mapping), binds each PE (processing element) to a device, records which PEs share a device and which PE is the designated master for each device, and exposes per-device properties such as the maximum number of threads per block.

Definition at line 33 of file DeviceCUDA.h.

Constructor & Destructor Documentation

DeviceCUDA::DeviceCUDA ( )

Definition at line 80 of file DeviceCUDA.C.

80 : deviceProps(NULL), devices(NULL) {}
DeviceCUDA::~DeviceCUDA ( )

Definition at line 365 of file DeviceCUDA.C.

365  {
366  if (deviceProps != NULL) delete [] deviceProps;
367  if (devices != NULL) delete [] devices;
368  delete [] pesSharingDevice;
369 }

Member Function Documentation

bool DeviceCUDA::device_shared_with_pe ( int  pe)

Definition at line 388 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

388  {
389  for ( int i=0; i<numPesSharingDevice; ++i ) {
390  if ( pesSharingDevice[i] == pe ) return true;
391  }
392  return false;
393 }
int DeviceCUDA::getDeviceCount ( )
inline

Definition at line 87 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

87 {return deviceCount;}
int DeviceCUDA::getDeviceID ( )
inline
int DeviceCUDA::getDeviceIDbyRank ( int  rank)
inline
int DeviceCUDA::getDeviceIDforPe ( int  pe)

Definition at line 374 of file DeviceCUDA.C.

References deviceIDList.

374  {
375  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
376 }
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:68
int DeviceCUDA::getGpuIsMine ( )
inline

Definition at line 104 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::doWork().

104 {return gpuIsMine;}
int DeviceCUDA::getMasterPe ( )
inline
int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID)

Definition at line 381 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

381  {
382  return masterPeList[deviceID % deviceCount] - 1;
383 }
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:71
int DeviceCUDA::getMaxNumBlocks ( )
int DeviceCUDA::getMaxNumThreads ( )

Definition at line 409 of file DeviceCUDA.C.

References cudaCheck.

409  {
410  int dev;
411  cudaCheck(cudaGetDevice(&dev));
412  return deviceProps[dev].maxThreadsPerBlock;
413 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
int DeviceCUDA::getMergeGrids ( )
inline
int DeviceCUDA::getNextPeSharingGpu ( )
inline

Definition at line 99 of file DeviceCUDA.h.

Referenced by cuda_check_local_calc(), and cuda_check_remote_calc().

99 {return nextPeSharingGpu;}
int DeviceCUDA::getNoMergeGrids ( )
inline

Definition at line 94 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

94 {return nomergegrids;}
int DeviceCUDA::getNoStreaming ( )
inline
int DeviceCUDA::getNumDevice ( )
inline
int DeviceCUDA::getNumPesSharingDevice ( )
inline

Definition at line 101 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::assignPatches(), and CudaComputeNonbonded::assignPatches().

101 {return numPesSharingDevice;}
int DeviceCUDA::getPesSharingDevice ( const int  i)
inline

Definition at line 102 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::assignPatches(), and CudaComputeNonbonded::assignPatches().

102 {return pesSharingDevice[i];}
bool DeviceCUDA::getSharedGpu ( )
inline

Definition at line 98 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

98 {return sharedGpu;}
void DeviceCUDA::initialize ( void  )

Definition at line 85 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, cuda_args_t::ignoresharing, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

85  {
86  // Copy command-line arguments into class
87  this->devicelist = cuda_args.devicelist;
88  this->usedevicelist = cuda_args.usedevicelist;
89  this->devicesperreplica = cuda_args.devicesperreplica;
90  this->ignoresharing = cuda_args.ignoresharing;
91  this->mergegrids = cuda_args.mergegrids;
92  this->nomergegrids = cuda_args.nomergegrids;
93  this->nostreaming = cuda_args.nostreaming;
94 
95  if (CkMyPe() == 0) register_user_events();
96 
97  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
98 
99  char host[128];
100  gethostname(host, 128); host[127] = 0;
101 
102  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
103  int myRankInPhysicalNode;
104  int numPesOnPhysicalNode;
105  int *pesOnPhysicalNode;
106  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
107  &pesOnPhysicalNode,&numPesOnPhysicalNode);
108 
109  {
110  int i;
111  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
112  if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
113  i = numPesOnPhysicalNode;
114  break;
115  }
116  if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
117  }
118  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
119  CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
120  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
121  CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
122  i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
123  }
124  myRankInPhysicalNode = 0;
125  numPesOnPhysicalNode = 1;
126  pesOnPhysicalNode = new int[1];
127  pesOnPhysicalNode[0] = CkMyPe();
128  } else {
129  myRankInPhysicalNode = i;
130  }
131  }
132  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
133 
134  deviceCount = 0;
135  cudaCheck(cudaGetDeviceCount(&deviceCount));
136  if ( deviceCount <= 0 ) {
137  cudaDie("No CUDA devices found.");
138  }
139 
140  // Store all device props
141  deviceProps = new cudaDeviceProp[deviceCount];
142  for ( int i=0; i<deviceCount; ++i ) {
143  cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
144  }
145 
146  ndevices = 0;
147  int nexclusive = 0;
148  if ( usedevicelist ) {
149  devices = new int[strlen(devicelist)];
150  int i = 0;
151  while ( devicelist[i] ) {
152  ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
153  while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
154  while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
155  }
156  } else {
157  if ( ! CkMyPe() ) {
158  CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
159  }
160  devices = new int[deviceCount];
161  for ( int i=0; i<deviceCount; ++i ) {
162  int dev = i % deviceCount;
163 #if CUDA_VERSION >= 2020
164  cudaDeviceProp deviceProp;
165  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
166  if ( deviceProp.computeMode != cudaComputeModeProhibited
167  && (deviceProp.major >= 3)
168  && deviceProp.canMapHostMemory
169  && ( (deviceProp.multiProcessorCount > 2) ||
170  ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
171  ) {
172  devices[ndevices++] = dev;
173  }
174  if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
175  ++nexclusive;
176  }
177 #else
178  devices[ndevices++] = dev;
179 #endif
180  }
181  }
182 
183  if ( ! ndevices ) {
184  cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
185  }
186 
187  if ( devicesperreplica > 0 ) {
188  if ( devicesperreplica > ndevices ) {
189  NAMD_die("More devices per partition requested than devices are available");
190  }
191  int *olddevices = devices;
192  devices = new int[devicesperreplica];
193  for ( int i=0; i<devicesperreplica; ++i ) {
194  int mypart = CmiMyPartition();
195  devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
196  }
197  ndevices = devicesperreplica;
198  delete [] olddevices;
199  }
200 
201  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
202  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
203 
204  // catch multiple processes per device
205  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
206  char msg[1024];
207  sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
208  "Sharing devices between processes is inefficient. "
209  "Specify +ignoresharing (each process uses all visible devices) if "
210  "not all devices are visible to each process, otherwise "
211  "adjust number of processes to evenly divide number of devices, "
212  "specify subset of devices with +devices argument (e.g., +devices 0,2), "
213  "or multiply list shared devices (e.g., +devices 0,1,2,0).",
214  ndevices, numPesForDevice / CkMyNodeSize() );
215  NAMD_die(msg);
216  }
217 
218  {
219  // build list of devices actually used by this node
220  nodedevices = new int[ndevices];
221  nnodedevices = 0;
222  int pe = CkNodeFirst(CkMyNode());
223  int dr = -1;
224  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
225  int rank = ignoresharing ? i : CmiPhysicalRank(pe);
226  int peDeviceRank = rank * ndevices / numPesForDevice;
227  if ( peDeviceRank != dr ) {
228  dr = peDeviceRank;
229  nodedevices[nnodedevices++] = devices[dr];
230  }
231  }
232  }
233 
234  {
235  // check for devices used twice by this node
236  for ( int i=0; i<nnodedevices; ++i ) {
237  for ( int j=i+1; j<nnodedevices; ++j ) {
238  if ( nodedevices[i] == nodedevices[j] ) {
239  char msg[1024];
240  sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
241  NAMD_die(msg);
242  }
243  }
244  }
245  }
246 
247  sharedGpu = 0;
248  gpuIsMine = 1;
249  int firstPeSharingGpu = CkMyPe();
250  nextPeSharingGpu = CkMyPe();
251 
252  {
253  int dev;
254  if ( numPesForDevice > 1 ) {
255  int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
256  dev = devices[myDeviceRank];
257  masterPe = CkMyPe();
258  {
259  pesSharingDevice = new int[numPesForDevice];
260  masterPe = -1;
261  numPesSharingDevice = 0;
262  for ( int i = 0; i < numPesForDevice; ++i ) {
263  if ( i * ndevices / numPesForDevice == myDeviceRank ) {
264  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
265  pesSharingDevice[numPesSharingDevice++] = thisPe;
266  if ( masterPe < 1 ) masterPe = thisPe;
267  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
268  }
269  }
270  for ( int j = 0; j < ndevices; ++j ) {
271  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
272  }
273  }
274  if ( sharedGpu && masterPe == CkMyPe() ) {
275  if ( CmiPhysicalNodeID(masterPe) < 2 )
276  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
277  }
278  } else { // in case phys node code is lying
279  dev = devices[CkMyPe() % ndevices];
280  masterPe = CkMyPe();
281  pesSharingDevice = new int[1];
282  pesSharingDevice[0] = CkMyPe();
283  numPesSharingDevice = 1;
284  }
285 
286  deviceID = dev;
287 
288  // Store device IDs to node-wide list
289  if (CkMyRank() >= MAX_NUM_RANKS)
290  NAMD_die("Maximum number of ranks (2048) per node exceeded");
291  deviceIDList[CkMyRank()] = deviceID;
292 
293  if ( masterPe != CkMyPe() ) {
294  if ( CmiPhysicalNodeID(masterPe) < 2 )
295  CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
296  CkMyPe(), myRankInPhysicalNode, masterPe);
297  // for PME only
298  cudaCheck(cudaSetDevice(dev));
299  return;
300  }
301 
302  // Store master PEs for every device ID to node-wide list
303  if (deviceID >= MAX_NUM_DEVICES)
304  NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
305  masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros
306 
307  // disable token-passing but don't submit local until remote finished
308  // if shared_gpu is true, otherwise submit all work immediately
309  firstPeSharingGpu = CkMyPe();
310  nextPeSharingGpu = CkMyPe();
311 
312  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
313 
314  if ( dev >= deviceCount ) {
315  char buf[256];
316  sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
317  CkMyPe(), dev, host, deviceCount);
318  NAMD_die(buf);
319  }
320 
321  cudaDeviceProp deviceProp;
322  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
323  if ( CmiPhysicalNodeID(masterPe) < 2 )
324  CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
325  CkMyPe(), myRankInPhysicalNode, dev, host,
326  deviceProp.name,
327  (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
328  deviceProp.major, deviceProp.minor,
329  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
330 
331  cudaCheck(cudaSetDevice(dev));
332 
333  } // just let CUDA pick a device for us
334 
335  {
336  // if only one device then already initialized in cuda_affinity_initialize()
337  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
338  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
339  cudaGetLastError();
340  } else {
341  cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
342  }
343 
344  int dev;
345  cudaCheck(cudaGetDevice(&dev));
346  deviceID = dev;
347  cudaDeviceProp deviceProp;
348  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
349  if ( deviceProp.computeMode == cudaComputeModeProhibited )
350  cudaDie("device in prohibited mode");
351  if ( deviceProp.major < 3 )
352  cudaDie("device not of compute capability 3.0 or higher");
353  if ( ! deviceProp.canMapHostMemory )
354  cudaDie("device cannot map host memory");
355 
356  // initialize the device on this thread
357  int *dummy;
358  cudaCheck(cudaMalloc(&dummy, 4));
359  }
360 }
int devicesperreplica
Definition: DeviceCUDA.C:42
#define MAX_NUM_DEVICES
Definition: DeviceCUDA.C:70
int nomergegrids
Definition: DeviceCUDA.C:45
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:71
static __thread cuda_args_t cuda_args
Definition: DeviceCUDA.C:49
int usedevicelist
Definition: DeviceCUDA.C:41
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:68
int mergegrids
Definition: DeviceCUDA.C:44
int nostreaming
Definition: DeviceCUDA.C:46
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
Definition: CudaUtils.C:9
void NAMD_die(const char *err_msg)
Definition: common.C:83
#define MAX_NUM_RANKS
Definition: DeviceCUDA.C:67
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
int ignoresharing
Definition: DeviceCUDA.C:43
char * devicelist
Definition: DeviceCUDA.C:40
bool DeviceCUDA::one_device_per_node ( )

Definition at line 398 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

398  {
399  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
400  int numPesOnNodeSharingDevice = 0;
401  for ( int i=0; i<numPesSharingDevice; ++i ) {
402  if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
403  ++numPesOnNodeSharingDevice;
404  }
405  }
406  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
407 }
void DeviceCUDA::setGpuIsMine ( const int  val)
inline

Definition at line 105 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

105 {gpuIsMine = val;}
void DeviceCUDA::setMergeGrids ( const int  val)
inline

Definition at line 96 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

96 {mergegrids = val;}

The documentation for this class was generated from the following files: