NAMD
DeviceCUDA.C

#include "common.h"
#include "charm++.h"
#include "DeviceCUDA.h"
#include "WorkDistrib.h"
#include "CudaUtils.h"

#ifdef NAMD_CUDA

#include <cuda_runtime.h>
#include <cuda.h>

#ifdef WIN32
#define __thread __declspec(thread)
#endif

// Global storage for CUDA devices
__thread DeviceCUDA *deviceCUDA;

void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
  deviceCUDA->initialize();
}
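
// Typical startup sequence (a sketch; the exact call sites are in BackEnd.C,
// per the comment below):
//   cuda_getargs(argv);   // parse +devices and related flags
//   cuda_initialize();    // construct the per-thread DeviceCUDA and bind a device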

// kill all service threads
void cuda_finalize() {
  int ndevs = 0;
  cudaGetDeviceCount(&ndevs);
  for ( int dev=0; dev < ndevs; ++dev ) {
    cudaSetDevice(dev);
    cudaDeviceReset();
  }
}

// -------------------------------------------------------------------------------------------------
// Called from BackEnd.C by all processes to read command line arguments.
// These argument settings are used by the DeviceCUDA class.
// -------------------------------------------------------------------------------------------------
struct cuda_args_t {
  char *devicelist;
  int usedevicelist;
  int devicesperreplica;
  int ignoresharing;
  int mergegrids;
  int nomergegrids;
  int nostreaming;
};

static __thread cuda_args_t cuda_args;

void cuda_getargs(char **argv) {
  cuda_args.devicelist = NULL;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
      "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be positive\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
}
// -------------------------------------------------------------------------------------------------
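
// Example command lines exercising the flags parsed above (illustrative only):
//   namd2 +devices 0,2 ...      use only CUDA devices 0 and 2
//   namd2 +devices 0,1,2,0 ...  list a device twice to share it between processes
//   namd2 +ignoresharing ...    map PEs to devices by process rank, not physical-node rank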
65 
66 // Node-wide list of device IDs for every rank
67 #define MAX_NUM_RANKS 2048
69 // Node-wide of master PEs for every device ID
70 #define MAX_NUM_DEVICES 256
72 
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------

//
// Class creator
//
DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}

//
// Initialize device
//
void DeviceCUDA::initialize() {
  // Copy command-line arguments into class
  this->devicelist = cuda_args.devicelist;
  this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
  this->ignoresharing = cuda_args.ignoresharing;
  this->mergegrids = cuda_args.mergegrids;
  this->nomergegrids = cuda_args.nomergegrids;
  this->nostreaming = cuda_args.nostreaming;

  if (CkMyPe() == 0) register_user_events();

  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);

  char host[128];
  gethostname(host, 128);  host[127] = 0;

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode, &numPesOnPhysicalNode);

  {
    // Verify that CmiGetPesOnPhysicalNode returned a sorted list that
    // contains this PE at its physical rank; if not, fall back to
    // treating this PE as a single-PE physical node.
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
                 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    // Parse the comma-delimited +devices list; strlen(devicelist) is a
    // safe upper bound on the number of entries.
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i, "%d", devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0) && (CkNumNodes()==1)) )  // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }
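  // Example (illustrative): "+devices 0,2,1,2" parses to devices[] = {0,2,1,2}
  // and ndevices = 4; duplicate entries are legal and are how a device is
  // deliberately shared between processes.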

  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }
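  // Worked example (illustrative): with ndevices = 4 and +devicesperreplica 2,
  // partition 0 keeps list entries {0,1}, partition 1 keeps {2,3}, and
  // partition 2 wraps back around to {0,1} via the modulo above.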

  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;

  // catch multiple processes per device
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg, "Number of devices (%d) is not a multiple of number of processes (%d). "
            "Sharing devices between processes is inefficient. "
            "Specify +ignoresharing (each process uses all visible devices) if "
            "not all devices are visible to each process, otherwise "
            "adjust number of processes to evenly divide number of devices, "
            "specify subset of devices with +devices argument (e.g., +devices 0,2), "
            "or list shared devices multiple times (e.g., +devices 0,1,2,0).",
            ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }
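  // Illustrative failure case: two processes per physical node sharing three
  // devices gives ndevices % (numPesForDevice / CkMyNodeSize()) = 3 % 2 = 1,
  // which triggers the NAMD_die above.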

  {
    // build list of devices actually used by this node
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }
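  // The mapping above gives the PE of rank r the device at index
  // r * ndevices / numPesForDevice (integer division); illustratively,
  // 8 PEs over 2 devices places ranks 0-3 on devices[0] and ranks 4-7
  // on devices[1].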

  {
    // check for devices used twice by this node
    for ( int i=0; i<nnodedevices; ++i ) {
      for ( int j=i+1; j<nnodedevices; ++j ) {
        if ( nodedevices[i] == nodedevices[j] ) {
          char msg[1024];
          sprintf(msg, "Device %d bound twice by same process.", nodedevices[i]);
          NAMD_die(msg);
        }
      }
    }
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

  {
    int dev;
    if ( numPesForDevice > 1 ) {
      int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesForDevice; ++i ) {
          if ( i * ndevices / numPesForDevice == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else {  // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }

    deviceID = dev;

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
                 CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    if (deviceID >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe + 1;  // array is pre-initialized to zeros

    // disable token-passing but don't submit local until remote finished
    // if shared_gpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf, "Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
              CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
               CkMyPe(), myRankInPhysicalNode, dev, host,
               deviceProp.name,
               (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
               deviceProp.major, deviceProp.minor,
               deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    cudaCheck(cudaSetDevice(dev));

  }  // just let CUDA pick a device for us

  {
    // if only one device then already initialized in cuda_affinity_initialize()
    cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
    if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
      cudaGetLastError();
    } else {
      cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
    }

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");

    // initialize the device on this thread
    int *dummy;
    cudaCheck(cudaMalloc(&dummy, 4));
  }
}

//
// Class destructor
//
DeviceCUDA::~DeviceCUDA() {
  if (deviceProps != NULL) delete [] deviceProps;
  if (devices != NULL) delete [] devices;
  delete [] pesSharingDevice;
}

//
// Return device ID for a PE. Assumes all nodes are the same.
//
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}

//
// Returns the master PE for a device ID, or -1 if the device is not used
//
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount] - 1;
}

//
// Returns true if PE "pe" shares this device
//
bool DeviceCUDA::device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

//
// Returns true if there is a single device per node
//
bool DeviceCUDA::one_device_per_node() {
  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
  int numPesOnNodeSharingDevice = 0;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
      ++numPesOnNodeSharingDevice;
    }
  }
  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
}

int DeviceCUDA::getMaxNumThreads() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxThreadsPerBlock;
}

int DeviceCUDA::getMaxNumBlocks() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxGridSize[0];
}

/*
Event ID layout starting at CUDA_EVENT_ID_BASE:
2 types (remote & local)
16 pes per node
3 phases (1, 2, 3)
*/

void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA PME spreadCharge", CUDA_PME_SPREADCHARGE_EVENT);
  traceRegisterUserEvent("CUDA PME gatherForce", CUDA_PME_GATHERFORCE_EVENT);

  traceRegisterUserEvent("CUDA bonded", CUDA_BONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA debug", CUDA_DEBUG_EVENT);
  traceRegisterUserEvent("CUDA nonbonded", CUDA_NONBONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL_EVENT);

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}

#endif  // NAMD_CUDA