10 #include <cuda_runtime.h>
// Portability shim: map the GCC/Clang-style `__thread` thread-local storage
// qualifier onto MSVC's `__declspec(thread)` so declarations such as
// `static __thread cuda_args_t cuda_args` compile with the Microsoft toolchain.
// (Fix: removed the stray "14 " line-number token fused into the directive
// by extraction, which made it an ill-formed preprocessor line.)
#define __thread __declspec(thread)
28 cudaGetDeviceCount(&ndevs);
29 for (
int dev=0; dev < ndevs; ++dev ) {
54 "comma-delimited list of CUDA device numbers such as 0,2,1,2");
// Hard upper bound on Charm++ ranks per node tracked by the device tables;
// exceeding it aborts with "Maximum number of ranks (2048) per node exceeded".
#define MAX_NUM_RANKS 2048
// Hard upper bound on CUDA devices per node tracked by the master-PE table;
// exceeding it aborts with "Maximum number of CUDA devices (256) per node exceeded".
#define MAX_NUM_DEVICES 256
95 if (CkMyPe() == 0) register_user_events();
97 if (CkMyPe() == 0) CkPrintf(
"Info: Built with CUDA version %d\n", CUDA_VERSION);
100 gethostname(host, 128); host[127] = 0;
102 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
103 int myRankInPhysicalNode;
104 int numPesOnPhysicalNode;
105 int *pesOnPhysicalNode;
106 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
107 &pesOnPhysicalNode,&numPesOnPhysicalNode);
111 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
112 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
113 i = numPesOnPhysicalNode;
116 if ( pesOnPhysicalNode[i] == CkMyPe() )
break;
118 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
119 CkPrintf(
"Bad result from CmiGetPesOnPhysicalNode!\n");
120 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
121 CkPrintf(
"pe %d physnode rank %d of %d is %d\n", CkMyPe(),
122 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
124 myRankInPhysicalNode = 0;
125 numPesOnPhysicalNode = 1;
126 pesOnPhysicalNode =
new int[1];
127 pesOnPhysicalNode[0] = CkMyPe();
129 myRankInPhysicalNode = i;
135 cudaCheck(cudaGetDeviceCount(&deviceCount));
136 if ( deviceCount <= 0 ) {
137 cudaDie(
"No CUDA devices found.");
141 deviceProps =
new cudaDeviceProp[deviceCount];
142 for (
int i=0; i<deviceCount; ++i ) {
143 cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
148 if ( usedevicelist ) {
149 devices =
new int[strlen(devicelist)];
151 while ( devicelist[i] ) {
152 ndevices += sscanf(devicelist+i,
"%d",devices+ndevices);
153 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
154 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
158 CkPrintf(
"Did not find +devices i,j,k,... argument, using all\n");
160 devices =
new int[deviceCount];
161 for (
int i=0; i<deviceCount; ++i ) {
162 int dev = i % deviceCount;
163 #if CUDA_VERSION >= 2020
164 cudaDeviceProp deviceProp;
165 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
166 if ( deviceProp.computeMode != cudaComputeModeProhibited
167 && (deviceProp.major >= 3)
168 && deviceProp.canMapHostMemory
169 && ( (deviceProp.multiProcessorCount > 2) ||
170 ((ndevices==0)&&(CkNumNodes()==1)) )
172 devices[ndevices++] = dev;
174 if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
178 devices[ndevices++] = dev;
184 cudaDie(
"all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
187 if ( devicesperreplica > 0 ) {
188 if ( devicesperreplica > ndevices ) {
189 NAMD_die(
"More devices per partition requested than devices are available");
191 int *olddevices = devices;
192 devices =
new int[devicesperreplica];
193 for (
int i=0; i<devicesperreplica; ++i ) {
194 int mypart = CmiMyPartition();
195 devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
197 ndevices = devicesperreplica;
198 delete [] olddevices;
201 int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
202 int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
205 if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
207 sprintf(msg,
"Number of devices (%d) is not a multiple of number of processes (%d). "
208 "Sharing devices between processes is inefficient. "
209 "Specify +ignoresharing (each process uses all visible devices) if "
210 "not all devices are visible to each process, otherwise "
211 "adjust number of processes to evenly divide number of devices, "
212 "specify subset of devices with +devices argument (e.g., +devices 0,2), "
213 "or multiply list shared devices (e.g., +devices 0,1,2,0).",
214 ndevices, numPesForDevice / CkMyNodeSize() );
220 nodedevices =
new int[ndevices];
222 int pe = CkNodeFirst(CkMyNode());
224 for (
int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
225 int rank = ignoresharing ? i : CmiPhysicalRank(pe);
226 int peDeviceRank = rank * ndevices / numPesForDevice;
227 if ( peDeviceRank != dr ) {
229 nodedevices[nnodedevices++] = devices[dr];
236 for (
int i=0; i<nnodedevices; ++i ) {
237 for (
int j=i+1; j<nnodedevices; ++j ) {
238 if ( nodedevices[i] == nodedevices[j] ) {
240 sprintf(msg,
"Device %d bound twice by same process.", nodedevices[i]);
249 int firstPeSharingGpu = CkMyPe();
250 nextPeSharingGpu = CkMyPe();
254 if ( numPesForDevice > 1 ) {
255 int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
256 dev = devices[myDeviceRank];
259 pesSharingDevice =
new int[numPesForDevice];
261 numPesSharingDevice = 0;
262 for (
int i = 0; i < numPesForDevice; ++i ) {
263 if ( i * ndevices / numPesForDevice == myDeviceRank ) {
264 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
265 pesSharingDevice[numPesSharingDevice++] = thisPe;
266 if ( masterPe < 1 ) masterPe = thisPe;
270 for (
int j = 0; j < ndevices; ++j ) {
271 if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
274 if ( sharedGpu && masterPe == CkMyPe() ) {
275 if ( CmiPhysicalNodeID(masterPe) < 2 )
276 CkPrintf(
"Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
279 dev = devices[CkMyPe() % ndevices];
281 pesSharingDevice =
new int[1];
282 pesSharingDevice[0] = CkMyPe();
283 numPesSharingDevice = 1;
290 NAMD_die(
"Maximum number of ranks (2048) per node exceeded");
293 if ( masterPe != CkMyPe() ) {
294 if ( CmiPhysicalNodeID(masterPe) < 2 )
295 CkPrintf(
"Pe %d physical rank %d will use CUDA device of pe %d\n",
296 CkMyPe(), myRankInPhysicalNode, masterPe);
304 NAMD_die(
"Maximum number of CUDA devices (256) per node exceeded");
309 firstPeSharingGpu = CkMyPe();
310 nextPeSharingGpu = CkMyPe();
312 gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
314 if ( dev >= deviceCount ) {
316 sprintf(buf,
"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
317 CkMyPe(), dev, host, deviceCount);
321 cudaDeviceProp deviceProp;
322 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
323 if ( CmiPhysicalNodeID(masterPe) < 2 )
324 CkPrintf(
"Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
325 CkMyPe(), myRankInPhysicalNode, dev, host,
327 (
unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
328 deviceProp.major, deviceProp.minor,
329 deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
337 cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
338 if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
341 cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
347 cudaDeviceProp deviceProp;
348 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
349 if ( deviceProp.computeMode == cudaComputeModeProhibited )
350 cudaDie(
"device in prohibited mode");
351 if ( deviceProp.major < 3 )
352 cudaDie(
"device not of compute capability 3.0 or higher");
353 if ( ! deviceProp.canMapHostMemory )
354 cudaDie(
"device cannot map host memory");
366 if (deviceProps != NULL)
delete [] deviceProps;
367 if (devices != NULL)
delete [] devices;
368 delete [] pesSharingDevice;
389 for (
int i=0; i<numPesSharingDevice; ++i ) {
390 if ( pesSharingDevice[i] == pe )
return true;
399 if ( numPesSharingDevice != CkMyNodeSize() )
return false;
400 int numPesOnNodeSharingDevice = 0;
401 for (
int i=0; i<numPesSharingDevice; ++i ) {
402 if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
403 ++numPesOnNodeSharingDevice;
406 return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
412 return deviceProps[dev].maxThreadsPerBlock;
418 return deviceProps[dev].maxGridSize[0];
428 void DeviceCUDA::register_user_events() {
// Register a "remote"/"local" pair of Projections trace user events for CUDA
// device number DEV.  Event IDs are laid out as CUDA_EVENT_ID_BASE + 2*DEV for
// the remote event and CUDA_EVENT_ID_BASE + 2*DEV + 1 for the local event, so
// each device owns two consecutive IDs.
// (Fix: removed the stray "443 "/"444 "/"445 " line-number tokens fused into
// the directive and its continuation lines by extraction, which broke the
// macro definition.)
#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
#define CUDA_GBIS2_KERNEL_EVENT
#define REGISTER_DEVICE_EVENTS(DEV)
#define CUDA_BONDED_KERNEL_EVENT
void cuda_getargs(char **)
#define CUDA_PME_SPREADCHARGE_EVENT
int masterPeList[MAX_NUM_DEVICES]
#define CUDA_EVENT_ID_POLL_REMOTE
static __thread cuda_args_t cuda_args
int deviceIDList[MAX_NUM_RANKS]
int getMasterPeForDeviceID(int deviceID)
#define CUDA_GBIS3_KERNEL_EVENT
bool device_shared_with_pe(int pe)
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
void NAMD_die(const char *err_msg)
#define CUDA_NONBONDED_KERNEL_EVENT
__thread DeviceCUDA * deviceCUDA
#define CUDA_GBIS1_KERNEL_EVENT
bool one_device_per_node()
int getDeviceIDforPe(int pe)
#define CUDA_EVENT_ID_POLL_LOCAL
#define CUDA_PME_GATHERFORCE_EVENT