DeviceCUDA Class Reference

#include <DeviceCUDA.h>


Public Member Functions

 DeviceCUDA ()
 ~DeviceCUDA ()
void initialize ()
int getDeviceCount ()
int getNumDevice ()
bool device_shared_with_pe (int pe)
bool one_device_per_node ()
int getNoStreaming ()
int getNoMergeGrids ()
int getMergeGrids ()
void setMergeGrids (const int val)
bool getSharedGpu ()
int getNextPeSharingGpu ()
int getMasterPe ()
int getNumPesSharingDevice ()
int getPesSharingDevice (const int i)
int getGpuIsMine ()
void setGpuIsMine (const int val)
int getDeviceID ()
int getDeviceIDbyRank (int rank)
int getDeviceIDforPe (int pe)
int getMasterPeForDeviceID (int deviceID)
int getMaxNumThreads ()
int getMaxNumBlocks ()

Detailed Description

Definition at line 33 of file DeviceCUDA.h.
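
The members below track the CUDA devices visible to a node and the mapping between Charm++ PEs and those devices. As a minimal usage sketch (assuming the usual NAMD convention of a process-wide deviceCUDA pointer set up by cuda_initialize(); the function name reportDevice is hypothetical):

#include "DeviceCUDA.h"

extern __thread DeviceCUDA *deviceCUDA;  // assumed global instance, filled in by cuda_initialize()

void reportDevice() {
  int devCount = deviceCUDA->getDeviceCount();  // CUDA devices visible to this process
  int devID    = deviceCUDA->getDeviceID();     // device this PE is bound to
  int masterPe = deviceCUDA->getMasterPe();     // PE that owns the device context
  CkPrintf("Pe %d uses CUDA device %d of %d (master pe %d)\n",
           CkMyPe(), devID, devCount, masterPe);
}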


Constructor & Destructor Documentation

DeviceCUDA::DeviceCUDA (  ) 

Definition at line 80 of file DeviceCUDA.C.

00080 : deviceProps(NULL), devices(NULL) {}

DeviceCUDA::~DeviceCUDA (  ) 

Definition at line 365 of file DeviceCUDA.C.

00365                         {
00366   if (deviceProps != NULL) delete [] deviceProps;
00367   if (devices != NULL) delete [] devices;
00368   delete [] pesSharingDevice;
00369 }


Member Function Documentation

bool DeviceCUDA::device_shared_with_pe ( int  pe  ) 

Definition at line 388 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

00388                                              {
00389   for ( int i=0; i<numPesSharingDevice; ++i ) {
00390     if ( pesSharingDevice[i] == pe ) return true;
00391   }
00392   return false;
00393 }
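
For illustration, a caller such as ComputeMgr::createComputes() can combine this test with getNumPesSharingDevice() and getPesSharingDevice() to walk the PEs that map to the local GPU. A hedged sketch using the global deviceCUDA pointer (the function name listSharingPes is hypothetical):

void listSharingPes() {
  int n = deviceCUDA->getNumPesSharingDevice();
  for (int i = 0; i < n; i++) {
    int pe = deviceCUDA->getPesSharingDevice(i);
    // device_shared_with_pe(pe) is true for exactly these PEs
    CkPrintf("Pe %d shares its CUDA device with pe %d\n", CkMyPe(), pe);
  }
}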

int DeviceCUDA::getDeviceCount (  )  [inline]

Definition at line 87 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

00087 {return deviceCount;}

int DeviceCUDA::getDeviceID (  )  [inline]

int DeviceCUDA::getDeviceIDbyRank ( int  rank  )  [inline]

int DeviceCUDA::getDeviceIDforPe ( int  pe  ) 

Definition at line 374 of file DeviceCUDA.C.

References deviceIDList.

00374                                        {
00375   return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
00376 }
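
Note that the lookup is indexed by the PE's rank within the local node, so the result is only meaningful for PEs hosted on the same node as the caller. A hedged sketch (printPeDevice is hypothetical):

void printPeDevice(int pe) {
  // valid for PEs on this node, since deviceIDList is node-local
  int devID = deviceCUDA->getDeviceIDforPe(pe);
  CkPrintf("Pe %d is bound to CUDA device %d\n", pe, devID);
}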

int DeviceCUDA::getGpuIsMine (  )  [inline]

Definition at line 104 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::doWork().

00104 {return gpuIsMine;}

int DeviceCUDA::getMasterPe (  )  [inline]

int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID  ) 

Definition at line 381 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

00381                                                    {
00382   return masterPeList[deviceID % deviceCount] - 1;
00383 }
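
In initialize() the masterPeList entries are stored as masterPe + 1 so that the zero-initialized array can distinguish "no master recorded"; this accessor undoes that offset, so a return value of -1 means no master PE has been registered for the device on this node. A hedged sketch (hasMasterPe is hypothetical):

bool hasMasterPe(int devID) {
  // masterPeList stores masterPe + 1 (see initialize()), so < 0 means "not recorded"
  return deviceCUDA->getMasterPeForDeviceID(devID) >= 0;
}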

int DeviceCUDA::getMaxNumBlocks (  ) 

Definition at line 415 of file DeviceCUDA.C.

References cudaCheck.

Referenced by CudaTileListKernel::buildTileLists(), CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase2(), CudaComputeGBISKernel::GBISphase3(), CudaComputeNonbondedKernel::nonbondedForce(), and CudaComputeNonbondedKernel::reduceVirialEnergy().

00415                                 {
00416   int dev;
00417   cudaCheck(cudaGetDevice(&dev));
00418   return deviceProps[dev].maxGridSize[0];
00419 }

int DeviceCUDA::getMaxNumThreads (  ) 

Definition at line 409 of file DeviceCUDA.C.

References cudaCheck.

00409                                  {
00410   int dev;
00411   cudaCheck(cudaGetDevice(&dev));
00412   return deviceProps[dev].maxThreadsPerBlock;
00413 }
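
getMaxNumThreads() and getMaxNumBlocks() are typically used together to clamp kernel launch dimensions to the limits of the bound device. A hedged sizing sketch (myKernel, numItems, and stream are placeholders, not part of this class):

__global__ void myKernel(int numItems);  // placeholder kernel, defined elsewhere

void launchClamped(int numItems, cudaStream_t stream) {
  int nthread = deviceCUDA->getMaxNumThreads();      // maxThreadsPerBlock of the current device
  int nblock  = (numItems + nthread - 1)/nthread;    // blocks needed to cover numItems
  int maxblk  = deviceCUDA->getMaxNumBlocks();       // maxGridSize[0] of the current device
  if ( nblock > maxblk ) nblock = maxblk;            // clamp the grid dimension
  myKernel<<< nblock, nthread, 0, stream >>>(numItems);
}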

int DeviceCUDA::getMergeGrids (  )  [inline]

int DeviceCUDA::getNextPeSharingGpu (  )  [inline]

Definition at line 99 of file DeviceCUDA.h.

Referenced by cuda_check_local_calc(), and cuda_check_remote_calc().

00099 {return nextPeSharingGpu;}

int DeviceCUDA::getNoMergeGrids (  )  [inline]

Definition at line 94 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00094 {return nomergegrids;}

int DeviceCUDA::getNoStreaming (  )  [inline]

int DeviceCUDA::getNumDevice (  )  [inline]

int DeviceCUDA::getNumPesSharingDevice (  )  [inline]

Definition at line 101 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeNonbondedCUDA::assignPatches().

00101 {return numPesSharingDevice;}

int DeviceCUDA::getPesSharingDevice ( const int  i  )  [inline]

Definition at line 102 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeNonbondedCUDA::assignPatches().

00102 {return pesSharingDevice[i];}

bool DeviceCUDA::getSharedGpu (  )  [inline]

Definition at line 98 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

00098 {return sharedGpu;}

void DeviceCUDA::initialize ( void   ) 

Definition at line 85 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, cuda_args_t::ignoresharing, j, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

00085                             {
00086   // Copy command-line arguments into class
00087   this->devicelist = cuda_args.devicelist;
00088   this->usedevicelist = cuda_args.usedevicelist;
00089   this->devicesperreplica = cuda_args.devicesperreplica;
00090   this->ignoresharing = cuda_args.ignoresharing;
00091   this->mergegrids = cuda_args.mergegrids;
00092   this->nomergegrids = cuda_args.nomergegrids;
00093   this->nostreaming = cuda_args.nostreaming;
00094 
00095   if (CkMyPe() == 0) register_user_events();
00096 
00097   if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
00098 
00099   char host[128];
00100   gethostname(host, 128);  host[127] = 0;
00101 
00102   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00103   int myRankInPhysicalNode;
00104   int numPesOnPhysicalNode;
00105   int *pesOnPhysicalNode;
00106   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00107                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00108 
00109   {
00110     int i;
00111     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00112       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00113         i = numPesOnPhysicalNode;
00114         break;
00115       }
00116       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00117     }
00118     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00119       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00120       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00121         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00122           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00123       }
00124       myRankInPhysicalNode = 0;
00125       numPesOnPhysicalNode = 1;
00126       pesOnPhysicalNode = new int[1];
00127       pesOnPhysicalNode[0] = CkMyPe();
00128     } else {
00129       myRankInPhysicalNode = i;
00130     }
00131   }
00132   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00133 
00134   deviceCount = 0;
00135   cudaCheck(cudaGetDeviceCount(&deviceCount));
00136   if ( deviceCount <= 0 ) {
00137     cudaDie("No CUDA devices found.");
00138   }
00139 
00140   // Store all device props
00141   deviceProps = new cudaDeviceProp[deviceCount];
00142   for ( int i=0; i<deviceCount; ++i ) {
00143     cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
00144   }
00145 
00146   ndevices = 0;
00147   int nexclusive = 0;
00148   if ( usedevicelist ) {
00149     devices = new int[strlen(devicelist)];
00150     int i = 0;
00151     while ( devicelist[i] ) {
00152       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00153       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00154       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00155     }
00156   } else {
00157     if ( ! CkMyPe() ) {
00158       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00159     }
00160     devices = new int[deviceCount];
00161     for ( int i=0; i<deviceCount; ++i ) {
00162       int dev = i % deviceCount;
00163 #if CUDA_VERSION >= 2020
00164       cudaDeviceProp deviceProp;
00165       cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00166       if ( deviceProp.computeMode != cudaComputeModeProhibited
00167            && (deviceProp.major >= 3)
00168            && deviceProp.canMapHostMemory
00169            && ( (deviceProp.multiProcessorCount > 2) ||
00170                 ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
00171          ) {
00172         devices[ndevices++] = dev;
00173       }
00174       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00175         ++nexclusive;
00176       }
00177 #else
00178       devices[ndevices++] = dev;
00179 #endif
00180     }
00181   }
00182 
00183   if ( ! ndevices ) {
00184     cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
00185   }
00186 
00187   if ( devicesperreplica > 0 ) {
00188     if ( devicesperreplica > ndevices ) {
00189       NAMD_die("More devices per partition requested than devices are available");
00190     }
00191     int *olddevices = devices;
00192     devices = new int[devicesperreplica];
00193     for ( int i=0; i<devicesperreplica; ++i ) {
00194       int mypart = CmiMyPartition();
00195       devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
00196     }
00197     ndevices = devicesperreplica;
00198     delete [] olddevices;
00199   }
00200 
00201   int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
00202   int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
00203 
00204   // catch multiple processes per device
00205   if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
00206     char msg[1024];
00207     sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d).  "
00208             "Sharing devices between processes is inefficient.  "
00209             "Specify +ignoresharing (each process uses all visible devices) if "
00210             "not all devices are visible to each process, otherwise "
00211             "adjust number of processes to evenly divide number of devices, "
00212             "specify subset of devices with +devices argument (e.g., +devices 0,2), "
00213             "or multiply list shared devices (e.g., +devices 0,1,2,0).",
00214             ndevices, numPesForDevice / CkMyNodeSize() );
00215     NAMD_die(msg);
00216   }
00217 
00218   {
00219     // build list of devices actually used by this node
00220     nodedevices = new int[ndevices];
00221     nnodedevices = 0;
00222     int pe = CkNodeFirst(CkMyNode());
00223     int dr = -1;
00224     for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
00225       int rank = ignoresharing ? i : CmiPhysicalRank(pe);
00226       int peDeviceRank = rank * ndevices / numPesForDevice;
00227       if ( peDeviceRank != dr ) {
00228         dr = peDeviceRank;
00229         nodedevices[nnodedevices++] = devices[dr];
00230       }
00231     }
00232   }
00233 
00234   {
00235     // check for devices used twice by this node
00236     for ( int i=0; i<nnodedevices; ++i ) {
00237       for ( int j=i+1; j<nnodedevices; ++j ) {
00238         if ( nodedevices[i] == nodedevices[j] ) { 
00239           char msg[1024];
00240           sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
00241           NAMD_die(msg);
00242         }
00243       }
00244     }
00245   }
00246 
00247   sharedGpu = 0;
00248   gpuIsMine = 1;
00249   int firstPeSharingGpu = CkMyPe();
00250   nextPeSharingGpu = CkMyPe();
00251 
00252  {
00253     int dev;
00254     if ( numPesForDevice > 1 ) {
00255       int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
00256       dev = devices[myDeviceRank];
00257       masterPe = CkMyPe();
00258       {
00259         pesSharingDevice = new int[numPesForDevice];
00260         masterPe = -1;
00261         numPesSharingDevice = 0;
00262         for ( int i = 0; i < numPesForDevice; ++i ) {
00263           if ( i * ndevices / numPesForDevice == myDeviceRank ) {
00264             int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
00265             pesSharingDevice[numPesSharingDevice++] = thisPe;
00266             if ( masterPe < 1 ) masterPe = thisPe;
00267             if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
00268           }
00269         }
00270         for ( int j = 0; j < ndevices; ++j ) {
00271           if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
00272         }
00273       }
00274       if ( sharedGpu && masterPe == CkMyPe() ) {
00275         if ( CmiPhysicalNodeID(masterPe) < 2 )
00276         CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00277       }
00278     } else {  // in case phys node code is lying
00279       dev = devices[CkMyPe() % ndevices];
00280       masterPe = CkMyPe();
00281       pesSharingDevice = new int[1];
00282       pesSharingDevice[0] = CkMyPe();
00283       numPesSharingDevice = 1;
00284     }
00285 
00286     deviceID = dev;
00287 
00288     // Store device IDs to node-wide list
00289     if (CkMyRank() >= MAX_NUM_RANKS)
00290       NAMD_die("Maximum number of ranks (2048) per node exceeded");
00291     deviceIDList[CkMyRank()] = deviceID;
00292 
00293     if ( masterPe != CkMyPe() ) {
00294       if ( CmiPhysicalNodeID(masterPe) < 2 )
00295       CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00296                CkMyPe(), myRankInPhysicalNode, masterPe);
00297       // for PME only
00298       cudaCheck(cudaSetDevice(dev));
00299       return;
00300     }
00301 
00302     // Store master PEs for every device ID to node-wide list
00303     if (deviceID >= MAX_NUM_DEVICES)
00304       NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
00305     masterPeList[deviceID] = masterPe + 1;  // array is pre-initialized to zeros
00306 
00307     // disable token-passing but don't submit local until remote finished
00308     // if shared_gpu is true, otherwise submit all work immediately
00309     firstPeSharingGpu = CkMyPe();
00310     nextPeSharingGpu = CkMyPe();
00311 
00312     gpuIsMine = ( firstPeSharingGpu == CkMyPe() ); 
00313 
00314     if ( dev >= deviceCount ) {
00315       char buf[256];
00316       sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00317                 CkMyPe(), dev, host, deviceCount);
00318       NAMD_die(buf);
00319     }
00320 
00321     cudaDeviceProp deviceProp;
00322     cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00323     if ( CmiPhysicalNodeID(masterPe) < 2 )
00324         CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %luMB  Rev: %d.%d  PCI: %x:%x:%x\n",
00325                CkMyPe(), myRankInPhysicalNode, dev, host,
00326                deviceProp.name,
00327                (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
00328                deviceProp.major, deviceProp.minor,
00329                deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00330 
00331     cudaCheck(cudaSetDevice(dev));
00332 
00333   }  // just let CUDA pick a device for us
00334 
00335   {
00336     // if only one device then already initialized in cuda_affinity_initialize()
00337     cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
00338     if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
00339       cudaGetLastError();
00340     } else {
00341       cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
00342     }
00343 
00344     int dev;
00345     cudaCheck(cudaGetDevice(&dev));
00346     deviceID = dev;
00347     cudaDeviceProp deviceProp;
00348     cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00349     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00350       cudaDie("device in prohibited mode");
00351     if ( deviceProp.major < 3 )
00352       cudaDie("device not of compute capability 3.0 or higher");
00353     if ( ! deviceProp.canMapHostMemory )
00354       cudaDie("device cannot map host memory");
00355 
00356     // initialize the device on this thread
00357     int *dummy;
00358     cudaCheck(cudaMalloc(&dummy, 4));
00359   }
00360 }
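
The PE-to-device assignment above is plain integer arithmetic: a PE of rank r on a node with numPesForDevice PEs and ndevices usable devices gets device rank r * ndevices / numPesForDevice. A small self-contained illustration of that mapping (the example values are not taken from the source):

#include <cstdio>

int main() {
  const int numPesForDevice = 8;  // PEs sharing the node (example value)
  const int ndevices = 2;         // usable CUDA devices (example value)
  for (int rank = 0; rank < numPesForDevice; ++rank) {
    int deviceRank = rank * ndevices / numPesForDevice;  // same formula as initialize()
    std::printf("rank %d -> device rank %d\n", rank, deviceRank);
  }
  return 0;  // ranks 0-3 map to device rank 0, ranks 4-7 to device rank 1
}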

bool DeviceCUDA::one_device_per_node (  ) 

Definition at line 398 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

00398                                      {
00399   if ( numPesSharingDevice != CkMyNodeSize() ) return false;
00400   int numPesOnNodeSharingDevice = 0;
00401   for ( int i=0; i<numPesSharingDevice; ++i ) {
00402     if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
00403       ++numPesOnNodeSharingDevice;
00404     }
00405   }
00406   return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
00407 }
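
A hedged sketch of the kind of branch a caller like ComputePmeMgr::initialize() might take (choosePmePath is hypothetical):

void choosePmePath() {
  if ( deviceCUDA->one_device_per_node() ) {
    // every PE of this Charm++ node maps to the same CUDA device:
    // a single node-wide code path is safe
  } else {
    // several devices per node (or PEs spanning nodes): handle per PE
  }
}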

void DeviceCUDA::setGpuIsMine ( const int  val  )  [inline]

Definition at line 105 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

00105 {gpuIsMine = val;}

void DeviceCUDA::setMergeGrids ( const int  val  )  [inline]

Definition at line 96 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00096 {mergegrids = val;}


The documentation for this class was generated from the following files:

DeviceCUDA.h
DeviceCUDA.C

Generated on 8 Dec 2019 for NAMD by doxygen 1.6.1