// NOTE(review): This chunk is a garbled extraction of a CUDA/C++ header
// (include guard MIGRATIONBONDEDCUDA_H). The original file's line numbers
// are fused into the text and many interior lines are missing, so this span
// does not compile as-is. Comments below are best-effort and hedged; verify
// every claim against the full, unmangled header before relying on it.
1 #ifndef MIGRATIONBONDEDCUDA_H 2 #define MIGRATIONBONDEDCUDA_H 7 #ifdef NODEGROUP_FORCE_REGISTER 13 class MigrationBondedCUDAKernel {
// Appears to manage GPU-side migration of bonded-term "tuples" between
// patches across multiple devices (method names reference devices, home
// patches, and tuple destinations) — TODO confirm against full header.
34 int numPatchesHomePad;
// Size + scratch-buffer pair for a device-wide scan (prefix sum). The
// (size_t alloc, char* scratch) shape matches the CUB DeviceScan
// temp-storage idiom — presumably; confirm in the .cu implementation.
// The "d_" prefix suggests a device-memory allocation.
49 size_t patchDeviceScan_alloc;
50 char* d_patchDeviceScan_scratch;
// Compile-time kernel launch-configuration constants.
53 static constexpr
int kNumThreads = 128;
54 static constexpr
int kPatchNumberPad = 64;
// Number of distinct bonded tuple types handled (7); the specific types
// are not visible in this chunk.
55 static constexpr
int kNumTuplesTypes = 7;
// Launch configuration for the per-patch kernels (names suggest so).
58 static constexpr
int kPatchThreads = 1024;
59 static constexpr
int kPatchItemsPerThread = 8;
// kPatchThreads * kPatchItemsPerThread = 8192: the largest patch count a
// single thread block can cover.
60 static constexpr
int kMaxPatchesForSingleThreadBlock = kPatchThreads * kPatchItemsPerThread;
// Launch configuration for the scan kernel(s).
62 static constexpr
int kScanThreads = 512;
63 static constexpr
int kScanItemsPerThread = 4;
65 MigrationBondedCUDAKernel();
66 ~MigrationBondedCUDAKernel();
// One-time setup for a given device count and number of home patches
// (likely allocates the buffers above — confirm; declaration may be
// truncated by the extraction).
67 void setup(
const int numDevices,
const int numPatchesHome);
// Computes, per tuple, its destination after atom migration, presumably
// from the migrationDestination map plus the patchID lookup tables.
// Parameter list is cut off here (original lines 80-87 are missing).
73 void computeTupleDestination(
74 const int myDeviceIndex,
76 const int numPatchesHome,
77 const int4* migrationDestination,
78 const int* patchIDtoGPU,
79 const int* patchIDtoHomePatchIndex,
// Reserves space at each tuple's destination (truncated declaration).
88 void reserveTupleDestination(
89 const int myDeviceIndex,
90 const int numPatchesHome,
// Computes per-patch offsets — plausibly via the DeviceScan scratch
// buffer above; TODO confirm.
94 void computePatchOffsets(
95 const int numPatchesHome,
// Performs the actual tuple migration. Takes patch-map centers, which
// suggests some geometric/periodic-image adjustment — confirm; most of
// this declaration (original lines 100-108, 110-114) is missing.
99 void performTupleMigration(
109 const double3* d_patchMapCenter,
// Copies tuple data to the device (truncated declaration).
115 void copyTupleToDevice(
117 const int numPatchesHome,
// Stream-ordered operations on the per-patch tuple counts; presumably
// asynchronous, so callers must synchronize the stream before reading
// the returned counts — confirm at call sites.
127 TupleCounts fetchTupleCounts(
const int numPatchesHome, cudaStream_t stream);
128 void clearTupleCounts(
const int numPatchesHome, cudaStream_t stream);
// Copies peer-device data (likely device pointers for cross-device
// migration — TODO confirm; declaration truncated).
130 void copyPeerDataToDevice(
134 const int numDevices,
141 #endif // MIGRATIONBONDEDCUDA_H