#include <stdio.h>
#include <algorithm>
#include <cuda_runtime.h>
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));  // wait for the async copy before reading on the host
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}
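// Usage sketch (hypothetical sizes): after an R2C forward transform of a
// K1 x K2 x K3 grid, the complex data holds (K1/2+1)*K2*K3 elements, so a
// debug dump of a 64^3 grid would look like
//   writeComplexToDisk(d_grid, (64/2+1)*64*64, "grid_fft.dat", stream);
// where d_grid stands for whatever device pointer holds the transformed grid.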
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));  // wait for the async copy before reading on the host
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}
void CudaFFTCompute::plan3D(int *n, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
  setStream();  // bind both plans to this object's CUDA stream
}
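// Note on argument order: cufftPlan3d(&plan, nx, ny, nz, type) treats nx as
// the slowest-varying and nz as the fastest-varying dimension. The grid is
// stored with K1 (n[0]) fastest, so the sizes are passed reversed as
// n[2], n[1], n[0].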
void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
  setStream();  // bind both plans to this object's CUDA stream
}
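// Passing NULL for inembed/onembed makes cuFFT ignore the advanced data
// layout parameters (the stride/dist arguments) and assume 'howmany' tightly
// packed transforms, which matches the contiguous pencil/slab storage used
// here. The nt[] swap mirrors the slowest-to-fastest ordering of plan3D().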
void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
  setStream();  // bind both plans to this object's CUDA stream
}
void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();  // bind both plans to this object's CUDA stream
}
void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();  // bind both plans to this object's CUDA stream
}
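// Only the X pencils use real-to-complex transforms: the R2C step along X
// shrinks that dimension to K1/2+1 complex elements, so the subsequent Y and
// Z pencil FFTs (plan1DY/plan1DZ above) already operate on complex data and
// must be C2C in both directions.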
float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}
void CudaFFTCompute::forward() {
  if (forwardType == CUFFT_R2C) {
    // Execute the forward FFT, dataSrc -> dataDst
    cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));
  } else if (forwardType == CUFFT_C2C) {
    cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}
void CudaFFTCompute::backward() {
  if (backwardType == CUFFT_C2R) {
    // Execute the inverse FFT, dataDst -> dataSrc
    cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));
  } else if (backwardType == CUFFT_C2C) {
    cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}
void CudaFFTCompute::setStream() {
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}
CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, double kappa,
  int deviceID, cudaStream_t stream) :
  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa),
  deviceID(deviceID), stream(stream) {
  // Copy the B-spline prefactor (moduli) arrays to the device in single precision
  float *bm1f = new float[pmeGrid.K1];
  float *bm2f = new float[pmeGrid.K2];
  float *bm3f = new float[pmeGrid.K3];
  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
}
CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
}
void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
  // Debug output: check that the lattice is updating under constant pressure
  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
    lattice.a().x, lattice.a().y, lattice.a().z,
    lattice.b().x, lattice.b().y, lattice.b().z,
    lattice.c().x, lattice.c().y, lattice.c().z);
  const bool doEnergyVirial = (doEnergy || doVirial);

  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;

  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;
  // ... if/else chain over 'permutation' assigning nfft1..3, the recip
  // vectors, and the prefac pointers; the final else reports:
  NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);

  // Reciprocal-space convolution (scalar sum). The first argument and the
  // size1..3/volume locals come from setup elided above; the call is
  // reconstructed here to match the scalar_sum() signature.
  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
    recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
    volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
    &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
    stream);

  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
  }
}
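// Design note: energy and virial come back asynchronously. solve() only
// enqueues the D2H copy and records copyEnergyVirialEvent on the stream;
// completion is detected later by polling the event (see
// energyAndVirialCheck below), so the CPU never blocks inside solve().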
void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // Event has occurred: energy and virial have arrived on the host
    c->checkCount = 0;
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone();
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone();
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred yet
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg, "CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
        c->checkCount);
      cudaDie(errmsg, err);
    }
  } else {
    // Anything else is an error
    char errmsg[256];
    sprintf(errmsg, "in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
      c->checkCount);
    cudaDie(errmsg, err);
  }

  // Re-arm the callback
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}
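// Polling pattern: cudaEventQuery() is non-blocking, so instead of stalling
// the Charm++ scheduler with cudaEventSynchronize() the check re-arms itself
// via CcdCallFnAfter(..., 0.1) every 0.1 ms until the event fires, and gives
// up with cudaDie() after one million unsuccessful polls.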
void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  pencilXYZPtr = pencilPtr;
  pencilZPtr = NULL;
  checkCount = 0;
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
  pencilXYZPtr = NULL;
  pencilZPtr = pencilPtr;
  checkCount = 0;
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}
double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}
void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    // h_energyVirial->virial stores ZZ, ZX, ZY, XX, XY, YY
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];  // ZY mirrors YZ (virial[5]); only 6 components are stored
    virial[8] = h_energyVirial->virial[0];
  } else if (permutation == Perm_cX_Y_Z) {
    // h_energyVirial->virial stores XX, XY, XZ, YY, YZ, ZZ
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
  }
}
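// The device kernel accumulates only the 6 unique components v[0..5] of the
// symmetric virial tensor; getVirial() expands them into the full row-major
// 3x3 matrix. For Perm_cX_Y_Z (storage XX, XY, XZ, YY, YZ, ZZ):
//   [ v[0] v[1] v[2] ]
//   [ v[1] v[3] v[4] ]
//   [ v[2] v[4] v[5] ]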
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
  const int jblock, const int kblock,
  int deviceID, cudaStream_t stream) :
  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
  if (dataSize < xsize*ysize*zsize)
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  // Atom and force buffers start empty and grow on demand
  d_atoms = NULL;   d_atomsCapacity = 0;
  d_force = NULL;   d_forceCapacity = 0;
  allocate_device<float>(&data, dataSize);
  setupGridTexture(data, xsize*ysize*zsize);
  cudaCheck(cudaEventCreate(&gatherForceEvent));
}
CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  // Release the grid texture created in setupGridTexture()
  cudaCheck(cudaDestroyTextureObject(gridTexObj));
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
}
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  this->numAtoms = numAtoms;
  // Grow the device buffer with a 1.5x factor to amortize reallocations
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
}
void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // Forces have arrived on the host
    c->checkCount = 0;
    c->devicePtr->gatherForceDone();
    return;
  } else if (err == cudaErrorNotReady) {
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg, "CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
        c->checkCount);
      cudaDie(errmsg, err);
    }
  } else {
    char errmsg[256];
    sprintf(errmsg, "in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
      c->checkCount);
    cudaDie(errmsg, err);
  }
  // Re-arm the callback
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}
void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  devicePtr = devicePtr_in;
  checkCount = 0;
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}

void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}
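// Two completion paths for the force gather: gatherForceSetCallback() polls
// gatherForceEvent without blocking (cuda_gatherforce_check above), while
// waitGatherForceDone() blocks the calling thread until the event fires.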
void CudaPmeRealSpaceCompute::setupGridTexture(float* data, int data_len) {
  if (tex_data == data && tex_data_len == data_len) return;
  tex_data = data;
  tex_data_len = data_len;
  // Bind a texture object to the linear grid buffer
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;  // channel width in bits
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
}
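// The charge grid is exposed to the gather kernel as a texture object bound
// to linear device memory (one 32-bit float channel). Reads through
// gridTexObj go through the texture cache, which helps the scattered access
// pattern of per-atom force gathering. The early-return guard above avoids
// re-creating the texture when the buffer and size are unchanged.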
void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  // Re-allocate force buffer as needed
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);
  // ... gather_force kernel launch elided ...
  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
}
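// gatherForceEvent is recorded after the D2H force copy so that
// cuda_gatherforce_check()/waitGatherForceDone() can tell when both the
// gather kernel and the copy have finished on this stream.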
CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock,
  int deviceID, cudaStream_t stream) :
  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  // Without 3D peer copies, incoming peer data lands in a flat staging buffer
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  // One TransposeBatch per (permutation, block) pair
  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
}
CudaPmeTranspose::~CudaPmeTranspose() {
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
}
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }

  // Build the transpose batches on the host, then upload them in one copy
  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // ... output width for this input permutation elided ...
    } else if (iperm == 1) {
      // ...
    } else {
      // ...
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      float2* data_out;
      int width_out;
      if (dataPtrsYZX[iblock] == NULL) {
        // Local transpose writes into this device's own buffer; offset and
        // width assumed here:
        data_out = d_data + jsize*ksize*x0;
        width_out = jsize;
      } else {
        // Remote transpose: write into the peer-provided destination
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      // ... fill batch with block sizes, input pointer data+x0, and data_out ...
      h_batchesYZX[iperm*nblock + iblock] = batch;
    }

    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}
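// Design note: all transpose arguments are precomputed here into
// TransposeBatch records (3 permutations x nblock blocks) and uploaded once,
// so each later transposeXYZtoYZX() call can launch a single batched kernel
// instead of nblock individual kernels with freshly marshalled arguments.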
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }

  // Same batching scheme as setDataPtrsYZX(), for the ZXY output order
  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // ... output width for this input permutation elided ...
    } else if (iperm == 1) {
      // ...
    } else {
      // ...
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      float2* data_out;
      int width_out;
      if (dataPtrsZXY[iblock] == NULL) {
        // Local transpose writes into this device's own buffer; offset and
        // width assumed here:
        data_out = d_data + jsize*ksize*x0;
        width_out = ksize;
      } else {
        // Remote transpose: write into the peer-provided destination
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      // ... fill batch with block sizes, input pointer data+x0, and data_out ...
      h_batchesZXY[iperm*nblock + iblock] = batch;
    }

    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}
void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  // ... switch over 'permutation' selecting the YZX batch set and launching
  // batchTranspose_xyz_yzx(); an unrecognized permutation reports:
  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
}
void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  // ... switch over 'permutation' selecting the ZXY batch set and launching
  // batchTranspose_xyz_zxy(); an unrecognized permutation reports:
  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
}
void CudaPmeTranspose::waitStreamSynchronize() {
  cudaCheck(cudaStreamSynchronize(stream));
}
void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  // Block iblock occupies nx contiguous yz-slabs starting at slab x0
  int copySize  = jsize*ksize*nx;
  int copyStart = jsize*ksize*x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
}
void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  // Determine the extents of this block
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  copy3D_HtoD<float2>(data_in, data_out,
    // ... source/destination offsets and pitches elided ...
    ni, nj, nk, stream);
}
#ifndef P2P_ENABLE_3D
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  // Source is the flat staging buffer filled by a peer's copyDataToPeerDevice()
  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
    // ... source/destination offsets and pitches elided ...
    ni, nj, nk, stream);
}
#endif
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  return d_buffer + i0*nj*nk;
}
void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out,
  int permutation_out, float2* data_out) {
  // Rotate block indices into the output permutation (iblock_out/jblock_out
  // reconstructed as the cyclic counterparts of kblock_out below)
  int iblock_out = jblock;
  int jblock_out = kblock;
  int kblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}
void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out,
  int permutation_out, float2* data_out) {
  // Cyclic rotation in the opposite direction (iblock_out/kblock_out
  // reconstructed around jblock_out below)
  int iblock_out = kblock;
  int jblock_out = iblock;
  int kblock_out = jblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}
void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
  const int iblock_out, const int jblock_out, const int kblock_out,
  int deviceID_out, int permutation_out, float2* data_out) {

  // Determine the size of the block in the output decomposition
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];
  float2* data_in = d_data + jsize*ksize*x0;

#ifndef P2P_ENABLE_3D
  // Flat 1D peer copy into the destination device's staging buffer
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
    // ... source/destination offsets and pitches elided ...
    isize_out, jsize_out,
    ni, nj, nk, stream);
#endif
}
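// Without P2P_ENABLE_3D the data for a peer device is sent as one flat
// contiguous copy into that device's staging buffer (d_buffer), and the
// receiver later untangles it with copyDataDeviceToDevice(). With 3D peer
// copies enabled, copy3D_PeerDtoD() writes the block directly into place in
// the destination grid and no staging pass is needed.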