CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:
PmeTranspose

List of all members.

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 ~CudaPmeTranspose ()
void setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
void setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
void transposeXYZtoYZX (const float2 *data)
void transposeXYZtoZXY (const float2 *data)
void waitStreamSynchronize ()
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
float2getBuffer (const int iblock)
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)

Detailed Description

Definition at line 139 of file CudaPmeSolverUtil.h.


Constructor & Destructor Documentation

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 624 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

00625                                                                            : 
00626   PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
00627   cudaCheck(cudaSetDevice(deviceID));
00628 
00629   allocate_device<float2>(&d_data, dataSize);
00630 #ifndef P2P_ENABLE_3D
00631   allocate_device<float2>(&d_buffer, dataSize);
00632 #endif
00633 
00634   // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
00635   dataPtrsYZX.resize(nblock, NULL);
00636   dataPtrsZXY.resize(nblock, NULL);
00637 
00638   allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
00639   allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
00640 }

CudaPmeTranspose::~CudaPmeTranspose (  ) 

Definition at line 642 of file CudaPmeSolverUtil.C.

References cudaCheck.

00642                                     {
00643   cudaCheck(cudaSetDevice(deviceID));
00644   deallocate_device<float2>(&d_data);
00645 #ifndef P2P_ENABLE_3D
00646   deallocate_device<float2>(&d_buffer);
00647 #endif
00648   deallocate_device< TransposeBatch<float2> >(&batchesZXY);
00649   deallocate_device< TransposeBatch<float2> >(&batchesYZX);
00650 }


Member Function Documentation

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 data_out 
)

Definition at line 987 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00987                                                                                 {
00988   cudaCheck(cudaSetDevice(deviceID));
00989 
00990   if (iblock >= nblock)
00991     NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
00992 
00993   // Determine block size = how much we're copying
00994   int i0, i1, j0, j1, k0, k1;
00995   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00996   int ni = i1-i0+1;
00997   int nj = j1-j0+1;
00998   int nk = k1-k0+1;
00999 
01000   float2* data_in = d_buffer + i0*nj*nk;
01001 
01002   copy3D_DtoD<float2>(data_in, data_out,
01003     0, 0, 0,
01004     ni, nj,
01005     i0, 0, 0,
01006     isize, jsize,
01007     ni, nj, nk, stream);
01008 }

void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 h_data,
const int  h_dataSize 
)

Definition at line 941 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

00941                                                                                                   {
00942   cudaCheck(cudaSetDevice(deviceID));
00943 
00944   if (iblock >= nblock)
00945     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
00946 
00947   int x0 = pos[iblock];
00948   int nx = pos[iblock+1] - x0;
00949 
00950   int copySize  = jsize*ksize*nx;
00951   int copyStart = jsize*ksize*x0;
00952 
00953   if (copyStart + copySize > dataSize)
00954     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
00955 
00956   if (copySize > h_dataSize) 
00957     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
00958 
00959   copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
00960 }

void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 data_in,
float2 data_out 
)

Definition at line 962 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00962                                                                                                {
00963   cudaCheck(cudaSetDevice(deviceID));
00964 
00965   if (iblock >= nblock)
00966     NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
00967 
00968   // Determine block size = how much we're copying
00969   int i0, i1, j0, j1, k0, k1;
00970   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00971   int ni = i1-i0+1;
00972   int nj = j1-j0+1;
00973   int nk = k1-k0+1;
00974 
00975   copy3D_HtoD<float2>(data_in, data_out,
00976     0, 0, 0,
00977     ni, nj,
00978     i0, 0, 0,
00979     isize, jsize,
00980     ni, nj, nk, stream);
00981 }

void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1028 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01029                     {
01030 
01031   int iblock_out = jblock;
01032   int jblock_out = kblock;
01033   int kblock_out = iblock;
01034 
01035   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01036 }

void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1038 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01039                     {
01040 
01041   int iblock_out = kblock;
01042   int jblock_out = iblock;
01043   int kblock_out = jblock;
01044 
01045   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01046 }

float2 * CudaPmeTranspose::getBuffer ( const int  iblock  ) 

Definition at line 1013 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

01013                                                     {
01014   if (iblock >= nblock)
01015     NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
01016 
01017   // Determine block size = how much we're copying
01018   int i0, i1, j0, j1, k0, k1;
01019   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
01020   int ni = i1-i0+1;
01021   int nj = j1-j0+1;
01022   int nk = k1-k0+1;
01023 
01024   return d_buffer + i0*nj*nk;
01025 }

void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 655 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

00655                                                                                    {
00656   if (dataPtrsYZX.size() != dataPtrsNew.size())
00657     NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
00658   for (int iblock=0;iblock < nblock;iblock++) {
00659     dataPtrsYZX[iblock] = dataPtrsNew[iblock];
00660   }
00661   // Build batched data structures
00662   TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
00663 
00664   for (int iperm=0;iperm < 3;iperm++) {
00665     int isize_out;
00666     if (iperm == 0) {
00667       // Perm_Z_cX_Y:
00668       // ZXY -> XYZ
00669       isize_out = pmeGrid.K1/2+1;
00670     } else if (iperm == 1) {
00671       // Perm_cX_Y_Z:
00672       // XYZ -> YZX
00673       isize_out = pmeGrid.K2;
00674     } else {
00675       // Perm_Y_Z_cX:
00676       // YZX -> ZXY
00677       isize_out = pmeGrid.K3;
00678     }
00679 
00680     int max_nx = 0;
00681     for (int iblock=0;iblock < nblock;iblock++) {
00682 
00683       int x0 = pos[iblock];
00684       int nx = pos[iblock+1] - x0;
00685       max_nx = std::max(max_nx, nx);
00686 
00687       int width_out;
00688       float2* data_out;
00689       if (dataPtrsYZX[iblock] == NULL) {
00690         // Local transpose, use internal buffer
00691         data_out = d_data + jsize*ksize*x0;
00692         width_out = jsize;
00693       } else {
00694         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00695         data_out = dataPtrsYZX[iblock];
00696         width_out = isize_out;
00697       }
00698 
00699       TransposeBatch<float2> batch;
00700       batch.nx        = nx;
00701       batch.ysize_out = width_out;
00702       batch.zsize_out = ksize;
00703       batch.data_in   = data+x0;
00704       batch.data_out  = data_out;
00705 
00706       h_batchesYZX[iperm*nblock + iblock] = batch;
00707 
00708     // transpose_xyz_yzx(
00709     //   nx, jsize, ksize,
00710     //   isize, jsize,
00711     //   width_out, ksize,
00712     //   data+x0, data_out, stream);
00713     }
00714 
00715     max_nx_YZX[iperm] = max_nx;
00716   }
00717 
00718   copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
00719   cudaCheck(cudaStreamSynchronize(stream));
00720   delete [] h_batchesYZX;
00721 }

void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 726 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

00726                                                                                    {
00727   if (dataPtrsZXY.size() != dataPtrsNew.size())
00728     NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
00729   for (int iblock=0;iblock < nblock;iblock++) {
00730     dataPtrsZXY[iblock] = dataPtrsNew[iblock];
00731   }
00732 
00733   // Build batched data structures
00734   TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
00735 
00736   for (int iperm=0;iperm < 3;iperm++) {
00737     int isize_out;
00738     if (iperm == 0) {
00739       // Perm_cX_Y_Z:
00740       // XYZ -> ZXY
00741       isize_out = pmeGrid.K3;
00742     } else if (iperm == 1) {
00743       // Perm_Z_cX_Y:
00744       // ZXY -> YZX
00745       isize_out = pmeGrid.K2;
00746     } else {
00747       // Perm_Y_Z_cX:
00748       // YZX -> XYZ
00749       isize_out = pmeGrid.K1/2+1;
00750     }
00751 
00752     int max_nx = 0;
00753     for (int iblock=0;iblock < nblock;iblock++) {
00754 
00755       int x0 = pos[iblock];
00756       int nx = pos[iblock+1] - x0;
00757       max_nx = std::max(max_nx, nx);
00758 
00759       int width_out;
00760       float2* data_out;
00761       if (dataPtrsZXY[iblock] == NULL) {
00762         // Local transpose, use internal buffer
00763         data_out = d_data + jsize*ksize*x0;
00764         width_out = ksize;
00765       } else {
00766         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00767         data_out = dataPtrsZXY[iblock];
00768         width_out = isize_out;
00769       }
00770 
00771       TransposeBatch<float2> batch;
00772       batch.nx        = nx;
00773       batch.zsize_out = width_out;
00774       batch.xsize_out = nx;
00775       batch.data_in   = data+x0;
00776       batch.data_out  = data_out;
00777 
00778       h_batchesZXY[iperm*nblock + iblock] = batch;
00779     }
00780 
00781     max_nx_ZXY[iperm] = max_nx;
00782   }
00783 
00784   copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
00785   cudaCheck(cudaStreamSynchronize(stream));
00786   delete [] h_batchesZXY;
00787 }

void CudaPmeTranspose::transposeXYZtoYZX ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 789 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00789                                                            {
00790   cudaCheck(cudaSetDevice(deviceID));
00791 
00792   int iperm;
00793   switch(permutation) {
00794     case Perm_Z_cX_Y:
00795     // ZXY -> XYZ
00796     iperm = 0;
00797     break;
00798     case Perm_cX_Y_Z:
00799     // XYZ -> YZX
00800     iperm = 1;
00801     break;
00802     case Perm_Y_Z_cX:
00803     // YZX -> ZXY
00804     iperm = 2;
00805     break;
00806     default:
00807     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00808     break;
00809   }
00810 
00811   batchTranspose_xyz_yzx(
00812     nblock, batchesYZX + iperm*nblock,
00813     max_nx_YZX[iperm], jsize, ksize,
00814     isize, jsize, stream);
00815 
00816 
00817 /*
00818   int isize_out;
00819   switch(permutation) {
00820     case Perm_Z_cX_Y:
00821     // ZXY -> XYZ
00822     isize_out = pmeGrid.K1/2+1;
00823     break;
00824     case Perm_cX_Y_Z:
00825     // XYZ -> YZX
00826     isize_out = pmeGrid.K2;
00827     break;
00828     case Perm_Y_Z_cX:
00829     // YZX -> ZXY
00830     isize_out = pmeGrid.K3;
00831     break;
00832     default:
00833     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00834     break;
00835   }
00836 
00837   for (int iblock=0;iblock < nblock;iblock++) {
00838 
00839     int x0 = pos[iblock];
00840     int nx = pos[iblock+1] - x0;
00841 
00842     int width_out;
00843     float2* data_out;
00844     if (dataPtrsYZX[iblock] == NULL) {
00845       // Local transpose, use internal buffer
00846       data_out = d_data + jsize*ksize*x0;
00847       width_out = jsize;
00848     } else {
00849       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00850       data_out = dataPtrsYZX[iblock];
00851       width_out = isize_out;
00852     }
00853 
00854     transpose_xyz_yzx(
00855       nx, jsize, ksize,
00856       isize, jsize,
00857       width_out, ksize,
00858       data+x0, data_out, stream);
00859   }
00860 */
00861 }

void CudaPmeTranspose::transposeXYZtoZXY ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 863 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00863                                                            {
00864   cudaCheck(cudaSetDevice(deviceID));
00865 
00866   int iperm;
00867   switch(permutation) {
00868     case Perm_cX_Y_Z:
00869     // XYZ -> ZXY
00870     iperm = 0;
00871     break;
00872     case Perm_Z_cX_Y:
00873     // ZXY -> YZX
00874     iperm = 1;
00875     break;
00876     case Perm_Y_Z_cX:
00877     // YZX -> XYZ
00878     iperm = 2;
00879     break;
00880     default:
00881     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00882     break;
00883   }
00884 
00885   batchTranspose_xyz_zxy(
00886     nblock, batchesZXY + iperm*nblock,
00887     max_nx_ZXY[iperm], jsize, ksize,
00888     isize, jsize, stream);
00889 
00890 /*
00891   int isize_out;
00892   switch(permutation) {
00893     case Perm_cX_Y_Z:
00894     // XYZ -> ZXY
00895     isize_out = pmeGrid.K3;
00896     break;
00897     case Perm_Z_cX_Y:
00898     // ZXY -> YZX
00899     isize_out = pmeGrid.K2;
00900     break;
00901     case Perm_Y_Z_cX:
00902     // YZX -> XYZ
00903     isize_out = pmeGrid.K1/2+1;
00904     break;
00905     default:
00906     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00907     break;
00908   }
00909 
00910   for (int iblock=0;iblock < nblock;iblock++) {
00911 
00912     int x0 = pos[iblock];
00913     int nx = pos[iblock+1] - x0;
00914 
00915     int width_out;
00916     float2* data_out;
00917     if (dataPtrsZXY[iblock] == NULL) {
00918       // Local transpose, use internal buffer
00919       data_out = d_data + jsize*ksize*x0;
00920       width_out = ksize;
00921     } else {
00922       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00923       data_out = dataPtrsZXY[iblock];
00924       width_out = isize_out;
00925     }
00926 
00927     transpose_xyz_zxy(
00928       nx, jsize, ksize,
00929       isize, jsize,
00930       width_out, nx,
00931       data+x0, data_out, stream);
00932   }
00933 */
00934 }

void CudaPmeTranspose::waitStreamSynchronize (  ) 

Definition at line 936 of file CudaPmeSolverUtil.C.

References cudaCheck.

00936                                              {
00937   cudaCheck(cudaSetDevice(deviceID));
00938   cudaCheck(cudaStreamSynchronize(stream));
00939 }


The documentation for this class was generated from the following files:

Generated on 25 May 2020 for NAMD by  doxygen 1.6.1