NAMD
Public Member Functions | List of all members
CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:
PmeTranspose

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 
 ~CudaPmeTranspose ()
 
void setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
 
void setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
 
void transposeXYZtoYZX (const float2 *data)
 
void transposeXYZtoZXY (const float2 *data)
 
void waitStreamSynchronize ()
 
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
 
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
 
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
 
float2getBuffer (const int iblock)
 
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
- Public Member Functions inherited from PmeTranspose
 PmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
 
virtual ~PmeTranspose ()
 

Additional Inherited Members

- Protected Attributes inherited from PmeTranspose
PmeGrid pmeGrid
 
const int permutation
 
const int jblock
 
const int kblock
 
int isize
 
int jsize
 
int ksize
 
int dataSize
 
int nblock
 
std::vector< int > pos
 

Detailed Description

Definition at line 139 of file CudaPmeSolverUtil.h.

Constructor & Destructor Documentation

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 624 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

625  :
626  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
627  cudaCheck(cudaSetDevice(deviceID));
628 
629  allocate_device<float2>(&d_data, dataSize);
630 #ifndef P2P_ENABLE_3D
631  allocate_device<float2>(&d_buffer, dataSize);
632 #endif
633 
634  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
635  dataPtrsYZX.resize(nblock, NULL);
636  dataPtrsZXY.resize(nblock, NULL);
637 
638  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
639  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
640 }
const int permutation
PmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
__thread cudaStream_t stream
const int jblock
const int kblock
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
CudaPmeTranspose::~CudaPmeTranspose ( )

Definition at line 642 of file CudaPmeSolverUtil.C.

References cudaCheck.

642  {
643  cudaCheck(cudaSetDevice(deviceID));
644  deallocate_device<float2>(&d_data);
645 #ifndef P2P_ENABLE_3D
646  deallocate_device<float2>(&d_buffer);
647 #endif
648  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
649  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
650 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:79

Member Function Documentation

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 data_out 
)

Definition at line 987 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

987  {
988  cudaCheck(cudaSetDevice(deviceID));
989 
990  if (iblock >= nblock)
991  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
992 
993  // Determine block size = how much we're copying
994  int i0, i1, j0, j1, k0, k1;
995  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
996  int ni = i1-i0+1;
997  int nj = j1-j0+1;
998  int nk = k1-k0+1;
999 
1000  float2* data_in = d_buffer + i0*nj*nk;
1001 
1002  copy3D_DtoD<float2>(data_in, data_out,
1003  0, 0, 0,
1004  ni, nj,
1005  i0, 0, 0,
1006  isize, jsize,
1007  ni, nj, nk, stream);
1008 }
const int permutation
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 h_data,
const int  h_dataSize 
)

Definition at line 941 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

941  {
942  cudaCheck(cudaSetDevice(deviceID));
943 
944  if (iblock >= nblock)
945  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
946 
947  int x0 = pos[iblock];
948  int nx = pos[iblock+1] - x0;
949 
950  int copySize = jsize*ksize*nx;
951  int copyStart = jsize*ksize*x0;
952 
953  if (copyStart + copySize > dataSize)
954  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
955 
956  if (copySize > h_dataSize)
957  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
958 
959  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
960 }
std::vector< int > pos
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
Definition: common.C:123
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 data_in,
float2 data_out 
)

Definition at line 962 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

962  {
963  cudaCheck(cudaSetDevice(deviceID));
964 
965  if (iblock >= nblock)
966  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
967 
968  // Determine block size = how much we're copying
969  int i0, i1, j0, j1, k0, k1;
970  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
971  int ni = i1-i0+1;
972  int nj = j1-j0+1;
973  int nk = k1-k0+1;
974 
975  copy3D_HtoD<float2>(data_in, data_out,
976  0, 0, 0,
977  ni, nj,
978  i0, 0, 0,
979  isize, jsize,
980  ni, nj, nk, stream);
981 }
const int permutation
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1028 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1029  {
1030 
1031  int iblock_out = jblock;
1032  int jblock_out = kblock;
1033  int kblock_out = iblock;
1034 
1035  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1036 }
const int jblock
const int kblock
void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1038 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1039  {
1040 
1041  int iblock_out = kblock;
1042  int jblock_out = iblock;
1043  int kblock_out = jblock;
1044 
1045  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1046 }
const int jblock
const int kblock
float2 * CudaPmeTranspose::getBuffer ( const int  iblock)

Definition at line 1013 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1013  {
1014  if (iblock >= nblock)
1015  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1016 
1017  // Determine block size = how much we're copying
1018  int i0, i1, j0, j1, k0, k1;
1019  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1020  int ni = i1-i0+1;
1021  int nj = j1-j0+1;
1022  int nk = k1-k0+1;
1023 
1024  return d_buffer + i0*nj*nk;
1025 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 655 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

655  {
656  if (dataPtrsYZX.size() != dataPtrsNew.size())
657  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
658  for (int iblock=0;iblock < nblock;iblock++) {
659  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
660  }
661  // Build batched data structures
663 
664  for (int iperm=0;iperm < 3;iperm++) {
665  int isize_out;
666  if (iperm == 0) {
667  // Perm_Z_cX_Y:
668  // ZXY -> XYZ
669  isize_out = pmeGrid.K1/2+1;
670  } else if (iperm == 1) {
671  // Perm_cX_Y_Z:
672  // XYZ -> YZX
673  isize_out = pmeGrid.K2;
674  } else {
675  // Perm_Y_Z_cX:
676  // YZX -> ZXY
677  isize_out = pmeGrid.K3;
678  }
679 
680  int max_nx = 0;
681  for (int iblock=0;iblock < nblock;iblock++) {
682 
683  int x0 = pos[iblock];
684  int nx = pos[iblock+1] - x0;
685  max_nx = std::max(max_nx, nx);
686 
687  int width_out;
688  float2* data_out;
689  if (dataPtrsYZX[iblock] == NULL) {
690  // Local transpose, use internal buffer
691  data_out = d_data + jsize*ksize*x0;
692  width_out = jsize;
693  } else {
694  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
695  data_out = dataPtrsYZX[iblock];
696  width_out = isize_out;
697  }
698 
700  batch.nx = nx;
701  batch.ysize_out = width_out;
702  batch.zsize_out = ksize;
703  batch.data_in = data+x0;
704  batch.data_out = data_out;
705 
706  h_batchesYZX[iperm*nblock + iblock] = batch;
707 
708  // transpose_xyz_yzx(
709  // nx, jsize, ksize,
710  // isize, jsize,
711  // width_out, ksize,
712  // data+x0, data_out, stream);
713  }
714 
715  max_nx_YZX[iperm] = max_nx;
716  }
717 
718  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
719  cudaCheck(cudaStreamSynchronize(stream));
720  delete [] h_batchesYZX;
721 }
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
std::vector< int > pos
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int K3
Definition: PmeBase.h:18
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 726 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

726  {
727  if (dataPtrsZXY.size() != dataPtrsNew.size())
728  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
729  for (int iblock=0;iblock < nblock;iblock++) {
730  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
731  }
732 
733  // Build batched data structures
735 
736  for (int iperm=0;iperm < 3;iperm++) {
737  int isize_out;
738  if (iperm == 0) {
739  // Perm_cX_Y_Z:
740  // XYZ -> ZXY
741  isize_out = pmeGrid.K3;
742  } else if (iperm == 1) {
743  // Perm_Z_cX_Y:
744  // ZXY -> YZX
745  isize_out = pmeGrid.K2;
746  } else {
747  // Perm_Y_Z_cX:
748  // YZX -> XYZ
749  isize_out = pmeGrid.K1/2+1;
750  }
751 
752  int max_nx = 0;
753  for (int iblock=0;iblock < nblock;iblock++) {
754 
755  int x0 = pos[iblock];
756  int nx = pos[iblock+1] - x0;
757  max_nx = std::max(max_nx, nx);
758 
759  int width_out;
760  float2* data_out;
761  if (dataPtrsZXY[iblock] == NULL) {
762  // Local transpose, use internal buffer
763  data_out = d_data + jsize*ksize*x0;
764  width_out = ksize;
765  } else {
766  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
767  data_out = dataPtrsZXY[iblock];
768  width_out = isize_out;
769  }
770 
772  batch.nx = nx;
773  batch.zsize_out = width_out;
774  batch.xsize_out = nx;
775  batch.data_in = data+x0;
776  batch.data_out = data_out;
777 
778  h_batchesZXY[iperm*nblock + iblock] = batch;
779  }
780 
781  max_nx_ZXY[iperm] = max_nx;
782  }
783 
784  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
785  cudaCheck(cudaStreamSynchronize(stream));
786  delete [] h_batchesZXY;
787 }
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
std::vector< int > pos
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int K3
Definition: PmeBase.h:18
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::transposeXYZtoYZX ( const float2 data)
virtual

Implements PmeTranspose.

Definition at line 789 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

789  {
790  cudaCheck(cudaSetDevice(deviceID));
791 
792  int iperm;
793  switch(permutation) {
794  case Perm_Z_cX_Y:
795  // ZXY -> XYZ
796  iperm = 0;
797  break;
798  case Perm_cX_Y_Z:
799  // XYZ -> YZX
800  iperm = 1;
801  break;
802  case Perm_Y_Z_cX:
803  // YZX -> ZXY
804  iperm = 2;
805  break;
806  default:
807  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
808  break;
809  }
810 
812  nblock, batchesYZX + iperm*nblock,
813  max_nx_YZX[iperm], jsize, ksize,
814  isize, jsize, stream);
815 
816 
817 /*
818  int isize_out;
819  switch(permutation) {
820  case Perm_Z_cX_Y:
821  // ZXY -> XYZ
822  isize_out = pmeGrid.K1/2+1;
823  break;
824  case Perm_cX_Y_Z:
825  // XYZ -> YZX
826  isize_out = pmeGrid.K2;
827  break;
828  case Perm_Y_Z_cX:
829  // YZX -> ZXY
830  isize_out = pmeGrid.K3;
831  break;
832  default:
833  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
834  break;
835  }
836 
837  for (int iblock=0;iblock < nblock;iblock++) {
838 
839  int x0 = pos[iblock];
840  int nx = pos[iblock+1] - x0;
841 
842  int width_out;
843  float2* data_out;
844  if (dataPtrsYZX[iblock] == NULL) {
845  // Local transpose, use internal buffer
846  data_out = d_data + jsize*ksize*x0;
847  width_out = jsize;
848  } else {
849  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
850  data_out = dataPtrsYZX[iblock];
851  width_out = isize_out;
852  }
853 
854  transpose_xyz_yzx(
855  nx, jsize, ksize,
856  isize, jsize,
857  width_out, ksize,
858  data+x0, data_out, stream);
859  }
860 */
861 }
const int permutation
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
Definition: common.C:123
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::transposeXYZtoZXY ( const float2 data)
virtual

Implements PmeTranspose.

Definition at line 863 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

863  {
864  cudaCheck(cudaSetDevice(deviceID));
865 
866  int iperm;
867  switch(permutation) {
868  case Perm_cX_Y_Z:
869  // XYZ -> ZXY
870  iperm = 0;
871  break;
872  case Perm_Z_cX_Y:
873  // ZXY -> YZX
874  iperm = 1;
875  break;
876  case Perm_Y_Z_cX:
877  // YZX -> XYZ
878  iperm = 2;
879  break;
880  default:
881  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
882  break;
883  }
884 
886  nblock, batchesZXY + iperm*nblock,
887  max_nx_ZXY[iperm], jsize, ksize,
888  isize, jsize, stream);
889 
890 /*
891  int isize_out;
892  switch(permutation) {
893  case Perm_cX_Y_Z:
894  // XYZ -> ZXY
895  isize_out = pmeGrid.K3;
896  break;
897  case Perm_Z_cX_Y:
898  // ZXY -> YZX
899  isize_out = pmeGrid.K2;
900  break;
901  case Perm_Y_Z_cX:
902  // YZX -> XYZ
903  isize_out = pmeGrid.K1/2+1;
904  break;
905  default:
906  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
907  break;
908  }
909 
910  for (int iblock=0;iblock < nblock;iblock++) {
911 
912  int x0 = pos[iblock];
913  int nx = pos[iblock+1] - x0;
914 
915  int width_out;
916  float2* data_out;
917  if (dataPtrsZXY[iblock] == NULL) {
918  // Local transpose, use internal buffer
919  data_out = d_data + jsize*ksize*x0;
920  width_out = ksize;
921  } else {
922  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
923  data_out = dataPtrsZXY[iblock];
924  width_out = isize_out;
925  }
926 
927  transpose_xyz_zxy(
928  nx, jsize, ksize,
929  isize, jsize,
930  width_out, nx,
931  data+x0, data_out, stream);
932  }
933 */
934 }
const int permutation
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void CudaPmeTranspose::waitStreamSynchronize ( )

Definition at line 936 of file CudaPmeSolverUtil.C.

References cudaCheck.

936  {
937  cudaCheck(cudaSetDevice(deviceID));
938  cudaCheck(cudaStreamSynchronize(stream));
939 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:79

The documentation for this class was generated from the following files: