NAMD
Public Member Functions | List of all members
CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:
PmeTranspose

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 
 ~CudaPmeTranspose ()
 
void setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
 
void setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
 
void transposeXYZtoYZX (const float2 *data)
 
void transposeXYZtoZXY (const float2 *data)
 
void waitStreamSynchronize ()
 
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
 
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
 
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
 
float2getBuffer (const int iblock)
 
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
- Public Member Functions inherited from PmeTranspose
 PmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
 
virtual ~PmeTranspose ()
 

Additional Inherited Members

- Protected Attributes inherited from PmeTranspose
PmeGrid pmeGrid
 
const int permutation
 
const int jblock
 
const int kblock
 
int isize
 
int jsize
 
int ksize
 
int dataSize
 
int nblock
 
std::vector< int > pos
 

Detailed Description

Definition at line 171 of file CudaPmeSolverUtil.h.

Constructor & Destructor Documentation

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 788 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

789  :
790  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
791  cudaCheck(cudaSetDevice(deviceID));
792 
793  allocate_device<float2>(&d_data, dataSize);
794 #ifndef P2P_ENABLE_3D
795  allocate_device<float2>(&d_buffer, dataSize);
796 #endif
797 
798  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
799  dataPtrsYZX.resize(nblock, NULL);
800  dataPtrsZXY.resize(nblock, NULL);
801 
802  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
803  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
804 }
const int permutation
PmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
__thread cudaStream_t stream
const int jblock
const int kblock
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
CudaPmeTranspose::~CudaPmeTranspose ( )

Definition at line 806 of file CudaPmeSolverUtil.C.

References cudaCheck.

806  {
807  cudaCheck(cudaSetDevice(deviceID));
808  deallocate_device<float2>(&d_data);
809 #ifndef P2P_ENABLE_3D
810  deallocate_device<float2>(&d_buffer);
811 #endif
812  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
813  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
814 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95

Member Function Documentation

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 data_out 
)

Definition at line 1151 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1151  {
1152  cudaCheck(cudaSetDevice(deviceID));
1153 
1154  if (iblock >= nblock)
1155  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1156 
1157  // Determine block size = how much we're copying
1158  int i0, i1, j0, j1, k0, k1;
1159  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1160  int ni = i1-i0+1;
1161  int nj = j1-j0+1;
1162  int nk = k1-k0+1;
1163 
1164  float2* data_in = d_buffer + i0*nj*nk;
1165 
1166  copy3D_DtoD<float2>(data_in, data_out,
1167  0, 0, 0,
1168  ni, nj,
1169  i0, 0, 0,
1170  isize, jsize,
1171  ni, nj, nk, stream);
1172 }
const int permutation
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 h_data,
const int  h_dataSize 
)

Definition at line 1105 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

1105  {
1106  cudaCheck(cudaSetDevice(deviceID));
1107 
1108  if (iblock >= nblock)
1109  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1110 
1111  int x0 = pos[iblock];
1112  int nx = pos[iblock+1] - x0;
1113 
1114  int copySize = jsize*ksize*nx;
1115  int copyStart = jsize*ksize*x0;
1116 
1117  if (copyStart + copySize > dataSize)
1118  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1119 
1120  if (copySize > h_dataSize)
1121  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1122 
1123  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1124 }
std::vector< int > pos
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
Definition: common.C:123
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 data_in,
float2 data_out 
)

Definition at line 1126 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1126  {
1127  cudaCheck(cudaSetDevice(deviceID));
1128 
1129  if (iblock >= nblock)
1130  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1131 
1132  // Determine block size = how much we're copying
1133  int i0, i1, j0, j1, k0, k1;
1134  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1135  int ni = i1-i0+1;
1136  int nj = j1-j0+1;
1137  int nk = k1-k0+1;
1138 
1139  copy3D_HtoD<float2>(data_in, data_out,
1140  0, 0, 0,
1141  ni, nj,
1142  i0, 0, 0,
1143  isize, jsize,
1144  ni, nj, nk, stream);
1145 }
const int permutation
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1192 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1193  {
1194 
1195  int iblock_out = jblock;
1196  int jblock_out = kblock;
1197  int kblock_out = iblock;
1198 
1199  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1200 }
const int jblock
const int kblock
void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1202 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1203  {
1204 
1205  int iblock_out = kblock;
1206  int jblock_out = iblock;
1207  int kblock_out = jblock;
1208 
1209  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1210 }
const int jblock
const int kblock
float2 * CudaPmeTranspose::getBuffer ( const int  iblock)

Definition at line 1177 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1177  {
1178  if (iblock >= nblock)
1179  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1180 
1181  // Determine block size = how much we're copying
1182  int i0, i1, j0, j1, k0, k1;
1183  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1184  int ni = i1-i0+1;
1185  int nj = j1-j0+1;
1186  int nk = k1-k0+1;
1187 
1188  return d_buffer + i0*nj*nk;
1189 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:86
void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 819 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

819  {
820  if (dataPtrsYZX.size() != dataPtrsNew.size())
821  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
822  for (int iblock=0;iblock < nblock;iblock++) {
823  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
824  }
825  // Build batched data structures
827 
828  for (int iperm=0;iperm < 3;iperm++) {
829  int isize_out;
830  if (iperm == 0) {
831  // Perm_Z_cX_Y:
832  // ZXY -> XYZ
833  isize_out = pmeGrid.K1/2+1;
834  } else if (iperm == 1) {
835  // Perm_cX_Y_Z:
836  // XYZ -> YZX
837  isize_out = pmeGrid.K2;
838  } else {
839  // Perm_Y_Z_cX:
840  // YZX -> ZXY
841  isize_out = pmeGrid.K3;
842  }
843 
844  int max_nx = 0;
845  for (int iblock=0;iblock < nblock;iblock++) {
846 
847  int x0 = pos[iblock];
848  int nx = pos[iblock+1] - x0;
849  max_nx = std::max(max_nx, nx);
850 
851  int width_out;
852  float2* data_out;
853  if (dataPtrsYZX[iblock] == NULL) {
854  // Local transpose, use internal buffer
855  data_out = d_data + jsize*ksize*x0;
856  width_out = jsize;
857  } else {
858  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
859  data_out = dataPtrsYZX[iblock];
860  width_out = isize_out;
861  }
862 
864  batch.nx = nx;
865  batch.ysize_out = width_out;
866  batch.zsize_out = ksize;
867  batch.data_in = data+x0;
868  batch.data_out = data_out;
869 
870  h_batchesYZX[iperm*nblock + iblock] = batch;
871 
872  // transpose_xyz_yzx(
873  // nx, jsize, ksize,
874  // isize, jsize,
875  // width_out, ksize,
876  // data+x0, data_out, stream);
877  }
878 
879  max_nx_YZX[iperm] = max_nx;
880  }
881 
882  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
883  cudaCheck(cudaStreamSynchronize(stream));
884  delete [] h_batchesYZX;
885 }
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
std::vector< int > pos
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int K3
Definition: PmeBase.h:18
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 890 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

890  {
891  if (dataPtrsZXY.size() != dataPtrsNew.size())
892  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
893  for (int iblock=0;iblock < nblock;iblock++) {
894  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
895  }
896 
897  // Build batched data structures
899 
900  for (int iperm=0;iperm < 3;iperm++) {
901  int isize_out;
902  if (iperm == 0) {
903  // Perm_cX_Y_Z:
904  // XYZ -> ZXY
905  isize_out = pmeGrid.K3;
906  } else if (iperm == 1) {
907  // Perm_Z_cX_Y:
908  // ZXY -> YZX
909  isize_out = pmeGrid.K2;
910  } else {
911  // Perm_Y_Z_cX:
912  // YZX -> XYZ
913  isize_out = pmeGrid.K1/2+1;
914  }
915 
916  int max_nx = 0;
917  for (int iblock=0;iblock < nblock;iblock++) {
918 
919  int x0 = pos[iblock];
920  int nx = pos[iblock+1] - x0;
921  max_nx = std::max(max_nx, nx);
922 
923  int width_out;
924  float2* data_out;
925  if (dataPtrsZXY[iblock] == NULL) {
926  // Local transpose, use internal buffer
927  data_out = d_data + jsize*ksize*x0;
928  width_out = ksize;
929  } else {
930  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
931  data_out = dataPtrsZXY[iblock];
932  width_out = isize_out;
933  }
934 
936  batch.nx = nx;
937  batch.zsize_out = width_out;
938  batch.xsize_out = nx;
939  batch.data_in = data+x0;
940  batch.data_out = data_out;
941 
942  h_batchesZXY[iperm*nblock + iblock] = batch;
943  }
944 
945  max_nx_ZXY[iperm] = max_nx;
946  }
947 
948  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
949  cudaCheck(cudaStreamSynchronize(stream));
950  delete [] h_batchesZXY;
951 }
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
std::vector< int > pos
__thread cudaStream_t stream
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:123
int K3
Definition: PmeBase.h:18
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::transposeXYZtoYZX ( const float2 data)
virtual

Implements PmeTranspose.

Definition at line 953 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

953  {
954  cudaCheck(cudaSetDevice(deviceID));
955 
956  int iperm;
957  switch(permutation) {
958  case Perm_Z_cX_Y:
959  // ZXY -> XYZ
960  iperm = 0;
961  break;
962  case Perm_cX_Y_Z:
963  // XYZ -> YZX
964  iperm = 1;
965  break;
966  case Perm_Y_Z_cX:
967  // YZX -> ZXY
968  iperm = 2;
969  break;
970  default:
971  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
972  break;
973  }
974 
976  nblock, batchesYZX + iperm*nblock,
977  max_nx_YZX[iperm], jsize, ksize,
978  isize, jsize, stream);
979 
980 
981 /*
982  int isize_out;
983  switch(permutation) {
984  case Perm_Z_cX_Y:
985  // ZXY -> XYZ
986  isize_out = pmeGrid.K1/2+1;
987  break;
988  case Perm_cX_Y_Z:
989  // XYZ -> YZX
990  isize_out = pmeGrid.K2;
991  break;
992  case Perm_Y_Z_cX:
993  // YZX -> ZXY
994  isize_out = pmeGrid.K3;
995  break;
996  default:
997  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
998  break;
999  }
1000 
1001  for (int iblock=0;iblock < nblock;iblock++) {
1002 
1003  int x0 = pos[iblock];
1004  int nx = pos[iblock+1] - x0;
1005 
1006  int width_out;
1007  float2* data_out;
1008  if (dataPtrsYZX[iblock] == NULL) {
1009  // Local transpose, use internal buffer
1010  data_out = d_data + jsize*ksize*x0;
1011  width_out = jsize;
1012  } else {
1013  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
1014  data_out = dataPtrsYZX[iblock];
1015  width_out = isize_out;
1016  }
1017 
1018  transpose_xyz_yzx(
1019  nx, jsize, ksize,
1020  isize, jsize,
1021  width_out, ksize,
1022  data+x0, data_out, stream);
1023  }
1024 */
1025 }
const int permutation
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
Definition: common.C:123
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::transposeXYZtoZXY ( const float2 data)
virtual

Implements PmeTranspose.

Definition at line 1027 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

1027  {
1028  cudaCheck(cudaSetDevice(deviceID));
1029 
1030  int iperm;
1031  switch(permutation) {
1032  case Perm_cX_Y_Z:
1033  // XYZ -> ZXY
1034  iperm = 0;
1035  break;
1036  case Perm_Z_cX_Y:
1037  // ZXY -> YZX
1038  iperm = 1;
1039  break;
1040  case Perm_Y_Z_cX:
1041  // YZX -> XYZ
1042  iperm = 2;
1043  break;
1044  default:
1045  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1046  break;
1047  }
1048 
1050  nblock, batchesZXY + iperm*nblock,
1051  max_nx_ZXY[iperm], jsize, ksize,
1052  isize, jsize, stream);
1053 
1054 /*
1055  int isize_out;
1056  switch(permutation) {
1057  case Perm_cX_Y_Z:
1058  // XYZ -> ZXY
1059  isize_out = pmeGrid.K3;
1060  break;
1061  case Perm_Z_cX_Y:
1062  // ZXY -> YZX
1063  isize_out = pmeGrid.K2;
1064  break;
1065  case Perm_Y_Z_cX:
1066  // YZX -> XYZ
1067  isize_out = pmeGrid.K1/2+1;
1068  break;
1069  default:
1070  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1071  break;
1072  }
1073 
1074  for (int iblock=0;iblock < nblock;iblock++) {
1075 
1076  int x0 = pos[iblock];
1077  int nx = pos[iblock+1] - x0;
1078 
1079  int width_out;
1080  float2* data_out;
1081  if (dataPtrsZXY[iblock] == NULL) {
1082  // Local transpose, use internal buffer
1083  data_out = d_data + jsize*ksize*x0;
1084  width_out = ksize;
1085  } else {
1086  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
1087  data_out = dataPtrsZXY[iblock];
1088  width_out = isize_out;
1089  }
1090 
1091  transpose_xyz_zxy(
1092  nx, jsize, ksize,
1093  isize, jsize,
1094  width_out, nx,
1095  data+x0, data_out, stream);
1096  }
1097 */
1098 }
const int permutation
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
Definition: common.C:123
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void CudaPmeTranspose::waitStreamSynchronize ( )

Definition at line 1100 of file CudaPmeSolverUtil.C.

References cudaCheck.

1100  {
1101  cudaCheck(cudaSetDevice(deviceID));
1102  cudaCheck(cudaStreamSynchronize(stream));
1103 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95

The documentation for this class was generated from the following files: