NAMD
Classes | Functions
CudaPmeSolverUtilKernel.h File Reference

Go to the source code of this file.

Classes

struct  TransposeBatch< T >
 

Functions

void spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
 
void scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
 
void gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
 
void transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 
void transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 

Function Documentation

void batchTranspose_xyz_yzx ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1340 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoYZX().

1343  {
1344 
1345  dim3 numthread(TILEDIM, TILEROWS, 1);
1346  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
1347 
1348  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
1349  (batches, ny, nz, xsize_in, ysize_in);
1350 
1351  cudaCheck(cudaGetLastError());
1352 }
const int TILEDIM
__thread cudaStream_t stream
const int TILEROWS
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void batchTranspose_xyz_zxy ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1377 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoZXY().

1381  {
1382 
1383  dim3 numthread(TILEDIM, TILEROWS, 1);
1384  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
1385 
1386  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
1387  (batches, ny, nz, xsize_in, ysize_in);
1388 
1389  cudaCheck(cudaGetLastError());
1390 }
const int TILEDIM
__thread cudaStream_t stream
const int TILEROWS
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void gather_force ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
const float *  data,
const int  order,
float3 *  force,
const cudaTextureObject_t  gridTexObj,
cudaStream_t  stream 
)

Definition at line 1200 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, and cudaNAMD_bug().

1208  {
1209 
1210  dim3 nthread(32, 2, 1);
1211  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
1212  // dim3 nblock(npatch, 1, 1);
1213 
1214  switch(order) {
1215  case 4:
1216  if (periodicY && periodicZ)
1217  gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
1218  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1219  // recip11, recip22, recip33,
1220  data,
1221  gridTexObj,
1222  1, force);
1223  else if (periodicY)
1224  gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
1225  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1226  // recip11, recip22, recip33,
1227  data,
1228  gridTexObj,
1229  1, force);
1230  else if (periodicZ)
1231  gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
1232  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1233  // recip11, recip22, recip33,
1234  data,
1235  gridTexObj,
1236  1, force);
1237  else
1238  gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
1239  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1240  // recip11, recip22, recip33,
1241  data,
1242  gridTexObj,
1243  1, force);
1244  break;
1245 
1246  case 6:
1247  if (periodicY && periodicZ)
1248  gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
1249  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1250  // recip11, recip22, recip33,
1251  data,
1252  gridTexObj,
1253  1, force);
1254  else if (periodicY)
1255  gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
1256  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1257  // recip11, recip22, recip33,
1258  data,
1259  gridTexObj,
1260  1, force);
1261  else if (periodicZ)
1262  gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
1263  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1264  // recip11, recip22, recip33,
1265  data,
1266  gridTexObj,
1267  1, force);
1268  else
1269  gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
1270  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1271  // recip11, recip22, recip33,
1272  data,
1273  gridTexObj,
1274  1, force);
1275  break;
1276 
1277  case 8:
1278  if (periodicY && periodicZ)
1279  gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
1280  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1281  // recip11, recip22, recip33,
1282  data,
1283  gridTexObj,
1284  1, force);
1285  else if (periodicY)
1286  gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
1287  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1288  // recip11, recip22, recip33,
1289  data,
1290  gridTexObj,
1291  1, force);
1292  else if (periodicZ)
1293  gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
1294  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1295  // recip11, recip22, recip33,
1296  data,
1297  gridTexObj,
1298  1, force);
1299  else
1300  gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
1301  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1302  // recip11, recip22, recip33,
1303  data,
1304  gridTexObj,
1305  1, force);
1306  break;
1307 
1308  default:
1309  char str[128];
1310  sprintf(str, "gather_force, order %d not implemented",order);
1311  cudaNAMD_bug(str);
1312  }
1313  cudaCheck(cudaGetLastError());
1314 
1315 }
static __thread atom * atoms
__thread cudaStream_t stream
#define order
Definition: PmeRealSpace.C:235
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:31
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void scalar_sum ( const bool  orderXYZ,
const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const double  kappa,
const float  recip1x,
const float  recip1y,
const float  recip1z,
const float  recip2x,
const float  recip2y,
const float  recip2z,
const float  recip3x,
const float  recip3y,
const float  recip3z,
const double  volume,
const float *  prefac1,
const float *  prefac2,
const float *  prefac3,
const int  k2_00,
const int  k3_00,
const bool  doEnergyVirial,
double *  energy,
double *  virial,
float2 *  data,
cudaStream_t  stream 
)

Definition at line 1136 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and M_PI.

Referenced by CudaPmeKSpaceCompute::solve().

1145  {
1146 
1147  int nthread = 1024;
1148  int nblock = 64;
1149 
1150  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1151  if (doEnergyVirial) {
1152  const int warpSize = 32;
1153  shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
1154  }
1155 
1156  float piv_inv = (float)(1.0/(M_PI*volume));
1157  float fac = (float)(M_PI*M_PI/(kappa*kappa));
1158 
1159  if (doEnergyVirial) {
1160  if (orderXYZ) {
1161  scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
1162  (nfft1, nfft2, nfft3, size1, size2, size3,
1163  recip1x, recip1y, recip1z,
1164  recip2x, recip2y, recip2z,
1165  recip3x, recip3y, recip3z,
1166  prefac1, prefac2, prefac3,
1167  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1168  } else {
1169  scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
1170  (nfft1, nfft2, nfft3, size1, size2, size3,
1171  recip1x, recip1y, recip1z,
1172  recip2x, recip2y, recip2z,
1173  recip3x, recip3y, recip3z,
1174  prefac1, prefac2, prefac3,
1175  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1176  }
1177  } else {
1178  if (orderXYZ) {
1179  scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
1180  (nfft1, nfft2, nfft3, size1, size2, size3,
1181  recip1x, recip1y, recip1z,
1182  recip2x, recip2y, recip2z,
1183  recip3x, recip3y, recip3z,
1184  prefac1, prefac2, prefac3,
1185  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1186  } else {
1187  scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
1188  (nfft1, nfft2, nfft3, size1, size2, size3,
1189  recip1x, recip1y, recip1z,
1190  recip2x, recip2y, recip2z,
1191  recip3x, recip3y, recip3z,
1192  prefac1, prefac2, prefac3,
1193  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1194  }
1195  }
1196  cudaCheck(cudaGetLastError());
1197 
1198 }
#define M_PI
Definition: GoMolecule.C:39
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void spread_charge ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
float *  data,
const int  order,
cudaStream_t  stream 
)

Definition at line 1042 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, cudaNAMD_bug(), and if().

Referenced by CudaPmeRealSpaceCompute::spreadCharge().

1047  {
1048 
1049  dim3 nthread, nblock;
1050 
1051  switch(order) {
1052  case 4:
1053  nthread.x = 32;
1054  nthread.y = 4;
1055  nthread.z = 1;
1056  nblock.x = (numAtoms - 1)/nthread.x + 1;
1057  nblock.y = 1;
1058  nblock.z = 1;
1059  if (periodicY && periodicZ)
1060  spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1061  (atoms, numAtoms,
1062  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1063  else if (periodicY)
1064  spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1065  (atoms, numAtoms,
1066  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1067  else if (periodicZ)
1068  spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1069  (atoms, numAtoms,
1070  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1071  else
1072  spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1073  (atoms, numAtoms,
1074  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1075  break;
1076 
1077  case 6:
1078  nthread.x = 32;
1079  nthread.y = 7;
1080  nthread.z = 1;
1081  nblock.x = (numAtoms - 1)/nthread.x + 1;
1082  nblock.y = 1;
1083  nblock.z = 1;
1084  if (periodicY && periodicZ)
1085  spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1086  (atoms, numAtoms,
1087  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1088  else if (periodicY)
1089  spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1090  (atoms, numAtoms,
1091  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1092  else if (periodicZ)
1093  spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1094  (atoms, numAtoms,
1095  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1096  else
1097  spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1098  (atoms, numAtoms,
1099  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1100  break;
1101 
1102  case 8:
1103  nthread.x = 32;
1104  nthread.y = 16;
1105  nthread.z = 1;
1106  nblock.x = (numAtoms - 1)/nthread.x + 1;
1107  nblock.y = 1;
1108  nblock.z = 1;
1109  if (periodicY && periodicZ)
1110  spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1111  (atoms, numAtoms,
1112  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1113  else if (periodicY)
1114  spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1115  (atoms, numAtoms,
1116  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1117  else if (periodicZ)
1118  spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1119  (atoms, numAtoms,
1120  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1121  else
1122  spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1123  (atoms, numAtoms,
1124  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1125  break;
1126 
1127  default:
1128  char str[128];
1129  sprintf(str, "spread_charge, order %d not implemented",order);
1130  cudaNAMD_bug(str);
1131  }
1132  cudaCheck(cudaGetLastError());
1133 
1134 }
static __thread atom * atoms
if(ComputeNonbondedUtil::goMethod==2)
__thread cudaStream_t stream
#define order
Definition: PmeRealSpace.C:235
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:31
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void transpose_xyz_yzx ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1320 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

1324  {
1325 
1326  dim3 numthread(TILEDIM, TILEROWS, 1);
1327  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
1328 
1329  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
1330  (nx, ny, nz, xsize_in, ysize_in,
1331  ysize_out, zsize_out,
1332  data_in, data_out);
1333 
1334  cudaCheck(cudaGetLastError());
1335 }
const int TILEDIM
__thread cudaStream_t stream
const int TILEROWS
#define cudaCheck(stmt)
Definition: CudaUtils.h:79
void transpose_xyz_zxy ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1357 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

1361  {
1362 
1363  dim3 numthread(TILEDIM, TILEROWS, 1);
1364  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
1365 
1366  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
1367  (nx, ny, nz, xsize_in, ysize_in,
1368  zsize_out, xsize_out,
1369  data_in, data_out);
1370 
1371  cudaCheck(cudaGetLastError());
1372 }
const int TILEDIM
__thread cudaStream_t stream
const int TILEROWS
#define cudaCheck(stmt)
Definition: CudaUtils.h:79