NAMD
CudaPmeSolverUtilKernel.cu File Reference
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include "CudaUtils.h"
#include "CudaPmeSolverUtilKernel.h"


Classes

struct  RecipVirial_t
 
struct  gather_t< T, order >
 

Functions

__forceinline__ __device__ double expfunc (const double x)
 
__forceinline__ __device__ float expfunc (const float x)
 
template<typename T , typename T2 , bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel (const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const T recip1x, const T recip1y, const T recip1z, const T recip2x, const T recip2y, const T recip2z, const T recip3x, const T recip3y, const T recip3z, const T *prefac1, const T *prefac2, const T *prefac3, const T pi_ewald, const T piv_inv, const int k2_00, const int k3_00, T2 *data, double *__restrict__ energy_recip, double *__restrict__ virial)
 
template<typename T >
__forceinline__ __device__ void write_grid (const float val, const int ind, T *data)
 
template<typename T , int order>
__forceinline__ __device__ void calc_one_theta (const T w, T *theta)
 
template<typename T , typename T3 , int order>
__forceinline__ __device__ void calc_theta_dtheta (T wx, T wy, T wz, T3 *theta, T3 *dtheta)
 
template<typename AT , int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, AT *data)
 
template<typename T >
__forceinline__ __device__ void gather_force_store (const float fx, const float fy, const float fz, const int stride, const int pos, T *force)
 
template<>
__forceinline__ __device__ void gather_force_store< float > (const float fx, const float fy, const float fz, const int stride, const int pos, float *force)
 
template<>
__forceinline__ __device__ void gather_force_store< float3 > (const float fx, const float fy, const float fz, const int stride, const int pos, float3 *force)
 
template<typename CT , typename FT , int order, bool periodicY, bool periodicZ>
__global__ void gather_force (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const float *data, const cudaTextureObject_t gridTexObj, const int stride, FT *force)
 
template<typename T >
__device__ __forceinline__ void transpose_xyz_yzx_device (const int x_in, const int y_in, const int z_in, const int x_out, const int y_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void transpose_xyz_yzx_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void batchTranspose_xyz_yzx_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
 
template<typename T >
__device__ __forceinline__ void transpose_xyz_zxy_device (const int x_in, const int y_in, const int z_in, const int x_out, const int z_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void transpose_xyz_zxy_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void batchTranspose_xyz_zxy_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
 
void spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
 
void scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
 
void gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
 
void transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 
void transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 

Variables

const int TILEDIM = 32
 
const int TILEROWS = 8
 

Function Documentation

void batchTranspose_xyz_yzx ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1340 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoYZX().

1343  {
1344 
1345  dim3 numthread(TILEDIM, TILEROWS, 1);
1346  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
1347 
1348  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
1349  (batches, ny, nz, xsize_in, ysize_in);
1350 
1351  cudaCheck(cudaGetLastError());
1352 }
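The batches argument must point to device memory; each TransposeBatch<float2> entry carries the per-pencil geometry and pointers read by the kernel (nx, ysize_out, zsize_out, data_in, data_out), while max_nx (the largest per-batch nx) sizes the x block count. A minimal host-side call sketch, assuming the caller owns the device grids d_in[b]/d_out[b] and the usual sizes (all names here are illustrative, not part of this file):

  // Fill one descriptor per pencil; field names follow the kernel body below.
  TransposeBatch<float2> h_batches[2];
  for (int b = 0; b < 2; b++) {
    h_batches[b].nx        = nx;         // x extent of this pencil
    h_batches[b].ysize_out = ysize_out;  // y stride of the yzx-ordered output
    h_batches[b].zsize_out = zsize_out;  // z stride of the yzx-ordered output
    h_batches[b].data_in   = d_in[b];    // device pointer, xyz layout
    h_batches[b].data_out  = d_out[b];   // device pointer, yzx layout
  }
  TransposeBatch<float2>* d_batches;
  cudaCheck(cudaMalloc((void**)&d_batches, 2*sizeof(TransposeBatch<float2>)));
  cudaCheck(cudaMemcpyAsync(d_batches, h_batches, 2*sizeof(TransposeBatch<float2>),
                            cudaMemcpyHostToDevice, stream));
  batchTranspose_xyz_yzx(2, d_batches, nx, ny, nz, xsize_in, ysize_in, stream);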
template<typename T >
__global__ void batchTranspose_xyz_yzx_kernel ( const TransposeBatch< T > *  batches,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in 
)

Definition at line 853 of file CudaPmeSolverUtilKernel.cu.

References TransposeBatch< T >::nx, and TILEDIM.

856  {
857 
858  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
859  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
860  int z_in = (blockIdx.z % nz) + threadIdx.z;
861 
862  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
863  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
864 
865  int bi = blockIdx.z/nz;
866 
867  TransposeBatch<T> batch = batches[bi];
868  int nx = batch.nx;
869  int ysize_out = batch.ysize_out;
870  int zsize_out = batch.zsize_out;
871  T* data_in = batch.data_in;
872  T* data_out = batch.data_out;
873 
874  transpose_xyz_yzx_device<T>(
875  x_in, y_in, z_in,
876  x_out, y_out,
877  nx, ny, nz,
878  xsize_in, ysize_in,
879  ysize_out, zsize_out,
880  data_in, data_out);
881 
882 }
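Note that blockIdx.z does double duty here: bi = blockIdx.z/nz selects the batch and blockIdx.z % nz selects the z slice within it, which is why the host wrapper above launches nz*numBatches blocks in the z dimension.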
void batchTranspose_xyz_zxy ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1377 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoZXY().

1381  {
1382 
1383  dim3 numthread(TILEDIM, TILEROWS, 1);
1384  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
1385 
1386  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
1387  (batches, ny, nz, xsize_in, ysize_in);
1388 
1389  cudaCheck(cudaGetLastError());
1390 }
template<typename T >
__global__ void batchTranspose_xyz_zxy_kernel ( const TransposeBatch< T > *  batches,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in 
)

Definition at line 1008 of file CudaPmeSolverUtilKernel.cu.

References TransposeBatch< T >::nx, and TILEDIM.

1011  {
1012 
1013  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1014  int y_in = (blockIdx.z % ny) + threadIdx.z;
1015  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1016 
1017  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1018  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1019 
1020  int bi = blockIdx.z/ny;
1021 
1022  TransposeBatch<T> batch = batches[bi];
1023  int nx = batch.nx;
1024  int zsize_out = batch.zsize_out;
1025  int xsize_out = batch.xsize_out;
1026  T* data_in = batch.data_in;
1027  T* data_out = batch.data_out;
1028 
1029  transpose_xyz_zxy_device<T>(
1030  x_in, y_in, z_in, x_out, z_out,
1031  nx, ny, nz,
1032  xsize_in, ysize_in,
1033  zsize_out, xsize_out,
1034  data_in, data_out);
1035 
1036 }
template<typename T , int order>
__forceinline__ __device__ void calc_one_theta ( const T  w,
T *  theta 
)

Definition at line 314 of file CudaPmeSolverUtilKernel.cu.

References order.

314  {
315 
316  theta[order-1] = ((T)0);
317  theta[1] = w;
318  theta[0] = ((T)1) - w;
319 
320 #pragma unroll
321  for (int k=3;k <= order-1;k++) {
322  T div = ((T)1) / (T)(k-1);
323  theta[k-1] = div*w*theta[k-2];
324 #pragma unroll
325  for (int j=1;j <= k-2;j++) {
326  theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
327  }
328  theta[0] = div*(((T)1) - w)*theta[0];
329  }
330 
331  //--- one more recursion
332  T div = ((T)1) / (T)(order-1);
333  theta[order-1] = div*w*theta[order-2];
334 #pragma unroll
335  for (int j=1;j <= order-2;j++) {
336  theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
337  }
338 
339  theta[0] = div*(((T)1) - w)*theta[0];
340 }
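The recursion builds the order-n B-spline weights in place: after initialization it raises the order one step at a time, and the final unrolled pass completes order n. As a sanity check, the weights always sum to one. A host-side sketch of the order-4 case (not part of this file, useful for unit-testing the device code):

  #include <cassert>
  #include <cmath>

  // Order-4 (cubic) B-spline weights by the same recursion, specialized.
  void calc_one_theta_order4(float w, float theta[4]) {
    theta[3] = 0.0f;
    theta[1] = w;
    theta[0] = 1.0f - w;
    // k = 3 step of the recursion
    theta[2] = 0.5f*w*theta[1];
    theta[1] = 0.5f*((w + 1.0f)*theta[0] + (2.0f - w)*theta[1]);
    theta[0] = 0.5f*(1.0f - w)*theta[0];
    // final (k = order = 4) step
    const float div = 1.0f/3.0f;
    theta[3] = div*w*theta[2];
    theta[2] = div*((w + 1.0f)*theta[1] + (3.0f - w)*theta[2]);
    theta[1] = div*((w + 2.0f)*theta[0] + (2.0f - w)*theta[1]);
    theta[0] = div*(1.0f - w)*theta[0];
    // B-splines form a partition of unity
    assert(fabsf(theta[0] + theta[1] + theta[2] + theta[3] - 1.0f) < 1e-6f);
  }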
template<typename T , typename T3 , int order>
__forceinline__ __device__ void calc_theta_dtheta ( T  wx,
T  wy,
T  wz,
T3 *  theta,
T3 *  dtheta 
)

Definition at line 346 of file CudaPmeSolverUtilKernel.cu.

References order.

346  {
347 
348  theta[order-1].x = ((T)0);
349  theta[order-1].y = ((T)0);
350  theta[order-1].z = ((T)0);
351  theta[1].x = wx;
352  theta[1].y = wy;
353  theta[1].z = wz;
354  theta[0].x = ((T)1) - wx;
355  theta[0].y = ((T)1) - wy;
356  theta[0].z = ((T)1) - wz;
357 
358 #pragma unroll
359  for (int k=3;k <= order-1;k++) {
360  T div = ((T)1) / (T)(k-1);
361  theta[k-1].x = div*wx*theta[k-2].x;
362  theta[k-1].y = div*wy*theta[k-2].y;
363  theta[k-1].z = div*wz*theta[k-2].z;
364 #pragma unroll
365  for (int j=1;j <= k-2;j++) {
366  theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
367  theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
368  theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
369  }
370  theta[0].x = div*(((T)1) - wx)*theta[0].x;
371  theta[0].y = div*(((T)1) - wy)*theta[0].y;
372  theta[0].z = div*(((T)1) - wz)*theta[0].z;
373  }
374 
375  //--- perform standard b-spline differentiation
376  dtheta[0].x = -theta[0].x;
377  dtheta[0].y = -theta[0].y;
378  dtheta[0].z = -theta[0].z;
379 #pragma unroll
380  for (int j=2;j <= order;j++) {
381  dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
382  dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
383  dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
384  }
385 
386  //--- one more recursion
387  T div = ((T)1) / (T)(order-1);
388  theta[order-1].x = div*wx*theta[order-2].x;
389  theta[order-1].y = div*wy*theta[order-2].y;
390  theta[order-1].z = div*wz*theta[order-2].z;
391 #pragma unroll
392  for (int j=1;j <= order-2;j++) {
393  theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
394  theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
395  theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
396  }
397 
398  theta[0].x = div*(((T)1) - wx)*theta[0].x;
399  theta[0].y = div*(((T)1) - wy)*theta[0].y;
400  theta[0].z = div*(((T)1) - wz)*theta[0].z;
401 }
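The dtheta block uses the B-spline derivative identity dM_n(w)/dw = M_{n-1}(w) - M_{n-1}(w-1): the differences are taken while theta still holds the order-(n-1) values, and only then does the final recursion promote theta itself to order n.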
__forceinline__ __device__ double expfunc ( const double  x)

Definition at line 46 of file CudaPmeSolverUtilKernel.cu.

Referenced by scalar_sum_kernel().

46  {
47  return exp(x);
48 }
__forceinline__ __device__ float expfunc ( const float  x)

Definition at line 50 of file CudaPmeSolverUtilKernel.cu.

50  {
51  return __expf(x);
52 }
template<typename CT , typename FT , int order, bool periodicY, bool periodicZ>
__global__ void gather_force ( const float4 *  xyzq,
const int  ncoord,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const float *  data,
const cudaTextureObject_t  gridTexObj,
const int  stride,
FT *  force 
)

Definition at line 569 of file CudaPmeSolverUtilKernel.cu.

References __ldg, BLOCK_SYNC, order, WARP_BALLOT, WARP_FULL_MASK, and WARP_SHUFFLE.

Referenced by CudaPmeRealSpaceCompute::gatherForce().

577  {
578 
579  const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
580 
581  // Shared memory
582  __shared__ gather_t<CT, order> shmem[32];
583 
584  const int pos = blockIdx.x*blockDim.x + threadIdx.x;
585  const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
586 
587  // Load atom data into shared memory
588  if (pos < pos_end && threadIdx.y == 0) {
589 
590  float4 xyzqi = xyzq[pos];
591  float x = xyzqi.x;
592  float y = xyzqi.y;
593  float z = xyzqi.z;
594  float q = xyzqi.w;
595 
596  float frx = ((float)nfftx)*x;
597  float fry = ((float)nffty)*y;
598  float frz = ((float)nfftz)*z;
599 
600  int frxi = (int)frx;
601  int fryi = (int)fry;
602  int frzi = (int)frz;
603 
604  float wx = frx - (float)frxi;
605  float wy = fry - (float)fryi;
606  float wz = frz - (float)frzi;
607 
608  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
609  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
610 
611  shmem[threadIdx.x].ix = frxi;
612  shmem[threadIdx.x].iy = fryi - y00;
613  shmem[threadIdx.x].iz = frzi - z00;
614  shmem[threadIdx.x].charge = q;
615 
616  float3 theta_tmp[order];
617  float3 dtheta_tmp[order];
618  calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
619 
620 #pragma unroll
621  for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
622 
623 #pragma unroll
624  for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
625 
626 #pragma unroll
627  for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
628 
629 #pragma unroll
630  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
631 
632 #pragma unroll
633  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
634 
635 #pragma unroll
636  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
637 
638  }
639  BLOCK_SYNC;
640 
641  // We divide the order x order x order cube into 8 sub-cubes.
642  // Each sub-cube is handled by a single thread
643  // The size of the sub-cubes is:
644  // order=4 : 2x2x2
645  // order=6 : 3x3x3
646  // order=8 : 4x4x4
647  const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
648  // Calculate the starting index on the sub-cube for this thread
649  // tid = 0...63
650  const int t = (tid % 8); // sub-cube index (0...7)
651  // t = (tx0 + ty0*2 + tz0*4)/nsc
652  // (tx0, ty0, tz0) gives the starting index of the nsc x nsc x nsc sub-cube
653  const int tz0 = (t / 4)*nsc;
654  const int ty0 = ((t / 2) % 2)*nsc;
655  const int tx0 = (t % 2)*nsc;
656 
657  //
658  // Calculate forces for 32 atoms. We have 32*2 = 64 threads
659  // Loop is iterated 4 times:
660  // (iterations)
661  // Threads 0...7 = atoms 0, 8, 16, 24
662  // Threads 8...15 = atoms 1, 9, 17, 25
663  // Threads 16...23 = atoms 2, 10, 18, 26
664  // ...
665  // Threads 56...63 = atoms 7, 15, 23, 31
666  //
667 
668  int base = tid/8;
669  const int base_end = pos_end - blockIdx.x*blockDim.x;
670 
671  // Make sure to mask out any threads that are not running the loop.
672  // This will happen if the number of atoms is not a multiple of 32.
673  int warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
674 
675  while (base < base_end) {
676 
677  float f1 = 0.0f;
678  float f2 = 0.0f;
679  float f3 = 0.0f;
680  int ix0 = shmem[base].ix;
681  int iy0 = shmem[base].iy;
682  int iz0 = shmem[base].iz;
683 
684  // Each thread calculates a nsc x nsc x nsc sub-cube
685 #pragma unroll
686  for (int i=0;i < nsc*nsc*nsc;i++) {
687  int tz = tz0 + (i/(nsc*nsc));
688  int ty = ty0 + ((i/nsc) % nsc);
689  int tx = tx0 + (i % nsc);
690 
691  int ix = ix0 + tx;
692  int iy = iy0 + ty;
693  int iz = iz0 + tz;
694  if (ix >= nfftx) ix -= nfftx;
695 
696  if (periodicY && (iy >= nffty)) iy -= nffty;
697  if (!periodicY && (iy < 0 || iy >= ysize)) continue;
698 
699  if (periodicZ && (iz >= nfftz)) iz -= nfftz;
700  if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
701 
702  int ind = ix + (iy + iz*ysize)*xdim;
703 
704 #if __CUDA_ARCH__ >= 350
705  float q0 = __ldg(&data[ind]);
706 #else
707  float q0 = tex1Dfetch<float>(gridTexObj, ind);
708 #endif
709  float thx0 = shmem[base].thetax[tx];
710  float thy0 = shmem[base].thetay[ty];
711  float thz0 = shmem[base].thetaz[tz];
712  float dthx0 = shmem[base].dthetax[tx];
713  float dthy0 = shmem[base].dthetay[ty];
714  float dthz0 = shmem[base].dthetaz[tz];
715  f1 += dthx0 * thy0 * thz0 * q0;
716  f2 += thx0 * dthy0 * thz0 * q0;
717  f3 += thx0 * thy0 * dthz0 * q0;
718  }
719 
720  //-------------------------
721 
722  // Reduce
723  const int i = threadIdx.x & 7;
724 
725  f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
726  f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
727  f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
728 
729  f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
730  f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
731  f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
732 
733  f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
734  f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
735  f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
736 
737  if (i == 0) {
738  shmem[base].f1 = f1;
739  shmem[base].f2 = f2;
740  shmem[base].f3 = f3;
741  }
742 
743  base += 8;
744  warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
745  }
746 
747  // Write forces
748  BLOCK_SYNC;
749  if (pos < pos_end && threadIdx.y == 0) {
750  float f1 = shmem[threadIdx.x].f1;
751  float f2 = shmem[threadIdx.x].f2;
752  float f3 = shmem[threadIdx.x].f3;
753  float q = -shmem[threadIdx.x].charge; //*ccelec_float;
754  // float fx = q*recip1*f1*nfftx;
755  // float fy = q*recip2*f2*nffty;
756  // float fz = q*recip3*f3*nfftz;
757  float fx = q*f1*nfftx;
758  float fy = q*f2*nffty;
759  float fz = q*f3*nfftz;
760  gather_force_store<FT>(fx, fy, fz, stride, pos, force);
761  }
762 
763 }
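The reduction is a butterfly over groups of 8 lanes, so after the three shuffle rounds every lane of a group holds the full sub-cube sum for its atom. A minimal sketch of the same pattern written directly against __shfl_sync (WARP_SHUFFLE appears to be NAMD's portability wrapper around it); with width 8, out-of-range source lanes wrap modulo 8 within the group:

  // Sum val across each aligned 8-lane group; every lane gets the total.
  __device__ float group8_sum(unsigned int mask, float val) {
    const int i = threadIdx.x & 7;
    for (int offset = 4; offset >= 1; offset >>= 1)
      val += __shfl_sync(mask, val, i + offset, 8);
    return val;
  }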
void gather_force ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
const float *  data,
const int  order,
float3 *  force,
const cudaTextureObject_t  gridTexObj,
cudaStream_t  stream 
)

Definition at line 1200 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, and cudaNAMD_bug().

1208  {
1209 
1210  dim3 nthread(32, 2, 1);
1211  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
1212  // dim3 nblock(npatch, 1, 1);
1213 
1214  switch(order) {
1215  case 4:
1216  if (periodicY && periodicZ)
1217  gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
1218  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1219  // recip11, recip22, recip33,
1220  data,
1221  gridTexObj,
1222  1, force);
1223  else if (periodicY)
1224  gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
1225  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1226  // recip11, recip22, recip33,
1227  data,
1228  gridTexObj,
1229  1, force);
1230  else if (periodicZ)
1231  gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
1232  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1233  // recip11, recip22, recip33,
1234  data,
1235  gridTexObj,
1236  1, force);
1237  else
1238  gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
1239  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1240  // recip11, recip22, recip33,
1241  data,
1242  gridTexObj,
1243  1, force);
1244  break;
1245 
1246  case 6:
1247  if (periodicY && periodicZ)
1248  gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
1249  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1250  // recip11, recip22, recip33,
1251  data,
1252  gridTexObj,
1253  1, force);
1254  else if (periodicY)
1255  gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
1256  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1257  // recip11, recip22, recip33,
1258  data,
1259  gridTexObj,
1260  1, force);
1261  else if (periodicZ)
1262  gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
1263  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1264  // recip11, recip22, recip33,
1265  data,
1266  gridTexObj,
1267  1, force);
1268  else
1269  gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
1270  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1271  // recip11, recip22, recip33,
1272  data,
1273  gridTexObj,
1274  1, force);
1275  break;
1276 
1277  case 8:
1278  if (periodicY && periodicZ)
1279  gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
1280  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1281  // recip11, recip22, recip33,
1282  data,
1283  gridTexObj,
1284  1, force);
1285  else if (periodicY)
1286  gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
1287  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1288  // recip11, recip22, recip33,
1289  data,
1290  gridTexObj,
1291  1, force);
1292  else if (periodicZ)
1293  gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
1294  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1295  // recip11, recip22, recip33,
1296  data,
1297  gridTexObj,
1298  1, force);
1299  else
1300  gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
1301  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1302  // recip11, recip22, recip33,
1303  data,
1304  gridTexObj,
1305  1, force);
1306  break;
1307 
1308  default:
1309  char str[128];
1310  sprintf(str, "gather_force, order %d not implemented",order);
1311  cudaNAMD_bug(str);
1312  }
1313  cudaCheck(cudaGetLastError());
1314 
1315 }
template<typename T >
__forceinline__ __device__ void gather_force_store ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
T *  force 
)

Definition at line 517 of file CudaPmeSolverUtilKernel.cu.

519  {
520 }
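The unspecialized template deliberately stores nothing; only the float and float3 specializations below write forces, so instantiating gather_force with any other FT silently drops the result.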
template<>
__forceinline__ __device__ void gather_force_store< float > ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
float *  force 
)

Definition at line 524 of file CudaPmeSolverUtilKernel.cu.

526  {
527  // Store into non-strided float XYZ array
528  force[pos] = fx;
529  force[pos+stride] = fy;
530  force[pos+stride*2] = fz;
531 }
template<>
__forceinline__ __device__ void gather_force_store< float3 > ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
float3 *  force 
)

Definition at line 535 of file CudaPmeSolverUtilKernel.cu.

537  {
538  // Store into non-strided "float3" array
539  force[pos].x = fx;
540  force[pos].y = fy;
541  force[pos].z = fz;
542 }
void scalar_sum ( const bool  orderXYZ,
const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const double  kappa,
const float  recip1x,
const float  recip1y,
const float  recip1z,
const float  recip2x,
const float  recip2y,
const float  recip2z,
const float  recip3x,
const float  recip3y,
const float  recip3z,
const double  volume,
const float *  prefac1,
const float *  prefac2,
const float *  prefac3,
const int  k2_00,
const int  k3_00,
const bool  doEnergyVirial,
double *  energy,
double *  virial,
float2 *  data,
cudaStream_t  stream 
)

Definition at line 1136 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and M_PI.

Referenced by CudaPmeKSpaceCompute::solve().

1145  {
1146 
1147  int nthread = 1024;
1148  int nblock = 64;
1149 
1150  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1151  if (doEnergyVirial) {
1152  const int warpSize = 32;
1153  shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
1154  }
1155 
1156  float piv_inv = (float)(1.0/(M_PI*volume));
1157  float fac = (float)(M_PI*M_PI/(kappa*kappa));
1158 
1159  if (doEnergyVirial) {
1160  if (orderXYZ) {
1161  scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
1162  (nfft1, nfft2, nfft3, size1, size2, size3,
1163  recip1x, recip1y, recip1z,
1164  recip2x, recip2y, recip2z,
1165  recip3x, recip3y, recip3z,
1166  prefac1, prefac2, prefac3,
1167  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1168  } else {
1169  scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
1170  (nfft1, nfft2, nfft3, size1, size2, size3,
1171  recip1x, recip1y, recip1z,
1172  recip2x, recip2y, recip2z,
1173  recip3x, recip3y, recip3z,
1174  prefac1, prefac2, prefac3,
1175  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1176  }
1177  } else {
1178  if (orderXYZ) {
1179  scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
1180  (nfft1, nfft2, nfft3, size1, size2, size3,
1181  recip1x, recip1y, recip1z,
1182  recip2x, recip2y, recip2z,
1183  recip3x, recip3y, recip3z,
1184  prefac1, prefac2, prefac3,
1185  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1186  } else {
1187  scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
1188  (nfft1, nfft2, nfft3, size1, size2, size3,
1189  recip1x, recip1y, recip1z,
1190  recip2x, recip2y, recip2z,
1191  recip3x, recip3y, recip3z,
1192  prefac1, prefac2, prefac3,
1193  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1194  }
1195  }
1196  cudaCheck(cudaGetLastError());
1197 
1198 }
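For context, fac and piv_inv are the constants of the smooth-PME reciprocal-space energy (Essmann et al., 1995), which the kernel accumulates term by term:

  \[ E_{\mathrm{recip}} = \frac{1}{2\pi V} \sum_{\mathbf{m}\neq 0} \frac{e^{-\pi^2 |\mathbf{m}|^2/\kappa^2}}{|\mathbf{m}|^2}\, B(\mathbf{m})\, |S(\mathbf{m})|^2 \]

where B(m) is the product of the three prefac arrays and S(m) is the Fourier-transformed charge grid. The kernel's factor of 2 in q2 (halved again on the k1 = 0 and nfft1/2 planes) accounts for the half-stored Hermitian grid of the real-to-complex FFT, and the overall 1/2 is applied in the final atomicAdd.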
template<typename T , typename T2 , bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel ( const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const T  recip1x,
const T  recip1y,
const T  recip1z,
const T  recip2x,
const T  recip2y,
const T  recip2z,
const T  recip3x,
const T  recip3y,
const T  recip3z,
const T *  prefac1,
const T *  prefac2,
const T *  prefac3,
const T  pi_ewald,
const T  piv_inv,
const int  k2_00,
const int  k3_00,
T2 *  data,
double *__restrict__  energy_recip,
double *__restrict__  virial 
)

Definition at line 60 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, RecipVirial_t::energy, expfunc(), RecipVirial_t::virial, WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

70  {
71  // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
72  extern __shared__ T sh_prefac[];
73 
74  // Create pointers to shared memory
75  T* sh_prefac1 = (T *)&sh_prefac[0];
76  T* sh_prefac2 = (T *)&sh_prefac[nfft1];
77  T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
78 
79  // Calculate start position (k1, k2, k3) for each thread
80  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
81  int k3 = tid/(size1*size2);
82  tid -= k3*size1*size2;
83  int k2 = tid/size1;
84  int k1 = tid - k2*size1;
85 
86  // Starting position in data
87  int pos = k1 + (k2 + k3*size2)*size1;
88 
89  // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
90  k2 += k2_00;
91  k3 += k3_00;
92 
93  // Calculate limits w.r.t. global coordinates
94  const int lim2 = size2 + k2_00;
95  const int lim3 = size3 + k3_00;
96 
97  // Calculate increments (k1_inc, k2_inc, k3_inc)
98  int tot_inc = blockDim.x*gridDim.x;
99  const int k3_inc = tot_inc/(size1*size2);
100  tot_inc -= k3_inc*size1*size2;
101  const int k2_inc = tot_inc/size1;
102  const int k1_inc = tot_inc - k2_inc*size1;
103 
104  // Set data[0] = 0 for the global (0,0,0)
105  if (k1 == 0 && k2 == 0 && k3 == 0) {
106  T2 zero;
107  zero.x = (T)0;
108  zero.y = (T)0;
109  data[0] = zero;
110  // Increment position
111  k1 += k1_inc;
112  pos += k1_inc;
113  if (k1 >= size1) {
114  k1 -= size1;
115  k2++;
116  }
117  k2 += k2_inc;
118  pos += k2_inc*size1;
119  if (k2 >= lim2) {
120  k2 -= size2;
121  k3++;
122  }
123  k3 += k3_inc;
124  pos += k3_inc*size1*size2;
125  }
126 
127  // Load prefac data into shared memory
128  {
129  int t = threadIdx.x;
130  while (t < nfft1) {
131  sh_prefac1[t] = prefac1[t];
132  t += blockDim.x;
133  }
134  t = threadIdx.x;
135  while (t < nfft2) {
136  sh_prefac2[t] = prefac2[t];
137  t += blockDim.x;
138  }
139  t = threadIdx.x;
140  while (t < nfft3) {
141  sh_prefac3[t] = prefac3[t];
142  t += blockDim.x;
143  }
144  }
145  BLOCK_SYNC;
146 
147  double energy = 0.0;
148  double virial0 = 0.0;
149  double virial1 = 0.0;
150  double virial2 = 0.0;
151  double virial3 = 0.0;
152  double virial4 = 0.0;
153  double virial5 = 0.0;
154 
155  // If nfft1 is odd, set nfft1_half to impossible value so that
156  // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
157  // is never satisfied
158  const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
159  const int nfft2_half = nfft2/2;
160  const int nfft3_half = nfft3/2;
161 
162  while (k3 < lim3) {
163 
164  T2 q = data[pos];
165 
166  int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
167  int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
168 
169  T m1, m2, m3;
170  if (doOrtho) {
171  m1 = recip1x*k1;
172  m2 = recip2y*k2s;
173  m3 = recip3z*k3s;
174  } else {
175  m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
176  m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
177  m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
178  }
179 
180  T msq = m1*m1 + m2*m2 + m3*m3;
181  T msq_inv = ((T)1)/msq;
182 
183  T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
184  T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
185  if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
186  T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
187  T C = xp3*msq_inv;
188  T theta = theta3*C;
189 
190  if (calc_energy_virial) {
191  T fac = q2*C;
192  T vir = ((T)2)*(pi_ewald + msq_inv);
193  energy += fac;
194  virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
195  virial1 += (double)(fac*vir*m1*m2);
196  virial2 += (double)(fac*vir*m1*m3);
197  virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
198  virial4 += (double)(fac*vir*m2*m3);
199  virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
200  }
201 
202  q.x *= theta;
203  q.y *= theta;
204  data[pos] = q;
205 
206  // Increment position
207  k1 += k1_inc;
208  pos += k1_inc;
209  if (k1 >= size1) {
210  k1 -= size1;
211  k2++;
212  }
213  k2 += k2_inc;
214  pos += k2_inc*size1;
215  if (k2 >= lim2) {
216  k2 -= size2;
217  k3++;
218  }
219  k3 += k3_inc;
220  pos += k3_inc*size2*size1;
221  }
222 
223  // Reduce energy and virial
224  if (calc_energy_virial) {
225  const int tid = threadIdx.x & (warpSize-1);
226  const int base = (threadIdx.x/warpSize);
227  volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
228  // Reduce within warps
229  for (int d=warpSize/2;d >= 1;d /= 2) {
230  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
231  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
232  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
233  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
234  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
235  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
236  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
237  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
238  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
239  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
240  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
241  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
242  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
243  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
244  }
245  // Reduce between warps
246  // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
247  BLOCK_SYNC;
248  if (tid == 0) {
249  sh_ev[base].energy = energy;
250  sh_ev[base].virial[0] = virial0;
251  sh_ev[base].virial[1] = virial1;
252  sh_ev[base].virial[2] = virial2;
253  sh_ev[base].virial[3] = virial3;
254  sh_ev[base].virial[4] = virial4;
255  sh_ev[base].virial[5] = virial5;
256  }
257  BLOCK_SYNC;
258  if (base == 0) {
259  energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
260  virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
261  virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
262  virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
263  virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
264  virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
265  virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
266  for (int d=warpSize/2;d >= 1;d /= 2) {
267  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
268  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
269  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
270  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
271  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
272  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
273  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
274  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
275  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
276  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
277  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
278  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
279  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
280  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
281  }
282  }
283 
284  if (threadIdx.x == 0) {
285  atomicAdd(energy_recip, energy*0.5);
286  virial0 *= -0.5;
287  virial1 *= -0.5;
288  virial2 *= -0.5;
289  virial3 *= -0.5;
290  virial4 *= -0.5;
291  virial5 *= -0.5;
292  atomicAdd(&virial[0], virial0);
293  atomicAdd(&virial[1], virial1);
294  atomicAdd(&virial[2], virial2);
295  atomicAdd(&virial[3], virial3);
296  atomicAdd(&virial[4], virial4);
297  atomicAdd(&virial[5], virial5);
298  }
299 
300  }
301 
302 }
void spread_charge ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
float *  data,
const int  order,
cudaStream_t  stream 
)

Definition at line 1042 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, and cudaNAMD_bug().

Referenced by CudaPmeRealSpaceCompute::spreadCharge().

1047  {
1048 
1049  dim3 nthread, nblock;
1050 
1051  switch(order) {
1052  case 4:
1053  nthread.x = 32;
1054  nthread.y = 4;
1055  nthread.z = 1;
1056  nblock.x = (numAtoms - 1)/nthread.x + 1;
1057  nblock.y = 1;
1058  nblock.z = 1;
1059  if (periodicY && periodicZ)
1060  spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1061  (atoms, numAtoms,
1062  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1063  else if (periodicY)
1064  spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1065  (atoms, numAtoms,
1066  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1067  else if (periodicZ)
1068  spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1069  (atoms, numAtoms,
1070  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1071  else
1072  spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1073  (atoms, numAtoms,
1074  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1075  break;
1076 
1077  case 6:
1078  nthread.x = 32;
1079  nthread.y = 7;
1080  nthread.z = 1;
1081  nblock.x = (numAtoms - 1)/nthread.x + 1;
1082  nblock.y = 1;
1083  nblock.z = 1;
1084  if (periodicY && periodicZ)
1085  spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1086  (atoms, numAtoms,
1087  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1088  else if (periodicY)
1089  spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1090  (atoms, numAtoms,
1091  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1092  else if (periodicZ)
1093  spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1094  (atoms, numAtoms,
1095  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1096  else
1097  spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1098  (atoms, numAtoms,
1099  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1100  break;
1101 
1102  case 8:
1103  nthread.x = 32;
1104  nthread.y = 16;
1105  nthread.z = 1;
1106  nblock.x = (numAtoms - 1)/nthread.x + 1;
1107  nblock.y = 1;
1108  nblock.z = 1;
1109  if (periodicY && periodicZ)
1110  spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1111  (atoms, numAtoms,
1112  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1113  else if (periodicY)
1114  spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1115  (atoms, numAtoms,
1116  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1117  else if (periodicZ)
1118  spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1119  (atoms, numAtoms,
1120  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1121  else
1122  spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1123  (atoms, numAtoms,
1124  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1125  break;
1126 
1127  default:
1128  char str[128];
1129  sprintf(str, "spread_charge, order %d not implemented",order);
1130  cudaNAMD_bug(str);
1131  }
1132  cudaCheck(cudaGetLastError());
1133 
1134 }
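Coordinates in atoms are fractional (the kernel scales x by nfftx, and so on), and data must be zeroed before the call since charges are accumulated with atomicAdd. A minimal call sketch for a fully periodic 4th-order spread (pointer names and the padded x stride xdim are illustrative; in a real-to-complex FFT path xdim is typically the padded row length rather than nfftx):

  // d_atoms: device float4 array, fractional xyz in .x/.y/.z, charge in .w
  // d_grid:  device float array of xdim*ysize*zsize entries, pre-zeroed
  spread_charge(d_atoms, numAtoms,
                nfftx, nffty, nfftz,   // FFT grid size
                nfftx, nffty, nfftz,   // this pencil spans the full grid
                xdim,  0, 0,           // padded x stride, pencil origin y00/z00
                true, true,            // periodicY, periodicZ
                d_grid, 4, stream);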
template<typename AT , int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel ( const float4 *  xyzq,
const int  ncoord,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
AT *  data 
)

Definition at line 410 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC and order.

414  {
415 
416  // Shared memory use:
417  // order = 4: 1920 bytes
418  // order = 6: 2688 bytes
419  // order = 8: 3456 bytes
420  __shared__ int sh_ix[32];
421  __shared__ int sh_iy[32];
422  __shared__ int sh_iz[32];
423  __shared__ float sh_thetax[order*32];
424  __shared__ float sh_thetay[order*32];
425  __shared__ float sh_thetaz[order*32];
426 
427  // Process atoms pos to pos_end-1 (blockDim.x)
428  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
429  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
430 
431  if (pos < pos_end && threadIdx.y == 0) {
432 
433  float4 xyzqi = xyzq[pos];
434  float x = xyzqi.x;
435  float y = xyzqi.y;
436  float z = xyzqi.z;
437  float q = xyzqi.w;
438 
439  float frx = ((float)nfftx)*x;
440  float fry = ((float)nffty)*y;
441  float frz = ((float)nfftz)*z;
442 
443  int frxi = (int)frx;
444  int fryi = (int)fry;
445  int frzi = (int)frz;
446 
447  float wx = frx - (float)frxi;
448  float wy = fry - (float)fryi;
449  float wz = frz - (float)frzi;
450 
451  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
452  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
453 
454  sh_ix[threadIdx.x] = frxi;
455  sh_iy[threadIdx.x] = fryi - y00;
456  sh_iz[threadIdx.x] = frzi - z00;
457 
458  float theta[order];
459 
460  calc_one_theta<float, order>(wx, theta);
461 #pragma unroll
462  for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
463 
464  calc_one_theta<float, order>(wy, theta);
465 #pragma unroll
466  for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
467 
468  calc_one_theta<float, order>(wz, theta);
469 #pragma unroll
470  for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
471 
472  }
473 
474  BLOCK_SYNC;
475 
476  // Grid point location, values of (ix0, iy0, iz0) are in range 0..order-1
477  // NOTE: Only tid=0...order*order*order-1 do any computation
478  const int order3 = ((order*order*order-1)/warpSize + 1)*warpSize;
479  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
480  const int x0 = tid % order;
481  const int y0 = (tid / order) % order;
482  const int z0 = tid / (order*order);
483 
484  // Loop over atoms pos..pos_end-1
485  int iadd = blockDim.x*blockDim.y/order3;
486  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
487  int iend = pos_end - blockIdx.x*blockDim.x;
488  for (;i < iend;i += iadd) {
489  int x = sh_ix[i] + x0;
490  int y = sh_iy[i] + y0;
491  int z = sh_iz[i] + z0;
492 
493  if (x >= nfftx) x -= nfftx;
494 
495  if (periodicY && (y >= nffty)) y -= nffty;
496  if (!periodicY && (y < 0 || y >= ysize)) continue;
497 
498  if (periodicZ && (z >= nfftz)) z -= nfftz;
499  if (!periodicZ && (z < 0 || z >= zsize)) continue;
500 
501  // Get position on the grid
502  int ind = x + xdim*(y + ysize*(z));
503 
504  // Here we unroll the 6x6x6 loop with 216 threads.
505  // NOTE: We use 7*32=224 threads to do this
506  // Calculate interpolated charge value and store it to global memory
507 
508  if (tid < order*order*order)
509  write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
510  }
511 
512 }
void transpose_xyz_yzx ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1320 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

1324  {
1325 
1326  dim3 numthread(TILEDIM, TILEROWS, 1);
1327  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
1328 
1329  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
1330  (nx, ny, nz, xsize_in, ysize_in,
1331  ysize_out, zsize_out,
1332  data_in, data_out);
1333 
1334  cudaCheck(cudaGetLastError());
1335 }
template<typename T >
__device__ __forceinline__ void transpose_xyz_yzx_device ( const int  x_in,
const int  y_in,
const int  z_in,
const int  x_out,
const int  y_out,
const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 770 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, TILEDIM, and TILEROWS.

776  {
777 
778  // Shared memory
779  __shared__ T tile[TILEDIM][TILEDIM+1];
780 
781  // Read (x,y) data_in into tile (shared memory)
782  for (int j=0;j < TILEDIM;j += TILEROWS)
783  if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
784  tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
785 
786  BLOCK_SYNC;
787 
788  // Write (y,x) tile into data_out
789  const int z_out = z_in;
790  for (int j=0;j < TILEDIM;j += TILEROWS)
791  if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
792  data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
793 }
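The tile is padded to TILEDIM x (TILEDIM+1): the extra column staggers rows across shared-memory banks so that both the coalesced row-wise load and the transposed column-wise store are free of bank conflicts, the standard shared-memory transpose idiom.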
template<typename T >
__global__ void transpose_xyz_yzx_kernel ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 799 of file CudaPmeSolverUtilKernel.cu.

References TILEDIM.

803  {
804 
805  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
806  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
807  int z_in = blockIdx.z + threadIdx.z;
808 
809  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
810  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
811 
812  transpose_xyz_yzx_device<T>(
813  x_in, y_in, z_in,
814  x_out, y_out,
815  nx, ny, nz,
816  xsize_in, ysize_in,
817  ysize_out, zsize_out,
818  data_in, data_out);
819 
820 /*
821  // Shared memory
822  __shared__ T tile[TILEDIM][TILEDIM+1];
823 
824  int x = blockIdx.x * TILEDIM + threadIdx.x;
825  int y = blockIdx.y * TILEDIM + threadIdx.y;
826  int z = blockIdx.z + threadIdx.z;
827 
828  // Read (x,y) data_in into tile (shared memory)
829  for (int j=0;j < TILEDIM;j += TILEROWS)
830  if ((x < nx) && (y + j < ny) && (z < nz))
831  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
832 
833  BLOCK_SYNC;
834 
835  // Write (y,x) tile into data_out
836  x = blockIdx.x * TILEDIM + threadIdx.y;
837  y = blockIdx.y * TILEDIM + threadIdx.x;
838  for (int j=0;j < TILEDIM;j += TILEROWS)
839  if ((x + j < nx) && (y < ny) && (z < nz))
840  data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
841 */
842 }
void transpose_xyz_zxy ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1357 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

1361  {
1362 
1363  dim3 numthread(TILEDIM, TILEROWS, 1);
1364  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
1365 
1366  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
1367  (nx, ny, nz, xsize_in, ysize_in,
1368  zsize_out, xsize_out,
1369  data_in, data_out);
1370 
1371  cudaCheck(cudaGetLastError());
1372 }
template<typename T >
__device__ __forceinline__ void transpose_xyz_zxy_device ( const int  x_in,
const int  y_in,
const int  z_in,
const int  x_out,
const int  z_out,
const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 948 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, TILEDIM, and TILEROWS.

954  {
955 
956  // Shared memory
957  __shared__ T tile[TILEDIM][TILEDIM+1];
958 
959  // Read (x,z) data_in into tile (shared memory)
960  for (int k=0;k < TILEDIM;k += TILEROWS)
961  if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
962  tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
963 
964  BLOCK_SYNC;
965 
966  // Write (z,x) tile into data_out
967  const int y_out = y_in;
968  for (int k=0;k < TILEDIM;k += TILEROWS)
969  if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
970  data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
971 }
template<typename T >
__global__ void transpose_xyz_zxy_kernel ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 977 of file CudaPmeSolverUtilKernel.cu.

References TILEDIM.

981  {
982 
983  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
984  int y_in = blockIdx.z + threadIdx.z;
985  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
986 
987  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
988  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
989 
990  transpose_xyz_zxy_device<T>(
991  x_in, y_in, z_in, x_out, z_out,
992  nx, ny, nz,
993  xsize_in, ysize_in,
994  zsize_out, xsize_out,
995  data_in, data_out);
996 
997 }
template<typename T >
__forceinline__ __device__ void write_grid ( const float  val,
const int  ind,
T *  data 
)

Definition at line 305 of file CudaPmeSolverUtilKernel.cu.

306  {
307  atomicAdd(&data[ind], (T)val);
308 }
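The atomicAdd is necessary because the order x order x order stencils of nearby atoms overlap, so threads from the same and different blocks may accumulate into the same grid point concurrently during spread_charge_kernel().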

Variable Documentation

const int TILEDIM = 32
const int TILEROWS = 8