#define _USE_MATH_DEFINES
__forceinline__ __device__ double expfunc(const double x) {
  return exp(x);
}

__forceinline__ __device__ float expfunc(const float x) {
  // fast single-precision exponential (assumed __expf for the float overload)
  return __expf(x);
}
//
// Performs the reciprocal-space scalar sum on the charge grid.
//
template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
                                  const int size1, const int size2, const int size3,
                                  const T recip1x, const T recip1y, const T recip1z,
                                  const T recip2x, const T recip2y, const T recip2z,
                                  const T recip3x, const T recip3y, const T recip3z,
                                  const T* prefac1, const T* prefac2, const T* prefac3,
                                  const T pi_ewald, const T piv_inv,
                                  const int k2_00, const int k3_00,
                                  T2* data,
                                  double* __restrict__ energy_recip,
                                  double* __restrict__ virial) {
  // Shared memory: B-spline prefactor tables for the three grid dimensions
  extern __shared__ T sh_prefac[];

  T* sh_prefac1 = (T *)&sh_prefac[0];
  T* sh_prefac2 = (T *)&sh_prefac[nfft1];
  T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
  // Calculate the starting grid point (k1, k2, k3) for this thread
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  int k3 = tid/(size1*size2);
  tid -= k3*size1*size2;
  int k2 = tid/size1;
  int k1 = tid - k2*size1;

  // Position in data()
  int pos = k1 + (k2 + k3*size2)*size1;

  // Loop limits for k2 and k3
  const int lim2 = size2 + k2_00;
  const int lim3 = size3 + k3_00;

  // Per-iteration increments (k1_inc, k2_inc, k3_inc) for the grid-stride loop
  int tot_inc = blockDim.x*gridDim.x;
  const int k3_inc = tot_inc/(size1*size2);
  tot_inc -= k3_inc*size1*size2;
  const int k2_inc = tot_inc/size1;
  const int k1_inc = tot_inc - k2_inc*size1;
  // The (0,0,0) term is excluded from the sum; the thread that owns it handles it
  // separately and advances to its next grid point.
  if (k1 == 0 && k2 == 0 && k3 == 0) {
    // ...
    pos += k3_inc*size1*size2;
  }

  // Load the B-spline prefactor tables into shared memory (one grid-stride loop per table)
  for (int t = threadIdx.x; t < nfft1; t += blockDim.x) sh_prefac1[t] = prefac1[t];
  for (int t = threadIdx.x; t < nfft2; t += blockDim.x) sh_prefac2[t] = prefac2[t];
  for (int t = threadIdx.x; t < nfft3; t += blockDim.x) sh_prefac3[t] = prefac3[t];
  __syncthreads();
  // Energy and virial accumulators
  double energy = 0.0;
  double virial0 = 0.0;
  double virial1 = 0.0;
  double virial2 = 0.0;
  double virial3 = 0.0;
  double virial4 = 0.0;
  double virial5 = 0.0;
  // Half-grid indices; nfft1_half is only meaningful when nfft1 is even
  const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
  const int nfft2_half = nfft2/2;
  const int nfft3_half = nfft3/2;
  // Loop over the grid points assigned to this thread
  // ...
    // Map k2, k3 to signed reciprocal-space indices
    int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
    int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;

    // Reciprocal-space vector m for this grid point
    // ...
    m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
    m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
    m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;

    T msq = m1*m1 + m2*m2 + m3*m3;
    T msq_inv = ((T)1)/msq;

    T2 q = data[pos];
    T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
    T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
    if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
    T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
    if (calc_energy_virial) {
      // ...
      T vir = ((T)2)*(pi_ewald + msq_inv);
      virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
      virial1 += (double)(fac*vir*m1*m2);
      virial2 += (double)(fac*vir*m1*m3);
      virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
      virial4 += (double)(fac*vir*m2*m3);
      virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
    }

    // Advance to the next grid point handled by this thread
    // ...
    pos += k3_inc*size2*size1;
  // Reduce energy and virial across the thread block
  if (calc_energy_virial) {
    const int tid = threadIdx.x & (warpSize-1);
    const int base = (threadIdx.x/warpSize);

    // Reduce within each warp using register shuffles
    for (int d=warpSize/2;d >= 1;d /= 2) {
      // ... (WARP_SHUFFLE of energy and virial0..virial5 by offset d)
    }

    // Each warp stores its partial sums into the shared RecipVirial_t array sh_ev[]
    // ...
    sh_ev[base].energy = energy;
    sh_ev[base].virial[0] = virial0;
    sh_ev[base].virial[1] = virial1;
    sh_ev[base].virial[2] = virial2;
    sh_ev[base].virial[3] = virial3;
    sh_ev[base].virial[4] = virial4;
    sh_ev[base].virial[5] = virial5;
    // The first warp then combines the per-warp partial sums
    // ...
    energy  = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
    virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
    virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
    virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
    virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
    virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
    virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
    for (int d=warpSize/2;d >= 1;d /= 2) {
      // ... (WARP_SHUFFLE reduction of the per-warp partials)
    }
    // Thread 0 accumulates the block's contribution into global memory
    if (threadIdx.x == 0) {
      atomicAdd(energy_recip, energy*0.5);
      // ...
      atomicAdd(&virial[0], virial0);
      atomicAdd(&virial[1], virial1);
      atomicAdd(&virial[2], virial2);
      atomicAdd(&virial[3], virial3);
      atomicAdd(&virial[4], virial4);
      atomicAdd(&virial[5], virial5);
    }
  }

}
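// The shuffle reductions above are elided in this listing. A minimal sketch of the
// pattern they follow, written with the plain CUDA intrinsic __shfl_down_sync rather
// than the WARP_SHUFFLE wrapper; the helper name and full-warp mask are assumptions
// for illustration, not the file's exact code:
__device__ double warp_reduce_sum_sketch(double v) {
  // Halve the stride each step; after log2(warpSize) steps lane 0 holds the warp sum.
  for (int d = warpSize/2; d >= 1; d /= 2)
    v += __shfl_down_sync(0xffffffff, v, d);
  return v;
}
// Inside the kernel the same pattern would be applied to each accumulator, e.g.
//   energy  = warp_reduce_sum_sketch(energy);
//   virial0 = warp_reduce_sum_sketch(virial0);   // ...and likewise for virial1..virial5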
//
// Adds a value to the charge grid with an atomic update, so that threads spreading
// different atoms can safely write to the same grid point.
//
template <typename T>
__forceinline__ __device__ void write_grid(const float val, const int ind, T* data) {
  atomicAdd(&data[ind], (T)val);
}
//
// Evaluates the "order" cardinal B-spline weights theta[0..order-1] at fractional
// offset w using the standard one-dimensional recursion.
//
template <typename T, int order>
__forceinline__ __device__ void calc_one_theta(const T w, T *theta) {

  theta[order-1] = ((T)0);
  // ...
  theta[0] = ((T)1) - w;

  // Raise the spline order from 2 up to order-1
  for (int k=3;k <= order-1;k++) {
    T div = ((T)1) / (T)(k-1);
    theta[k-1] = div*w*theta[k-2];
    for (int j=1;j <= k-2;j++) {
      theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
    }
    theta[0] = div*(((T)1) - w)*theta[0];
  }

  // One more recursion step reaches the final order
  T div = ((T)1) / (T)(order-1);
  // ...
  for (int j=1;j <= order-2;j++) {
    // ...
  }
  theta[0] = div*(((T)1) - w)*theta[0];
}
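// For orientation, the order-4 (cubic) case of the recursion above has a familiar
// closed form. This is an independent host-side sketch for checking the weights, not
// part of the device code; the function name is made up for illustration.
static void cubic_bspline_weights(double w, double theta[4]) {
  // w is the fractional offset from the leading grid point, 0 <= w < 1
  const double w2 = w*w;
  const double w3 = w2*w;
  theta[0] = (1.0 - 3.0*w + 3.0*w2 - w3) / 6.0;   // (1-w)^3 / 6
  theta[1] = (4.0 - 6.0*w2 + 3.0*w3) / 6.0;
  theta[2] = (1.0 + 3.0*w + 3.0*w2 - 3.0*w3) / 6.0;
  theta[3] = w3 / 6.0;
  // The four weights always sum to 1, which is a handy sanity check.
}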
//
// Evaluates the B-spline weights theta[] and their derivatives dtheta[] in x, y, and z
// for fractional offsets (wx, wy, wz).
//
template <typename T, typename T3, int order>
__forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {

  theta[order-1].x = ((T)0);
  theta[order-1].y = ((T)0);
  theta[order-1].z = ((T)0);
  // ...
  theta[0].x = ((T)1) - wx;
  theta[0].y = ((T)1) - wy;
  theta[0].z = ((T)1) - wz;

  // Raise the spline order from 2 up to order-1
  for (int k=3;k <= order-1;k++) {
    T div = ((T)1) / (T)(k-1);
    theta[k-1].x = div*wx*theta[k-2].x;
    theta[k-1].y = div*wy*theta[k-2].y;
    theta[k-1].z = div*wz*theta[k-2].z;
    for (int j=1;j <= k-2;j++) {
      theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
      theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
      theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
    }
    theta[0].x = div*(((T)1) - wx)*theta[0].x;
    theta[0].y = div*(((T)1) - wy)*theta[0].y;
    theta[0].z = div*(((T)1) - wz)*theta[0].z;
  }

  // Differentiate: the derivative of an order-n spline is a difference of order-(n-1) splines
  dtheta[0].x = -theta[0].x;
  dtheta[0].y = -theta[0].y;
  dtheta[0].z = -theta[0].z;
  for (int j=2;j <= order;j++) {
    dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
    dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
    dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
  }

  // One more recursion step brings theta[] to the final order
  T div = ((T)1) / (T)(order-1);
  // ...
  for (int j=1;j <= order-2;j++) {
    // ...
  }
  theta[0].x = div*(((T)1) - wx)*theta[0].x;
  theta[0].y = div*(((T)1) - wy)*theta[0].y;
  theta[0].z = div*(((T)1) - wz)*theta[0].z;
}
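// The derivative step above uses the standard B-spline identity
// d/dw M_n(w) = M_{n-1}(w) - M_{n-1}(w-1), which is why dtheta[] is formed from
// differences of the order-(n-1) weights before the final recursion raises theta[]
// to order n. A quick host-side sanity check (a sketch with a made-up helper name,
// assuming <cmath>/fabs is available) is that the weights partition unity and the
// derivatives sum to zero:
static bool check_spline_weights(const double *theta, const double *dtheta, int order) {
  double s = 0.0, ds = 0.0;
  for (int i = 0; i < order; i++) { s += theta[i]; ds += dtheta[i]; }
  return fabs(s - 1.0) < 1e-12 && fabs(ds) < 1e-12;
}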
//
// Spreads atomic charges onto the charge grid. Threads with threadIdx.y == 0 compute
// the B-spline weights for their atom; all threads of the block then cooperatively
// write the order^3 stencils.
//
template <typename AT, int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel(const float4 *xyzq, const int ncoord,
                                     const int nfftx, const int nffty, const int nfftz,
                                     const int xsize, const int ysize, const int zsize,
                                     const int xdim, const int y00, const int z00,
                                     AT* data) {

  // Shared memory: per-atom grid indices and B-spline weights for one block of atoms
  __shared__ int sh_ix[32];
  __shared__ int sh_iy[32];
  __shared__ int sh_iz[32];
  __shared__ float sh_thetax[order*32];
  __shared__ float sh_thetay[order*32];
  __shared__ float sh_thetaz[order*32];
  // Process atoms pos ... pos_end-1
  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);

  if (pos < pos_end && threadIdx.y == 0) {

    float4 xyzqi = xyzq[pos];
    // ... (x, y, z and the charge q are derived from xyzqi)

    float frx = ((float)nfftx)*x;
    float fry = ((float)nffty)*y;
    float frz = ((float)nfftz)*z;

    // ... (frxi, fryi, frzi are the integer grid indices of frx, fry, frz)

    float wx = frx - (float)frxi;
    float wy = fry - (float)fryi;
    float wz = frz - (float)frzi;

    if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
    if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;

    sh_ix[threadIdx.x] = frxi;
    sh_iy[threadIdx.x] = fryi - y00;
    sh_iz[threadIdx.x] = frzi - z00;

    float theta[order];

    calc_one_theta<float, order>(wx, theta);
    for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];

    calc_one_theta<float, order>(wy, theta);
    for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];

    calc_one_theta<float, order>(wz, theta);
    for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
  }

  __syncthreads();
  // All threads of the block now cooperate to write the order^3 stencil of each atom.
  // ... (order3 is the per-atom stencil size used to split threads into groups)
  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3;
  const int x0 = tid % order;
  // ... (y0 and z0 are the remaining two stencil offsets decoded from tid)

  int iadd = blockDim.x*blockDim.y/order3;
  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
  int iend = pos_end - blockIdx.x*blockDim.x;
  for (;i < iend;i += iadd) {
    int x = sh_ix[i] + x0;
    int y = sh_iy[i] + y0;
    int z = sh_iz[i] + z0;

    if (x >= nfftx) x -= nfftx;

    if (periodicY && (y >= nffty)) y -= nffty;
    if (!periodicY && (y < 0 || y >= ysize)) continue;

    if (periodicZ && (z >= nfftz)) z -= nfftz;
    if (!periodicZ && (z < 0 || z >= zsize)) continue;

    // Flat grid index; xdim includes any padding of the x dimension
    int ind = x + xdim*(y + ysize*(z));

    // Atomic accumulation of charge * thetax * thetay * thetaz
    write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
  }
}
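// For reference, the spreading operation per atom is a weighted scatter over an
// order^3 neighborhood. The host-side sketch below mirrors that logic for a fully
// periodic, unpadded grid; the function and parameter names here are illustrative,
// not taken from this file:
static void spread_one_atom_reference(double grid[], int nx, int ny, int nz,
                                      int ix0, int iy0, int iz0, double q,
                                      const double tx[], const double ty[], const double tz[],
                                      int order) {
  for (int k = 0; k < order; k++) {
    int z = (iz0 + k) % nz;
    for (int j = 0; j < order; j++) {
      int y = (iy0 + j) % ny;
      for (int i = 0; i < order; i++) {
        int x = (ix0 + i) % nx;
        // Same accumulation the kernel performs through write_grid/atomicAdd
        grid[x + nx*(y + ny*z)] += q * tx[i] * ty[j] * tz[k];
      }
    }
  }
}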
//
// Stores the gathered force for one atom; specializations for float (strided
// structure-of-arrays layout) and float3 are given below.
//
template <typename T> __forceinline__ __device__
void gather_force_store(const float fx, const float fy, const float fz,
                        const int stride, const int pos, T* force) {
  // ...
}

// Store into a strided array of floats: x, y, z components separated by "stride"
template <> __forceinline__ __device__
void gather_force_store<float>(const float fx, const float fy, const float fz,
                               const int stride, const int pos, float* force) {
  force[pos]          = fx;
  force[pos+stride]   = fy;
  force[pos+stride*2] = fz;
}

// Store into an array of float3
template <> __forceinline__ __device__
void gather_force_store<float3>(const float fx, const float fy, const float fz,
                                const int stride, const int pos, float3* force) {
  // ... (writes fx, fy, fz into force[pos])
}
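// Usage note: with T = float the three components of atom "pos" land at force[pos],
// force[pos+stride], force[pos+stride*2] (structure-of-arrays layout, where stride is
// typically the padded atom count), while with T = float3 they occupy a single float3
// element. A purely illustrative call:
//   gather_force_store<float3>(fx, fy, fz, stride, pos, d_forces);  // d_forces: float3*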
template <typename T, int order>
// ...
//
// Gathers forces from the potential grid back onto the atoms.
//
template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
__global__ void gather_force(const float4 *xyzq, const int ncoord,
                             const int nfftx, const int nffty, const int nfftz,
                             const int xsize, const int ysize, const int zsize,
                             const int xdim, const int y00, const int z00,
                             const float* data,
                             const cudaTextureObject_t gridTexObj,
                             const int stride, FT* force) {

  const int tid = threadIdx.x + threadIdx.y*blockDim.x;

  // ... (per-atom shared-memory staging array "shmem" is declared here)

  // Process atoms pos ... pos_end-1
  const int pos = blockIdx.x*blockDim.x + threadIdx.x;
  const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
  if (pos < pos_end && threadIdx.y == 0) {

    float4 xyzqi = xyzq[pos];
    // ... (x, y, z and the charge q are derived from xyzqi)

    float frx = ((float)nfftx)*x;
    float fry = ((float)nffty)*y;
    float frz = ((float)nfftz)*z;

    // ... (frxi, fryi, frzi are the integer grid indices of frx, fry, frz)

    float wx = frx - (float)frxi;
    float wy = fry - (float)fryi;
    float wz = frz - (float)frzi;

    if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
    if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;

    shmem[threadIdx.x].ix = frxi;
    shmem[threadIdx.x].iy = fryi - y00;
    shmem[threadIdx.x].iz = frzi - z00;
    shmem[threadIdx.x].charge = q;

    float3 theta_tmp[order];
    float3 dtheta_tmp[order];
    calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);

    for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
    for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
    for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;

    for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
    for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
    for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
  }

  __syncthreads();
  // Each atom's order^3 stencil is split among 8 threads; every thread owns an
  // nsc x nsc x nsc sub-cube, where nsc = order/2.
  const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
  // ...
  const int t = (tid % 8);
  // Sub-cube origin for this thread
  const int tz0 = (t / 4)*nsc;
  const int ty0 = ((t / 2) % 2)*nsc;
  const int tx0 = (t % 2)*nsc;
  // Each thread accumulates its sub-cube's force contribution, one atom at a time
  const int base_end = pos_end - blockIdx.x*blockDim.x;
  // ... (base, warp_mask and the f1/f2/f3 force accumulators are set up here)
  while (base < base_end) {

    int ix0 = shmem[base].ix;
    int iy0 = shmem[base].iy;
    int iz0 = shmem[base].iz;

    // Loop over this thread's nsc^3 sub-cube of the stencil
    for (int i=0;i < nsc*nsc*nsc;i++) {
      int tz = tz0 + (i/(nsc*nsc));
      int ty = ty0 + ((i/nsc) % nsc);
      int tx = tx0 + (i % nsc);

      int ix = ix0 + tx;
      int iy = iy0 + ty;
      int iz = iz0 + tz;

      if (ix >= nfftx) ix -= nfftx;

      if (periodicY && (iy >= nffty)) iy -= nffty;
      if (!periodicY && (iy < 0 || iy >= ysize)) continue;

      if (periodicZ && (iz >= nfftz)) iz -= nfftz;
      if (!periodicZ && (iz < 0 || iz >= zsize)) continue;

      int ind = ix + (iy + iz*ysize)*xdim;

      // Read the grid value: __ldg on sm_35+, texture fetch otherwise
#if __CUDA_ARCH__ >= 350
      float q0 = __ldg(&data[ind]);
#else
      float q0 = tex1Dfetch<float>(gridTexObj, ind);
#endif

      float thx0  = shmem[base].thetax[tx];
      float thy0  = shmem[base].thetay[ty];
      float thz0  = shmem[base].thetaz[tz];
      float dthx0 = shmem[base].dthetax[tx];
      float dthy0 = shmem[base].dthetay[ty];
      float dthz0 = shmem[base].dthetaz[tz];
      f1 += dthx0 * thy0 * thz0 * q0;
      f2 += thx0 * dthy0 * thz0 * q0;
      f3 += thx0 * thy0 * dthz0 * q0;
    }
    // Reduce the 8 partial force sums for this atom, then advance to the next atom
    const int i = threadIdx.x & 7;
    // ... (shuffle-based reduction of f1, f2, f3 into shmem[].f1, .f2, .f3)
    warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
  }

  // ...

  // Write out the force for this thread's atom
  if (pos < pos_end && threadIdx.y == 0) {
    float f1 = shmem[threadIdx.x].f1;
    float f2 = shmem[threadIdx.x].f2;
    float f3 = shmem[threadIdx.x].f3;
    float q = -shmem[threadIdx.x].charge;
    // ...
    // Scale by the charge and the grid dimensions
    float fx = q*f1*nfftx;
    float fy = q*f2*nffty;
    float fz = q*f3*nfftz;
    gather_force_store<FT>(fx, fy, fz, stride, pos, force);
  }
}
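// The tx0/ty0/tz0 decomposition above assigns each of the 8 threads one octant of the
// order^3 stencil. A small host-side check of that tiling (purely illustrative helper,
// not part of this file):
static bool check_subcube_tiling(int order) {
  const int nsc = order/2;                 // matches the (order == 4) ? 2 : ... expression
  int count[8*8*8] = {0};                  // large enough for order up to 8
  for (int t = 0; t < 8; t++) {
    int tz0 = (t/4)*nsc, ty0 = ((t/2)%2)*nsc, tx0 = (t%2)*nsc;
    for (int i = 0; i < nsc*nsc*nsc; i++) {
      int tz = tz0 + i/(nsc*nsc), ty = ty0 + (i/nsc)%nsc, tx = tx0 + i%nsc;
      count[tx + order*(ty + order*tz)]++;
    }
  }
  // Every stencil point should be covered exactly once
  for (int p = 0; p < order*order*order; p++) if (count[p] != 1) return false;
  return true;
}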
//
// Transposes a 3D array from xyz to yzx order using a shared-memory tile.
//
template <typename T>
__device__ __forceinline__
void transpose_xyz_yzx_device(
  const int x_in, const int y_in, const int z_in,
  const int x_out, const int y_out,
  const int nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in,
  const int ysize_out, const int zsize_out,
  const T* data_in, T* data_out) {

  // ... (shared tile declaration and the read loop over tile rows j)
  if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
    tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];

  // ... (__syncthreads between the read and write phases)

  // Write the transposed tile
  const int z_out = z_in;
  if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
    data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
}
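// The tile reads and writes above rely on the classic shared-memory transpose pattern
// whose declarations and loops are not shown in this listing. A minimal self-contained
// sketch of that pattern for a plain 2D transpose (kernel name and the 32x32 tile size
// are illustrative assumptions):
__global__ void transpose2d_sketch(const float* in, float* out, int nx, int ny) {
  __shared__ float tile[32][32+1];                  // +1 padding avoids shared-memory bank conflicts
  int x = blockIdx.x*32 + threadIdx.x;
  int y = blockIdx.y*32 + threadIdx.y;
  if (x < nx && y < ny)
    tile[threadIdx.y][threadIdx.x] = in[x + y*nx];  // coalesced read along x
  __syncthreads();                                  // the tile must be complete before writing
  // Swap block coordinates so the write is coalesced along the output's fast axis
  x = blockIdx.y*32 + threadIdx.x;
  y = blockIdx.x*32 + threadIdx.y;
  if (x < ny && y < nx)
    out[x + y*ny] = tile[threadIdx.x][threadIdx.y];
}
// Launched with dim3 block(32, 32) and grid dimensions rounded up from nx/32 and ny/32.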
//
// Transposes a 3D array from xyz to yzx order: data_out(y, z, x) = data_in(x, y, z)
//
template <typename T>
__global__ void transpose_xyz_yzx_kernel(
  const int nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in,
  const int ysize_out, const int zsize_out,
  const T* data_in, T* data_out) {

  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
  int z_in = blockIdx.z + threadIdx.z;

  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  int y_out = blockIdx.y * TILEDIM + threadIdx.x;

  transpose_xyz_yzx_device<T>(
    x_in, y_in, z_in, x_out, y_out,
    nx, ny, nz, xsize_in, ysize_in,
    ysize_out, zsize_out,
    data_in, data_out);
}
//
// Batched xyz -> yzx transpose: blockIdx.z selects both the z slice and the batch.
//
template <typename T>
__global__ void batchTranspose_xyz_yzx_kernel(
  const TransposeBatch<T>* batches,
  const int ny, const int nz,
  const int xsize_in, const int ysize_in) {

  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
  int z_in = (blockIdx.z % nz) + threadIdx.z;

  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  int y_out = blockIdx.y * TILEDIM + threadIdx.x;

  int bi = blockIdx.z/nz;

  TransposeBatch<T> batch = batches[bi];
  // ... (per-batch nx is taken from the batch descriptor)
  int ysize_out = batch.ysize_out;
  int zsize_out = batch.zsize_out;
  T* data_in  = batch.data_in;
  T* data_out = batch.data_out;

  transpose_xyz_yzx_device<T>(
    x_in, y_in, z_in, x_out, y_out,
    nx, ny, nz, xsize_in, ysize_in,
    ysize_out, zsize_out,
    data_in, data_out);
}
//
// Transposes a 3D array from xyz to zxy order using a shared-memory tile.
//
template <typename T>
__device__ __forceinline__
void transpose_xyz_zxy_device(
  const int x_in, const int y_in, const int z_in,
  const int x_out, const int z_out,
  const int nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in,
  const int zsize_out, const int xsize_out,
  const T* data_in, T* data_out) {

  // ... (shared tile declaration and the read loop over tile rows k)
  if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
    tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];

  // ... (__syncthreads between the read and write phases)

  // Write the transposed tile
  const int y_out = y_in;
  if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
    data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
}
//
// Transposes a 3D array from xyz to zxy order: data_out(z, x, y) = data_in(x, y, z)
//
template <typename T>
__global__ void transpose_xyz_zxy_kernel(
  const int nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in,
  const int zsize_out, const int xsize_out,
  const T* data_in, T* data_out) {

  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  int y_in = blockIdx.z + threadIdx.z;
  int z_in = blockIdx.y * TILEDIM + threadIdx.y;

  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  int z_out = blockIdx.y * TILEDIM + threadIdx.x;

  transpose_xyz_zxy_device<T>(
    x_in, y_in, z_in, x_out, z_out,
    nx, ny, nz, xsize_in, ysize_in,
    zsize_out, xsize_out,
    data_in, data_out);
}
//
// Batched xyz -> zxy transpose: blockIdx.z selects both the y slice and the batch.
//
template <typename T>
__global__ void batchTranspose_xyz_zxy_kernel(
  const TransposeBatch<T>* batches,
  const int ny, const int nz,
  const int xsize_in, const int ysize_in) {

  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
  int y_in = (blockIdx.z % ny) + threadIdx.z;
  int z_in = blockIdx.y * TILEDIM + threadIdx.y;

  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
  int z_out = blockIdx.y * TILEDIM + threadIdx.x;

  int bi = blockIdx.z/ny;

  TransposeBatch<T> batch = batches[bi];
  // ... (per-batch nx is taken from the batch descriptor)
  int zsize_out = batch.zsize_out;
  int xsize_out = batch.xsize_out;
  T* data_in  = batch.data_in;
  T* data_out = batch.data_out;

  transpose_xyz_zxy_device<T>(
    x_in, y_in, z_in, x_out, z_out,
    nx, ny, nz, xsize_in, ysize_in,
    zsize_out, xsize_out,
    data_in, data_out);
}
//
// Host entry point: spreads atomic charges onto the grid, choosing the kernel
// instantiation that matches the interpolation order and periodicity.
//
void spread_charge(const float4 *atoms, const int numAtoms,
  const int nfftx, const int nffty, const int nfftz,
  const int xsize, const int ysize, const int zsize,
  const int xdim, const int y00, const int z00,
  const bool periodicY, const bool periodicZ,
  float* data, const int order, cudaStream_t stream) {

  dim3 nthread, nblock;

  switch(order) {
  case 4:
    // ... (nthread is set for order 4)
    nblock.x = (numAtoms - 1)/nthread.x + 1;
    // ...
    if (periodicY && periodicZ)
      spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicY)
      spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicZ)
      spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else
      spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    break;

  case 6:
    // ... (nthread is set for order 6)
    nblock.x = (numAtoms - 1)/nthread.x + 1;
    // ...
    if (periodicY && periodicZ)
      spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicY)
      spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicZ)
      spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else
      spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    break;

  case 8:
    // ... (nthread is set for order 8)
    nblock.x = (numAtoms - 1)/nthread.x + 1;
    // ...
    if (periodicY && periodicZ)
      spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicY)
      spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicZ)
      spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else
      spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms,
         nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    break;

  default:
    char str[128];   // buffer size assumed
    sprintf(str, "spread_charge, order %d not implemented", order);
    cudaNAMD_bug(str);
  }
}
//
// Host entry point for the reciprocal-space scalar sum.
//
void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
  const int size1, const int size2, const int size3, const double kappa,
  const float recip1x, const float recip1y, const float recip1z,
  const float recip2x, const float recip2y, const float recip2z,
  const float recip3x, const float recip3y, const float recip3z,
  const double volume,
  const float* prefac1, const float* prefac2, const float* prefac3,
  const int k2_00, const int k3_00,
  const bool doEnergyVirial, double* energy, double* virial, float2* data,
  cudaStream_t stream) {

  // ... (nthread and nblock are chosen here)

  // Shared memory: the three prefactor tables, plus reduction space when the
  // energy and virial are requested
  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
  if (doEnergyVirial) {
    const int warpSize = 32;
    shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
  }

  // Ewald factors: fac = pi^2/kappa^2, piv_inv = 1/(pi*V)
  float piv_inv = (float)(1.0/(M_PI*volume));
  float fac = (float)(M_PI*M_PI/(kappa*kappa));

  if (doEnergyVirial) {
    if (orderXYZ) {
      scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
        (nfft1, nfft2, nfft3, size1, size2, size3,
         recip1x, recip1y, recip1z,
         recip2x, recip2y, recip2z,
         recip3x, recip3y, recip3z,
         prefac1, prefac2, prefac3,
         fac, piv_inv, k2_00, k3_00, data, energy, virial);
    } else {
      scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
        (nfft1, nfft2, nfft3, size1, size2, size3,
         recip1x, recip1y, recip1z,
         recip2x, recip2y, recip2z,
         recip3x, recip3y, recip3z,
         prefac1, prefac2, prefac3,
         fac, piv_inv, k2_00, k3_00, data, energy, virial);
    }
  } else {
    if (orderXYZ) {
      scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
        (nfft1, nfft2, nfft3, size1, size2, size3,
         recip1x, recip1y, recip1z,
         recip2x, recip2y, recip2z,
         recip3x, recip3y, recip3z,
         prefac1, prefac2, prefac3,
         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
    } else {
      scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
        (nfft1, nfft2, nfft3, size1, size2, size3,
         recip1x, recip1y, recip1z,
         recip2x, recip2y, recip2z,
         recip3x, recip3y, recip3z,
         prefac1, prefac2, prefac3,
         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
    }
  }
}
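// The per-grid-point factor applied by scalar_sum_kernel corresponds to the smooth-PME
// influence function built from fac and piv_inv above. A scalar host-side sketch of
// that factor for one reciprocal vector m (hypothetical helper; bsp_mod stands for the
// product of the three B-spline prefactor terms, and <cmath>/M_PI are assumed):
static double influence_factor(double msq, double kappa, double volume, double bsp_mod) {
  const double pi = M_PI;
  // exp(-pi^2 |m|^2 / kappa^2) / (pi * V * |m|^2), scaled by the B-spline correction
  return exp(-(pi*pi/(kappa*kappa))*msq) / (pi*volume*msq) * bsp_mod;
}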
//
// Host entry point: gathers forces from the grid, choosing the kernel instantiation
// that matches the interpolation order and periodicity.
//
void gather_force(const float4 *atoms, const int numAtoms,
  const int nfftx, const int nffty, const int nfftz,
  const int xsize, const int ysize, const int zsize,
  const int xdim, const int y00, const int z00,
  const bool periodicY, const bool periodicZ,
  const float* data, const int order, float3* force,
  const cudaTextureObject_t gridTexObj,
  const int stride, cudaStream_t stream) {   // stride/stream parameters assumed from the launches below
  dim3 nthread(32, 2, 1);
  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);

  switch(order) {
  case 4:
    // ...
    if (periodicY && periodicZ)
      gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else if (periodicY)
      gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else if (periodicZ)
      gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else
      gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    break;

  case 6:
    // ...
    if (periodicY && periodicZ)
      gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else if (periodicY)
      gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else if (periodicZ)
      gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else
      gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    break;

  case 8:
    // ...
    if (periodicY && periodicZ)
      gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else if (periodicY)
      gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else if (periodicZ)
      gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    else
      gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
        (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
         data, gridTexObj, stride, force);
    break;

  default:
    char str[128];   // buffer size assumed
    sprintf(str, "gather_force, order %d not implemented", order);
    cudaNAMD_bug(str);
  }
}
//
// Host entry point: transposes a float2 (complex) 3D array from xyz to yzx order.
//
void transpose_xyz_yzx(
  const int nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in,
  const int ysize_out, const int zsize_out,
  const float2* data_in, float2* data_out, cudaStream_t stream) {

  // ... (numthread and numblock are set up to cover the volume in TILEDIM-sized tiles)

  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
    (nx, ny, nz, xsize_in, ysize_in,
     ysize_out, zsize_out,
     data_in, data_out);
}
//
// Host entry point: batched xyz -> yzx transpose; the grid's z dimension covers
// nz slices for each of the numBatches batches.
//
void batchTranspose_xyz_yzx(
  const int numBatches, TransposeBatch<float2>* batches,
  const int max_nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in, cudaStream_t stream) {

  // ... (numthread covers one TILEDIM x TILEDIM tile)
  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);

  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
    (batches, ny, nz, xsize_in, ysize_in);
}
//
// Host entry point: transposes a float2 (complex) 3D array from xyz to zxy order.
//
void transpose_xyz_zxy(
  const int nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in,
  const int zsize_out, const int xsize_out,
  const float2* data_in, float2* data_out, cudaStream_t stream) {

  // ... (numthread and numblock are set up to cover the volume in TILEDIM-sized tiles)

  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
    (nx, ny, nz, xsize_in, ysize_in,
     zsize_out, xsize_out,
     data_in, data_out);
}
//
// Host entry point: batched xyz -> zxy transpose.
//
void batchTranspose_xyz_zxy(
  const int numBatches, TransposeBatch<float2>* batches,
  const int max_nx, const int ny, const int nz,
  const int xsize_in, const int ysize_in, cudaStream_t stream) {

  // ... (numthread covers one TILEDIM x TILEDIM tile)
  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);

  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
    (batches, ny, nz, xsize_in, ysize_in);
}
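// Usage sketch: in the PME pipeline these transposes reorder the complex FFT grid
// between the 1D FFT stages, e.g. after transforming along x the data is put into yzx
// order so that y becomes the contiguous dimension. The argument values below are
// illustrative assumptions, not values from this file:
//   // grid of nfftx/2+1 complex columns, nffty rows, nfftz slices
//   transpose_xyz_yzx(nfftx/2+1, nffty, nfftz,
//                     xsize_in, ysize_in,        // padded input sizes
//                     ysize_out, zsize_out,      // padded output sizes
//                     d_gridIn, d_gridOut, stream);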