CudaPmeSolverUtilKernel.cu
1 #ifdef WIN32
2 #define _USE_MATH_DEFINES
3 #endif
4 #include <math.h>
5 #include <stdio.h>
6 #ifdef NAMD_CUDA
7 #include <cuda.h>
8 #endif
9 #include "CudaUtils.h"
10 #include "CudaPmeSolverUtilKernel.h"
11 
12 #ifdef NAMD_CUDA
13 // CCELEC is 1/ (4 pi eps ) in AKMA units, conversion from SI
14 // units: CCELEC = e*e*Na / (4*pi*eps*1Kcal*1A)
15 //
16 // parameter :: CCELEC=332.0636D0 ! old value of dubious origin
17 // parameter :: CCELEC=331.843D0 ! value from 1986-1987 CRC Handbook
18 // ! of Chemistry and Physics
19 // real(chm_real), parameter :: &
20 // CCELEC_amber = 332.0522173D0, &
21 // CCELEC_charmm = 332.0716D0 , &
22 // CCELEC_discover = 332.054D0 , &
23 // CCELEC_namd = 332.0636D0
24 //const double ccelec = 332.0636;
25 //const double half_ccelec = 0.5*ccelec;
26 //const float ccelec_float = 332.0636f;
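//
// For reference, the CCELEC_namd value above follows from CCELEC = e*e*Na/(4*pi*eps0*1kcal*1A);
// a minimal host-side sketch with CODATA constants (illustration only, not used by the kernels):
//
// constexpr double e_charge   = 1.602176634e-19;  // elementary charge [C]
// constexpr double n_avogadro = 6.02214076e23;    // Avogadro constant [1/mol]
// constexpr double eps0       = 8.8541878128e-12; // vacuum permittivity [F/m]
// constexpr double kcal       = 4184.0;           // [J]
// constexpr double angstrom   = 1.0e-10;          // [m]
// constexpr double ccelec_check =
//   e_charge*e_charge*n_avogadro/(4.0*M_PI*eps0*kcal*angstrom);  // ~= 332.06 kcal*A/(mol*e^2)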
27 
28 /*
29 // Structure into which virials are stored
30 // NOTE: This structure is only used for computing addresses
31 struct Virial_t {
32  double sforce_dp[27][3];
33  long long int sforce_fp[27][3];
34  double virmat[9];
35  // Energies start here ...
36 };
37 */
38 
39 // Local structure used by the scalar_sum kernel for energy and virial reductions
40 struct RecipVirial_t {
41  double energy;
42  double virial[6];
43 };
44 
45 
46 __forceinline__ __device__ double expfunc(const double x) {
47  return exp(x);
48 }
49 
50 __forceinline__ __device__ float expfunc(const float x) {
51  return __expf(x);
52 }
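// NOTE: the float overload above uses the fast __expf() hardware intrinsic,
// trading a few ulps of accuracy for speed; the double overload uses the
// accurate exp() from the math library.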
53 
54 //
55 // Performs scalar sum on data(nfft1, nfft2, nfft3)
56 // T = float or double
57 // T2 = float2 or double2
58 //
59 template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
60 __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
61  const int size1, const int size2, const int size3,
62  const T recip1x, const T recip1y, const T recip1z,
63  const T recip2x, const T recip2y, const T recip2z,
64  const T recip3x, const T recip3y, const T recip3z,
65  const T* prefac1, const T* prefac2, const T* prefac3,
66  const T pi_ewald, const T piv_inv,
67  const int k2_00, const int k3_00,
68  T2* data,
69  double* __restrict__ energy_recip,
70  double* __restrict__ virial) {
71  // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
72  extern __shared__ T sh_prefac[];
73 
74  // Create pointers to shared memory
75  T* sh_prefac1 = (T *)&sh_prefac[0];
76  T* sh_prefac2 = (T *)&sh_prefac[nfft1];
77  T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
78 
79  // Calculate start position (k1, k2, k3) for each thread
80  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
81  int k3 = tid/(size1*size2);
82  tid -= k3*size1*size2;
83  int k2 = tid/size1;
84  int k1 = tid - k2*size1;
85 
86  // Starting position in data
87  int pos = k1 + (k2 + k3*size2)*size1;
88 
89  // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
90  k2 += k2_00;
91  k3 += k3_00;
92 
93  // Calculate limits w.r.t. global coordinates
94  const int lim2 = size2 + k2_00;
95  const int lim3 = size3 + k3_00;
96 
97  // Calculate increments (k1_inc, k2_inc, k3_inc)
98  int tot_inc = blockDim.x*gridDim.x;
99  const int k3_inc = tot_inc/(size1*size2);
100  tot_inc -= k3_inc*size1*size2;
101  const int k2_inc = tot_inc/size1;
102  const int k1_inc = tot_inc - k2_inc*size1;
103 
104  // Set data[0] = 0 for the global (0,0,0)
105  if (k1 == 0 && k2 == 0 && k3 == 0) {
106  T2 zero;
107  zero.x = (T)0;
108  zero.y = (T)0;
109  data[0] = zero;
110  // Increment position
111  k1 += k1_inc;
112  pos += k1_inc;
113  if (k1 >= size1) {
114  k1 -= size1;
115  k2++;
116  }
117  k2 += k2_inc;
118  pos += k2_inc*size1;
119  if (k2 >= lim2) {
120  k2 -= size2;
121  k3++;
122  }
123  k3 += k3_inc;
124  pos += k3_inc*size1*size2;
125  }
126 
127  // Load prefac data into shared memory
128  {
129  int t = threadIdx.x;
130  while (t < nfft1) {
131  sh_prefac1[t] = prefac1[t];
132  t += blockDim.x;
133  }
134  t = threadIdx.x;
135  while (t < nfft2) {
136  sh_prefac2[t] = prefac2[t];
137  t += blockDim.x;
138  }
139  t = threadIdx.x;
140  while (t < nfft3) {
141  sh_prefac3[t] = prefac3[t];
142  t += blockDim.x;
143  }
144  }
145  BLOCK_SYNC;
146 
147  double energy = 0.0;
148  double virial0 = 0.0;
149  double virial1 = 0.0;
150  double virial2 = 0.0;
151  double virial3 = 0.0;
152  double virial4 = 0.0;
153  double virial5 = 0.0;
154 
155  // If nfft1 is odd, set nfft1_half to impossible value so that
156  // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
157  // is never satisfied
158  const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
159  const int nfft2_half = nfft2/2;
160  const int nfft3_half = nfft3/2;
161 
162  while (k3 < lim3) {
163 
164  T2 q = data[pos];
165 
166  int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
167  int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
168 
169  T m1, m2, m3;
170  if (doOrtho) {
171  m1 = recip1x*k1;
172  m2 = recip2y*k2s;
173  m3 = recip3z*k3s;
174  } else {
175  m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
176  m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
177  m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
178  }
179 
180  T msq = m1*m1 + m2*m2 + m3*m3;
181  T msq_inv = ((T)1)/msq;
182 
183  T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
184  T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
185  if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
186  T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
187  T C = xp3*msq_inv;
188  T theta = theta3*C;
189 
190  if (calc_energy_virial) {
191  T fac = q2*C;
192  T vir = ((T)2)*(pi_ewald + msq_inv);
193  energy += fac;
194  virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
195  virial1 += (double)(fac*vir*m1*m2);
196  virial2 += (double)(fac*vir*m1*m3);
197  virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
198  virial4 += (double)(fac*vir*m2*m3);
199  virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
200  }
201 
202  q.x *= theta;
203  q.y *= theta;
204  data[pos] = q;
205 
206  // Increment position
207  k1 += k1_inc;
208  pos += k1_inc;
209  if (k1 >= size1) {
210  k1 -= size1;
211  k2++;
212  }
213  k2 += k2_inc;
214  pos += k2_inc*size1;
215  if (k2 >= lim2) {
216  k2 -= size2;
217  k3++;
218  }
219  k3 += k3_inc;
220  pos += k3_inc*size2*size1;
221  }
222 
223  // Reduce energy and virial
224  if (calc_energy_virial) {
225  const int tid = threadIdx.x & (warpSize-1);
226  const int base = (threadIdx.x/warpSize);
227  volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
228  // Reduce within warps
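 // NOTE: WARP_SHUFFLE operates on 32-bit values, so each double is shuffled
 // as its high and low halves (__double2hiint/__double2loint) and reassembled
 // with __hiloint2double before being accumulated.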
229  for (int d=warpSize/2;d >= 1;d /= 2) {
230  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
231  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
232  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
233  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
234  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
235  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
236  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
237  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
238  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
239  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
240  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
241  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
242  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
243  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
244  }
245  // Reduce between warps
246  // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
247  BLOCK_SYNC;
248  if (tid == 0) {
249  sh_ev[base].energy = energy;
250  sh_ev[base].virial[0] = virial0;
251  sh_ev[base].virial[1] = virial1;
252  sh_ev[base].virial[2] = virial2;
253  sh_ev[base].virial[3] = virial3;
254  sh_ev[base].virial[4] = virial4;
255  sh_ev[base].virial[5] = virial5;
256  }
257  BLOCK_SYNC;
258  if (base == 0) {
259  energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
260  virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
261  virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
262  virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
263  virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
264  virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
265  virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
266  for (int d=warpSize/2;d >= 1;d /= 2) {
267  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
268  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
269  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
270  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
271  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
272  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
273  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
274  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
275  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
276  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
277  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
278  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
279  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
280  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
281  }
282  }
283 
284  if (threadIdx.x == 0) {
285  atomicAdd(energy_recip, energy*0.5);
286  virial0 *= -0.5;
287  virial1 *= -0.5;
288  virial2 *= -0.5;
289  virial3 *= -0.5;
290  virial4 *= -0.5;
291  virial5 *= -0.5;
292  atomicAdd(&virial[0], virial0);
293  atomicAdd(&virial[1], virial1);
294  atomicAdd(&virial[2], virial2);
295  atomicAdd(&virial[3], virial3);
296  atomicAdd(&virial[4], virial4);
297  atomicAdd(&virial[5], virial5);
298  }
299 
300  }
301 
302 }
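//
// The kernel above converts a flat thread index into (k1, k2, k3) digits of
// mixed radix (size1, size2, size3), and likewise splits the total grid stride
// blockDim.x*gridDim.x into (k1_inc, k2_inc, k3_inc). A host-side sketch of the
// same decomposition (illustration only; the name is arbitrary):
//
// static void flat_to_k123(int tid, const int size1, const int size2,
//                          int& k1, int& k2, int& k3) {
//   k3 = tid/(size1*size2);
//   tid -= k3*size1*size2;
//   k2 = tid/size1;
//   k1 = tid - k2*size1;
// }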
303 
304 template <typename T>
305 __forceinline__ __device__ void write_grid(const float val, const int ind,
306  T* data) {
307  atomicAdd(&data[ind], (T)val);
308 }
309 
310 //
311 // General version for any order
312 //
313 template <typename T, int order>
314 __forceinline__ __device__ void calc_one_theta(const T w, T *theta) {
315 
316  theta[order-1] = ((T)0);
317  theta[1] = w;
318  theta[0] = ((T)1) - w;
319 
320 #pragma unroll
321  for (int k=3;k <= order-1;k++) {
322  T div = ((T)1) / (T)(k-1);
323  theta[k-1] = div*w*theta[k-2];
324 #pragma unroll
325  for (int j=1;j <= k-2;j++) {
326  theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
327  }
328  theta[0] = div*(((T)1) - w)*theta[0];
329  }
330 
331  //--- one more recursion
332  T div = ((T)1) / (T)(order-1);
333  theta[order-1] = div*w*theta[order-2];
334 #pragma unroll
335  for (int j=1;j <= order-2;j++) {
336  theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
337  }
338 
339  theta[0] = div*(((T)1) - w)*theta[0];
340 }
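//
// The recursion above builds the "order" cardinal B-spline weights at fractional
// offset w; for any w in [0,1) the weights sum to 1. A host-side spot check that
// mirrors the device recursion unrolled for order = 4 (illustration only):
//
// static void calc_one_theta_host4(const double w, double theta[4]) {
//   theta[3] = 0.0; theta[1] = w; theta[0] = 1.0 - w;
//   double div = 1.0/2.0;                               // k = 3
//   theta[2] = div*w*theta[1];
//   theta[1] = div*((w + 1.0)*theta[0] + (2.0 - w)*theta[1]);
//   theta[0] = div*(1.0 - w)*theta[0];
//   div = 1.0/3.0;                                      // final recursion
//   theta[3] = div*w*theta[2];
//   theta[2] = div*((w + 1.0)*theta[1] + (3.0 - w)*theta[2]);
//   theta[1] = div*((w + 2.0)*theta[0] + (2.0 - w)*theta[1]);
//   theta[0] = div*(1.0 - w)*theta[0];
//   // theta[0] + theta[1] + theta[2] + theta[3] == 1 up to rounding,
//   // e.g. w = 0 gives {1/6, 2/3, 1/6, 0}
// }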
341 
342 //
343 // Calculate theta and dtheta for general order bspline
344 //
345 template <typename T, typename T3, int order>
346 __forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {
347 
348  theta[order-1].x = ((T)0);
349  theta[order-1].y = ((T)0);
350  theta[order-1].z = ((T)0);
351  theta[1].x = wx;
352  theta[1].y = wy;
353  theta[1].z = wz;
354  theta[0].x = ((T)1) - wx;
355  theta[0].y = ((T)1) - wy;
356  theta[0].z = ((T)1) - wz;
357 
358 #pragma unroll
359  for (int k=3;k <= order-1;k++) {
360  T div = ((T)1) / (T)(k-1);
361  theta[k-1].x = div*wx*theta[k-2].x;
362  theta[k-1].y = div*wy*theta[k-2].y;
363  theta[k-1].z = div*wz*theta[k-2].z;
364 #pragma unroll
365  for (int j=1;j <= k-2;j++) {
366  theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
367  theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
368  theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
369  }
370  theta[0].x = div*(((T)1) - wx)*theta[0].x;
371  theta[0].y = div*(((T)1) - wy)*theta[0].y;
372  theta[0].z = div*(((T)1) - wz)*theta[0].z;
373  }
374 
375  //--- perform standard b-spline differentiation
376  dtheta[0].x = -theta[0].x;
377  dtheta[0].y = -theta[0].y;
378  dtheta[0].z = -theta[0].z;
379 #pragma unroll
380  for (int j=2;j <= order;j++) {
381  dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
382  dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
383  dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
384  }
385 
386  //--- one more recursion
387  T div = ((T)1) / (T)(order-1);
388  theta[order-1].x = div*wx*theta[order-2].x;
389  theta[order-1].y = div*wy*theta[order-2].y;
390  theta[order-1].z = div*wz*theta[order-2].z;
391 #pragma unroll
392  for (int j=1;j <= order-2;j++) {
393  theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
394  theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
395  theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
396  }
397 
398  theta[0].x = div*(((T)1) - wx)*theta[0].x;
399  theta[0].y = div*(((T)1) - wy)*theta[0].y;
400  theta[0].z = div*(((T)1) - wz)*theta[0].z;
401 }
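//
// NOTE: dtheta is formed from differences of the order-1 weights before the
// final recursion step, i.e. the standard B-spline derivative identity
// M_n'(u) = M_{n-1}(u) - M_{n-1}(u-1). Since theta[order-1] is still zero at
// that point, the dtheta entries of each atom sum exactly to zero.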
402 
403 //
404 // Spreads the charge on the grid. Calculates theta and dtheta on the fly
405 // blockDim.x = Number of atoms each block loads
406 // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once
407 //
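// With the launch configurations set up in spread_charge() below
// (blockDim.x = 32 and blockDim.y = 4, 7, 16 for order 4, 6, 8), order3 in the
// kernel rounds order^3 up to a multiple of warpSize, giving:
// order = 4: order3 = 64, 128 threads, 2 atoms spread per iteration
// order = 6: order3 = 224, 224 threads, 1 atom spread per iteration
// order = 8: order3 = 512, 512 threads, 1 atom spread per iteration
//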
408 template <typename AT, int order, bool periodicY, bool periodicZ>
409 __global__ void
410 spread_charge_kernel(const float4 *xyzq, const int ncoord,
411  const int nfftx, const int nffty, const int nfftz,
412  const int xsize, const int ysize, const int zsize,
413  const int xdim, const int y00, const int z00,
414  AT* data) {
415 
416  // Shared memory use:
417  // order = 4: 1920 bytes
418  // order = 6: 2688 bytes
419  // order = 8: 3456 bytes
420  __shared__ int sh_ix[32];
421  __shared__ int sh_iy[32];
422  __shared__ int sh_iz[32];
423  __shared__ float sh_thetax[order*32];
424  __shared__ float sh_thetay[order*32];
425  __shared__ float sh_thetaz[order*32];
426 
427  // Process atoms pos to pos_end-1 (blockDim.x)
428  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
429  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
430 
431  if (pos < pos_end && threadIdx.y == 0) {
432 
433  float4 xyzqi = xyzq[pos];
434  float x = xyzqi.x;
435  float y = xyzqi.y;
436  float z = xyzqi.z;
437  float q = xyzqi.w;
438 
439  float frx = ((float)nfftx)*x;
440  float fry = ((float)nffty)*y;
441  float frz = ((float)nfftz)*z;
442 
443  int frxi = (int)frx;
444  int fryi = (int)fry;
445  int frzi = (int)frz;
446 
447  float wx = frx - (float)frxi;
448  float wy = fry - (float)fryi;
449  float wz = frz - (float)frzi;
450 
451  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
452  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
453 
454  sh_ix[threadIdx.x] = frxi;
455  sh_iy[threadIdx.x] = fryi - y00;
456  sh_iz[threadIdx.x] = frzi - z00;
457 
458  float theta[order];
459 
460  calc_one_theta<float, order>(wx, theta);
461 #pragma unroll
462  for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
463 
464  calc_one_theta<float, order>(wy, theta);
465 #pragma unroll
466  for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
467 
468  calc_one_theta<float, order>(wz, theta);
469 #pragma unroll
470  for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
471 
472  }
473 
474  BLOCK_SYNC;
475 
476  // Grid point location, values of (x0, y0, z0) are in range 0..order-1
477  // NOTE: Only tid=0...order*order*order-1 do any computation
478  const int order3 = ((order*order*order-1)/warpSize + 1)*warpSize;
479  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
480  const int x0 = tid % order;
481  const int y0 = (tid / order) % order;
482  const int z0 = tid / (order*order);
483 
484  // Loop over atoms pos..pos_end-1
485  int iadd = blockDim.x*blockDim.y/order3;
486  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
487  int iend = pos_end - blockIdx.x*blockDim.x;
488  for (;i < iend;i += iadd) {
489  int x = sh_ix[i] + x0;
490  int y = sh_iy[i] + y0;
491  int z = sh_iz[i] + z0;
492 
493  if (x >= nfftx) x -= nfftx;
494 
495  if (periodicY && (y >= nffty)) y -= nffty;
496  if (!periodicY && (y < 0 || y >= ysize)) continue;
497 
498  if (periodicZ && (z >= nfftz)) z -= nfftz;
499  if (!periodicZ && (z < 0 || z >= zsize)) continue;
500 
501  // Get position on the grid
502  int ind = x + xdim*(y + ysize*(z));
503 
504  // For order=6, for example, the 6x6x6 = 216 grid points are covered by
505  // order3 = 7*32 = 224 threads (order^3 rounded up to a multiple of warpSize)
506  // Calculate interpolated charge value and store it to global memory
507 
508  if (tid < order*order*order)
509  write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
510  }
511 
512 }
513 
514 //-----------------------------------------------------------------------------------------
515 // Generic version cannot be used (empty stub); only the specializations below are valid
516 template <typename T> __forceinline__ __device__
517 void gather_force_store(const float fx, const float fy, const float fz,
518  const int stride, const int pos,
519  T* force) {
520 }
521 
522 // Template specialization for "float"
523 template <> __forceinline__ __device__
524 void gather_force_store<float>(const float fx, const float fy, const float fz,
525  const int stride, const int pos,
526  float* force) {
527  // Store into non-strided float XYZ array
528  force[pos] = fx;
529  force[pos+stride] = fy;
530  force[pos+stride*2] = fz;
531 }
532 
533 // Template specialization for "float3"
534 template <> __forceinline__ __device__
535 void gather_force_store<float3>(const float fx, const float fy, const float fz,
536  const int stride, const int pos,
537  float3* force) {
538  // Store into non-strided "float3" array
539  force[pos].x = fx;
540  force[pos].y = fy;
541  force[pos].z = fz;
542 }
543 //-----------------------------------------------------------------------------------------
544 
545 // Per-atom data structure for the gather_force kernels
546 template <typename T, int order>
547 struct gather_t {
548  int ix;
549  int iy;
550  int iz;
551  T charge;
552  T thetax[order];
553  T thetay[order];
554  T thetaz[order];
555  T dthetax[order];
556  T dthetay[order];
557  T dthetaz[order];
558  float f1;
559  float f2;
560  float f3;
561 };
562 
563 //
564 // Gathers forces from the grid
565 // blockDim.x = Number of atoms each block loads
566 // blockDim.x*blockDim.y = Total number of threads per block
567 //
568 template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
569 __global__ void gather_force(const float4 *xyzq, const int ncoord,
570  const int nfftx, const int nffty, const int nfftz,
571  const int xsize, const int ysize, const int zsize,
572  const int xdim, const int y00, const int z00,
573  // const float recip1, const float recip2, const float recip3,
574  const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350
575  const cudaTextureObject_t gridTexObj,
576  const int stride,
577  FT *force) {
578 
579  const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
580 
581  // Shared memory
582  __shared__ gather_t<CT, order> shmem[32];
583 
584  const int pos = blockIdx.x*blockDim.x + threadIdx.x;
585  const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
586 
587  // Load atom data into shared memory
588  if (pos < pos_end && threadIdx.y == 0) {
589 
590  float4 xyzqi = xyzq[pos];
591  float x = xyzqi.x;
592  float y = xyzqi.y;
593  float z = xyzqi.z;
594  float q = xyzqi.w;
595 
596  float frx = ((float)nfftx)*x;
597  float fry = ((float)nffty)*y;
598  float frz = ((float)nfftz)*z;
599 
600  int frxi = (int)frx;
601  int fryi = (int)fry;
602  int frzi = (int)frz;
603 
604  float wx = frx - (float)frxi;
605  float wy = fry - (float)fryi;
606  float wz = frz - (float)frzi;
607 
608  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
609  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
610 
611  shmem[threadIdx.x].ix = frxi;
612  shmem[threadIdx.x].iy = fryi - y00;
613  shmem[threadIdx.x].iz = frzi - z00;
614  shmem[threadIdx.x].charge = q;
615 
616  float3 theta_tmp[order];
617  float3 dtheta_tmp[order];
618  calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
619 
620 #pragma unroll
621  for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
622 
623 #pragma unroll
624  for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
625 
626 #pragma unroll
627  for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
628 
629 #pragma unroll
630  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
631 
632 #pragma unroll
633  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
634 
635 #pragma unroll
636  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
637 
638  }
639  BLOCK_SYNC;
640 
641  // We divide the order x order x order cube into 8 sub-cubes.
642 // Each sub-cube is handled by a single thread
643  // The size of the sub-cubes is:
644  // order=4 : 2x2x2
645  // order=6 : 3x3x3
646  // order=8 : 4x4x4
647  const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
648  // Calculate the starting index on the sub-cube for this thread
649  // tid = 0...63
650  const int t = (tid % 8); // sub-cube index (0...7)
651  // t = (tx0 + ty0*2 + tz0*4)/nsc
652  // (tx0, ty0, tz0) gives the starting index of the 3x3x3 sub-cube
653  const int tz0 = (t / 4)*nsc;
654  const int ty0 = ((t / 2) % 2)*nsc;
655  const int tx0 = (t % 2)*nsc;
656 
657  //
658  // Calculate forces for 32 atoms. We have 32*2 = 64 threads
659  // Loop is iterated 4 times:
660  // (iterations)
661  // Threads 0...7 = atoms 0, 8, 16, 24
662  // Threads 8...15 = atoms 1, 9, 17, 25
663 // Threads 16...23 = atoms 2, 10, 18, 26
664  // ...
665  // Threads 56...63 = atoms 7, 15, 23, 31
666  //
667 
668  int base = tid/8;
669  const int base_end = pos_end - blockIdx.x*blockDim.x;
670 
671  // Make sure to mask out any threads that are not running the loop.
672  // This will happen if the number of atoms is not a multiple of 32.
673  int warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
674 
675  while (base < base_end) {
676 
677  float f1 = 0.0f;
678  float f2 = 0.0f;
679  float f3 = 0.0f;
680  int ix0 = shmem[base].ix;
681  int iy0 = shmem[base].iy;
682  int iz0 = shmem[base].iz;
683 
684  // Each thread calculates a nsc x nsc x nsc sub-cube
685 #pragma unroll
686  for (int i=0;i < nsc*nsc*nsc;i++) {
687  int tz = tz0 + (i/(nsc*nsc));
688  int ty = ty0 + ((i/nsc) % nsc);
689  int tx = tx0 + (i % nsc);
690 
691  int ix = ix0 + tx;
692  int iy = iy0 + ty;
693  int iz = iz0 + tz;
694  if (ix >= nfftx) ix -= nfftx;
695 
696  if (periodicY && (iy >= nffty)) iy -= nffty;
697  if (!periodicY && (iy < 0 || iy >= ysize)) continue;
698 
699  if (periodicZ && (iz >= nfftz)) iz -= nfftz;
700  if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
701 
702  int ind = ix + (iy + iz*ysize)*xdim;
703 
704 #if __CUDA_ARCH__ >= 350
705  float q0 = __ldg(&data[ind]);
706 #else
707  float q0 = tex1Dfetch<float>(gridTexObj, ind);
708 #endif
709  float thx0 = shmem[base].thetax[tx];
710  float thy0 = shmem[base].thetay[ty];
711  float thz0 = shmem[base].thetaz[tz];
712  float dthx0 = shmem[base].dthetax[tx];
713  float dthy0 = shmem[base].dthetay[ty];
714  float dthz0 = shmem[base].dthetaz[tz];
715  f1 += dthx0 * thy0 * thz0 * q0;
716  f2 += thx0 * dthy0 * thz0 * q0;
717  f3 += thx0 * thy0 * dthz0 * q0;
718  }
719 
720  //-------------------------
721 
722  // Reduce
723  const int i = threadIdx.x & 7;
724 
725  f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
726  f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
727  f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
728 
729  f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
730  f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
731  f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
732 
733  f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
734  f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
735  f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
736 
737  if (i == 0) {
738  shmem[base].f1 = f1;
739  shmem[base].f2 = f2;
740  shmem[base].f3 = f3;
741  }
742 
743  base += 8;
744  warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
745  }
746 
747  // Write forces
748  BLOCK_SYNC;
749  if (pos < pos_end && threadIdx.y == 0) {
750  float f1 = shmem[threadIdx.x].f1;
751  float f2 = shmem[threadIdx.x].f2;
752  float f3 = shmem[threadIdx.x].f3;
753  float q = -shmem[threadIdx.x].charge; //*ccelec_float;
754  // float fx = q*recip1*f1*nfftx;
755  // float fy = q*recip2*f2*nffty;
756  // float fz = q*recip3*f3*nfftz;
757  float fx = q*f1*nfftx;
758  float fy = q*f2*nffty;
759  float fz = q*f3*nfftz;
760  gather_force_store<FT>(fx, fy, fz, stride, pos, force);
761  }
762 
763 }
764 
765 const int TILEDIM = 32;
766 const int TILEROWS = 8;
767 
768 template <typename T>
769 __device__ __forceinline__
770 void transpose_xyz_yzx_device(
771  const int x_in, const int y_in, const int z_in,
772  const int x_out, const int y_out,
773  const int nx, const int ny, const int nz,
774  const int xsize_in, const int ysize_in,
775  const int ysize_out, const int zsize_out,
776  const T* data_in, T* data_out) {
777 
778  // Shared memory
779  __shared__ T tile[TILEDIM][TILEDIM+1];
780 
781  // Read (x,y) data_in into tile (shared memory)
782  for (int j=0;j < TILEDIM;j += TILEROWS)
783  if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
784  tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
785 
786  BLOCK_SYNC;
787 
788  // Write (y,x) tile into data_out
789  const int z_out = z_in;
790  for (int j=0;j < TILEDIM;j += TILEROWS)
791  if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
792  data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
793 }
794 
795 //
796 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
797 //
798 template <typename T>
799 __global__ void transpose_xyz_yzx_kernel(
800  const int nx, const int ny, const int nz,
801  const int xsize_in, const int ysize_in,
802  const int ysize_out, const int zsize_out,
803  const T* data_in, T* data_out) {
804 
805  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
806  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
807  int z_in = blockIdx.z + threadIdx.z;
808 
809  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
810  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
811 
812  transpose_xyz_yzx_device<T>(
813  x_in, y_in, z_in,
814  x_out, y_out,
815  nx, ny, nz,
816  xsize_in, ysize_in,
817  ysize_out, zsize_out,
818  data_in, data_out);
819 
820 /*
821  // Shared memory
822  __shared__ T tile[TILEDIM][TILEDIM+1];
823 
824  int x = blockIdx.x * TILEDIM + threadIdx.x;
825  int y = blockIdx.y * TILEDIM + threadIdx.y;
826  int z = blockIdx.z + threadIdx.z;
827 
828  // Read (x,y) data_in into tile (shared memory)
829  for (int j=0;j < TILEDIM;j += TILEROWS)
830  if ((x < nx) && (y + j < ny) && (z < nz))
831  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
832 
833  BLOCK_SYNC;
834 
835  // Write (y,x) tile into data_out
836  x = blockIdx.x * TILEDIM + threadIdx.y;
837  y = blockIdx.y * TILEDIM + threadIdx.x;
838  for (int j=0;j < TILEDIM;j += TILEROWS)
839  if ((x + j < nx) && (y < ny) && (z < nz))
840  data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
841 */
842 }
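//
// Index mapping used above: the input element at x + (y + z*ysize_in)*xsize_in
// ends up at y + (z + x*zsize_out)*ysize_out. A scalar host reference of the
// same mapping (illustration only; the name is arbitrary):
//
// template <typename T>
// void transpose_xyz_yzx_host(const int nx, const int ny, const int nz,
//                             const int xsize_in, const int ysize_in,
//                             const int ysize_out, const int zsize_out,
//                             const T* data_in, T* data_out) {
//   for (int z=0;z < nz;z++)
//     for (int y=0;y < ny;y++)
//       for (int x=0;x < nx;x++)
//         data_out[y + (z + x*zsize_out)*ysize_out] =
//           data_in[x + (y + z*ysize_in)*xsize_in];
// }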
843 
844 //
845 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(y, z, x)
846 // Batch index bi is encoded in blockIdx.z, where
847 // blockIdx.z = 0...nz-1 are for batch 1
848 // blockIdx.z = nz...2*nz-1 are for batch 2
849 // ...
850 // gridDim.z = nz*numBatches
851 //
852 template <typename T>
853 __global__ void batchTranspose_xyz_yzx_kernel(
854  const TransposeBatch<T>* batches,
855  const int ny, const int nz,
856  const int xsize_in, const int ysize_in) {
857 
858  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
859  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
860  int z_in = (blockIdx.z % nz) + threadIdx.z;
861 
862  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
863  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
864 
865  int bi = blockIdx.z/nz;
866 
867  TransposeBatch<T> batch = batches[bi];
868  int nx = batch.nx;
869  int ysize_out = batch.ysize_out;
870  int zsize_out = batch.zsize_out;
871  T* data_in = batch.data_in;
872  T* data_out = batch.data_out;
873 
874  transpose_xyz_yzx_device<T>(
875  x_in, y_in, z_in,
876  x_out, y_out,
877  nx, ny, nz,
878  xsize_in, ysize_in,
879  ysize_out, zsize_out,
880  data_in, data_out);
881 
882 }
883 
884 /*
885 //
886 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
887 //
888 template <typename T>
889 __forceinline__ __device__
890 void transpose_xyz_yzx_dev(
891  const int blockz,
892  const int nx, const int ny, const int nz,
893  const int xsize_in, const int ysize_in,
894  const int xsize_out, const int ysize_out,
895  const T* data_in, T* data_out) {
896 
897  // Shared memory
898  __shared__ T tile[TILEDIM][TILEDIM+1];
899 
900  int x = blockIdx.x * TILEDIM + threadIdx.x;
901  int y = blockIdx.y * TILEDIM + threadIdx.y;
902  // int z = blockIdx.z + threadIdx.z;
903  int z = blockz + threadIdx.z;
904 
905  // Read (x,y) data_in into tile (shared memory)
906  for (int j=0;j < TILEDIM;j += TILEROWS)
907  if ((x < nx) && (y + j < ny) && (z < nz))
908  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
909 
910  BLOCK_SYNC;
911 
912  // Write (y,x) tile into data_out
913  x = blockIdx.x * TILEDIM + threadIdx.y;
914  y = blockIdx.y * TILEDIM + threadIdx.x;
915  for (int j=0;j < TILEDIM;j += TILEROWS)
916  if ((x + j < nx) && (y < ny) && (z < nz))
917  data_out[y + (z + (x+j)*ysize_out)*xsize_out] = tile[threadIdx.x][threadIdx.y + j];
918 
919 }
920 
921 //
922 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
923 // (nx, ny, nz) = size of the transposed volume
924 // (xsize_in, ysize_in, zsize_in) = size of the input data
925 // into nblock memory blocks
926 //
927 template <typename T>
928 __global__ void transpose_xyz_yzx_kernel(
929  const int nblock,
930  const int nx, const int ny, const int nz,
931  const int xsize_in, const int ysize_in,
932  const int xsize_out, const int ysize_out,
933  const T* data_in, T* data_out) {
934 
935  const int iblock = blockIdx.z/nz;
936 
937  if (iblock < nblock) {
938  transpose_xyz_yzx_dev(blockIdx.z % nz, nx, ny, nz,
939  xsize_in, ysize_in, xsize_out, ysize_out,
940  data_in, data_out);
941  }
942 
943 }
944 */
945 
946 template <typename T>
947 __device__ __forceinline__
948 void transpose_xyz_zxy_device(
949  const int x_in, const int y_in, const int z_in,
950  const int x_out, const int z_out,
951  const int nx, const int ny, const int nz,
952  const int xsize_in, const int ysize_in,
953  const int zsize_out, const int xsize_out,
954  const T* data_in, T* data_out) {
955 
956  // Shared memory
957  __shared__ T tile[TILEDIM][TILEDIM+1];
958 
959  // Read (x,z) data_in into tile (shared memory)
960  for (int k=0;k < TILEDIM;k += TILEROWS)
961  if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
962  tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
963 
964  BLOCK_SYNC;
965 
966  // Write (z,x) tile into data_out
967  const int y_out = y_in;
968  for (int k=0;k < TILEDIM;k += TILEROWS)
969  if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
970  data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
971 }
972 
973 //
974 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(z, x, y)
975 //
976 template <typename T>
977 __global__ void transpose_xyz_zxy_kernel(
978  const int nx, const int ny, const int nz,
979  const int xsize_in, const int ysize_in,
980  const int zsize_out, const int xsize_out,
981  const T* data_in, T* data_out) {
982 
983  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
984  int y_in = blockIdx.z + threadIdx.z;
985  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
986 
987  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
988  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
989 
990  transpose_xyz_zxy_device<T>(
991  x_in, y_in, z_in, x_out, z_out,
992  nx, ny, nz,
993  xsize_in, ysize_in,
994  zsize_out, xsize_out,
995  data_in, data_out);
996 
997 }
998 
999 //
1000 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1001 // Batch index bi is encoded in blockIdx.z, where
1002 // blockIdx.z = 0...ny-1 are for batch 1
1003 // blockIdx.z = ny...2*ny-1 are for batch 2
1004 // ...
1005 // gridDim.z = ny*numBatches
1006 //
1007 template <typename T>
1008 __global__ void batchTranspose_xyz_zxy_kernel(
1009  const TransposeBatch<T>* batches,
1010  const int ny, const int nz,
1011  const int xsize_in, const int ysize_in) {
1012 
1013  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1014  int y_in = (blockIdx.z % ny) + threadIdx.z;
1015  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1016 
1017  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1018  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1019 
1020  int bi = blockIdx.z/ny;
1021 
1022  TransposeBatch<T> batch = batches[bi];
1023  int nx = batch.nx;
1024  int zsize_out = batch.zsize_out;
1025  int xsize_out = batch.xsize_out;
1026  T* data_in = batch.data_in;
1027  T* data_out = batch.data_out;
1028 
1029  transpose_xyz_zxy_device<T>(
1030  x_in, y_in, z_in, x_out, z_out,
1031  nx, ny, nz,
1032  xsize_in, ysize_in,
1033  zsize_out, xsize_out,
1034  data_in, data_out);
1035 
1036 }
1037 
1038 //#######################################################################################
1039 //#######################################################################################
1040 //#######################################################################################
1041 
1042 void spread_charge(const float4 *atoms, const int numAtoms,
1043  const int nfftx, const int nffty, const int nfftz,
1044  const int xsize, const int ysize, const int zsize,
1045  const int xdim, const int y00, const int z00,
1046  const bool periodicY, const bool periodicZ,
1047  float* data, const int order, cudaStream_t stream) {
1048 
1049  dim3 nthread, nblock;
1050 
1051  switch(order) {
1052  case 4:
1053  nthread.x = 32;
1054  nthread.y = 4;
1055  nthread.z = 1;
1056  nblock.x = (numAtoms - 1)/nthread.x + 1;
1057  nblock.y = 1;
1058  nblock.z = 1;
1059  if (periodicY && periodicZ)
1060  spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1061  (atoms, numAtoms,
1062  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1063  else if (periodicY)
1064  spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1065  (atoms, numAtoms,
1066  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1067  else if (periodicZ)
1068  spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1069  (atoms, numAtoms,
1070  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1071  else
1072  spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1073  (atoms, numAtoms,
1074  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1075  break;
1076 
1077  case 6:
1078  nthread.x = 32;
1079  nthread.y = 7;
1080  nthread.z = 1;
1081  nblock.x = (numAtoms - 1)/nthread.x + 1;
1082  nblock.y = 1;
1083  nblock.z = 1;
1084  if (periodicY && periodicZ)
1085  spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1086  (atoms, numAtoms,
1087  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1088  else if (periodicY)
1089  spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1090  (atoms, numAtoms,
1091  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1092  else if (periodicZ)
1093  spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1094  (atoms, numAtoms,
1095  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1096  else
1097  spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1098  (atoms, numAtoms,
1099  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1100  break;
1101 
1102  case 8:
1103  nthread.x = 32;
1104  nthread.y = 16;
1105  nthread.z = 1;
1106  nblock.x = (numAtoms - 1)/nthread.x + 1;
1107  nblock.y = 1;
1108  nblock.z = 1;
1109  if (periodicY && periodicZ)
1110  spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1111  (atoms, numAtoms,
1112  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1113  else if (periodicY)
1114  spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1115  (atoms, numAtoms,
1116  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1117  else if (periodicZ)
1118  spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1119  (atoms, numAtoms,
1120  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1121  else
1122  spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1123  (atoms, numAtoms,
1124  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1125  break;
1126 
1127  default:
1128  char str[128];
1129  sprintf(str, "spread_charge, order %d not implemented",order);
1130  cudaNAMD_bug(str);
1131  }
1132  cudaCheck(cudaGetLastError());
1133 
1134 }
1135 
1136 void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
1137  const int size1, const int size2, const int size3, const double kappa,
1138  const float recip1x, const float recip1y, const float recip1z,
1139  const float recip2x, const float recip2y, const float recip2z,
1140  const float recip3x, const float recip3y, const float recip3z,
1141  const double volume,
1142  const float* prefac1, const float* prefac2, const float* prefac3,
1143  const int k2_00, const int k3_00,
1144  const bool doEnergyVirial, double* energy, double* virial, float2* data,
1145  cudaStream_t stream) {
1146 
1147  int nthread = 1024;
1148  int nblock = 64;
1149 
1150  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1151  if (doEnergyVirial) {
1152  const int warpSize = 32;
1153  shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
1154  }
1155 
1156  float piv_inv = (float)(1.0/(M_PI*volume));
1157  float fac = (float)(M_PI*M_PI/(kappa*kappa));
1158 
1159  if (doEnergyVirial) {
1160  if (orderXYZ) {
1161  scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
1162  (nfft1, nfft2, nfft3, size1, size2, size3,
1163  recip1x, recip1y, recip1z,
1164  recip2x, recip2y, recip2z,
1165  recip3x, recip3y, recip3z,
1166  prefac1, prefac2, prefac3,
1167  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1168  } else {
1169  scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
1170  (nfft1, nfft2, nfft3, size1, size2, size3,
1171  recip1x, recip1y, recip1z,
1172  recip2x, recip2y, recip2z,
1173  recip3x, recip3y, recip3z,
1174  prefac1, prefac2, prefac3,
1175  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1176  }
1177  } else {
1178  if (orderXYZ) {
1179  scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
1180  (nfft1, nfft2, nfft3, size1, size2, size3,
1181  recip1x, recip1y, recip1z,
1182  recip2x, recip2y, recip2z,
1183  recip3x, recip3y, recip3z,
1184  prefac1, prefac2, prefac3,
1185  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1186  } else {
1187  scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
1188  (nfft1, nfft2, nfft3, size1, size2, size3,
1189  recip1x, recip1y, recip1z,
1190  recip2x, recip2y, recip2z,
1191  recip3x, recip3y, recip3z,
1192  prefac1, prefac2, prefac3,
1193  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1194  }
1195  }
1196  cudaCheck(cudaGetLastError());
1197 
1198 }
1199 
1200 void gather_force(const float4 *atoms, const int numAtoms,
1201  // const float recip11, const float recip22, const float recip33,
1202  const int nfftx, const int nffty, const int nfftz,
1203  const int xsize, const int ysize, const int zsize,
1204  const int xdim, const int y00, const int z00,
1205  const bool periodicY, const bool periodicZ,
1206  const float* data, const int order, float3* force,
1207  const cudaTextureObject_t gridTexObj,
1208  cudaStream_t stream) {
1209 
1210  dim3 nthread(32, 2, 1);
1211  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
1212  // dim3 nblock(npatch, 1, 1);
1213 
1214  switch(order) {
1215  case 4:
1216  if (periodicY && periodicZ)
1217  gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
1218  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1219  // recip11, recip22, recip33,
1220  data,
1221  gridTexObj,
1222  1, force);
1223  else if (periodicY)
1224  gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
1225  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1226  // recip11, recip22, recip33,
1227  data,
1228  gridTexObj,
1229  1, force);
1230  else if (periodicZ)
1231  gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
1232  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1233  // recip11, recip22, recip33,
1234  data,
1235  gridTexObj,
1236  1, force);
1237  else
1238  gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
1239  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1240  // recip11, recip22, recip33,
1241  data,
1242  gridTexObj,
1243  1, force);
1244  break;
1245 
1246  case 6:
1247  if (periodicY && periodicZ)
1248  gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
1249  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1250  // recip11, recip22, recip33,
1251  data,
1252  gridTexObj,
1253  1, force);
1254  else if (periodicY)
1255  gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
1256  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1257  // recip11, recip22, recip33,
1258  data,
1259  gridTexObj,
1260  1, force);
1261  else if (periodicZ)
1262  gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
1263  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1264  // recip11, recip22, recip33,
1265  data,
1266  gridTexObj,
1267  1, force);
1268  else
1269  gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
1270  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1271  // recip11, recip22, recip33,
1272  data,
1273  gridTexObj,
1274  1, force);
1275  break;
1276 
1277  case 8:
1278  if (periodicY && periodicZ)
1279  gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
1280  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1281  // recip11, recip22, recip33,
1282  data,
1283  gridTexObj,
1284  1, force);
1285  else if (periodicY)
1286  gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
1287  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1288  // recip11, recip22, recip33,
1289  data,
1290  gridTexObj,
1291  1, force);
1292  else if (periodicZ)
1293  gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
1294  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1295  // recip11, recip22, recip33,
1296  data,
1297  gridTexObj,
1298  1, force);
1299  else
1300  gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
1301  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1302  // recip11, recip22, recip33,
1303  data,
1304  gridTexObj,
1305  1, force);
1306  break;
1307 
1308  default:
1309  char str[128];
1310  sprintf(str, "gather_force, order %d not implemented",order);
1311  cudaNAMD_bug(str);
1312  }
1313  cudaCheck(cudaGetLastError());
1314 
1315 }
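//
// In the PME reciprocal-space path these host wrappers are used in sequence:
// spread_charge() accumulates the B-spline-interpolated charges onto the real-space
// grid, a forward 3D FFT (performed outside this file) takes the grid to reciprocal
// space, scalar_sum() applies the Ewald convolution and optionally accumulates the
// energy and virial, a backward FFT returns to real space, and gather_force()
// interpolates the per-atom forces from the resulting potential grid.
//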
1316 
1317 //
1318 // Transpose
1319 //
1320 void transpose_xyz_yzx(
1321  const int nx, const int ny, const int nz,
1322  const int xsize_in, const int ysize_in,
1323  const int ysize_out, const int zsize_out,
1324  const float2* data_in, float2* data_out, cudaStream_t stream) {
1325 
1326  dim3 numthread(TILEDIM, TILEROWS, 1);
1327  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
1328 
1329  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
1330  (nx, ny, nz, xsize_in, ysize_in,
1331  ysize_out, zsize_out,
1332  data_in, data_out);
1333 
1334  cudaCheck(cudaGetLastError());
1335 }
1336 
1337 //
1338 // Batched transpose
1339 //
1340 void batchTranspose_xyz_yzx(
1341  const int numBatches, TransposeBatch<float2>* batches,
1342  const int max_nx, const int ny, const int nz,
1343  const int xsize_in, const int ysize_in, cudaStream_t stream) {
1344 
1345  dim3 numthread(TILEDIM, TILEROWS, 1);
1346  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
1347 
1348  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
1349  (batches, ny, nz, xsize_in, ysize_in);
1350 
1351  cudaCheck(cudaGetLastError());
1352 }
1353 
1354 //
1355 // Transpose
1356 //
1357 void transpose_xyz_zxy(
1358  const int nx, const int ny, const int nz,
1359  const int xsize_in, const int ysize_in,
1360  const int zsize_out, const int xsize_out,
1361  const float2* data_in, float2* data_out, cudaStream_t stream) {
1362 
1363  dim3 numthread(TILEDIM, TILEROWS, 1);
1364  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
1365 
1366  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
1367  (nx, ny, nz, xsize_in, ysize_in,
1368  zsize_out, xsize_out,
1369  data_in, data_out);
1370 
1371  cudaCheck(cudaGetLastError());
1372 }
1373 
1374 //
1375 // Batched transpose
1376 //
1377 void batchTranspose_xyz_zxy(
1378  const int numBatches, TransposeBatch<float2>* batches,
1379  const int max_nx, const int ny, const int nz,
1380  const int xsize_in, const int ysize_in,
1381  cudaStream_t stream) {
1382 
1383  dim3 numthread(TILEDIM, TILEROWS, 1);
1384  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
1385 
1386  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
1387  (batches, ny, nz, xsize_in, ysize_in);
1388 
1389  cudaCheck(cudaGetLastError());
1390 }
1391 
1392 #endif // NAMD_CUDA