NAMD
ComputePmeCUDAKernel.cu
#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"

void NAMD_die(const char *);

#ifdef NAMD_CUDA

#include <cuda.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
// allow linking, must prevent execution elsewhere
__device__ void atomicAdd(float *, float) { }
#endif

#if 0
#include <stdio.h>
#endif

// must run with at least order**2 threads
#define warps_per_block 8
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)


static const float order_4_coeffs[4][4] =
  {{1, -3, 3, -1},
   {0, 3, -6, 3},
   {0, 3, 0, -3},
   {0, 1, 4, 1}};
// divide by 3! = 6

static const float order_6_coeffs[6][6] =
  {{1, -5, 10, -10, 5, -1},
   {0, 5, -20, 30, -20, 5},
   {0, 10, -20, 0, 20, -10},
   {0, 10, 20, -60, 20, 10},
   {0, 5, 50, 0, -50, -5},
   {0, 1, 26, 66, 26, 1}};
// divide by 5! = 120

static const float order_8_coeffs[8][8] =
  {{1, -7, 21, -35, 35, -21, 7, -1},
   {0, 7, -42, 105, -140, 105, -42, 7},
   {0, 21, -84, 105, 0, -105, 84, -21},
   {0, 35, 0, -315, 560, -315, 0, 35},
   {0, 35, 280, -665, 0, 665, -280, -35},
   {0, 21, 504, 315, -1680, 315, 504, 21},
   {0, 7, 392, 1715, 0, -1715, -392, -7},
   {0, 1, 120, 1191, 2416, 1191, 120, 1}};
// divide by 7! = 5040
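
// Illustrative sanity check (not part of NAMD; the helper name below is
// hypothetical): with the same column reversal and 1/3! scaling applied in
// cuda_init_bspline_coeffs, the four order-4 weights produced by the Horner
// evaluation used in the kernels below sum to 1 for any fractional offset
// af in [0,1) (B-spline partition of unity).
static float order4_weight_sum(float af) {
  float sum = 0.f;
  for ( int j=0; j<4; ++j ) {
    float f = order_4_coeffs[0][3-j] / 6.f;         // reversed column, scaled
    for ( int i=1; i<4; ++i ) {
      f = af * f + order_4_coeffs[i][3-j] / 6.f;    // Horner step
    }
    sum += f;
  }
  return sum;  // expected to be 1 up to rounding
}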

void cuda_init_bspline_coeffs(float **c, float **dc, int order) {
  float *coeffs = new float[order*order];
  float *dcoeffs = new float[order*order];
  double divisor;
  static const float *scoeffs;
  switch ( order ) {
  case 4:
    scoeffs = &order_4_coeffs[0][0];
    divisor = 6;
    break;
  case 6:
    scoeffs = &order_6_coeffs[0][0];
    divisor = 120;
    break;
  case 8:
    scoeffs = &order_8_coeffs[0][0];
    divisor = 5040;
    break;
  default:
    NAMD_die("unsupported PMEInterpOrder");
  }
  double sum = 0;
  for ( int i=0, p=order-1; i<order; ++i,--p ) {
    for ( int j=0; j<order; ++j ) {
      double c = scoeffs[i*order+(order-1-j)];  // reverse order
      sum += c;
      c /= divisor;
      coeffs[i*order+j] = c;
      dcoeffs[i*order+j] = (double)p * c;
      // printf("%d %d %f %f\n", i, j, c, (double)p*c);
    }
  }
  // printf("checksum: %f %f\n", sum, divisor);
  if ( sum != divisor )
    NAMD_die("cuda_init_bspline_coeffs static data checksum error");
  cudaMalloc((void**) c, order*order*sizeof(float));
  cudaMalloc((void**) dc, order*order*sizeof(float));
  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  delete [] coeffs;
  delete [] dcoeffs;
}
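
// Illustrative usage sketch (not part of NAMD; the function name is
// hypothetical): a host-side caller obtains the device-resident coefficient
// and derivative-coefficient tables and owns the returned allocations.
static void example_init_order4_tables() {
  float *d_coeffs = 0, *d_dcoeffs = 0;
  cuda_init_bspline_coeffs(&d_coeffs, &d_dcoeffs, 4);
  // ... pass d_coeffs (and d_dcoeffs where derivatives are needed) to kernels ...
  cudaFree(d_coeffs);
  cudaFree(d_dcoeffs);
}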

#define BSPLINE_DEFS \
  __shared__ union { \
    float a2d[order][order]; \
    float a1d[order*order]; \
  } bspline_coeffs; \
  __shared__ volatile union { \
    float a2d[atoms_per_warp][7]; \
    float a1d[atoms_per_warp*7]; \
  } atoms[warps_per_block]; \
  if ( threadIdx.x < order*order ) { \
    bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
  } \
  BLOCK_SYNC;

// simplify warp-synchronous operations
#define WARP (threadIdx.x>>5)
#define THREAD (threadIdx.x&31)
#define FX bspline_factors[threadIdx.x>>5][0]
#define FY bspline_factors[threadIdx.x>>5][1]
#define FZ bspline_factors[threadIdx.x>>5][2]
#define DFX bspline_dfactors[threadIdx.x>>5][0]
#define DFY bspline_dfactors[threadIdx.x>>5][1]
#define DFZ bspline_dfactors[threadIdx.x>>5][2]

#define FX2 bspline_2factors[threadIdx.x>>5][0]
#define FY2 bspline_2factors[threadIdx.x>>5][1]
#define FZ2 bspline_2factors[threadIdx.x>>5][2]

#define ATOM atoms[threadIdx.x>>5].a2d[i_atom]
#define AX atoms[threadIdx.x>>5].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>5].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>5].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>5].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>5].a2d[i_atom][6]
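
// Per-atom staging layout (7 floats per atom): fractional grid offsets
// (AX, AY, AZ), charge (AQ), and starting grid indices (AI, AJ, AK) stored
// as floats and cast to int in the kernels.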


template <int order>
__global__ void cuda_pme_charges_batched_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr, \
  float ** __restrict__ a_data_ptr, int* n_atoms_ptr, \
  int* K1_ptr, int* K2_ptr, int* K3_ptr
) {

  int patchIndex = blockIdx.y;
  int K1 = K1_ptr[patchIndex];
  int K2 = K2_ptr[patchIndex];
  int K3 = K3_ptr[patchIndex];
  int n_atoms = n_atoms_ptr[patchIndex];

  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      float* a_data = a_data_ptr[patchIndex];
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
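
    // 3*order threads evaluate the order B-spline weights for x, y, and z
    // by Horner's rule on the shared coefficient table.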
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
    for ( int i=THREAD; i < order*order; i+=32 ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)

      fz_arr[gtk] = 1;
    }

    for ( int i=THREAD; i < order*order*order; i+=32 ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];

      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}

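
// Single-patch variant of the same charge-spreading loop as the batched kernel
// above; the #if 0 blocks are optional bounds-checking diagnostics.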
template <int order>
__global__ void cuda_pme_charges_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr, \
  float * __restrict__ a_data, int n_atoms, \
  int K1, int K2, int K3
) {
  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
#if 0
if ( THREAD == 0 && (
  AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
  AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
  AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
  continue;
}
#endif
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
    for ( int i=THREAD; i < order*order; i+=32 ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
#if 0
if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
#if 0
if ( gtk < 0 || gtk >= K3+order-1 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      fz_arr[gtk] = 1;
    }
    for ( int i=THREAD; i < order*order*order; i+=32 ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
#if 0
if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
} else
#endif
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}

template <int order>
__global__ void cuda_pme_forces_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, \
  float * const * __restrict__ afn_g, \
  /* float * __restrict__ a_data,
     float * __restrict__ f_data, int n_atoms, */
  int K1, int K2, int K3
) {
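  // afn_g supplies three pointers per patch (blockIdx.y): atom data (7 floats
  // per atom), the force output array (3 floats per atom), and a third pointer
  // whose offset from the force output encodes the atom count (see n_atoms below).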
  __shared__ float *afn_s[3];
  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
  BSPLINE_DEFS
  __shared__ volatile union {
    float a2d[atoms_per_warp][3];
    float a1d[atoms_per_warp*3];
  } force_output[warps_per_block];
  __shared__ float2 bspline_2factors[warps_per_block][3][order];
  __shared__ volatile union {
    float a2d[32][3];
    float a1d[32*3];
  } force_reduction[warps_per_block];
  int atoms_this_warp = atoms_per_warp;
  {
    const int n_atoms = afn_s[2] - afn_s[1];
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float df = 0.f;
      float c = bspline_coeffs.a2d[0][THREAD % order];
      float f = c;
      for ( int i=1; i<order; ++i ) {
        df = af * df - (order-i) * c;
        c = bspline_coeffs.a2d[i][THREAD % order];
        f = af * f + c;
      }
      bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
    }
    __threadfence_block();  // bspline_2factors not declared volatile
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_2factors (FX2,FY2,FZ2)
    float force_x = 0.f;
    float force_y = 0.f;
    float force_z = 0.f;

    for ( int i=THREAD; i < order*order*order; i+=32 ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;

      const float2 fx2 = FX2[ti];
      const float2 fy2 = FY2[tj];
      const float2 fz2 = FZ2[tk];

      const float fx = fx2.x;
      const float fy = fy2.x;
      const float fz = fz2.x;
      const float dfx = fx2.y;
      const float dfy = fy2.y;
      const float dfz = fz2.y;

      float dx = K1 * aq * dfx * fy * fz;
      float dy = K2 * aq * fx * dfy * fz;
      float dz = K3 * aq * fx * fy * dfz;

      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;

      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      const float * __restrict__ src = q_arr[gti * K2 + gtj];  // could load as constant
      float pot = src ? src[gtk] : __int_as_float(0x7fffffff);  // CUDA NaN
      force_x += dx * pot;
      force_y += dy * pot;
      force_z += dz * pot;
    }
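    // Warp-synchronous tree reduction of the 32 per-thread partial forces
    // (3 components each, 96 floats): 96 -> 24 -> 6 -> 3.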
    force_reduction[WARP].a2d[THREAD][0] = force_x;
    force_reduction[WARP].a2d[THREAD][1] = force_y;
    force_reduction[WARP].a2d[THREAD][2] = force_z;
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 24 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
                                         + force_reduction[WARP].a1d[THREAD + 48]
                                         + force_reduction[WARP].a1d[THREAD + 72];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 6 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
                                         + force_reduction[WARP].a1d[THREAD + 12]
                                         + force_reduction[WARP].a1d[THREAD + 18];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 3 ) {
      force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
                                             + force_reduction[WARP].a1d[THREAD + 3];
    }
  } // i_atom
  WARP_SYNC(WARP_FULL_MASK);  // done writing force_output
  if ( THREAD < atoms_this_warp*3 ) {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
  }
}

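// Host-side entry points: each takes its parameter list from the corresponding
// CUDA_PME_*_PROTOTYPE macro in ComputePmeCUDAKernel.h and dispatches on the
// runtime interpolation order to the matching template instantiation.  CALL(N)
// expands to an if-statement, so CALL(4); else CALL(6); ... forms a single
// if / else-if cascade.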
CUDA_PME_CHARGES_PROTOTYPE {
  int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( ! nblocks ) return;

#define CALL(ORDER) if ( order == ORDER ) \
                      cuda_pme_charges_dev<ORDER><<<nblocks,32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

CUDA_PME_CHARGES_BATCHED_PROTOTYPE {
  int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocksX) || (! numPatches) ) return;
  dim3 gridSize;
  gridSize.x = nblocksX;
  gridSize.y = numPatches;
  gridSize.z = 1;
#define CALL(ORDER) if ( order == ORDER ) \
                      cuda_pme_charges_batched_dev<ORDER><<<gridSize,32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

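// cudaFuncSetSharedMemConfig first appeared in CUDA 4.2; map it to a no-op on
// older toolkits so the forces launcher below still compiles.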
#ifdef CUDA_VERSION
#if CUDA_VERSION < 4020
void dummy() { }
#define cudaFuncSetSharedMemConfig(X,Y) dummy()
#endif
#endif


CUDA_PME_FORCES_PROTOTYPE {
  int nblocks = (maxn + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocks) || (! dimy) ) return;

#define CALL(ORDER) if ( order == ORDER ) \
                      cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
                      cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

#endif // NAMD_CUDA