NAMD
Macros | Functions | Variables
ComputePmeCUDAKernel.cu File Reference
#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"
#include <cuda.h>

Go to the source code of this file.

Macros

#define warps_per_block   8
 
#define atoms_per_warp   4
 
#define atoms_per_block   (atoms_per_warp * warps_per_block)
 
#define BSPLINE_DEFS
 
#define WARP   (threadIdx.x>>5)
 
#define THREAD   (threadIdx.x&31)
 
#define FX   bspline_factors[threadIdx.x>>5][0]
 
#define FY   bspline_factors[threadIdx.x>>5][1]
 
#define FZ   bspline_factors[threadIdx.x>>5][2]
 
#define DFX   bspline_dfactors[threadIdx.x>>5][0]
 
#define DFY   bspline_dfactors[threadIdx.x>>5][1]
 
#define DFZ   bspline_dfactors[threadIdx.x>>5][2]
 
#define FX2   bspline_2factors[threadIdx.x>>5][0]
 
#define FY2   bspline_2factors[threadIdx.x>>5][1]
 
#define FZ2   bspline_2factors[threadIdx.x>>5][2]
 
#define ATOM   atoms[threadIdx.x>>5].a2d[i_atom]
 
#define AX   atoms[threadIdx.x>>5].a2d[i_atom][0]
 
#define AY   atoms[threadIdx.x>>5].a2d[i_atom][1]
 
#define AZ   atoms[threadIdx.x>>5].a2d[i_atom][2]
 
#define AQ   atoms[threadIdx.x>>5].a2d[i_atom][3]
 
#define AI   atoms[threadIdx.x>>5].a2d[i_atom][4]
 
#define AJ   atoms[threadIdx.x>>5].a2d[i_atom][5]
 
#define AK   atoms[threadIdx.x>>5].a2d[i_atom][6]
 
#define CALL(ORDER)
 
#define CALL(ORDER)
 
#define CALL(ORDER)
 

Functions

void NAMD_die (const char *)
 
void cuda_init_bspline_coeffs (float **c, float **dc, int order)
 
template<int order>
__global__ void cuda_pme_charges_batched_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float **__restrict__ a_data_ptr, int *n_atoms_ptr, int *K1_ptr, int *K2_ptr, int *K3_ptr)
 
template<int order>
__global__ void cuda_pme_charges_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float *__restrict__ a_data, int n_atoms, int K1, int K2, int K3)
 
template<int order>
__global__ void cuda_pme_forces_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, float *const *__restrict__ afn_g, int K1, int K2, int K3)
 
 if (!nblocks) return
 
 CALL (4)
 
else CALL (6)
 
else CALL (8)
 
else NAMD_die ("unsupported PMEInterpOrder")
 
 if ((!nblocksX)||(!numPatches)) return
 
 if ((!nblocks)||(!dimy)) return
 

Variables

static const float order_4_coeffs [4][4]
 
static const float order_6_coeffs [6][6]
 
static const float order_8_coeffs [8][8]
 
CUDA_PME_CHARGES_PROTOTYPE int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block
 
 CUDA_PME_CHARGES_BATCHED_PROTOTYPE
 
dim3 gridSize
 
gridSize x = nblocksX
 
gridSize y = numPatches
 
gridSize z = 1
 
 CUDA_PME_FORCES_PROTOTYPE
 

Macro Definition Documentation

#define AI   atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ   atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK   atoms[threadIdx.x>>5].a2d[i_atom][6]
#define AQ   atoms[threadIdx.x>>5].a2d[i_atom][3]
#define ATOM   atoms[threadIdx.x>>5].a2d[i_atom]
#define atoms_per_block   (atoms_per_warp * warps_per_block)
#define atoms_per_warp   4
#define AX   atoms[threadIdx.x>>5].a2d[i_atom][0]

Definition at line 115 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define AY   atoms[threadIdx.x>>5].a2d[i_atom][1]

Definition at line 116 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define AZ   atoms[threadIdx.x>>5].a2d[i_atom][2]

Definition at line 117 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define BSPLINE_DEFS
Value:
__shared__ union { \
float a2d[order][order]; \
float a1d[order*order]; \
} bspline_coeffs; \
__shared__ volatile union { \
float a2d[atoms_per_warp][7]; \
float a1d[atoms_per_warp*7]; \
} atoms[warps_per_block]; \
if ( threadIdx.x < order*order ) { \
bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
} \

Definition at line 86 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().
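
A rough sketch of the static shared-memory footprint this macro implies per thread block (illustrative only; it omits the bspline_factors / bspline_2factors arrays that each kernel declares in addition):

/* Hypothetical standalone helper; parameter names avoid the macros above. */
static size_t bsplineDefsSharedBytes(int interpOrder, int atomsPerWarp, int warpsPerBlock) {
  size_t coeffBytes = (size_t)interpOrder * interpOrder * sizeof(float);        /* bspline_coeffs union */
  size_t atomBytes  = (size_t)warpsPerBlock * atomsPerWarp * 7 * sizeof(float); /* atoms[] union */
  return coeffBytes + atomBytes;  /* e.g. order 8, 4 atoms/warp, 8 warps: 256 + 896 = 1152 bytes */
}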

#define CALL (   ORDER)
Value:
if ( order == ORDER ) \
cuda_pme_charges_dev<ORDER><<<nblocks,32*warps_per_block,0,stream>>>( \
coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)

Definition at line 451 of file ComputePmeCUDAKernel.cu.
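
Expanding the CALL(4) / else CALL(6) / else CALL(8) invocations listed under Functions, the charge-spreading dispatch reads roughly as follows (a paraphrase of the macro value above, not verbatim source):

if ( !nblocks ) return;
if ( order == 4 )
  cuda_pme_charges_dev<4><<<nblocks, 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
else if ( order == 6 )
  cuda_pme_charges_dev<6><<<nblocks, 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
else if ( order == 8 )
  cuda_pme_charges_dev<8><<<nblocks, 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
else NAMD_die("unsupported PMEInterpOrder");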

#define CALL (   ORDER)
Value:
if ( order == ORDER ) \
cuda_pme_charges_batched_dev<ORDER><<<gridSize,32*warps_per_block,0,stream>>>( \
coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)

Definition at line 451 of file ComputePmeCUDAKernel.cu.

#define CALL (   ORDER)
Value:
if ( order == ORDER ) \
cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),32*warps_per_block,0,stream>>>( \
coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)

Definition at line 451 of file ComputePmeCUDAKernel.cu.
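
With this third form, the force-interpolation dispatch expands roughly as follows (a paraphrase of the macro value above; note the eight-byte shared-memory bank configuration and the two-dimensional grid):

if ( (!nblocks) || (!dimy) ) return;
if ( order == 4 )
  cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<4>, cudaSharedMemBankSizeEightByte),
  cuda_pme_forces_dev<4><<<dim3(nblocks, dimy), 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, afn, K1, K2, K3);
else if ( order == 6 )
  cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<6>, cudaSharedMemBankSizeEightByte),
  cuda_pme_forces_dev<6><<<dim3(nblocks, dimy), 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, afn, K1, K2, K3);
else if ( order == 8 )
  cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<8>, cudaSharedMemBankSizeEightByte),
  cuda_pme_forces_dev<8><<<dim3(nblocks, dimy), 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, afn, K1, K2, K3);
else NAMD_die("unsupported PMEInterpOrder");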

#define DFX   bspline_dfactors[threadIdx.x>>5][0]

Definition at line 106 of file ComputePmeCUDAKernel.cu.

#define DFY   bspline_dfactors[threadIdx.x>>5][1]

Definition at line 107 of file ComputePmeCUDAKernel.cu.

#define DFZ   bspline_dfactors[threadIdx.x>>5][2]

Definition at line 108 of file ComputePmeCUDAKernel.cu.

#define FX   bspline_factors[threadIdx.x>>5][0]

Definition at line 103 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FX2   bspline_2factors[threadIdx.x>>5][0]

Definition at line 110 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define FY   bspline_factors[threadIdx.x>>5][1]

Definition at line 104 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FY2   bspline_2factors[threadIdx.x>>5][1]

Definition at line 111 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define FZ   bspline_factors[threadIdx.x>>5][2]

Definition at line 105 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FZ2   bspline_2factors[threadIdx.x>>5][2]

Definition at line 112 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define THREAD   (threadIdx.x&31)
#define WARP   (threadIdx.x>>5)
#define warps_per_block   8

Function Documentation

CALL ( 4 )
else CALL ( 6 )
else CALL ( 8 )
void cuda_init_bspline_coeffs ( float **  c,
float **  dc,
int  order 
)

Definition at line 43 of file ComputePmeCUDAKernel.cu.

References NAMD_die(), order, order_4_coeffs, order_6_coeffs, and order_8_coeffs.

Referenced by ComputePmeMgr::initialize_computes().

43  {
44  float *coeffs = new float[order*order];
45  float *dcoeffs = new float[order*order];
46  double divisor;
47  static const float *scoeffs;
48  switch ( order ) {
49  case 4:
50  scoeffs = &order_4_coeffs[0][0];
51  divisor = 6;
52  break;
53  case 6:
54  scoeffs = &order_6_coeffs[0][0];
55  divisor = 120;
56  break;
57  case 8:
58  scoeffs = &order_8_coeffs[0][0];
59  divisor = 5040;
60  break;
61  default:
62  NAMD_die("unsupported PMEInterpOrder");
63  }
64  double sum = 0;
65  for ( int i=0, p=order-1; i<order; ++i,--p ) {
66  for ( int j=0; j<order; ++j ) {
67  double c = scoeffs[i*order+(order-1-j)]; // reverse order
68  sum += c;
69  c /= divisor;
70  coeffs[i*order+j] = c;
71  dcoeffs[i*order+j] = (double)p * c;
72  // printf("%d %d %f %f\n", i, j, c, (double)p*c);
73  }
74  }
75  // printf("checksum: %f %f\n", sum, divisor);
76  if ( sum != divisor )
77  NAMD_die("cuda_init_bspline_coeffs static data checksum error");
78  cudaMalloc((void**) c, order*order*sizeof(float));
79  cudaMalloc((void**) dc, order*order*sizeof(float));
80  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
81  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
82  delete [] coeffs;
83  delete [] dcoeffs;
84 }
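
A minimal host-side usage sketch (illustrative only; the local pointer names are assumptions). The function allocates both device arrays itself, so the caller passes the addresses of unset device pointers and releases them with cudaFree when finished:

float *d_coeffs = 0, *d_dcoeffs = 0;
cuda_init_bspline_coeffs(&d_coeffs, &d_dcoeffs, 4);  // order must be 4, 6, or 8
// ... pass d_coeffs / d_dcoeffs to the PME kernels as their coeffs argument ...
cudaFree(d_coeffs);
cudaFree(d_dcoeffs);
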
template<int order>
__global__ void cuda_pme_charges_batched_dev ( const float *__restrict__  coeffs,
float *const *__restrict__  q_arr,
int *__restrict__  f_arr,
int *__restrict__  fz_arr,
float **__restrict__  a_data_ptr,
int *  n_atoms_ptr,
int *  K1_ptr,
int *  K2_ptr,
int *  K3_ptr 
)

Definition at line 126 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, BSPLINE_DEFS, FX, FY, FZ, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, and warps_per_block.

131  {
132 
133  int patchIndex = blockIdx.y;
134  int K1 = K1_ptr[patchIndex];
135  int K2 = K2_ptr[patchIndex];
136  int K3 = K3_ptr[patchIndex];
137  int n_atoms = n_atoms_ptr[patchIndex];
138 
139  BSPLINE_DEFS
140  __shared__ volatile float bspline_factors[warps_per_block][3][order];
141  int atoms_this_warp = atoms_per_warp;
142  {
143  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
144  if ( aidx + atoms_per_warp > n_atoms ) {
145  atoms_this_warp = n_atoms - aidx; // may be negative
146  if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
147  }
148  if ( THREAD < atoms_this_warp*7 ) {
149  float* a_data = a_data_ptr[patchIndex];
150  atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
151  }
152  }
153  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
154  WARP_SYNC(WARP_FULL_MASK); // done writing atoms, reading bspline_factors (FX,FY,FZ)
155  float aq=AQ;
156  int ai=(int)AI;
157  int aj=(int)AJ;
158  int ak=(int)AK;
159 
160  if ( THREAD < 3 * order ) {
161  const float af = ATOM[THREAD/order]; // x, y, or z
162  float f = bspline_coeffs.a2d[0][THREAD % order];
163  for ( int i=1; i<order; ++i ) {
164  f = af * f + bspline_coeffs.a2d[i][THREAD % order];
165  }
166  bspline_factors[WARP][THREAD/order][THREAD % order] = f;
167  }
168  WARP_SYNC(WARP_FULL_MASK); // done writing bspline_factors
169  for ( int i=THREAD; i < order*order; i+=32 ) {
170  int ti = i/order;
171  int tj = i%order;
172  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
173  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
174  f_arr[gti * K2 + gtj] = 1;
175  }
176  if ( THREAD < order ) {
177  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
178  gtk += THREAD; // padded to increase coalescing (maybe, but reduces re-use)
179 
180  fz_arr[gtk] = 1;
181  }
182 
183  for ( int i=THREAD; i < order*order*order; i+=32 ) {
184  int ti = i/(order*order);
185  int tj = (i/order)%order;
186  int tk = i%order;
187  float val = aq * FX[ti] * FY[tj] * FZ[tk];
188 
189  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
190  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
191  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
192  gtk += tk; // padded to increase coalescing (maybe, but reduces re-use)
193  float *dest = q_arr[gti * K2 + gtj]; // could load as constant
194  if ( dest ) atomicAdd(dest+gtk,val);
195  }
196  } // i_atom
197 }
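
Putting these pieces together, the batched launch set up under CUDA_PME_CHARGES_BATCHED_PROTOTYPE looks roughly like this (a sketch assembled from the gridSize variables and the batched CALL macro documented above, not verbatim source):

int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block;
if ( (!nblocksX) || (!numPatches) ) return;
dim3 gridSize;
gridSize.x = nblocksX;    // blocks of atoms, sized for the largest patch
gridSize.y = numPatches;  // one grid row per patch
gridSize.z = 1;
if ( order == 4 )
  cuda_pme_charges_batched_dev<4><<<gridSize, 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr);
else if ( order == 6 )
  cuda_pme_charges_batched_dev<6><<<gridSize, 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr);
else if ( order == 8 )
  cuda_pme_charges_batched_dev<8><<<gridSize, 32*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr);
else NAMD_die("unsupported PMEInterpOrder");
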
template<int order>
__global__ void cuda_pme_charges_dev ( const float *__restrict__  coeffs,
float *const *__restrict__  q_arr,
int *__restrict__  f_arr,
int *__restrict__  fz_arr,
float *__restrict__  a_data,
int  n_atoms,
int  K1,
int  K2,
int  K3 
)

Definition at line 201 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, AX, AY, AZ, BSPLINE_DEFS, FX, FY, FZ, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, and warps_per_block.

206  {
207  BSPLINE_DEFS
208  __shared__ volatile float bspline_factors[warps_per_block][3][order];
209  int atoms_this_warp = atoms_per_warp;
210  {
211  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
212  if ( aidx + atoms_per_warp > n_atoms ) {
213  atoms_this_warp = n_atoms - aidx; // may be negative
214  if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
215  }
216  if ( THREAD < atoms_this_warp*7 ) {
217  atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
218  }
219  }
220  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
221  WARP_SYNC(WARP_FULL_MASK); // done writing atoms, reading bspline_factors (FX,FY,FZ)
222  float aq=AQ;
223  int ai=(int)AI;
224  int aj=(int)AJ;
225  int ak=(int)AK;
226 #if 0
227 if ( THREAD == 0 && (
228  AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
229  AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
230  AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
231 ) {
232  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
233  printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
234  continue;
235 }
236 #endif
237  if ( THREAD < 3 * order ) {
238  const float af = ATOM[THREAD/order]; // x, y, or z
239  float f = bspline_coeffs.a2d[0][THREAD % order];
240  for ( int i=1; i<order; ++i ) {
241  f = af * f + bspline_coeffs.a2d[i][THREAD % order];
242  }
243  bspline_factors[WARP][THREAD/order][THREAD % order] = f;
244  }
245  WARP_SYNC(WARP_FULL_MASK); // done writing bspline_factors
246  for ( int i=THREAD; i < order*order; i+=32 ) {
247  int ti = i/order;
248  int tj = i%order;
249  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
250  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
251 #if 0
252 if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
253  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
254  printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
255 } else
256 #endif
257  f_arr[gti * K2 + gtj] = 1;
258  }
259  if ( THREAD < order ) {
260  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
261  gtk += THREAD; // padded to increase coalescing (maybe, but reduces re-use)
262 #if 0
263 if ( gtk < 0 || gtk >= K3+order-1 ) {
264  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
265  printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
266 } else
267 #endif
268  fz_arr[gtk] = 1;
269  }
270  for ( int i=THREAD; i < order*order*order; i+=32 ) {
271  int ti = i/(order*order);
272  int tj = (i/order)%order;
273  int tk = i%order;
274  float val = aq * FX[ti] * FY[tj] * FZ[tk];
275  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
276  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
277  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
278  gtk += tk; // padded to increase coalescing (maybe, but reduces re-use)
279  float *dest = q_arr[gti * K2 + gtj]; // could load as constant
280 #if 0
281 if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
282  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
283  printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
284 } else
285 #endif
286  if ( dest ) atomicAdd(dest+gtk,val);
287  }
288  } // i_atom
289 }
template<int order>
__global__ void cuda_pme_forces_dev ( const float *__restrict__  coeffs,
float *const *__restrict__  q_arr,
float *const *__restrict__  afn_g,
int  K1,
int  K2,
int  K3 
)

Definition at line 292 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, BSPLINE_DEFS, FX2, FY2, FZ2, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, warps_per_block, float2::x, and float2::y.

299  {
300  __shared__ float *afn_s[3];
301  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
302  BSPLINE_DEFS
303  __shared__ volatile union {
304  float a2d[atoms_per_warp][3];
305  float a1d[atoms_per_warp*3];
306  } force_output[warps_per_block];
307  __shared__ float2 bspline_2factors[warps_per_block][3][order];
308  __shared__ volatile union {
309  float a2d[32][3];
310  float a1d[32*3];
311  } force_reduction[warps_per_block];
312  int atoms_this_warp = atoms_per_warp;
313  {
314  const int n_atoms = afn_s[2] - afn_s[1];
315  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
316  if ( aidx + atoms_per_warp > n_atoms ) {
317  atoms_this_warp = n_atoms - aidx; // may be negative
318  if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
319  }
320  if ( THREAD < atoms_this_warp*7 ) {
321  atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
322  }
323  WARP_SYNC(WARP_FULL_MASK); // done writing atoms
324  }
325  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
326  float aq=AQ;
327  int ai=(int)AI;
328  int aj=(int)AJ;
329  int ak=(int)AK;
330  if ( THREAD < 3 * order ) {
331  const float af = ATOM[THREAD/order]; // x, y, or z
332  float df = 0.f;
333  float c = bspline_coeffs.a2d[0][THREAD % order];
334  float f = c;
335  for ( int i=1; i<order; ++i ) {
336  df = af * df - (order-i) * c;
337  c = bspline_coeffs.a2d[i][THREAD % order];
338  f = af * f + c;
339  }
340  bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
341  }
342  __threadfence_block(); // bspline_2factors not declared volatile
343  WARP_SYNC(WARP_FULL_MASK); // done writing bspline_2factors (FX2,FY2,FZ2)
344  float force_x = 0.f;
345  float force_y = 0.f;
346  float force_z = 0.f;
347 
348  for ( int i=THREAD; i < order*order*order; i+=32 ) {
349  int ti = i/(order*order);
350  int tj = (i/order)%order;
351  int tk = i%order;
352 
353  const float2 fx2 = FX2[ti];
354  const float2 fy2 = FY2[tj];
355  const float2 fz2 = FZ2[tk];
356 
357  const float fx = fx2.x;
358  const float fy = fy2.x;
359  const float fz = fz2.x;
360  const float dfx = fx2.y;
361  const float dfy = fy2.y;
362  const float dfz = fz2.y;
363 
364  float dx = K1 * aq * dfx * fy * fz;
365  float dy = K2 * aq * fx * dfy * fz;
366  float dz = K3 * aq * fx * fy * dfz;
367 
368  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
369  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
370  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
371 
372  gtk += tk; // padded to increase coalescing (maybe, but reduces re-use)
373  const float * __restrict__ src = q_arr[gti * K2 + gtj]; // could load as constant
374  float pot = src ? src[gtk] : __int_as_float(0x7fffffff); // CUDA NaN
375  force_x += dx * pot;
376  force_y += dy * pot;
377  force_z += dz * pot;
378  }
379  force_reduction[WARP].a2d[THREAD][0] = force_x;
380  force_reduction[WARP].a2d[THREAD][1] = force_y;
381  force_reduction[WARP].a2d[THREAD][2] = force_z;
382  WARP_SYNC(WARP_FULL_MASK); // done writing force_reduction
383  if ( THREAD < 24 ) {
384  force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
385  + force_reduction[WARP].a1d[THREAD + 48]
386  + force_reduction[WARP].a1d[THREAD + 72];
387  }
388  WARP_SYNC(WARP_FULL_MASK); // done writing force_reduction
389  if ( THREAD < 6 ) {
390  force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
391  + force_reduction[WARP].a1d[THREAD + 12]
392  + force_reduction[WARP].a1d[THREAD + 18];
393  }
394  WARP_SYNC(WARP_FULL_MASK); // done writing force_reduction
395  if ( THREAD < 3 ) {
396  force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
397  + force_reduction[WARP].a1d[THREAD + 3];
398  }
399  } // i_atom
400  WARP_SYNC(WARP_FULL_MASK); // done writing force_output
401  if ( THREAD < atoms_this_warp*3 ) {
402  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
403  afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
404  }
405 }
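
The afn_g argument is a flat array of device pointers, three per blockIdx.y slice. Reading the kernel body above, the layout appears to be the following (an inferred convention, not a documented contract):

// afn_g[3*b + 0] -> atom data for slice b: 7 floats per atom (x, y, z, q, i, j, k)
// afn_g[3*b + 1] -> force output for slice b: 3 floats per atom
// afn_g[3*b + 2] -> set by the host so that afn_g[3*b+2] - afn_g[3*b+1] equals the atom count of slice b
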
if ( !nblocks )
if ( (!nblocksX)||(!numPatches) )
if ( (!nblocks)||(!dimy) )
void NAMD_die ( const char *  )

Definition at line 83 of file common.C.

84 {
85  if ( ! err_msg ) err_msg = "(unknown error)";
86  CkPrintf("FATAL ERROR: %s\n", err_msg);
87  fflush(stdout);
88  char repstr[24] = "";
89  if (CmiNumPartitions() > 1) {
90  sprintf(repstr,"REPLICA %d ", CmiMyPartition());
91  }
92  CkError("%sFATAL ERROR: %s\n", repstr, err_msg);
93 #if CHARM_VERSION < 61000
94  CkExit();
95 #else
96  CkExit(1);
97 #endif
98 }
else NAMD_die ( "unsupported PMEInterpOrder"  )

Variable Documentation

CUDA_PME_CHARGES_BATCHED_PROTOTYPE
Initial value:
{
int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block

Definition at line 421 of file ComputePmeCUDAKernel.cu.

CUDA_PME_FORCES_PROTOTYPE
Initial value:
{

Definition at line 447 of file ComputePmeCUDAKernel.cu.

dim3 gridSize

Definition at line 424 of file ComputePmeCUDAKernel.cu.

CUDA_PME_CHARGES_PROTOTYPE int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block

Definition at line 408 of file ComputePmeCUDAKernel.cu.

const float order_4_coeffs[4][4]
static
Initial value:
=
{{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}}

Definition at line 25 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().
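
As a sanity check (illustrative only, not part of the file): cuda_init_bspline_coeffs() loads column order-1-j of this table, divided by 6, as the Horner coefficients of the j-th interpolation factor, so for j = 0 the factor evaluated by the kernels reduces to the first cubic B-spline piece, (1-t)^3/6:

/* Hypothetical helper mirroring the kernels' Horner loop for order 4, j = 0. */
static float bspline4_factor0(float t) {
  /* last column of order_4_coeffs divided by 6 (the loader reverses the column index) */
  const float c[4] = { -1.f/6.f, 3.f/6.f, -3.f/6.f, 1.f/6.f };
  float f = c[0];
  for (int i = 1; i < 4; ++i) f = t*f + c[i];
  return f;  /* equals (1-t)*(1-t)*(1-t)/6 */
}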

const float order_6_coeffs[6][6]
static
Initial value:
=
{{1, -5, 10, -10, 5, -1}, {0, 5, -20, 30, -20, 5}, {0, 10, -20, 0,
20, -10}, {0, 10, 20, -60, 20, 10}, {0, 5, 50, 0, -50, -5}, {0, 1,
26, 66, 26, 1}}

Definition at line 29 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().

const float order_8_coeffs[8][8]
static
Initial value:
=
{{1, -7, 21, -35, 35, -21, 7, -1}, {0, 7, -42, 105, -140, 105, -42,
7}, {0, 21, -84, 105, 0, -105, 84, -21}, {0, 35, 0, -315, 560, -315,
0, 35}, {0, 35, 280, -665, 0, 665, -280, -35}, {0, 21, 504,
315, -1680, 315, 504, 21}, {0, 7, 392, 1715,
0, -1715, -392, -7}, {0, 1, 120, 1191, 2416, 1191, 120, 1}}

Definition at line 35 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().

gridSize x = nblocksX

Definition at line 425 of file ComputePmeCUDAKernel.cu.

Referenced by AVector::AVector(), buildSortKeys(), calc_theta_dtheta(), PmeRealSpaceCompute::calcGridCoord(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), ComputeQMMgr::calcUSR(), eABF1D::calpmf(), GridforceFullSubGrid::compute_b(), compute_b_spline(), GridforceFullBaseGrid::compute_VdV(), BondElem::computeForce(), GromacsPairElem::computeForce(), convertForces< long long int >(), diheComp(), ComputeSphericalBC::doForce(), ComputeCylindricalBC::doForce(), HomePatch::doGroupSizeCheck(), ComputeFmmSerial::doWork(), ComputeMsmSerial::doWork(), ComputeFullDirect::doWork(), ComputeExt::doWork(), ComputeMsm::doWork(), ComputeGBISser::doWork(), ComputeGlobal::doWork(), ComputeEwald::doWork(), ComputeQM::doWork(), dumpbench(), PmeAtomFiler::fileAtoms(), gather_force(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), PmePencilXY::initBlockSizes(), PmePencilX::initBlockSizes(), ComputePmeMgr::initialize(), CudaPmePencilXY::initializeDevice(), CudaPmePencilX::initializeDevice(), minHeap::insert(), maxHeap::insert(), MatrixFitRMS(), Controller::minimize(), Node::Node(), obj_transabout(), obj_transvec(), obj_transvecinv(), pe_sortop_coord_x::operator()(), ComputeQM::processFullQM(), ComputeQMMgr::procQMRes(), ComputePmeMgr::procUntrans(), HashPool< T >::push_back(), Parameters::read_energy_type_cubspline(), eABF1D::readfile(), recursive_bisect_coord(), ComputePmeMgr::recvArrays(), ComputeExtMgr::recvCoord(), ComputePmeCUDAMgr::recvPencils(), ComputeGlobal::recvResults(), Matrix4::scale(), scale_coordinates(), ComputeNonbondedUtil::select(), ComputePmeMgr::sendTransSubset(), ComputePmeMgr::sendUntransSubset(), AVector::Set(), PDB::set_all_positions(), settle1(), spread_charge_kernel(), storeForces(), Tcl_loadCoords(), ProxyCombinedResultMsg::toRaw(), Matrix4Symmetry::translate(), Matrix4TMD::translate(), Matrix4::translate(), ParallelIOMgr::updateMolInfo(), void(), writeComplexToDisk(), writeHostComplexToDisk(), and writeResult().

gridSize y = numPatches

Definition at line 426 of file ComputePmeCUDAKernel.cu.

Referenced by AVector::AVector(), buildSortKeys(), calc_theta_dtheta(), PmeRealSpaceCompute::calcGridCoord(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), ComputeQMMgr::calcUSR(), eABF1D::calpmf(), GridforceFullSubGrid::compute_b(), compute_b_spline(), GridforceFullBaseGrid::compute_VdV(), convertForces< long long int >(), HomePatch::doGroupSizeCheck(), dumpbench(), PmeAtomFiler::fileAtoms(), gather_force(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), PmePencilY::initBlockSizes(), ComputePmeCUDADevice::initialize(), ComputePmeMgr::initialize(), CudaPmePencilY::initializeDevice(), ComputePmeCUDADevice::initializePatches(), mollify(), Node::Node(), obj_transabout(), obj_transvec(), obj_transvecinv(), pe_sortop_coord_y::operator()(), ComputeQMMgr::procQMRes(), eABF1D::readfile(), recursive_bisect_coord(), ComputePmeMgr::recvArrays(), ComputePmeCUDADevice::recvAtoms(), ComputePmeCUDADevice::recvAtomsFromNeighbor(), ComputeExtMgr::recvCoord(), ComputePmeCUDADevice::recvForcesFromNeighbor(), ComputePmeCUDAMgr::recvPencils(), Matrix4::scale(), scale_coordinates(), ComputePmeCUDADevice::sendAtomsToNeighbors(), ComputePmeCUDADevice::sendForcesToNeighbors(), AVector::Set(), PDB::set_all_positions(), spread_charge_kernel(), storeForces(), Tcl_loadCoords(), ProxyCombinedResultMsg::toRaw(), Matrix4Symmetry::translate(), Matrix4TMD::translate(), Matrix4::translate(), ParallelIOMgr::updateMolInfo(), void(), writeComplexToDisk(), writeHostComplexToDisk(), and writeResult().

gridSize z = 1

Definition at line 427 of file ComputePmeCUDAKernel.cu.

Referenced by AVector::AVector(), calc_theta_dtheta(), PmeRealSpaceCompute::calcGridCoord(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), ComputeQMMgr::calcUSR(), GridforceFullSubGrid::compute_b(), compute_b_spline(), GridforceFullBaseGrid::compute_VdV(), convertForces< long long int >(), HomePatch::doGroupSizeCheck(), dumpbench(), PmeAtomFiler::fileAtoms(), gather_force(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), Molecule::get_go_force2(), Molecule::get_gro_force2(), HomePatch::hardWallDrude(), PmePencilZ::initBlockSizes(), ComputePmeCUDADevice::initialize(), ComputePmeMgr::initialize(), CudaPmePencilZ::initializeDevice(), ComputePmeCUDADevice::initializePatches(), Node::Node(), obj_transabout(), obj_transvec(), obj_transvecinv(), ComputeQMMgr::procQMRes(), HomePatch::rattle1old(), ComputePmeMgr::recvArrays(), ComputePmeCUDADevice::recvAtoms(), ComputePmeCUDADevice::recvAtomsFromNeighbor(), ComputeExtMgr::recvCoord(), ComputePmeCUDADevice::recvForcesFromNeighbor(), ComputePmeCUDAMgr::recvPencils(), Matrix4::scale(), scale_coordinates(), ComputePmeCUDADevice::sendAtomsToNeighbors(), ComputePmeCUDADevice::sendForcesToNeighbors(), AVector::Set(), PDB::set_all_positions(), spread_charge_kernel(), storeForces(), Sequencer::submitHalfstep(), Sequencer::submitReductions(), Random::sum_of_squared_gaussians(), Tcl_loadCoords(), ProxyCombinedResultMsg::toRaw(), Matrix4Symmetry::translate(), Matrix4TMD::translate(), Matrix4::translate(), ParallelIOMgr::updateMolInfo(), void(), and writeResult().