NAMD
ComputeNonbondedCUDAKernelBase.h
1 
2 #ifdef NAMD_CUDA
3 
4 #define NAME(X) SLOWNAME( X )
5 
6 #undef SLOW
7 #undef SLOWNAME
8 #ifdef DO_SLOW
9 #define SLOW(X) X
10 #define SLOWNAME(X) ENERGYNAME( X ## _slow )
11 #else
12 #define SLOW(X)
13 #define SLOWNAME(X) ENERGYNAME( X )
14 #endif
15 
16 #undef ENERGY
17 #undef ENERGYNAME
18 #ifdef DO_ENERGY
19 #define ENERGY(X) X
20 #define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )
21 #else
22 #define ENERGY(X)
23 #define ENERGYNAME(X) PAIRLISTNAME( X )
24 #endif
25 
26 #undef GENPAIRLIST
27 #undef USEPAIRLIST
28 #undef PAIRLISTNAME
29 #ifdef MAKE_PAIRLIST
30 #define GENPAIRLIST(X) X
31 #define USEPAIRLIST(X)
32 #define PAIRLISTNAME(X) LAST( X ## _pairlist )
33 #else
34 #define GENPAIRLIST(X)
35 #define USEPAIRLIST(X) X
36 #define PAIRLISTNAME(X) LAST( X )
37 #endif
38 
39 #define LAST(X) X
40 
41 #undef KEPLER_SHUFFLE
42 #ifdef __CUDA_ARCH__
43 #define KEPLER_SHUFFLE
44 #if __CUDA_ARCH__ < 300
45 #undef KEPLER_SHUFFLE
46 #endif
47 #endif
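// Warp shuffle intrinsics require compute capability >= 3.0 (Kepler);
// KEPLER_SHUFFLE selects between the shuffle-based and the shared-memory
// reduction paths below.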
48 
49 #undef REG_JFORCE
50 #ifdef KEPLER_SHUFFLE
51 #ifndef MAKE_PAIRLIST
52 #define REG_JFORCE
53 #endif
54 #endif
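// With REG_JFORCE the j-forces are kept in per-thread registers and rotated
// between lanes with warp shuffles instead of being accumulated in shared
// memory; it is only enabled in the force-only (non-pairlist) path.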
55 
56 #ifdef KEPLER_SHUFFLE
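//
// Butterfly (XOR) reduction of *reg across the 32 lanes of a warp;
// lane 0 then atomically adds the warp total into *smem.
//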
57 __device__ __forceinline__ static void NAME(shfl_reduction)(
58  float *reg,
59  float *smem
60  )
61 {
62  *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 16, 32);
63  *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 8, 32);
64  *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 4, 32);
65  *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 2, 32);
66  *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 1, 32);
67 
68  if ( threadIdx.x % 32 == 0 ) {
69  atomicAdd(smem,*reg);
70  }
71  return;
72 }
73 #endif /* KEPLER_SHUFFLE */
74 
75 __device__ __forceinline__
76 static void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind,
77  const atom* atoms,
78  volatile float* sh_buf,
79 #ifndef KEPLER_SHUFFLE
80  volatile float* sh_slow_buf, volatile float* sh_vcc,
81 #endif
82  float4* tmpforces, float4* slow_tmpforces,
83  float4* forces, float4* slow_forces,
84  float* tmpvirials, float* slow_tmpvirials,
85  float* virials, float* slow_virials);
86 
87 //
88 // Reduces up to three variables into global memory locations dst[0], dst[1], dst[2]
89 //
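// n is the compile-time number of variables to reduce (1-3) and sh_buf_size is
// the byte size of the caller's scratch buffer, checked by cuda_static_assert.
// Illustrative call (dst, e1, e2, e3 are placeholders):
//   NAME(reduceVariables)<float, 3, SH_BUF_SIZE>(sh_buf, dst, e1, e2, e3);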
90 template<typename T, int n, int sh_buf_size>
91 __device__ __forceinline__
92 void NAME(reduceVariables)(volatile T* sh_buf, T* dst, T val1, T val2, T val3) {
93  // Sanity check
94  cuda_static_assert(n > 0 && n <= NUM_WARP);
95 #ifdef KEPLER_SHUFFLE
96  // Requires NUM_WARP*n*sizeof(T) bytes of shared memory
97  cuda_static_assert(sh_buf_size >= NUM_WARP*n*sizeof(T));
98  // Reduce within warp
99  for (int i=WARPSIZE/2;i >= 1;i/=2) {
100  if (n >= 1) val1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val1, i, WARPSIZE);
101  if (n >= 2) val2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val2, i, WARPSIZE);
102  if (n >= 3) val3 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val3, i, WARPSIZE);
103  }
104  if (threadIdx.x == 0) {
105  if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
106  if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
107  if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
108  }
109  BLOCK_SYNC;
110  if (threadIdx.x < n && threadIdx.y == 0) {
111  T finalval = (T)0;
112 #pragma unroll
113  for (int i=0;i < NUM_WARP;++i) {
114  finalval += sh_buf[i*n + threadIdx.x];
115  }
116  atomicAdd(&dst[threadIdx.x], finalval);
117  }
118 #else // ! KEPLER_SHUFFLE
119  // Requires NUM_WARP*n*WARPSIZE*sizeof(T) bytes of shared memory
120  cuda_static_assert(sh_buf_size >= NUM_WARP*n*WARPSIZE*sizeof(T));
121  volatile T* sh_bufy = &sh_buf[threadIdx.y*n*WARPSIZE];
122  if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
123  if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
124  if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
125  // Reduce within warp
126  for (int d=1;d < WARPSIZE;d*=2) {
127  int pos = threadIdx.x + d;
128  T val1t, val2t, val3t;
129  if (n >= 1) val1t = (pos < WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
130  if (n >= 2) val2t = (pos < WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
131  if (n >= 3) val3t = (pos < WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
132  if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
133  if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
134  if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
135  }
136  BLOCK_SYNC;
137  if (threadIdx.x < n && threadIdx.y == 0) {
138  T finalval = (T)0;
139 #pragma unroll
140  for (int i=0;i < NUM_WARP;++i) {
141  finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
142  }
143  atomicAdd(&dst[threadIdx.x], finalval);
144  }
145 #endif // KEPLER_SHUFFLE
146 }
147 
148 //
149 // Called with a 2D thread block:
150 //   blockDim.x = WARPSIZE = 32
151 //   blockDim.y = NUM_WARP = 4
152 //
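// For illustration only: the kernel launch is issued from the host code and the
// names below (num_patch_pairs, stream) are hypothetical placeholders.
//   dim3 nthread(WARPSIZE, NUM_WARP, 1);       // 32 x 4 = 128 threads per block
//   dim3 nblock(num_patch_pairs, 1, 1);        // one block per patch pair in this launch
//   NAME(dev_nonbonded)<<<nblock, nthread, 0, stream>>>(/* ... */);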
153 __global__ static void
155 USEPAIRLIST(__launch_bounds__(NUM_WARP*WARPSIZE, 12) )
156 NAME(dev_nonbonded)
157  (const patch_pair* patch_pairs,
158  const atom *atoms, const atom_param *atom_params,
159  const int* vdw_types, unsigned int* plist,
160  float4 *tmpforces, float4 *slow_tmpforces,
161  float4 *forces, float4 *slow_forces,
162  float* tmpvirials, float* slow_tmpvirials,
163  float* virials, float* slow_virials,
164  unsigned int* global_counters, int* force_ready_queue,
165  const unsigned int *overflow_exclusions,
166  const int npatches,
167  const int block_begin, const int total_block_count, int* block_order,
168  exclmask* exclmasks, const int lj_table_size,
169  const float3 lata, const float3 latb, const float3 latc,
170  const float cutoff2, const float plcutoff2, const int doSlow) {
171 
172  // Local structure definitions
173  GENPAIRLIST(struct vdw_index {
174  int vdw_type;
175  int index;
176  };)
177 
178  // Shared memory
179  __shared__ patch_pair sh_patch_pair;
180 #ifndef REG_JFORCE
181  __shared__ float3 sh_jforce_2d[NUM_WARP][WARPSIZE];
182  SLOW(__shared__ float3 sh_jforce_slow_2d[NUM_WARP][WARPSIZE];)
183 #endif
184 #ifndef KEPLER_SHUFFLE
185  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
186 #endif
187  __shared__ float3 sh_iforcesum[SLOW(NUM_WARP+) NUM_WARP];
188 
189  ENERGY(
190  float totalev = 0.f;
191  float totalee = 0.f;
192  SLOW( float totales = 0.f; )
193  )
194 
195  GENPAIRLIST(int nexcluded=0;);
196 
197  {
198 #ifndef KEPLER_SHUFFLE
199  GENPAIRLIST(__shared__ atom_param sh_jap_2d[NUM_WARP][WARPSIZE];)
200  USEPAIRLIST(__shared__ int sh_jap_vdw_type_2d[NUM_WARP][WARPSIZE];)
201 #endif
202  USEPAIRLIST(__shared__ int sh_plist_ind[NUM_WARP];
203  __shared__ unsigned int sh_plist_val[NUM_WARP];);
204 
205  // Load patch_pair data into shared memory
206  {
207  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
208 
209  if (t < 3*(SLOW(NUM_WARP+) NUM_WARP)) {
210  float *p = (float *)sh_iforcesum;
211  p[threadIdx.x] = 0.0f;
212  }
213 
214  if (t < PATCH_PAIR_SIZE) {
215  int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
216  int* dst = (int *)&sh_patch_pair;
217  dst[t] = src[t];
218  }
219  // Need to sync here to make sure sh_patch_pair is ready
220  BLOCK_SYNC;
221 
222  // Initialize the pairlist index to an impossible value
223  USEPAIRLIST(if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
224 
225  // Initialize pair list to "no interactions"
226  GENPAIRLIST({
227  if (t < sh_patch_pair.plist_size)
228  plist[sh_patch_pair.plist_start + t] = 0;
229  })
230 
231  // Convert the scaled patch offset to a real-space offset using the current lattice vectors and write it back into shared memory
232  if (t == 0) {
233  float offx = sh_patch_pair.offset.x * lata.x
234  + sh_patch_pair.offset.y * latb.x
235  + sh_patch_pair.offset.z * latc.x;
236  float offy = sh_patch_pair.offset.x * lata.y
237  + sh_patch_pair.offset.y * latb.y
238  + sh_patch_pair.offset.z * latc.y;
239  float offz = sh_patch_pair.offset.x * lata.z
240  + sh_patch_pair.offset.y * latb.z
241  + sh_patch_pair.offset.z * latc.z;
242  sh_patch_pair.offset.x = offx;
243  sh_patch_pair.offset.y = offy;
244  sh_patch_pair.offset.z = offz;
245  }
246 
247  BLOCK_SYNC;
248  }
249 
250  // Compute pointers into shared memory to avoid pointer computation later on
251 #ifndef REG_JFORCE
252  volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
253  SLOW(volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
254 #endif
255 
256 #ifndef KEPLER_SHUFFLE
257  atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
258  GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
259  USEPAIRLIST(int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
260 #endif
261 
262  for (int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*NUM_WARP) {
263 
264  atom ipq;
265  GENPAIRLIST(vdw_index iap;);
266  USEPAIRLIST(int iap_vdw_type;);
267  // Load i atom data
268  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
269  int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
270  float4 tmpa = ((float4*)atoms)[i];
271  ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
272  ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
273  ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
274  ipq.charge = tmpa.w;
275  GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
276  iap.vdw_type = tmpap.x*lj_table_size;
277  iap.index = tmpap.y;);
278  USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
279  }
280 
281  // i-forces in registers
282  float3 iforce;
283  iforce.x = 0.0f;
284  iforce.y = 0.0f;
285  iforce.z = 0.0f;
286  SLOW(float3 iforce_slow;
287  iforce_slow.x = 0.0f;
288  iforce_slow.y = 0.0f;
289  iforce_slow.z = 0.0f;)
290 
291  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
292  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
293  int blockj = (diag_patch_pair) ? blocki : 0;
294  for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
295 
296  USEPAIRLIST({
297  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
298  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
299  int plist_ind = pos/32;
300  unsigned int plist_bit = 1 << (pos % 32);
301  // Check whether we need to load the next entry of the pairlist
302  if (plist_ind != sh_plist_ind[threadIdx.y]) {
303  sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
304  sh_plist_ind[threadIdx.y] = plist_ind;
305  }
306  if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
307  })
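 // The pairlist holds one bit per WARPSIZE x WARPSIZE tile of this patch pair
 // (32 tiles packed per unsigned int). A zero bit means no non-excluded atom
 // pair in the tile was within plcutoff when the list was built, so the whole
 // tile is skipped.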
308 
309  // Load j atom data
310 #ifdef KEPLER_SHUFFLE
311  atom jpq;
312  GENPAIRLIST(atom_param jap;);
313  USEPAIRLIST(int jap_vdw_type;);
314 #endif
315 
316  GENPAIRLIST(
317  // Skip block pairs in which all atoms of both blocks are fixed
318  if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
319  int nfreej = sh_patch_pair.patch2_free_size - blockj;
320  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
321  );
322 
323  //GENPAIRLIST(bool inside_plcutoff = false;)
324  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
325  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
326  float4 tmpa = ((float4*)atoms)[j];
327 #ifdef KEPLER_SHUFFLE
328  jpq.position.x = tmpa.x;
329  jpq.position.y = tmpa.y;
330  jpq.position.z = tmpa.z;
331  jpq.charge = tmpa.w;
332 #else
333  sh_jpq[threadIdx.x].position.x = tmpa.x;
334  sh_jpq[threadIdx.x].position.y = tmpa.y;
335  sh_jpq[threadIdx.x].position.z = tmpa.z;
336  sh_jpq[threadIdx.x].charge = tmpa.w;
337 #endif
338 
339 #ifdef KEPLER_SHUFFLE
340  GENPAIRLIST(jap = atom_params[j];)
341  USEPAIRLIST(jap_vdw_type = vdw_types[j];)
342 #else
343  GENPAIRLIST(sh_jap[threadIdx.x] = atom_params[j];)
344  USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
345 #endif
346  }
347 
348  // j-forces, in registers (REG_JFORCE) or in shared memory
349 #ifdef REG_JFORCE
350  float3 jforce;
351  jforce.x = 0.0f;
352  jforce.y = 0.0f;
353  jforce.z = 0.0f;
354  SLOW(float3 jforce_slow;
355  jforce_slow.x = 0.0f;
356  jforce_slow.y = 0.0f;
357  jforce_slow.z = 0.0f;
358  );
359 #else
360  sh_jforce[threadIdx.x].x = 0.0f;
361  sh_jforce[threadIdx.x].y = 0.0f;
362  sh_jforce[threadIdx.x].z = 0.0f;
363  SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
364  sh_jforce_slow[threadIdx.x].y = 0.0f;
365  sh_jforce_slow[threadIdx.x].z = 0.0f;)
366 #endif
367 
368  GENPAIRLIST(unsigned int excl = 0;)
369  USEPAIRLIST(
370  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
371  const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
372  unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
373  );
374  GENPAIRLIST(
375  int nloopi = sh_patch_pair.patch1_size - blocki;
376  if (nloopi > WARPSIZE) nloopi = WARPSIZE;
377  // NOTE: nfreei must be clamped to a non-negative value since it is compared against threadIdx.x (unsigned int) below
378  int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
379  )
380  const bool diag_tile = diag_patch_pair && (blocki == blockj);
381  // Loop through tile diagonals. Local tile indices are:
382  // i = threadIdx.x % WARPSIZE = constant
383  // j = (t + threadIdx.x) % WARPSIZE
384  const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
385  int t = (diag_tile) ? 1 : 0;
386  if (diag_tile) {
387  USEPAIRLIST(excl >>= 1;);
388 #ifdef KEPLER_SHUFFLE
389  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
390  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
391  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
392  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
393  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
394  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
395  );
396 #endif
397  }
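 // On a diagonal tile (blocki == blockj within the same patch) t starts at 1 so
 // the i == j self interaction is skipped, and modval = 2*WARPSIZE-1 keeps
 // j = t + threadIdx.x from wrapping around, so each unordered atom pair in the
 // tile is visited only once.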
398 
399  for (; t < WARPSIZE; ++t) {
400  USEPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl & 1)))
401  {
402  GENPAIRLIST(excl >>= 1;);
403  int j = (t + threadIdx.x) & modval;
404 #ifdef KEPLER_SHUFFLE
405  float tmpx = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.x, j, WARPSIZE) - ipq.position.x;
406  float tmpy = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.y, j, WARPSIZE) - ipq.position.y;
407  float tmpz = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.z, j, WARPSIZE) - ipq.position.z;
408  GENPAIRLIST(
409  int j_vdw_type = jap.vdw_type;
410  int j_index = jap.index;
411  int j_excl_maxdiff = jap.excl_maxdiff;
412  int j_excl_index = jap.excl_index;
413  );
414  float j_charge = jpq.charge;
415  USEPAIRLIST(
416  int j_vdw_type = jap_vdw_type;
417  );
418 #endif
419  GENPAIRLIST(if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
420  {
421 
422 #ifndef KEPLER_SHUFFLE
423  float tmpx = sh_jpq[j].position.x - ipq.position.x;
424  float tmpy = sh_jpq[j].position.y - ipq.position.y;
425  float tmpz = sh_jpq[j].position.z - ipq.position.z;
426  GENPAIRLIST(
427  int j_vdw_type = sh_jap[j].vdw_type;
428  int j_index = sh_jap[j].index;
429  int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
430  int j_excl_index = sh_jap[j].excl_index;
431  );
432  float j_charge = sh_jpq[j].charge;
433  USEPAIRLIST(
434  int j_vdw_type = sh_jap_vdw_type[j];
435  );
436 #endif
437  float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
438  GENPAIRLIST(if (r2 < plcutoff2))
439  USEPAIRLIST(if ((excl & 1) && r2 < cutoff2))
440  {
441  GENPAIRLIST(
442  bool excluded = false;
443  int indexdiff = (int)(iap.index) - j_index;
444  if ( abs(indexdiff) <= j_excl_maxdiff) {
445  indexdiff += j_excl_index;
446  int indexword = ((unsigned int) indexdiff) >> 5;
447  //indexword = tex1Dfetch(tex_exclusions, indexword);
448  if ( indexword < MAX_CONST_EXCLUSIONS )
449  indexword = const_exclusions[indexword];
450  else {
451  indexword = overflow_exclusions[indexword];
452  }
453  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
454  if (excluded) nexcluded++;
455  }
456  if (!excluded) excl |= 0x80000000;
457  )
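 // Exclusion test above: the per-atom exclusion bitmaps are indexed by the
 // difference of the two atom indices; bitmap words at small indices are read
 // from __constant__ memory (const_exclusions), the rest from the global
 // overflow_exclusions array. Bit 31 of excl is set for a non-excluded pair
 // inside plcutoff; together with the right shift at the top of the t loop this
 // builds the per-lane interaction mask that is stored in exclmasks[] below.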
458  GENPAIRLIST(if ( ! excluded && r2 < cutoff2))
459  {
460  ENERGY( float rsqrtfr2; );
461  float4 fi = tex1D(force_table, ENERGY(rsqrtfr2 =) rsqrtf(r2));
462  ENERGY( float4 ei = tex1D(energy_table, rsqrtfr2); );
463  GENPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap.vdw_type););
464  USEPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap_vdw_type););
465 
466  float f_slow = ipq.charge * j_charge;
467  float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
468  ENERGY(
469  float ev = ljab.x * ei.z + ljab.y * ei.y;
470  float ee = f_slow * ei.x;
471  SLOW( float es = f_slow * ei.w; )
472  )
473  SLOW( f_slow *= fi.w; )
474  ENERGY(
475  totalev += ev;
476  totalee += ee;
477  SLOW( totales += es; )
478  )
479  float fx = tmpx * f;
480  float fy = tmpy * f;
481  float fz = tmpz * f;
482  iforce.x += fx;
483  iforce.y += fy;
484  iforce.z += fz;
485 #ifdef REG_JFORCE
486  jforce.x -= fx;
487  jforce.y -= fy;
488  jforce.z -= fz;
489 #else
490  sh_jforce[j].x -= fx;
491  sh_jforce[j].y -= fy;
492  sh_jforce[j].z -= fz;
493 #endif
494  SLOW(
495  float fx_slow = tmpx * f_slow;
496  float fy_slow = tmpy * f_slow;
497  float fz_slow = tmpz * f_slow;
498  iforce_slow.x += fx_slow;
499  iforce_slow.y += fy_slow;
500  iforce_slow.z += fz_slow;
501  )
502 #ifdef REG_JFORCE
503  SLOW(
504  jforce_slow.x -= fx_slow;
505  jforce_slow.y -= fy_slow;
506  jforce_slow.z -= fz_slow;
507  )
508 #else
509  SLOW(
510  sh_jforce_slow[j].x -= fx_slow;
511  sh_jforce_slow[j].y -= fy_slow;
512  sh_jforce_slow[j].z -= fz_slow;
513  )
514 #endif
515  }
516  } // cutoff
517  } // if (j < nloopj...)
518 }
519  USEPAIRLIST(excl >>= 1;);
520 #ifdef KEPLER_SHUFFLE
521  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
522  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
523  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
524  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
525  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
526  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
527  );
528 #ifdef REG_JFORCE
529  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
530  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
531  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
532  SLOW(
533  jforce_slow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
534  jforce_slow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
535  jforce_slow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
536  );
537 #endif
538 #endif
539  } // t
540 
541  // Write j-forces
542  GENPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl != 0))) {
543  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
544  int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
545 #ifdef REG_JFORCE
546  atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
547  atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
548  atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
549  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
550  atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
551  atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
552 #else
553  atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
554  atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
555  atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
556  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
557  atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
558  atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
559 #endif
560  }
561 
562  GENPAIRLIST(
563  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
564  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
565  exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
566  if (threadIdx.x == 0) {
567  int plist_ind = pos/32;
568  unsigned int plist_bit = 1 << (pos % 32);
569  atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
570  }
571  );
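 // When building the pairlist, store this tile's interaction mask and set the
 // tile's bit in plist so that the force-only kernel can skip empty tiles.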
572  }
573 
574  } // for (blockj)
575 
576  // Write i-forces
577  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
578  int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
579  atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
580  atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
581  atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
582  SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
583  atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
584  atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
585  }
586  // Accumulate total forces for virial (warp synchronous)
587 #ifdef KEPLER_SHUFFLE
588  for (int i=WARPSIZE/2;i >= 1;i/=2) {
589  iforce.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.x, i, WARPSIZE);
590  iforce.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.y, i, WARPSIZE);
591  iforce.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.z, i, WARPSIZE);
592  SLOW(
593  iforce_slow.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.x, i, WARPSIZE);
594  iforce_slow.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.y, i, WARPSIZE);
595  iforce_slow.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.z, i, WARPSIZE);
596  );
597  }
598  if (threadIdx.x == 0) {
599  sh_iforcesum[threadIdx.y].x += iforce.x;
600  sh_iforcesum[threadIdx.y].y += iforce.y;
601  sh_iforcesum[threadIdx.y].z += iforce.z;
602  SLOW(
603  sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
604  sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
605  sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
606  );
607  }
608 #else
609  sh_jforce[threadIdx.x].x = iforce.x;
610  sh_jforce[threadIdx.x].y = iforce.y;
611  sh_jforce[threadIdx.x].z = iforce.z;
612  SLOW(
613  sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
614  sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
615  sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
616  );
617  for (int d=1;d < WARPSIZE;d*=2) {
618  int pos = threadIdx.x + d;
619  float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
620  float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
621  float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
622  SLOW(
623  float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
624  float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
625  float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
626  );
627  sh_jforce[threadIdx.x].x += valx;
628  sh_jforce[threadIdx.x].y += valy;
629  sh_jforce[threadIdx.x].z += valz;
630  SLOW(
631  sh_jforce_slow[threadIdx.x].x += slow_valx;
632  sh_jforce_slow[threadIdx.x].y += slow_valy;
633  sh_jforce_slow[threadIdx.x].z += slow_valz;
634  );
635  }
636  if (threadIdx.x == 0) {
637  sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
638  sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
639  sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
640  SLOW(
641  sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
642  sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
643  sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
644  );
645  }
646 #endif
647 
648  } // for (blocki)
649 
650  }
651 
652  {
653 
654 #ifdef REG_JFORCE
655 #undef SH_BUF_SIZE
656 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
657  __shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
658 #else // ! REG_JFORCE
659 #undef SH_BUF_SIZE
660 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
661  volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
662  // Sync here to make sure we can write into shared memory (sh_jforce_2d)
663  BLOCK_SYNC;
664 #endif
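// SH_BUF_SIZE is passed as the sh_buf_size template argument of
// NAME(reduceVariables) below so that its cuda_static_assert can check that
// sh_buf is large enough for the reduction.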
665 
666 #if ENERGY(1+)0
667  NAME(reduceVariables)<float, SLOW(1+)2, SH_BUF_SIZE>(sh_buf, &tmpvirials[sh_patch_pair.patch1_ind*16 + 9], totalev, totalee, SLOW(totales+)0.0f);
668 #endif
669 
670 #if GENPAIRLIST(1+)0
672  NAME(reduceVariables)<int, 1, SH_BUF_SIZE>((int *)sh_buf, (int *)&tmpvirials[sh_patch_pair.patch1_ind*16 + 12], nexcluded, 0, 0);
673 #endif
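// The reductions above add the per-thread energy sums into tmpvirials starting
// at patch1_ind*16 + 9 and, when building pairlists, the exclusion count into
// tmpvirials[patch1_ind*16 + 12].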
674 
675  // Virials
676  BLOCK_SYNC;
677  if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
678  float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
679  float iforcesum = 0.0f;
680 #pragma unroll
681  for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
682  float vx = iforcesum*sh_patch_pair.offset.x;
683  float vy = iforcesum*sh_patch_pair.offset.y;
684  float vz = iforcesum*sh_patch_pair.offset.z;
685  sh_iforcesum[threadIdx.x].x = vx;
686  sh_iforcesum[threadIdx.x].y = vy;
687  sh_iforcesum[threadIdx.x].z = vz;
688  }
689  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
690  // virials are in sh_virials[0...8] and slow virials in sh_virials[9...17]
691  float* sh_virials = (float *)sh_iforcesum;
692  int patch1_ind = sh_patch_pair.patch1_ind;
693  float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
694  atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
695  }
696 
697  // Make sure forces are up-to-date in device global memory
698  __threadfence();
699  BLOCK_SYNC;
700 
701  // Mark patch pair (patch1_ind, patch2_ind) as "done"
702  int patch1_ind = sh_patch_pair.patch1_ind;
703  int patch2_ind = sh_patch_pair.patch2_ind;
704  if (threadIdx.x == 0 && threadIdx.y == 0) {
705  sh_patch_pair.patch_done[0] = false;
706  sh_patch_pair.patch_done[1] = false;
707  //
708  // global_counters[0]: force_ready_queue
709  // global_counters[1]: block_order
710  // global_counters[2+i]: number of patch pairs finished for patch i (i = 0 ... npatches-1)
711  //
712  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
713  int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
714  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
715  if (patch1_ind != patch2_ind) {
716  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
717  int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
718  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
719  }
720  }
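 // atomicInc wraps the counter back to zero once it reaches num_pairs-1, so it
 // is automatically reset for the next evaluation; the block that observes
 // old+1 == num_pairs is the last one to finish that patch and performs the
 // final reduction below.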
721  // sync threads so that patch_done[0] and patch_done[1] are visible to all threads
722  BLOCK_SYNC;
723 
724  if (sh_patch_pair.patch_done[0]) {
725 
726 // #ifndef REG_JFORCE
727 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
728 // #endif
729 #ifndef KEPLER_SHUFFLE
730  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
731  volatile float* sh_slow_buf = NULL;
732  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
733 #endif
734  NAME(finish_forces_virials)(sh_patch_pair.patch1_start, sh_patch_pair.patch1_size,
735  patch1_ind, atoms, sh_buf,
736 #ifndef KEPLER_SHUFFLE
737  sh_slow_buf, sh_vcc,
738 #endif
739  tmpforces, slow_tmpforces, forces, slow_forces,
740  tmpvirials, slow_tmpvirials, virials, slow_virials);
741 
742  }
743 
744  if (sh_patch_pair.patch_done[1]) {
745 // #ifndef REG_JFORCE
746 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
747 // #endif
748 #ifndef KEPLER_SHUFFLE
749  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
750  volatile float* sh_slow_buf = NULL;
751  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
752 #endif
753  NAME(finish_forces_virials)(sh_patch_pair.patch2_start, sh_patch_pair.patch2_size,
754  patch2_ind, atoms, sh_buf,
755 #ifndef KEPLER_SHUFFLE
756  sh_slow_buf, sh_vcc,
757 #endif
758  tmpforces, slow_tmpforces, forces, slow_forces,
759  tmpvirials, slow_tmpvirials, virials, slow_virials);
760  }
761 
762  if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
763  // Make sure page-locked host forces are up-to-date
764 #if __CUDA_ARCH__ < 200
765  __threadfence();
766 #else
767  __threadfence_system();
768 #endif
769  BLOCK_SYNC;
770  // Add patch into "force_ready_queue"
771  if (threadIdx.x == 0 && threadIdx.y == 0) {
772  if (sh_patch_pair.patch_done[0]) {
773  int ind = atomicInc(&global_counters[0], npatches-1);
774  force_ready_queue[ind] = patch1_ind;
775  }
776  if (sh_patch_pair.patch_done[1]) {
777  int ind = atomicInc(&global_counters[0], npatches-1);
778  force_ready_queue[ind] = patch2_ind;
779  }
780  // Make sure "force_ready_queue" is visible in page-locked host memory
781 #if __CUDA_ARCH__ < 200
782  __threadfence();
783 #else
784  __threadfence_system();
785 #endif
786  }
787  }
788 
789  if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
790  int old = atomicInc(&global_counters[1], total_block_count-1);
791  block_order[old] = block_begin + blockIdx.x;
792  }
793  }
794 
795 }
796 
797 //
798 // Copy patch forces to their final resting place in page-locked host memory
799 // and reduce virials
800 //
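// Each patch owns a 16-float record in the (tmp)virials arrays: entries 0-8
// hold the 3x3 virial tensor, entries 9-11 the reduced energies, entry 12 the
// exclusion count; the remaining entries appear to be unused padding.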
801 __device__ __forceinline__
802 static void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind,
803  const atom* atoms,
804  volatile float* sh_buf,
805 #ifndef KEPLER_SHUFFLE
806  volatile float* sh_slow_buf, volatile float* sh_vcc,
807 #endif
808  float4* tmpforces, float4* slow_tmpforces,
809  float4* forces, float4* slow_forces,
810  float* tmpvirials, float* slow_tmpvirials,
811  float* virials, float* slow_virials) {
812  float vxx = 0.f;
813  float vxy = 0.f;
814  float vxz = 0.f;
815  float vyx = 0.f;
816  float vyy = 0.f;
817  float vyz = 0.f;
818  float vzx = 0.f;
819  float vzy = 0.f;
820  float vzz = 0.f;
821  SLOW(
822  float slow_vxx = 0.f;
823  float slow_vxy = 0.f;
824  float slow_vxz = 0.f;
825  float slow_vyx = 0.f;
826  float slow_vyy = 0.f;
827  float slow_vyz = 0.f;
828  float slow_vzx = 0.f;
829  float slow_vzy = 0.f;
830  float slow_vzz = 0.f;
831  )
832  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < size;i+=NUM_WARP*WARPSIZE) {
833  const int p = start+i;
834  float4 f = tmpforces[p];
835  forces[p] = f;
836  float4 pos = ((float4*)atoms)[p];
837  vxx += f.x * pos.x;
838  vxy += f.x * pos.y;
839  vxz += f.x * pos.z;
840  vyx += f.y * pos.x;
841  vyy += f.y * pos.y;
842  vyz += f.y * pos.z;
843  vzx += f.z * pos.x;
844  vzy += f.z * pos.y;
845  vzz += f.z * pos.z;
846  SLOW(
847  float4 slow_f = slow_tmpforces[p];
848  slow_forces[p] = slow_f;
849  slow_vxx += slow_f.x * pos.x;
850  slow_vxy += slow_f.x * pos.y;
851  slow_vxz += slow_f.x * pos.z;
852  slow_vyx += slow_f.y * pos.x;
853  slow_vyy += slow_f.y * pos.y;
854  slow_vyz += slow_f.y * pos.z;
855  slow_vzx += slow_f.z * pos.x;
856  slow_vzy += slow_f.z * pos.y;
857  slow_vzz += slow_f.z * pos.z;
858  )
859  }
860 #ifdef KEPLER_SHUFFLE
861  // Reduce within warps
862  for (int i=WARPSIZE/2;i >= 1;i/=2) {
863  vxx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxx, i, WARPSIZE);
864  vxy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxy, i, WARPSIZE);
865  vxz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxz, i, WARPSIZE);
866  vyx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyx, i, WARPSIZE);
867  vyy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyy, i, WARPSIZE);
868  vyz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyz, i, WARPSIZE);
869  vzx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzx, i, WARPSIZE);
870  vzy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzy, i, WARPSIZE);
871  vzz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzz, i, WARPSIZE);
872  SLOW(
873  slow_vxx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxx, i, WARPSIZE);
874  slow_vxy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxy, i, WARPSIZE);
875  slow_vxz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxz, i, WARPSIZE);
876  slow_vyx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyx, i, WARPSIZE);
877  slow_vyy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyy, i, WARPSIZE);
878  slow_vyz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyz, i, WARPSIZE);
879  slow_vzx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzx, i, WARPSIZE);
880  slow_vzy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzy, i, WARPSIZE);
881  slow_vzz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzz, i, WARPSIZE);
882  )
883  }
884  // Reduce between warps
885  // Requires NUM_WARP*(SLOW(9)+9)*sizeof(float) bytes of shared memory
886  if (threadIdx.x == 0) {
887  sh_buf[threadIdx.y*(SLOW(9)+9) + 0] = vxx;
888  sh_buf[threadIdx.y*(SLOW(9)+9) + 1] = vxy;
889  sh_buf[threadIdx.y*(SLOW(9)+9) + 2] = vxz;
890  sh_buf[threadIdx.y*(SLOW(9)+9) + 3] = vyx;
891  sh_buf[threadIdx.y*(SLOW(9)+9) + 4] = vyy;
892  sh_buf[threadIdx.y*(SLOW(9)+9) + 5] = vyz;
893  sh_buf[threadIdx.y*(SLOW(9)+9) + 6] = vzx;
894  sh_buf[threadIdx.y*(SLOW(9)+9) + 7] = vzy;
895  sh_buf[threadIdx.y*(SLOW(9)+9) + 8] = vzz;
896  SLOW(
897  sh_buf[threadIdx.y*(SLOW(9)+9) + 9] = slow_vxx;
898  sh_buf[threadIdx.y*(SLOW(9)+9) + 10] = slow_vxy;
899  sh_buf[threadIdx.y*(SLOW(9)+9) + 11] = slow_vxz;
900  sh_buf[threadIdx.y*(SLOW(9)+9) + 12] = slow_vyx;
901  sh_buf[threadIdx.y*(SLOW(9)+9) + 13] = slow_vyy;
902  sh_buf[threadIdx.y*(SLOW(9)+9) + 14] = slow_vyz;
903  sh_buf[threadIdx.y*(SLOW(9)+9) + 15] = slow_vzx;
904  sh_buf[threadIdx.y*(SLOW(9)+9) + 16] = slow_vzy;
905  sh_buf[threadIdx.y*(SLOW(9)+9) + 17] = slow_vzz;
906  )
907  }
908  BLOCK_SYNC;
909  // Write final virials into global memory
910  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
911  float v = 0.0f;
912 #pragma unroll
913  for (int i=0;i < NUM_WARP;i++) v += sh_buf[i*(SLOW(9)+9) + threadIdx.x];
914  float* dst = (threadIdx.x < 9) ? virials : slow_virials;
915  const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
916  int pos = patch_ind*16 + (threadIdx.x % 9);
917  dst[pos] = v + src[pos];
918  }
919 #else // ! KEPLER_SHUFFLE
920  // We have a total of NUM_WARP*WARPSIZE*3 floats; reduce them in sets of three
921  // (NOTE: we do have more shared memory available, so this could be optimized further
922  // for pre-Kepler architectures.)
923  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
924  volatile float* sh_v1 = &sh_buf[0];
925  volatile float* sh_v2 = &sh_buf[NUM_WARP*WARPSIZE];
926  volatile float* sh_v3 = &sh_buf[2*NUM_WARP*WARPSIZE];
927  SLOW(
928  volatile float* sh_slow_v1 = &sh_slow_buf[0];
929  volatile float* sh_slow_v2 = &sh_slow_buf[NUM_WARP*WARPSIZE];
930  volatile float* sh_slow_v3 = &sh_slow_buf[2*NUM_WARP*WARPSIZE];
931  )
932 
933  // vxx, vxy, vxz
934  sh_v1[t] = vxx;
935  sh_v2[t] = vxy;
936  sh_v3[t] = vxz;
937  SLOW(
938  sh_slow_v1[t] = slow_vxx;
939  sh_slow_v2[t] = slow_vxy;
940  sh_slow_v3[t] = slow_vxz;
941  )
942  for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
943  int pos = t + d;
944  float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
945  float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
946  float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
947  SLOW(
948  float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
949  float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
950  float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
951  )
952  BLOCK_SYNC;
953  sh_v1[t] += v1;
954  sh_v2[t] += v2;
955  sh_v3[t] += v3;
956  SLOW(
957  sh_slow_v1[t] += slow_v1;
958  sh_slow_v2[t] += slow_v2;
959  sh_slow_v3[t] += slow_v3;
960  )
961  BLOCK_SYNC;
962  }
963  if (threadIdx.x == 0 && threadIdx.y == 0) {
964  sh_vcc[0] = sh_v1[0];
965  sh_vcc[1] = sh_v2[0];
966  sh_vcc[2] = sh_v3[0];
967  SLOW(
968  sh_vcc[9+0] = sh_slow_v1[0];
969  sh_vcc[9+1] = sh_slow_v2[0];
970  sh_vcc[9+2] = sh_slow_v3[0];
971  )
972  }
973  // vyx, vyy, vyz
974  sh_v1[t] = vyx;
975  sh_v2[t] = vyy;
976  sh_v3[t] = vyz;
977  SLOW(
978  sh_slow_v1[t] = slow_vyx;
979  sh_slow_v2[t] = slow_vyy;
980  sh_slow_v3[t] = slow_vyz;
981  )
982  for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
983  int pos = t + d;
984  float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
985  float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
986  float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
987  SLOW(
988  float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
989  float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
990  float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
991  )
992  BLOCK_SYNC;
993  sh_v1[t] += v1;
994  sh_v2[t] += v2;
995  sh_v3[t] += v3;
996  SLOW(
997  sh_slow_v1[t] += slow_v1;
998  sh_slow_v2[t] += slow_v2;
999  sh_slow_v3[t] += slow_v3;
1000  )
1001  BLOCK_SYNC;
1002  }
1003  if (threadIdx.x == 0 && threadIdx.y == 0) {
1004  sh_vcc[3] = sh_v1[0];
1005  sh_vcc[4] = sh_v2[0];
1006  sh_vcc[5] = sh_v3[0];
1007  SLOW(
1008  sh_vcc[9+3] = sh_slow_v1[0];
1009  sh_vcc[9+4] = sh_slow_v2[0];
1010  sh_vcc[9+5] = sh_slow_v3[0];
1011  )
1012  }
1013  // vzx, vzy, vzz
1014  sh_v1[t] = vzx;
1015  sh_v2[t] = vzy;
1016  sh_v3[t] = vzz;
1017  SLOW(
1018  sh_slow_v1[t] = slow_vzx;
1019  sh_slow_v2[t] = slow_vzy;
1020  sh_slow_v3[t] = slow_vzz;
1021  )
1022  for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
1023  int pos = t + d;
1024  float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
1025  float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
1026  float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
1027  SLOW(
1028  float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
1029  float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
1030  float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
1031  )
1032  BLOCK_SYNC;
1033  sh_v1[t] += v1;
1034  sh_v2[t] += v2;
1035  sh_v3[t] += v3;
1036  SLOW(
1037  sh_slow_v1[t] += slow_v1;
1038  sh_slow_v2[t] += slow_v2;
1039  sh_slow_v3[t] += slow_v3;
1040  )
1041  BLOCK_SYNC;
1042  }
1043  if (threadIdx.x == 0 && threadIdx.y == 0) {
1044  sh_vcc[6] = sh_v1[0];
1045  sh_vcc[7] = sh_v2[0];
1046  sh_vcc[8] = sh_v3[0];
1047  SLOW(
1048  sh_vcc[9+6] = sh_slow_v1[0];
1049  sh_vcc[9+7] = sh_slow_v2[0];
1050  sh_vcc[9+8] = sh_slow_v3[0];
1051  )
1052  }
1053  // Write final virials and energies into global memory
1054  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
1055  float* dst = (threadIdx.x < 9) ? virials : slow_virials;
1056  const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
1057  int pos = patch_ind*16 + (threadIdx.x % 9);
1058  dst[pos] = sh_vcc[threadIdx.x] + src[pos];
1059  }
1060 #endif // KEPLER_SHUFFLE
1061  ENERGY(
1062  // Write final energies into global memory
1063  if (threadIdx.x < 3 && threadIdx.y == 0) {
1064  int pos = patch_ind*16 + 9 + threadIdx.x;
1065  virials[pos] = tmpvirials[pos];
1066  }
1067  );
1068  GENPAIRLIST(
1069  if (threadIdx.x == 0 && threadIdx.y == 0) {
1070  int pos = patch_ind*16 + 12;
1071  virials[pos] = tmpvirials[pos];
1072  }
1073  );
1074 }
1075 
1076 #endif // NAMD_CUDA
1077 