4 #define NAME(X) SLOWNAME( X )
10 #define SLOWNAME(X) ENERGYNAME( X ## _slow )
13 #define SLOWNAME(X) ENERGYNAME( X )
20 #define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )
23 #define ENERGYNAME(X) PAIRLISTNAME( X )
30 #define GENPAIRLIST(X) X
31 #define USEPAIRLIST(X)
32 #define PAIRLISTNAME(X) LAST( X ## _pairlist )
34 #define GENPAIRLIST(X)
35 #define USEPAIRLIST(X) X
36 #define PAIRLISTNAME(X) LAST( X )
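// Editorial note: these option macros compose a function name one suffix at a
// time. For a hypothetical base name foo, with slow forces, energy
// accumulation and pairlist generation all enabled,
//   NAME(foo) -> SLOWNAME(foo) -> ENERGYNAME(foo_slow)
//             -> PAIRLISTNAME(foo_slow_energy) -> LAST(foo_slow_energy_pairlist)
// i.e. foo_slow_energy_pairlist. When a feature is disabled, the matching
// pass-through definition above simply drops that suffix.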
43 #define KEPLER_SHUFFLE
44 #if __CUDA_ARCH__ < 300
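// Warp shuffle instructions require compute capability 3.0 (Kepler); on older
// architectures the shared-memory fallback paths guarded by
// "#ifndef KEPLER_SHUFFLE" below are compiled instead.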
57 __device__ __forceinline__ static void NAME(shfl_reduction)(
68 if ( threadIdx.x % 32 == 0 ) {
75 __device__ __forceinline__
78 volatile float* sh_buf,
79 #ifndef KEPLER_SHUFFLE
80 volatile float* sh_slow_buf, volatile float* sh_vcc,
90 template <typename T, int n, int sh_buf_size>
91 __device__ __forceinline__
104 if (threadIdx.x == 0) {
105 if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
106 if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
107 if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
110 if (threadIdx.x < n && threadIdx.y == 0) {
114 finalval += sh_buf[i*n + threadIdx.x];
116 atomicAdd(&dst[threadIdx.x], finalval);
118 #else // ! KEPLER_SHUFFLE
121 volatile T* sh_bufy = &sh_buf[threadIdx.y*n*WARPSIZE];
122 if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
123 if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
124 if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
127 int pos = threadIdx.x + d;
128 T val1t, val2t, val3t;
129 if (n >= 1) val1t = (pos < WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
130 if (n >= 2) val2t = (pos < WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
131 if (n >= 3) val3t = (pos < WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
132 if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
133 if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
134 if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
137 if (threadIdx.x < n && threadIdx.y == 0) {
141 finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
143 atomicAdd(&dst[threadIdx.x], finalval);
145 #endif // KEPLER_SHUFFLE
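// ----------------------------------------------------------------------------
// Editorial sketch, not part of the original source: the warp-level reduction
// pattern that NAME(shfl_reduction) and the KEPLER_SHUFFLE branch of
// NAME(reduceVariables) rely on, written here with the newer *_sync shuffle
// intrinsics. The helper name warp_sum_to is hypothetical; WARPSIZE (= 32) is
// the macro already used throughout this file. After the butterfly every lane
// holds the warp total, and lane 0 issues a single atomicAdd.
__device__ __forceinline__ void warp_sum_to(float val, float *dst) {
  for (int offset = WARPSIZE/2; offset >= 1; offset /= 2)
    val += __shfl_xor_sync(0xffffffff, val, offset, WARPSIZE);
  if ((threadIdx.x & (WARPSIZE-1)) == 0) atomicAdd(dst, val);
}
// ----------------------------------------------------------------------------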
153 __global__ static void
167 const int block_begin, const int total_block_count, int* block_order,
169 const float3 lata, const float3 latb, const float3 latc,
179 __shared__ patch_pair sh_patch_pair;
182 SLOW(__shared__ float3 sh_jforce_slow_2d[NUM_WARP][WARPSIZE];)
184 #ifndef KEPLER_SHUFFLE
185 __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
192 SLOW( float totales = 0.f; )
198 #ifndef KEPLER_SHUFFLE
203 __shared__ unsigned int sh_plist_val[NUM_WARP];);
207 const int t = threadIdx.x + threadIdx.y*WARPSIZE;
210 float *p = (float *)sh_iforcesum;
211 p[threadIdx.x] = 0.0f;
215 int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
216 int* dst = (int *)&sh_patch_pair;
223 USEPAIRLIST(if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
227 if (t < sh_patch_pair.plist_size)
228 plist[sh_patch_pair.plist_start + t] = 0;
233 float offx = sh_patch_pair.offset.x * lata.x
234 + sh_patch_pair.offset.y * latb.x
235 + sh_patch_pair.offset.z * latc.x;
236 float offy = sh_patch_pair.offset.x * lata.y
237 + sh_patch_pair.offset.y * latb.y
238 + sh_patch_pair.offset.z * latc.y;
239 float offz = sh_patch_pair.offset.x * lata.z
240 + sh_patch_pair.offset.y * latb.z
241 + sh_patch_pair.offset.z * latc.z;
242 sh_patch_pair.offset.x = offx;
243 sh_patch_pair.offset.y = offy;
244 sh_patch_pair.offset.z = offz;
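// The patch-pair offset is stored as multiples of the lattice vectors; the
// lines above convert it to a Cartesian shift:
//   offset_cartesian = offset.x*lata + offset.y*latb + offset.z*latc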
252 volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
253 SLOW(volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
256 #ifndef KEPLER_SHUFFLE
257 atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
258 GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
259 USEPAIRLIST(int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
262 for (int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*NUM_WARP) {
268 if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
269 int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
270 float4 tmpa = ((float4*)atoms)[i];
271 ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
272 ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
273 ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
275 GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
277 iap.index = tmpap.y;);
278 USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
286 SLOW(float3 iforce_slow;
287 iforce_slow.x = 0.0f;
288 iforce_slow.y = 0.0f;
289 iforce_slow.z = 0.0f;)
291 const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
292 (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
293 int blockj = (diag_patch_pair) ? blocki : 0;
294 for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
297 const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
298 int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
299 int plist_ind = pos/32;
300 unsigned int plist_bit = 1 << (pos % 32);
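// Tiles are WARPSIZE x WARPSIZE blocks of the (i,j) atom range; pos is the
// linear tile index within this patch pair, and the pairlist keeps one bit
// per tile (word pos/32, bit pos%32). A cleared bit lets the warp skip the
// whole tile below.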
302 if (plist_ind != sh_plist_ind[threadIdx.y]) {
303 sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
304 sh_plist_ind[threadIdx.y] = plist_ind;
306 if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
310 #ifdef KEPLER_SHUFFLE
318 if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
319 int nfreej = sh_patch_pair.patch2_free_size - blockj;
320 int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
324 if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
325 int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
326 float4 tmpa = ((float4*)atoms)[j];
327 #ifdef KEPLER_SHUFFLE
328 jpq.position.x = tmpa.x;
329 jpq.position.y = tmpa.y;
330 jpq.position.z = tmpa.z;
333 sh_jpq[threadIdx.x].position.x = tmpa.x;
334 sh_jpq[threadIdx.x].position.y = tmpa.y;
335 sh_jpq[threadIdx.x].position.z = tmpa.z;
336 sh_jpq[threadIdx.x].charge = tmpa.w;
339 #ifdef KEPLER_SHUFFLE
344 USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
354 SLOW(float3 jforce_slow;
355 jforce_slow.x = 0.0f;
356 jforce_slow.y = 0.0f;
357 jforce_slow.z = 0.0f;
360 sh_jforce[threadIdx.x].x = 0.0f;
361 sh_jforce[threadIdx.x].y = 0.0f;
362 sh_jforce[threadIdx.x].z = 0.0f;
363 SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
364 sh_jforce_slow[threadIdx.x].y = 0.0f;
365 sh_jforce_slow[threadIdx.x].z = 0.0f;)
370 const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
371 const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
372 unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
375 int nloopi = sh_patch_pair.patch1_size - blocki;
376 if (nloopi > WARPSIZE) nloopi = WARPSIZE;
378 int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
380 const bool diag_tile = diag_patch_pair && (blocki == blockj);
384 const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
385 int t = (diag_tile) ? 1 : 0;
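// Diagonal tiles (same patch and blocki == blockj) must evaluate each
// unordered pair only once: starting at t = 1 skips the self interaction, and
// the 2*WARPSIZE-1 mask pushes the redundant half of the (lane, iteration)
// combinations to j >= WARPSIZE, where the bounds checks below discard them.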
388 #ifdef KEPLER_SHUFFLE
403 int j = (t + threadIdx.x) & modval;
404 #ifdef KEPLER_SHUFFLE
409 int j_vdw_type = jap.vdw_type;
410 int j_index = jap.index;
411 int j_excl_maxdiff = jap.excl_maxdiff;
412 int j_excl_index = jap.excl_index;
414 float j_charge = jpq.charge;
416 int j_vdw_type = jap_vdw_type;
419 GENPAIRLIST(if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
422 #ifndef KEPLER_SHUFFLE
423 float tmpx = sh_jpq[j].position.x - ipq.position.x;
424 float tmpy = sh_jpq[j].position.y - ipq.position.y;
425 float tmpz = sh_jpq[j].position.z - ipq.position.z;
427 int j_vdw_type = sh_jap[j].vdw_type;
428 int j_index = sh_jap[j].index;
429 int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
430 int j_excl_index = sh_jap[j].excl_index;
432 float j_charge = sh_jpq[j].charge;
434 int j_vdw_type = sh_jap_vdw_type[j];
437 float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
442 bool excluded = false;
443 int indexdiff = (int)(iap.index) - j_index;
444 if ( abs(indexdiff) <= j_excl_maxdiff) {
445 indexdiff += j_excl_index;
446 int indexword = ((unsigned int) indexdiff) >> 5;
451 indexword = overflow_exclusions[indexword];
453 excluded = ((indexword & (1<<(indexdiff&31))) != 0);
454 if (excluded) nexcluded++;
456 if (!excluded) excl |= 0x80000000;
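// Exclusion test: the index difference between atoms i and j, offset by
// j_excl_index, selects a bit in a packed exclusion bit stream (32 bits per
// word, hence ">> 5" and "& 31"); pairs farther apart than j_excl_maxdiff can
// never be excluded. During pairlist generation the top bit of excl records
// that this pair is not excluded; the finished 32-bit word is written to
// exclmasks[] after the tile completes (see below).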
460 ENERGY( float rsqrtfr2; );
466 float f_slow = ipq.charge * j_charge;
467 float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
469 float ev = ljab.x * ei.z + ljab.y * ei.y;
470 float ee = f_slow * ei.x;
471 SLOW( float es = f_slow * ei.w; )
473 SLOW( f_slow *= fi.w; )
477 SLOW( totales += es; )
490 sh_jforce[j].x -= fx;
491 sh_jforce[j].y -= fy;
492 sh_jforce[j].z -= fz;
495 float fx_slow = tmpx * f_slow;
496 float fy_slow = tmpy * f_slow;
497 float fz_slow = tmpz * f_slow;
498 iforce_slow.x += fx_slow;
499 iforce_slow.y += fy_slow;
500 iforce_slow.z += fz_slow;
504 jforce_slow.x -= fx_slow;
505 jforce_slow.y -= fy_slow;
506 jforce_slow.z -= fz_slow;
510 sh_jforce_slow[j].x -= fx_slow;
511 sh_jforce_slow[j].y -= fy_slow;
512 sh_jforce_slow[j].z -= fz_slow;
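// Newton's third law: the force on atom i accumulates in per-thread registers
// (iforce, iforce_slow) while the equal and opposite contribution is
// subtracted from atom j's accumulator, either a register (jforce, jforce_slow)
// or the shared sh_jforce / sh_jforce_slow arrays, depending on REG_JFORCE.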
520 #ifdef KEPLER_SHUFFLE
543 if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
544 int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
546 atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
547 atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
548 atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
549 SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
550 atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
551 atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
553 atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
554 atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
555 atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
556 SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
557 atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
558 atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
563 const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
564 int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
565 exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
566 if (threadIdx.x == 0) {
567 int plist_ind = pos/32;
568 unsigned int plist_bit = 1 << (pos % 32);
569 atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
577 if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
578 int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
579 atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
580 atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
581 atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
582 SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
583 atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
584 atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
587 #ifdef KEPLER_SHUFFLE
588 for (int i=WARPSIZE/2;i >= 1;i/=2) {
598 if (threadIdx.x == 0) {
599 sh_iforcesum[threadIdx.y].x += iforce.x;
600 sh_iforcesum[threadIdx.y].y += iforce.y;
601 sh_iforcesum[threadIdx.y].z += iforce.z;
603 sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
604 sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
605 sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
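// The warp-wide sum of iforce (and iforce_slow) is the net force this warp
// exerted across the patch-pair boundary; it is gathered into sh_iforcesum
// and later multiplied by the patch-pair offset to form the virial
// contribution of the periodic image shift (see sh_virials below).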
609 sh_jforce[threadIdx.x].x = iforce.x;
610 sh_jforce[threadIdx.x].y = iforce.y;
611 sh_jforce[threadIdx.x].z = iforce.z;
613 sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
614 sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
615 sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
618 int pos = threadIdx.x + d;
619 float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
620 float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
621 float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
623 float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
624 float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
625 float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
627 sh_jforce[threadIdx.x].x += valx;
628 sh_jforce[threadIdx.x].y += valy;
629 sh_jforce[threadIdx.x].z += valz;
631 sh_jforce_slow[threadIdx.x].x += slow_valx;
632 sh_jforce_slow[threadIdx.x].y += slow_valy;
633 sh_jforce_slow[threadIdx.x].z += slow_valz;
636 if (threadIdx.x == 0) {
637 sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
638 sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
639 sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
641 sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
642 sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
643 sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
656 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
657 __shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
658 #else // ! REG_JFORCE
660 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
661 volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
677 if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
678 float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
679 float iforcesum = 0.0f;
681 for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
682 float vx = iforcesum*sh_patch_pair.offset.x;
683 float vy = iforcesum*sh_patch_pair.offset.y;
684 float vz = iforcesum*sh_patch_pair.offset.z;
685 sh_iforcesum[threadIdx.x].x = vx;
686 sh_iforcesum[threadIdx.x].y = vy;
687 sh_iforcesum[threadIdx.x].z = vz;
689 if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
691 float* sh_virials = (float *)sh_iforcesum;
692 int patch1_ind = sh_patch_pair.patch1_ind;
693 float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
694 atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
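// Virial layout: every patch owns a block of 16 floats, entries 0-8 holding
// the 3x3 virial tensor. Threads 0-8 add into the regular tmpvirials and,
// when slow forces are enabled, threads 9-17 into slow_tmpvirials.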
702 int patch1_ind = sh_patch_pair.patch1_ind;
703 int patch2_ind = sh_patch_pair.patch2_ind;
704 if (threadIdx.x == 0 && threadIdx.y == 0) {
705 sh_patch_pair.patch_done[0] = false;
706 sh_patch_pair.patch_done[1] = false;
712 unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
713 int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
714 if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
715 if (patch1_ind != patch2_ind) {
716 unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
717 int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
718 if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
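// Patch completion: global_counters[patch_ind+2] counts finished patch pairs
// for each patch (atomicInc wraps at num_pairs-1). The block whose increment
// reaches the final count is the last one touching that patch, and it runs
// the per-patch finish (force copy and virial reduction) below.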
724 if (sh_patch_pair.patch_done[0]) {
729 #ifndef KEPLER_SHUFFLE
730 volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
731 volatile float* sh_slow_buf = NULL;
732 SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
735 patch1_ind, atoms, sh_buf,
736 #ifndef KEPLER_SHUFFLE
744 if (sh_patch_pair.patch_done[1]) {
748 #ifndef KEPLER_SHUFFLE
749 volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
750 volatile float* sh_slow_buf = NULL;
751 SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
754 patch2_ind, atoms, sh_buf,
755 #ifndef KEPLER_SHUFFLE
762 if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
764 #if __CUDA_ARCH__ < 200
767 __threadfence_system();
771 if (threadIdx.x == 0 && threadIdx.y == 0) {
772 if (sh_patch_pair.patch_done[0]) {
773 int ind = atomicInc(&global_counters[0], npatches-1);
774 force_ready_queue[ind] = patch1_ind;
776 if (sh_patch_pair.patch_done[1]) {
777 int ind = atomicInc(&global_counters[0], npatches-1);
778 force_ready_queue[ind] = patch2_ind;
781 #if __CUDA_ARCH__ < 200
784 __threadfence_system();
789 if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
790 int old = atomicInc(&global_counters[1], total_block_count-1);
791 block_order[old] = block_begin + blockIdx.x;
801 __device__ __forceinline__
804 volatile float* sh_buf,
805 #ifndef KEPLER_SHUFFLE
806 volatile float* sh_slow_buf, volatile float* sh_vcc,
822 float slow_vxx = 0.f;
823 float slow_vxy = 0.f;
824 float slow_vxz = 0.f;
825 float slow_vyx = 0.f;
826 float slow_vyy = 0.f;
827 float slow_vyz = 0.f;
828 float slow_vzx = 0.f;
829 float slow_vzy = 0.f;
830 float slow_vzz = 0.f;
832 for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < size;i+=NUM_WARP*WARPSIZE) {
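// While copying the accumulated tmpforces into the output force arrays, each
// thread also accumulates its share of the virial as the outer product of
// force and position: v_ab += f_a * r_b for a, b in {x, y, z}.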
833 const int p = start+i;
834 float4 f = tmpforces[p];
836 float4 pos = ((float4*)atoms)[p];
847 float4 slow_f = slow_tmpforces[p];
848 slow_forces[p] = slow_f;
849 slow_vxx += slow_f.x * pos.x;
850 slow_vxy += slow_f.x * pos.y;
851 slow_vxz += slow_f.x * pos.z;
852 slow_vyx += slow_f.y * pos.x;
853 slow_vyy += slow_f.y * pos.y;
854 slow_vyz += slow_f.y * pos.z;
855 slow_vzx += slow_f.z * pos.x;
856 slow_vzy += slow_f.z * pos.y;
857 slow_vzz += slow_f.z * pos.z;
860 #ifdef KEPLER_SHUFFLE
862 for (int i=WARPSIZE/2;i >= 1;i/=2) {
886 if (threadIdx.x == 0) {
887 sh_buf[threadIdx.y*(SLOW(9)+9) + 0] = vxx;
888 sh_buf[threadIdx.y*(SLOW(9)+9) + 1] = vxy;
889 sh_buf[threadIdx.y*(SLOW(9)+9) + 2] = vxz;
890 sh_buf[threadIdx.y*(SLOW(9)+9) + 3] = vyx;
891 sh_buf[threadIdx.y*(SLOW(9)+9) + 4] = vyy;
892 sh_buf[threadIdx.y*(SLOW(9)+9) + 5] = vyz;
893 sh_buf[threadIdx.y*(SLOW(9)+9) + 6] = vzx;
894 sh_buf[threadIdx.y*(SLOW(9)+9) + 7] = vzy;
895 sh_buf[threadIdx.y*(SLOW(9)+9) + 8] = vzz;
897 sh_buf[threadIdx.y*(SLOW(9)+9) + 9] = slow_vxx;
898 sh_buf[threadIdx.y*(SLOW(9)+9) + 10] = slow_vxy;
899 sh_buf[threadIdx.y*(SLOW(9)+9) + 11] = slow_vxz;
900 sh_buf[threadIdx.y*(SLOW(9)+9) + 12] = slow_vyx;
901 sh_buf[threadIdx.y*(SLOW(9)+9) + 13] = slow_vyy;
902 sh_buf[threadIdx.y*(SLOW(9)+9) + 14] = slow_vyz;
903 sh_buf[threadIdx.y*(SLOW(9)+9) + 15] = slow_vzx;
904 sh_buf[threadIdx.y*(SLOW(9)+9) + 16] = slow_vzy;
905 sh_buf[threadIdx.y*(SLOW(9)+9) + 17] = slow_vzz;
910 if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
913 for (int i=0;i < NUM_WARP;i++) v += sh_buf[i*(SLOW(9)+9) + threadIdx.x];
914 float* dst = (threadIdx.x < 9) ? virials : slow_virials;
915 const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
916 int pos = patch_ind*16 + (threadIdx.x % 9);
917 dst[pos] = v + src[pos];
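// The per-warp virial partials reduced through sh_buf are combined with the
// per-patch tmpvirials accumulated during the force loop and written to the
// output virial arrays (same 16-float-per-patch layout as above).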
919 #else // ! KEPLER_SHUFFLE
923 const int t = threadIdx.x + threadIdx.y*WARPSIZE;
924 volatile float* sh_v1 = &sh_buf[0];
925 volatile float* sh_v2 = &sh_buf[NUM_WARP*WARPSIZE];
926 volatile float* sh_v3 = &sh_buf[2*NUM_WARP*WARPSIZE];
928 volatile float* sh_slow_v1 = &sh_slow_buf[0];
929 volatile float* sh_slow_v2 = &sh_slow_buf[NUM_WARP*WARPSIZE];
930 volatile float* sh_slow_v3 = &sh_slow_buf[2*NUM_WARP*WARPSIZE];
938 sh_slow_v1[t] = slow_vxx;
939 sh_slow_v2[t] = slow_vxy;
940 sh_slow_v3[t] = slow_vxz;
942 for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
944 float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
945 float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
946 float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
948 float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
949 float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
950 float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
957 sh_slow_v1[t] += slow_v1;
958 sh_slow_v2[t] += slow_v2;
959 sh_slow_v3[t] += slow_v3;
963 if (threadIdx.x == 0 && threadIdx.y == 0) {
964 sh_vcc[0] = sh_v1[0];
965 sh_vcc[1] = sh_v2[0];
966 sh_vcc[2] = sh_v3[0];
968 sh_vcc[9+0] = sh_slow_v1[0];
969 sh_vcc[9+1] = sh_slow_v2[0];
970 sh_vcc[9+2] = sh_slow_v3[0];
978 sh_slow_v1[t] = slow_vyx;
979 sh_slow_v2[t] = slow_vyy;
980 sh_slow_v3[t] = slow_vyz;
982 for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
984 float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
985 float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
986 float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
988 float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
989 float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
990 float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
997 sh_slow_v1[t] += slow_v1;
998 sh_slow_v2[t] += slow_v2;
999 sh_slow_v3[t] += slow_v3;
1003 if (threadIdx.x == 0 && threadIdx.y == 0) {
1004 sh_vcc[3] = sh_v1[0];
1005 sh_vcc[4] = sh_v2[0];
1006 sh_vcc[5] = sh_v3[0];
1008 sh_vcc[9+3] = sh_slow_v1[0];
1009 sh_vcc[9+4] = sh_slow_v2[0];
1010 sh_vcc[9+5] = sh_slow_v3[0];
1018 sh_slow_v1[t] = slow_vzx;
1019 sh_slow_v2[t] = slow_vzy;
1020 sh_slow_v3[t] = slow_vzz;
1022 for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
1024 float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
1025 float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
1026 float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
1028 float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
1029 float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
1030 float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
1037 sh_slow_v1[t] += slow_v1;
1038 sh_slow_v2[t] += slow_v2;
1039 sh_slow_v3[t] += slow_v3;
1043 if (threadIdx.x == 0 && threadIdx.y == 0) {
1044 sh_vcc[6] = sh_v1[0];
1045 sh_vcc[7] = sh_v2[0];
1046 sh_vcc[8] = sh_v3[0];
1048 sh_vcc[9+6] = sh_slow_v1[0];
1049 sh_vcc[9+7] = sh_slow_v2[0];
1050 sh_vcc[9+8] = sh_slow_v3[0];
1054 if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
1055 float* dst = (threadIdx.x < 9) ? virials : slow_virials;
1056 const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
1057 int pos = patch_ind*16 + (threadIdx.x % 9);
1058 dst[pos] = sh_vcc[threadIdx.x] + src[pos];
1060 #endif // KEPLER_SHUFFLE
1063 if (threadIdx.x < 3 && threadIdx.y == 0) {
1064 int pos = patch_ind*16 + 9 + threadIdx.x;
1065 virials[pos] = tmpvirials[pos];
1069 if (threadIdx.x == 0 && threadIdx.y == 0) {
1070 int pos = patch_ind*16 + 12;
1071 virials[pos] = tmpvirials[pos];
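// Entries 9-12 of each patch's 16-float block are per-patch scalar
// accumulators (energy terms etc.) that need no further reduction here; they
// are copied from tmpvirials straight through to the output array.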