#if __CUDA_ARCH__ < 300
#undef KEPLER_SHUFFLE  // warp shuffle needs compute capability >= 3.0
#endif

//
// GBIS Phase 1 CUDA kernel: accumulates the pairwise descreening sums
// (psiSum) used to compute Born radii.
//
static __global__ void GBIS_P1_Kernel(
  const patch_pair *patch_pairs,
  const atom *atoms,
  const float *intRad0,
  const float *intRadS,
  GBReal *tmp_psiSum,
  GBReal *psiSum,
  const float a_cut,
  const float rho_0,
  float3 lata,
  float3 latb,
  float3 latc,
  unsigned int *P1_counters
) {
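  // Thread-block geometry assumed throughout this file: threadIdx.x is the
  // lane within a warp (0..WARPSIZE-1), threadIdx.y is the warp within the
  // block (0..NUM_WARP-1), and blockIdx.x selects the patch pair this block
  // handles.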
  // shared memory
  __shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_intRad0j_2d[NUM_WARP][WARPSIZE];
#endif
  __shared__ GBReal sh_psiSumJ_2d[NUM_WARP][WARPSIZE];

  // per-warp views of the 2D shared arrays
  volatile GBReal* sh_psiSumJ = sh_psiSumJ_2d[threadIdx.y];
#ifndef KEPLER_SHUFFLE
  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
  volatile float* sh_intRad0j = sh_intRad0j_2d[threadIdx.y];
#endif
  // cooperatively copy this block's patch-pair descriptor into shared memory
  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
  if (t < PATCH_PAIR_SIZE) {
    int* src = (int *)&patch_pairs[blockIdx.x];
    int* dst = (int *)&sh_patch_pair;
    dst[t] = src[t];
  }
  __syncthreads();
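  // The copy above moves the descriptor one int per thread; PATCH_PAIR_SIZE is
  // assumed to be sizeof(patch_pair)/sizeof(int), so a single pass by the
  // first PATCH_PAIR_SIZE threads covers the whole struct.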
  // convert the scaled patch offset to Cartesian coordinates; a single thread
  // does the in-place update since the transform reads and then overwrites
  // sh_patch_pair.offset
  if (t == 0) {
    float offx = sh_patch_pair.offset.x * lata.x
               + sh_patch_pair.offset.y * latb.x
               + sh_patch_pair.offset.z * latc.x;
    float offy = sh_patch_pair.offset.x * lata.y
               + sh_patch_pair.offset.y * latb.y
               + sh_patch_pair.offset.z * latc.y;
    float offz = sh_patch_pair.offset.x * lata.z
               + sh_patch_pair.offset.y * latb.z
               + sh_patch_pair.offset.z * latc.z;
    sh_patch_pair.offset.x = offx;
    sh_patch_pair.offset.y = offy;
    sh_patch_pair.offset.z = offz;
  }
  __syncthreads();
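  // The offset arrives in scaled (lattice) coordinates; the transform above
  // maps it to Cartesian space as off = ox*lata + oy*latb + oz*latc, so atom
  // positions from the neighboring periodic image can be shifted by simple
  // addition below.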
  // outer loop: each warp takes a WARPSIZE-wide slab of patch-1 atoms
  for (int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*WARPSIZE) {

    int nloopi = min(sh_patch_pair.patch1_size - blocki, WARPSIZE);

    // per-thread i-atom data and psi accumulator
    int i;
    atom atomi;
    float intRadSi;
    GBReal psiSumI = 0.f;
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
      i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[i];
      atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
      atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
      atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
      atomi.charge = intRad0[i];  // note: charge field repurposed to carry intRad0
      intRadSi = intRadS[i];
    }
    const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
      (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
    int blockj = (diag_patch_pair) ? blocki : 0;
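    // For a patch paired with itself at zero offset, tiles below the diagonal
    // would duplicate pairs already handled above it (each interaction is
    // accumulated to both i and j), so the j loop starts at blocki instead
    // of 0.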
    for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE ) {

      int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
#ifdef KEPLER_SHUFFLE
      // shuffle path keeps j-atom data in registers and rotates it by lane
      float xj, yj, zj;
      float chargej;
      float intRad0j_val;
#endif

      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
        xj = tmpa.x;
        yj = tmpa.y;
        zj = tmpa.z;
        chargej = intRadS[j];
        intRad0j_val = intRad0[j];
#else
        sh_jpq[threadIdx.x].position.x = tmpa.x;
        sh_jpq[threadIdx.x].position.y = tmpa.y;
        sh_jpq[threadIdx.x].position.z = tmpa.z;
        sh_jpq[threadIdx.x].charge = intRadS[j];
        sh_intRad0j[threadIdx.x] = intRad0[j];
#endif
      }
      const bool diag_tile = diag_patch_pair && (blocki == blockj);
      // on diagonal tiles j runs unwrapped (j = t + threadIdx.x) so each pair
      // is visited exactly once; off-diagonal tiles rotate j through the warp
      const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
      sh_psiSumJ[threadIdx.x] = 0.f;
      int t = diag_tile ? 1 : 0;  // skip the self interaction on diagonal tiles
#ifdef KEPLER_SHUFFLE
      if (diag_tile) {
        // pre-rotate register data one lane so lane k starts at j = k+1
        // (WARP_FULL_MASK assumed defined alongside WARP_SHUFFLE)
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, threadIdx.x+1, WARPSIZE);
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, threadIdx.x+1, WARPSIZE);
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, threadIdx.x+1, WARPSIZE);
        chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, threadIdx.x+1, WARPSIZE);
        intRad0j_val = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j_val, threadIdx.x+1, WARPSIZE);
      }
#endif
      for (; t < WARPSIZE; ++t) {
        int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
        float xj = sh_jpq[j].position.x;
        float yj = sh_jpq[j].position.y;
        float zj = sh_jpq[j].position.z;
        float chargej = sh_jpq[j].charge;
        float intRad0j_val = sh_intRad0j[j];
#endif
        if (j < nloopj && threadIdx.x < nloopi)
        {
          float dx = atomi.position.x - xj;
          float dy = atomi.position.y - yj;
          float dz = atomi.position.z - zj;
          float r2 = dx*dx + dy*dy + dz*dz;

          // within the P1 cutoff and not the same atom
          // (FS_MAX: maximum screening-radius correction, defined elsewhere)
          if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f)
          {
            float r_i = 1.f / sqrt(r2);
            float r = r2 * r_i;
            float hij, hji;
            int dij, dji;
            // H(i,j): how much atom j descreens atom i, and vice versa
            CalcH(r,r2,r_i,a_cut,atomi.charge,chargej,hij,dij);
            psiSumI += hij;
            CalcH(r,r2,r_i,a_cut,intRad0j_val,intRadSi,hji,dji);
            sh_psiSumJ[j] += hji;
          }
        }
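        // Within a warp every lane holds a distinct j = (t + threadIdx.x) %
        // modval at each t, so the scattered sh_psiSumJ[j] updates never
        // collide; lockstep warp execution is what makes this unsynchronized
        // shared-memory accumulation safe on these architectures.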
#ifdef KEPLER_SHUFFLE
        // rotate j-atom registers to the next lane for the next iteration
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, threadIdx.x+1, WARPSIZE);
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, threadIdx.x+1, WARPSIZE);
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, threadIdx.x+1, WARPSIZE);
        chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, threadIdx.x+1, WARPSIZE);
        intRad0j_val = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j_val, threadIdx.x+1, WARPSIZE);
#endif
      } // for t

      // flush accumulated psi sums for this tile's patch-2 atoms
      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        atomicAdd(&tmp_psiSum[i_out], sh_psiSumJ[threadIdx.x]);
      }
    } // for blockj

    // flush this thread's psi sum for its patch-1 atom
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
      int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      atomicAdd(&tmp_psiSum[i_out], psiSumI);
    }
  } // for blocki
  // make this block's tmp_psiSum contributions visible before signaling
  // completion, then mark the patch pair done
  __threadfence();
  __syncthreads();

  int patch1_ind = sh_patch_pair.patch1_ind;
  int patch2_ind = sh_patch_pair.patch2_ind;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    sh_patch_pair.patch_done[0] = false;
    sh_patch_pair.patch_done[1] = false;
    unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
    int patch1_old = atomicInc(&P1_counters[patch1_ind], patch1_num_pairs-1);
    if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
    if (patch1_ind != patch2_ind) {
      unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
      int patch2_old = atomicInc(&P1_counters[patch2_ind], patch2_num_pairs-1);
      if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
    }
  }
  __syncthreads();
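  // atomicInc(ptr, val) increments with wraparound: it returns the old value
  // and resets the counter to 0 once it reaches val. With val = num_pairs-1
  // the counter is automatically zeroed for the next invocation, so the last
  // block to arrive (old+1 == num_pairs) both detects completion and leaves
  // the counter ready for reuse. The counters must start at zero the first
  // time the kernel runs.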
  // if this block completed a patch, copy its results to host-visible memory
  if (sh_patch_pair.patch_done[0]) {
    const int start = sh_patch_pair.patch1_start;
    for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
      psiSum[start+i] = tmp_psiSum[start+i];
    }
  }

  if (sh_patch_pair.patch_done[1]) {
    const int start = sh_patch_pair.patch2_start;
    for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
      psiSum[start+i] = tmp_psiSum[start+i];
    }
  }
  if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
    // guarantee the writes to host-mapped psiSum are visible to the host
    // (__threadfence_system() requires compute capability >= 2.0)
#if __CUDA_ARCH__ < 200
    __threadfence();
#else
    __threadfence_system();
#endif
  }
}
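// For orientation, a hypothetical host-side launch (the real driver code
// lives elsewhere; names with a _d/_hm suffix are assumptions, not the
// actual variables): one block per patch pair, NUM_WARP warps of WARPSIZE
// threads.
//
//   dim3 nthread(WARPSIZE, NUM_WARP, 1);
//   GBIS_P1_Kernel<<<num_patch_pairs, nthread, 0, stream>>>(
//     patch_pairs_d, atoms_d, intRad0_d, intRadS_d,
//     tmp_psiSum_d, psiSum_hm,  // psiSum host-mapped, hence the fences above
//     a_cut, rho_0, lata, latb, latc, P1_counters_d);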
//
// GBIS Phase 2 CUDA kernel: computes generalized Born energies, forces,
// and dE/da sums from the Born radii produced after phase 1.
//
static __global__ void GBIS_P2_Kernel(
  const patch_pair *patch_pairs,
  const atom *atoms,
  const float *bornRad,
  GBReal *tmp_dEdaSum,
  GBReal *dEdaSum,
  const float a_cut,
  const float r_cut,
  const float scaling,
  const float kappa,
  const float smoothDist,
  const float epsilon_p,
  const float epsilon_s,
  float3 lata,
  float3 latb,
  float3 latc,
  const int doEnergy,
  const int doFullElec,
  float4 *tmp_forces,
  float4 *forces,
  float *tmp_energy,
  float *energy,
  unsigned int *P2_counters
) {
  // shared memory
  __shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_jBornRad_2d[NUM_WARP][WARPSIZE];
#endif
  __shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_dEdaSumJ_2d[NUM_WARP][WARPSIZE];

  // per-warp views of the 2D shared arrays
#ifndef KEPLER_SHUFFLE
  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
  volatile float* sh_jBornRad = sh_jBornRad_2d[threadIdx.y];
#endif
  volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
  volatile float* sh_dEdaSumJ = sh_dEdaSumJ_2d[threadIdx.y];
  // cooperatively copy this block's patch-pair descriptor into shared memory
  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
  if (t < PATCH_PAIR_SIZE) {
    int* src = (int *)&patch_pairs[blockIdx.x];
    int* dst = (int *)&sh_patch_pair;
    dst[t] = src[t];
  }
  __syncthreads();
  // convert the scaled patch offset to Cartesian coordinates (see P1)
  if (t == 0) {
    float offx = sh_patch_pair.offset.x * lata.x
               + sh_patch_pair.offset.y * latb.x
               + sh_patch_pair.offset.z * latc.x;
    float offy = sh_patch_pair.offset.x * lata.y
               + sh_patch_pair.offset.y * latb.y
               + sh_patch_pair.offset.z * latc.y;
    float offz = sh_patch_pair.offset.x * lata.z
               + sh_patch_pair.offset.y * latb.z
               + sh_patch_pair.offset.z * latc.z;
    sh_patch_pair.offset.x = offx;
    sh_patch_pair.offset.y = offy;
    sh_patch_pair.offset.z = offz;
  }
  __syncthreads();
  // constants reused in the inner loop
  float r_cut2 = r_cut*r_cut;
  float r_cut_2 = 1.f / r_cut2;
  float r_cut_4 = 4.f*r_cut_2*r_cut_2;
  float epsilon_s_i = 1.f / epsilon_s;
  float epsilon_p_i = 1.f / epsilon_p;
  float energyT = 0.f;  // per-thread energy accumulator
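  // r_cut_2 = 1/rc^2 and r_cut_4 = 4/rc^4 feed the smoothing function below:
  // scale = (r^2/rc^2 - 1)^2 and d(scale)/dr = 4 r (r^2 - rc^2) / rc^4.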
  // outer loop: each warp takes a WARPSIZE-wide slab of patch-1 atoms
  for (int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += BLOCK_SIZE ) {

    int nloopi = min(sh_patch_pair.patch1_size - blocki, WARPSIZE);

    // per-thread i-atom data and accumulators
    int i;
    atom atomi;
    float bornRadI;
    GBReal dEdaSumI = 0.f;
    float3 forceI;
    forceI.x = forceI.y = forceI.z = 0.f;
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
      i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[i];
      atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
      atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
      atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
      atomi.charge = - tmpa.w * scaling;  // charge pre-scaled for GB energy
      bornRadI = bornRad[i];
    }
    const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
      (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
    int blockj = (diag_patch_pair) ? blocki : 0;

    for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE) {

      int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
#ifdef KEPLER_SHUFFLE
      // shuffle path keeps j-atom data in registers and rotates it by lane
      float xj, yj, zj;
      float chargej;
      float bornRadJ;
#endif

      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
        xj = tmpa.x;
        yj = tmpa.y;
        zj = tmpa.z;
        chargej = tmpa.w;
        bornRadJ = bornRad[j];
#else
        sh_jpq[threadIdx.x].position.x = tmpa.x;
        sh_jpq[threadIdx.x].position.y = tmpa.y;
        sh_jpq[threadIdx.x].position.z = tmpa.z;
        sh_jpq[threadIdx.x].charge = tmpa.w;
        sh_jBornRad[threadIdx.x] = bornRad[j];
#endif
      }
      sh_forceJ[threadIdx.x].x = 0.f;
      sh_forceJ[threadIdx.x].y = 0.f;
      sh_forceJ[threadIdx.x].z = 0.f;
      sh_dEdaSumJ[threadIdx.x] = 0.f;

      const bool diag_tile = diag_patch_pair && (blocki == blockj);
      const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
      // unlike P1/P3, t starts at 0: the j == i case is needed below for the
      // self-energy term
      for (int t = 0; t < WARPSIZE; ++t) {
        int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
        float xj = sh_jpq[j].position.x;
        float yj = sh_jpq[j].position.y;
        float zj = sh_jpq[j].position.z;
        float chargej = sh_jpq[j].charge;
        float bornRadJ = sh_jBornRad[j];
#endif
        if (j < nloopj && threadIdx.x < nloopi)
        {
          float dx = atomi.position.x - xj;
          float dy = atomi.position.y - yj;
          float dz = atomi.position.z - zj;
          float r2 = dx*dx + dy*dy + dz*dz;
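          // Generalized Born pair energy (Still et al. form): the effective
          // distance f_GB = sqrt(r^2 + a_i a_j exp(-r^2 / (4 a_i a_j)))
          // interpolates between Coulomb behavior at large r and Born
          // self-energy behavior at small r; D_ij = 1/eps_p -
          // exp(-kappa f_GB)/eps_s adds salt screening. The block below
          // computes E_ij = q_i q_j D_ij / f_GB and its r-derivative.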
          // pair within cutoff and not the same atom
          if (r2 < r_cut2 && r2 > 0.01f) {

            float r_i = 1.f / sqrt(r2);
            float r = r2 * r_i;

            // GB effective distance f_GB and pair energy
            float qiqj = atomi.charge*chargej;
            float aiaj = bornRadI*bornRadJ;
            float aiaj4 = 4*aiaj;
            float expr2aiaj4 = exp(-r2/aiaj4);
            float fij = sqrt(r2+aiaj*expr2aiaj4);
            float f_i = 1.f/fij;
            float expkappa = exp(-kappa*fij);
            float Dij = epsilon_p_i - expkappa*epsilon_s_i;
            float gbEij = qiqj*Dij*f_i;

            // derivative of the GB pair energy with respect to r
            float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
            float ddrf_i = -ddrfij*f_i*f_i;
            float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
            float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
            // NAMD smoothing function: scale = (r^2/rc^2 - 1)^2
            float scale = 1.f;
            float ddrScale = 0.f;
            float forcedEdr;
            if (smoothDist > 0.f) {
              scale = r2 * r_cut_2 - 1.f;
              scale *= scale;
              ddrScale = r*(r2-r_cut2)*r_cut_4;
              energyT += gbEij * scale;
              forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
            } else {
              energyT += gbEij;
              forcedEdr = -ddrGbEij;
            }
            // accumulate dE/da for both atoms of the pair
            if (doFullElec) {
              float dEdai = 0.5f*qiqj*f_i*f_i
                *(kappa*epsilon_s_i*expkappa-Dij*f_i)
                *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;
              dEdaSumI += dEdai;
              float dEdaj = 0.5f*qiqj*f_i*f_i
                *(kappa*epsilon_s_i*expkappa-Dij*f_i)
                *(aiaj+0.25f*r2)*expr2aiaj4/bornRadJ*scale;
              sh_dEdaSumJ[j] += dEdaj;
            }
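            // dEdai and dEdaj differ only in the 1/bornRad factor; both
            // halves are banked (dEdaSumI in a register, dEdaSumJ in shared
            // memory) and later combined with the Born-radius derivatives in
            // phase 3.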
            // convert dE/dr into a factor on the displacement vector and
            // apply equal and opposite forces
            forcedEdr *= r_i;
            float tmpx = dx*forcedEdr;
            float tmpy = dy*forcedEdr;
            float tmpz = dz*forcedEdr;
            forceI.x += tmpx;
            forceI.y += tmpy;
            forceI.z += tmpz;
            sh_forceJ[j].x -= tmpx;
            sh_forceJ[j].y -= tmpy;
            sh_forceJ[j].z -= tmpz;
          } // within cutoff
          // self-energy term, hit once per atom when j == i on diagonal tiles
          if (r2 < 0.01f) {
            float fij = bornRadI;
            float expkappa = exp(-kappa*fij);
            float Dij = epsilon_p_i - expkappa*epsilon_s_i;
            float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
            energyT += 0.5f*gbEij;
          }
        } // if (j < nloopj ...)
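        // For the self term, f_GB reduces to the Born radius itself; dividing
        // one charge factor by -scaling recovers the raw charge, since
        // atomi.charge was pre-scaled at load while chargej carries the raw
        // value in the pair terms, keeping both contributions on the same
        // scale.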
#ifdef KEPLER_SHUFFLE
        // rotate j-atom registers to the next lane for the next iteration
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, threadIdx.x+1, WARPSIZE);
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, threadIdx.x+1, WARPSIZE);
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, threadIdx.x+1, WARPSIZE);
        chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, threadIdx.x+1, WARPSIZE);
        bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, threadIdx.x+1, WARPSIZE);
#endif
      } // for t

      // flush accumulated j-atom results for this tile
      if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        atomicAdd(&tmp_dEdaSum[i_out], sh_dEdaSumJ[threadIdx.x]);
        atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
        atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
        atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
      }
    } // for blockj
    // flush this thread's i-atom results
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
      int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      atomicAdd(&tmp_dEdaSum[i_out], dEdaSumI);
      atomicAdd(&tmp_forces[i_out].x, forceI.x);
      atomicAdd(&tmp_forces[i_out].y, forceI.y);
      atomicAdd(&tmp_forces[i_out].z, forceI.z);
    }
  } // for blocki
  // reduce per-thread energies, reusing sh_forceJ storage (already flushed)
  // as warp-local scratch
  if (doEnergy) {
    volatile float* sh_energy = (float *)&sh_forceJ_2d[threadIdx.y][0].x;
    sh_energy[threadIdx.x] = (float)energyT;
    for (int d = 1; d < WARPSIZE; d *= 2) {
      int pos = threadIdx.x + d;
      float val = (pos < WARPSIZE) ? sh_energy[pos] : 0.0f;
      sh_energy[threadIdx.x] += val;
    }
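    // Classic warp-synchronous tree reduction: strides d = 1, 2, 4, ...,
    // WARPSIZE/2 fold the upper half onto the lower each step, leaving the
    // warp total in sh_energy[0]. It relies on volatile shared memory and
    // lockstep warp execution; on independent-thread-scheduling GPUs (Volta
    // and later) this idiom would need __syncwarp() between steps.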
    __syncthreads();
    // combine the warp leaders' partial sums and bank the block's energy
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      float tot_energy = 0.0f;
      for (int i=0;i < NUM_WARP;++i) {
        tot_energy += ((float *)&sh_forceJ_2d[i][0].x)[0];
      }
      int patch1_ind = sh_patch_pair.patch1_ind;
      atomicAdd(&tmp_energy[patch1_ind], (float)tot_energy);
    }
  } // if (doEnergy)
  // make this block's contributions visible, then mark the patch pair done
  __threadfence();
  __syncthreads();

  int patch1_ind = sh_patch_pair.patch1_ind;
  int patch2_ind = sh_patch_pair.patch2_ind;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    sh_patch_pair.patch_done[0] = false;
    sh_patch_pair.patch_done[1] = false;
    unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
    int patch1_old = atomicInc(&P2_counters[patch1_ind], patch1_num_pairs-1);
    if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
    if (patch1_ind != patch2_ind) {
      unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
      int patch2_old = atomicInc(&P2_counters[patch2_ind], patch2_num_pairs-1);
      if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
    }
  }
  __syncthreads();
  // if this block completed a patch, copy its results to host-visible memory
  if (sh_patch_pair.patch_done[0]) {
    const int start = sh_patch_pair.patch1_start;
    for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
      forces[start+i] = tmp_forces[start+i];
      dEdaSum[start+i] = tmp_dEdaSum[start+i];
    }
    energy[patch1_ind] = tmp_energy[patch1_ind];
  }
  if (sh_patch_pair.patch_done[1]) {
    const int start = sh_patch_pair.patch2_start;
    for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
      forces[start+i] = tmp_forces[start+i];
      dEdaSum[start+i] = tmp_dEdaSum[start+i];
    }
    energy[patch2_ind] = tmp_energy[patch2_ind];
  }
  if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
    // guarantee the writes to host-mapped memory are visible to the host
#if __CUDA_ARCH__ < 200
    __threadfence();
#else
    __threadfence_system();
#endif
  }
}
//
// GBIS Phase 3 CUDA kernel: computes the Born-radius contribution to the
// forces using the dHdrPrefix factors accumulated after phase 2.
//
static __global__ void GBIS_P3_Kernel(
  const patch_pair *patch_pairs,
  const atom *atoms,
  const float *intRad0,
  const float *intRadS,
  const float *dHdrPrefix,
  const float a_cut,
  const float rho_0,
  const float scaling,
  float3 lata,
  float3 latb,
  float3 latc,
  float4 *tmp_forces,
  float4 *forces,
  unsigned int *P3_counters
) {
  // shared memory
  __shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_intRadJ0_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_jDHdrPrefix_2d[NUM_WARP][WARPSIZE];
#endif
  __shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];

  // per-warp views of the 2D shared arrays
#ifndef KEPLER_SHUFFLE
  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
  volatile float* sh_intRadJ0 = sh_intRadJ0_2d[threadIdx.y];
  volatile float* sh_jDHdrPrefix = sh_jDHdrPrefix_2d[threadIdx.y];
#endif
  volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
  // cooperatively copy this block's patch-pair descriptor into shared memory
  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
  if (t < PATCH_PAIR_SIZE) {
    int* src = (int *)&patch_pairs[blockIdx.x];
    int* dst = (int *)&sh_patch_pair;
    dst[t] = src[t];
  }
  __syncthreads();
  // convert the scaled patch offset to Cartesian coordinates (see P1)
  if (t == 0) {
    float offx = sh_patch_pair.offset.x * lata.x
               + sh_patch_pair.offset.y * latb.x
               + sh_patch_pair.offset.z * latc.x;
    float offy = sh_patch_pair.offset.x * lata.y
               + sh_patch_pair.offset.y * latb.y
               + sh_patch_pair.offset.z * latc.y;
    float offz = sh_patch_pair.offset.x * lata.z
               + sh_patch_pair.offset.y * latb.z
               + sh_patch_pair.offset.z * latc.z;
    sh_patch_pair.offset.x = offx;
    sh_patch_pair.offset.y = offy;
    sh_patch_pair.offset.z = offz;
  }
  __syncthreads();
  // outer loop: each warp takes a WARPSIZE-wide slab of patch-1 atoms
  for (int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*WARPSIZE ) {

    int nloopi = min(sh_patch_pair.patch1_size - blocki, WARPSIZE);

    // per-thread i-atom data and force accumulator
    int i;
    atom atomi;
    float intRadIS;
    float dHdrPrefixI;
    float3 forceI;
    forceI.x = forceI.y = forceI.z = 0.f;
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
      i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[i];
      atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
      atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
      atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
      atomi.charge = intRad0[i];  // note: charge field repurposed to carry intRad0
      intRadIS = intRadS[i];
      dHdrPrefixI = dHdrPrefix[i];
    }
    const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
      (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
    int blockj = (diag_patch_pair) ? blocki : 0;

    for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE ) {

      int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
#ifdef KEPLER_SHUFFLE
      // shuffle path keeps j-atom data in registers and rotates it by lane
      float xj, yj, zj;
      float intRadSJ;
      float dHdrPrefixJ;
      float intRadJ0;
#endif

      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
        xj = tmpa.x;
        yj = tmpa.y;
        zj = tmpa.z;
        intRadSJ = intRadS[j];
        dHdrPrefixJ = dHdrPrefix[j];
        intRadJ0 = intRad0[j];
#else
        sh_jpq[threadIdx.x].position.x = tmpa.x;
        sh_jpq[threadIdx.x].position.y = tmpa.y;
        sh_jpq[threadIdx.x].position.z = tmpa.z;
        sh_jpq[threadIdx.x].charge = intRadS[j];
        sh_jDHdrPrefix[threadIdx.x] = dHdrPrefix[j];
        sh_intRadJ0[threadIdx.x] = intRad0[j];
#endif
      }
      sh_forceJ[threadIdx.x].x = 0.f;
      sh_forceJ[threadIdx.x].y = 0.f;
      sh_forceJ[threadIdx.x].z = 0.f;

      const bool diag_tile = diag_patch_pair && (blocki == blockj);
#ifdef KEPLER_SHUFFLE
      if (diag_tile) {
        // pre-rotate register data one lane so lane k starts at j = k+1
        // (WARP_FULL_MASK assumed defined alongside WARP_SHUFFLE)
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, threadIdx.x+1, WARPSIZE);
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, threadIdx.x+1, WARPSIZE);
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, threadIdx.x+1, WARPSIZE);
        intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, threadIdx.x+1, WARPSIZE);
        dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, threadIdx.x+1, WARPSIZE);
        intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, threadIdx.x+1, WARPSIZE);
      }
#endif
      int t = diag_tile ? 1 : 0;  // skip the self interaction on diagonal tiles
      const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
      for (; t < WARPSIZE; ++t) {
        int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
        float xj = sh_jpq[j].position.x;
        float yj = sh_jpq[j].position.y;
        float zj = sh_jpq[j].position.z;
        float intRadSJ = sh_jpq[j].charge;
        float dHdrPrefixJ = sh_jDHdrPrefix[j];
        float intRadJ0 = sh_intRadJ0[j];
#endif
        if (j < nloopj && threadIdx.x < nloopi)
        {
          float dx = atomi.position.x - xj;
          float dy = atomi.position.y - yj;
          float dz = atomi.position.z - zj;
          float r2 = dx*dx + dy*dy + dz*dz;
          // within the P3 cutoff and not the same atom
          // (FS_MAX: maximum screening-radius correction, defined elsewhere)
          if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f)
          {
            float r_i = 1.f / sqrt(r2);
            float r = r2 * r_i;
            float dhij, dhji;
            int dij, dji;
            CalcDH(r,r2,r_i,a_cut,atomi.charge,intRadSJ,dhij,dij);
            CalcDH(r,r2,r_i,a_cut,intRadJ0,intRadIS,dhji,dji);

            float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
            float tmpx = dx * forceAlpha;
            float tmpy = dy * forceAlpha;
            float tmpz = dz * forceAlpha;
            forceI.x += tmpx;
            forceI.y += tmpy;
            forceI.z += tmpz;
            sh_forceJ[j].x -= tmpx;
            sh_forceJ[j].y -= tmpy;
            sh_forceJ[j].z -= tmpz;
          } // within cutoff
        } // if (j < nloopj ...)
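        // forceAlpha applies the chain rule through the Born radii: the
        // per-atom dHdrPrefix factor (computed between phases) multiplies
        // dH/dr from CalcDH for each direction of the pair, and the -r_i
        // factor turns the result into a coefficient on the displacement
        // vector, giving equal and opposite contributions to i and j.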
#ifdef KEPLER_SHUFFLE
        // rotate j-atom registers to the next lane for the next iteration
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, threadIdx.x+1, WARPSIZE);
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, threadIdx.x+1, WARPSIZE);
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, threadIdx.x+1, WARPSIZE);
        intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, threadIdx.x+1, WARPSIZE);
        dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, threadIdx.x+1, WARPSIZE);
        intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, threadIdx.x+1, WARPSIZE);
#endif
      } // for t

      // flush accumulated j-atom forces for this tile
      if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
        atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
        atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
      }
    } // for blockj
    // flush this thread's i-atom force
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
      int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      atomicAdd(&tmp_forces[i_out].x, forceI.x);
      atomicAdd(&tmp_forces[i_out].y, forceI.y);
      atomicAdd(&tmp_forces[i_out].z, forceI.z);
    }
  } // for blocki
  // make this block's contributions visible, then mark the patch pair done
  __threadfence();
  __syncthreads();

  int patch1_ind = sh_patch_pair.patch1_ind;
  int patch2_ind = sh_patch_pair.patch2_ind;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    sh_patch_pair.patch_done[0] = false;
    sh_patch_pair.patch_done[1] = false;
    unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
    int patch1_old = atomicInc(&P3_counters[patch1_ind], patch1_num_pairs-1);
    if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
    if (patch1_ind != patch2_ind) {
      unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
      int patch2_old = atomicInc(&P3_counters[patch2_ind], patch2_num_pairs-1);
      if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
    }
  }
  __syncthreads();
  // if this block completed a patch, copy its forces to host-visible memory
  if (sh_patch_pair.patch_done[0]) {
    const int start = sh_patch_pair.patch1_start;
    for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
      forces[start+i] = tmp_forces[start+i];
    }
  }

  if (sh_patch_pair.patch_done[1]) {
    const int start = sh_patch_pair.patch2_start;
    for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
      forces[start+i] = tmp_forces[start+i];
    }
  }
  if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
    // guarantee the writes to host-mapped forces are visible to the host
#if __CUDA_ARCH__ < 200
    __threadfence();
#else
    __threadfence_system();
#endif
  }
}
//
// Summary of symbols defined in this file. CalcH and CalcDH are called from
// device code, so they are __device__ functions; the WARP_SHUFFLE macro body
// is defined elsewhere (see the sketch below).
//
//   #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
//
//   __device__ static void CalcH(float r, float r2, float ri, float rc,
//       float r0, float rs, float &h, int &d);
//   __device__ static void CalcDH(float r, float r2, float ri, float rc,
//       float r0, float rs, float &dh, int &d);
//
//   static __global__ void GBIS_P1_Kernel(const patch_pair *patch_pairs,
//       const atom *atoms, const float *intRad0, const float *intRadS,
//       GBReal *tmp_psiSum, GBReal *psiSum, const float a_cut,
//       const float rho_0, float3 lata, float3 latb, float3 latc,
//       unsigned int *P1_counters);
//   static __global__ void GBIS_P2_Kernel(const patch_pair *patch_pairs,
//       const atom *atoms, const float *bornRad, GBReal *tmp_dEdaSum,
//       GBReal *dEdaSum, const float a_cut, const float r_cut,
//       const float scaling, const float kappa, const float smoothDist,
//       const float epsilon_p, const float epsilon_s, float3 lata,
//       float3 latb, float3 latc, const int doEnergy, const int doFullElec,
//       float4 *tmp_forces, float4 *forces, float *tmp_energy, float *energy,
//       unsigned int *P2_counters);
//   static __global__ void GBIS_P3_Kernel(const patch_pair *patch_pairs,
//       const atom *atoms, const float *intRad0, const float *intRadS,
//       const float *dHdrPrefix, const float a_cut, const float rho_0,
//       const float scaling, float3 lata, float3 latb, float3 latc,
//       float4 *tmp_forces, float4 *forces, unsigned int *P3_counters);
//
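// A minimal sketch of how a WARP_SHUFFLE macro of this shape is typically
// implemented (an assumption, not this file's actual definition: the masked
// __shfl_sync exists since CUDA 9, the unmasked __shfl before that):
//
//   #if CUDART_VERSION >= 9000
//   #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) __shfl_sync(MASK, VAR, LANE, SIZE)
//   #else
//   #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) __shfl(VAR, LANE, SIZE)
//   #endif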