NAMD
ComputeBondedCUDAKernel.cu File Reference
#include <math.h>
#include <cuda.h>
#include <namd_cub/cub.cuh>
#include "ComputeBondedCUDAKernel.h"
#include "DeviceCUDA.h"


Macros

#define BONDEDFORCESKERNEL_NUM_WARP   4
 
#define CALL(DOENERGY, DOVIRIAL)
 
#define CALL(DOENERGY, DOVIRIAL, DOELECT)
 

Functions

__device__ long long int lliroundf (float f)
 
__device__ unsigned long long int llitoulli (long long int l)
 
template<typename T >
__forceinline__ __device__ void convertForces (const float x, const float y, const float z, T &Tx, T &Ty, T &Tz)
 
template<>
__forceinline__ __device__ void convertForces< long long int > (const float x, const float y, const float z, long long int &Tx, long long int &Ty, long long int &Tz)
 
template<typename T >
__forceinline__ __device__ void calcComponentForces (float fij, const float dx, const float dy, const float dz, T &fxij, T &fyij, T &fzij)
 
template<>
__forceinline__ __device__ void calcComponentForces< long long int > (float fij, const float dx, const float dy, const float dz, long long int &fxij, long long int &fyij, long long int &fzij)
 
template<typename T >
__forceinline__ __device__ void calcComponentForces (float fij1, const float dx1, const float dy1, const float dz1, float fij2, const float dx2, const float dy2, const float dz2, T &fxij, T &fyij, T &fzij)
 
template<>
__forceinline__ __device__ void calcComponentForces< long long int > (float fij1, const float dx1, const float dy1, const float dz1, float fij2, const float dx2, const float dy2, const float dz2, long long int &fxij, long long int &fyij, long long int &fzij)
 
template<typename T >
__forceinline__ __device__ void storeForces (const T fx, const T fy, const T fz, const int ind, const int stride, T *force)
 
template<>
__forceinline__ __device__ void storeForces< long long int > (const long long int fx, const long long int fy, const long long int fz, const int ind, const int stride, long long int *force)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void bondForce (const int index, const CudaBond *__restrict__ bondList, const CudaBondValue *__restrict__ bondValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce (const int index, const CudaExclusion *__restrict__ exclusionList, const bool doSlow, const float one_scale14, const int vdwCoefTableWidth, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, double &energyVdw, T *__restrict__ forceNbond, double &energyNbond, T *__restrict__ forceSlow, double &energySlow, ComputeBondedCUDAKernel::BondedVirial< double > &virialNbond, ComputeBondedCUDAKernel::BondedVirial< double > &virialSlow)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void exclusionForce (const int index, const CudaExclusion *__restrict__ exclusionList, const float r2_delta, const int r2_delta_expc, cudaTextureObject_t r2_table_tex, cudaTextureObject_t exclusionTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, T *__restrict__ forceSlow, double &energySlow, ComputeBondedCUDAKernel::BondedVirial< double > &virialSlow)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void angleForce (const int index, const CudaAngle *__restrict__ angleList, const CudaAngleValue *__restrict__ angleValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
template<bool doEnergy>
__forceinline__ __device__ void diheComp (const CudaDihedralValue *dihedralValues, int ic, const float sin_phi, const float cos_phi, const float scale, float &df, double &e)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void diheForce (const int index, const CudaDihedral *__restrict__ diheList, const CudaDihedralValue *__restrict__ dihedralValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
__device__ __forceinline__ float3 cross (const float3 v1, const float3 v2)
 
__device__ __forceinline__ float dot (const float3 v1, const float3 v2)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void crosstermForce (const int index, const CudaCrossterm *__restrict__ crosstermList, const CudaCrosstermValue *__restrict__ crosstermValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
template<typename T , bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel (const int start, const int numBonds, const CudaBond *__restrict__ bonds, const CudaBondValue *__restrict__ bondValues, const int numAngles, const CudaAngle *__restrict__ angles, const CudaAngleValue *__restrict__ angleValues, const int numDihedrals, const CudaDihedral *__restrict__ dihedrals, const CudaDihedralValue *__restrict__ dihedralValues, const int numImpropers, const CudaDihedral *__restrict__ impropers, const CudaDihedralValue *__restrict__ improperValues, const int numExclusions, const CudaExclusion *__restrict__ exclusions, const int numCrossterms, const CudaCrossterm *__restrict__ crossterms, const CudaCrosstermValue *__restrict__ crosstermValues, const float cutoff2, const float r2_delta, const int r2_delta_expc, const float *__restrict__ r2_table, const float4 *__restrict__ exclusionTable, cudaTextureObject_t r2_table_tex, cudaTextureObject_t exclusionTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, T *__restrict__ forceSlow, double *__restrict__ energies_virials)
 
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel (const int start, const int numModifiedExclusions, const CudaExclusion *__restrict__ modifiedExclusions, const bool doSlow, const float one_scale14, const float cutoff2, const int vdwCoefTableWidth, const float2 *__restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ forceNbond, T *__restrict__ forceSlow, double *__restrict__ energies_virials)
 

Variables

__thread DeviceCUDA * deviceCUDA
 

Macro Definition Documentation

#define BONDEDFORCESKERNEL_NUM_WARP   4
#define CALL(DOENERGY, DOVIRIAL)
Value:
bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> \
<<< nblock_use, nthread, shmem_size, stream >>> \
(start, numBonds, bonds, bondValues, \
numAngles, angles, angleValues, \
numDihedrals, dihedrals, dihedralValues, \
numImpropers, impropers, improperValues, \
numExclusions*doSlow, exclusions, \
numCrossterms, crossterms, crosstermValues, \
cutoff2, r2_delta, r2_delta_expc, \
cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
xyzq, forceStride, lata, latb, latc, \
forces, &forces[startSlow], energies_virials);

Referenced by ComputeBondedCUDAKernel::bondedForce(), cuda_nonbonded_forces(), CudaComputeGBISKernel::GBISphase2(), and CudaComputeNonbondedKernel::nonbondedForce().
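
This macro exists so that a single launch statement can be stamped out for each <DOENERGY, DOVIRIAL> template instantiation of bondedForcesKernel. A minimal sketch of the host-side dispatch, assuming boolean flags doEnergy and doVirial are available at the launch site (a hedged reconstruction, not the verbatim NAMD source):

  if (doEnergy) {
    if (doVirial) CALL(true, true); else CALL(true, false);
  } else {
    if (doVirial) CALL(false, true); else CALL(false, false);
  }
  #undef CALL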

#define CALL(DOENERGY, DOVIRIAL, DOELECT)
Value:
modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
<<< nblock_use, nthread, shmem_size, stream >>> \
(start, numModifiedExclusions, modifiedExclusions, \
doSlow, one_scale14, cutoff2, \
cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
xyzq, forceStride, lata, latb, latc, \
&forces[startNbond], &forces[startSlow], energies_virials);
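
With the extra DOELECT flag the same pattern covers eight instantiations of modifiedExclusionForcesKernel; a plausible dispatch sketch under the same assumptions (not the verbatim source):

  if (doEnergy) {
    if (doVirial) { if (doElect) CALL(true, true, true);  else CALL(true, true, false); }
    else          { if (doElect) CALL(true, false, true); else CALL(true, false, false); }
  } else {
    if (doVirial) { if (doElect) CALL(false, true, true);  else CALL(false, true, false); }
    else          { if (doElect) CALL(false, false, true); else CALL(false, false, false); }
  }
  #undef CALL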

Function Documentation

template<typename T , bool doEnergy, bool doVirial>
__device__ void angleForce ( const int  index,
const CudaAngle *__restrict__  angleList,
const CudaAngleValue *__restrict__  angleValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 440 of file ComputeBondedCUDAKernel.cu.

References force_to_float, CudaAngle::i, CudaAngle::ioffsetXYZ, CudaAngle::itype, CudaAngle::j, CudaAngle::k, CudaAngleValue::k, CudaAngleValue::k_ub, CudaAngle::koffsetXYZ, CudaAngleValue::normal, CudaAngleValue::r_ub, CudaAngle::scale, and CudaAngleValue::theta0.

452  {
453 
454  CudaAngle al = angleList[index];
455 
456  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
457  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
458  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
459 
460  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
461  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
462  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
463 
464  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
465  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
466  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
467 
468  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
469  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
470  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
471 
472  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
473  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
474 
475  float xijr = xij*rij_inv;
476  float yijr = yij*rij_inv;
477  float zijr = zij*rij_inv;
478  float xkjr = xkj*rkj_inv;
479  float ykjr = ykj*rkj_inv;
480  float zkjr = zkj*rkj_inv;
481  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
482 
483  CudaAngleValue angleValue = angleValues[al.itype];
484  angleValue.k *= al.scale;
485 
486  float diff;
487  if (angleValue.normal == 1) {
488  // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
489  cos_theta = min(0.999f, max(-0.999f, cos_theta));
490  float theta = acosf(cos_theta);
491  diff = theta - angleValue.theta0;
492  } else {
493  diff = cos_theta - angleValue.theta0;
494  }
495 
496  if (doEnergy) {
497  energy += (double)(angleValue.k * diff * diff);
498  }
499  if (angleValue.normal == 1) {
500  float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
501  if (inv_sin_theta > 1.0e6) {
502  diff = (diff < 0.0f) ? 1.0f : -1.0f;
503  } else {
504  diff *= -inv_sin_theta;
505  }
506  }
507  diff *= -2.0f*angleValue.k;
508 
509  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
510  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
511  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
512  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
513  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
514  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
515 
516  T T_dtxi, T_dtyi, T_dtzi;
517  T T_dtxj, T_dtyj, T_dtzj;
518  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
519  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
520  T T_dtxk = -T_dtxi - T_dtxj;
521  T T_dtyk = -T_dtyi - T_dtyj;
522  T T_dtzk = -T_dtzi - T_dtzj;
523  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force);
524 
525  if (angleValue.k_ub) {
526  float xik = xij - xkj;
527  float yik = yij - ykj;
528  float zik = zij - zkj;
529  float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
530  float rik = 1.0f/rik_inv;
531  float diff_ub = rik - angleValue.r_ub;
532  if (doEnergy) {
533  energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
534  }
535  diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
536  T T_dxik, T_dyik, T_dzik;
537  calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
538  T_dtxi += T_dxik;
539  T_dtyi += T_dyik;
540  T_dtzi += T_dzik;
541  T_dtxj -= T_dxik;
542  T_dtyj -= T_dyik;
543  T_dtzj -= T_dzik;
544  }
545 
546  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force);
547  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force);
548 
549  // Store virial
550  if (doVirial) {
551 #ifdef WRITE_FULL_VIRIALS
552  float fxi = ((float)T_dtxi)*force_to_float;
553  float fyi = ((float)T_dtyi)*force_to_float;
554  float fzi = ((float)T_dtzi)*force_to_float;
555  float fxk = ((float)T_dtxj)*force_to_float;
556  float fyk = ((float)T_dtyj)*force_to_float;
557  float fzk = ((float)T_dtzj)*force_to_float;
558  virial.xx = (fxi*xij) + (fxk*xkj);
559  virial.xy = (fxi*yij) + (fxk*ykj);
560  virial.xz = (fxi*zij) + (fxk*zkj);
561  virial.yx = (fyi*xij) + (fyk*xkj);
562  virial.yy = (fyi*yij) + (fyk*ykj);
563  virial.yz = (fyi*zij) + (fyk*zkj);
564  virial.zx = (fzi*xij) + (fzk*xkj);
565  virial.zy = (fzi*yij) + (fzk*ykj);
566  virial.zz = (fzi*zij) + (fzk*zkj);
567 #endif
568  }
569 }
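For reference, the potential evaluated above is the CHARMM angle term with an optional Urey-Bradley 1-3 correction:

  V = k (theta - theta0)^2 + k_ub (r_ik - r_ub)^2

When angleValue.normal == 1 the term is harmonic in theta itself, hence the acosf() and the 1/sin(theta) chain-rule factor (clamped to avoid the singularity near theta = 0 or pi); otherwise it is harmonic directly in cos(theta). The prefactor -dV/dtheta = -2 k (theta - theta0) is the `diff *= -2.0f*angleValue.k` line, and dtxi..dtzj are the Cartesian derivatives of theta with respect to atoms i and k; the force on j is minus their sum.
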
template<typename T , bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel ( const int  start,
const int  numBonds,
const CudaBond *__restrict__  bonds,
const CudaBondValue *__restrict__  bondValues,
const int  numAngles,
const CudaAngle *__restrict__  angles,
const CudaAngleValue *__restrict__  angleValues,
const int  numDihedrals,
const CudaDihedral *__restrict__  dihedrals,
const CudaDihedralValue *__restrict__  dihedralValues,
const int  numImpropers,
const CudaDihedral *__restrict__  impropers,
const CudaDihedralValue *__restrict__  improperValues,
const int  numExclusions,
const CudaExclusion *__restrict__  exclusions,
const int  numCrossterms,
const CudaCrossterm *__restrict__  crossterms,
const CudaCrosstermValue *__restrict__  crosstermValues,
const float  cutoff2,
const float  r2_delta,
const int  r2_delta_expc,
const float *__restrict__  r2_table,
const float4 *__restrict__  exclusionTable,
cudaTextureObject_t  r2_table_tex,
cudaTextureObject_t  exclusionTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
T *__restrict__  forceSlow,
double *__restrict__  energies_virials 
)

Definition at line 1037 of file ComputeBondedCUDAKernel.cu.

References ComputeBondedCUDAKernel::amdDiheVirialIndex_XX, BLOCK_SYNC, BONDEDFORCESKERNEL_NUM_WARP, cutoff2, double_to_virial, ComputeBondedCUDAKernel::energyIndex_ANGLE, ComputeBondedCUDAKernel::energyIndex_BOND, ComputeBondedCUDAKernel::energyIndex_CROSSTERM, ComputeBondedCUDAKernel::energyIndex_DIHEDRAL, ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW, ComputeBondedCUDAKernel::energyIndex_IMPROPER, exclusions, lata, latb, latc, llitoulli(), ComputeBondedCUDAKernel::normalVirialIndex_XX, ComputeBondedCUDAKernel::slowVirialIndex_XX, WARP_FULL_MASK, WARP_SHUFFLE_XOR, WARPSIZE, ComputeBondedCUDAKernel::BondedVirial< T >::xx, ComputeBondedCUDAKernel::BondedVirial< T >::xy, xyzq, ComputeBondedCUDAKernel::BondedVirial< T >::xz, ComputeBondedCUDAKernel::BondedVirial< T >::yx, ComputeBondedCUDAKernel::BondedVirial< T >::yy, ComputeBondedCUDAKernel::BondedVirial< T >::yz, ComputeBondedCUDAKernel::BondedVirial< T >::zx, ComputeBondedCUDAKernel::BondedVirial< T >::zy, and ComputeBondedCUDAKernel::BondedVirial< T >::zz.

1076  {
1077 
1078  // Thread-block index
1079  int indexTB = start + blockIdx.x;
1080 
1081  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1082  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1083  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1084  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
1085  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
1086  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
1087 
1088  // Each thread computes a single bonded interaction.
1089  // Each thread block computes a single bonded type.
1090  double energy;
1091  int energyIndex;
1092 
1093  if (doEnergy) {
1094  energy = 0.0;
1095  energyIndex = -1;
1096  }
1097 
1098 #ifdef WRITE_FULL_VIRIALS
1099  ComputeBondedCUDAKernel::BondedVirial<double> virial;
1100  int virialIndex;
1101  if (doVirial) {
1102  virial.xx = 0.0;
1103  virial.xy = 0.0;
1104  virial.xz = 0.0;
1105  virial.yx = 0.0;
1106  virial.yy = 0.0;
1107  virial.yz = 0.0;
1108  virial.zx = 0.0;
1109  virial.zy = 0.0;
1110  virial.zz = 0.0;
1111  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
1112  }
1113 #endif
1114 
1115  if (indexTB < numBondsTB) {
1116  int i = threadIdx.x + indexTB*blockDim.x;
1117  if (doEnergy) {
1118  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
1119  }
1120  if (i < numBonds) {
1121  bondForce<T, doEnergy, doVirial>
1122  (i, bonds, bondValues, xyzq,
1123  stride, lata, latb, latc,
1124  force, energy, virial);
1125  }
1126  goto reduce;
1127  }
1128  indexTB -= numBondsTB;
1129 
1130  if (indexTB < numAnglesTB) {
1131  int i = threadIdx.x + indexTB*blockDim.x;
1132  if (doEnergy) {
1133  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
1134  }
1135  if (i < numAngles) {
1136  angleForce<T, doEnergy, doVirial>
1137  (i, angles, angleValues, xyzq, stride,
1138  lata, latb, latc,
1139  force, energy, virial);
1140  }
1141  goto reduce;
1142  }
1143  indexTB -= numAnglesTB;
1144 
1145  if (indexTB < numDihedralsTB) {
1146  int i = threadIdx.x + indexTB*blockDim.x;
1147  if (doEnergy) {
1148  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
1149  }
1150  if (doVirial) {
1151  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1152  }
1153  if (i < numDihedrals) {
1154  diheForce<T, doEnergy, doVirial>
1155  (i, dihedrals, dihedralValues, xyzq, stride,
1156  lata, latb, latc,
1157  force, energy, virial);
1158  }
1159  goto reduce;
1160  }
1161  indexTB -= numDihedralsTB;
1162 
1163  if (indexTB < numImpropersTB) {
1164  int i = threadIdx.x + indexTB*blockDim.x;
1165  if (doEnergy) {
1166  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
1167  }
1168  if (i < numImpropers) {
1169  diheForce<T, doEnergy, doVirial>
1170  (i, impropers, improperValues, xyzq, stride,
1171  lata, latb, latc,
1172  force, energy, virial);
1173  }
1174  goto reduce;
1175  }
1176  indexTB -= numImpropersTB;
1177 
1178  if (indexTB < numExclusionsTB) {
1179  int i = threadIdx.x + indexTB*blockDim.x;
1180  if (doEnergy) {
1181  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
1182  }
1183  if (doVirial) {
1184  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
1185  }
1186  if (i < numExclusions) {
1187  exclusionForce<T, doEnergy, doVirial>
1188  (i, exclusions, r2_delta, r2_delta_expc,
1189 #if __CUDA_ARCH__ >= 350
1190  r2_table, exclusionTable,
1191 #else
1192  r2_table_tex, exclusionTableTex,
1193 #endif
1194  xyzq, stride, lata, latb, latc, cutoff2,
1195  forceSlow, energy, virial);
1196  }
1197  goto reduce;
1198  }
1199  indexTB -= numExclusionsTB;
1200 
1201  if (indexTB < numCrosstermsTB) {
1202  int i = threadIdx.x + indexTB*blockDim.x;
1203  if (doEnergy) {
1204  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
1205  }
1206  if (doVirial) {
1207  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1208  }
1209  if (i < numCrossterms) {
1210  crosstermForce<T, doEnergy, doVirial>
1211  (i, crossterms, crosstermValues,
1212  xyzq, stride, lata, latb, latc,
1213  force, energy, virial);
1214  }
1215  goto reduce;
1216  }
1217  // indexTB -= numCrosstermsTB;
1218 
1219  reduce:
1220 
1221  // Write energies to global memory
1222  if (doEnergy) {
1223  // energyIndex is constant within thread-block
1224  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
1225 #pragma unroll
1226  for (int i=16;i >= 1;i/=2) {
1227  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
1228  }
1229  int laneID = (threadIdx.x & (WARPSIZE - 1));
1230  int warpID = threadIdx.x / WARPSIZE;
1231  if (laneID == 0) {
1232  shEnergy[warpID] = energy;
1233  }
1234  BLOCK_SYNC;
1235  if (warpID == 0) {
1236  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
1237 #pragma unroll
1238  for (int i=16;i >= 1;i/=2) {
1239  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
1240  }
1241  if (laneID == 0) {
1242  atomicAdd(&energies_virials[energyIndex], energy);
1243  }
1244  }
1245  }
1246 
1247  // Write virials to global memory
1248 #ifdef WRITE_FULL_VIRIALS
1249  if (doVirial) {
1250 #pragma unroll
1251  for (int i=16;i >= 1;i/=2) {
1252  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
1253  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
1254  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
1255  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
1256  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
1257  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
1258  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
1259  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
1260  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
1261  }
1262  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
1263  int laneID = (threadIdx.x & (WARPSIZE - 1));
1264  int warpID = threadIdx.x / WARPSIZE;
1265  if (laneID == 0) {
1266  shVirial[warpID] = virial;
1267  }
1268  BLOCK_SYNC;
1269 
1270  if (warpID == 0) {
1271  virial.xx = 0.0;
1272  virial.xy = 0.0;
1273  virial.xz = 0.0;
1274  virial.yx = 0.0;
1275  virial.yy = 0.0;
1276  virial.yz = 0.0;
1277  virial.zx = 0.0;
1278  virial.zy = 0.0;
1279  virial.zz = 0.0;
1280  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
1281 #pragma unroll
1282  for (int i=16;i >= 1;i/=2) {
1283  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
1284  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
1285  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
1286  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
1287  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
1288  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
1289  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
1290  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
1291  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
1292  }
1293 
1294  if (laneID == 0) {
1295 #ifdef USE_FP_VIRIAL
1296  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
1297  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
1298  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
1299  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
1300  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
1301  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
1302  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
1303  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
1304  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
1305 #else
1306  atomicAdd(&energies_virials[virialIndex + 0], virial.xx);
1307  atomicAdd(&energies_virials[virialIndex + 1], virial.xy);
1308  atomicAdd(&energies_virials[virialIndex + 2], virial.xz);
1309  atomicAdd(&energies_virials[virialIndex + 3], virial.yx);
1310  atomicAdd(&energies_virials[virialIndex + 4], virial.yy);
1311  atomicAdd(&energies_virials[virialIndex + 5], virial.yz);
1312  atomicAdd(&energies_virials[virialIndex + 6], virial.zx);
1313  atomicAdd(&energies_virials[virialIndex + 7], virial.zy);
1314  atomicAdd(&energies_virials[virialIndex + 8], virial.zz);
1315 #endif
1316  }
1317  }
1318  }
1319 #endif
1320 
1321 }
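The energy reduction above is a standard two-level block reduction: a butterfly warp reduction with shuffles, one shared-memory slot per warp, then a final reduction in warp 0. A self-contained sketch of the same pattern using plain __shfl_xor_sync (WARP_SHUFFLE_XOR is NAMD's portability wrapper around it; four warps of 32 threads assumed, matching BONDEDFORCESKERNEL_NUM_WARP):

  __device__ double blockReduceSum(double v) {
    // Butterfly reduction: after five XOR steps every lane holds the warp sum.
    for (int offset = 16; offset >= 1; offset /= 2)
      v += __shfl_xor_sync(0xffffffff, v, offset, 32);
    __shared__ double sh[4];             // one slot per warp
    const int lane = threadIdx.x & 31;
    const int warp = threadIdx.x >> 5;
    if (lane == 0) sh[warp] = v;
    __syncthreads();
    if (warp == 0) {
      v = (lane < 4) ? sh[lane] : 0.0;
      for (int offset = 16; offset >= 1; offset /= 2)
        v += __shfl_xor_sync(0xffffffff, v, offset, 32);
    }
    return v;                            // block sum, valid in warp 0, lane 0
  }

Lane 0 of warp 0 then issues the single atomicAdd into energies_virials, exactly as the kernel does.
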
template<typename T , bool doEnergy, bool doVirial>
__device__ void bondForce ( const int  index,
const CudaBond *__restrict__  bondList,
const CudaBondValue *__restrict__  bondValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 145 of file ComputeBondedCUDAKernel.cu.

References CudaBond::i, CudaBond::ioffsetXYZ, CudaBond::itype, CudaBond::j, CudaBondValue::k, CudaBond::scale, CudaBondValue::x0, and CudaBondValue::x1.

158  {
159 
160  CudaBond bl = bondList[index];
161  CudaBondValue bondValue = bondValues[bl.itype];
162  if (bondValue.x0 == 0.0f) return;
163 
164  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
165  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
166  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
167 
168  float4 xyzqi = xyzq[bl.i];
169  float4 xyzqj = xyzq[bl.j];
170 
171  float xij = xyzqi.x + shx - xyzqj.x;
172  float yij = xyzqi.y + shy - xyzqj.y;
173  float zij = xyzqi.z + shz - xyzqj.z;
174 
175  float r = sqrtf(xij*xij + yij*yij + zij*zij);
176 
177  float db = r - bondValue.x0;
178  if (bondValue.x1) {
179  // in this case, the bond represents a harmonic wall potential
180  // where x0 is the lower wall and x1 is the upper
181  db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
182  }
183  float fij = db * bondValue.k * bl.scale;
184 
185  if (doEnergy) {
186  energy += (double)(fij*db);
187  }
188  fij *= -2.0f/r;
189 
190  T T_fxij, T_fyij, T_fzij;
191  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
192 
193  // Store forces
194  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force);
195  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force);
196 
197  // Store virial
198  if (doVirial) {
199 #ifdef WRITE_FULL_VIRIALS
200  float fxij = fij*xij;
201  float fyij = fij*yij;
202  float fzij = fij*zij;
203  virial.xx = (fxij*xij);
204  virial.xy = (fxij*yij);
205  virial.xz = (fxij*zij);
206  virial.yx = (fyij*xij);
207  virial.yy = (fyij*yij);
208  virial.yz = (fyij*zij);
209  virial.zx = (fzij*xij);
210  virial.zy = (fzij*yij);
211  virial.zz = (fzij*zij);
212 #endif
213  }
214 }
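The routine evaluates a harmonic bond V = k*scale*(r - x0)^2, or a flat-bottomed wall when x1 is nonzero (zero force between x0 and x1). Note the variable reuse: fij first holds k*scale*db so the energy is simply fij*db; it is then rescaled by -2/r so that fij*(xij, yij, zij) are the Cartesian force components on atom i. A scalar reference of the prefactor, a sketch for sanity-checking rather than NAMD code:

  // Force on atom i is this value times the displacement vector (xij, yij, zij).
  __device__ float bondForcePrefactor(float r, float x0, float k, float scale) {
    float db = r - x0;
    return -2.0f * k * scale * db / r;
  }
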
template<typename T >
__forceinline__ __device__ void calcComponentForces ( float  fij,
const float  dx,
const float  dy,
const float  dz,
T &  fxij,
T &  fyij,
T &  fzij 
)

Definition at line 57 of file ComputeBondedCUDAKernel.cu.

60  {
61 
62  fxij = (T)(fij*dx);
63  fyij = (T)(fij*dy);
64  fzij = (T)(fij*dz);
65 
66 }
template<typename T >
__forceinline__ __device__ void calcComponentForces ( float  fij1,
const float  dx1,
const float  dy1,
const float  dz1,
float  fij2,
const float  dx2,
const float  dy2,
const float  dz2,
T &  fxij,
T &  fyij,
T &  fzij 
)

Definition at line 86 of file ComputeBondedCUDAKernel.cu.

91  {
92 
93  fxij = (T)(fij1*dx1 + fij2*dx2);
94  fyij = (T)(fij1*dy1 + fij2*dy2);
95  fzij = (T)(fij1*dz1 + fij2*dz2);
96 }
template<>
__forceinline__ __device__ void calcComponentForces< long long int > ( float  fij,
const float  dx,
const float  dy,
const float  dz,
long long int &  fxij,
long long int &  fyij,
long long int &  fzij 
)

Definition at line 70 of file ComputeBondedCUDAKernel.cu.

References float_to_force, and lliroundf().

75  {
76 
77  fij *= float_to_force;
78  fxij = lliroundf(fij*dx);
79  fyij = lliroundf(fij*dy);
80  fzij = lliroundf(fij*dz);
81 
82 }
template<>
__forceinline__ __device__ void calcComponentForces< long long int > ( float  fij1,
const float  dx1,
const float  dy1,
const float  dz1,
float  fij2,
const float  dx2,
const float  dy2,
const float  dz2,
long long int &  fxij,
long long int &  fyij,
long long int &  fzij 
)

Definition at line 100 of file ComputeBondedCUDAKernel.cu.

References float_to_force, and lliroundf().

111  {
112 
113  fij1 *= float_to_force;
114  fij2 *= float_to_force;
115  fxij = lliroundf(fij1*dx1 + fij2*dx2);
116  fyij = lliroundf(fij1*dy1 + fij2*dy2);
117  fzij = lliroundf(fij1*dz1 + fij2*dz2);
118 }
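These specializations produce fixed-point forces: the float components are scaled by float_to_force and rounded to 64-bit integers, so concurrent accumulation is exact and bitwise deterministic regardless of atomic ordering. storeForces<long long int> (listed above but not expanded on this page) completes the pattern; a plausible reconstruction, hedged, based on the note elsewhere on this page that llitoulli() is referenced by storeForces<long long int>():

  template <>
  __forceinline__ __device__ void storeForces<long long int>(
      const long long int fx, const long long int fy, const long long int fz,
      const int ind, const int stride, long long int* force) {
    // atomicAdd exists only for unsigned 64-bit, but two's-complement
    // wraparound makes the signed sums come out exactly right.
    atomicAdd((unsigned long long int *)&force[ind           ], llitoulli(fx));
    atomicAdd((unsigned long long int *)&force[ind + stride  ], llitoulli(fy));
    atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
  }
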
template<typename T >
__forceinline__ __device__ void convertForces ( const float  x,
const float  y,
const float  z,
T &  Tx,
T &  Ty,
T &  Tz 
)

Definition at line 37 of file ComputeBondedCUDAKernel.cu.

38  {
39 
40  Tx = (T)(x);
41  Ty = (T)(y);
42  Tz = (T)(z);
43 }
template<>
__forceinline__ __device__ void convertForces< long long int > ( const float  x,
const float  y,
const float  z,
long long int &  Tx,
long long int &  Ty,
long long int &  Tz 
)

Definition at line 47 of file ComputeBondedCUDAKernel.cu.

References float_to_force, lliroundf(), x, y, and z.

48  {
49 
50  Tx = lliroundf(x*float_to_force);
51  Ty = lliroundf(y*float_to_force);
52  Tz = lliroundf(z*float_to_force);
53 }
__device__ __forceinline__ float3 cross ( const float3  v1,
const float3  v2 
)
template<typename T , bool doEnergy, bool doVirial>
__device__ void crosstermForce ( const int  index,
const CudaCrossterm *__restrict__  crosstermList,
const CudaCrosstermValue *__restrict__  crosstermValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 775 of file ComputeBondedCUDAKernel.cu.

References A, B, CudaCrosstermValue::c, cross(), CudaCrosstermValue::dim, dot(), CudaCrossterm::i1, CudaCrossterm::i2, CudaCrossterm::i3, CudaCrossterm::i4, CudaCrossterm::i5, CudaCrossterm::i6, CudaCrossterm::i7, CudaCrossterm::i8, CudaCrossterm::itype, M_PI, CudaCrossterm::offset12XYZ, CudaCrossterm::offset23XYZ, CudaCrossterm::offset34XYZ, CudaCrossterm::offset56XYZ, CudaCrossterm::offset67XYZ, CudaCrossterm::offset78XYZ, and CudaCrossterm::scale.

788  {
789 
790  CudaCrossterm cl = crosstermList[index];
791 
792  // ----------------------------------------------------------------------------
793  // Angle between 1 - 2 - 3 - 4
794  //
795  // 1 - 2
796  float3 sh12 = make_float3(
797  cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
798  cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
799  cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
800 
801  float4 xyzq1 = xyzq[cl.i1];
802  float4 xyzq2 = xyzq[cl.i2];
803 
804  float3 r12 = make_float3(
805  xyzq1.x + sh12.x - xyzq2.x,
806  xyzq1.y + sh12.y - xyzq2.y,
807  xyzq1.z + sh12.z - xyzq2.z);
808 
809  // 2 - 3
810  float3 sh23 = make_float3(
811  cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
812  cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
813  cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
814 
815  float4 xyzq3 = xyzq[cl.i3];
816 
817  float3 r23 = make_float3(
818  xyzq2.x + sh23.x - xyzq3.x,
819  xyzq2.y + sh23.y - xyzq3.y,
820  xyzq2.z + sh23.z - xyzq3.z);
821 
822  // 3 - 4
823  float3 sh34 = make_float3(
824  cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
825  cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
826  cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
827 
828  float4 xyzq4 = xyzq[cl.i4];
829 
830  float3 r34 = make_float3(
831  xyzq3.x + sh34.x - xyzq4.x,
832  xyzq3.y + sh34.y - xyzq4.y,
833  xyzq3.z + sh34.z - xyzq4.z);
834 
835  // Calculate the cross products
836  float3 A = cross(r12, r23);
837  float3 B = cross(r23, r34);
838  float3 C = cross(r23, A);
839 
840  // Calculate the inverse distances
841  float inv_rA = rsqrtf(dot(A, A));
842  float inv_rB = rsqrtf(dot(B, B));
843  float inv_rC = rsqrtf(dot(C, C));
844 
845  // Calculate the sin and cos
846  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
847  float sin_phi = dot(C, B)*(inv_rC*inv_rB);
848 
849  float phi = -atan2f(sin_phi,cos_phi);
850 
851  // ----------------------------------------------------------------------------
852  // Angle between 5 - 6 - 7 - 8
853  //
854 
855  // 5 - 6
856  float3 sh56 = make_float3(
857  cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
858  cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
859  cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
860 
861  float4 xyzq5 = xyzq[cl.i5];
862  float4 xyzq6 = xyzq[cl.i6];
863 
864  float3 r56 = make_float3(
865  xyzq5.x + sh56.x - xyzq6.x,
866  xyzq5.y + sh56.y - xyzq6.y,
867  xyzq5.z + sh56.z - xyzq6.z);
868 
869  // 6 - 7
870  float3 sh67 = make_float3(
871  cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
872  cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
873  cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
874 
875  float4 xyzq7 = xyzq[cl.i7];
876 
877  float3 r67 = make_float3(
878  xyzq6.x + sh67.x - xyzq7.x,
879  xyzq6.y + sh67.y - xyzq7.y,
880  xyzq6.z + sh67.z - xyzq7.z);
881 
882  // 7 - 8
883  float3 sh78 = make_float3(
884  cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
885  cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
886  cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
887 
888  float4 xyzq8 = xyzq[cl.i8];
889 
890  float3 r78 = make_float3(
891  xyzq7.x + sh78.x - xyzq8.x,
892  xyzq7.y + sh78.y - xyzq8.y,
893  xyzq7.z + sh78.z - xyzq8.z);
894 
895  // Calculate the cross products
896  float3 D = cross(r56, r67);
897  float3 E = cross(r67, r78);
898  float3 F = cross(r67, D);
899 
900  // Calculate the inverse distances
901  float inv_rD = rsqrtf(dot(D, D));
902  float inv_rE = rsqrtf(dot(E, E));
903  float inv_rF = rsqrtf(dot(F, F));
904 
905  // Calculate the sin and cos
906  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
907  float sin_psi = dot(F, E)*(inv_rF*inv_rE);
908 
909  float psi = -atan2f(sin_psi,cos_psi);
910 
911  // ----------------------------------------------------------------------------
912  // Calculate the energy
913 
914  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
915 
916  // Shift angles
917  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
918  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
919 
920  // distance measured in grid points between angle and smallest value
921  float phi_h = phi * inv_h;
922  float psi_h = psi * inv_h;
923 
924  // find smallest numbered grid point in stencil
925  int iphi = (int)floor(phi_h);
926  int ipsi = (int)floor(psi_h);
927 
928  const CudaCrosstermValue& cp = crosstermValues[cl.itype];
929 
930  // Load coefficients
931  float4 c[4];
932  c[0] = cp.c[iphi][ipsi][0];
933  c[1] = cp.c[iphi][ipsi][1];
934  c[2] = cp.c[iphi][ipsi][2];
935  c[3] = cp.c[iphi][ipsi][3];
936 
937  float dphi = phi_h - (float)iphi;
938  float dpsi = psi_h - (float)ipsi;
939 
940  if (doEnergy) {
941  float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
942  energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
943  energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
944  energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
945  energy += energyf*cl.scale;
946  }
947 
948  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
949  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
950  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
951  dEdphi *= cl.scale*inv_h;
952 
953  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
954  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
955  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
956  dEdpsi *= cl.scale*inv_h;
957 
958  // float normCross1 = dot(A, A);
959  float square_r23 = dot(r23, r23);
960  float norm_r23 = sqrtf(square_r23);
961  float inv_square_r23 = 1.0f/square_r23;
962  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
963  float ff2 = -dot(r12, r23)*inv_square_r23;
964  float ff3 = -dot(r34, r23)*inv_square_r23;
965  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
966 
967  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
968  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
969  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
970  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
971  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
972 
973  T T_f1x, T_f1y, T_f1z;
974  T T_f2x, T_f2y, T_f2z;
975  T T_f3x, T_f3y, T_f3z;
976  T T_f4x, T_f4y, T_f4z;
977  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
978  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
979  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
980  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
981  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force);
982  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force);
983  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force);
984  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force);
985 
986  float square_r67 = dot(r67, r67);
987  float norm_r67 = sqrtf(square_r67);
988  float inv_square_r67 = 1.0f/(square_r67);
989  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
990  ff2 = -dot(r56, r67)*inv_square_r67;
991  ff3 = -dot(r78, r67)*inv_square_r67;
992  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
993 
994  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
995  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
996  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
997  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
998  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
999 
1000  T T_f5x, T_f5y, T_f5z;
1001  T T_f6x, T_f6y, T_f6z;
1002  T T_f7x, T_f7y, T_f7z;
1003  T T_f8x, T_f8y, T_f8z;
1004  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1005  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1006  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1007  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1008  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force);
1009  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force);
1010  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force);
1011  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force);
1012 
1013  // Store virial
1014  if (doVirial) {
1015 #ifdef WRITE_FULL_VIRIALS
1016  float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1017  float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1018  virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1019  virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1020  virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1021  virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1022  virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1023  virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1024  virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1025  virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1026  virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1027  }
1028 #endif
1029 
1030 }
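The coefficients c[0..3] fetched above are one 4x4 bicubic patch of the CMAP grid: each float4 holds the dphi-polynomial coefficients for one power of dpsi. Energy and both partial derivatives are evaluated with nested Horner schemes. A compact reference form of the energy polynomial using the same layout (a sketch, not the kernel code):

  // E(dphi, dpsi) = sum_{i,j=0..3} c[j][i] * dphi^i * dpsi^j
  __device__ float cmapEnergy(const float4 c[4], float dphi, float dpsi) {
    float e = 0.0f;
    for (int j = 3; j >= 0; j--) {
      float row = c[j].x + dphi*(c[j].y + dphi*(c[j].z + dphi*c[j].w));
      e = e*dpsi + row;  // Horner in dpsi
    }
    return e;
  }

The factors ff1..ff4 then distribute dEdphi (and dEdpsi for the second dihedral) onto the four atoms through the same A, B cross-product geometry used in diheForce().
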
template<bool doEnergy>
__forceinline__ __device__ void diheComp ( const CudaDihedralValue dihedralValues,
int  ic,
const float  sin_phi,
const float  cos_phi,
const float  scale,
float &  df,
double &  e 
)

Definition at line 578 of file ComputeBondedCUDAKernel.cu.

References CudaDihedralValue::delta, CudaDihedralValue::k, M_PI, CudaDihedralValue::n, and x.

579  {
580 
581  df = 0.0f;
582  if (doEnergy) e = 0.0;
583 
584  float phi = atan2f(sin_phi, cos_phi);
585 
586  bool lrep = true;
587  while (lrep) {
588  CudaDihedralValue dihedralValue = dihedralValues[ic];
589  dihedralValue.k *= scale;
590 
591  // The last dihedral term has the low bit of n set to 0
592  lrep = (dihedralValue.n & 1);
593  dihedralValue.n >>= 1;
594  if (dihedralValue.n) {
595  float nf = dihedralValue.n;
596  float x = nf*phi - dihedralValue.delta;
597  if (doEnergy) {
598  float sin_x, cos_x;
599  sincosf(x, &sin_x, &cos_x);
600  e += (double)(dihedralValue.k*(1.0f + cos_x));
601  df += (double)(nf*dihedralValue.k*sin_x);
602  } else {
603  float sin_x = sinf(x);
604  df += (double)(nf*dihedralValue.k*sin_x);
605  }
606  } else {
607  float diff = phi - dihedralValue.delta;
608  if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
609  if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
610  if (doEnergy) {
611  e += (double)(dihedralValue.k*diff*diff);
612  }
613  df -= 2.0f*dihedralValue.k*diff;
614  }
615  ic++;
616  }
617 
618 }
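The low bit of CudaDihedralValue::n is a list-continuation flag: the terms of a multi-term dihedral are stored consecutively in dihedralValues, and only the last term has the bit cleared. After the right shift, n is the true periodicity; n == 0 selects the harmonic (improper-style) branch. A hypothetical host-side packing helper, just to make the encoding concrete (not part of the NAMD API):

  int packDihedralN(int periodicity, bool moreTermsFollow) {
    return (periodicity << 1) | (moreTermsFollow ? 1 : 0);
  }
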
template<typename T , bool doEnergy, bool doVirial>
__device__ void diheForce ( const int  index,
const CudaDihedral *__restrict__  diheList,
const CudaDihedralValue *__restrict__  dihedralValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 622 of file ComputeBondedCUDAKernel.cu.

References force_to_float, CudaDihedral::i, CudaDihedral::ioffsetXYZ, CudaDihedral::itype, CudaDihedral::j, CudaDihedral::joffsetXYZ, CudaDihedral::k, CudaDihedral::l, CudaDihedral::loffsetXYZ, and CudaDihedral::scale.

634  {
635 
636  CudaDihedral dl = diheList[index];
637 
638  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
639  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
640  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
641 
642  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
643  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
644  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
645 
646  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
647  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
648  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
649 
650  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
651  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
652  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
653 
654  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
655  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
656  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
657 
658  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
659  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
660  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
661 
662  // A=F^G, B=H^G.
663  float ax = yij*zjk - zij*yjk;
664  float ay = zij*xjk - xij*zjk;
665  float az = xij*yjk - yij*xjk;
666  float bx = ylk*zjk - zlk*yjk;
667  float by = zlk*xjk - xlk*zjk;
668  float bz = xlk*yjk - ylk*xjk;
669 
670  float ra2 = ax*ax + ay*ay + az*az;
671  float rb2 = bx*bx + by*by + bz*bz;
672  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
673 
674  // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
675  // nlinear = nlinear + 1
676  // endif
677 
678  float rgr = 1.0f / rg;
679  float ra2r = 1.0f / ra2;
680  float rb2r = 1.0f / rb2;
681  float rabr = sqrtf(ra2r*rb2r);
682 
683  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
684  //
685  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
686  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
687  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
688  //
689  // Energy and derivative contributions.
690 
691  float df;
692  double e;
693  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
694 
695  if (doEnergy) energy += e;
696 
697  //
698  // Compute derivatives wrt Cartesian coordinates.
699  //
700  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
701  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
702 
703  float fg = xij*xjk + yij*yjk + zij*zjk;
704  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
705  ra2r *= df;
706  rb2r *= df;
707  float fga = fg*ra2r*rgr;
708  float hgb = hg*rb2r*rgr;
709  float gaa = ra2r*rg;
710  float gbb = rb2r*rg;
711  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
712 
713  // Store forces
714  T T_fix, T_fiy, T_fiz;
715  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
716  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force);
717 
718  T dgx, dgy, dgz;
719  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
720  T T_fjx = dgx - T_fix;
721  T T_fjy = dgy - T_fiy;
722  T T_fjz = dgz - T_fiz;
723  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force);
724 
725  T dhx, dhy, dhz;
726  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
727  T T_fkx = -dhx - dgx;
728  T T_fky = -dhy - dgy;
729  T T_fkz = -dhz - dgz;
730  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force);
731  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force);
732 
733  // Store virial
734  if (doVirial) {
735 #ifdef WRITE_FULL_VIRIALS
736  float fix = ((float)T_fix)*force_to_float;
737  float fiy = ((float)T_fiy)*force_to_float;
738  float fiz = ((float)T_fiz)*force_to_float;
739  float fjx = ((float)dgx)*force_to_float;
740  float fjy = ((float)dgy)*force_to_float;
741  float fjz = ((float)dgz)*force_to_float;
742  float flx = ((float)dhx)*force_to_float;
743  float fly = ((float)dhy)*force_to_float;
744  float flz = ((float)dhz)*force_to_float;
745  virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
746  virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
747  virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
748  virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
749  virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
750  virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
751  virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
752  virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
753  virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
754 #endif
755  }
756 
757 }
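A compact statement of the geometry used above, with F = r_ij, G = r_jk, H = r_lk:

  A = F x G,  B = H x G
  cos(phi) = (A . B) / (|A| |B|)
  sin(phi) = |G| (H . A) / (|A| |B|)

dE/dphi from diheComp() is converted to Cartesian forces through the intermediate factors gaa = (dE/dphi)|G|/|A|^2, gbb = (dE/dphi)|G|/|B|^2, fga = (dE/dphi)(F.G)/(|G||A|^2), and hgb = (dE/dphi)(H.G)/(|G||B|^2), exactly as the in-code comments state.
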
__device__ __forceinline__ float dot ( const float3  v1,
const float3  v2 
)

Definition at line 767 of file ComputeBondedCUDAKernel.cu.

Referenced by crosstermForce(), and ARestraint::GetDihe().

767  {
768  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
769 }
template<typename T , bool doEnergy, bool doVirial>
__device__ void exclusionForce ( const int  index,
const CudaExclusion *__restrict__  exclusionList,
const float  r2_delta,
const int  r2_delta_expc,
cudaTextureObject_t  r2_table_tex,
cudaTextureObject_t  exclusionTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
T *__restrict__  forceSlow,
double &  energySlow,
ComputeBondedCUDAKernel::BondedVirial< double > &  virialSlow 
)

Definition at line 350 of file ComputeBondedCUDAKernel.cu.

References __ldg, CudaExclusion::i, CudaExclusion::ioffsetXYZ, and CudaExclusion::j.

371  {
372 
373  CudaExclusion bl = exclusionList[index];
374 
375  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
376  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
377  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
378 
379  float4 xyzqi = xyzq[bl.i];
380  float4 xyzqj = xyzq[bl.j];
381 
382  float xij = xyzqi.x + shx - xyzqj.x;
383  float yij = xyzqi.y + shy - xyzqj.y;
384  float zij = xyzqi.z + shz - xyzqj.z;
385 
386  float r2 = xij*xij + yij*yij + zij*zij;
387  if (r2 < cutoff2) {
388  r2 += r2_delta;
389 
390  union { float f; int i; } r2i;
391  r2i.f = r2;
392  int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
393 
394 #if __CUDA_ARCH__ >= 350
395  float r2_table_val = __ldg(&r2_table[table_i]);
396 #else
397  float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
398 #endif
399  float diffa = r2 - r2_table_val;
400  float qq = xyzqi.w * xyzqj.w;
401 
402 #if __CUDA_ARCH__ >= 350
403  float4 slow = __ldg(&exclusionTable[table_i]);
404 #else
405  float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
406 #endif
407 
408  if (doEnergy) {
409  energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
410  }
411 
412  float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
413 
414  T T_fxij, T_fyij, T_fzij;
415  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
416  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
417  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);
418 
419  // Store virial
420  if (doVirial) {
421 #ifdef WRITE_FULL_VIRIALS
422  float fxij = fSlow*xij;
423  float fyij = fSlow*yij;
424  float fzij = fSlow*zij;
425  virialSlow.xx = (fxij*xij);
426  virialSlow.xy = (fxij*yij);
427  virialSlow.xz = (fxij*zij);
428  virialSlow.yx = (fyij*xij);
429  virialSlow.yy = (fyij*yij);
430  virialSlow.yz = (fyij*zij);
431  virialSlow.zx = (fzij*xij);
432  virialSlow.zy = (fzij*yij);
433  virialSlow.zz = (fzij*zij);
434 #endif
435  }
436  }
437 }
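The index computed at lines 390-392 is a float-exponent bit trick: shifting the IEEE-754 bits of r2 right by 17 keeps the 8 exponent bits plus the top 6 mantissa bits, so each binade of r2 maps to 64 consecutive slots and the table spacing grows geometrically with distance. A minimal host-side sketch of the same arithmetic; the rebasing constant expc below is a hypothetical stand-in for r2_delta_expc, chosen only so the smallest r2 of interest lands at a non-negative index:

#include <cstdio>
#include <cstring>

static int tableIndex(float r2, int r2_delta_expc) {
  int bits;
  std::memcpy(&bits, &r2, sizeof(bits));   // well-defined float-to-int pun
  return (bits >> 17) + r2_delta_expc;     // keeps 8 exponent + 6 mantissa bits
}

int main() {
  const int expc = -(127 - 10) * 64;       // hypothetical: rebases r2 = 2^-10 to index 0
  const float r2s[5] = {0.01f, 0.1f, 1.0f, 10.0f, 100.0f};
  for (int k = 0; k < 5; k++)
    std::printf("r2 = %6.2f -> table_i = %d\n", r2s[k], tableIndex(r2s[k], expc));
  return 0;
}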
__device__ long long int lliroundf ( float  f)
inline

Definition at line 21 of file ComputeBondedCUDAKernel.cu.

Referenced by calcComponentForces< long long int >(), and convertForces< long long int >().

22 {
23  long long int l;
24  asm("cvt.rni.s64.f32 %0, %1;" : "=l"(l) : "f"(f));
25  return l;
26 }
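The cvt.rni.s64.f32 instruction rounds to the nearest integer with ties to even, matching CUDA's llrintf(). A throwaway kernel (an assumption: compiled in this translation unit with nvcc) that checks the tie-breaking behavior:

__global__ void checkRounding() {
  const float xs[4] = {0.5f, 1.5f, -2.5f, 3.4f};
  for (int k = 0; k < 4; k++)
    printf("%5.1f -> lliroundf %lld, llrintf %lld\n",
           xs[k], lliroundf(xs[k]), llrintf(xs[k]));
}
// Both columns should read 0, 2, -2, 3: ties go to the even integer.

int main() { checkRounding<<<1, 1>>>(); return cudaDeviceSynchronize(); }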
__device__ unsigned long long int llitoulli ( long long int  l)
inline

Definition at line 28 of file ComputeBondedCUDAKernel.cu.

Referenced by bondedForcesKernel(), modifiedExclusionForcesKernel(), and storeForces< long long int >().

29 {
30  unsigned long long int u;
31  asm("mov.b64 %0, %1;" : "=l"(u) : "l"(l));
32  return u;
33 }
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce ( const int  index,
const CudaExclusion *__restrict__  exclusionList,
const bool  doSlow,
const float  one_scale14,
const int  vdwCoefTableWidth,
cudaTextureObject_t  vdwCoefTableTex,
cudaTextureObject_t  forceTableTex,
cudaTextureObject_t  energyTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
double &  energyVdw,
T *__restrict__  forceNbond,
double &  energyNbond,
T *__restrict__  forceSlow,
double &  energySlow,
ComputeBondedCUDAKernel::BondedVirial< double > &  virialNbond,
ComputeBondedCUDAKernel::BondedVirial< double > &  virialSlow 
)

Definition at line 220 of file ComputeBondedCUDAKernel.cu.

References __ldg, energyTableTex, forceTableTex, CudaExclusion::i, CudaExclusion::ioffsetXYZ, CudaExclusion::j, vdwCoefTableTex, vdwCoefTableWidth, CudaExclusion::vdwtypei, CudaExclusion::vdwtypej, float2::x, ComputeBondedCUDAKernel::BondedVirial< T >::xx, ComputeBondedCUDAKernel::BondedVirial< T >::xy, ComputeBondedCUDAKernel::BondedVirial< T >::xz, float2::y, ComputeBondedCUDAKernel::BondedVirial< T >::yx, ComputeBondedCUDAKernel::BondedVirial< T >::yy, ComputeBondedCUDAKernel::BondedVirial< T >::yz, ComputeBondedCUDAKernel::BondedVirial< T >::zx, ComputeBondedCUDAKernel::BondedVirial< T >::zy, and ComputeBondedCUDAKernel::BondedVirial< T >::zz.

246  {
247 
248  CudaExclusion bl = exclusionList[index];
249 
250  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
251  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
252  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
253 
254  float4 xyzqi = xyzq[bl.i];
255  float4 xyzqj = xyzq[bl.j];
256 
257  float xij = xyzqi.x + shx - xyzqj.x;
258  float yij = xyzqi.y + shy - xyzqj.y;
259  float zij = xyzqi.z + shz - xyzqj.z;
260 
261  float r2 = xij*xij + yij*yij + zij*zij;
262  if (r2 < cutoff2) {
263 
264  float rinv = rsqrtf(r2);
265 
266  float qq;
267  if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
268 
269  int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
270 #if __CUDA_ARCH__ >= 350
271  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
272 #else
273  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
274 #endif
275 
276  float4 fi = tex1D<float4>(forceTableTex, rinv);
277  float4 ei;
278  if (doEnergy) {
279  ei = tex1D<float4>(energyTableTex, rinv);
280  energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
281  if (doElect) {
282  energyNbond += qq * ei.x;
283  if (doSlow) energySlow += qq * ei.w;
284  }
285  }
286 
287  float fNbond;
288  if (doElect) {
289  fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
290  } else {
291  fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
292  }
293  T T_fxij, T_fyij, T_fzij;
294  calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
295  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond);
296  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond);
297 
298  float fSlow;
299  if (doSlow && doElect) {
300  fSlow = -qq * fi.w;
301  T T_fxij, T_fyij, T_fzij;
302  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
303  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
304  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);
305  }
306 
307  // Store virial
308  if (doVirial) {
309 #ifdef WRITE_FULL_VIRIALS
310  float fxij = fNbond*xij;
311  float fyij = fNbond*yij;
312  float fzij = fNbond*zij;
313  virialNbond.xx = (fxij*xij);
314  virialNbond.xy = (fxij*yij);
315  virialNbond.xz = (fxij*zij);
316  virialNbond.yx = (fyij*xij);
317  virialNbond.yy = (fyij*yij);
318  virialNbond.yz = (fyij*zij);
319  virialNbond.zx = (fzij*xij);
320  virialNbond.zy = (fzij*yij);
321  virialNbond.zz = (fzij*zij);
322 #endif
323  }
324 
325  // Store virial
326  if (doVirial && doSlow && doElect) {
327 #ifdef WRITE_FULL_VIRIALS
328  float fxij = fSlow*xij;
329  float fyij = fSlow*yij;
330  float fzij = fSlow*zij;
331  virialSlow.xx = (fxij*xij);
332  virialSlow.xy = (fxij*yij);
333  virialSlow.xz = (fxij*zij);
334  virialSlow.yx = (fyij*xij);
335  virialSlow.yy = (fyij*yij);
336  virialSlow.yz = (fyij*zij);
337  virialSlow.zx = (fzij*xij);
338  virialSlow.zy = (fzij*yij);
339  virialSlow.zz = (fzij*zij);
340 #endif
341  }
342 
343  }
344 }
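The Lennard-Jones parameters at line 269 come from a flattened square matrix: the float2 for the (vdwtypei, vdwtypej) pair supplies the two coefficients that scale the van der Waals channels of the force and energy tables (ljab.x with fi.z/ei.z, ljab.y with fi.y/ei.y). An illustrative sketch of that lookup; the helper name is hypothetical:

__host__ __device__ float2 lookupLJCoefs(const float2* vdwCoefTable,
                                         int vdwCoefTableWidth,
                                         int vdwtypei, int vdwtypej) {
  // Row-major fetch from a vdwCoefTableWidth x vdwCoefTableWidth matrix.
  // The pair coefficients are symmetric in the two types, so swapping
  // vdwtypei and vdwtypej fetches the same values.
  return vdwCoefTable[vdwtypei + vdwtypej * vdwCoefTableWidth];
}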
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel ( const int  start,
const int  numModifiedExclusions,
const CudaExclusion *__restrict__  modifiedExclusions,
const bool  doSlow,
const float  one_scale14,
const float  cutoff2,
const int  vdwCoefTableWidth,
const float2 *__restrict__  vdwCoefTable,
cudaTextureObject_t  vdwCoefTableTex,
cudaTextureObject_t  modifiedExclusionForceTableTex,
cudaTextureObject_t  modifiedExclusionEnergyTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  forceNbond,
T *__restrict__  forceSlow,
double *__restrict__  energies_virials 
)

Definition at line 1324 of file ComputeBondedCUDAKernel.cu.

References BLOCK_SYNC, BONDEDFORCESKERNEL_NUM_WARP, cutoff2, double_to_virial, ComputeBondedCUDAKernel::energyIndex_ELECT, ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW, ComputeBondedCUDAKernel::energyIndex_LJ, if(), lata, latb, latc, llitoulli(), ComputeBondedCUDAKernel::nbondVirialIndex_XX, ComputeBondedCUDAKernel::nbondVirialIndex_XY, ComputeBondedCUDAKernel::nbondVirialIndex_XZ, ComputeBondedCUDAKernel::nbondVirialIndex_YX, ComputeBondedCUDAKernel::nbondVirialIndex_YY, ComputeBondedCUDAKernel::nbondVirialIndex_YZ, ComputeBondedCUDAKernel::nbondVirialIndex_ZX, ComputeBondedCUDAKernel::nbondVirialIndex_ZY, ComputeBondedCUDAKernel::nbondVirialIndex_ZZ, ComputeBondedCUDAKernel::slowVirialIndex_XX, ComputeBondedCUDAKernel::slowVirialIndex_XY, ComputeBondedCUDAKernel::slowVirialIndex_XZ, ComputeBondedCUDAKernel::slowVirialIndex_YX, ComputeBondedCUDAKernel::slowVirialIndex_YY, ComputeBondedCUDAKernel::slowVirialIndex_YZ, ComputeBondedCUDAKernel::slowVirialIndex_ZX, ComputeBondedCUDAKernel::slowVirialIndex_ZY, ComputeBondedCUDAKernel::slowVirialIndex_ZZ, vdwCoefTable, vdwCoefTableTex, vdwCoefTableWidth, WARP_FULL_MASK, WARP_SHUFFLE_XOR, WARPSIZE, ComputeBondedCUDAKernel::BondedVirial< T >::xx, ComputeBondedCUDAKernel::BondedVirial< T >::xy, xyzq, ComputeBondedCUDAKernel::BondedVirial< T >::xz, ComputeBondedCUDAKernel::BondedVirial< T >::yx, ComputeBondedCUDAKernel::BondedVirial< T >::yy, ComputeBondedCUDAKernel::BondedVirial< T >::yz, ComputeBondedCUDAKernel::BondedVirial< T >::zx, ComputeBondedCUDAKernel::BondedVirial< T >::zy, and ComputeBondedCUDAKernel::BondedVirial< T >::zz.

1343  {
1344 
1345  // index
1346  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
1347 
1348  double energyVdw, energyNbond, energySlow;
1349  if (doEnergy) {
1350  energyVdw = 0.0;
1351  if (doElect) {
1352  energyNbond = 0.0;
1353  energySlow = 0.0;
1354  }
1355  }
1356 
1357 #ifdef WRITE_FULL_VIRIALS
1358  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
1359  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
1360  if (doVirial) {
1361  virialNbond.xx = 0.0;
1362  virialNbond.xy = 0.0;
1363  virialNbond.xz = 0.0;
1364  virialNbond.yx = 0.0;
1365  virialNbond.yy = 0.0;
1366  virialNbond.yz = 0.0;
1367  virialNbond.zx = 0.0;
1368  virialNbond.zy = 0.0;
1369  virialNbond.zz = 0.0;
1370  if (doElect) {
1371  virialSlow.xx = 0.0;
1372  virialSlow.xy = 0.0;
1373  virialSlow.xz = 0.0;
1374  virialSlow.yx = 0.0;
1375  virialSlow.yy = 0.0;
1376  virialSlow.yz = 0.0;
1377  virialSlow.zx = 0.0;
1378  virialSlow.zy = 0.0;
1379  virialSlow.zz = 0.0;
1380  }
1381  }
1382 #endif
1383 
1384  if (i < numModifiedExclusions)
1385  {
1386  modifiedExclusionForce<T, doEnergy, doVirial, doElect>
1387  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
1388 #if __CUDA_ARCH__ >= 350
1389  vdwCoefTable,
1390 #else
1391  vdwCoefTableTex,
1392 #endif
1393  modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
1394  xyzq, stride, lata, latb, latc, cutoff2,
1395  energyVdw, forceNbond, energyNbond,
1396  forceSlow, energySlow, virialNbond, virialSlow);
1397  }
1398 
1399  // Write energies to global memory
1400  if (doEnergy) {
1401  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
1402  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1403  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1404 #pragma unroll
1405  for (int i=16;i >= 1;i/=2) {
1406  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
1407  if (doElect) {
1408  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
1409  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
1410  }
1411  }
1412  int laneID = (threadIdx.x & (WARPSIZE - 1));
1413  int warpID = threadIdx.x / WARPSIZE;
1414  if (laneID == 0) {
1415  shEnergyVdw[warpID] = energyVdw;
1416  if (doElect) {
1417  shEnergyNbond[warpID] = energyNbond;
1418  shEnergySlow[warpID] = energySlow;
1419  }
1420  }
1421  BLOCK_SYNC;
1422  if (warpID == 0) {
1423  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
1424  if (doElect) {
1425  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
1426  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
1427  }
1428 #pragma unroll
1429  for (int i=16;i >= 1;i/=2) {
1430  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
1431  if (doElect) {
1432  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
1433  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
1434  }
1435  }
1436  if (laneID == 0) {
1437  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
1438  if (doElect) {
1439  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
1440  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
1441  }
1442  }
1443  }
1444  }
1445 
1446  // Write virials to global memory
1447 #ifdef WRITE_FULL_VIRIALS
1448  if (doVirial) {
1449 #pragma unroll
1450  for (int i=16;i >= 1;i/=2) {
1451  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
1452  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
1453  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
1454  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
1455  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
1456  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
1457  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
1458  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
1459  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
1460  if (doElect && doSlow) {
1461  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
1462  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
1463  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
1464  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
1465  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
1466  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
1467  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
1468  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
1469  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
1470  }
1471  }
1472  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
1473  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1474  int laneID = (threadIdx.x & (WARPSIZE - 1));
1475  int warpID = threadIdx.x / WARPSIZE;
1476  if (laneID == 0) {
1477  shVirialNbond[warpID] = virialNbond;
1478  shVirialSlow[warpID] = virialSlow;
1479  }
1480  BLOCK_SYNC;
1481 
1482  virialNbond.xx = 0.0;
1483  virialNbond.xy = 0.0;
1484  virialNbond.xz = 0.0;
1485  virialNbond.yx = 0.0;
1486  virialNbond.yy = 0.0;
1487  virialNbond.yz = 0.0;
1488  virialNbond.zx = 0.0;
1489  virialNbond.zy = 0.0;
1490  virialNbond.zz = 0.0;
1491  if (doElect && doSlow) {
1492  virialSlow.xx = 0.0;
1493  virialSlow.xy = 0.0;
1494  virialSlow.xz = 0.0;
1495  virialSlow.yx = 0.0;
1496  virialSlow.yy = 0.0;
1497  virialSlow.yz = 0.0;
1498  virialSlow.zx = 0.0;
1499  virialSlow.zy = 0.0;
1500  virialSlow.zz = 0.0;
1501  }
1502 
1503  if (warpID == 0) {
1504  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
1505  virialNbond = shVirialNbond[laneID];
1506  virialSlow = shVirialSlow[laneID];
1507  }
1508 #pragma unroll
1509  for (int i=16;i >= 1;i/=2) {
1510  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
1511  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
1512  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
1513  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
1514  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
1515  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
1516  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
1517  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
1518  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
1519  if (doElect && doSlow) {
1520  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
1521  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
1522  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
1523  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
1524  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
1525  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
1526  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
1527  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
1528  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
1529  }
1530  }
1531 
1532  if (laneID == 0)
1533  {
1534 #ifdef USE_FP_VIRIAL
1535  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], llitoulli(virialNbond.xx*double_to_virial));
1536  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], llitoulli(virialNbond.xy*double_to_virial));
1537  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], llitoulli(virialNbond.xz*double_to_virial));
1538  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], llitoulli(virialNbond.yx*double_to_virial));
1539  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], llitoulli(virialNbond.yy*double_to_virial));
1540  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], llitoulli(virialNbond.yz*double_to_virial));
1541  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], llitoulli(virialNbond.zx*double_to_virial));
1542  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], llitoulli(virialNbond.zy*double_to_virial));
1543  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], llitoulli(virialNbond.zz*double_to_virial));
1544  if (doElect && doSlow) {
1545  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], llitoulli(virialSlow.xx*double_to_virial));
1546  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], llitoulli(virialSlow.xy*double_to_virial));
1547  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], llitoulli(virialSlow.xz*double_to_virial));
1548  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], llitoulli(virialSlow.yx*double_to_virial));
1549  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], llitoulli(virialSlow.yy*double_to_virial));
1550  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], llitoulli(virialSlow.yz*double_to_virial));
1551  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], llitoulli(virialSlow.zx*double_to_virial));
1552  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], llitoulli(virialSlow.zy*double_to_virial));
1553  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], llitoulli(virialSlow.zz*double_to_virial));
1554  }
1555 #else
1556  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
1557  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
1558  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
1559  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
1560  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
1561  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
1562  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
1563  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
1564  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
1565  if (doElect && doSlow) {
1566  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
1567  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
1568  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
1569  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
1570  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
1571  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
1572  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
1573  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
1574  atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
1575  }
1576 #endif
1577  }
1578  }
1579  }
1580 #endif
1581 
1582 }
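The write-out above is a two-level reduction: a butterfly of XOR shuffles combines the 32 lanes of each warp, lane 0 of each warp stages its partial sum in shared memory (BONDEDFORCESKERNEL_NUM_WARP slots), and warp 0 repeats the butterfly before issuing one atomicAdd per block. A minimal sketch of the per-warp step, assuming 32-lane warps and that WARP_SHUFFLE_XOR expands to __shfl_xor_sync as in CudaUtils.h:

__device__ double warpReduceSum(double v) {
  // After log2(32) = 5 butterfly steps every lane holds the full warp sum,
  // so no extra broadcast is needed.
  for (int offset = 16; offset >= 1; offset /= 2)
    v += __shfl_xor_sync(0xffffffff, v, offset, 32);
  return v;
}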
template<typename T >
__forceinline__ __device__ void storeForces ( const T  fx,
const T  fy,
const T  fz,
const int  ind,
const int  stride,
T *  force 
)

Definition at line 122 of file ComputeBondedCUDAKernel.cu.

124  {
125  // The generic version can not be used
126 }
template<>
__forceinline__ __device__ void storeForces< long long int > ( const long long int  fx,
const long long int  fy,
const long long int  fz,
const int  ind,
const int  stride,
long long int *  force 
)

Definition at line 131 of file ComputeBondedCUDAKernel.cu.

References llitoulli().

135  {
136  atomicAdd((unsigned long long int *)&force[ind ], llitoulli(fx));
137  atomicAdd((unsigned long long int *)&force[ind + stride ], llitoulli(fy));
138  atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
139 }
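This specialization accumulates forces in 64-bit fixed point: each component, already scaled to fixed-point units by the caller, is rounded with lliroundf(), reinterpreted as unsigned with llitoulli(), and summed through the 64-bit integer atomicAdd(). Integer addition is associative, so the totals are bitwise reproducible across runs, which floating-point atomics cannot guarantee. A one-component sketch under those assumptions (the helper name is hypothetical):

__device__ void accumulateForceX(float fx, int ind, long long int* force) {
  // Round the pre-scaled component to a signed 64-bit fixed-point value.
  long long int qfx = lliroundf(fx);
  // Two's-complement wraparound makes the unsigned atomic add behave as a
  // signed accumulation.
  atomicAdd((unsigned long long int*)&force[ind], llitoulli(qfx));
}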

Variable Documentation

__thread DeviceCUDA* deviceCUDA