00001 #ifdef WIN32
00002 #define _USE_MATH_DEFINES
00003 #define __thread __declspec(thread)
00004 #endif
00005 #include <math.h>
00006
00007 #if defined(NAMD_HIP) && !defined(NAMD_CUDA)
00008 #include <hipcub/hipcub.hpp>
00009 #define cub hipcub
00010 #endif
00011 #ifdef NAMD_CUDA
00012 #include <cuda.h>
00013 #if __CUDACC_VER_MAJOR__ >= 11
00014 #include <cub/cub.cuh>
00015 #else
00016 #include <namd_cub/cub.cuh>
00017 #endif
00018 #endif
00019
00020 #include "ComputeBondedCUDAKernel.h"
00021
00022 #ifdef FUTURE_CUDADEVICE
00023 #include "CudaDevice.h"
00024 #else
00025 #include "DeviceCUDA.h"
00026 extern __thread DeviceCUDA *deviceCUDA;
00027 #endif
00028
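// Piecewise-linear interpolation of a float4 force/energy table read through a texture
// object; used on the HIP path below in place of hardware-filtered tex1D lookups.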
00029 __device__ __forceinline__
00030 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
00031 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
00032 const float x = k * (float)tableSize - 0.5f;
00033 const float f = floorf(x);
00034 const float a = x - f;
00035 const unsigned int i = (unsigned int)f;
00036 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
00037 const int i1 = i0 + 1;
00038 const float4 t0 = tex1Dfetch<float4>(tex, i0);
00039 const float4 t1 = tex1Dfetch<float4>(tex, i1);
00040 return make_float4(
00041 a * (t1.x - t0.x) + t0.x,
00042 a * (t1.y - t0.y) + t0.y,
00043 a * (t1.z - t0.z) + t0.z,
00044 a * (t1.w - t0.w) + t0.w);
00045 }
00046
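// Fixed-point helpers: round a float to a signed 64-bit integer and reinterpret it as
// unsigned so it can be accumulated with atomicAdd on unsigned long long counters.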
00047 #if defined(USE_FP_FORCE) || defined(USE_FP_VIRIAL)
00048 #ifndef NAMD_HIP
00049 __device__ inline long long int lliroundf(float f)
00050 {
00051 long long int l;
00052 asm("cvt.rni.s64.f32 %0, %1;" : "=l"(l) : "f"(f));
00053 return l;
00054 }
00055
00056 __device__ inline unsigned long long int llitoulli(long long int l)
00057 {
00058 unsigned long long int u;
00059 asm("mov.b64 %0, %1;" : "=l"(u) : "l"(l));
00060 return u;
00061 }
00062 #else
00063 __device__ inline long long int lliroundf(float f)
00064 {
00065 return llrintf(f);
00066 }
00067
00068 __device__ inline unsigned long long int llitoulli(long long int l)
00069 {
00070 return l;
00071 }
00072 #endif
00073 #endif
00074
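// Convert float force components to the accumulation type T.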
00075 template <typename T>
00076 __forceinline__ __device__
00077 void convertForces(const float x, const float y, const float z,
00078 T &Tx, T &Ty, T &Tz) {
00079
00080 Tx = (T)(x);
00081 Ty = (T)(y);
00082 Tz = (T)(z);
00083 }
00084
00085 #if defined(USE_FP_FORCE)
00086 template <>
00087 __forceinline__ __device__
00088 void convertForces<long long int>(const float x, const float y, const float z,
00089 long long int &Tx, long long int &Ty, long long int &Tz) {
00090
00091 Tx = lliroundf(x*float_to_force);
00092 Ty = lliroundf(y*float_to_force);
00093 Tz = lliroundf(z*float_to_force);
00094 }
00095 #endif
00096
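// Multiply the pair separation (dx, dy, dz) by the scalar force fij and store the
// components in the accumulation type T.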
00097 template <typename T>
00098 __forceinline__ __device__
00099 void calcComponentForces(
00100 float fij,
00101 const float dx, const float dy, const float dz,
00102 T &fxij, T &fyij, T &fzij) {
00103
00104 fxij = (T)(fij*dx);
00105 fyij = (T)(fij*dy);
00106 fzij = (T)(fij*dz);
00107
00108 }
00109
00110 #if defined(USE_FP_FORCE)
00111 template <>
00112 __forceinline__ __device__
00113 void calcComponentForces<long long int>(
00114 float fij,
00115 const float dx, const float dy, const float dz,
00116 long long int &fxij,
00117 long long int &fyij,
00118 long long int &fzij) {
00119
00120 fij *= float_to_force;
00121 fxij = lliroundf(fij*dx);
00122 fyij = lliroundf(fij*dy);
00123 fzij = lliroundf(fij*dz);
00124
00125 }
00126 #endif
00127
00128 template <typename T>
00129 __forceinline__ __device__
00130 void calcComponentForces(
00131 float fij1,
00132 const float dx1, const float dy1, const float dz1,
00133 float fij2,
00134 const float dx2, const float dy2, const float dz2,
00135 T &fxij, T &fyij, T &fzij) {
00136
00137 fxij = (T)(fij1*dx1 + fij2*dx2);
00138 fyij = (T)(fij1*dy1 + fij2*dy2);
00139 fzij = (T)(fij1*dz1 + fij2*dz2);
00140 }
00141
00142 #if defined(USE_FP_FORCE)
00143 template <>
00144 __forceinline__ __device__
00145 void calcComponentForces<long long int>(
00146 float fij1,
00147 const float dx1,
00148 const float dy1,
00149 const float dz1,
00150 float fij2,
00151 const float dx2,
00152 const float dy2,
00153 const float dz2,
00154 long long int &fxij,
00155 long long int &fyij,
00156 long long int &fzij) {
00157
00158 fij1 *= float_to_force;
00159 fij2 *= float_to_force;
00160 fxij = lliroundf(fij1*dx1 + fij2*dx2);
00161 fyij = lliroundf(fij1*dy1 + fij2*dy2);
00162 fzij = lliroundf(fij1*dz1 + fij2*dz2);
00163 }
00164 #endif
00165
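// Warp-aggregated atomic increment: one lane per warp performs the atomicAdd for all
// active lanes; each lane's offset is recovered from the ballot mask.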
00166 __forceinline__ __device__
00167 int warpAggregatingAtomicInc(int* counter) {
00168 #if WARPSIZE == 64
00169 WarpMask mask = __ballot(1);
00170 int total = __popcll(mask);
00171 int prefix = __popcll(mask & cub::LaneMaskLt());
00172 int firstLane = __ffsll(mask) - 1;
00173 #else
00174 WarpMask mask = __activemask();
00175 int total = __popc(mask);
00176 int prefix = __popc(mask & cub::LaneMaskLt());
00177 int firstLane = __ffs(mask) - 1;
00178 #endif
00179 int start = 0;
00180 if (prefix == 0) {
00181 start = atomicAdd(counter, total);
00182 }
00183 start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
00184 return start + prefix;
00185 }
00186
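// Accumulate a force triplet into atom 'ind': either directly with atomicAdd (with an
// optional head-segmented warp reduction on HIP), or by appending to a per-atom linked
// list when USE_BONDED_FORCE_ATOMIC_STORE is not defined.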
00187 template <typename T>
00188 __forceinline__ __device__
00189 void storeForces(const T fx, const T fy, const T fz,
00190 const int ind, const int stride,
00191 T* force,
00192 T* forceList, int* forceListCounter,
00193 int* forceListStarts, int* forceListNexts) {
00194 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
00195 #if defined(NAMD_HIP)
00196
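// With a fully active warp, coalesce contributions to repeated atom indices using a
// head-segmented warp reduction before issuing the atomics.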
00197 WarpMask mask = WARP_BALLOT(WARP_FULL_MASK, 1);
00198 if (mask == ~(WarpMask)0) {
00199 const int laneId = threadIdx.x % WARPSIZE;
00200 const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
00201 const bool isHead = laneId == 0 || ind != prevLaneInd;
00202 if (!WARP_ALL(WARP_FULL_MASK, isHead)) {
00203
00204 typedef cub::WarpReduce<T> WarpReduce;
00205 __shared__ typename WarpReduce::TempStorage temp_storage;
00206 const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
00207 const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
00208 const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
00209 if (isHead) {
00210 atomicAdd(&force[ind ], sumfx);
00211 atomicAdd(&force[ind + stride ], sumfy);
00212 atomicAdd(&force[ind + stride*2], sumfz);
00213 }
00214 return;
00215 }
00216 }
00217
00218 atomicAdd(&force[ind ], fx);
00219 atomicAdd(&force[ind + stride ], fy);
00220 atomicAdd(&force[ind + stride*2], fz);
00221 #else
00222 atomicAdd(&force[ind ], fx);
00223 atomicAdd(&force[ind + stride ], fy);
00224 atomicAdd(&force[ind + stride*2], fz);
00225 #endif
00226 #else
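// Linked-list accumulation: append this contribution to forceList and push it onto the
// per-atom chain headed by forceListStarts[ind] for a later gather pass.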
00227 const int newPos = warpAggregatingAtomicInc(forceListCounter);
00228 forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
00229 forceList[newPos * 3 + 0] = fx;
00230 forceList[newPos * 3 + 1] = fy;
00231 forceList[newPos * 3 + 2] = fz;
00232 #endif
00233 }
00234
00235 #if defined(USE_FP_FORCE)
00236
00237 template <>
00238 __forceinline__ __device__
00239 void storeForces <long long int> (const long long int fx,
00240 const long long int fy,
00241 const long long int fz,
00242 const int ind, const int stride,
00243 long long int* force) {
00244 atomicAdd((unsigned long long int *)&force[ind ], llitoulli(fx));
00245 atomicAdd((unsigned long long int *)&force[ind + stride ], llitoulli(fy));
00246 atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
00247 }
00248 #endif
00249
00250
00251
00252
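// Harmonic bond force for one CudaBond entry (atoms i and j).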
00253 template <typename T, bool doEnergy, bool doVirial>
00254 __device__ void bondForce(
00255 const int index,
00256 const CudaBond* __restrict__ bondList,
00257 const CudaBondValue* __restrict__ bondValues,
00258 const float4* __restrict__ xyzq,
00259 const int stride,
00260 const float3 lata, const float3 latb, const float3 latc,
00261 T* __restrict__ force, double &energy,
00262 T* __restrict__ forceList, int* forceListCounter,
00263 int* forceListStarts, int* forceListNexts,
00264 #ifdef WRITE_FULL_VIRIALS
00265 ComputeBondedCUDAKernel::BondedVirial<double>& virial
00266 #else
00267 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
00268 #endif
00269 ) {
00270
00271 CudaBond bl = bondList[index];
00272 CudaBondValue bondValue = bondValues[bl.itype];
00273 if (bondValue.x0 == 0.0f) return;
00274
00275 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
00276 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
00277 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
00278
00279 float4 xyzqi = xyzq[bl.i];
00280 float4 xyzqj = xyzq[bl.j];
00281
00282 float xij = xyzqi.x + shx - xyzqj.x;
00283 float yij = xyzqi.y + shy - xyzqj.y;
00284 float zij = xyzqi.z + shz - xyzqj.z;
00285
00286 float r = sqrtf(xij*xij + yij*yij + zij*zij);
00287
00288 float db = r - bondValue.x0;
00289 if (bondValue.x1) {
00290
00291
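// Nonzero x1 makes this a flat-bottomed wall: no force while x0 < r < x1.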
00292 db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
00293 }
00294 float fij = db * bondValue.k * bl.scale;
00295
00296 if (doEnergy) {
00297 energy += (double)(fij*db);
00298 }
00299 fij *= -2.0f/r;
00300
00301 T T_fxij, T_fyij, T_fzij;
00302 calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
00303
00304
00305 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00306 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00307
00308
00309 if (doVirial) {
00310 #ifdef WRITE_FULL_VIRIALS
00311 float fxij = fij*xij;
00312 float fyij = fij*yij;
00313 float fzij = fij*zij;
00314 virial.xx = (fxij*xij);
00315 virial.xy = (fxij*yij);
00316 virial.xz = (fxij*zij);
00317 virial.yx = (fyij*xij);
00318 virial.yy = (fyij*yij);
00319 virial.yz = (fyij*zij);
00320 virial.zx = (fzij*xij);
00321 virial.zy = (fzij*yij);
00322 virial.zz = (fzij*zij);
00323 #endif
00324 }
00325 }
00326
00327
00328
00329
00330
00331 #if defined(NAMD_HIP)
00332
00333
00334
00335
00336
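// Clamped piecewise-linear table lookup (HIP path, tables held in plain device arrays).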
00337 template<class T>
00338 __device__ __forceinline__
00339 T tableLookup(const T* table, const int tableSize, const float k)
00340 {
00341 const float x = k * static_cast<float>(tableSize) - 0.5f;
00342 const float f = floorf(x);
00343 const float a = x - f;
00344 const int i = static_cast<int>(f);
00345 const int i0 = max(0, min(tableSize - 1, i));
00346 const int i1 = max(0, min(tableSize - 1, i + 1));
00347 return (1.0f - a) * __ldg(&table[i0]) + a * __ldg(&table[i1]);
00348 }
00349 #endif
00350
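// Modified 1-4 exclusion: evaluates the Lennard-Jones and scaled electrostatic
// interaction of an excluded pair from the force/energy tables, optionally including
// the slow electrostatic component when doSlow is set.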
00351 template <typename T, bool doEnergy, bool doVirial, bool doElect>
00352 __device__ void modifiedExclusionForce(
00353 const int index,
00354 const CudaExclusion* __restrict__ exclusionList,
00355 const bool doSlow,
00356 const float one_scale14,
00357 const int vdwCoefTableWidth,
00358 #if __CUDA_ARCH__ >= 350
00359 const float2* __restrict__ vdwCoefTable,
00360 #else
00361 cudaTextureObject_t vdwCoefTableTex,
00362 #endif
00363 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
00364 const float4* __restrict__ xyzq,
00365 const int stride,
00366 const float3 lata, const float3 latb, const float3 latc,
00367 const float cutoff2,
00368 double &energyVdw,
00369 T* __restrict__ forceNbond, double &energyNbond,
00370 T* __restrict__ forceSlow, double &energySlow,
00371 T* __restrict__ forceList, int* forceListCounter,
00372 int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
00373 #ifdef WRITE_FULL_VIRIALS
00374 ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
00375 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
00376 #else
00377 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
00378 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
00379 #endif
00380 ) {
00381
00382 CudaExclusion bl = exclusionList[index];
00383
00384 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
00385 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
00386 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
00387
00388 float4 xyzqi = xyzq[bl.i];
00389 float4 xyzqj = xyzq[bl.j];
00390
00391 float xij = xyzqi.x + shx - xyzqj.x;
00392 float yij = xyzqi.y + shy - xyzqj.y;
00393 float zij = xyzqi.z + shz - xyzqj.z;
00394
00395 float r2 = xij*xij + yij*yij + zij*zij;
00396 if (r2 < cutoff2) {
00397
00398 float rinv = rsqrtf(r2);
00399
00400 float qq;
00401 if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
00402
00403 int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
00404 #if __CUDA_ARCH__ >= 350
00405 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
00406 #else
00407 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
00408 #endif
00409
00410 #if defined(NAMD_CUDA)
00411 float4 fi = tex1D<float4>(forceTableTex, rinv);
00412 #else // NAMD_HIP
00413 float4 fi = sampleTableTex(forceTableTex, rinv);
00414 #endif
00415 float4 ei;
00416 if (doEnergy) {
00417 #if defined NAMD_CUDA
00418 ei = tex1D<float4>(energyTableTex, rinv);
00419 #else
00420 ei = sampleTableTex(energyTableTex, rinv);
00421 #endif
00422 energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
00423 if (doElect) {
00424 energyNbond += qq * ei.x;
00425 if (doSlow) energySlow += qq * ei.w;
00426 }
00427 }
00428
00429 float fNbond;
00430 if (doElect) {
00431 fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
00432 } else {
00433 fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
00434 }
00435 T T_fxij, T_fyij, T_fzij;
00436 calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
00437 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
00438 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
00439
00440 float fSlow;
00441 if (doSlow && doElect) {
00442 fSlow = -qq * fi.w;
00443 T T_fxij, T_fyij, T_fzij;
00444 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
00447 }
00448
00449
00450 if (doVirial) {
00451 #ifdef WRITE_FULL_VIRIALS
00452 float fxij = fNbond*xij;
00453 float fyij = fNbond*yij;
00454 float fzij = fNbond*zij;
00455 virialNbond.xx = (fxij*xij);
00456 virialNbond.xy = (fxij*yij);
00457 virialNbond.xz = (fxij*zij);
00458 virialNbond.yx = (fyij*xij);
00459 virialNbond.yy = (fyij*yij);
00460 virialNbond.yz = (fyij*zij);
00461 virialNbond.zx = (fzij*xij);
00462 virialNbond.zy = (fzij*yij);
00463 virialNbond.zz = (fzij*zij);
00464 #endif
00465 }
00466
00467
00468 if (doVirial && doSlow && doElect) {
00469 #ifdef WRITE_FULL_VIRIALS
00470 float fxij = fSlow*xij;
00471 float fyij = fSlow*yij;
00472 float fzij = fSlow*zij;
00473 virialSlow.xx = (fxij*xij);
00474 virialSlow.xy = (fxij*yij);
00475 virialSlow.xz = (fxij*zij);
00476 virialSlow.yx = (fyij*xij);
00477 virialSlow.yy = (fyij*yij);
00478 virialSlow.yz = (fyij*zij);
00479 virialSlow.zx = (fzij*xij);
00480 virialSlow.zy = (fzij*yij);
00481 virialSlow.zz = (fzij*zij);
00482 #endif
00483 }
00484
00485 }
00486 }
00487
00488
00489
00490
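// Full exclusion: accumulates the slow electrostatic correction for an excluded pair,
// interpolated from the r2-indexed exclusion table.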
00491 template <typename T, bool doEnergy, bool doVirial>
00492 __device__ void exclusionForce(
00493 const int index,
00494 const CudaExclusion* __restrict__ exclusionList,
00495 const float r2_delta, const int r2_delta_expc,
00496
00497 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
00498 const float* __restrict__ r2_table,
00499 const float4* __restrict__ exclusionTable,
00500 #else
00501 cudaTextureObject_t r2_table_tex,
00502 cudaTextureObject_t exclusionTableTex,
00503 #endif
00504
00505 const float4* __restrict__ xyzq,
00506 const int stride,
00507 const float3 lata, const float3 latb, const float3 latc,
00508 const float cutoff2,
00509 T* __restrict__ forceSlow, double &energySlow,
00510 T* __restrict__ forceList, int* forceListCounter,
00511 int* forceListStartsSlow, int* forceListNexts,
00512 #ifdef WRITE_FULL_VIRIALS
00513 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
00514 #else
00515 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
00516 #endif
00517 ) {
00518
00519 CudaExclusion bl = exclusionList[index];
00520
00521 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
00522 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
00523 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
00524
00525 float4 xyzqi = xyzq[bl.i];
00526 float4 xyzqj = xyzq[bl.j];
00527
00528 float xij = xyzqi.x + shx - xyzqj.x;
00529 float yij = xyzqi.y + shy - xyzqj.y;
00530 float zij = xyzqi.z + shz - xyzqj.z;
00531
00532 float r2 = xij*xij + yij*yij + zij*zij;
00533 if (r2 < cutoff2) {
00534 r2 += r2_delta;
00535
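// Form the table index from the exponent and high mantissa bits of r2, offset by
// r2_delta_expc into the range covered by the exclusion tables.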
00536 union { float f; int i; } r2i;
00537 r2i.f = r2;
00538 int table_i = (r2i.i >> 17) + r2_delta_expc;
00539
00540 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
00541 float r2_table_val = __ldg(&r2_table[table_i]);
00542 #else
00543 float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
00544 #endif
00545
00546 float diffa = r2 - r2_table_val;
00547 float qq = xyzqi.w * xyzqj.w;
00548
00549 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
00550 float4 slow = __ldg(&exclusionTable[table_i]);
00551 #else
00552 float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
00553 #endif
00554
00555
00556 if (doEnergy) {
00557 energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
00558 }
00559
00560 float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
00561
00562 T T_fxij, T_fyij, T_fzij;
00563 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
00564 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
00565 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
00566
00567
00568 if (doVirial) {
00569 #ifdef WRITE_FULL_VIRIALS
00570 float fxij = fSlow*xij;
00571 float fyij = fSlow*yij;
00572 float fzij = fSlow*zij;
00573 virialSlow.xx = (fxij*xij);
00574 virialSlow.xy = (fxij*yij);
00575 virialSlow.xz = (fxij*zij);
00576 virialSlow.yx = (fyij*xij);
00577 virialSlow.yy = (fyij*yij);
00578 virialSlow.yz = (fyij*zij);
00579 virialSlow.zx = (fzij*xij);
00580 virialSlow.zy = (fzij*yij);
00581 virialSlow.zz = (fzij*zij);
00582 #endif
00583 }
00584 }
00585 }
00586
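// Angle force: harmonic in theta or in cos(theta) (angleValue.normal), with an
// optional Urey-Bradley 1-3 term when k_ub is nonzero.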
00587 template <typename T, bool doEnergy, bool doVirial>
00588 __device__ void angleForce(const int index,
00589 const CudaAngle* __restrict__ angleList,
00590 const CudaAngleValue* __restrict__ angleValues,
00591 const float4* __restrict__ xyzq,
00592 const int stride,
00593 const float3 lata, const float3 latb, const float3 latc,
00594 T* __restrict__ force, double &energy,
00595 T* __restrict__ forceList, int* forceListCounter,
00596 int* forceListStarts, int* forceListNexts,
00597 #ifdef WRITE_FULL_VIRIALS
00598 ComputeBondedCUDAKernel::BondedVirial<double>& virial
00599 #else
00600 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
00601 #endif
00602 ) {
00603
00604 CudaAngle al = angleList[index];
00605
00606 float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
00607 float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
00608 float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
00609
00610 float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
00611 float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
00612 float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
00613
00614 float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
00615 float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
00616 float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
00617
00618 float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
00619 float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
00620 float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
00621
00622 float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
00623 float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
00624
00625 float xijr = xij*rij_inv;
00626 float yijr = yij*rij_inv;
00627 float zijr = zij*rij_inv;
00628 float xkjr = xkj*rkj_inv;
00629 float ykjr = ykj*rkj_inv;
00630 float zkjr = zkj*rkj_inv;
00631 float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
00632
00633 CudaAngleValue angleValue = angleValues[al.itype];
00634 angleValue.k *= al.scale;
00635
00636 float diff;
00637 if (angleValue.normal == 1) {
00638
00639 cos_theta = fminf(0.999f, fmaxf(-0.999f, cos_theta));
00640 float theta = acosf(cos_theta);
00641 diff = theta - angleValue.theta0;
00642 } else {
00643 diff = cos_theta - angleValue.theta0;
00644 }
00645
00646 if (doEnergy) {
00647 energy += (double)(angleValue.k * diff * diff);
00648 }
00649 if (angleValue.normal == 1) {
00650 float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
00651 if (inv_sin_theta > 1.0e6) {
00652 diff = (diff < 0.0f) ? 1.0f : -1.0f;
00653 } else {
00654 diff *= -inv_sin_theta;
00655 }
00656 }
00657 diff *= -2.0f*angleValue.k;
00658
00659 float dtxi = rij_inv*(xkjr - cos_theta*xijr);
00660 float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
00661 float dtyi = rij_inv*(ykjr - cos_theta*yijr);
00662 float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
00663 float dtzi = rij_inv*(zkjr - cos_theta*zijr);
00664 float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
00665
00666 T T_dtxi, T_dtyi, T_dtzi;
00667 T T_dtxj, T_dtyj, T_dtzj;
00668 calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
00669 calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
00670 T T_dtxk = -T_dtxi - T_dtxj;
00671 T T_dtyk = -T_dtyi - T_dtyj;
00672 T T_dtzk = -T_dtzi - T_dtzj;
00673 storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00674
00675 if (angleValue.k_ub) {
00676 float xik = xij - xkj;
00677 float yik = yij - ykj;
00678 float zik = zij - zkj;
00679 float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
00680 float rik = 1.0f/rik_inv;
00681 float diff_ub = rik - angleValue.r_ub;
00682 if (doEnergy) {
00683 energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
00684 }
00685 diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
00686 T T_dxik, T_dyik, T_dzik;
00687 calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
00688 T_dtxi += T_dxik;
00689 T_dtyi += T_dyik;
00690 T_dtzi += T_dzik;
00691 T_dtxj -= T_dxik;
00692 T_dtyj -= T_dyik;
00693 T_dtzj -= T_dzik;
00694 }
00695
00696 storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00697 storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00698
00699
00700 if (doVirial) {
00701 #ifdef WRITE_FULL_VIRIALS
00702 float fxi = ((float)T_dtxi)*force_to_float;
00703 float fyi = ((float)T_dtyi)*force_to_float;
00704 float fzi = ((float)T_dtzi)*force_to_float;
00705 float fxk = ((float)T_dtxj)*force_to_float;
00706 float fyk = ((float)T_dtyj)*force_to_float;
00707 float fzk = ((float)T_dtzj)*force_to_float;
00708 virial.xx = (fxi*xij) + (fxk*xkj);
00709 virial.xy = (fxi*yij) + (fxk*ykj);
00710 virial.xz = (fxi*zij) + (fxk*zkj);
00711 virial.yx = (fyi*xij) + (fyk*xkj);
00712 virial.yy = (fyi*yij) + (fyk*ykj);
00713 virial.yz = (fyi*zij) + (fyk*zkj);
00714 virial.zx = (fzi*xij) + (fzk*xkj);
00715 virial.zy = (fzi*yij) + (fzk*ykj);
00716 virial.zz = (fzi*zij) + (fzk*zkj);
00717 #endif
00718 }
00719 }
00720
00721
00722
00723
00724
00725
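// Sum the multiterm dihedral expansion: the low bit of n flags a following term, the
// remaining bits give the periodicity (0 selects the harmonic improper form).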
00726 template <bool doEnergy>
00727 __forceinline__ __device__
00728 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
00729 const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
00730
00731 df = 0.0f;
00732 if (doEnergy) e = 0.0;
00733
00734 float phi = atan2f(sin_phi, cos_phi);
00735
00736 bool lrep = true;
00737 while (lrep) {
00738 CudaDihedralValue dihedralValue = dihedralValues[ic];
00739 dihedralValue.k *= scale;
00740
00741
00742 lrep = (dihedralValue.n & 1);
00743 dihedralValue.n >>= 1;
00744 if (dihedralValue.n) {
00745 float nf = dihedralValue.n;
00746 float x = nf*phi - dihedralValue.delta;
00747 if (doEnergy) {
00748 float sin_x, cos_x;
00749 sincosf(x, &sin_x, &cos_x);
00750 e += (double)(dihedralValue.k*(1.0f + cos_x));
00751 df += (double)(nf*dihedralValue.k*sin_x);
00752 } else {
00753 float sin_x = sinf(x);
00754 df += (double)(nf*dihedralValue.k*sin_x);
00755 }
00756 } else {
00757 float diff = phi - dihedralValue.delta;
00758 if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
00759 if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
00760 if (doEnergy) {
00761 e += (double)(dihedralValue.k*diff*diff);
00762 }
00763 df -= 2.0f*dihedralValue.k*diff;
00764 }
00765 ic++;
00766 }
00767
00768 }
00769
00770
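// Proper/improper dihedral force for one CudaDihedral entry (atoms i-j-k-l).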
00771 template <typename T, bool doEnergy, bool doVirial>
00772 __device__ void diheForce(const int index,
00773 const CudaDihedral* __restrict__ diheList,
00774 const CudaDihedralValue* __restrict__ dihedralValues,
00775 const float4* __restrict__ xyzq,
00776 const int stride,
00777 const float3 lata, const float3 latb, const float3 latc,
00778 T* __restrict__ force, double &energy,
00779 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
00780 #ifdef WRITE_FULL_VIRIALS
00781 ComputeBondedCUDAKernel::BondedVirial<double>& virial
00782 #else
00783 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
00784 #endif
00785 ) {
00786
00787 CudaDihedral dl = diheList[index];
00788
00789 float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
00790 float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
00791 float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
00792
00793 float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
00794 float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
00795 float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
00796
00797 float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
00798 float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
00799 float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
00800
00801 float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
00802 float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
00803 float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
00804
00805 float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
00806 float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
00807 float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
00808
00809 float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
00810 float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
00811 float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
00812
00813
00814 float ax = yij*zjk - zij*yjk;
00815 float ay = zij*xjk - xij*zjk;
00816 float az = xij*yjk - yij*xjk;
00817 float bx = ylk*zjk - zlk*yjk;
00818 float by = zlk*xjk - xlk*zjk;
00819 float bz = xlk*yjk - ylk*xjk;
00820
00821 float ra2 = ax*ax + ay*ay + az*az;
00822 float rb2 = bx*bx + by*by + bz*bz;
00823 float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
00824
00825
00826
00827
00828
00829 float rgr = 1.0f / rg;
00830 float ra2r = 1.0f / ra2;
00831 float rb2r = 1.0f / rb2;
00832 float rabr = sqrtf(ra2r*rb2r);
00833
00834 float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
00835
00836
00837
00838 float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
00839
00840
00841
00842 float df;
00843 double e;
00844 diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
00845
00846 if (doEnergy) energy += e;
00847
00848
00849
00850
00851
00852
00853
00854 float fg = xij*xjk + yij*yjk + zij*zjk;
00855 float hg = xlk*xjk + ylk*yjk + zlk*zjk;
00856 ra2r *= df;
00857 rb2r *= df;
00858 float fga = fg*ra2r*rgr;
00859 float hgb = hg*rb2r*rgr;
00860 float gaa = ra2r*rg;
00861 float gbb = rb2r*rg;
00862
00863
00864
00865 T T_fix, T_fiy, T_fiz;
00866 calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
00867 storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00868
00869 T dgx, dgy, dgz;
00870 calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
00871 T T_fjx = dgx - T_fix;
00872 T T_fjy = dgy - T_fiy;
00873 T T_fjz = dgz - T_fiz;
00874 storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00875
00876 T dhx, dhy, dhz;
00877 calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
00878 T T_fkx = -dhx - dgx;
00879 T T_fky = -dhy - dgy;
00880 T T_fkz = -dhz - dgz;
00881 storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00882 storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
00883
00884
00885 if (doVirial) {
00886 #ifdef WRITE_FULL_VIRIALS
00887 float fix = ((float)T_fix)*force_to_float;
00888 float fiy = ((float)T_fiy)*force_to_float;
00889 float fiz = ((float)T_fiz)*force_to_float;
00890 float fjx = ((float)dgx)*force_to_float;
00891 float fjy = ((float)dgy)*force_to_float;
00892 float fjz = ((float)dgz)*force_to_float;
00893 float flx = ((float)dhx)*force_to_float;
00894 float fly = ((float)dhy)*force_to_float;
00895 float flz = ((float)dhz)*force_to_float;
00896 virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
00897 virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
00898 virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
00899 virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
00900 virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
00901 virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
00902 virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
00903 virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
00904 virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
00905 #endif
00906 }
00907
00908 }
00909
00910 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
00911 return make_float3(
00912 v1.y*v2.z-v2.y*v1.z,
00913 v2.x*v1.z-v1.x*v2.z,
00914 v1.x*v2.y-v2.x*v1.y
00915 );
00916 }
00917
00918 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
00919 return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
00920 }
00921
00922
00923
00924
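// Crossterm (CMAP) force: bicubic interpolation of the correction energy on the
// (phi, psi) dihedral-pair grid for the eight atoms of a crossterm entry.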
00925 template <typename T, bool doEnergy, bool doVirial>
00926 __device__ void crosstermForce(
00927 const int index,
00928 const CudaCrossterm* __restrict__ crosstermList,
00929 const CudaCrosstermValue* __restrict__ crosstermValues,
00930 const float4* __restrict__ xyzq,
00931 const int stride,
00932 const float3 lata, const float3 latb, const float3 latc,
00933 T* __restrict__ force, double &energy,
00934 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
00935 #ifdef WRITE_FULL_VIRIALS
00936 ComputeBondedCUDAKernel::BondedVirial<double>& virial
00937 #else
00938 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
00939 #endif
00940 ) {
00941
00942 CudaCrossterm cl = crosstermList[index];
00943
00944
00945
00946
00947
00948 float3 sh12 = make_float3(
00949 cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
00950 cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
00951 cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
00952
00953 float4 xyzq1 = xyzq[cl.i1];
00954 float4 xyzq2 = xyzq[cl.i2];
00955
00956 float3 r12 = make_float3(
00957 xyzq1.x + sh12.x - xyzq2.x,
00958 xyzq1.y + sh12.y - xyzq2.y,
00959 xyzq1.z + sh12.z - xyzq2.z);
00960
00961
00962 float3 sh23 = make_float3(
00963 cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
00964 cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
00965 cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
00966
00967 float4 xyzq3 = xyzq[cl.i3];
00968
00969 float3 r23 = make_float3(
00970 xyzq2.x + sh23.x - xyzq3.x,
00971 xyzq2.y + sh23.y - xyzq3.y,
00972 xyzq2.z + sh23.z - xyzq3.z);
00973
00974
00975 float3 sh34 = make_float3(
00976 cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
00977 cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
00978 cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
00979
00980 float4 xyzq4 = xyzq[cl.i4];
00981
00982 float3 r34 = make_float3(
00983 xyzq3.x + sh34.x - xyzq4.x,
00984 xyzq3.y + sh34.y - xyzq4.y,
00985 xyzq3.z + sh34.z - xyzq4.z);
00986
00987
00988 float3 A = cross(r12, r23);
00989 float3 B = cross(r23, r34);
00990 float3 C = cross(r23, A);
00991
00992
00993 float inv_rA = rsqrtf(dot(A, A));
00994 float inv_rB = rsqrtf(dot(B, B));
00995 float inv_rC = rsqrtf(dot(C, C));
00996
00997
00998 float cos_phi = dot(A, B)*(inv_rA*inv_rB);
00999 float sin_phi = dot(C, B)*(inv_rC*inv_rB);
01000
01001 float phi = -atan2f(sin_phi,cos_phi);
01002
01003
01004
01005
01006
01007
01008 float3 sh56 = make_float3(
01009 cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
01010 cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
01011 cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
01012
01013 float4 xyzq5 = xyzq[cl.i5];
01014 float4 xyzq6 = xyzq[cl.i6];
01015
01016 float3 r56 = make_float3(
01017 xyzq5.x + sh56.x - xyzq6.x,
01018 xyzq5.y + sh56.y - xyzq6.y,
01019 xyzq5.z + sh56.z - xyzq6.z);
01020
01021
01022 float3 sh67 = make_float3(
01023 cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
01024 cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
01025 cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
01026
01027 float4 xyzq7 = xyzq[cl.i7];
01028
01029 float3 r67 = make_float3(
01030 xyzq6.x + sh67.x - xyzq7.x,
01031 xyzq6.y + sh67.y - xyzq7.y,
01032 xyzq6.z + sh67.z - xyzq7.z);
01033
01034
01035 float3 sh78 = make_float3(
01036 cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
01037 cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
01038 cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
01039
01040 float4 xyzq8 = xyzq[cl.i8];
01041
01042 float3 r78 = make_float3(
01043 xyzq7.x + sh78.x - xyzq8.x,
01044 xyzq7.y + sh78.y - xyzq8.y,
01045 xyzq7.z + sh78.z - xyzq8.z);
01046
01047
01048 float3 D = cross(r56, r67);
01049 float3 E = cross(r67, r78);
01050 float3 F = cross(r67, D);
01051
01052
01053 float inv_rD = rsqrtf(dot(D, D));
01054 float inv_rE = rsqrtf(dot(E, E));
01055 float inv_rF = rsqrtf(dot(F, F));
01056
01057
01058 float cos_psi = dot(D, E)*(inv_rD*inv_rE);
01059 float sin_psi = dot(F, E)*(inv_rF*inv_rE);
01060
01061 float psi = -atan2f(sin_psi,cos_psi);
01062
01063
01064
01065
01066 const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
01067
01068
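// Wrap phi and psi from (-pi, pi] into [0, 2*pi) before converting to grid coordinates.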
01069 phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
01070 psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
01071
01072
01073 float phi_h = phi * inv_h;
01074 float psi_h = psi * inv_h;
01075
01076
01077 int iphi = (int)floor(phi_h);
01078 int ipsi = (int)floor(psi_h);
01079
01080 const CudaCrosstermValue& cp = crosstermValues[cl.itype];
01081
01082
01083 float4 c[4];
01084 c[0] = cp.c[iphi][ipsi][0];
01085 c[1] = cp.c[iphi][ipsi][1];
01086 c[2] = cp.c[iphi][ipsi][2];
01087 c[3] = cp.c[iphi][ipsi][3];
01088
01089 float dphi = phi_h - (float)iphi;
01090 float dpsi = psi_h - (float)ipsi;
01091
01092 if (doEnergy) {
01093 float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
01094 energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
01095 energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
01096 energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
01097 energy += energyf*cl.scale;
01098 }
01099
01100 float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
01101 dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
01102 dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) ));
01103 dEdphi *= cl.scale*inv_h;
01104
01105 float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
01106 dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
01107 dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) ));
01108 dEdpsi *= cl.scale*inv_h;
01109
01110
01111 float square_r23 = dot(r23, r23);
01112 float norm_r23 = sqrtf(square_r23);
01113 float inv_square_r23 = 1.0f/square_r23;
01114 float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
01115 float ff2 = -dot(r12, r23)*inv_square_r23;
01116 float ff3 = -dot(r34, r23)*inv_square_r23;
01117 float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
01118
01119 float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
01120 float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
01121 float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
01122 float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
01123 float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
01124
01125 T T_f1x, T_f1y, T_f1z;
01126 T T_f2x, T_f2y, T_f2z;
01127 T T_f3x, T_f3y, T_f3z;
01128 T T_f4x, T_f4y, T_f4z;
01129 convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
01130 convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
01131 convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
01132 convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
01133 storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01134 storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01135 storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01136 storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01137
01138 float square_r67 = dot(r67, r67);
01139 float norm_r67 = sqrtf(square_r67);
01140 float inv_square_r67 = 1.0f/(square_r67);
01141 ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
01142 ff2 = -dot(r56, r67)*inv_square_r67;
01143 ff3 = -dot(r78, r67)*inv_square_r67;
01144 ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
01145
01146 float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
01147 float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
01148 float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
01149 float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
01150 float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
01151
01152 T T_f5x, T_f5y, T_f5z;
01153 T T_f6x, T_f6y, T_f6z;
01154 T T_f7x, T_f7y, T_f7z;
01155 T T_f8x, T_f8y, T_f8z;
01156 convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
01157 convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
01158 convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
01159 convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
01160 storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01161 storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01162 storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01163 storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
01164
01165
01166 if (doVirial) {
01167 #ifdef WRITE_FULL_VIRIALS
01168 float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
01169 float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
01170 virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
01171 virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
01172 virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
01173 virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
01174 virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
01175 virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
01176 virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
01177 virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
01178 virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
#endif
}
01181
01182 }
01183
01184
#define BONDEDFORCESKERNEL_NUM_WARP 4
01190
01191
01192
01193
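// One thread block handles a contiguous slice of a single tuple type (bonds, angles,
// dihedrals, impropers, exclusions, crossterms); energies and virials are reduced per
// block and accumulated atomically into energies_virials.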
01194 template <typename T, bool doEnergy, bool doVirial>
01195 __global__ void bondedForcesKernel(
01196 const int start,
01197
01198 const int numBonds,
01199 const CudaBond* __restrict__ bonds,
01200 const CudaBondValue* __restrict__ bondValues,
01201
01202 const int numAngles,
01203 const CudaAngle* __restrict__ angles,
01204 const CudaAngleValue* __restrict__ angleValues,
01205
01206 const int numDihedrals,
01207 const CudaDihedral* __restrict__ dihedrals,
01208 const CudaDihedralValue* __restrict__ dihedralValues,
01209
01210 const int numImpropers,
01211 const CudaDihedral* __restrict__ impropers,
01212 const CudaDihedralValue* __restrict__ improperValues,
01213
01214 const int numExclusions,
01215 const CudaExclusion* __restrict__ exclusions,
01216
01217 const int numCrossterms,
01218 const CudaCrossterm* __restrict__ crossterms,
01219 const CudaCrosstermValue* __restrict__ crosstermValues,
01220
01221 const float cutoff2,
01222 const float r2_delta, const int r2_delta_expc,
01223
01224 const float* __restrict__ r2_table,
01225 const float4* __restrict__ exclusionTable,
01226
01227 #ifndef NAMD_HIP
01228 cudaTextureObject_t r2_table_tex,
01229 cudaTextureObject_t exclusionTableTex,
01230 #endif
01231
01232 const float4* __restrict__ xyzq,
01233 const int stride,
01234 const float3 lata, const float3 latb, const float3 latc,
01235 T* __restrict__ force,
01236 T* __restrict__ forceSlow,
01237 T* __restrict__ forceList,
01238 int* forceListCounter,
01239 int* forceListStarts,
01240 int* forceListStartsSlow,
01241 int* forceListNexts,
01242 double* __restrict__ energies_virials) {
01243
01244
01245 int indexTB = start + blockIdx.x;
01246
01247 const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
01248 const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
01249 const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
01250 const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
01251 const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
01252 const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
01253
01254
01255
01256 double energy;
01257 int energyIndex;
01258
01259 if (doEnergy) {
01260 energy = 0.0;
01261 energyIndex = -1;
01262 }
01263
01264 #ifdef WRITE_FULL_VIRIALS
01265 ComputeBondedCUDAKernel::BondedVirial<double> virial;
01266 int virialIndex;
01267 if (doVirial) {
01268 virial.xx = 0.0;
01269 virial.xy = 0.0;
01270 virial.xz = 0.0;
01271 virial.yx = 0.0;
01272 virial.yy = 0.0;
01273 virial.yz = 0.0;
01274 virial.zx = 0.0;
01275 virial.zy = 0.0;
01276 virial.zz = 0.0;
01277 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
01278 }
01279 #endif
01280
01281 if (indexTB < numBondsTB) {
01282 int i = threadIdx.x + indexTB*blockDim.x;
01283 if (doEnergy) {
01284 energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
01285 }
01286 if (i < numBonds) {
01287 bondForce<T, doEnergy, doVirial>
01288 (i, bonds, bondValues, xyzq,
01289 stride, lata, latb, latc,
01290 force, energy,
01291 forceList, forceListCounter, forceListStarts, forceListNexts,
01292 virial);
01293 }
01294 goto reduce;
01295 }
01296 indexTB -= numBondsTB;
01297
01298 if (indexTB < numAnglesTB) {
01299 int i = threadIdx.x + indexTB*blockDim.x;
01300 if (doEnergy) {
01301 energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
01302 }
01303 if (i < numAngles) {
01304 angleForce<T, doEnergy, doVirial>
01305 (i, angles, angleValues, xyzq, stride,
01306 lata, latb, latc,
01307 force, energy,
01308 forceList, forceListCounter, forceListStarts, forceListNexts,
01309 virial);
01310 }
01311 goto reduce;
01312 }
01313 indexTB -= numAnglesTB;
01314
01315 if (indexTB < numDihedralsTB) {
01316 int i = threadIdx.x + indexTB*blockDim.x;
01317 if (doEnergy) {
01318 energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
01319 }
01320 if (doVirial) {
01321 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
01322 }
01323 if (i < numDihedrals) {
01324 diheForce<T, doEnergy, doVirial>
01325 (i, dihedrals, dihedralValues, xyzq, stride,
01326 lata, latb, latc,
01327 force, energy,
01328 forceList, forceListCounter, forceListStarts, forceListNexts,
01329 virial);
01330 }
01331 goto reduce;
01332 }
01333 indexTB -= numDihedralsTB;
01334
01335 if (indexTB < numImpropersTB) {
01336 int i = threadIdx.x + indexTB*blockDim.x;
01337 if (doEnergy) {
01338 energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
01339 }
01340 if (i < numImpropers) {
01341 diheForce<T, doEnergy, doVirial>
01342 (i, impropers, improperValues, xyzq, stride,
01343 lata, latb, latc,
01344 force, energy,
01345 forceList, forceListCounter, forceListStarts, forceListNexts,
01346 virial);
01347 }
01348 goto reduce;
01349 }
01350 indexTB -= numImpropersTB;
01351
01352 if (indexTB < numExclusionsTB) {
01353 int i = threadIdx.x + indexTB*blockDim.x;
01354 if (doEnergy) {
01355 energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
01356 }
01357 if (doVirial) {
01358 virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
01359 }
01360 if (i < numExclusions) {
01361 exclusionForce<T, doEnergy, doVirial>
01362 (i, exclusions, r2_delta, r2_delta_expc,
01363 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
01364 r2_table, exclusionTable,
01365 #else
01366 r2_table_tex, exclusionTableTex,
01367 #endif
01368 xyzq, stride, lata, latb, latc, cutoff2,
01369 forceSlow, energy,
01370 forceList, forceListCounter, forceListStartsSlow, forceListNexts,
01371 virial);
01372 }
01373 goto reduce;
01374 }
01375 indexTB -= numExclusionsTB;
01376
01377 if (indexTB < numCrosstermsTB) {
01378 int i = threadIdx.x + indexTB*blockDim.x;
01379 if (doEnergy) {
01380 energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
01381 }
01382 if (doVirial) {
01383 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
01384 }
01385 if (i < numCrossterms) {
01386 crosstermForce<T, doEnergy, doVirial>
01387 (i, crossterms, crosstermValues,
01388 xyzq, stride, lata, latb, latc,
01389 force, energy,
01390 forceList, forceListCounter, forceListStarts, forceListNexts,
01391 virial);
01392 }
01393 goto reduce;
01394 }
01395
01396
01397 reduce:
01398
01399
01400 if (doEnergy) {
01401
01402 __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
01403 #pragma unroll
01404 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01405 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
01406 }
01407 int laneID = (threadIdx.x & (WARPSIZE - 1));
01408 int warpID = threadIdx.x / WARPSIZE;
01409 if (laneID == 0) {
01410 shEnergy[warpID] = energy;
01411 }
01412 BLOCK_SYNC;
01413 if (warpID == 0) {
01414 energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
01415 #pragma unroll
01416 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01417 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
01418 }
01419 if (laneID == 0) {
01420 const int bin = blockIdx.x % ATOMIC_BINS;
01421 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
01422 }
01423 }
01424 }
01425
01426
01427 #ifdef WRITE_FULL_VIRIALS
01428 if (doVirial) {
01429 #pragma unroll
01430 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01431 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
01432 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
01433 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
01434 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
01435 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
01436 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
01437 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
01438 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
01439 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
01440 }
01441 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
01442 int laneID = (threadIdx.x & (WARPSIZE - 1));
01443 int warpID = threadIdx.x / WARPSIZE;
01444 if (laneID == 0) {
01445 shVirial[warpID] = virial;
01446 }
01447 BLOCK_SYNC;
01448
01449 if (warpID == 0) {
01450 virial.xx = 0.0;
01451 virial.xy = 0.0;
01452 virial.xz = 0.0;
01453 virial.yx = 0.0;
01454 virial.yy = 0.0;
01455 virial.yz = 0.0;
01456 virial.zx = 0.0;
01457 virial.zy = 0.0;
01458 virial.zz = 0.0;
01459 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
01460 #pragma unroll
01461 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01462 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
01463 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
01464 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
01465 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
01466 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
01467 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
01468 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
01469 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
01470 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
01471 }
01472
01473 if (laneID == 0) {
01474 #ifdef USE_FP_VIRIAL
01475 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
01476 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
01477 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
01478 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
01479 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
01480 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
01481 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
01482 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
01483 atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
01484 #else
01485 const int bin = blockIdx.x % ATOMIC_BINS;
01486 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
01487 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
01488 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
01489 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
01490 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
01491 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
01492 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
01493 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
01494 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
01495 #endif
01496 }
01497 }
01498 }
01499 #endif
01500
01501 }
01502
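// Kernel wrapper: one thread per modified exclusion, followed by block-wide reduction
// of the LJ and electrostatic energies (and virials) into energies_virials.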
01503 template <typename T, bool doEnergy, bool doVirial, bool doElect>
01504 __global__ void modifiedExclusionForcesKernel(
01505 const int start,
01506
01507 const int numModifiedExclusions,
01508 const CudaExclusion* __restrict__ modifiedExclusions,
01509
01510 const bool doSlow,
01511 const float one_scale14,
01512 const float cutoff2,
01513 const int vdwCoefTableWidth,
01514 const float2* __restrict__ vdwCoefTable,
01515 cudaTextureObject_t vdwCoefTableTex,
01516 cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex,
01517 const float4* __restrict__ xyzq,
01518 const int stride,
01519 const float3 lata, const float3 latb, const float3 latc,
01520 T* __restrict__ forceNbond, T* __restrict__ forceSlow,
01521 T* __restrict__ forceList,
01522 int* forceListCounter,
01523 int* forceListStartsNbond,
01524 int* forceListStartsSlow,
01525 int* forceListNexts,
01526 double* __restrict__ energies_virials
01527 ) {
01528
01529
01530 int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
01531
01532 double energyVdw, energyNbond, energySlow;
01533 if (doEnergy) {
01534 energyVdw = 0.0;
01535 if (doElect) {
01536 energyNbond = 0.0;
01537 energySlow = 0.0;
01538 }
01539 }
01540
01541 #ifdef WRITE_FULL_VIRIALS
01542 ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
01543 ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
01544 if (doVirial) {
01545 virialNbond.xx = 0.0;
01546 virialNbond.xy = 0.0;
01547 virialNbond.xz = 0.0;
01548 virialNbond.yx = 0.0;
01549 virialNbond.yy = 0.0;
01550 virialNbond.yz = 0.0;
01551 virialNbond.zx = 0.0;
01552 virialNbond.zy = 0.0;
01553 virialNbond.zz = 0.0;
01554 if (doElect) {
01555 virialSlow.xx = 0.0;
01556 virialSlow.xy = 0.0;
01557 virialSlow.xz = 0.0;
01558 virialSlow.yx = 0.0;
01559 virialSlow.yy = 0.0;
01560 virialSlow.yz = 0.0;
01561 virialSlow.zx = 0.0;
01562 virialSlow.zy = 0.0;
01563 virialSlow.zz = 0.0;
01564 }
01565 }
01566 #endif
01567
01568 if (i < numModifiedExclusions)
01569 {
01570 modifiedExclusionForce<T, doEnergy, doVirial, doElect>
01571 (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
01572 #if __CUDA_ARCH__ >= 350
01573 vdwCoefTable,
01574 #else
01575 vdwCoefTableTex,
01576 #endif
01577 modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
01578 xyzq, stride, lata, latb, latc, cutoff2,
01579 energyVdw, forceNbond, energyNbond,
01580 forceSlow, energySlow,
01581 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
01582 virialNbond, virialSlow);
01583 }
01584
01585
01586 if (doEnergy) {
01587 __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
01588 __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
01589 __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
01590
01591
01592 #pragma unroll
01593 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01594 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
01595 if (doElect) {
01596 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
01597 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
01598 }
01599 }
01600 int laneID = (threadIdx.x & (WARPSIZE - 1));
01601 int warpID = threadIdx.x / WARPSIZE;
01602 if (laneID == 0) {
01603 shEnergyVdw[warpID] = energyVdw;
01604 if (doElect) {
01605 shEnergyNbond[warpID] = energyNbond;
01606 shEnergySlow[warpID] = energySlow;
01607 }
01608 }
01609 BLOCK_SYNC;
01610 if (warpID == 0) {
01611 energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
01612 if (doElect) {
01613 energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
01614 energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
01615 }
01616 #pragma unroll
01617 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01618 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
01619 if (doElect) {
01620 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
01621 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
01622 }
01623 }
01624 if (laneID == 0) {
01625 const int bin = blockIdx.x % ATOMIC_BINS;
01626 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
01627 if (doElect) {
01628 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
01629 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
01630 }
01631 }
01632 }
01633 }
01634
01635
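// Virial reduction (full 3x3 tensors): the same two-stage shuffle / shared-memory
// pattern as the energies; the slow-force virial is reduced only when both
// electrostatics and slow forces are active.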
01636 #ifdef WRITE_FULL_VIRIALS
01637 if (doVirial) {
01638 #pragma unroll
01639 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01640 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
01641 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
01642 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
01643 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
01644 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
01645 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
01646 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
01647 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
01648 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
01649 if (doElect && doSlow) {
01650 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
01651 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
01652 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
01653 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
01654 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
01655 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
01656 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
01657 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
01658 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
01659 }
01660 }
01661 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
01662 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
01663 int laneID = (threadIdx.x & (WARPSIZE - 1));
01664 int warpID = threadIdx.x / WARPSIZE;
01665 if (laneID == 0) {
01666 shVirialNbond[warpID] = virialNbond;
01667 if (doElect && doSlow) {
01668 shVirialSlow[warpID] = virialSlow;
01669 }
01670 }
01671 BLOCK_SYNC;
01672
01673 virialNbond.xx = 0.0;
01674 virialNbond.xy = 0.0;
01675 virialNbond.xz = 0.0;
01676 virialNbond.yx = 0.0;
01677 virialNbond.yy = 0.0;
01678 virialNbond.yz = 0.0;
01679 virialNbond.zx = 0.0;
01680 virialNbond.zy = 0.0;
01681 virialNbond.zz = 0.0;
01682 if (doElect && doSlow) {
01683 virialSlow.xx = 0.0;
01684 virialSlow.xy = 0.0;
01685 virialSlow.xz = 0.0;
01686 virialSlow.yx = 0.0;
01687 virialSlow.yy = 0.0;
01688 virialSlow.yz = 0.0;
01689 virialSlow.zx = 0.0;
01690 virialSlow.zy = 0.0;
01691 virialSlow.zz = 0.0;
01692 }
01693
01694 if (warpID == 0) {
01695 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
01696 virialNbond = shVirialNbond[laneID];
01697 if (doElect && doSlow) {
01698 virialSlow = shVirialSlow[laneID];
01699 }
01700 }
01701 #pragma unroll
01702 for (int i=WARPSIZE/2;i >= 1;i/=2) {
01703 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
01704 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
01705 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
01706 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
01707 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
01708 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
01709 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
01710 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
01711 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
01712 if (doElect && doSlow) {
01713 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
01714 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
01715 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
01716 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
01717 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
01718 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
01719 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
01720 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
01721 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
01722 }
01723 }
01724
01725 if (laneID == 0)
01726 {
01727 #ifdef USE_FP_VIRIAL
01728 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], llitoulli(virialNbond.xx*double_to_virial));
01729 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], llitoulli(virialNbond.xy*double_to_virial));
01730 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], llitoulli(virialNbond.xz*double_to_virial));
01731 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], llitoulli(virialNbond.yx*double_to_virial));
01732 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], llitoulli(virialNbond.yy*double_to_virial));
01733 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], llitoulli(virialNbond.yz*double_to_virial));
01734 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], llitoulli(virialNbond.zx*double_to_virial));
01735 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], llitoulli(virialNbond.zy*double_to_virial));
01736 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], llitoulli(virialNbond.zz*double_to_virial));
01737 if (doElect && doSlow) {
01738 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], llitoulli(virialSlow.xx*double_to_virial));
01739 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], llitoulli(virialSlow.xy*double_to_virial));
01740 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], llitoulli(virialSlow.xz*double_to_virial));
01741 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], llitoulli(virialSlow.yx*double_to_virial));
01742 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], llitoulli(virialSlow.yy*double_to_virial));
01743 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], llitoulli(virialSlow.yz*double_to_virial));
01744 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], llitoulli(virialSlow.zx*double_to_virial));
01745 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], llitoulli(virialSlow.zy*double_to_virial));
01746 atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], llitoulli(virialSlow.zz*double_to_virial));
01747 }
01748 #else
01749 const int bin = blockIdx.x % ATOMIC_BINS;
01750 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
01751 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
01752 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
01753 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
01754 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
01755 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
01756 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
01757 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
01758 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
01759 if (doElect && doSlow) {
01760 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
01761 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
01762 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
01763 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
01764 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
01765 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
01766 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
01767 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
01768 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
01769 }
01770 #endif
01771 }
01772 }
01773 }
01774 #endif
01775
01776 }
01777
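// Used when USE_BONDED_FORCE_ATOMIC_STORE is not defined: the force kernels store each
// per-atom contribution in forceList (indexed, presumably, through forceListCounter) and
// link it into a per-atom singly linked list via forceListStarts / forceListNexts.
// This kernel walks one atom's list, sums the x/y/z components, and writes the totals
// into the strided force array.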
01778 template <typename T>
01779 __global__ void gatherBondedForcesKernel(
01780 const int start,
01781 const int atomStorageSize,
01782 const int stride,
01783 const T* __restrict__ forceList,
01784 const int* __restrict__ forceListStarts,
01785 const int* __restrict__ forceListNexts,
01786 T* __restrict__ force) {
01787
01788 int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
01789
01790 if (i < atomStorageSize) {
01791 T fx = 0;
01792 T fy = 0;
01793 T fz = 0;
01794 int pos = forceListStarts[i];
01795 while (pos != -1) {
01796 fx += forceList[pos * 3 + 0];
01797 fy += forceList[pos * 3 + 1];
01798 fz += forceList[pos * 3 + 2];
01799 pos = forceListNexts[pos];
01800 }
01801 force[i ] = fx;
01802 force[i + stride ] = fy;
01803 force[i + stride * 2] = fz;
01804 }
01805 }
01806
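// One block per energy/virial slot (blockIdx.x); the ATOMIC_BINS threads each read the
// slot's value from their own bin and cub::WarpReduce sums them. Thread 0 stores the
// total into bin 0, which is the copy transferred back to the host.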
01807 __global__ void reduceBondedBinsKernel(double* energies_virials) {
01808 const int bin = threadIdx.x;
01809
01810 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
01811 __shared__ typename WarpReduce::TempStorage tempStorage;
01812
01813 double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
01814 if (threadIdx.x == 0) {
01815 energies_virials[blockIdx.x] = v;
01816 }
01817 }
01818
01819
01820
01821
01822
01823
01824
01825
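// Constructor: selects the device, zeroes the pointers and sizes of the lazily
// (re)allocated buffers, and allocates the force-list counter and the per-bin
// energy/virial accumulator.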
01826 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
01827 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
01828
01829 cudaCheck(cudaSetDevice(deviceID));
01830
01831 tupleData = NULL;
01832 tupleDataSize = 0;
01833
01834 numBonds = 0;
01835 numAngles = 0;
01836 numDihedrals = 0;
01837 numImpropers = 0;
01838 numModifiedExclusions = 0;
01839 numExclusions = 0;
01840 numCrossterms = 0;
01841
01842 bondValues = NULL;
01843 angleValues = NULL;
01844 dihedralValues = NULL;
01845 improperValues = NULL;
01846 crosstermValues = NULL;
01847
01848 xyzq = NULL;
01849 xyzqSize = 0;
01850
01851 forces = NULL;
01852 forcesSize = 0;
01853
01854 forceList = NULL;
01855 forceListStarts = NULL;
01856 forceListNexts = NULL;
01857 forceListSize = 0;
01858 forceListStartsSize = 0;
01859 forceListNextsSize = 0;
01860 allocate_device<int>(&forceListCounter, 1);
01861
01862 allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
01863 }
01864
01865
01866
01867
01868 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
01869 cudaCheck(cudaSetDevice(deviceID));
01870
01871 deallocate_device<double>(&energies_virials);
01872
01873
01874 if (tupleData != NULL) deallocate_device<char>(&tupleData);
01875 if (xyzq != NULL) deallocate_device<float4>(&xyzq);
01876 if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
01877
01878 if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
01879 if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
01880 if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
01881 if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
01882
01883 if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
01884 if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
01885 if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
01886 if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
01887 if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
01888 }
01889
01890 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
01891 allocate_device<CudaBondValue>(&bondValues, numBondValues);
01892 copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
01893 }
01894
01895 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
01896 allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
01897 copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
01898 }
01899
01900 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
01901 allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
01902 copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
01903 }
01904
01905 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
01906 allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
01907 copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
01908 }
01909
01910 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
01911 allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
01912 copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
01913 }
01914
01915
01916
01917
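// Update the tuple lists. The host packs all tuple types, each padded to a warp-aligned
// count, into a single contiguous buffer; this copies that buffer to the device and sets
// the typed pointers (bonds, angles, dihedrals, ...) to their offsets within it.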
01918 void ComputeBondedCUDAKernel::update(
01919 const int numBondsIn,
01920 const int numAnglesIn,
01921 const int numDihedralsIn,
01922 const int numImpropersIn,
01923 const int numModifiedExclusionsIn,
01924 const int numExclusionsIn,
01925 const int numCrosstermsIn,
01926 const char* h_tupleData,
01927 cudaStream_t stream) {
01928
01929 numBonds = numBondsIn;
01930 numAngles = numAnglesIn;
01931 numDihedrals = numDihedralsIn;
01932 numImpropers = numImpropersIn;
01933 numModifiedExclusions = numModifiedExclusionsIn;
01934 numExclusions = numExclusionsIn;
01935 numCrossterms = numCrosstermsIn;
01936
01937 int numBondsWA = warpAlign(numBonds);
01938 int numAnglesWA = warpAlign(numAngles);
01939 int numDihedralsWA = warpAlign(numDihedrals);
01940 int numImpropersWA = warpAlign(numImpropers);
01941 int numModifiedExclusionsWA = warpAlign(numModifiedExclusions);
01942 int numExclusionsWA = warpAlign(numExclusions);
01943 int numCrosstermsWA = warpAlign(numCrossterms);
01944
01945 int sizeTot = numBondsWA*sizeof(CudaBond) + numAnglesWA*sizeof(CudaAngle) +
01946 numDihedralsWA*sizeof(CudaDihedral) + numImpropersWA*sizeof(CudaDihedral) +
01947 numModifiedExclusionsWA*sizeof(CudaExclusion) + numExclusionsWA*sizeof(CudaExclusion) +
01948 numCrosstermsWA*sizeof(CudaCrossterm);
01949
01950 reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, 1.4f);
01951 copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
01952
01953
01954 int pos = 0;
01955 bonds = (CudaBond *)&tupleData[pos];
01956 pos += numBondsWA*sizeof(CudaBond);
01957
01958 angles = (CudaAngle* )&tupleData[pos];
01959 pos += numAnglesWA*sizeof(CudaAngle);
01960
01961 dihedrals = (CudaDihedral* )&tupleData[pos];
01962 pos += numDihedralsWA*sizeof(CudaDihedral);
01963
01964 impropers = (CudaDihedral* )&tupleData[pos];
01965 pos += numImpropersWA*sizeof(CudaDihedral);
01966
01967 modifiedExclusions = (CudaExclusion* )&tupleData[pos];
01968 pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
01969
01970 exclusions = (CudaExclusion* )&tupleData[pos];
01971 pos += numExclusionsWA*sizeof(CudaExclusion);
01972
01973 crossterms = (CudaCrossterm* )&tupleData[pos];
01974 pos += numCrosstermsWA*sizeof(CudaCrossterm);
01975 }
01976
01977
01978
01979
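// Per-component stride of the force array. With USE_STRIDED_FORCE the stride is rounded
// up so each component starts on a 256-byte boundary; e.g. if FORCE_TYPE is an 8-byte
// type and atomStorageSize = 1000, the stride is ((8000-1)/256 + 1)*256/8 = 1024.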
01980 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
01981 #ifdef USE_STRIDED_FORCE
01982
01983 return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
01984 #else
01985 return 1;
01986 #endif
01987 }
01988
01989
01990
01991
01992 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
01993 #ifdef USE_STRIDED_FORCE
01994 return (3*getForceStride(atomStorageSize));
01995 #else
01996 return (3*atomStorageSize);
01997 #endif
01998 }
01999
02000
02001
02002
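// Total number of force-array elements: the normal bonded forces, plus a separate
// nonbonded (modified exclusion) array and, when doSlow, a slow electrostatic array
// whenever any exclusions are present.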
02003 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
02004
02005 int forceSize = getForceSize(atomStorageSize);
02006
02007 if (numModifiedExclusions > 0 || numExclusions > 0) {
02008 if (doSlow) {
02009
02010 forceSize *= 3;
02011 } else {
02012
02013 forceSize *= 2;
02014 }
02015 }
02016
02017 return forceSize;
02018 }
02019
02020
02021
02022
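// Computes the bonded forces. Copies coordinates to the device, launches the bonded
// tuple kernel and the modified-exclusion kernel in grid-size-limited chunks, gathers
// the per-atom force lists when atomic stores are disabled, and finally copies the
// forces and (optionally) the reduced energies/virials back to the host.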
02023 void ComputeBondedCUDAKernel::bondedForce(
02024 const double scale14, const int atomStorageSize,
02025 const bool doEnergy, const bool doVirial, const bool doSlow,
02026 const float3 lata, const float3 latb, const float3 latc,
02027 const float cutoff2, const float r2_delta, const int r2_delta_expc,
02028 const float4* h_xyzq, FORCE_TYPE* h_forces,
02029 double *h_energies_virials,
02030 cudaStream_t stream) {
02031
02032 int forceStorageSize = getAllForceSize(atomStorageSize, true);
02033 int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
02034 int forceStride = getForceStride(atomStorageSize);
02035
02036 int forceSize = getForceSize(atomStorageSize);
02037 int startNbond = forceSize;
02038 int startSlow = 2*forceSize;
02039
02040
02041 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
02042 reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
02043
02044 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
02045
02046
02047
02048
02049
02050
02051
02052
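// Size of the per-contribution force list: three components for every atom touched by a
// tuple (2 atoms per bond, 3 per angle, 4 per dihedral/improper, 2 per exclusion,
// 8 per crossterm, 4 per modified exclusion).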
02053 int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4);
02054 reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
02055 reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
02056 reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
02057 int* forceListStartsNbond = forceListStarts + atomStorageSize;
02058 int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
02059
02060 clear_device_array<int>(forceListCounter, 1, stream);
02061 cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
02062 #else
02063 int* forceListStartsNbond = NULL;
02064 int* forceListStartsSlow = NULL;
02065 #endif
02066
02067
02068 copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
02069
02070
02071 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
02072 clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
02073 #endif
02074 if (doEnergy || doVirial) {
02075 clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
02076 }
02077
02078 float one_scale14 = (float)(1.0 - scale14);
02079
02080
02081 int numExclusionsDoSlow = doSlow ? numExclusions : 0;
02082
02083 int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
02084
02085 int numBondsTB = (numBonds + nthread - 1)/nthread;
02086 int numAnglesTB = (numAngles + nthread - 1)/nthread;
02087 int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
02088 int numImpropersTB = (numImpropers + nthread - 1)/nthread;
02089 int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
02090 int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
02091
02092 int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
02093 numExclusionsTB + numCrosstermsTB;
02094 int shmem_size = 0;
02095
02096
02097
02098
02099 int max_nblock = deviceCUDA->getMaxNumBlocks();
02100
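// The required grid may exceed the device's block limit, so launch in chunks of at most
// max_nblock blocks and pass 'start' so each chunk computes its global tuple indices.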
02101 int start = 0;
02102 while (start < nblock)
02103 {
02104 int nleft = nblock - start;
02105 int nblock_use = min(max_nblock, nleft);
02106
02107
02108 #ifdef NAMD_HIP
02109 #define NONBONDEDTABLES cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable()
02110 #else
02111 #define NONBONDEDTABLES cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
02112 cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex()
02113 #endif
02114
02115 #ifdef NAMD_HIP
02116 #define CALL(DOENERGY, DOVIRIAL) \
02117 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
02118 (start, numBonds, bonds, bondValues, \
02119 numAngles, angles, angleValues, \
02120 numDihedrals, dihedrals, dihedralValues, \
02121 numImpropers, impropers, improperValues, \
02122 numExclusionsDoSlow, exclusions, \
02123 numCrossterms, crossterms, crosstermValues, \
02124 cutoff2, \
02125 r2_delta, r2_delta_expc, \
02126 cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable() , \
02127 xyzq, forceStride, \
02128 lata, latb, latc, \
02129 forces, &forces[startSlow], \
02130 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
02131 energies_virials);
02132 #else
02133 #define CALL(DOENERGY, DOVIRIAL) \
02134 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
02135 (start, numBonds, bonds, bondValues, \
02136 numAngles, angles, angleValues, \
02137 numDihedrals, dihedrals, dihedralValues, \
02138 numImpropers, impropers, improperValues, \
02139 numExclusionsDoSlow, exclusions, \
02140 numCrossterms, crossterms, crosstermValues, \
02141 cutoff2, \
02142 r2_delta, r2_delta_expc, \
02143 cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable() , \
02144 cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex() , \
02145 xyzq, forceStride, \
02146 lata, latb, latc, \
02147 forces, &forces[startSlow], \
02148 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
02149 energies_virials);
02150 #endif
02151
02152 if (!doEnergy && !doVirial) CALL(0, 0);
02153 if (!doEnergy && doVirial) CALL(0, 1);
02154 if (doEnergy && !doVirial) CALL(1, 0);
02155 if (doEnergy && doVirial) CALL(1, 1);
02156
02157 #undef CALL
02158 cudaCheck(cudaGetLastError());
02159
02160 start += nblock_use;
02161 }
02162
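// Second pass: modified exclusion (scaled 1-4) interactions. When scale14 == 1.0,
// one_scale14 is zero and the electrostatic correction vanishes, so doElect is turned
// off and only the van der Waals part of the kernel is instantiated.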
02163 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
02164 nblock = (numModifiedExclusions + nthread - 1)/nthread;
02165
02166 bool doElect = (one_scale14 != 0.0f);
02167
02168 start = 0;
02169 while (start < nblock)
02170 {
02171 int nleft = nblock - start;
02172 int nblock_use = min(max_nblock, nleft);
02173
02174 #define CALL(DOENERGY, DOVIRIAL, DOELECT) \
02175 modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
02176 <<< nblock_use, nthread, shmem_size, stream >>> (\
02177 start, numModifiedExclusions, modifiedExclusions, \
02178 doSlow, one_scale14, cutoff2, \
02179 cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
02180 cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
02181 cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
02182 xyzq, forceStride, lata, latb, latc, \
02183 &forces[startNbond], &forces[startSlow], \
02184 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
02185 energies_virials);
02186
02187
02188 if (!doEnergy && !doVirial && !doElect) CALL(0, 0, 0);
02189 if (!doEnergy && doVirial && !doElect) CALL(0, 1, 0);
02190 if (doEnergy && !doVirial && !doElect) CALL(1, 0, 0);
02191 if (doEnergy && doVirial && !doElect) CALL(1, 1, 0);
02192
02193 if (!doEnergy && !doVirial && doElect) CALL(0, 0, 1);
02194 if (!doEnergy && doVirial && doElect) CALL(0, 1, 1);
02195 if (doEnergy && !doVirial && doElect) CALL(1, 0, 1);
02196 if (doEnergy && doVirial && doElect) CALL(1, 1, 1);
02197
02198 #undef CALL
02199 cudaCheck(cudaGetLastError());
02200
02201 start += nblock_use;
02202 }
02203 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
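// Third pass: gather the per-atom linked lists written by the force kernels into the
// three contiguous force arrays (bonded, nonbonded, and, if doSlow, slow).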
02204 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
02205 nblock = (atomStorageSize + nthread - 1)/nthread;
02206
02207 start = 0;
02208 while (start < nblock)
02209 {
02210 int nleft = nblock - start;
02211 int nblock_use = min(max_nblock, nleft);
02212
02213
02214
02215
02216 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
02217 start, atomStorageSize, forceStride,
02218 forceList, forceListStarts, forceListNexts,
02219 forces);
02220 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
02221 start, atomStorageSize, forceStride,
02222 forceList, forceListStartsNbond, forceListNexts,
02223 &forces[startNbond]);
02224 if (doSlow) {
02225 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
02226 start, atomStorageSize, forceStride,
02227 forceList, forceListStartsSlow, forceListNexts,
02228 &forces[startSlow]);
02229 }
02230 cudaCheck(cudaGetLastError());
02231
02232
02233
02234
02235
02236
02237
02238 start += nblock_use;
02239 }
02240 #endif
02241
02242 copy_DtoH<FORCE_TYPE>(forces, h_forces, forceCopySize, stream);
02243 if (doEnergy || doVirial) {
02244 if (ATOMIC_BINS > 1) {
02245
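// Fold the ATOMIC_BINS partial copies of each energy/virial slot into bin 0 before the
// host copy below.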
02246 reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
02247 }
02248 copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
02249 }
02250
02251 }