AVXTiles::AVXTiles() : _numAtoms(0), _numTiles(0), _numTilesAlloc(0),

AVXTiles::~AVXTiles() {
#ifdef MEM_OPT_VERSION
  free(exclIndexMaxDiff);

void AVXTiles::_realloc() {
#ifdef MEM_OPT_VERSION
  free(exclIndexMaxDiff);
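
  // Grow the tile allocation by 20% over the current tile count; each tile
  // holds 16 atoms, so atomsAlloc is _numTilesAlloc << 4, and every array is
  // allocated 64-byte aligned for AVX-512 loads.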
  _numTilesAlloc = 1.2 * _numTiles;
  const int atomsAlloc = _numTilesAlloc << 4;
  int e = posix_memalign((void **)&forces, 64,
                         sizeof(AVXTilesForce)*atomsAlloc);
  e = e | posix_memalign((void **)&forcesSlow, 64,
                         sizeof(AVXTilesForce)*atomsAlloc);
  e = e | posix_memalign((void **)&vdwTypes, 64, sizeof(int)*atomsAlloc);
  e = e | posix_memalign((void **)&atomIndex, 64, sizeof(int)*atomsAlloc);
#ifdef MEM_OPT_VERSION
  e = e | posix_memalign((void **)&atomExclIndex, 64, sizeof(int)*atomsAlloc);
#endif
  e = e | posix_memalign((void **)&exclIndexStart, 64, sizeof(int)*atomsAlloc);
  e = e | posix_memalign((void **)&exclIndexMaxDiff, 64,
                         sizeof(int)*atomsAlloc);
  e = e | posix_memalign((void **)&reverseOrder, 64, sizeof(int)*atomsAlloc);
  e = e | posix_memalign((void **)&bbox_x, 64, sizeof(float)*_numTilesAlloc);
  e = e | posix_memalign((void **)&bbox_y, 64, sizeof(float)*_numTilesAlloc);
  e = e | posix_memalign((void **)&bbox_z, 64, sizeof(float)*_numTilesAlloc);
  e = e | posix_memalign((void **)&bbox_wx, 64, sizeof(float)*_numTilesAlloc);
  e = e | posix_memalign((void **)&bbox_wy, 64, sizeof(float)*_numTilesAlloc);
  e = e | posix_memalign((void **)&bbox_wz, 64, sizeof(float)*_numTilesAlloc);
  if (e) NAMD_die("Could not allocate memory for tiles.");
void AVXTiles::atomUpdate(const CompAtom *compAtom,
                          const CompAtomExt *compAtomExt) {
  int numFreeAtoms = _numAtoms;
  for (int k = 0; k < _numAtoms; ++k) {
    const int id = compAtomExt[j].id;
    if (compAtomExt[j].atomFixed) --numFreeAtoms;
#ifdef MEM_OPT_VERSION
    const int exclId = compAtomExt[j].exclId;
    atomExclIndex[k] = exclId;
    const ExclusionCheck *exclcheck = mol->get_excl_check_for_idx(exclId);
    exclIndexStart[k] = id + exclcheck->min;
    exclIndexMaxDiff[k] = id + exclcheck->max;
#else // ! MEM_OPT_VERSION
    const ExclusionCheck *exclcheck = mol->get_excl_check_for_atom(id);
    exclIndexStart[k] = exclcheck->min;
    exclIndexMaxDiff[k] = exclcheck->max;
#endif // MEM_OPT_VERSION
  }

  _numFreeAtoms = numFreeAtoms;
  const int end = _numTiles << 4;
  for (int i = _numAtoms; i < end; i++) vdwTypes[i] = 0;
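
// For each 16-atom tile, transpose the x/y/z coordinates out of the
// array-of-structures atom data with AVX-512 permutes, reduce them to
// per-tile minima and maxima, and store the box center (bbox_x/y/z) and
// half-widths (bbox_wx/wy/wz).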
void AVXTiles::_buildBoundingBoxes(const int step) {
  for (int tile = 0; tile < _numTiles; tile++) {
    const float *iptr = (float *)(atoms + (tile << 4));
    const __m512 t0 = _mm512_loadu_ps(iptr);
    const __m512 t1 = _mm512_loadu_ps(iptr + 16);
    const __m512 t2 = _mm512_loadu_ps(iptr + 32);
    const __m512 t3 = _mm512_loadu_ps(iptr + 48);
    const __m512i t4 = _mm512_set_epi32(29,25,21,17,13,9,5,1,
                                        28,24,20,16,12,8,4,0);
    const __m512 t5 = _mm512_permutex2var_ps(t0, t4, t1);
    const __m512i t6 = _mm512_set_epi32(28,24,20,16,12,8,4,0,
                                        29,25,21,17,13,9,5,1);
    const __m512 t7 = _mm512_permutex2var_ps(t2, t6, t3);
    const __mmask16 t9 = 0xFF00;
    const __m512i t5i = _mm512_castps_si512(t5);
    const __m512i t7i = _mm512_castps_si512(t7);
    const __m512 x = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t5i,
                                                                 t7i));
    const __m512 y = _mm512_shuffle_f32x4(t5, t7, 0x4E);
    const __m512i t12 = _mm512_set_epi32(31,27,23,19,15,11,7,3,30,26,22,
                                         18,14,10,6,2);
    const __m512 t13 = _mm512_permutex2var_ps(t0, t12, t1);
    const __m512i t14 = _mm512_set_epi32(30,26,22,18,14,10,6,2,31,27,23,
                                         19,15,11,7,3);
    const __m512 t15 = _mm512_permutex2var_ps(t2, t14, t3);
    const __m512i t13i = _mm512_castps_si512(t13);
    const __m512i t15i = _mm512_castps_si512(t15);
    const __m512 z = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t13i,
                                                                 t15i));
    const float min_x = _mm512_reduce_min_ps(x);
    const float max_x = _mm512_reduce_max_ps(x);
    const float min_y = _mm512_reduce_min_ps(y);
    const float max_y = _mm512_reduce_max_ps(y);
    const float min_z = _mm512_reduce_min_ps(z);
    const float max_z = _mm512_reduce_max_ps(z);
    bbox_x[tile] = 0.5f*(min_x + max_x);
    bbox_y[tile] = 0.5f*(min_y + max_y);
    bbox_z[tile] = 0.5f*(min_z + max_z);
    bbox_wx[tile] = 0.5f*(max_x - min_x);
    bbox_wy[tile] = 0.5f*(max_y - min_y);
    bbox_wz[tile] = 0.5f*(max_z - min_z);
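
// Accumulate the tile-ordered forces back into NAMD's native per-atom force
// arrays. doSlow, doVirial, and touched are compile-time template parameters,
// so the OpenMP SIMD loop is specialized without per-iteration branching;
// reverseOrder maps a native atom index to its slot in the tile arrays.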
template <int doSlow, int doVirial, int touched>
void AVXTiles::_nativeForceVirialUpdate(const CompAtom *p,
      const Vector &center, Force * __restrict__ nativeForces,
      Force * __restrict__ nativeForcesSlow,
      const Force * __restrict__ nativeForcesVirial,
      const Force * __restrict__ nativeForcesSlowVirial,
      double virial[6], double virialSlow[6]) {
  BigReal v_xx, v_xy, v_xz, v_yy, v_yz, v_zz;
  BigReal vS_xx, vS_xy, vS_xz, vS_yy, vS_yz, vS_zz;
  v_xx = v_xy = v_xz = v_yy = v_yz = v_zz = 0.0;
  if (doSlow) vS_xx = vS_xy = vS_xz = vS_yy = vS_yz = vS_zz = 0.0;
  const int * __restrict__ reverseOrder = this->reverseOrder;
  const AVXTilesForce * __restrict__ forces = this->forces;
  const AVXTilesForce * __restrict__ forcesSlow = this->forcesSlow;
  #pragma omp simd aligned(reverseOrder, forces, forcesSlow:64) \
    reduction(+:v_xx, v_xy, v_xz, v_yy, v_yz, v_zz, \
                vS_xx, vS_xy, vS_xz, vS_yy, vS_yz, vS_zz)
  for (int i = 0; i < _numAtoms; i++) {
    BigReal f_x = nativeForcesVirial[i].x;
    BigReal f_y = nativeForcesVirial[i].y;
    BigReal f_z = nativeForcesVirial[i].z;
    const int order = reverseOrder[i];
    f_x += forces[order].x;
    f_y += forces[order].y;
    f_z += forces[order].z;
    nativeForces[i].x += f_x;
    nativeForces[i].y += f_y;
    nativeForces[i].z += f_z;
    if (doSlow) {
      BigReal f_x = nativeForcesSlowVirial[i].x;
      BigReal f_y = nativeForcesSlowVirial[i].y;
      BigReal f_z = nativeForcesSlowVirial[i].z;
      const int order = reverseOrder[i];
      f_x += forcesSlow[order].x;
      f_y += forcesSlow[order].y;
      f_z += forcesSlow[order].z;
      nativeForcesSlow[i].x += f_x;
      nativeForcesSlow[i].y += f_y;
      nativeForcesSlow[i].z += f_z;
  virialSlow[0] = vS_xx;
  virialSlow[1] = vS_xy;
  virialSlow[2] = vS_xz;
  virialSlow[3] = vS_yy;
  virialSlow[4] = vS_yz;
  virialSlow[5] = vS_zz;
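
// Public entry point: map the runtime doSlow/doVirial flags onto the template
// specializations above via the CALL macro.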
void AVXTiles::nativeForceVirialUpdate(const int doSlow, const int doVirial,
      const CompAtom *p, const Vector &center,
      Force * __restrict__ nativeForces,
      Force * __restrict__ nativeForcesSlow,
      const Force * __restrict__ nativeForcesVirial,
      const Force * __restrict__ nativeForcesSlowVirial,
      double virial[6], double virialSlow[6]) {
#define CALL(DOSLOW, DOVIRIAL, TOUCHED)                          \
  _nativeForceVirialUpdate<DOSLOW, DOVIRIAL,                     \
    TOUCHED>(p, center, nativeForces, nativeForcesSlow,          \
             nativeForcesVirial,                                 \
             nativeForcesSlowVirial, virial,                     \
             virialSlow)

  // TOUCHED = 1 variants
  if (!doSlow && !doVirial) CALL(0, 0, 1);
  if (!doSlow && doVirial) CALL(0, 1, 1);
  if (doSlow && doVirial) CALL(1, 1, 1);
  if (doSlow && !doVirial) CALL(1, 0, 1);

  // TOUCHED = 0 variants
  if (!doSlow && !doVirial) CALL(0, 0, 0);
  if (!doSlow && doVirial) CALL(0, 1, 0);
  if (doSlow && doVirial) CALL(1, 1, 0);
  if (doSlow && !doVirial) CALL(1, 0, 0);

#endif // NAMD_AVXTILES