AVXJTiles::AVXJTiles() : _numTiles(0), _numTilesAlloc(0) {
}

AVXJTiles::~AVXJTiles() {
  if (_numTilesAlloc) {
    free(excl);
    free(atomStart);
    free(status);
  }
}

void AVXJTiles::_realloc() {
  if (_numTilesAlloc) {
    free(excl);
    free(atomStart);
    free(status);
  }
  _numTilesAlloc = 1.2 * _numTiles;
  // 16 exclusion words per tile, hence the << 4 on the element count.
  int e = posix_memalign((void **)&excl, 64,
                         (sizeof(unsigned int) * _numTilesAlloc) << 4);
  e = e | posix_memalign((void **)&atomStart, 64,
                         sizeof(int) * _numTilesAlloc);
  e = e | posix_memalign((void **)&status, 64,
                         sizeof(int) * _numTilesAlloc);
  if (e) NAMD_die("Could not allocate memory for tiles.");
}
AVXTileLists::AVXTileLists() : _numLists(0), _numListsAlloc(0),
                               _numModifiedAlloc(0), _numExcludedAlloc(0),
                               _maxPairLists(0), _maxPairs(0)
{
  #ifndef MEM_OPT_VERSION
  _exclFlyListBuffer = 0;
  #endif
}
AVXTileLists::~AVXTileLists() {
  if (_numListsAlloc) {
    free(lists);
    free(listDepth);
  }
  if (_numModifiedAlloc) {
    free(_modified_i);
    free(_modified_j);
  }
  if (_numExcludedAlloc) {
    free(_excluded_i);
    free(_excluded_j);
  }
  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
    if (_maxPairLists) {
      free(_pair_i);
      free(_numPairs);
      free(_pairStart);
      free(_pairLists);
    }
  }
  #ifndef MEM_OPT_VERSION
  delete []_exclFlyListBuffer;
  #endif
}
void AVXTileLists::setSimParams(const float scale, const float scale14,
                                const float c1, const float c3,
                                const float switchOn2, float *fastTable,
                                float *fastEnergyTable, float *slowTable,
                                float *slowEnergyTable, float *eps4sigma,
                                float *eps4sigma14, float *ljTable,
                                const float ljTableWidth, float *modifiedTable,
                                float *modifiedEnergyTable,
                                float *excludedTable,
                                float *excludedEnergyTable,
                                const int interpolationMode) {
  _interpolationMode = interpolationMode;
  _paramScale = scale;
  if (_interpolationMode > 1)
    _paramScale = 2.0 * scale;
  _paramScale14 = scale14;
  _paramC1 = c1;
  _paramC3 = c3;
  _paramSwitchOn2 = switchOn2;
  if (_interpolationMode == 3) {
    _paramFastTable = fastTable;
    _paramFastEnergyTable = fastEnergyTable;
  }
  _paramSlowTable = slowTable;
  _paramSlowEnergyTable = slowEnergyTable;
  _paramModifiedTable = modifiedTable;
  _paramModifiedEnergyTable = modifiedEnergyTable;
  _paramExcludedTable = excludedTable;
  _paramExcludedEnergyTable = excludedEnergyTable;
  if (_interpolationMode > 1) {
    _paramLjTable = ljTable;
    _paramLjWidth = ljTableWidth;
  } else {
    _paramEps4Sigma = eps4sigma;
    _paramEps4Sigma14 = eps4sigma14;
  }
}
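// Inference from the branches above (not documented here): the slow,
// modified, and excluded tables are stored in every mode; mode 3
// additionally interpolates the fast (short-range) kernel from tables;
// and modes >= 2 use the packed LJ parameter table where mode 1 uses the
// eps4sigma parameters analytically.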
template <bool count, bool partitionMode>
int AVXTileLists::_buildBB() {
  int numJtiles = 0;

  // Run a counting-only pass first whenever the output arrays might be too
  // small for the tiles that will be kept.
  if (partitionMode || jTiles.maxTiles() == 0)
    _buildBB<true, partitionMode>();

  const int maxJtiles = jTiles.maxTiles();
  const int numTiles = tiles_p0->numTiles();
  const int numTiles2 = tiles_p1->numTiles();
  const bool zeroShift = ! (_shx*_shx + _shy*_shy + _shz*_shz > 0);
  const bool self = zeroShift && (tiles_p0 == tiles_p1);

  int out_i;
  if (!count) out_i = 0;

  for (int itileList = 0; itileList < numTiles; itileList++) {
    // In the store pass of a partitioned build, skip lists that the
    // counting pass assigned to other partitions (depth zeroed there).
    if (partitionMode && count == false && listDepth[itileList] == 0)
      continue;

    int itileListLen = 0;
    // For self interactions, test only tiles on or above the diagonal.
    const int jinit = (self) ? itileList : 0;
    // Broadcast the (periodic-image shifted) center and half-widths of
    // tile i's bounding box to all 16 lanes.
    const __m512 bbIx = _mm512_set1_ps(tiles_p0->bbox_x[itileList] + _shx);
    const __m512 bbIy = _mm512_set1_ps(tiles_p0->bbox_y[itileList] + _shy);
    const __m512 bbIz = _mm512_set1_ps(tiles_p0->bbox_z[itileList] + _shz);
    const __m512 bbIwx = _mm512_set1_ps(tiles_p0->bbox_wx[itileList]);
    const __m512 bbIwy = _mm512_set1_ps(tiles_p0->bbox_wy[itileList]);
    const __m512 bbIwz = _mm512_set1_ps(tiles_p0->bbox_wz[itileList]);
    // Starting atom index for each of 16 consecutive j tiles (16 atoms
    // per tile), offset to the first j tile considered for this i tile.
    __m512i jAtomStart = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                           128, 144, 160, 176, 192, 208, 224,
                                           240);
    jAtomStart = _mm512_add_epi32(jAtomStart, _mm512_set1_epi32(jinit<<4));
    int jtileStart = numJtiles;

    __mmask16 loopMask = 0xFFFF;
    for (int j = jinit; j < numTiles2; j += 16) {
      // Disable lanes beyond the last j tile on the final iteration.
      if (numTiles2 - j < 16)
        loopMask >>= (16 - (numTiles2 - j));
      // Load bounding boxes for up to 16 j tiles.
      const __m512 bbJx = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
                                               loopMask, tiles_p1->bbox_x + j);
      const __m512 bbJy = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
                                               loopMask, tiles_p1->bbox_y + j);
      const __m512 bbJz = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
                                               loopMask, tiles_p1->bbox_z + j);
      const __m512 bbJwx = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
                                                loopMask, tiles_p1->bbox_wx + j);
      const __m512 bbJwy = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
                                                loopMask, tiles_p1->bbox_wy + j);
      const __m512 bbJwz = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
                                                loopMask, tiles_p1->bbox_wz + j);
      // Per-dimension gap between the two boxes: when the center distance
      // exceeds the sum of half-widths the gap is their difference;
      // otherwise the boxes overlap in that dimension and the gap is zero.
      const __m512 dx_one = _mm512_abs_ps(_mm512_sub_ps(bbIx, bbJx));
      const __m512 dx_two = _mm512_add_ps(bbIwx, bbJwx);
      const __mmask16 lxmask = _mm512_cmplt_ps_mask(dx_two, dx_one);
      const __m512 dx = _mm512_mask_sub_ps(_mm512_setzero_ps(), lxmask,
                                           dx_one, dx_two);
      const __m512 dy_one = _mm512_abs_ps(_mm512_sub_ps(bbIy, bbJy));
      const __m512 dy_two = _mm512_add_ps(bbIwy, bbJwy);
      const __mmask16 lymask = _mm512_cmplt_ps_mask(dy_two, dy_one);
      const __m512 dy = _mm512_mask_sub_ps(_mm512_setzero_ps(), lymask,
                                           dy_one, dy_two);
      const __m512 dz_one = _mm512_abs_ps(_mm512_sub_ps(bbIz, bbJz));
      const __m512 dz_two = _mm512_add_ps(bbIwz, bbJwz);
      const __mmask16 lzmask = _mm512_cmplt_ps_mask(dz_two, dz_one);
      const __m512 dz = _mm512_mask_sub_ps(_mm512_setzero_ps(), lzmask,
                                           dz_one, dz_two);

      // Squared box-box distance; keep lanes within the pairlist cutoff.
      const __m512 r2bb = _mm512_fmadd_ps(dz, dz,
        _mm512_fmadd_ps(dx, dx, _mm512_mul_ps(dy, dy)));
      const __mmask16 keep = _mm512_cmplt_ps_mask(r2bb,
        _mm512_set1_ps(_plcutoff2)) & loopMask;
      const int nKeep = _popcnt32(keep);
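      // In scalar terms, each lane computes per dimension
      //   d = max(0, |c_i - c_j| - (w_i + w_j))
      // and keeps tile j when dx*dx + dy*dy + dz*dz < _plcutoff2, i.e. when
      // the minimum distance between the two bounding boxes lies inside
      // the pairlist cutoff.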
      // In the store pass, pack the starting atom indices of the kept j
      // tiles contiguously after any already stored for this i tile. The
      // bound check is skipped in partitioned builds, where the counting
      // pass has already sized the arrays.
      if (count == false && nKeep)
        if (partitionMode || jtileStart + itileListLen + nKeep <= maxJtiles)
          _mm512_mask_compressstoreu_epi32(jTiles.atomStart + jtileStart +
                                           itileListLen, keep, jAtomStart);

      itileListLen += nKeep;
      // Advance lane indices to the next block of 16 tiles (16 atoms each).
      jAtomStart = _mm512_add_epi32(jAtomStart, _mm512_set1_epi32(256));
    } // loop over j
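    // The masked compress-store in the loop above is the vector form of
    // this scalar sketch (each lane's value is (j + lane) << 4):
    //   for (int lane = 0; lane < 16; lane++)
    //     if (keep & (1 << lane))
    //       jTiles.atomStart[jtileStart + itileListLen++] = (j + lane) << 4;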
    if (count == false) {
      if (itileListLen > 0) {
        // Record the compacted list for this i tile (fields as consumed
        // by delEmptyLists).
        listDepth[out_i] = (unsigned int)itileListLen;
        lists[out_i].atomStart_i = itileList << 4;
        lists[out_i].jtileStart = jtileStart;
        out_i++;
      }
    } else if (partitionMode)
      listDepth[itileList] = (unsigned int)itileListLen;

    numJtiles += itileListLen;
  } // loop over itileList
  // Counting pass of a partitioned build: walk the per-list depths,
  // carving the lists into _numParts contiguous runs with roughly equal
  // j-tile counts, and zero the depths outside [_minPart, _maxPart) so the
  // store pass skips those lists.
  if (partitionMode && count) {
    const int jtilesPerPart =
      static_cast<double>(numJtiles) / _numParts + 0.5;  // round to nearest
    int jtileCount = 0, curPart = 0, nextPart = jtilesPerPart;
    numJtiles = 0;
    for (int j = 0; j < numTiles; j++) {
      jtileCount += listDepth[j];
      if (curPart < _minPart || curPart >= _maxPart)
        listDepth[j] = 0;
      else
        numJtiles += listDepth[j];
      if (jtileCount >= nextPart) {
        curPart++;
        nextPart += jtilesPerPart;
        if (curPart == _numParts) curPart--;
      }
    }
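    // Rounding in jtilesPerPart can make the running threshold overshoot,
    // so the curPart == _numParts clamp above keeps the final lists
    // assigned to the last part.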
    // Size the j-tile arrays to this partition's share.
    jTiles.realloc(numJtiles);
  }

  // In an unpartitioned build, a reallocation at this point means the
  // stores above overflowed the previous capacity; rebuild once now that
  // the arrays fit.
  if (!partitionMode && jTiles.realloc(numJtiles))
    return _buildBB<false, false>();

  if (count == false) {
    _numLists = out_i;
    // Clear tile status; the force kernels mark tiles that actually have
    // interactions, and delEmptyLists prunes the rest.
    memset(jTiles.status, 0, numJtiles * sizeof(int));
    // Sentinel entry so list i's j tiles always span
    // [lists[i].jtileStart, lists[i+1].jtileStart).
    lists[_numLists].jtileStart = lists[_numLists-1].jtileStart +
                                  listDepth[_numLists - 1];
  }
  return numJtiles;
}
void AVXTileLists::delEmptyLists() {
  int totalTiles = 0;
  int out_i = 0;

  const int numLists = _numLists;
  __mmask16 tripMask = 0xFFFF;
  int vecTrip = 16;
  for (int ii = 0; ii < numLists; ii += 16) {
    if (numLists - ii < 16) {
      vecTrip = numLists - ii;
      tripMask >>= (16 - vecTrip);
    }

    __m512i depth = _mm512_load_epi32(listDepth + ii);
    // Lanes for lists that still have tiles with interactions.
    __mmask16 depth_mask = _mm512_cmpneq_epi32_mask(depth,
      _mm512_setzero_epi32()) & tripMask;
    // The high bits of each depth word appear to be used as flags; keep
    // only the low 16-bit tile count.
    depth = _mm512_and_epi32(depth, _mm512_set1_epi32((int)65535));
    // Compact the surviving depths toward the front of the array.
    _mm512_mask_compressstoreu_epi32(listDepth + out_i, depth_mask, depth);
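    // This is a 16-wide stream compaction: the vector step above compacts
    // the depth array, and the scalar loop below moves each surviving
    // list's metadata and j-tile data to its new position.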
    for (int vi = 0; vi < vecTrip; vi++) {
      if (*((unsigned int*)&(depth) + vi)) {
        const int i = ii + vi;
        lists[out_i].atomStart_i = lists[i].atomStart_i;

        int start = totalTiles;
        int startOld = lists[i].jtileStart;
        int endOld = lists[i+1].jtileStart;
        lists[out_i].jtileStart = start;
        int jtile0 = start;

        for (int jtileOld = startOld; jtileOld < endOld; jtileOld++) {
          // status is nonzero for tiles the force kernels found to have
          // interactions; copy those forward and drop the rest.
          int jtile = jTiles.status[jtileOld];
          if (jtile) {
            jTiles.atomStart[jtile0] = jTiles.atomStart[jtileOld];
            // Move the tile's 16 exclusion words along with it.
            const __m512i excl = _mm512_load_epi32(jTiles.excl +
                                                   (jtileOld<<4));
            _mm512_store_epi32((void *)(jTiles.excl + (jtile0<<4)), excl);
            jtile0++;
          }
        }
        out_i++;
      }
      totalTiles += (*((unsigned int*)&(depth) + vi));
    }
  }
  _numLists = out_i;

  jTiles.realloc(totalTiles);
  // Sentinel entry, as in _buildBB, to bound the last list's j tiles.
  lists[_numLists].jtileStart = lists[_numLists-1].jtileStart +
                                listDepth[_numLists - 1];
}
void AVXTileLists::build() {
  tiles_p0->buildBoundingBoxes(_lastBuild);
  if (tiles_p0 != tiles_p1)
    tiles_p1->buildBoundingBoxes(_lastBuild);

  // Use the partitioned build only when this object owns a subset of the
  // parts; otherwise build all lists in a single pass.
  if (_maxPart - _minPart < _numParts)
    _buildBB<false, true>();
  else
    _buildBB<false, false>();

  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
    for (int i = 0; i < _maxPairLists; i++) _numPairs[i] = 0;
    // Atoms past the last full tile of 16 get scalar pair lists when
    // there are few enough of them.
    int end = tiles_p0->numAtoms() & 15;
    if (end <= NAMD_AVXTILES_PAIR_THRESHOLD) {
      const int start = tiles_p0->numAtoms() - end;
      end += start;
      for (int i = start; i < end; i++) _pair_i[i-start] = i;
    }
  }
}
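// Example of the leftover-atom bookkeeping above (inferred from the
// masks): with 100 atoms there are 6 full 16-atom tiles plus 4 leftover
// atoms, so end = 100 & 15 = 4, start = 96, and indices 96..99 are queued
// in _pair_i for per-atom pair lists instead of a mostly empty tile.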
void AVXTileLists::_realloc() {
  if (_numListsAlloc) {
    free(lists);
    free(listDepth);
  }
  _numListsAlloc = 1.2 * _numLists;
  // One extra entry holds the sentinel list written at the end of a build.
  int e = posix_memalign((void **)&lists, 64,
                         sizeof(List)*(_numListsAlloc+1));
  e = e | posix_memalign((void **)&listDepth, 64,
                         sizeof(unsigned int)*(_numListsAlloc+1));
  if (e) NAMD_die("Could not allocate memory for tiles.");
  // First-time setup: pre-size the modified/excluded pair buffers from
  // the larger of the two atom counts.
  if (_numModifiedAlloc == 0) {
    int newNum = tiles_p0->numAtoms();
    if (newNum < tiles_p1->numAtoms()) newNum = tiles_p1->numAtoms();
    reallocModified(2 * newNum);
    reallocExcluded(2.2 * newNum);
  }
  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0)
    _reallocPairLists(tiles_p0->numAtoms() & 15, NAMD_AVXTILES_IPAIRCOUNT);
}
void AVXTileLists::_reallocModified() {
  int numModifiedAlloc = 1.2 * _numModified;
  int *newp;

  if (posix_memalign((void **)&newp, 64, sizeof(int) * numModifiedAlloc))
    NAMD_die("Could not allocate memory for tiles.");
  if (_numModifiedAlloc) {
    memcpy(newp, _modified_i, sizeof(int) * _numModifiedAlloc);
    free(_modified_i);
  }
  _modified_i = newp;

  if (posix_memalign((void **)&newp, 64, sizeof(int) * numModifiedAlloc))
    NAMD_die("Could not allocate memory for tiles.");
  if (_numModifiedAlloc) {
    memcpy(newp, _modified_j, sizeof(int) * _numModifiedAlloc);
    free(_modified_j);
  }
  _modified_j = newp;

  _numModifiedAlloc = numModifiedAlloc;
}
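// _reallocExcluded below repeats the same grow-by-1.2x, copy, and swap
// pattern for the excluded pair arrays.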
void AVXTileLists::_reallocExcluded() {
  int numExcludedAlloc = 1.2 * _numExcluded;
  int *newp;

  if (posix_memalign((void **)&newp, 64, sizeof(int) * numExcludedAlloc))
    NAMD_die("Could not allocate memory for tiles.");
  if (_numExcludedAlloc) {
    memcpy(newp, _excluded_i, sizeof(int) * _numExcludedAlloc);
    free(_excluded_i);
  }
  _excluded_i = newp;

  if (posix_memalign((void **)&newp, 64, sizeof(int) * numExcludedAlloc))
    NAMD_die("Could not allocate memory for tiles.");
  if (_numExcludedAlloc) {
    memcpy(newp, _excluded_j, sizeof(int) * _numExcludedAlloc);
    free(_excluded_j);
  }
  _excluded_j = newp;

  _numExcludedAlloc = numExcludedAlloc;
}
void AVXTileLists::_reallocPairLists(const int numPairLists,
                                     const int maxPairs) {
  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
    if (!_maxPairLists) {
      // First allocation: a fixed number of lists with maxPairs entries
      // each.
      _maxPairLists = NAMD_AVXTILES_PAIR_THRESHOLD;
      _maxPairs = maxPairs;
      int e = posix_memalign((void **)&_pair_i, 64,
                             sizeof(int)*_maxPairLists);
      e = e | posix_memalign((void **)&_numPairs, 64,
                             sizeof(int)*_maxPairLists);
      e = e | posix_memalign((void **)&_pairStart, 64,
                             sizeof(int)*_maxPairLists);
      e = e | posix_memalign((void **)&_pairLists, 64,
                             sizeof(int)*_maxPairLists*_maxPairs);
      if (e) NAMD_die("Could not allocate memory for tiles.");
      int listStart = 0;
      for (int i = 0; i < _maxPairLists; i++) {
        _pairStart[i] = listStart;
        listStart += _maxPairs;
      }
    } else if (maxPairs > _maxPairs) {
      // Grow the per-list capacity, copying each list's current pairs
      // into its new, wider slot.
      int *hold;
      if (posix_memalign((void **)&hold, 64,
                         sizeof(int)*_maxPairLists*maxPairs))
        NAMD_die("Could not allocate memory for tiles.");
      int listStart = 0;
      for (int i = 0; i < _maxPairLists; i++) {
        if (_numPairs[i]) memcpy(hold + listStart, _pairLists + _pairStart[i],
                                 sizeof(int) * _numPairs[i]);
        _pairStart[i] = listStart;
        listStart += maxPairs;
      }
      free(_pairLists);
      _pairLists = hold;
      _maxPairs = maxPairs;
    }
  }
}
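// Layout note: _pairLists is one flat 64-byte-aligned block in which
// _pairStart[i] is list i's offset and _numPairs[i] its fill count, so
// growing the per-list capacity costs one allocation plus strided copies
// rather than a reallocation per list.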
#endif // NAMD_AVXTILES