/***************************************************************************
 * NOTE(review): the original file header (copyright / RCS banner) was
 * lost during extraction -- restore it from the upstream source tree.
 ***************************************************************************/
00025 #include "WKFThreads.h"
00026
00027 #if defined(__SSE2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
00028 #define VMDUSESSE 1 // enable SSE in combination with target macros
00029 #endif
00030
00031 #if defined(__AVX__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
00032 #define VMDUSEAVX 1 // enable AVX with runtime dispatch
00033 #endif
00034
00035 #if defined(VMDUSESSE) && defined(VMDUSEAVX)
00036
00037 #if defined(VMDUSESSE)
00038 #include <emmintrin.h>
00039 #endif
00040 #if defined(VMDUSEAVX)
00041 #include <immintrin.h>
00042 #endif
00043
00044
00045
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
00049
00050 #if defined(_MSC_VER)
00051 #include <windows.h>
00052 #include <conio.h>
00053 #else
00054 #include <unistd.h>
00055 #endif // _MSC_VER
00056
00057
#if 0
/*
 * set_1fv_aligned()
 *
 * Unfinished stub for a vectorized "fill int array with a value" helper;
 * the SSE body below is incomplete (the `__m128i = _mm_set_p` line is not
 * valid C) and the whole function is compiled out with #if 0.
 * Kept only as a placeholder for future work -- do not enable as-is.
 */
void set_1fv_aligned(const int *iv, int n, const int val) {
  int i=0;

#if defined(VMDUSESSE)
  /* incomplete: vector constant setup and store loop were never written */
  __m128i = _mm_set_p

  for (; i<(n-3); i+=4) {
  }
#endif
}
#endif
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
/*
 * Integer type wide enough to hold a pointer value, used for the
 * alignment arithmetic below.
 * NOTE(review): uintptr_t (from <stdint.h>) is the portable choice but
 * is kept disabled in the dead #else branch -- presumably to support
 * pre-C99 compilers; confirm before switching.
 */
#if defined(_WIN64)
#define myintptrtype size_t        /* Win64 is LLP64: long is only 32 bits */
#elif 1
#define myintptrtype unsigned long /* matches pointer width on LP64 Unix */
#else
#define myintptrtype uintptr_t
#endif
00093
00094
00095
00096
00097
/*
 * test_alignment_Nbyte_powertwo()
 *
 * Return 1 if ptr is aligned to an alignsz-byte boundary, 0 otherwise.
 * alignsz must be a power of two.
 *
 * Fix: the previous code computed ((myintptrtype) ptr) & (~alignmask)
 * where alignmask was a 32-bit unsigned int, so ~alignmask zero-extended
 * to pointer width and cleared the upper 32 bits of the pointer value.
 * Any buffer located above the 4 GB boundary was therefore always
 * reported as misaligned, silently forcing the callers' scalar slow
 * paths.  Do the masking at full pointer width instead.
 */
static int test_alignment_Nbyte_powertwo(const void *ptr, unsigned int alignsz) {
  uintptr_t alignmask = (uintptr_t) alignsz - 1;
  return (((uintptr_t) ptr) & alignmask) == 0;
}
00102
#if 0
/*
 * bytes_prev_alignment()
 * Distance in bytes from the previous alignsz-byte boundary back up to
 * ptr (0 when ptr is already aligned).  alignsz must be a power of two.
 * Compiled out: only referenced by the disabled #else branches in
 * analyze_selection_aligned_avx() below.
 */
static int bytes_prev_alignment(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  myintptrtype t = (((myintptrtype) ptr) & alignmask);
  return t;
}

/*
 * bytes_next_alignment()
 * Distance in bytes from ptr forward to the next alignsz-byte boundary
 * (0 when ptr is already aligned).  alignsz must be a power of two.
 */
static int bytes_next_alignment(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  myintptrtype t = (alignsz - (((myintptrtype) ptr) & (alignmask))) & alignmask;
  return t;
}
#endif
00116
00117
00118
00119
00120 #if defined(VMDUSESSE)
00121
/*
 * hadd_m128i()
 * Horizontal add of the four 32-bit lanes of an SSE2 register:
 * returns lane0 + lane1 + lane2 + lane3 as a scalar int.
 */
static int hadd_m128i(__m128i sum4) {
  /* swap adjacent lane pairs and accumulate: lanes become (0+1, 1+0, 2+3, 3+2) */
  __m128i acc = _mm_add_epi32(sum4, _mm_shuffle_epi32(sum4, _MM_SHUFFLE(2, 3, 0, 1)));
  /* swap the two 64-bit halves and accumulate: every lane now holds the total */
  acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, _MM_SHUFFLE(1, 0, 3, 2)));
  /* extract lane 0 */
  return _mm_cvtsi128_si32(acc);
}
00134
00135 #endif
00136
00137
00138
00139
00140
#if 0 && defined(VMDUSEAVX)
/*
 * hadd_m256i()
 * Horizontal add of the eight 32-bit lanes of an AVX register.
 * Currently compiled out (#if 0) -- kept for future use.
 *
 * Fix: _mm256_store_si256 requires a 32-byte-aligned destination, but a
 * plain automatic int[8] array carries no such guarantee, so the aligned
 * store could fault if this code were ever enabled.  Use the unaligned
 * store variant instead.
 */
static int hadd_m256i(__m256i sum8) {
  int tmp[sizeof(__m256i)/sizeof(int)];
  _mm256_storeu_si256((__m256i *)tmp, sum8);
  int sum = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
  return sum;
}

#endif
00151
#if defined(VMDUSEAVX)

/*
 * analyze_selection_aligned_avx()
 *
 * Scan an "on" flag array of n ints (nonzero == selected) and report:
 *   *firstsel -- index of the first selected element
 *   *lastsel  -- index of the last selected element
 *   *selected -- sum of on[first..last]; a correct count only when the
 *                flags are strictly 0/1 -- presumably guaranteed by the
 *                caller; confirm
 * Any of the three output pointers may be NULL to skip that result.
 * Returns 0 on success, -1 when no element is selected (outputs are then
 * left at sel=0, first=0, last=-1).
 *
 * Strategy: scalar loops cover the unaligned prefix/suffix, 256-bit AVX
 * load + testz skips zero runs in the 32-byte-aligned interior for the
 * first/last searches, and a 128-bit SSE accumulator performs the sum
 * over the 16-byte-aligned interior.
 */
int analyze_selection_aligned_avx(int n, const int *on,
                                  int *firstsel, int *lastsel, int *selected) {
  int sel = 0;
  int first = 0;
  int last = -1;
  int i;

  /* Pre-store the "empty selection" results so the early -1 returns
   * leave every requested output in a well-defined state. */
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  /* Phase 1: find the first selected element (also needed for the sum). */
  if ((firstsel != NULL) || (selected != NULL)) {

#if 1
    /* scalar scan of the unaligned prefix, up to a 32-byte boundary */
    for (i=0; ((i<n) && !test_alignment_Nbyte_powertwo(&on[i], 32)); i++) {
      if (on[i]) {
        first = i;
        goto foundfirstsel;
      }
    }
#else
    /* disabled alternative: compute the prefix length directly */
    int nextalign = bytes_next_alignment(&on[0], 32) / sizeof(int);
    int endalign = (n < nextalign) ? n : nextalign;
    for (i=0; i<endalign; i++) {
      if (on[i]) {
        first = i;
        goto foundfirstsel;
      }
    }
#endif

    /* aligned interior: skip 8 ints per step while the vector is all-zero */
    for (; i<(n-7); i+=8) {
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break;  /* this 8-int group holds the first nonzero flag */
    }

    /* scalar rescan of the hit group plus any unaligned tail */
    for (; i<n; i++) {
      if (on[i]) {
        first = i;
        goto foundfirstsel;
      }
    }

    /* nothing selected: outputs were already cleared above */
    _mm256_zeroupper();  /* avoid AVX->SSE transition penalties on return */
    return -1;
  }
foundfirstsel:

  /* Phase 2: find the last selected element by scanning backward. */
  if ((lastsel != NULL) || (selected != NULL)) {

#if 1
    /* scalar scan of the unaligned suffix, down to a 32-byte boundary */
    for (i=n-1; i>=0; i--) {
      if (on[i]) {
        last = i;
        goto foundlastsel;
      }

      /* on[i] is zero; stop once &on[i] itself sits on a 32-byte boundary */
      if (test_alignment_Nbyte_powertwo(&on[i], 32))
        break;
    }
#else
    /* disabled alternative: compute the suffix length directly */
    int prevalign = bytes_prev_alignment(&on[0], 32) / sizeof(int);
    int startalign = (0 > n-prevalign-1) ? 0 : (n-prevalign-1);
    for (i=n-1; i>=startalign; i--) {
      if (on[i]) {
        last = i;
        goto foundlastsel;
      }
    }
#endif

    /* aligned interior, backward: skip 8 zero ints per step.
     * NOTE(review): if this loop runs off the front with no hit, i exits
     * in [-8,-1], last8 below goes negative, and the rescan reads before
     * on[0].  Unreachable when phase 1 already proved a selection exists,
     * but a potential out-of-bounds read when only lastsel is requested
     * on an all-zero array -- worth confirming with callers. */
    for (i-=8; i>=0; i-=8) {
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break;
    }

    /* scalar rescan of the 8-int group containing the last nonzero flag */
    int last8=i;
    for (i=last8+7; i>=last8; i--) {
      if (on[i]) {
        last = i;
        goto foundlastsel;
      }
    }

    /* nothing selected */
    _mm256_zeroupper();
    return -1;
  }
foundlastsel:

  /* Phase 3: sum the flags over [first, last] inclusive. */
  if (selected != NULL) {

#if 1
    /* scalar prefix from `first` up to a 16-byte boundary so the SSE
     * aligned loads below are legal */
    for (i=first; ((i<=last) && (!test_alignment_Nbyte_powertwo(&on[i], 16))); i++) {
      sel += on[i];
    }
#else
    /* disabled alternative: compute the prefix length directly */
    int nextalign = bytes_next_alignment(&on[0], 16) / sizeof(int);
    int endalign = (last < first+nextalign) ? last : (first+nextalign);
    for (i=first; i<=endalign; i++) {
      sel += on[i];
    }
#endif

    /* accumulate four flags per iteration in a vector register */
    __m128i sum4 = _mm_setzero_si128();
    for (; i<=(last-3); i+=4) {
      __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
      sum4 = _mm_add_epi32(sum4, on4);
    }
    sel += hadd_m128i(sum4);  /* fold the four partial sums into sel */

    /* scalar tail */
    for (; i<=last; i++) {
      sel += on[i];
    }
  }

  /* Store the final results for every requested output. */
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  _mm256_zeroupper();  /* leave clean SIMD state for the caller */
  return 0;
}
#endif
00318
00319 #endif // SSE+AVX
00320