00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #if defined(VMDCPUDISPATCH) && defined(__ARM_FEATURE_SVE)
00034 #include <arm_sve.h>
00035
00036 #include "WKFThreads.h"
00037
00038
00039 #include <math.h>
00040 #include <stdio.h>
00041 #include <stdlib.h>
00042 #include <stddef.h>
00043
// Report how many 32-bit elements one SVE vector holds on the running
// hardware, as given by the svcntw() intrinsic.
int arm_sve_vecsize_32bits(void) {
  const int lanes32 = (int) svcntw();
  return lanes32;
}
00047
// Report how many 64-bit elements one SVE vector holds on the running
// hardware, as given by the svcntd() intrinsic.
int arm_sve_vecsize_64bits(void) {
  const int lanes64 = (int) svcntd();
  return lanes64;
}
00051
00052
00053
// Compute the minimum, maximum, and arithmetic mean of the n floats in f.
//
//   f      input array holding n floats
//   n      number of elements to process
//   fmin   receives the smallest element
//   fmax   receives the largest element
//   fmean  receives the mean; the sum is accumulated in double precision
//          and divided by n before being narrowed to float
//
// When n < 1 all three outputs are set to 0.0f.
void minmaxmean_1fv_aligned_sve(const float *f, ptrdiff_t n,
                                float *fmin, float *fmax, float *fmean) {
  if (n < 1) {
    *fmin = 0.0f;
    *fmax = 0.0f;
    *fmean = 0.0f;
    return;
  }

  const ptrdiff_t vlen = (ptrdiff_t) svcntw();
  svfloat32_t minv = svdup_f32(f[0]);
  svfloat32_t maxv = minv;
  svfloat64_t sumv = svdup_f64(0.0);

  for (ptrdiff_t i=0; i<n; i+=vlen) {
    // enable only the 32-bit lanes that still fall inside the array
    svbool_t pg = svwhilelt_b32(i, n);
    svfloat32_t tmp = svld1(pg, (const float32_t *) &f[i]);

    // merging predication keeps earlier results in the inactive lanes
    minv = svmin_m(pg, minv, tmp);
    maxv = svmax_m(pg, maxv, tmp);

    // A float->double convert widens only the even-numbered 32-bit lanes
    // (the low half of each 64-bit container).  Move the odd-numbered
    // lanes, and their predicate bits, into the even positions so that
    // every input element is added to the sum exactly once.
    svbool_t pgodd = svtrn2_b32(pg, pg);
    svfloat32_t tmpodd = svtrn2(tmp, tmp);

    // svadd_m rather than svadd_z: zeroing would throw away the partial
    // sums held in lanes that are inactive during a short last iteration
    sumv = svadd_m(pg, sumv, svcvt_f64_z(pg, tmp));
    sumv = svadd_m(pgodd, sumv, svcvt_f64_z(pgodd, tmpodd));
  }

  // horizontal reductions across all lanes
  svbool_t all = svptrue_b32();
  *fmin = svminv(all, minv);
  *fmax = svmaxv(all, maxv);
  *fmean = (float) (svaddv(all, sumv) / n);
}
00082
00083
00084
// Find the smallest and largest values among the n floats in f and store
// them in *fmin and *fmax.  When n < 1 neither output is written.
void minmax_1fv_aligned_sve(const float *f, ptrdiff_t n,
                            float *fmin, float *fmax) {
  if (n >= 1) {
    // both running extremes start out as the first element in every lane
    svfloat32_t lo = svdup_f32(f[0]);
    svfloat32_t hi = lo;

    ptrdiff_t pos = 0;
    while (pos < n) {
      // lanes past the end of the array are disabled for this chunk
      svbool_t active = svwhilelt_b32(pos, n);
      svfloat32_t chunk = svld1(active, (float32_t *) &f[pos]);

      // merging forms leave disabled lanes holding their previous values
      lo = svmin_m(active, lo, chunk);
      hi = svmax_m(active, hi, chunk);

      pos += svcntw();
    }

    // collapse the per-lane extremes to scalars
    svbool_t every = svptrue_b32();
    *fmin = svminv(every, lo);
    *fmax = svmaxv(every, hi);
  }
}
00104
00105
00106
00107
// Compute per-component minimum and maximum over an array of packed
// 3-float groups (f[0..2] is the first group, f[3..5] the second, ...).
//
//   f     input array holding 3*n3 floats
//   n3    number of 3-float groups to process
//   fmin  receives 3 floats: the minimum of each component
//   fmax  receives 3 floats: the maximum of each component
//
// When n3 < 1 neither output array is written.
void minmax_3fv_aligned_sve(const float *f, const ptrdiff_t n3,
                            float *fmin, float *fmax) {
  if (n3 < 1)
    return;

  // Seed every component from its own value in the first group; seeding
  // all three from f[0] would let the first component leak into the
  // results reported for the other two.
  svfloat32_t min0 = svdup_f32(f[0]);
  svfloat32_t min1 = svdup_f32(f[1]);
  svfloat32_t min2 = svdup_f32(f[2]);
  svfloat32_t max0 = min0;
  svfloat32_t max1 = min1;
  svfloat32_t max2 = min2;

  const ptrdiff_t vlen = (ptrdiff_t) svcntw(); // groups per iteration
  const ptrdiff_t vlen3 = vlen * 3;            // floats per iteration
  for (ptrdiff_t cnt=0, i=0; cnt<n3; cnt+=vlen, i+=vlen3) {
    // the predicate counts groups, not floats: svld3 de-interleaves one
    // 3-float group per active lane
    svbool_t pg = svwhilelt_b32(cnt, n3);
    svfloat32x3_t tmp = svld3(pg, (const float32_t *) &f[i]);
    svfloat32_t c0 = svget3(tmp, 0);
    svfloat32_t c1 = svget3(tmp, 1);
    svfloat32_t c2 = svget3(tmp, 2);

    // Results are assigned to plain vectors.  svset3() returns a new
    // tuple instead of modifying its argument, so calling it without
    // using the return value discards the update.
    min0 = svmin_m(pg, min0, c0);
    max0 = svmax_m(pg, max0, c0);
    min1 = svmin_m(pg, min1, c1);
    max1 = svmax_m(pg, max1, c1);
    min2 = svmin_m(pg, min2, c2);
    max2 = svmax_m(pg, max2, c2);
  }

  // horizontal reductions across all lanes
  svbool_t all = svptrue_b32();
  fmin[0] = svminv(all, min0);
  fmax[0] = svmaxv(all, max0);
  fmin[1] = svminv(all, min1);
  fmax[1] = svmaxv(all, max1);
  fmin[2] = svminv(all, min2);
  fmax[2] = svmaxv(all, max2);
}
00141
00142
00143 #else // CPUDISPATCH+SVE
00144
// Fallback compiled when VMDCPUDISPATCH and __ARM_FEATURE_SVE are not both
// defined: no SVE vector length can be queried, so -1 is returned.
int arm_sve_vecsize_32bits(void) {
  const int sve_unavailable = -1;
  return sve_unavailable;
}
00148
// Fallback compiled when VMDCPUDISPATCH and __ARM_FEATURE_SVE are not both
// defined: no SVE vector length can be queried, so -1 is returned.
int arm_sve_vecsize_64bits(void) {
  const int sve_unavailable = -1;
  return sve_unavailable;
}
00152
00153 #endif
00154