34template <
typename T,
size_t N = 16 /
sizeof(T)>
37 static constexpr size_t kPrivateN = N;
44 return *
this = (*
this * other);
47 return *
this = (*
this / other);
50 return *
this = (*
this + other);
53 return *
this = (*
this - other);
56 return *
this = (*
this % other);
59 return *
this = (*
this & other);
62 return *
this = (*
this | other);
65 return *
this = (*
this ^ other);
73 T raw[16 /
sizeof(T)] = {};
77template <
typename T,
size_t N = 16 /
sizeof(T)>
81 return b ?
static_cast<Raw>(
~Raw{0}) : 0;
85 Raw bits[16 /
sizeof(T)] = {};
89using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
92using TFromV =
typename V::PrivateT;
97template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
111template <
class D,
class VFrom>
120template <
class D,
class VFrom>
126 constexpr size_t kFromByteLen =
sizeof(TFrom) *
HWY_MAX_LANES_D(DFrom);
128 constexpr size_t kCopyByteLen =
HWY_MIN(kFromByteLen, kToByteLen);
131 CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
139template <
class FromSizeTag,
class ToSizeTag,
class DTo,
class DFrom>
150template <
class D,
typename T2>
153 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
154 v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
167template <
class D, HWY_IF_T_SIZE_D(D, 1)>
186 result.raw[10] = t10;
187 result.raw[11] = t11;
188 result.raw[12] = t12;
189 result.raw[13] = t13;
190 result.raw[14] = t14;
191 result.raw[15] = t15;
195template <
class D, HWY_IF_T_SIZE_D(D, 2)>
197 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
198 TFromD<D> t5, TFromD<D> t6,
212template <
class D, HWY_IF_T_SIZE_D(D, 4)>
223template <
class D, HWY_IF_T_SIZE_D(D, 8)>
233template <
class D,
typename T = TFromD<D>,
typename T2>
236 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
245template <
typename T,
size_t N>
249 using TU =
TFromD<
decltype(du)>;
251 for (
size_t i = 0; i < N; ++i) {
252 vu.raw[i] =
static_cast<TU
>(~vu.raw[i]);
258template <
typename T,
size_t N>
264 for (
size_t i = 0; i < N; ++i) {
265 au.raw[i] &= bu.raw[i];
269template <
typename T,
size_t N>
275template <
typename T,
size_t N>
281template <
typename T,
size_t N>
287 for (
size_t i = 0; i < N; ++i) {
288 au.raw[i] |= bu.raw[i];
292template <
typename T,
size_t N>
298template <
typename T,
size_t N>
304 for (
size_t i = 0; i < N; ++i) {
305 au.raw[i] ^= bu.raw[i];
309template <
typename T,
size_t N>
315template <
typename T,
size_t N>
316HWY_API Vec128<T, N>
Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
317 return Xor(x1,
Xor(x2, x3));
321template <
typename T,
size_t N>
322HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
323 return Or(o1,
Or(o2, o3));
327template <
typename T,
size_t N>
328HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
329 return Or(o,
And(a1, a2));
333template <
typename T,
size_t N>
340template <
typename T,
size_t N>
342 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
343 const DFromV<
decltype(magn)>
d;
348template <
typename T,
size_t N>
350 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
356template <
typename T,
size_t N>
358 for (
size_t i = 0; i < N; ++i) {
367template <
typename T,
size_t N>
377template <
class DTo,
class MFrom>
394 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
395 m.bits[i] = MFromD<D>::FromBool(i < n);
401template <
typename T,
size_t N>
408template <
typename T,
size_t N>
414template <
typename T,
size_t N>
420template <
typename T,
size_t N>
425 const auto vi =
BitCast(di, v);
427 for (
size_t i = 0; i < N; ++i) {
428 v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i];
435template <
typename T,
size_t N>
437 const Simd<T, N, 0>
d;
441template <
typename T,
size_t N>
442HWY_API Mask128<T, N>
And(Mask128<T, N> a, Mask128<T, N> b) {
443 const Simd<T, N, 0>
d;
447template <
typename T,
size_t N>
448HWY_API Mask128<T, N>
AndNot(Mask128<T, N> a, Mask128<T, N> b) {
449 const Simd<T, N, 0>
d;
453template <
typename T,
size_t N>
454HWY_API Mask128<T, N>
Or(Mask128<T, N> a, Mask128<T, N> b) {
455 const Simd<T, N, 0>
d;
459template <
typename T,
size_t N>
460HWY_API Mask128<T, N>
Xor(Mask128<T, N> a, Mask128<T, N> b) {
461 const Simd<T, N, 0>
d;
465template <
typename T,
size_t N>
467 const Simd<T, N, 0>
d;
475template <
int kBits,
typename T,
size_t N>
477 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
479 for (
size_t i = 0; i < N; ++i) {
480 const TU raw_u =
static_cast<TU
>(v.
raw[i]);
481 const auto shifted = raw_u << kBits;
482 v.
raw[i] =
static_cast<T
>(shifted);
487template <
int kBits,
typename T,
size_t N>
489 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
492 for (
size_t i = 0; i < N; ++i) {
500template <
int kBits,
typename T,
size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
505 constexpr size_t kSizeInBits =
sizeof(T) * 8;
506 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
507 if (kBits == 0)
return v;
515template <
typename T,
size_t N>
517 for (
size_t i = 0; i < N; ++i) {
519 v.raw[i] =
static_cast<T
>(shifted);
524template <
typename T,
size_t N>
526 for (
size_t i = 0; i < N; ++i) {
535template <
typename T,
size_t N>
537 for (
size_t i = 0; i < N; ++i) {
540 v.
raw[i] =
static_cast<T
>(shifted);
545template <
typename T,
size_t N>
547 for (
size_t i = 0; i < N; ++i) {
559template <
typename T,
size_t N>
562 for (
size_t i = 0; i < N; ++i) {
563 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw[i]);
564 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw[i]);
565 a.
raw[i] =
static_cast<T
>((a64 + b64) &
static_cast<uint64_t
>(~T(0)));
569template <
typename T,
size_t N>
572 for (
size_t i = 0; i < N; ++i) {
573 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw[i]);
574 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw[i]);
575 a.
raw[i] =
static_cast<T
>((a64 - b64) &
static_cast<uint64_t
>(~T(0)));
580template <
typename T,
size_t N>
583 for (
size_t i = 0; i < N; ++i) {
589template <
typename T,
size_t N>
592 for (
size_t i = 0; i < N; ++i) {
600template <
typename T,
size_t N>
604template <
typename T,
size_t N>
613 Vec128<uint64_t, (N + 7) / 8> sums;
614 for (
size_t i = 0; i < N; ++i) {
615 sums.
raw[i / 8] += v.
raw[i];
622 Vec128<int64_t, (N + 7) / 8> sums;
623 for (
size_t i = 0; i < N; ++i) {
624 sums.
raw[i / 8] += v.
raw[i];
634 for (
size_t i = 0; i < N; ++i) {
636 HWY_MAX(hwy::LowestValue<T>(),
static_cast<TW
>(a.
raw[i]) + b.
raw[i]),
637 hwy::HighestValue<T>()));
647 for (
size_t i = 0; i < N; ++i) {
649 HWY_MAX(hwy::LowestValue<T>(),
static_cast<TW
>(a.
raw[i]) - b.
raw[i]),
650 hwy::HighestValue<T>()));
656template <
typename T,
size_t N>
658 static_assert(!IsSigned<T>(),
"Only for unsigned");
659 for (
size_t i = 0; i < N; ++i) {
660 a.
raw[i] =
static_cast<T
>((a.
raw[i] + b.
raw[i] + 1) / 2);
667template <
typename T,
size_t N>
669 for (
size_t i = 0; i < N; ++i) {
680template <
typename T,
size_t N>
683 for (
size_t i = 0; i < N; ++i) {
688template <
typename T,
size_t N>
691 for (
size_t i = 0; i < N; ++i) {
697template <
typename T,
size_t N>
700 for (
size_t i = 0; i < N; ++i) {
711template <
typename T,
size_t N>
714 for (
size_t i = 0; i < N; ++i) {
728template <
typename T,
size_t N>
733template <
typename T,
size_t N>
743template <
typename T,
size_t N>
749template <
typename T,
size_t N>
755template <
typename T,
size_t N>
763template <
typename T,
size_t N>
773template <
typename T,
size_t N>
776 for (
size_t i = 0; i < N; ++i) {
782template <
typename T,
size_t N>
784 for (
size_t i = 0; i < N; ++i) {
785 a.
raw[i] =
static_cast<T
>(
static_cast<uint64_t
>(a.
raw[i]) *
786 static_cast<uint64_t
>(b.
raw[i]));
791template <
typename T,
size_t N>
794 for (
size_t i = 0; i < N; ++i) {
795 a.
raw[i] =
static_cast<T
>(
static_cast<uint64_t
>(a.
raw[i]) *
796 static_cast<uint64_t
>(b.
raw[i]));
804#ifdef HWY_NATIVE_MUL_8
805#undef HWY_NATIVE_MUL_8
807#define HWY_NATIVE_MUL_8
809#ifdef HWY_NATIVE_MUL_64
810#undef HWY_NATIVE_MUL_64
812#define HWY_NATIVE_MUL_64
815template <
typename T,
size_t N>
820template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
822 for (
size_t i = 0; i < N; ++i) {
823 a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
829template <
class T,
size_t N,
834 for (
size_t i = 0; i < N; ++i) {
835 a.
raw[i] =
static_cast<T
>(
836 (
static_cast<TW
>(a.
raw[i]) *
static_cast<TW
>(b.
raw[i])) >>
842template <
class T, HWY_IF_UI64(T)>
849template <
class T, HWY_IF_UI64(T)>
863 for (
size_t i = 0; i < N; ++i) {
864 a.
raw[i] =
static_cast<int16_t
>((a.
raw[i] * b.
raw[i] + 16384) >> 15);
870template <
class T,
size_t N,
876 Vec128<TW, (N + 1) / 2> mul;
877 for (
size_t i = 0; i < N; i += 2) {
878 const TW a_wide = a.
raw[i];
879 mul.raw[i / 2] =
static_cast<TW
>(a_wide * b.
raw[i]);
885template <
class T,
size_t N,
891 Vec128<TW, (N + 1) / 2> mul;
892 for (
size_t i = 0; i < N; i += 2) {
893 const TW a_wide = a.
raw[i + 1];
894 mul.raw[i / 2] =
static_cast<TW
>(a_wide * b.
raw[i + 1]);
901 for (
size_t i = 0; i < N; ++i) {
911template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
918template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
921 return mul * x + add;
924template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
927 return add - mul * x;
930template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
933 return mul * x - sub;
936template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
939 return Neg(mul) * x - sub;
946 for (
size_t i = 0; i < N; ++i) {
947 const float half = v.
raw[i] * 0.5f;
949 v.
raw[i] = BitCastScalar<float>(
static_cast<uint32_t
>(
950 0x5F3759DF - (BitCastScalar<uint32_t>(v.
raw[i]) >> 1)));
952 v.
raw[i] = v.
raw[i] * (1.5f - (half * v.
raw[i] * v.
raw[i]));
960#if defined(HWY_NO_LIBCXX)
961#if HWY_COMPILER_GCC_ACTUAL
962 return __builtin_sqrt(v);
964 uint32_t bits = BitCastScalar<uint32_t>(v);
966 bits = (1 << 29) + (bits >> 1) - (1 << 22);
967 return BitCastScalar<float>(bits);
974#if defined(HWY_NO_LIBCXX)
975#if HWY_COMPILER_GCC_ACTUAL
976 return __builtin_sqrt(v);
978 uint64_t bits = BitCastScalar<uint64_t>(v);
980 bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
981 return BitCastScalar<double>(bits);
990template <
typename T,
size_t N>
992 for (
size_t i = 0; i < N; ++i) {
1000template <
typename T,
size_t N>
1003 const T k0 = ConvertScalarTo<T>(0);
1005 for (
size_t i = 0; i < N; ++i) {
1006 if (!(a.
raw[i] < MantissaEnd<T>())) {
1009 const T bias = ConvertScalarTo<T>(v.
raw[i] < k0 ? -0.5 : 0.5);
1010 const TI rounded = ConvertScalarTo<TI>(v.
raw[i] + bias);
1012 v.
raw[i] = v.
raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
1015 const T rounded_f = ConvertScalarTo<T>(rounded);
1017 if ((rounded & 1) &&
1018 ScalarAbs(rounded_f - v.
raw[i]) == ConvertScalarTo<T>(0.5)) {
1019 v.
raw[i] = ConvertScalarTo<T>(rounded - (v.
raw[i] < k0 ? -1 : 1));
1022 v.
raw[i] = rounded_f;
1032 const T k0 = ConvertScalarTo<T>(0);
1034 const Vec128<float, N> abs =
Abs(v);
1035 Vec128<int32_t, N> ret;
1036 for (
size_t i = 0; i < N; ++i) {
1039 if (!(abs.raw[i] < MantissaEnd<T>())) {
1041 if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
1042 ret.raw[i] = signbit ? LimitsMin<TI>() :
LimitsMax<TI>();
1045 ret.raw[i] =
static_cast<TI
>(v.raw[i]);
1048 const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
1049 const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
1054 const T rounded_f = ConvertScalarTo<T>(rounded);
1056 if ((rounded & 1) &&
1057 ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
1058 ret.raw[i] = rounded - (signbit ? -1 : 1);
1061 ret.raw[i] = rounded;
1066template <
typename T,
size_t N>
1070 for (
size_t i = 0; i < N; ++i) {
1071 if (!(abs.
raw[i] <= MantissaEnd<T>())) {
1074 const TI truncated =
static_cast<TI
>(v.
raw[i]);
1075 if (truncated == 0) {
1076 v.
raw[i] = v.
raw[i] < 0 ? -T{0} : T{0};
1079 v.
raw[i] =
static_cast<T
>(truncated);
1085template <
typename Float,
size_t N>
1087 constexpr int kMantissaBits = MantissaBits<Float>();
1089 const Bits kExponentMask = MaxExponentField<Float>();
1090 const Bits kMantissaMask = MantissaMask<Float>();
1091 const Bits kBias = kExponentMask / 2;
1093 for (
size_t i = 0; i < N; ++i) {
1094 const bool positive = v.
raw[i] > Float(0.0);
1096 Bits bits = BitCastScalar<Bits>(v.
raw[i]);
1098 const int exponent =
1099 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1101 if (exponent >= kMantissaBits)
continue;
1104 v.
raw[i] = positive ? Float{1} : Float{-0.0};
1108 const Bits mantissa_mask = kMantissaMask >> exponent;
1110 if ((bits & mantissa_mask) == 0)
continue;
1113 if (positive) bits += (kMantissaMask + 1) >> exponent;
1114 bits &= ~mantissa_mask;
1116 v.
raw[i] = BitCastScalar<Float>(bits);
1122template <
typename Float,
size_t N>
1124 constexpr int kMantissaBits = MantissaBits<Float>();
1126 const Bits kExponentMask = MaxExponentField<Float>();
1127 const Bits kMantissaMask = MantissaMask<Float>();
1128 const Bits kBias = kExponentMask / 2;
1130 for (
size_t i = 0; i < N; ++i) {
1131 const bool negative = v.
raw[i] < Float(0.0);
1133 Bits bits = BitCastScalar<Bits>(v.
raw[i]);
1135 const int exponent =
1136 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1138 if (exponent >= kMantissaBits)
continue;
1141 v.
raw[i] = negative ? Float(-1.0) : Float(0.0);
1145 const Bits mantissa_mask = kMantissaMask >> exponent;
1147 if ((bits & mantissa_mask) == 0)
continue;
1150 if (negative) bits += (kMantissaMask + 1) >> exponent;
1151 bits &= ~mantissa_mask;
1153 v.
raw[i] = BitCastScalar<Float>(bits);
1160template <
typename T,
size_t N>
1163 for (
size_t i = 0; i < N; ++i) {
1172template <
typename T,
size_t N>
1175 for (
size_t i = 0; i < N; ++i) {
1181template <
typename T,
size_t N>
1184 for (
size_t i = 0; i < N; ++i) {
1190template <
typename T,
size_t N>
1192 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1193 return (v & bit) == bit;
1196template <
typename T,
size_t N>
1199 for (
size_t i = 0; i < N; ++i) {
1204template <
typename T,
size_t N>
1207 for (
size_t i = 0; i < N; ++i) {
1213template <
typename T,
size_t N>
1216 for (
size_t i = 0; i < N; ++i) {
1221template <
typename T,
size_t N>
1224 for (
size_t i = 0; i < N; ++i) {
1245 const bool lt = a.
raw[1] < b.
raw[1];
1256 const bool eq = a.
raw[1] == b.
raw[1] && a.
raw[0] == b.
raw[0];
1265 const bool ne = a.
raw[1] != b.
raw[1] || a.
raw[0] != b.
raw[0];
1274 const bool eq = a.
raw[1] == b.
raw[1];
1283 const bool ne = a.
raw[1] != b.
raw[1];
1342 return Load(
d, aligned);
1345#ifdef HWY_NATIVE_LOAD_N
1346#undef HWY_NATIVE_LOAD_N
1348#define HWY_NATIVE_LOAD_N
1353 size_t max_lanes_to_load) {
1355 const size_t N =
Lanes(
d);
1356 const size_t num_of_lanes_to_load =
HWY_MIN(max_lanes_to_load, N);
1363 size_t max_lanes_to_load) {
1365 const size_t N =
Lanes(
d);
1366 const size_t num_of_lanes_to_load =
HWY_MIN(max_lanes_to_load, N);
1386 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1387 if (
m.bits[i])
p[i] = v.raw[i];
1391#ifdef HWY_NATIVE_STORE_N
1392#undef HWY_NATIVE_STORE_N
1394#define HWY_NATIVE_STORE_N
1399 size_t max_lanes_to_store) {
1400 const size_t N =
Lanes(
d);
1401 const size_t num_of_lanes_to_store =
HWY_MIN(max_lanes_to_store, N);
1410#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1411#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1413#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1416template <
class D,
typename T = TFromD<D>>
1421 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1422 buf0[i] = *unaligned++;
1423 buf1[i] = *unaligned++;
1429template <
class D,
typename T = TFromD<D>>
1435 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1436 buf0[i] = *unaligned++;
1437 buf1[i] = *unaligned++;
1438 buf2[i] = *unaligned++;
1445template <
class D,
typename T = TFromD<D>>
1453 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1454 buf0[i] = *unaligned++;
1455 buf1[i] = *unaligned++;
1456 buf2[i] = *unaligned++;
1457 buf3[i] = *unaligned++;
1470 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1471 *unaligned++ = v0.raw[i];
1472 *unaligned++ = v1.raw[i];
1479 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1480 *unaligned++ = v0.raw[i];
1481 *unaligned++ = v1.raw[i];
1482 *unaligned++ = v2.raw[i];
1490 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1491 *unaligned++ = v0.raw[i];
1492 *unaligned++ = v1.raw[i];
1493 *unaligned++ = v2.raw[i];
1494 *unaligned++ = v3.raw[i];
1514template <
class ToT,
class FromT>
1521 constexpr unsigned kMaxExpField =
1522 static_cast<unsigned>(MaxExponentField<FromT>());
1523 constexpr unsigned kExpBias = kMaxExpField >> 1;
1524 constexpr unsigned kMinOutOfRangeExpField =
static_cast<unsigned>(
HWY_MIN(
1525 kExpBias +
sizeof(ToT) * 8 -
static_cast<unsigned>(IsSigned<ToT>()),
1534 const FromT val_to_compare =
1535 static_cast<FromT
>(IsSigned<ToT>() ?
ScalarAbs(val) : val);
1546 return (
static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1547 MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1548 ?
static_cast<ToT
>(val)
1549 :
static_cast<ToT
>(
static_cast<ToTU
>(LimitsMax<ToT>()) +
1553template <
class ToT,
class ToTypeTag,
class FromT>
1555 return ConvertScalarTo<ToT>(val);
1561 return CastValueForF2IConv<ToT>(val);
1567 return CastValueForF2IConv<ToT>(val);
1574template <
class ToT,
class FromT>
1580 constexpr unsigned kMaxExpField =
1581 static_cast<unsigned>(MaxExponentField<FromT>());
1582 constexpr unsigned kExpBias = kMaxExpField >> 1;
1583 constexpr unsigned kMinOutOfRangeExpField =
static_cast<unsigned>(
HWY_MIN(
1584 kExpBias +
sizeof(ToT) * 8 -
static_cast<unsigned>(IsSigned<ToT>()),
1593 const FromT val_to_compare =
1594 static_cast<FromT
>(IsSigned<ToT>() ?
ScalarAbs(val) : val);
1605 return (
static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1606 MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1607 ?
static_cast<ToT
>(val)
1608 :
static_cast<ToT
>(LimitsMin<ToT>());
1613template <
class DTo,
typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)>
1615 static_assert(
sizeof(
TFromD<DTo>) >
sizeof(TFrom),
"Not promoting");
1617 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1619 ret.raw[i] = detail::CastValueForPromoteTo<TFromD<DTo>>(
1620 hwy::TypeTag<TFromD<DTo>>(), from.raw[i]);
1625#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1626#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1628#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1631template <
class D64, HWY_IF_UI64_D(D64)>
1634 for (
size_t i = 0; i <
MaxLanes(d64); ++i) {
1635 ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
1642template <
class D, HWY_IF_F32_D(D)>
1645 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1653 ret.raw[i] =
static_cast<float>(from.raw[i]);
1657template <
class D, HWY_IF_UI32_D(D)>
1660 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
1662 ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
1667template <
class DTo,
typename TFrom,
size_t N,
HWY_IF_SIGNED(TFrom),
1671 static_assert(
sizeof(TTo) <
sizeof(TFrom),
"Not demoting");
1674 for (
size_t i = 0; i < N; ++i) {
1678 ret.raw[i] =
static_cast<TTo
>(from.
raw[i]);
1693#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
1694#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
1695 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
1700 using TTo = TFromD<DTo>;
1701 static_assert(
sizeof(TTo) <
sizeof(TFrom),
"Not demoting");
1703 const auto max =
static_cast<MakeUnsigned<TTo>
>(LimitsMax<TTo>());
1706 for (
size_t i = 0; i < N; ++i) {
1708 ret.raw[i] =
static_cast<TTo
>(
HWY_MIN(from.raw[i], max));
1713template <
class DTo,
typename TFrom,
size_t N,
HWY_IF_UI64(TFrom),
1716 using TTo = TFromD<DTo>;
1717 static_assert(
sizeof(TTo) <
sizeof(TFrom),
"Not demoting");
1720 for (
size_t i = 0; i < N; ++i) {
1723 ret.raw[i] =
static_cast<TTo
>(from.raw[i]);
1728template <
class DBF16, HWY_IF_BF16_D(DBF16),
class VF32>
1730 const Repartition<uint32_t,
decltype(dbf16)> du32;
1731 const VFromD<
decltype(du32)> b_in_lower = ShiftRight<16>(
BitCast(du32, b));
1733 const VFromD<
decltype(du32)> a_mask =
Set(du32, 0xFFFF0000);
1737template <
class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
class V,
1738 HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1739 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1742 const size_t NW =
Lanes(dw);
1744 const TN min = LimitsMin<TN>();
1745 const TN max = LimitsMax<TN>();
1747 for (
size_t i = 0; i < NW; ++i) {
1748 ret.raw[i] =
static_cast<TN
>(
HWY_MIN(
HWY_MAX(min, a.raw[i]), max));
1750 for (
size_t i = 0; i < NW; ++i) {
1751 ret.raw[NW + i] =
static_cast<TN
>(
HWY_MIN(
HWY_MAX(min, b.raw[i]), max));
1761 const size_t NW =
Lanes(dw);
1762 using TN = TFromD<DN>;
1764 const TN_U max =
static_cast<TN_U
>(LimitsMax<TN>());
1766 for (
size_t i = 0; i < NW; ++i) {
1767 ret.raw[i] =
static_cast<TN
>(
HWY_MIN(a.raw[i], max));
1769 for (
size_t i = 0; i < NW; ++i) {
1770 ret.raw[NW + i] =
static_cast<TN
>(
HWY_MIN(b.raw[i], max));
1775template <
class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
class V,
1776 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
1777 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1778 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1787 const size_t NW =
Lanes(dn) / 2;
1790 for (
size_t i = 0; i < NW; ++i) {
1791 ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
1793 for (
size_t i = 0; i < NW; ++i) {
1794 ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
1814template <
class D, HWY_IF_F32_D(D),
size_t N>
1817 for (
size_t i = 0; i < N; ++i) {
1823#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
1824#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
1826#define HWY_NATIVE_DEMOTE_F32_TO_BF16
1829template <
class D, HWY_IF_BF16_D(D),
size_t N>
1832 for (
size_t i = 0; i < N; ++i) {
1838#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1839#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1841#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1844template <
class D32, HWY_IF_UI32_D(D32)>
1847 for (
size_t i = 0; i <
MaxLanes(d32); ++i) {
1848 ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
1856template <
typename TFrom,
typename DTo>
1860 static_assert(
sizeof(ToT) ==
sizeof(TFrom),
"Should have same size");
1864 for (
size_t i = 0; i < N; ++i) {
1866 ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
1871template <
typename TFrom,
typename DTo>
1875 static_assert(
sizeof(ToT) ==
sizeof(TFrom),
"Should have same size");
1878 for (
size_t i = 0; i < N; ++i) {
1880 ret.raw[i] =
static_cast<ToT
>(from.raw[i]);
1887template <
class DTo,
typename TFrom>
1892#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1893#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1895#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1902 for (
size_t i = 0; i <
MaxLanes(di); i++) {
1903 ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
1915template <
class D, HWY_IF_U8_D(D),
size_t N>
1918 for (
size_t i = 0; i < N; ++i) {
1919 ret.raw[i] =
static_cast<uint8_t
>(v.
raw[i] & 0xFF);
1924template <
class D, HWY_IF_U16_D(D),
size_t N>
1927 for (
size_t i = 0; i < N; ++i) {
1928 ret.raw[i] =
static_cast<uint16_t
>(v.raw[i] & 0xFFFF);
1933template <
class D, HWY_IF_U32_D(D),
size_t N>
1936 for (
size_t i = 0; i < N; ++i) {
1937 ret.raw[i] =
static_cast<uint32_t
>(v.raw[i] & 0xFFFFFFFFu);
1942template <
class D, HWY_IF_U8_D(D),
size_t N>
1945 for (
size_t i = 0; i < N; ++i) {
1946 ret.raw[i] =
static_cast<uint8_t
>(v.
raw[i] & 0xFF);
1951template <
class D, HWY_IF_U16_D(D),
size_t N>
1954 for (
size_t i = 0; i < N; ++i) {
1955 ret.raw[i] =
static_cast<uint16_t
>(v.raw[i] & 0xFFFF);
1960template <
class D, HWY_IF_U8_D(D),
size_t N>
1963 for (
size_t i = 0; i < N; ++i) {
1964 ret.raw[i] =
static_cast<uint8_t
>(v.
raw[i] & 0xFF);
1969#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1970#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1972#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1980 const size_t NW =
Lanes(dw);
1981 using TW =
TFromD<
decltype(dw)>;
1982 using TN =
TFromD<
decltype(dn)>;
1984 constexpr TW max_val{LimitsMax<TN>()};
1986 for (
size_t i = 0; i < NW; ++i) {
1987 ret.raw[i] =
static_cast<TN
>(a.raw[i] & max_val);
1989 for (
size_t i = 0; i < NW; ++i) {
1990 ret.raw[NW + i] =
static_cast<TN
>(b.raw[i] & max_val);
1997template <
typename T,
size_t N>
2018 const Half<
decltype(
d)> dh;
2020 CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
2024template <
class D,
class VH = VFromD<Half<D>>>
2026 const Half<
decltype(
d)> dh;
2028 CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
2035 const Half<
decltype(
d)> dh;
2037 CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
2044 const Half<
decltype(
d)> dh;
2053 const Half<
decltype(
d)> dh;
2062 const Half<
decltype(
d)> dh;
2064 CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
2071 const Half<
decltype(
d)> dh;
2073 for (
size_t i = 0; i <
MaxLanes(dh); ++i) {
2074 ret.raw[i] = lo.raw[2 * i];
2076 for (
size_t i = 0; i <
MaxLanes(dh); ++i) {
2077 ret.raw[
MaxLanes(dh) + i] = hi.raw[2 * i];
2084#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
2085#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
2087#define HWY_EMU128_CONCAT_INLINE HWY_API
2092 const Half<
decltype(
d)> dh;
2094 for (
size_t i = 0; i <
MaxLanes(dh); ++i) {
2095 ret.raw[i] = lo.raw[2 * i + 1];
2097 for (
size_t i = 0; i <
MaxLanes(dh); ++i) {
2098 ret.raw[
MaxLanes(dh) + i] = hi.raw[2 * i + 1];
2104template <
int kBytes,
class D>
2108 reinterpret_cast<const uint8_t *
HWY_RESTRICT>(lo.raw);
2111 CopyBytes<
d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
2112 CopyBytes<kBytes>(hi.raw, ret8 +
d.MaxBytes() - kBytes);
2118template <
int kBytes,
class D>
2120 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2124 ZeroBytes<kBytes>(ret8);
2125 CopyBytes<
d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
2129template <
int kBytes,
typename T,
size_t N>
2131 return ShiftLeftBytes<kBytes>(
DFromV<
decltype(v)>(), v);
2136template <
int kLanes,
class D,
typename T = TFromD<D>>
2142template <
int kLanes,
typename T,
size_t N>
2144 return ShiftLeftLanes<kLanes>(
DFromV<
decltype(v)>(), v);
2148template <
int kBytes,
class D>
2150 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2156 CopyBytes<
d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
2157 ZeroBytes<kBytes>(ret8 +
d.MaxBytes() - kBytes);
2162template <
int kLanes,
class D>
2165 constexpr size_t kBytes = kLanes *
sizeof(TFromD<D>);
2171template <
typename T,
size_t N>
2176template <
typename T,
size_t N>
2182template <
typename T,
size_t N>
2187template <
typename T,
size_t N>
2189 for (
size_t i = 0; i < N; i += 2) {
2195template <
typename T,
size_t N>
2197 for (
size_t i = 0; i < N; i += 2) {
2203template <
typename T,
size_t N>
2204HWY_API Vec128<T, N>
OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
2205 for (
size_t i = 0; i < N; i += 2) {
2206 odd.
raw[i] = even.raw[i];
2214 for (
size_t i = 1; i < N; i += 2) {
2215 a.raw[i] = b.raw[i - 1];
2223 for (
size_t i = 1; i < N; i += 2) {
2224 b.raw[i - 1] = a.raw[i];
2229template <
typename T,
size_t N>
2235template <
typename T,
size_t N>
2243template <
typename T,
size_t N>
2248template <
class D,
typename TI,
size_t N>
2250 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index/lane size must match");
2256template <
class D,
typename TI>
2258 D
d,
const TI* idx) {
2262template <
typename T,
size_t N>
2265 for (
size_t i = 0; i < N; ++i) {
2266 ret.raw[i] = v.raw[idx.raw[i]];
2271template <
typename T,
size_t N>
2276 constexpr TI kVecLaneIdxMask =
static_cast<TI
>(N - 1);
2277 for (
size_t i = 0; i < N; ++i) {
2278 const auto src_idx = idx.raw[i];
2279 const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask;
2280 ret.
raw[i] = (src_idx < static_cast<TI>(N)) ? a.
raw[masked_src_lane_idx]
2281 : b.
raw[masked_src_lane_idx];
2297 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2304#ifdef HWY_NATIVE_REVERSE2_8
2305#undef HWY_NATIVE_REVERSE2_8
2307#define HWY_NATIVE_REVERSE2_8
2313 for (
size_t i = 0; i <
MaxLanes(
d); i += 2) {
2314 ret.raw[i + 0] = v.raw[i + 1];
2315 ret.raw[i + 1] = v.raw[i + 0];
2323 for (
size_t i = 0; i <
MaxLanes(
d); i += 4) {
2324 ret.raw[i + 0] = v.raw[i + 3];
2325 ret.raw[i + 1] = v.raw[i + 2];
2326 ret.raw[i + 2] = v.raw[i + 1];
2327 ret.raw[i + 3] = v.raw[i + 0];
2335 for (
size_t i = 0; i <
MaxLanes(
d); i += 8) {
2336 ret.raw[i + 0] = v.raw[i + 7];
2337 ret.raw[i + 1] = v.raw[i + 6];
2338 ret.raw[i + 2] = v.raw[i + 5];
2339 ret.raw[i + 3] = v.raw[i + 4];
2340 ret.raw[i + 4] = v.raw[i + 3];
2341 ret.raw[i + 5] = v.raw[i + 2];
2342 ret.raw[i + 6] = v.raw[i + 1];
2343 ret.raw[i + 7] = v.raw[i + 0];
2354 const size_t clamped_amt =
HWY_MIN(amt, N);
2356 (N - clamped_amt) *
sizeof(TFromD<D>));
2366 const size_t clamped_amt =
HWY_MIN(amt, N);
2368 (N - clamped_amt) *
sizeof(TFromD<D>));
2377template <
typename T,
size_t N>
2379 static_assert(
sizeof(T) == 4,
"Only for 32-bit");
2380 static_assert(N == 2 || N == 4,
"Does not make sense for N=1");
2385template <
typename T>
2387 static_assert(
sizeof(T) == 4,
"Only for 32-bit");
2389 ret.raw[3] = v.raw[1];
2390 ret.raw[2] = v.raw[0];
2391 ret.raw[1] = v.raw[3];
2392 ret.raw[0] = v.raw[2];
2395template <
typename T>
2397 static_assert(
sizeof(T) == 8,
"Only for 64-bit");
2402template <
typename T>
2405 ret.raw[3] = v.raw[0];
2406 ret.raw[2] = v.raw[3];
2407 ret.raw[1] = v.raw[2];
2408 ret.raw[0] = v.raw[1];
2413template <
typename T>
2416 ret.raw[3] = v.raw[2];
2417 ret.raw[2] = v.raw[1];
2418 ret.raw[1] = v.raw[0];
2419 ret.raw[0] = v.raw[3];
2423template <
typename T>
2429template <
int kLane,
typename T,
size_t N>
2431 for (
size_t i = 0; i < N; ++i) {
2439template <
typename T,
size_t N,
typename TI,
size_t NI>
2445 reinterpret_cast<const uint8_t*
>(
indices.raw);
2449 for (
size_t i = 0; i < NI *
sizeof(TI); ++i) {
2450 const size_t idx = idx_bytes[i];
2452 ret_bytes[i] = idx <
sizeof(T) * N ? v_bytes[idx] : 0;
2457template <
typename T,
size_t N,
typename TI,
size_t NI>
2466template <
typename T,
size_t N>
2469 for (
size_t i = 0; i < N / 2; ++i) {
2470 ret.
raw[2 * i + 0] = a.
raw[i];
2471 ret.
raw[2 * i + 1] = b.
raw[i];
2484 const Half<
decltype(
d)> dh;
2486 for (
size_t i = 0; i <
MaxLanes(dh); ++i) {
2487 ret.raw[2 * i + 0] = a.raw[
MaxLanes(dh) + i];
2488 ret.raw[2 * i + 1] = b.raw[
MaxLanes(dh) + i];
2497template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2501template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2506template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2515 typename MFromD<D>::Raw or_sum = 0;
2516 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2517 or_sum |= mask.bits[i];
2524 constexpr uint64_t kAll = LimitsMax<typename MFromD<D>::Raw>();
2525 uint64_t and_sum = kAll;
2526 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2527 and_sum &= mask.bits[i];
2529 return and_sum == kAll;
2536 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2537 const size_t bit =
size_t{1} << (i & 7);
2538 const size_t idx_byte = i >> 3;
2547 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2548 m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
2558 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2559 const size_t bit =
size_t{1} << (i & 7);
2560 const size_t idx_byte = i >> 3;
2562 bits[idx_byte] =
static_cast<uint8_t
>(bits[idx_byte] | bit);
2571 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2572 count += mask.bits[i] != 0;
2579 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2580 if (mask.bits[i] != 0)
return i;
2588 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2589 if (mask.bits[i] != 0)
return static_cast<intptr_t
>(i);
2591 return intptr_t{-1};
2596 for (intptr_t i =
static_cast<intptr_t
>(
MaxLanes(
d) - 1); i >= 0; i--) {
2597 if (mask.bits[i] != 0)
return static_cast<size_t>(i);
2605 for (intptr_t i =
static_cast<intptr_t
>(
MaxLanes(
d) - 1); i >= 0; i--) {
2606 if (mask.bits[i] != 0)
return i;
2608 return intptr_t{-1};
// Trait: whether Compress produces a partition (kept lanes followed by the
// rejected lanes) for lane type T. True for all lane sizes except 1 byte.
template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) == 1) ? 0 : 1 };
};
2618template <
typename T,
size_t N>
2622 for (
size_t i = 0; i < N; ++i) {
2624 ret.
raw[count++] = v.
raw[i];
2627 for (
size_t i = 0; i < N; ++i) {
2628 if (!mask.
bits[i]) {
2629 ret.
raw[count++] = v.
raw[i];
2640#ifdef HWY_NATIVE_EXPAND
2641#undef HWY_NATIVE_EXPAND
2643#define HWY_NATIVE_EXPAND
2646template <
typename T,
size_t N>
2650 for (
size_t i = 0; i < N; ++i) {
2652 ret.
raw[i] = v.
raw[in_pos++];
2654 ret.
raw[i] = ConvertScalarTo<T>(0);
2667 for (
size_t i = 0; i <
Lanes(
d); ++i) {
2669 ret.
raw[i] = unaligned[in_pos++];
2671 ret.raw[i] = TFromD<D>();
2678template <
typename T,
size_t N>
2682 for (
size_t i = 0; i < N; ++i) {
2683 if (!mask.
bits[i]) {
2684 ret.
raw[count++] = v.
raw[i];
2687 for (
size_t i = 0; i < N; ++i) {
2689 ret.
raw[count++] = v.
raw[i];
2698 Mask128<uint64_t> ) {
2703template <
typename T,
size_t N>
2712template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2716 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2718 unaligned[count++] = v.raw[i];
2725template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2732template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2746template <
class T,
size_t N, HWY_IF_LANES_GT(N, 1)>
2751 TU result_lane_mask{0};
2752 for (
size_t i = 0; i < N; i++) {
2753 result_lane_mask =
static_cast<TU
>(result_lane_mask | mask.
bits[i]);
2754 result.
bits[i] = result_lane_mask;
2759template <
class T,
size_t N>
2764template <
class T,
size_t N>
2770 TU result_lane_mask =
static_cast<TU
>(~TU{0});
2771 for (
size_t i = 0; i < N; i++) {
2772 const auto curr_lane_mask_bits = mask.
bits[i];
2773 result.
bits[i] =
static_cast<TU
>(curr_lane_mask_bits & result_lane_mask);
2775 static_cast<TU
>(result_lane_mask &
2776 static_cast<TU
>(-
static_cast<TI
>(mask.
bits[i] == 0)));
2781template <
class T,
size_t N>
2787 TU result_lane_mask =
static_cast<TU
>(~TU{0});
2788 for (
size_t i = 0; i < N; i++) {
2789 result.
bits[i] = result_lane_mask;
2791 static_cast<TU
>(result_lane_mask &
2792 static_cast<TU
>(-
static_cast<TI
>(mask.
bits[i] == 0)));
2799template <
class D, HWY_IF_F32_D(D),
class VBF16>
2801 const Rebind<uint32_t,
decltype(df32)> du32;
2802 using VU32 =
VFromD<
decltype(du32)>;
2803 const VU32 odd =
Set(du32, 0xFFFF0000u);
2805 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
2807 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
2813template <
class D, HWY_IF_I32_D(D),
class VI16>
2815 using VI32 =
VFromD<
decltype(d32)>;
2817 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, a)));
2818 const VI32 be = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, b)));
2819 const VI32 ao = ShiftRight<16>(
BitCast(d32, a));
2820 const VI32 bo = ShiftRight<16>(
BitCast(d32, b));
2824template <
class D, HWY_IF_U32_D(D),
class VU16>
2826 const auto lo16_mask =
Set(du32, 0x0000FFFFu);
2828 const auto a0 =
And(
BitCast(du32, a), lo16_mask);
2829 const auto b0 =
And(
BitCast(du32, b), lo16_mask);
2831 const auto a1 = ShiftRight<16>(
BitCast(du32, a));
2832 const auto b1 = ShiftRight<16>(
BitCast(du32, b));
2839template <
class D, HWY_IF_F32_D(D),
size_t N,
class VBF16>
2843 const Rebind<uint32_t,
decltype(df32)> du32;
2844 using VU32 =
VFromD<
decltype(du32)>;
2845 const VU32 odd =
Set(du32, 0xFFFF0000u);
2847 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
2849 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
2855template <
class D, HWY_IF_I32_D(D),
size_t N,
class VI16>
2859 using VI32 =
VFromD<
decltype(d32)>;
2861 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, a)));
2862 const VI32 be = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, b)));
2863 const VI32 ao = ShiftRight<16>(
BitCast(d32, a));
2864 const VI32 bo = ShiftRight<16>(
BitCast(d32, b));
2865 sum1 =
Add(
Mul(ao, bo), sum1);
2866 return Add(
Mul(ae, be), sum0);
2869template <
class D, HWY_IF_U32_D(D),
size_t N,
class VU16>
2873 using VU32 =
VFromD<
decltype(du32)>;
2874 const VU32 lo16_mask =
Set(du32, uint32_t{0x0000FFFFu});
2875 const VU32 ae =
And(
BitCast(du32, a), lo16_mask);
2876 const VU32 be =
And(
BitCast(du32, b), lo16_mask);
2877 const VU32 ao = ShiftRight<16>(
BitCast(du32, a));
2878 const VU32 bo = ShiftRight<16>(
BitCast(du32, b));
2879 sum1 =
Add(
Mul(ao, bo), sum1);
2880 return Add(
Mul(ae, be), sum0);
2886 return Add(sum0, sum1);
2891#ifdef HWY_NATIVE_REDUCE_SCALAR
2892#undef HWY_NATIVE_REDUCE_SCALAR
2894#define HWY_NATIVE_REDUCE_SCALAR
2897template <
class D,
typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2900 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2905template <
class D,
typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2907 T min = HighestValue<T>();
2908 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2913template <
class D,
typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2915 T max = LowestValue<T>();
2916 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2924template <
class D, HWY_IF_LANES_GT_D(D, 1)>
2928template <
class D, HWY_IF_LANES_GT_D(D, 1)>
2932template <
class D, HWY_IF_LANES_GT_D(D, 1)>
2941template <
class T, HWY_IF_UI64(T)>
2943 alignas(16) T mul[2];
2948template <
class T, HWY_IF_UI64(T)>
2950 alignas(16) T mul[2];
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
#define HWY_IF_SIGNED(T)
Definition base.h:622
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
Definition base.h:645
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
Definition base.h:635
#define HWY_IF_UNSIGNED(T)
Definition base.h:620
#define HWY_IF_UI64(T)
Definition base.h:687
Definition arm_neon-inl.h:865
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:867
static HWY_INLINE Raw FromBool(bool b)
Definition emu128-inl.h:80
Raw bits[16/sizeof(T)]
Definition emu128-inl.h:85
Definition arm_neon-inl.h:813
HWY_INLINE Vec128()=default
HWY_INLINE Vec128 & operator%=(const Vec128 other)
Definition emu128-inl.h:55
T PrivateT
Definition arm_neon-inl.h:816
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition emu128-inl.h:46
Raw raw
Definition arm_neon-inl.h:851
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition emu128-inl.h:52
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition emu128-inl.h:64
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition emu128-inl.h:61
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition emu128-inl.h:43
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition emu128-inl.h:58
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition emu128-inl.h:49
#define HWY_EMU128_CONCAT_INLINE
Definition emu128-inl.h:2087
HWY_API Vec128< T, N > Neg(hwy::NonFloatTag, Vec128< T, N > v)
Definition emu128-inl.h:744
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag, FromT val)
Definition emu128-inl.h:1554
HWY_API VFromD< DTo > ConvertTo(hwy::FloatTag, DTo, Vec128< TFrom, HWY_MAX_LANES_D(DTo)> from)
Definition emu128-inl.h:1857
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:689
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:560
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:681
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:570
HWY_INLINE VFromD< DTo > ZeroExtendResizeBitCast(FromSizeTag, ToSizeTag, DTo d_to, DFrom, VFromD< DFrom > v)
Definition emu128-inl.h:140
HWY_INLINE ToT CastValueForF2IConv(FromT val)
Definition emu128-inl.h:1515
HWY_INLINE Vec128< T, N > Mul(hwy::FloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:774
static HWY_INLINE float ScalarSqrt(float v)
Definition emu128-inl.h:959
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1556
HWY_INLINE void StoreU16ToF16(const uint16_t val, hwy::float16_t *HWY_RESTRICT to)
Definition emu128-inl.h:1801
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t *HWY_RESTRICT from)
Definition emu128-inl.h:1806
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val)
Definition emu128-inl.h:1575
HWY_API void LoadInterleaved4(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2, VFromD< D > &v3)
Definition arm_neon-inl.h:9128
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API VFromD< D > LoadNOr(VFromD< D > no, D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1362
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API void StoreN(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_store)
Definition emu128-inl.h:1398
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API TFromD< D > ReduceMax(D d, VFromD< D > v)
Definition arm_sve-inl.h:3213
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< DI > ConvertInRangeTo(DI di, VFromD< RebindToFloat< DI > > v)
Definition emu128-inl.h:1900
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API VFromD< D > PromoteInRangeTo(D d64, VFromD< Rebind< float, D > > v)
Definition arm_neon-inl.h:4497
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API VFromD< DN > OrderedTruncate2To(DN dn, V a, V b)
Definition emu128-inl.h:1978
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API VFromD< D32 > DemoteInRangeTo(D32 d32, VFromD< Rebind< double, D32 > > v)
Definition emu128-inl.h:1845
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
VFromD< ScalableTag< bfloat16_t > > VBF16
Definition arm_sve-inl.h:410
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue< float >()
Definition base.h:2203
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment)
Definition base.h:2676
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf)
Definition base.h:1778
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:346
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val)
Definition base.h:2873
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val)
Definition base.h:2822
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val)
Definition base.h:2829
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue< float >()
Definition base.h:2224
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f)
Definition base.h:1817
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef< T > ScalarAbs(T val)
Definition base.h:2815
HWY_API constexpr RemoveCvRef< T > ScalarShr(T val, int shift_amt)
Definition base.h:2528
HWY_API constexpr T LimitsMax()
Definition base.h:2174
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_SPECIAL_FLOAT_D(D)
Definition ops/shared-inl.h:540
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
@ value
Definition arm_neon-inl.h:8429
Definition arm_neon-inl.h:5654
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
Definition ops/shared-inl.h:198
int VFromD
Definition tuple-inl.h:25