26#if HWY_COMPILER_GCC_ACTUAL
29 ignored
"-Wmaybe-uninitialized")
38#ifndef HWY_DISABLE_PCLMUL_AES
43#include
"hwy/ops/shared-inl.h"
52#define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
54#define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
57#undef HWY_AVX3_HAVE_F32_TO_BF16C
58#if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \
59 (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \
60 !defined(HWY_AVX3_DISABLE_AVX512BF16)
61#define HWY_AVX3_HAVE_F32_TO_BF16C 1
63#define HWY_AVX3_HAVE_F32_TO_BF16C 0
87template <
typename T,
size_t N = 16 /
sizeof(T)>
93 static constexpr size_t kPrivateN = N;
98 return *
this = (*
this * other);
101 return *
this = (*
this / other);
104 return *
this = (*
this + other);
107 return *
this = (*
this - other);
110 return *
this = (*
this % other);
113 return *
this = (*
this & other);
116 return *
this = (*
this | other);
119 return *
this = (*
this ^ other);
126using Vec64 = Vec128<T, 8 /
sizeof(T)>;
129using Vec32 = Vec128<T, 4 /
sizeof(T)>;
132using Vec16 = Vec128<T, 2 /
sizeof(T)>;
134#if HWY_TARGET <= HWY_AVX3
139template <
size_t size>
160template <
typename T,
size_t N = 16 /
sizeof(T)>
174template <
typename T,
size_t N = 16 /
sizeof(T)>
184template <
typename T,
size_t N>
185constexpr uint64_t OnlyActive(uint64_t mask_bits) {
186 return ((N *
sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
191#if HWY_TARGET <= HWY_AVX3
195template <
typename T,
size_t N>
197 return OnlyActive<T, N>(mask.raw);
204using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
207using TFromV =
typename V::PrivateT;
212template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
217template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
222template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
226template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
230template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
254#if HWY_AVX3_HAVE_F32_TO_BF16C
261#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
263 return reinterpret_cast<__m128i
>(v);
268 return BitCastScalar<__m128i>(v);
273template <
typename T,
size_t N>
280struct BitCastFromInteger128 {
290struct BitCastFromInteger128<float> {
294struct BitCastFromInteger128<double> {
298template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
300 Vec128<uint8_t, D().MaxBytes()> v) {
306template <
class D,
typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
314template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
316 return VFromD<D>{_mm_set1_epi8(
static_cast<char>(t))};
318template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
320 return VFromD<D>{_mm_set1_epi16(
static_cast<short>(t))};
322template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
324 return VFromD<D>{_mm_set1_epi32(
static_cast<int>(t))};
326template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
328 return VFromD<D>{_mm_set1_epi64x(
static_cast<long long>(t))};
331template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
336template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
340template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
346template <
class D, HWY_X86_IF_EMULATED_D(D)>
349 static_assert(
sizeof(TFromD<D>) == 2,
"Expecting [b]f16");
351 CopyBytes<2>(&t, &bits);
368template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
373template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
377template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
381template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
390template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
392 return static_cast<T
>(_mm_cvtsi128_si32(v.
raw) & 0xFF);
394template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
398 const uint16_t bits =
399 static_cast<uint16_t
>(_mm_cvtsi128_si32(
BitCast(du, v).raw) & 0xFFFF);
400 return BitCastScalar<T>(bits);
402template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 4)>
404 return static_cast<T
>(_mm_cvtsi128_si32(v.raw));
408 return _mm_cvtss_f32(v.
raw);
410template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 8)>
414 alignas(16) T lanes[2];
418 return static_cast<T
>(_mm_cvtsi128_si64(v.raw));
423 return _mm_cvtsd_f64(v.
raw);
437template <
class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
439 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
440 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
441 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
442 TFromD<D> t11, TFromD<D> t12,
443 TFromD<D> t13, TFromD<D> t14,
446 static_cast<char>(t0),
static_cast<char>(t1),
static_cast<char>(t2),
447 static_cast<char>(t3),
static_cast<char>(t4),
static_cast<char>(t5),
448 static_cast<char>(t6),
static_cast<char>(t7),
static_cast<char>(t8),
449 static_cast<char>(t9),
static_cast<char>(t10),
static_cast<char>(t11),
450 static_cast<char>(t12),
static_cast<char>(t13),
static_cast<char>(t14),
451 static_cast<char>(t15))};
454template <
class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
456 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
457 TFromD<D> t5, TFromD<D> t6,
460 _mm_setr_epi16(
static_cast<int16_t
>(t0),
static_cast<int16_t
>(t1),
461 static_cast<int16_t
>(t2),
static_cast<int16_t
>(t3),
462 static_cast<int16_t
>(t4),
static_cast<int16_t
>(t5),
463 static_cast<int16_t
>(t6),
static_cast<int16_t
>(t7))};
467template <
class D, HWY_IF_BF16_D(D)>
469 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
470 TFromD<D> t5, TFromD<D> t6,
475 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
476 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
477 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
478 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
482template <
class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
484 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
485 TFromD<D> t5, TFromD<D> t6,
487 return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
491template <
class D, HWY_IF_F16_D(D)>
493 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
494 TFromD<D> t5, TFromD<D> t6,
499 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
500 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
501 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
502 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
506template <
class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
508 TFromD<D> t2, TFromD<D> t3) {
510 _mm_setr_epi32(
static_cast<int32_t
>(t0),
static_cast<int32_t
>(t1),
511 static_cast<int32_t
>(t2),
static_cast<int32_t
>(t3))};
514template <
class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
516 TFromD<D> t2, TFromD<D> t3) {
517 return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
520template <
class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
525 _mm_set_epi64x(
static_cast<int64_t
>(t1),
static_cast<int64_t
>(t0))};
528template <
class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
537template <
typename T,
size_t N>
538HWY_API Vec128<T, N>
And(Vec128<T, N> a, Vec128<T, N> b) {
556template <
typename T,
size_t N>
557HWY_API Vec128<T, N>
AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
558 const DFromV<
decltype(mask)>
d;
576template <
typename T,
size_t N>
577HWY_API Vec128<T, N>
Or(Vec128<T, N> a, Vec128<T, N> b) {
595template <
typename T,
size_t N>
596HWY_API Vec128<T, N>
Xor(Vec128<T, N> a, Vec128<T, N> b) {
613template <
typename T,
size_t N>
614HWY_API Vec128<T, N>
Not(
const Vec128<T, N> v) {
617 using VU =
VFromD<
decltype(du)>;
618#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
620 return BitCast(
d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
627template <
typename T,
size_t N>
628HWY_API Vec128<T, N>
Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
629#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
632 using VU =
VFromD<
decltype(du)>;
633 const __m128i ret = _mm_ternarylogic_epi64(
637 return Xor(x1,
Xor(x2, x3));
642template <
typename T,
size_t N>
643HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
644#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
647 using VU =
VFromD<
decltype(du)>;
648 const __m128i ret = _mm_ternarylogic_epi64(
652 return Or(o1,
Or(o2, o3));
657template <
typename T,
size_t N>
658HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
659#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
662 using VU =
VFromD<
decltype(du)>;
663 const __m128i ret = _mm_ternarylogic_epi64(
667 return Or(o,
And(a1, a2));
672template <
typename T,
size_t N>
675#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
678 using VU =
VFromD<
decltype(du)>;
680 d, VU{_mm_ternarylogic_epi64(
BitCast(du, mask).raw,
BitCast(du, yes).raw,
688#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
690#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
691#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
693#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
705template <
typename T,
size_t N>
710template <
typename T,
size_t N>
715template <
typename T,
size_t N>
723#if HWY_TARGET <= HWY_AVX3_DL
725#ifdef HWY_NATIVE_POPCNT
726#undef HWY_NATIVE_POPCNT
728#define HWY_NATIVE_POPCNT
733template <
typename T,
size_t N>
738template <
typename T,
size_t N>
743template <
typename T,
size_t N>
748template <
typename T,
size_t N>
756template <
typename T,
size_t N>
770template <
typename T,
size_t N>
775template <
typename T,
size_t N>
780template <
typename T,
size_t N>
787template <
typename T,
size_t N>
794template <
class V, HWY_IF_FLOAT(TFromV<V>)>
798 using TI =
TFromD<
decltype(di)>;
799 return v &
BitCast(
d,
Set(di,
static_cast<TI
>(~SignMask<TI>())));
806 static_assert(IsFloat<TFromV<V>>(),
"Only makes sense for floating-point");
808 const DFromV<
decltype(magn)>
d;
833#if HWY_TARGET <= HWY_AVX3
838template <
typename T,
size_t N>
843template <
typename T,
size_t N>
848template <
typename T,
size_t N>
853template <
typename T,
size_t N>
861template <
typename T,
size_t N>
889#ifdef HWY_NATIVE_MASK_FALSE
890#undef HWY_NATIVE_MASK_FALSE
892#define HWY_NATIVE_MASK_FALSE
898 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(0)};
902#ifdef HWY_NATIVE_IS_NEGATIVE
903#undef HWY_NATIVE_IS_NEGATIVE
905#define HWY_NATIVE_IS_NEGATIVE
909template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
916#ifdef HWY_NATIVE_PROMOTE_MASK_TO
917#undef HWY_NATIVE_PROMOTE_MASK_TO
919#define HWY_NATIVE_PROMOTE_MASK_TO
923template <
class DTo,
class DFrom,
925 class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
934#ifdef HWY_NATIVE_DEMOTE_MASK_TO
935#undef HWY_NATIVE_DEMOTE_MASK_TO
937#define HWY_NATIVE_DEMOTE_MASK_TO
941template <
class DTo,
class DFrom,
943 class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
952#ifdef HWY_NATIVE_COMBINE_MASKS
953#undef HWY_NATIVE_COMBINE_MASKS
955#define HWY_NATIVE_COMBINE_MASKS
958template <
class D, HWY_IF_LANES_D(D, 2)>
961#if HWY_COMPILER_HAS_MASK_INTRINSICS
962 const __mmask8 combined_mask = _kor_mask8(
963 _kshiftli_mask8(
static_cast<__mmask8
>(hi.raw), 1),
964 _kand_mask8(
static_cast<__mmask8
>(lo.raw),
static_cast<__mmask8
>(1)));
966 const auto combined_mask =
967 (
static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
973template <
class D, HWY_IF_LANES_D(D, 4)>
976#if HWY_COMPILER_HAS_MASK_INTRINSICS
977 const __mmask8 combined_mask = _kor_mask8(
978 _kshiftli_mask8(
static_cast<__mmask8
>(hi.raw), 2),
979 _kand_mask8(
static_cast<__mmask8
>(lo.raw),
static_cast<__mmask8
>(3)));
981 const auto combined_mask =
982 (
static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
985 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(combined_mask)};
988template <
class D, HWY_IF_LANES_D(D, 8)>
991#if HWY_COMPILER_HAS_MASK_INTRINSICS
992 const __mmask8 combined_mask = _kor_mask8(
993 _kshiftli_mask8(
static_cast<__mmask8
>(hi.raw), 4),
994 _kand_mask8(
static_cast<__mmask8
>(lo.raw),
static_cast<__mmask8
>(15)));
996 const auto combined_mask =
997 (
static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
1000 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(combined_mask)};
1003template <
class D, HWY_IF_LANES_D(D, 16)>
1006#if HWY_COMPILER_HAS_MASK_INTRINSICS
1007 const __mmask16 combined_mask = _mm512_kunpackb(
1008 static_cast<__mmask16
>(hi.raw),
static_cast<__mmask16
>(lo.raw));
1010 const auto combined_mask =
1011 ((
static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
1014 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(combined_mask)};
1019#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
1020#undef HWY_NATIVE_LOWER_HALF_OF_MASK
1022#define HWY_NATIVE_LOWER_HALF_OF_MASK
1030 constexpr size_t kNumOfBitsInRawMask =
sizeof(RawM) * 8;
1032 MFromD<D> result_mask{
static_cast<RawM
>(
m.raw)};
1034 if (kN < kNumOfBitsInRawMask) {
1036 And(result_mask,
MFromD<D>{
static_cast<RawM
>((1ULL << kN) - 1)});
1044#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
1045#undef HWY_NATIVE_UPPER_HALF_OF_MASK
1047#define HWY_NATIVE_UPPER_HALF_OF_MASK
1050template <
class D, HWY_IF_LANES_D(D, 1)>
1052#if HWY_COMPILER_HAS_MASK_INTRINSICS
1053 const auto shifted_mask = _kshiftri_mask8(
static_cast<__mmask8
>(
m.raw), 1);
1055 const auto shifted_mask =
static_cast<unsigned>(
m.raw) >> 1;
1061template <
class D, HWY_IF_LANES_D(D, 2)>
1063#if HWY_COMPILER_HAS_MASK_INTRINSICS
1064 const auto shifted_mask = _kshiftri_mask8(
static_cast<__mmask8
>(
m.raw), 2);
1066 const auto shifted_mask =
static_cast<unsigned>(
m.raw) >> 2;
1069 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(shifted_mask)};
1072template <
class D, HWY_IF_LANES_D(D, 4)>
1074#if HWY_COMPILER_HAS_MASK_INTRINSICS
1075 const auto shifted_mask = _kshiftri_mask8(
static_cast<__mmask8
>(
m.raw), 4);
1077 const auto shifted_mask =
static_cast<unsigned>(
m.raw) >> 4;
1080 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(shifted_mask)};
1083template <
class D, HWY_IF_LANES_D(D, 8)>
1085#if HWY_COMPILER_HAS_MASK_INTRINSICS
1086 const auto shifted_mask = _kshiftri_mask16(
static_cast<__mmask16
>(
m.raw), 8);
1088 const auto shifted_mask =
static_cast<unsigned>(
m.raw) >> 8;
1091 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(shifted_mask)};
1096#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
1097#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
1099#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
1103template <
class DTo,
class DFrom,
1105 class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
1110 using RawMH =
decltype(MH().raw);
1112 return CombineMasks(d_to, MH{
static_cast<RawMH
>(b.raw)},
1113 MH{
static_cast<RawMH
>(a.raw)});
1117#ifdef HWY_NATIVE_SLIDE_MASK
1118#undef HWY_NATIVE_SLIDE_MASK
1120#define HWY_NATIVE_SLIDE_MASK
1123template <
class D, HWY_IF_LANES_LE_D(D, 8)>
1127 constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
1129#if HWY_COMPILER_HAS_MASK_INTRINSICS
1131 static_cast<RawM
>(_kshiftli_mask8(
static_cast<__mmask8
>(
m.raw), 1))};
1135 And(result_mask,
MFromD<D>{
static_cast<RawM
>(kValidLanesMask)});
1139 static_cast<RawM
>((
static_cast<unsigned>(
m.raw) << 1) & kValidLanesMask)};
1145template <
class D, HWY_IF_LANES_D(D, 16)>
1147 using RawM =
decltype(MFromD<D>().raw);
1148#if HWY_COMPILER_HAS_MASK_INTRINSICS
1150 static_cast<RawM
>(_kshiftli_mask16(
static_cast<__mmask16
>(
m.raw), 1))};
1152 return MFromD<D>{
static_cast<RawM
>(
static_cast<unsigned>(
m.raw) << 1)};
1156template <
class D, HWY_IF_LANES_LE_D(D, 8)>
1160 constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
1162#if HWY_COMPILER_HAS_MASK_INTRINSICS
1168 static_cast<RawM
>(_kshiftri_mask8(
static_cast<__mmask8
>(
m.raw), 1))};
1171 static_cast<RawM
>((
static_cast<unsigned>(
m.raw) & kValidLanesMask) >> 1)};
1175template <
class D, HWY_IF_LANES_D(D, 16)>
1177 using RawM =
decltype(MFromD<D>().raw);
1178#if HWY_COMPILER_HAS_MASK_INTRINSICS
1180 static_cast<RawM
>(_kshiftri_mask16(
static_cast<__mmask16
>(
m.raw), 1))};
1183 static_cast<RawM
>((
static_cast<unsigned>(
m.raw) & 0xFFFFu) >> 1)};
1192 constexpr uint64_t kValidLanesMask =
1193 static_cast<uint64_t
>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
1196 (
static_cast<uint64_t
>(
m.raw) << (amt & 63)) & kValidLanesMask)};
1204 constexpr uint64_t kValidLanesMask =
1205 static_cast<uint64_t
>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
1208 (
static_cast<uint64_t
>(
m.raw) & kValidLanesMask) >> (amt & 63))};
1213template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
1218template <
typename T,
size_t N, HWY_IF_UI16(T)>
1220 return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1223template <
typename T,
size_t N, HWY_IF_UI32(T)>
1225 return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1228template <
typename T,
size_t N, HWY_IF_UI64(T)>
1230 return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1236 return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
1258template <
typename TFrom,
size_t NFrom,
class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
1260 static_assert(
sizeof(TFrom) ==
sizeof(
TFromD<DTo>),
"Must have same size");
1268template <
typename T,
size_t N>
1274template <
typename T,
size_t N>
1280template <
typename T,
size_t N>
1286template <
typename T,
size_t N>
1295template <
typename T,
size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1304 Vec128<float16_t, N> yes,
1305 Vec128<float16_t, N> no) {
1306 return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)};
1311template <
class V,
class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
1333template <
typename T,
size_t N>
1338template <
typename T,
size_t N>
1343template <
typename T,
size_t N>
1348template <
typename T,
size_t N>
1356template <
typename T,
size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1374template <
class V,
class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
1382template <
typename T,
size_t N>
1388template <
typename T,
size_t N>
1393template <
typename T,
size_t N>
1398template <
typename T,
size_t N>
1406template <
typename T,
size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1424template <
class V,
class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
1433#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
1434#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
1435 HWY_COMPILER_CLANG >= 800
1436#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
1438#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
1444template <
typename T,
size_t N>
1447#if HWY_COMPILER_HAS_MASK_INTRINSICS
1453template <
typename T,
size_t N>
1456#if HWY_COMPILER_HAS_MASK_INTRINSICS
1462template <
typename T,
size_t N>
1465#if HWY_COMPILER_HAS_MASK_INTRINSICS
1471template <
typename T,
size_t N>
1474#if HWY_COMPILER_HAS_MASK_INTRINSICS
1481template <
typename T,
size_t N>
1484#if HWY_COMPILER_HAS_MASK_INTRINSICS
1490template <
typename T,
size_t N>
1493#if HWY_COMPILER_HAS_MASK_INTRINSICS
1499template <
typename T,
size_t N>
1502#if HWY_COMPILER_HAS_MASK_INTRINSICS
1508template <
typename T,
size_t N>
1511#if HWY_COMPILER_HAS_MASK_INTRINSICS
1518template <
typename T,
size_t N>
1521#if HWY_COMPILER_HAS_MASK_INTRINSICS
1527template <
typename T,
size_t N>
1530#if HWY_COMPILER_HAS_MASK_INTRINSICS
1536template <
typename T,
size_t N>
1539#if HWY_COMPILER_HAS_MASK_INTRINSICS
1545template <
typename T,
size_t N>
1548#if HWY_COMPILER_HAS_MASK_INTRINSICS
1555template <
typename T,
size_t N>
1558#if HWY_COMPILER_HAS_MASK_INTRINSICS
1564template <
typename T,
size_t N>
1567#if HWY_COMPILER_HAS_MASK_INTRINSICS
1573template <
typename T,
size_t N>
1576#if HWY_COMPILER_HAS_MASK_INTRINSICS
1582template <
typename T,
size_t N>
1585#if HWY_COMPILER_HAS_MASK_INTRINSICS
1592template <
typename T,
size_t N>
1596#if HWY_COMPILER_HAS_MASK_INTRINSICS
1602template <
typename T,
size_t N>
1606#if HWY_COMPILER_HAS_MASK_INTRINSICS
1612template <
typename T,
size_t N>
1616#if HWY_COMPILER_HAS_MASK_INTRINSICS
1622template <
typename T,
size_t N>
1626#if HWY_COMPILER_HAS_MASK_INTRINSICS
1634template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
1636#if HWY_COMPILER_HAS_MASK_INTRINSICS
1637 return Mask128<T, N>{
static_cast<__mmask16
>(_knot_mask16(
m.raw))};
1643template <
typename T,
size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
1645#if HWY_COMPILER_HAS_MASK_INTRINSICS
1648 return Mask128<T, N>{
static_cast<__mmask8
>(~m.raw)};
1652template <
typename T>
1657template <
typename T,
size_t N, HWY_IF_LANES_LE(N, 8)>
1665template <
typename T>
1670template <
typename T,
size_t N, HWY_IF_LANES_LE(N, 4)>
1678template <
typename T,
size_t N>
1686template <
typename T,
size_t N>
1697template <
typename T,
size_t N>
1698HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
1702template <
typename T,
size_t N>
1703HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
1707template <
typename T,
size_t N>
1708HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
1712template <
typename T,
size_t N>
1713HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
1717template <
typename T,
size_t N>
1723template <
typename T,
size_t N>
1733template <
typename T,
size_t N>
1735 return Mask128<T, N>{v.raw};
1741template <
typename T,
size_t N>
1743 return Vec128<T, N>{v.raw};
1752#if HWY_TARGET >= HWY_SSSE3
1755template <
typename T,
size_t N>
1765template <
typename T,
size_t N>
1768 return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
1772 Vec128<float, N> yes, Vec128<float, N> no) {
1773 return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
1777 Vec128<double, N> yes,
1778 Vec128<double, N> no) {
1779 return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
1785template <
typename T,
size_t N>
1791template <
typename T,
size_t N>
1798template <
typename T,
size_t N>
1800 const Simd<T, N, 0>
d;
1804template <
typename T,
size_t N>
1805HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
1806 const Simd<T, N, 0>
d;
1810template <
typename T,
size_t N>
1811HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
1812 const Simd<T, N, 0>
d;
1816template <
typename T,
size_t N>
1817HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
1818 const Simd<T, N, 0>
d;
1822template <
typename T,
size_t N>
1823HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
1824 const Simd<T, N, 0>
d;
1828template <
typename T,
size_t N>
1830 const Simd<T, N, 0>
d;
1838template <
int kBits,
size_t N>
1840 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1843template <
int kBits,
size_t N>
1845 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1848template <
int kBits,
size_t N>
1850 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1853template <
int kBits,
size_t N>
1855 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1857template <
int kBits,
size_t N>
1859 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1861template <
int kBits,
size_t N>
1863 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1866#if HWY_TARGET <= HWY_AVX3_DL
1869template <
typename T,
size_t N>
1872 return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.
raw, matrix.raw, 0)};
1878template <
int kBits,
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
1880 const DFromV<
decltype(v)> d8;
1882 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<
MakeWide<T>>{v.raw}).raw};
1885 : (shifted &
Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1892template <
int kBits,
size_t N>
1894 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
1896template <
int kBits,
size_t N>
1898 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
1900template <
int kBits,
size_t N>
1902 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
1905template <
int kBits,
size_t N>
1907 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
1909template <
int kBits,
size_t N>
1911 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
1914#if HWY_TARGET > HWY_AVX3_DL
1916template <
int kBits,
size_t N>
1918 const DFromV<
decltype(v)> d8;
1920 const Vec128<uint8_t, N> shifted{
1921 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
1922 return shifted &
Set(d8, 0xFF >> kBits);
1925template <
int kBits,
size_t N>
1927 const DFromV<
decltype(v)> di;
1929 const auto shifted =
BitCast(di, ShiftRight<kBits>(
BitCast(du, v)));
1930 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
1931 return (shifted ^ shifted_sign) - shifted_sign;
1946#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1947#if defined(__clang_analyzer__) || \
1948 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1949#define HWY_SAFE_PARTIAL_LOAD_STORE 1
1951#define HWY_SAFE_PARTIAL_LOAD_STORE 0
1957template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
1959 return VFromD<D>{_mm_load_si128(
reinterpret_cast<const __m128i*
>(aligned))};
1962template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
1964 return Vec128<float16_t>{_mm_load_ph(aligned)};
1968template <
class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
1973template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
1977template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
1982template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
1984 return VFromD<D>{_mm_loadu_si128(
reinterpret_cast<const __m128i*
>(
p))};
1987template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
1989 return Vec128<float16_t>{_mm_loadu_ph(
p)};
1993template <
class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
1998template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
2000 return Vec128<float>{_mm_loadu_ps(
p)};
2002template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
2007template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
2010#if HWY_SAFE_PARTIAL_LOAD_STORE
2011 __m128i v = _mm_setzero_si128();
2012 CopyBytes<8>(
p, &v);
2014 const __m128i v = _mm_loadl_epi64(
reinterpret_cast<const __m128i*
>(
p));
2019template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
2021#if HWY_SAFE_PARTIAL_LOAD_STORE
2022 __m128 v = _mm_setzero_ps();
2023 CopyBytes<8>(
p, &v);
2026 const __m128 hi = _mm_setzero_ps();
2027 return Vec64<float>{_mm_loadl_pi(hi,
reinterpret_cast<const __m64*
>(
p))};
2031template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
2033#if HWY_SAFE_PARTIAL_LOAD_STORE
2034 __m128d v = _mm_setzero_pd();
2035 CopyBytes<8>(
p, &v);
2042template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
2044#if HWY_SAFE_PARTIAL_LOAD_STORE
2045 __m128 v = _mm_setzero_ps();
2046 CopyBytes<4>(
p, &v);
2054template <
class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
2061#if HWY_SAFE_PARTIAL_LOAD_STORE
2067 const __m128i v = _mm_cvtsi32_si128(bits);
2073template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2079template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
2086template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2088 _mm_store_si128(
reinterpret_cast<__m128i*
>(aligned), v.raw);
2091template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
2093 _mm_store_ph(aligned, v.raw);
2097template <
class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
2100 Store(
BitCast(du, v), du,
reinterpret_cast<uint16_t*
>(aligned));
2102template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
2104 _mm_store_ps(aligned, v.
raw);
2106template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
2109 _mm_store_pd(aligned, v.
raw);
2112template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2114 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(
p), v.raw);
2117template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
2119 _mm_storeu_ph(
p, v.raw);
2123template <
class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
2128template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
2130 _mm_storeu_ps(
p, v.raw);
2132template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
2134 _mm_storeu_pd(
p, v.
raw);
2137template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
2139#if HWY_SAFE_PARTIAL_LOAD_STORE
2141 CopyBytes<8>(&v,
p);
2144 _mm_storel_epi64(
reinterpret_cast<__m128i*
>(
p),
BitCast(du, v).raw);
2147template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
2149#if HWY_SAFE_PARTIAL_LOAD_STORE
2150 CopyBytes<8>(&v,
p);
2152 _mm_storel_pi(
reinterpret_cast<__m64*
>(
p), v.
raw);
2155template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
2157#if HWY_SAFE_PARTIAL_LOAD_STORE
2158 CopyBytes<8>(&v,
p);
2160 _mm_storel_pd(
p, v.
raw);
2165template <
class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
2169template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
2171#if HWY_SAFE_PARTIAL_LOAD_STORE
2172 CopyBytes<4>(&v,
p);
2174 _mm_store_ss(
p, v.
raw);
2179template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2187template <
typename T,
size_t N,
typename TI,
size_t NI>
2189 const Vec128<TI, NI> from) {
2190 const DFromV<
decltype(from)>
d;
2193 const DFromV<
decltype(bytes)> d_bytes;
2194 const Repartition<uint8_t,
decltype(d_bytes)> du8_bytes;
2195#if HWY_TARGET == HWY_SSE2
2196#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
2197 typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
2203 __builtin_shuffle(
reinterpret_cast<GccU8RawVectType
>(bytes.raw),
2204 reinterpret_cast<GccU8RawVectType
>(from.raw)))};
2206 const Full128<uint8_t> du8_full;
2208 alignas(16) uint8_t result_bytes[16];
2209 alignas(16) uint8_t u8_bytes[16];
2210 alignas(16) uint8_t from_bytes[16];
2212 Store(Vec128<uint8_t>{
BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes);
2213 Store(Vec128<uint8_t>{
BitCast(du8, from).raw}, du8_full, from_bytes);
2215 for (
int i = 0; i < 16; i++) {
2216 result_bytes[i] = u8_bytes[from_bytes[i] & 15];
2223 d,
VFromD<
decltype(du8)>{_mm_shuffle_epi8(
BitCast(du8_bytes, bytes).raw,
2230template <
class V,
class VI>
2232#if HWY_TARGET == HWY_SSE2
2233 const DFromV<
decltype(from)>
d;
2236 const auto di8_from =
BitCast(di8, from);
2252template <
typename T,
size_t N>
2254 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2255 static_assert(N == 2 || N == 4,
"Does not make sense for N=1");
2256 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
2260 static_assert(N == 2 || N == 4,
"Does not make sense for N=1");
2269template <
typename T, HWY_IF_T_SIZE(T, 1)>
2272 const Twice<
decltype(
d)> d2;
2273 const auto ba =
Combine(d2, b, a);
2274#if HWY_TARGET == HWY_SSE2
2275 Vec32<uint16_t> ba_shuffled{
2276 _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2277 return BitCast(
d,
Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled)));
2280 const auto shuffle_idx =
2281 BitCast(d2,
Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
2286template <
typename T, HWY_IF_T_SIZE(T, 2)>
2289 const Twice<
decltype(
d)> d2;
2290 const auto ba =
Combine(d2, b, a);
2291#if HWY_TARGET == HWY_SSE2
2292 Vec64<uint32_t> ba_shuffled{
2293 _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2295 _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))};
2298 const auto shuffle_idx =
BitCast(
2304template <
typename T, HWY_IF_T_SIZE(T, 4)>
2308 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
2313template <
typename T, HWY_IF_T_SIZE(T, 1)>
2316#if HWY_TARGET == HWY_SSE2
2317 const auto zero =
Zero(
d);
2318 const Rebind<int16_t,
decltype(
d)> di16;
2319 const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
2320 _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
2321 const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
2322 _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
2323 const auto ba_shuffled =
Combine(di16, b_shuffled, a_shuffled);
2324 return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
2326 const Twice<
decltype(
d)> d2;
2327 const auto ba =
Combine(d2, b, a);
2329 const auto shuffle_idx =
2330 BitCast(d2,
Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0,
2335template <
typename T, HWY_IF_T_SIZE(T, 2)>
2338#if HWY_TARGET == HWY_SSE2
2339 const Vec32<T> a_shuffled{
2340 _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2341 const Vec32<T> b_shuffled{
2342 _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))};
2343 return Combine(
d, b_shuffled, a_shuffled);
2345 const Twice<
decltype(
d)> d2;
2346 const auto ba =
Combine(d2, b, a);
2348 const auto shuffle_idx =
BitCast(
2354template <
typename T, HWY_IF_T_SIZE(T, 4)>
2358 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
2363template <
typename T, HWY_IF_T_SIZE(T, 1)>
2366#if HWY_TARGET == HWY_SSE2
2367 const auto zero =
Zero(
d);
2368 const Rebind<int16_t,
decltype(
d)> di16;
2369 const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
2370 _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
2371 const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
2372 _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
2373 const auto ba_shuffled =
Combine(di16, b_shuffled, a_shuffled);
2374 return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
2376 const Twice<
decltype(
d)> d2;
2377 const auto ba =
Combine(d2, b, a);
2379 const auto shuffle_idx =
2380 BitCast(d2,
Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0,
2385template <
typename T, HWY_IF_T_SIZE(T, 2)>
2388#if HWY_TARGET == HWY_SSE2
2389 const Vec32<T> a_shuffled{
2390 _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))};
2391 const Vec32<T> b_shuffled{
2392 _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2393 return Combine(
d, b_shuffled, a_shuffled);
2395 const Twice<
decltype(
d)> d2;
2396 const auto ba =
Combine(d2, b, a);
2398 const auto shuffle_idx =
BitCast(
2404template <
typename T, HWY_IF_T_SIZE(T, 4)>
2408 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
2469#if HWY_TARGET <= HWY_AVX3
2477template <
typename T,
size_t N>
2482template <
typename T,
size_t N>
2487template <
typename T,
size_t N>
2492template <
typename T,
size_t N>
2500template <
typename T,
size_t N>
2501HWY_API Mask128<T, N>
TestBit(
const Vec128<T, N> v,
const Vec128<T, N> bit) {
2502 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
2508template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
2513template <
typename T,
size_t N, HWY_IF_UI16(T)>
2515 return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
2518template <
typename T,
size_t N, HWY_IF_UI32(T)>
2520 return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
2523template <
typename T,
size_t N, HWY_IF_UI64(T)>
2525 return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
2531 Vec128<float16_t, N> b) {
2535 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2541 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2546 Vec128<double, N> b) {
2547 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2552template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
2557template <
typename T,
size_t N, HWY_IF_UI16(T)>
2559 return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
2562template <
typename T,
size_t N, HWY_IF_UI32(T)>
2564 return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
2567template <
typename T,
size_t N, HWY_IF_UI64(T)>
2569 return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
2575 Vec128<float16_t, N> b) {
2579 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2585 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2590 Vec128<double, N> b) {
2591 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2598HWY_API Mask128<int8_t, N>
operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
2599 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
2603 Vec128<int16_t, N> b) {
2604 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
2608 Vec128<int32_t, N> b) {
2609 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
2613 Vec128<int64_t, N> b) {
2614 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
2619 Vec128<uint8_t, N> b) {
2620 return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
2624 Vec128<uint16_t, N> b) {
2625 return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
2629 Vec128<uint32_t, N> b) {
2630 return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
2634 Vec128<uint64_t, N> b) {
2635 return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
2641 Vec128<float16_t, N> b) {
2645 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
2650HWY_API Mask128<float, N>
operator>(Vec128<float, N> a, Vec128<float, N> b) {
2651 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
2654HWY_API Mask128<double, N>
operator>(Vec128<double, N> a, Vec128<double, N> b) {
2655 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
2663 Vec128<float16_t, N> b) {
2667 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
2673 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
2677 Vec128<double, N> b) {
2678 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
2683 Vec128<int8_t, N> b) {
2684 return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)};
2688 Vec128<int16_t, N> b) {
2689 return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)};
2693 Vec128<int32_t, N> b) {
2694 return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)};
2698 Vec128<int64_t, N> b) {
2699 return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)};
2704 Vec128<uint8_t, N> b) {
2705 return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)};
2709 Vec128<uint16_t, N> b) {
2710 return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)};
2714 Vec128<uint32_t, N> b) {
2715 return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)};
2719 Vec128<uint64_t, N> b) {
2720 return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)};
2727template <
class DTo,
typename TFrom,
size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)>
2729 static_assert(
sizeof(TFrom) ==
sizeof(TFromD<DTo>),
"Must have same size");
2730 const Simd<TFrom, NFrom, 0>
d;
2734template <
typename T,
size_t N>
2736 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
2737 return (v & bit) == bit;
2745 Vec128<uint8_t, N> b) {
2746 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
2750 Vec128<uint16_t, N> b) {
2751 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
2755 Vec128<uint32_t, N> b) {
2756 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
2760 const Vec128<uint64_t, N> b) {
2761#if HWY_TARGET >= HWY_SSSE3
2762 const DFromV<
decltype(a)> d64;
2768 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
2775 Vec128<int8_t, N> b) {
2776 return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
2780 Vec128<int16_t, N> b) {
2781 return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
2785 Vec128<int32_t, N> b) {
2786 return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
2790 const Vec128<int64_t, N> b) {
2800 return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
2804 Vec128<double, N> b) {
2805 return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
2815 Vec128<uint8_t, N> b) {
2820 Vec128<uint16_t, N> b) {
2825 Vec128<uint32_t, N> b) {
2830 Vec128<uint64_t, N> b) {
2835 Vec128<int8_t, N> b) {
2840 Vec128<int16_t, N> b) {
2845 Vec128<int32_t, N> b) {
2850 Vec128<int64_t, N> b) {
2856 return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
2860 Vec128<double, N> b) {
2861 return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
2870 Vec128<int8_t, N> b) {
2871 return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
2875 Vec128<int16_t, N> b) {
2876 return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
2880 Vec128<int32_t, N> b) {
2881 return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
2886 const Vec128<int64_t, N> a,
2887 const Vec128<int64_t, N> b) {
2888#if HWY_TARGET >= HWY_SSSE3
2896 const __m128i upper =
OrAnd(m_gt32, m_eq32,
Sub(b, a)).raw;
2898 return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
2900 return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};
2904template <
typename T,
size_t N>
2907 const DFromV<
decltype(a)> du;
2909 const Vec128<T, N> msb =
Set(du, (LimitsMax<T>() >> 1) + 1);
2917 Vec128<float, N> b) {
2918 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
2922 Vec128<double, N> b) {
2923 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
2928template <
typename T,
size_t N>
2930 return detail::Gt(hwy::TypeTag<T>(), a, b);
2936template <
typename T,
size_t N>
2939 return Not(
Gt(tag, b, a));
2942template <
typename T,
size_t N>
2945 return Not(
Gt(tag, b, a));
2950 Vec128<float, N> b) {
2951 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
2955 Vec128<double, N> b) {
2956 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
2961template <
typename T,
size_t N>
2963 return detail::Ge(hwy::TypeTag<T>(), a, b);
2970template <
typename T,
size_t N>
2975template <
typename T,
size_t N>
2984template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
2987 static_cast<char>(15),
static_cast<char>(14),
static_cast<char>(13),
2988 static_cast<char>(12),
static_cast<char>(11),
static_cast<char>(10),
2989 static_cast<char>(9),
static_cast<char>(8),
static_cast<char>(7),
2990 static_cast<char>(6),
static_cast<char>(5),
static_cast<char>(4),
2991 static_cast<char>(3),
static_cast<char>(2),
static_cast<char>(1),
2992 static_cast<char>(0))};
2995template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
2997 return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4},
2998 int16_t{3}, int16_t{2}, int16_t{1},
3003template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
3005 return VFromD<D>{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5},
3006 float16_t{4}, float16_t{3}, float16_t{2},
3007 float16_t{1}, float16_t{0})};
3011template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3014 _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})};
3017template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3019 return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})};
3022template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3024 return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)};
3027template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3032#if HWY_COMPILER_MSVC
3033template <
class V, HWY_IF_V_SIZE_V(V, 1)>
3035 const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)};
3036 return v & mask_out_mask;
3038template <
class V, HWY_IF_V_SIZE_V(V, 2)>
3040#if HWY_TARGET <= HWY_SSE4
3041 return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)};
3043 const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)};
3044 return v & mask_out_mask;
3047template <
class V, HWY_IF_V_SIZE_V(V, 4)>
3051 using VF =
VFromD<
decltype(df)>;
3052 return BitCast(
d, VF{_mm_move_ss(_mm_setzero_ps(),
BitCast(df, v).raw)});
3054template <
class V, HWY_IF_V_SIZE_V(V, 8)>
3058 using VU =
VFromD<
decltype(du)>;
3061template <
class V, HWY_IF_V_SIZE_GT_V(V, 8)>
3069template <
class D,
typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
3071 const auto result_iota =
3073#if HWY_COMPILER_MSVC
3074 return detail::MaskOutVec128Iota(result_iota);
3082template <
class D,
class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
3088#if HWY_TARGET <= HWY_AVX3
3090 const uint64_t all = (1ull << kN) - 1;
3091 return M::FromBits(_bzhi_u64(all, num));
3093 const uint32_t all =
static_cast<uint32_t
>((1ull << kN) - 1);
3094 return M::FromBits(_bzhi_u32(all,
static_cast<uint32_t
>(num)));
3098 using TI =
TFromD<
decltype(di)>;
3109template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
3111 return Vec128<T, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3113template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
3117 using VU =
VFromD<
decltype(du)>;
3121template <
typename T,
size_t N, HWY_IF_UI32(T)>
3123 return Vec128<T, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3125template <
typename T,
size_t N, HWY_IF_UI64(T)>
3127 return Vec128<T, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3132 Vec128<float, N> b) {
3133 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3137 Vec128<double, N> b) {
3138 return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
3151#if HWY_TARGET <= HWY_AVX3
3153template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3159template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3166template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3169 return VFromD<D>{_mm_maskz_loadu_epi32(
m.raw,
p)};
3172template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3175 return VFromD<D>{_mm_maskz_loadu_epi64(
m.raw,
p)};
3178template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3184template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3190template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3193 return VFromD<D>{_mm_mask_loadu_epi8(v.raw,
m.raw,
p)};
3196template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3201 _mm_mask_loadu_epi16(
BitCast(du, v).raw,
m.raw,
p)});
3204template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3207 return VFromD<D>{_mm_mask_loadu_epi32(v.raw,
m.raw,
p)};
3210template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3213 return VFromD<D>{_mm_mask_loadu_epi64(v.raw,
m.raw,
p)};
3216template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3219 return VFromD<D>{_mm_mask_loadu_ps(v.raw,
m.raw,
p)};
3222template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3225 return VFromD<D>{_mm_mask_loadu_pd(v.raw,
m.raw,
p)};
3228#elif HWY_TARGET == HWY_AVX2
3230template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3233 auto p_p =
reinterpret_cast<const int*
>(
p);
3234 return VFromD<D>{_mm_maskload_epi32(p_p,
m.raw)};
3237template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3240 auto p_p =
reinterpret_cast<const long long*
>(
p);
3241 return VFromD<D>{_mm_maskload_epi64(p_p,
m.raw)};
3244template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3250template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3267template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3277#if HWY_TARGET > HWY_AVX3
3290#if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
3292#ifdef HWY_NATIVE_LOAD_N
3293#undef HWY_NATIVE_LOAD_N
3295#define HWY_NATIVE_LOAD_N
3301 (1 << 4) | (1 << 8))>
3312 (1 << 4) | (1 << 8))>
3318 FirstN(d_full, num_lanes), d_full,
p));
3321#if HWY_TARGET > HWY_AVX3
3326template <
class D, HWY_IF_V_SIZE_LE_D(D, 2)>
3333template <
class D, HWY_IF_V_SIZE_LE_D(D, 2)>
3340template <
class D, HWY_IF_V_SIZE_GT_D(D, 2)>
3344 using DI32 = Repartition<int32_t, D>;
3355 di32_full,
reinterpret_cast<const int32_t*
>(
p)),
3359template <
class D, HWY_IF_V_SIZE_GT_D(D, 2)>
3364 using DI32 = Repartition<int32_t, D>;
3376 di32_full,
reinterpret_cast<const int32_t*
>(
p)),
3394 return (num_lanes > 0) ?
LoadU(
d,
p) : no;
3398template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
3402 if (num_lanes > 1) {
3405 const FixedTag<TFromD<D>, 1> d1;
3410template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
3414 if (num_lanes > 1) {
3417 if (num_lanes == 0)
return no;
3419 const FixedTag<TFromD<D>, 1> d1;
3424template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
3428 const size_t trailing_n = num_lanes & 3;
3429 if (trailing_n == 0)
return Zero(
d);
3433 if ((trailing_n & 2) != 0) {
3436 CopyBytes<sizeof(int16_t)>(
p + num_lanes - trailing_n, &i16_bits);
3445template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
3449 const size_t trailing_n = num_lanes & 3;
3450 if (trailing_n == 0)
return no;
3454 if ((trailing_n & 2) != 0) {
3457 CopyBytes<sizeof(int16_t)>(
p + num_lanes - trailing_n, &i16_bits);
3466template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
3470 if ((num_lanes & 1) != 0) {
3471 return And(load_mask,
Set(
d,
p[num_lanes - 1]));
3477template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
3481 if ((num_lanes & 1) != 0) {
3491template <
class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3492HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
size_t N) {
3493 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
3496 const VFromD<D> load_mask =
3497 ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
3498 const
size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
3499 const VFromD<D> v_trailing =
3500 detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes);
3502#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
3503 if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
3504 num_lanes < (4 / sizeof(TFromD<D>))) {
3509 return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing);
3513template <
class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3514HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
3516 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
3519 const VFromD<D> load_mask =
3520 ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
3521 const
size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
3522 const VFromD<D> v_trailing =
3523 detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes);
3525#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
3526 if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
3527 num_lanes < (4 / sizeof(TFromD<D>))) {
3532 return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing);
3550 using TI =
TFromD<
decltype(di)>;
3555 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
3563#if HWY_TARGET <= HWY_AVX3
3565template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3568 _mm_mask_storeu_epi8(
p,
m.raw, v.raw);
3570template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3573 const RebindToUnsigned<
decltype(d)> du;
3574 _mm_mask_storeu_epi16(
reinterpret_cast<uint16_t*
>(p), RebindMask(du, m).raw,
3575 BitCast(du, v).raw);
3578template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3581 auto pi =
reinterpret_cast<int*
>(p);
3582 _mm_mask_storeu_epi32(pi, m.raw, v.raw);
3585template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3588 auto pi =
reinterpret_cast<long long*
>(
p);
3589 _mm_mask_storeu_epi64(pi,
m.raw, v.raw);
3592template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3594 _mm_mask_storeu_ps(
p,
m.raw, v.raw);
3597template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3599 _mm_mask_storeu_pd(
p,
m.raw, v.raw);
3602#elif HWY_TARGET == HWY_AVX2
3608 detail::ScalarMaskedStore(v, m, d, p);
3613template <
class D,
class V,
class M, HWY_IF_UI32_D(D)>
3615 auto pi =
reinterpret_cast<int*
>(p);
3616 _mm_maskstore_epi32(pi, m.raw, v.raw);
3619template <
class D,
class V,
class M, HWY_IF_UI64_D(D)>
3621 auto pi =
reinterpret_cast<long long*
>(
p);
3622 _mm_maskstore_epi64(pi,
m.raw, v.raw);
3625template <
class D,
class V,
class M, HWY_IF_F32_D(D)>
3627 _mm_maskstore_ps(p,
m.raw, v.raw);
3630template <
class D,
class V,
class M, HWY_IF_F64_D(D)>
3632 _mm_maskstore_pd(p,
m.raw, v.raw);
3643 if (
d.MaxBytes() < 16) {
3644 const Full128<TFromD<D>> dfull;
3645 const Mask128<TFromD<D>> mfull{
m.raw};
3650 detail::NativeBlendedStore<D>(v,
RebindMask(di, m), p);
3655template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3659 detail::ScalarMaskedStore(v, m, d, p);
3671 const Vec128<uint8_t, N> b) {
3672 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
3676 const Vec128<uint16_t, N> b) {
3677 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
3681 const Vec128<uint32_t, N> b) {
3682 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
3686 const Vec128<uint64_t, N> b) {
3687 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
3693 const Vec128<int8_t, N> b) {
3694 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
3698 const Vec128<int16_t, N> b) {
3699 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
3703 const Vec128<int32_t, N> b) {
3704 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
3708 const Vec128<int64_t, N> b) {
3709 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
3716 const Vec128<float16_t, N> b) {
3717 return Vec128<float16_t, N>{_mm_add_ph(a.raw, b.raw)};
3722 const Vec128<float, N> b) {
3723 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
3727 const Vec128<double, N> b) {
3728 return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
3736 const Vec128<uint8_t, N> b) {
3737 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
3741 Vec128<uint16_t, N> b) {
3742 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
3746 const Vec128<uint32_t, N> b) {
3747 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
3751 const Vec128<uint64_t, N> b) {
3752 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
3758 const Vec128<int8_t, N> b) {
3759 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
3763 const Vec128<int16_t, N> b) {
3764 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
3768 const Vec128<int32_t, N> b) {
3769 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
3773 const Vec128<int64_t, N> b) {
3774 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
3781 const Vec128<float16_t, N> b) {
3782 return Vec128<float16_t, N>{_mm_sub_ph(a.raw, b.raw)};
3787 const Vec128<float, N> b) {
3788 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
3792 const Vec128<double, N> b) {
3793 return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
3798#if HWY_TARGET <= HWY_SSSE3
3800#undef HWY_IF_ADDSUB_V
3801#define HWY_IF_ADDSUB_V(V) \
3802 HWY_IF_V_SIZE_GT_V( \
3803 V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>)))
3805template <
size_t N, HWY_IF_LANES_GT(N, 1)>
3816HWY_API Vec128<uint64_t, N / 8> SumsOf8(
const Vec128<uint8_t, N> v) {
3817 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
3821template <
class V, HWY_IF_I8_D(DFromV<V>)>
3823 const DFromV<
decltype(v)> d;
3837#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3838#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3840#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3846 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.
raw, b.
raw)};
3850template <
class V, HWY_IF_I8_D(DFromV<V>)>
3861 const auto a_adj =
BitCast(du,
Xor(a, i8_msb));
3862 const auto b_adj =
BitCast(du,
Xor(b, i8_msb));
3870#if HWY_TARGET <= HWY_AVX3
3882 return Vec128<uint32_t, (N + 3) / 4>{
3883 _mm_maskz_dbsad_epu8(
static_cast<__mmask8
>(0x55), v.
raw,
Zero(
d).raw, 0)};
3893#if HWY_TARGET <= HWY_SSE4
3894#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3895#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3897#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3900template <
int kAOffset,
int kBOffset,
size_t N>
3903 static_assert(0 <= kAOffset && kAOffset <= 1,
3904 "kAOffset must be between 0 and 1");
3905 static_assert(0 <= kBOffset && kBOffset <= 3,
3906 "kBOffset must be between 0 and 3");
3907 return Vec128<uint16_t, (N + 1) / 2>{
3908 _mm_mpsadbw_epu8(a.
raw, b.
raw, (kAOffset << 2) | kBOffset)};
3912template <
int kAOffset,
int kBOffset,
class V, HWY_IF_I8_D(DFromV<V>)>
3923 const auto a_adj =
BitCast(du,
Xor(a, i8_msb));
3924 const auto b_adj =
BitCast(du,
Xor(b, i8_msb));
3929 return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj));
3935#if HWY_TARGET <= HWY_AVX3
3936#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3937#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3939#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3942template <
int kIdx3,
int kIdx2,
int kIdx1,
int kIdx0,
size_t N>
3945 static_assert(0 <= kIdx0 && kIdx0 <= 3,
"kIdx0 must be between 0 and 3");
3946 static_assert(0 <= kIdx1 && kIdx1 <= 3,
"kIdx1 must be between 0 and 3");
3947 static_assert(0 <= kIdx2 && kIdx2 <= 3,
"kIdx2 must be between 0 and 3");
3948 static_assert(0 <= kIdx3 && kIdx3 <= 3,
"kIdx3 must be between 0 and 3");
3949 return Vec128<uint16_t, (N + 1) / 2>{
3950 _mm_dbsad_epu8(b.
raw, a.
raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
3954template <
int kIdx3,
int kIdx2,
int kIdx1,
int kIdx0,
class V,
3967 const auto a_adj =
BitCast(du,
Xor(a, i8_msb));
3968 const auto b_adj =
BitCast(du,
Xor(b, i8_msb));
3975 dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj));
3985HWY_API Vec128<uint8_t, N> SaturatedAdd(
const Vec128<uint8_t, N> a,
3986 const Vec128<uint8_t, N> b) {
3987 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
3991 const Vec128<uint16_t, N> b) {
3992 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
3998 const Vec128<int8_t, N> b) {
3999 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
4003 const Vec128<int16_t, N> b) {
4004 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
4007#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
4008#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
4009#undef HWY_NATIVE_I32_SATURATED_ADDSUB
4011#define HWY_NATIVE_I32_SATURATED_ADDSUB
4014#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
4015#undef HWY_NATIVE_I64_SATURATED_ADDSUB
4017#define HWY_NATIVE_I64_SATURATED_ADDSUB
4024 const auto sum = a + b;
4027 const auto i32_max =
Set(
d, LimitsMax<int32_t>());
4029 i32_max.raw,
MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
4030 return IfThenElse(overflow_mask, overflow_result, sum);
4037 const auto sum = a + b;
4040 const auto i64_max =
Set(
d, LimitsMax<int64_t>());
4042 i64_max.raw,
MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
4043 return IfThenElse(overflow_mask, overflow_result, sum);
4053HWY_API Vec128<uint8_t, N> SaturatedSub(
const Vec128<uint8_t, N> a,
4054 const Vec128<uint8_t, N> b) {
4055 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
4059 const Vec128<uint16_t, N> b) {
4060 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
4066 const Vec128<int8_t, N> b) {
4067 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
4071 const Vec128<int16_t, N> b) {
4072 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
4075#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
4080 const auto diff = a - b;
4083 const auto i32_max =
Set(
d, LimitsMax<int32_t>());
4085 i32_max.raw,
MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
4086 return IfThenElse(overflow_mask, overflow_result, diff);
4093 const auto diff = a - b;
4096 const auto i64_max =
Set(
d, LimitsMax<int64_t>());
4098 i64_max.raw,
MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
4099 return IfThenElse(overflow_mask, overflow_result, diff);
4109HWY_API Vec128<uint8_t, N> AverageRound(
const Vec128<uint8_t, N> a,
4110 const Vec128<uint8_t, N> b) {
4111 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
4115 const Vec128<uint16_t, N> b) {
4116 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
4123 const Vec128<uint16_t, N> b) {
4124 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
4128 const Vec128<int16_t, N> b) {
4129 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
4135 const Vec128<uint16_t, N> b) {
4136 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
4140 const Vec128<int16_t, N> b) {
4141 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
4157 const DFromV<
decltype(a)> d;
4159 const auto p_even = BitCast(d, MulEven(a, b));
4160 const auto p_odd = BitCast(d, MulOdd(a, b));
4161 return InterleaveOdd(d, p_even, p_odd);
4166template <
class V, HWY_IF_U8_D(DFromV<V>)>
4170 const auto lo8_mask =
Set(dw, uint16_t{0x00FF});
4175template <
class V, HWY_IF_I8_D(DFromV<V>)>
4177 const DFromV<
decltype(a)> d;
4178 const RepartitionToWide<
decltype(d)> dw;
4179 return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) *
4180 ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b)));
4183template <
class V, HWY_IF_UI16_D(DFromV<V>)>
4185 const DFromV<
decltype(a)> d;
4186 const RepartitionToWide<
decltype(d)> dw;
4187 const RepartitionToNarrow<
decltype(dw)> dw_as_d16;
4189 const auto lo = ResizeBitCast(dw, a * b);
4190 const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b)));
4191 return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
4195HWY_API Vec128<uint64_t, (N + 1) / 2>
MulEven(
const Vec128<uint32_t, N> a,
4196 const Vec128<uint32_t, N> b) {
4197 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
4201HWY_API Vec128<int64_t, (N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
4202 const Vec128<int32_t, N> b) {
4203#if HWY_TARGET >= HWY_SSSE3
4204 const DFromV<
decltype(a)> d;
4222 const auto neg_p_hi = ShiftLeft<32>(
4225 return p_lo - neg_p_hi;
4227 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
4231template <
class V, HWY_IF_T_SIZE_V(V, 1)>
4239template <
class V, HWY_IF_UI16_D(DFromV<V>)>
4241 const DFromV<
decltype(a)> d;
4242 const RepartitionToWide<
decltype(d)> dw;
4243 const RebindToUnsigned<
decltype(dw)> dw_u;
4244 const RepartitionToNarrow<
decltype(dw)> dw_as_d16;
4246 const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b)));
4247 const auto hi = ResizeBitCast(dw, MulHigh(a, b));
4248 return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
4251template <
class V, HWY_IF_UI32_D(DFromV<V>)>
4253 return MulEven(DupOdd(a), DupOdd(b));
4258 const Vec128<uint32_t, N> b) {
4259#if HWY_TARGET >= HWY_SSSE3
4263 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
4264 const auto mullo_x2x0 =
MulEven(a, b);
4265 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
4266 const auto mullo_x3x1 =
4267 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
4270 const __m128i mul_20 =
4271 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
4272 const __m128i mul_31 =
4273 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
4274 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
4276 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
4282 const Vec128<int32_t, N> b) {
4284 const DFromV<
decltype(a)> d;
4294#if HWY_TARGET > HWY_AVX3_DL
4295template <
int kBits,
size_t N>
4297 static_assert(0 <= kBits && kBits < 8,
"Invalid shift count");
4298 if (kBits == 0)
return v;
4300 return Or(ShiftRight<kBits>(v), ShiftLeft<
HWY_MIN(7, 8 - kBits)>(v));
4304template <
int kBits,
size_t N>
4306 static_assert(0 <= kBits && kBits < 16,
"Invalid shift count");
4307 if (kBits == 0)
return v;
4312template <
int kBits,
size_t N>
4314 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
4315#if HWY_TARGET <= HWY_AVX3
4318 if (kBits == 0)
return v;
4323template <
int kBits,
size_t N>
4325 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
4326#if HWY_TARGET <= HWY_AVX3
4329 if (kBits == 0)
return v;
4335template <
int kBits,
class V, HWY_IF_SIGNED_V(V)>
4337 const DFromV<
decltype(v)> d;
4338 const RebindToUnsigned<
decltype(d)> du;
4339 return BitCast(d, RotateRight<kBits>(BitCast(du, v)));
4343#if HWY_TARGET <= HWY_AVX3
4345#ifdef HWY_NATIVE_ROL_ROR_32_64
4346#undef HWY_NATIVE_ROL_ROR_32_64
4348#define HWY_NATIVE_ROL_ROR_32_64
4351template <
class T,
size_t N, HWY_IF_UI32(T)>
4352HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
4353 return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)};
4356template <
class T,
size_t N, HWY_IF_UI32(T)>
4357HWY_API Vec128<T, N>
Ror(Vec128<T, N> a, Vec128<T, N> b) {
4358 return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)};
4361template <
class T,
size_t N, HWY_IF_UI64(T)>
4362HWY_API Vec128<T, N>
Rol(Vec128<T, N> a, Vec128<T, N> b) {
4363 return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)};
4366template <
class T,
size_t N, HWY_IF_UI64(T)>
4367HWY_API Vec128<T, N>
Ror(Vec128<T, N> a, Vec128<T, N> b) {
4368 return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)};
4375#if HWY_TARGET <= HWY_AVX3
4377#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
4378#undef HWY_NATIVE_ROL_ROR_SAME_32_64
4380#define HWY_NATIVE_ROL_ROR_SAME_32_64
4388 return Rol(v,
Set(
d,
static_cast<TFromV<V>>(
static_cast<unsigned>(bits))));
4395 return Ror(v,
Set(
d,
static_cast<TFromV<V>>(
static_cast<unsigned>(bits))));
4402HWY_API Vec128<int8_t, N> BroadcastSignBit(
const Vec128<int8_t, N> v) {
4403 const DFromV<
decltype(v)> d;
4404 return VecFromMask(v < Zero(d));
4409 return ShiftRight<15>(v);
4414 return ShiftRight<31>(v);
4420#if HWY_TARGET <= HWY_AVX3
4423#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
4429 const auto sign = ShiftRight<31>(
BitCast(d32, v));
4431 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4439HWY_API Vec128<int8_t, N> Abs(
const Vec128<int8_t, N> v) {
4440#if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2
4441 const DFromV<
decltype(v)> d;
4442 const RebindToUnsigned<
decltype(d)> du;
4443 const auto zero = Zero(du);
4444 const auto v_as_u8 = BitCast(du, v);
4445 return BitCast(d, Min(v_as_u8, zero - v_as_u8));
4447 return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
4452HWY_API Vec128<int16_t, N>
Abs(
const Vec128<int16_t, N> v) {
4453#if HWY_TARGET == HWY_SSE2
4454 const auto zero =
Zero(DFromV<
decltype(v)>());
4455 return Max(v, zero - v);
4457 return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
4462HWY_API Vec128<int32_t, N>
Abs(
const Vec128<int32_t, N> v) {
4463#if HWY_TARGET <= HWY_SSSE3
4464 return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
4466 const auto zero =
Zero(DFromV<
decltype(v)>());
4471#if HWY_TARGET <= HWY_AVX3
4473HWY_API Vec128<int64_t, N>
Abs(
const Vec128<int64_t, N> v) {
4474 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
4478template <
class V, HWY_IF_I64(TFromV<V>)>
4480 const auto zero =
Zero(DFromV<
decltype(v)>());
4485#ifdef HWY_NATIVE_SATURATED_ABS
4486#undef HWY_NATIVE_SATURATED_ABS
4488#define HWY_NATIVE_SATURATED_ABS
4492template <
class V, HWY_IF_I8(TFromV<V>)>
4494 const DFromV<
decltype(v)> d;
4495 const RebindToUnsigned<
decltype(d)> du;
4496 return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
4500template <
class V, HWY_IF_I16(TFromV<V>)>
4506template <
class V, HWY_IF_I32(TFromV<V>)>
4508 const auto abs_v =
Abs(v);
4510#if HWY_TARGET <= HWY_SSE4
4511 const DFromV<
decltype(v)> d;
4514 Set(du,
static_cast<uint32_t
>(LimitsMax<int32_t>()))));
4521template <
class V, HWY_IF_I64(TFromV<V>)>
4523 const auto abs_v =
Abs(v);
4530#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
4531 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400)
4538template <
int kBits,
size_t N>
4540#if HWY_TARGET <= HWY_AVX3
4544 const DFromV<
decltype(v)> di;
4546 const auto right =
BitCast(di, ShiftRight<kBits>(
BitCast(du, v)));
4548 return right | sign;
4558#if HWY_TARGET <= HWY_SSE4
4569template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
4572 static_assert(IsSigned<T>(),
"Only works for signed/float");
4576#if HWY_TARGET <= HWY_AVX3
4587template <
typename T,
size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
4588HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
4590 static_assert(IsSigned<T>(), "Only works for
signed/
float");
4591 const DFromV<decltype(v)> d;
4593#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
4596 const RebindToFloat<decltype(d)> df;
4597 const auto mask = MaskFromVec(BitCast(df, v));
4598 return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no)));
4601#if HWY_TARGET <= HWY_AVX3
4605 const auto mask = MaskFromVec(v);
4607 const RebindToSigned<decltype(d)> di;
4608 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
4611 return IfThenElse(mask, yes, no);
4615#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
4617#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
4618#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
4620#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
4623#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
4624#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
4626#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
4631template <
class V, HWY_IF_NOT_UNSIGNED_V(V),
4632 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
4633HWY_API V IfNegativeThenElseZero(V v, V yes) {
4634 const DFromV<decltype(v)> d;
4635 return IfNegativeThenElse(v, yes, Zero(d));
4638template <
class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
4639HWY_API V IfNegativeThenElseZero(V v, V yes) {
4640 return IfThenElseZero(IsNegative(v), yes);
4645HWY_API V IfNegativeThenZeroElse(V v, V no) {
4646 const DFromV<
decltype(v)> d;
4647 return IfNegativeThenElse(v, Zero(d), no);
4650template <
class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
4659#if HWY_TARGET <= HWY_SSSE3
4661#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4662#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4664#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4686template <
class V, HWY_IF_I64_D(DFromV<V>)>
4687HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
4688#if HWY_TARGET <= HWY_AVX3
4690 const DFromV<
decltype(v)> d;
4691 return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
4694 return IfNegativeThenElse(mask, Neg(v), v);
4703HWY_API Vec128<uint16_t, N> ShiftLeftSame(
const Vec128<uint16_t, N> v,
4706 if (__builtin_constant_p(bits)) {
4707 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)};
4710 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4716 if (__builtin_constant_p(bits)) {
4717 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)};
4720 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4726 if (__builtin_constant_p(bits)) {
4727 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)};
4730 return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4737 if (__builtin_constant_p(bits)) {
4738 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)};
4741 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4748 if (__builtin_constant_p(bits)) {
4749 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)};
4752 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4759 if (__builtin_constant_p(bits)) {
4760 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)};
4763 return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4766template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
4768 const DFromV<
decltype(v)> d8;
4770 const Vec128<T, N> shifted{
4772 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
4781 if (__builtin_constant_p(bits)) {
4782 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)};
4785 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4791 if (__builtin_constant_p(bits)) {
4792 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)};
4795 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4801 if (__builtin_constant_p(bits)) {
4802 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)};
4805 return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4811 const DFromV<
decltype(v)> d8;
4813 const Vec128<uint8_t, N> shifted{
4815 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >> bits));
4822 if (__builtin_constant_p(bits)) {
4823 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)};
4826 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4833 if (__builtin_constant_p(bits)) {
4834 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)};
4837 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4842#if HWY_TARGET <= HWY_AVX3
4844 if (__builtin_constant_p(bits)) {
4845 return Vec128<int64_t, N>{
4846 _mm_srai_epi64(v.raw,
static_cast<Shift64Count>(bits))};
4849 return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4851 const DFromV<
decltype(v)> di;
4855 return right | sign;
4861 const DFromV<
decltype(v)> di;
4864 const auto shifted_sign =
4865 BitCast(di,
Set(du,
static_cast<uint8_t
>(0x80 >> bits)));
4866 return (shifted ^ shifted_sign) - shifted_sign;
4874 Vec128<float16_t, N> b) {
4875 return Vec128<float16_t, N>{_mm_mul_ph(a.raw, b.raw)};
4879HWY_API Vec128<float, N>
operator*(Vec128<float, N> a, Vec128<float, N> b) {
4880 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
4887HWY_API Vec128<double, N> operator*(
const Vec128<double, N> a,
4888 const Vec128<double, N> b) {
4889 return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
4897HWY_API Vec128<float16_t, N> operator/(
const Vec128<float16_t, N> a,
4898 const Vec128<float16_t, N> b) {
4899 return Vec128<float16_t, N>{_mm_div_ph(a.raw, b.raw)};
4904 const Vec128<float, N> b) {
4905 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
4912HWY_API Vec128<double, N> operator/(
const Vec128<double, N> a,
4913 const Vec128<double, N> b) {
4914 return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
4923HWY_API Vec128<float16_t, N> ApproximateReciprocal(
4924 const Vec128<float16_t, N> v) {
4925 return Vec128<float16_t, N>{_mm_rcp_ph(v.raw)};
4930 return Vec128<float, N>{_mm_rcp_ps(v.raw)};
4936#if HWY_TARGET <= HWY_AVX3
4937#ifdef HWY_NATIVE_F64_APPROX_RECIP
4938#undef HWY_NATIVE_F64_APPROX_RECIP
4940#define HWY_NATIVE_F64_APPROX_RECIP
4952template <
class V, HWY_IF_FLOAT_V(V)>
4959#if HWY_TARGET <= HWY_AVX3
4961#ifdef HWY_NATIVE_MASKED_ARITH
4962#undef HWY_NATIVE_MASKED_ARITH
4964#define HWY_NATIVE_MASKED_ARITH
4967template <
typename T,
size_t N, HWY_IF_U8(T)>
4972template <
typename T,
size_t N, HWY_IF_I8(T)>
4973HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4974 Vec128<T, N> a, Vec128<T, N> b) {
4975 return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
4978template <
typename T,
size_t N, HWY_IF_U16(T)>
4980 Vec128<T, N> a, Vec128<T, N> b) {
4981 return Vec128<T, N>{_mm_mask_min_epu16(no.raw,
m.raw, a.raw, b.raw)};
4983template <
typename T,
size_t N, HWY_IF_I16(T)>
4985 Vec128<T, N> a, Vec128<T, N> b) {
4986 return Vec128<T, N>{_mm_mask_min_epi16(no.raw,
m.raw, a.raw, b.raw)};
4989template <
typename T,
size_t N, HWY_IF_U32(T)>
4991 Vec128<T, N> a, Vec128<T, N> b) {
4992 return Vec128<T, N>{_mm_mask_min_epu32(no.raw,
m.raw, a.raw, b.raw)};
4994template <
typename T,
size_t N, HWY_IF_I32(T)>
4996 Vec128<T, N> a, Vec128<T, N> b) {
4997 return Vec128<T, N>{_mm_mask_min_epi32(no.raw,
m.raw, a.raw, b.raw)};
5000template <
typename T,
size_t N, HWY_IF_U64(T)>
5002 Vec128<T, N> a, Vec128<T, N> b) {
5003 return Vec128<T, N>{_mm_mask_min_epu64(no.raw,
m.raw, a.raw, b.raw)};
5005template <
typename T,
size_t N, HWY_IF_I64(T)>
5007 Vec128<T, N> a, Vec128<T, N> b) {
5008 return Vec128<T, N>{_mm_mask_min_epi64(no.raw,
m.raw, a.raw, b.raw)};
5011template <
typename T,
size_t N, HWY_IF_F32(T)>
5013 Vec128<T, N> a, Vec128<T, N> b) {
5014 return Vec128<T, N>{_mm_mask_min_ps(no.raw,
m.raw, a.raw, b.raw)};
5017template <
typename T,
size_t N, HWY_IF_F64(T)>
5019 Vec128<T, N> a, Vec128<T, N> b) {
5020 return Vec128<T, N>{_mm_mask_min_pd(no.raw,
m.raw, a.raw, b.raw)};
5024template <
typename T,
size_t N, HWY_IF_F16(T)>
5026 Vec128<T, N> a, Vec128<T, N> b) {
5027 return Vec128<T, N>{_mm_mask_min_ph(no.raw,
m.raw, a.raw, b.raw)};
5033template <
typename T,
size_t N, HWY_IF_U8(T)>
5038template <
typename T,
size_t N, HWY_IF_I8(T)>
5039HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5040 Vec128<T, N> a, Vec128<T, N> b) {
5041 return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
5044template <
typename T,
size_t N, HWY_IF_U16(T)>
5046 Vec128<T, N> a, Vec128<T, N> b) {
5047 return Vec128<T, N>{_mm_mask_max_epu16(no.raw,
m.raw, a.raw, b.raw)};
5049template <
typename T,
size_t N, HWY_IF_I16(T)>
5051 Vec128<T, N> a, Vec128<T, N> b) {
5052 return Vec128<T, N>{_mm_mask_max_epi16(no.raw,
m.raw, a.raw, b.raw)};
5055template <
typename T,
size_t N, HWY_IF_U32(T)>
5057 Vec128<T, N> a, Vec128<T, N> b) {
5058 return Vec128<T, N>{_mm_mask_max_epu32(no.raw,
m.raw, a.raw, b.raw)};
5060template <
typename T,
size_t N, HWY_IF_I32(T)>
5062 Vec128<T, N> a, Vec128<T, N> b) {
5063 return Vec128<T, N>{_mm_mask_max_epi32(no.raw,
m.raw, a.raw, b.raw)};
5066template <
typename T,
size_t N, HWY_IF_U64(T)>
5068 Vec128<T, N> a, Vec128<T, N> b) {
5069 return Vec128<T, N>{_mm_mask_max_epu64(no.raw,
m.raw, a.raw, b.raw)};
5071template <
typename T,
size_t N, HWY_IF_I64(T)>
5073 Vec128<T, N> a, Vec128<T, N> b) {
5074 return Vec128<T, N>{_mm_mask_max_epi64(no.raw,
m.raw, a.raw, b.raw)};
5077template <
typename T,
size_t N, HWY_IF_F32(T)>
5079 Vec128<T, N> a, Vec128<T, N> b) {
5080 return Vec128<T, N>{_mm_mask_max_ps(no.raw,
m.raw, a.raw, b.raw)};
5083template <
typename T,
size_t N, HWY_IF_F64(T)>
5085 Vec128<T, N> a, Vec128<T, N> b) {
5086 return Vec128<T, N>{_mm_mask_max_pd(no.raw,
m.raw, a.raw, b.raw)};
5090template <
typename T,
size_t N, HWY_IF_F16(T)>
5092 Vec128<T, N> a, Vec128<T, N> b) {
5093 return Vec128<T, N>{_mm_mask_max_ph(no.raw,
m.raw, a.raw, b.raw)};
5099template <
typename T,
size_t N, HWY_IF_UI8(T)>
5105template <
typename T,
size_t N, HWY_IF_UI16(T)>
5106HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5107 Vec128<T, N> a, Vec128<T, N> b) {
5108 return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
5111template <
typename T,
size_t N, HWY_IF_UI32(T)>
5113 Vec128<T, N> a, Vec128<T, N> b) {
5114 return Vec128<T, N>{_mm_mask_add_epi32(no.raw,
m.raw, a.raw, b.raw)};
5117template <
typename T,
size_t N, HWY_IF_UI64(T)>
5119 Vec128<T, N> a, Vec128<T, N> b) {
5120 return Vec128<T, N>{_mm_mask_add_epi64(no.raw,
m.raw, a.raw, b.raw)};
5123template <
typename T,
size_t N, HWY_IF_F32(T)>
5125 Vec128<T, N> a, Vec128<T, N> b) {
5126 return Vec128<T, N>{_mm_mask_add_ps(no.raw,
m.raw, a.raw, b.raw)};
5129template <
typename T,
size_t N, HWY_IF_F64(T)>
5131 Vec128<T, N> a, Vec128<T, N> b) {
5132 return Vec128<T, N>{_mm_mask_add_pd(no.raw,
m.raw, a.raw, b.raw)};
5136template <
typename T,
size_t N, HWY_IF_F16(T)>
5138 Vec128<T, N> a, Vec128<T, N> b) {
5139 return Vec128<T, N>{_mm_mask_add_ph(no.raw,
m.raw, a.raw, b.raw)};
5145template <
typename T,
size_t N, HWY_IF_UI8(T)>
5151template <
typename T,
size_t N, HWY_IF_UI16(T)>
5152HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5153 Vec128<T, N> a, Vec128<T, N> b) {
5154 return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
5157template <
typename T,
size_t N, HWY_IF_UI32(T)>
5159 Vec128<T, N> a, Vec128<T, N> b) {
5160 return Vec128<T, N>{_mm_mask_sub_epi32(no.raw,
m.raw, a.raw, b.raw)};
5163template <
typename T,
size_t N, HWY_IF_UI64(T)>
5165 Vec128<T, N> a, Vec128<T, N> b) {
5166 return Vec128<T, N>{_mm_mask_sub_epi64(no.raw,
m.raw, a.raw, b.raw)};
5169template <
typename T,
size_t N, HWY_IF_F32(T)>
5171 Vec128<T, N> a, Vec128<T, N> b) {
5172 return Vec128<T, N>{_mm_mask_sub_ps(no.raw,
m.raw, a.raw, b.raw)};
5175template <
typename T,
size_t N, HWY_IF_F64(T)>
5177 Vec128<T, N> a, Vec128<T, N> b) {
5178 return Vec128<T, N>{_mm_mask_sub_pd(no.raw,
m.raw, a.raw, b.raw)};
5182template <
typename T,
size_t N, HWY_IF_F16(T)>
5184 Vec128<T, N> a, Vec128<T, N> b) {
5185 return Vec128<T, N>{_mm_mask_sub_ph(no.raw,
m.raw, a.raw, b.raw)};
5192template <
class V,
class M>
5212HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no,
5213 Mask128<float16_t, N> m,
5214 Vec128<float16_t, N> a,
5215 Vec128<float16_t, N> b) {
5216 return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
5237HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no,
5238 Mask128<float16_t, N> m,
5239 Vec128<float16_t, N> a,
5240 Vec128<float16_t, N> b) {
5241 return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
5246template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5253template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5260template <
typename T,
size_t N, HWY_IF_I8(T)>
5266template <
typename T,
size_t N, HWY_IF_U8(T)>
5267HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5268 Vec128<T, N> a, Vec128<T, N> b) {
5269 return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
5272template <
typename T,
size_t N, HWY_IF_I16(T)>
5274 Vec128<T, N> a, Vec128<T, N> b) {
5275 return Vec128<T, N>{_mm_mask_adds_epi16(no.raw,
m.raw, a.raw, b.raw)};
5278template <
typename T,
size_t N, HWY_IF_U16(T)>
5280 Vec128<T, N> a, Vec128<T, N> b) {
5281 return Vec128<T, N>{_mm_mask_adds_epu16(no.raw,
m.raw, a.raw, b.raw)};
5286template <
typename T,
size_t N, HWY_IF_I8(T)>
5292template <
typename T,
size_t N, HWY_IF_U8(T)>
5293HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5294 Vec128<T, N> a, Vec128<T, N> b) {
5295 return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
5298template <
typename T,
size_t N, HWY_IF_I16(T)>
5300 Vec128<T, N> a, Vec128<T, N> b) {
5301 return Vec128<T, N>{_mm_mask_subs_epi16(no.raw,
m.raw, a.raw, b.raw)};
5304template <
typename T,
size_t N, HWY_IF_U16(T)>
5306 Vec128<T, N> a, Vec128<T, N> b) {
5307 return Vec128<T, N>{_mm_mask_subs_epu16(no.raw,
m.raw, a.raw, b.raw)};
5316HWY_API Vec128<float16_t, N>
MulAdd(Vec128<float16_t, N> mul,
5317 Vec128<float16_t, N> x,
5318 Vec128<float16_t, N> add) {
5319 return Vec128<float16_t, N>{_mm_fmadd_ph(mul.raw, x.raw, add.raw)};
5324 Vec128<float16_t, N> x,
5325 Vec128<float16_t, N> add) {
5326 return Vec128<float16_t, N>{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)};
5330HWY_API Vec128<float16_t, N>
MulSub(Vec128<float16_t, N> mul,
5331 Vec128<float16_t, N> x,
5332 Vec128<float16_t, N> sub) {
5333 return Vec128<float16_t, N>{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)};
5338 Vec128<float16_t, N> x,
5339 Vec128<float16_t, N> sub) {
5340 return Vec128<float16_t, N>{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)};
5347#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5348 return mul * x + add;
5356#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5357 return mul * x + add;
5367#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5368 return add - mul * x;
5376#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5377 return add - mul * x;
5387#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5388 return mul * x - sub;
5396#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5397 return mul * x - sub;
5407#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5408 return Neg(mul) * x - sub;
5416#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5417 return Neg(mul) * x - sub;
5423#if HWY_TARGET <= HWY_SSSE3
5425#undef HWY_IF_MULADDSUB_V
5426#define HWY_IF_MULADDSUB_V(V) \
5427 HWY_IF_LANES_GT_D(DFromV<V>, 1), \
5428 HWY_IF_T_SIZE_ONE_OF_V( \
5429 V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \
5431 : ((1 << 2) | (1 << 4) | (1 << 8))))
5434template <
size_t N, HWY_IF_LANES_GT(N, 1)>
5435HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
5436 Vec128<float16_t, N> x,
5437 Vec128<float16_t, N> sub_or_add) {
5438 return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
5442template <
size_t N, HWY_IF_LANES_GT(N, 1)>
5445#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5446 return AddSub(mul * x, sub_or_add);
5454#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5455 return AddSub(mul * x, sub_or_add);
5468HWY_API Vec128<float16_t, N> Sqrt(Vec128<float16_t, N> v) {
5469 return Vec128<float16_t, N>{_mm_sqrt_ph(v.raw)};
5473HWY_API Vec128<float, N>
Sqrt(Vec128<float, N> v) {
5474 return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
5480HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) {
5481 return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
5490HWY_API Vec128<float16_t, N> ApproximateReciprocalSqrt(Vec128<float16_t, N> v) {
5491 return Vec128<float16_t, N>{_mm_rsqrt_ph(v.raw)};
5496 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
5502#if HWY_TARGET <= HWY_AVX3
5503#ifdef HWY_NATIVE_F64_APPROX_RSQRT
5504#undef HWY_NATIVE_F64_APPROX_RSQRT
5506#define HWY_NATIVE_F64_APPROX_RSQRT
5513#if HWY_COMPILER_MSVC
5527template <
typename T,
size_t N>
5533 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
5542HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
5543 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
5546HWY_API Vec128<uint16_t, N>
Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
5547#if HWY_TARGET >= HWY_SSSE3
5548 return detail::MinU(a, b);
5550 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
5554HWY_API Vec128<uint32_t, N>
Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
5555#if HWY_TARGET >= HWY_SSSE3
5556 return detail::MinU(a, b);
5558 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
5562HWY_API Vec128<uint64_t, N>
Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
5563#if HWY_TARGET <= HWY_AVX3
5564 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
5566 return detail::MinU(a, b);
5572HWY_API Vec128<int8_t, N>
Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
5573#if HWY_TARGET >= HWY_SSSE3
5576 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
5580HWY_API Vec128<int16_t, N>
Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
5581 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
5584HWY_API Vec128<int32_t, N>
Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
5585#if HWY_TARGET >= HWY_SSSE3
5588 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
5592HWY_API Vec128<int64_t, N>
Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
5593#if HWY_TARGET <= HWY_AVX3
5594 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
5603HWY_API Vec128<float16_t, N>
Min(Vec128<float16_t, N> a,
5604 Vec128<float16_t, N> b) {
5605 return Vec128<float16_t, N>{_mm_min_ph(a.raw, b.raw)};
5609HWY_API Vec128<float, N>
Min(Vec128<float, N> a, Vec128<float, N> b) {
5610 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
5613HWY_API Vec128<double, N>
Min(Vec128<double, N> a, Vec128<double, N> b) {
5614 return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
5620template <
typename T,
size_t N>
5626 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
5635HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
5636 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
5639HWY_API Vec128<uint16_t, N>
Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
5640#if HWY_TARGET >= HWY_SSSE3
5641 return detail::MaxU(a, b);
5643 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
5647HWY_API Vec128<uint32_t, N>
Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
5648#if HWY_TARGET >= HWY_SSSE3
5649 return detail::MaxU(a, b);
5651 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
5655HWY_API Vec128<uint64_t, N>
Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
5656#if HWY_TARGET <= HWY_AVX3
5657 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
5659 return detail::MaxU(a, b);
5665HWY_API Vec128<int8_t, N>
Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
5666#if HWY_TARGET >= HWY_SSSE3
5669 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
5673HWY_API Vec128<int16_t, N>
Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
5674 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
5677HWY_API Vec128<int32_t, N>
Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
5678#if HWY_TARGET >= HWY_SSSE3
5681 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
5685HWY_API Vec128<int64_t, N>
Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
5686#if HWY_TARGET <= HWY_AVX3
5687 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
5696HWY_API Vec128<float16_t, N>
Max(Vec128<float16_t, N> a,
5697 Vec128<float16_t, N> b) {
5698 return Vec128<float16_t, N>{_mm_max_ph(a.raw, b.raw)};
5702HWY_API Vec128<float, N>
Max(Vec128<float, N> a, Vec128<float, N> b) {
5703 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
5706HWY_API Vec128<double, N>
Max(Vec128<double, N> a, Vec128<double, N> b) {
5707 return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
5716template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
5719 _mm_stream_si128(
reinterpret_cast<__m128i*
>(aligned),
BitCast(du, v).raw);
5721template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
5723 _mm_stream_ps(aligned, v.raw);
5725template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
5727 _mm_stream_pd(aligned, v.raw);
5738static_assert(
sizeof(
GatherIndex64) == 8,
"Must be 64-bit type");
5740#if HWY_TARGET <= HWY_AVX3
5742#ifdef HWY_NATIVE_SCATTER
5743#undef HWY_NATIVE_SCATTER
5745#define HWY_NATIVE_SCATTER
5750template <
int kScale,
class D,
class VI, HWY_IF_UI32_D(D)>
5753 if (
d.MaxBytes() == 16) {
5754 _mm_i32scatter_epi32(base, index.raw, v.raw, kScale);
5756 const __mmask8 mask = (1u <<
MaxLanes(
d)) - 1;
5757 _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale);
5761template <
int kScale,
class D,
class VI, HWY_IF_UI64_D(D)>
5764 if (
d.MaxBytes() == 16) {
5765 _mm_i64scatter_epi64(base, index.raw, v.raw, kScale);
5767 const __mmask8 mask = (1u << MaxLanes(d)) - 1;
5768 _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale);
5772template <
int kScale,
class D,
class VI, HWY_IF_F32_D(D)>
5775 if (
d.MaxBytes() == 16) {
5776 _mm_i32scatter_ps(base, index.raw, v.raw, kScale);
5778 const __mmask8 mask = (1u <<
MaxLanes(
d)) - 1;
5779 _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale);
5783template <
int kScale,
class D,
class VI, HWY_IF_F64_D(D)>
5786 if (
d.MaxBytes() == 16) {
5787 _mm_i64scatter_pd(base, index.raw, v.raw, kScale);
5789 const __mmask8 mask = (1u <<
MaxLanes(
d)) - 1;
5790 _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale);
5794template <
int kScale,
class D,
class VI, HWY_IF_UI32_D(D)>
5799 _mm_mask_i32scatter_epi32(base,
m.raw, index.raw, v.raw, kScale);
5802template <
int kScale,
class D,
class VI, HWY_IF_UI64_D(D)>
5807 _mm_mask_i64scatter_epi64(base,
m.raw, index.raw, v.raw, kScale);
5810template <
int kScale,
class D,
class VI, HWY_IF_F32_D(D)>
5815 _mm_mask_i32scatter_ps(base,
m.raw, index.raw, v.raw, kScale);
5818template <
int kScale,
class D,
class VI, HWY_IF_F64_D(D)>
5823 _mm_mask_i64scatter_pd(base,
m.raw, index.raw, v.raw, kScale);
5828template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5831 return detail::NativeScatter128<1>(v,
d, base, offset);
5833template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5836 return detail::NativeScatter128<
sizeof(
TFromD<D>)>(v,
d, base, index);
5838template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5842 return detail::NativeMaskedScatter128<
sizeof(
TFromD<D>)>(v,
m,
d, base,
5850#if HWY_TARGET <= HWY_AVX2
5852#ifdef HWY_NATIVE_GATHER
5853#undef HWY_NATIVE_GATHER
5855#define HWY_NATIVE_GATHER
5860template <
int kScale,
typename T,
size_t N, HWY_IF_UI32(T)>
5864 reinterpret_cast<const int32_t*
>(base),
indices.raw, kScale)};
5867template <
int kScale,
typename T,
size_t N, HWY_IF_UI64(T)>
5874template <
int kScale,
size_t N>
5880template <
int kScale,
size_t N>
5886template <
int kScale,
typename T,
size_t N, HWY_IF_UI32(T)>
5891#if HWY_TARGET <= HWY_AVX3
5893 no.
raw,
m.raw,
indices.raw,
reinterpret_cast<const int32_t*
>(base),
5897 _mm_mask_i32gather_epi32(no.
raw,
reinterpret_cast<const int32_t*
>(base),
5902template <
int kScale,
typename T,
size_t N, HWY_IF_UI64(T)>
5907#if HWY_TARGET <= HWY_AVX3
5918template <
int kScale,
size_t N>
5922#if HWY_TARGET <= HWY_AVX3
5924 _mm_mmask_i32gather_ps(no.
raw,
m.raw,
indices.raw, base, kScale)};
5927 _mm_mask_i32gather_ps(no.
raw, base,
indices.raw,
m.raw, kScale)};
5931template <
int kScale,
size_t N>
5935#if HWY_TARGET <= HWY_AVX3
5937 _mm_mmask_i64gather_pd(no.
raw,
m.raw,
indices.raw, base, kScale)};
5940 _mm_mask_i64gather_pd(no.
raw, base,
indices.raw,
m.raw, kScale)};
5946template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5952 return detail::NativeGather128<1>(base, offsets);
5955template <
class D, HWY_IF_V_SIZE_LE_D(D, 16),
typename T = TFromD<D>>
5961 return detail::NativeGather128<sizeof(T)>(base,
indices);
5964template <
class D, HWY_IF_V_SIZE_LE_D(D, 16),
typename T = TFromD<D>>
5974 return detail::NativeMaskedGatherOr128<sizeof(T)>(no,
m, base,
indices);
5993template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
5997template <
typename T,
size_t N>
5999 return Vec128<T, N / 2>{v.raw};
6004template <
int kBytes,
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6006 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
6009 d,
VFromD<
decltype(du)>{_mm_slli_si128(
BitCast(du, v).raw, kBytes)});
6013template <
int kBytes,
class V>
6014HWY_API V ShiftLeftBytes(
const V v) {
6015 return ShiftLeftBytes<kBytes>(DFromV<
decltype(v)>(), v);
6021template <
int kLanes,
class D>
6023 const Repartition<uint8_t,
decltype(d)> d8;
6024 return BitCast(d, ShiftLeftBytes<kLanes *
sizeof(TFromD<D>)>(BitCast(d8, v)));
6028template <
int kLanes,
class V>
6030 return ShiftLeftLanes<kLanes>(DFromV<
decltype(v)>(), v);
6034template <
int kBytes,
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6036 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
6039 if (
d.MaxBytes() != 16) {
6041 const VFromD<
decltype(dfull)> vfull{v.raw};
6045 d,
VFromD<
decltype(du)>{_mm_srli_si128(
BitCast(du, v).raw, kBytes)});
6050template <
int kLanes,
class D>
6052 const Repartition<uint8_t,
decltype(d)> d8;
6053 constexpr size_t kBytes = kLanes *
sizeof(TFromD<D>);
6054 return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
6060template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
6062 const Twice<RebindToUnsigned<
decltype(d)>> dut;
6063 using VUT =
VFromD<
decltype(dut)>;
6064 const VUT vut = BitCast(dut, v);
6065 return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)}));
6067template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
6071template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
6077template <
class D, HWY_IF_V_SIZE_LE_D(D, 4)>
6079 return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
6086template <
size_t kLane,
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
6087HWY_INLINE T ExtractLane(
const Vec128<T, N> v) {
6088 static_assert(kLane < N,
"Lane index out of bounds");
6089#if HWY_TARGET >= HWY_SSSE3
6090 const int pair = _mm_extract_epi16(v.raw, kLane / 2);
6091 constexpr int kShift = kLane & 1 ? 8 : 0;
6092 return static_cast<T
>((pair >> kShift) & 0xFF);
6094 return static_cast<T
>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
6098template <
size_t kLane,
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
6100 static_assert(kLane < N,
"Lane index out of bounds");
6101 const DFromV<
decltype(v)> d;
6103 const uint16_t lane =
static_cast<uint16_t
>(
6104 _mm_extract_epi16(
BitCast(du, v).raw, kLane) & 0xFFFF);
6105 return BitCastScalar<T>(lane);
6108template <
size_t kLane,
typename T,
size_t N, HWY_IF_UI32(T)>
6110 static_assert(kLane < N,
"Lane index out of bounds");
6111#if HWY_TARGET >= HWY_SSSE3
6112 return static_cast<T
>(_mm_cvtsi128_si32(
6113 (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane)));
6115 return static_cast<T
>(_mm_extract_epi32(v.raw, kLane));
6119template <
size_t kLane,
typename T,
size_t N, HWY_IF_UI64(T)>
6121 static_assert(kLane < N,
"Lane index out of bounds");
6123 alignas(16) T lanes[2];
6124 Store(v, DFromV<
decltype(v)>(), lanes);
6125 return lanes[kLane];
6126#elif HWY_TARGET >= HWY_SSSE3
6127 return static_cast<T
>(
6128 _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE)));
6130 return static_cast<T
>(_mm_extract_epi64(v.raw, kLane));
6134template <
size_t kLane,
size_t N>
6136 static_assert(kLane < N,
"Lane index out of bounds");
6137#if HWY_TARGET >= HWY_SSSE3
6138 return _mm_cvtss_f32((kLane == 0) ? v.raw
6139 : _mm_shuffle_ps(v.raw, v.raw, kLane));
6142 const int32_t bits = _mm_extract_ps(v.raw, kLane);
6143 return BitCastScalar<float>(bits);
6148template <
size_t kLane>
6150 static_assert(kLane == 0,
"Lane index out of bounds");
6154template <
size_t kLane>
6156 static_assert(kLane < 2,
"Lane index out of bounds");
6165template <
typename T>
6166HWY_API T ExtractLane(
const Vec128<T, 1> v,
size_t i) {
6172template <
typename T>
6173HWY_API T ExtractLane(
const Vec128<T, 2> v,
size_t i) {
6174#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6175 if (__builtin_constant_p(i)) {
6178 return detail::ExtractLane<0>(v);
6180 return detail::ExtractLane<1>(v);
6184 alignas(16) T lanes[2];
6185 Store(v, DFromV<
decltype(v)>(), lanes);
6189template <
typename T>
6191#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6192 if (__builtin_constant_p(i)) {
6195 return detail::ExtractLane<0>(v);
6197 return detail::ExtractLane<1>(v);
6199 return detail::ExtractLane<2>(v);
6201 return detail::ExtractLane<3>(v);
6205 alignas(16) T lanes[4];
6206 Store(v, DFromV<
decltype(v)>(), lanes);
6210template <
typename T>
6212#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6213 if (__builtin_constant_p(i)) {
6216 return detail::ExtractLane<0>(v);
6218 return detail::ExtractLane<1>(v);
6220 return detail::ExtractLane<2>(v);
6222 return detail::ExtractLane<3>(v);
6224 return detail::ExtractLane<4>(v);
6226 return detail::ExtractLane<5>(v);
6228 return detail::ExtractLane<6>(v);
6230 return detail::ExtractLane<7>(v);
6234 alignas(16) T lanes[8];
6235 Store(v, DFromV<
decltype(v)>(), lanes);
6239template <
typename T>
6241#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6242 if (__builtin_constant_p(i)) {
6245 return detail::ExtractLane<0>(v);
6247 return detail::ExtractLane<1>(v);
6249 return detail::ExtractLane<2>(v);
6251 return detail::ExtractLane<3>(v);
6253 return detail::ExtractLane<4>(v);
6255 return detail::ExtractLane<5>(v);
6257 return detail::ExtractLane<6>(v);
6259 return detail::ExtractLane<7>(v);
6261 return detail::ExtractLane<8>(v);
6263 return detail::ExtractLane<9>(v);
6265 return detail::ExtractLane<10>(v);
6267 return detail::ExtractLane<11>(v);
6269 return detail::ExtractLane<12>(v);
6271 return detail::ExtractLane<13>(v);
6273 return detail::ExtractLane<14>(v);
6275 return detail::ExtractLane<15>(v);
6279 alignas(16) T lanes[16];
6280 Store(v, DFromV<
decltype(v)>(), lanes);
6292#if HWY_TARGET <= HWY_AVX3
6294 const auto mask =
MFromD<
decltype(
d)>{
static_cast<RawMask
>(uint64_t{1} << i)};
6297 using TU =
TFromD<
decltype(du)>;
6304template <
size_t kLane,
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
6306 static_assert(kLane < N,
"Lane index out of bounds");
6307#if HWY_TARGET >= HWY_SSSE3
6308 return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
6314template <
size_t kLane,
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
6315HWY_INLINE Vec128<T, N> InsertLane(
const Vec128<T, N> v, T t) {
6316 static_assert(kLane < N,
"Lane index out of bounds");
6317 const DFromV<
decltype(v)> d;
6318 const RebindToUnsigned<
decltype(d)> du;
6319 const uint16_t bits = BitCastScalar<uint16_t>(t);
6320 return BitCast(d,
VFromD<
decltype(du)>{
6321 _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)});
6324template <
size_t kLane,
typename T,
size_t N, HWY_IF_UI32(T)>
6326 static_assert(kLane < N,
"Lane index out of bounds");
6327#if HWY_TARGET >= HWY_SSSE3
6330 const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
6331 return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
6335template <
size_t kLane,
typename T,
size_t N, HWY_IF_UI64(T)>
6337 static_assert(kLane < N,
"Lane index out of bounds");
6338#if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32
6339 const DFromV<
decltype(v)> d;
6344 d, Vec128<double, N>{_mm_shuffle_pd(vt.raw,
BitCast(df, v).raw, 2)});
6347 d, Vec128<double, N>{_mm_shuffle_pd(
BitCast(df, v).raw, vt.raw, 0)});
6349 const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
6350 return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
6354template <
size_t kLane,
size_t N>
6356 static_assert(kLane < N,
"Lane index out of bounds");
6357#if HWY_TARGET >= HWY_SSSE3
6360 return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
6365template <
size_t kLane>
6367 static_assert(kLane == 0,
"Lane index out of bounds");
6371template <
size_t kLane>
6373 static_assert(kLane < 2,
"Lane index out of bounds");
6387template <
typename T>
6388HWY_API Vec128<T, 1> InsertLane(
const Vec128<T, 1> v,
size_t i, T t) {
6391 return Set(DFromV<
decltype(v)>(), t);
6394template <
typename T>
6395HWY_API Vec128<T, 2> InsertLane(
const Vec128<T, 2> v,
size_t i, T t) {
6396#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6397 if (__builtin_constant_p(i)) {
6400 return detail::InsertLane<0>(v, t);
6402 return detail::InsertLane<1>(v, t);
6406 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6409template <
typename T>
6411#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6412 if (__builtin_constant_p(i)) {
6415 return detail::InsertLane<0>(v, t);
6417 return detail::InsertLane<1>(v, t);
6419 return detail::InsertLane<2>(v, t);
6421 return detail::InsertLane<3>(v, t);
6425 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6428template <
typename T>
6430#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6431 if (__builtin_constant_p(i)) {
6434 return detail::InsertLane<0>(v, t);
6436 return detail::InsertLane<1>(v, t);
6438 return detail::InsertLane<2>(v, t);
6440 return detail::InsertLane<3>(v, t);
6442 return detail::InsertLane<4>(v, t);
6444 return detail::InsertLane<5>(v, t);
6446 return detail::InsertLane<6>(v, t);
6448 return detail::InsertLane<7>(v, t);
6452 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6455template <
typename T>
6457#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6458 if (__builtin_constant_p(i)) {
6461 return detail::InsertLane<0>(v, t);
6463 return detail::InsertLane<1>(v, t);
6465 return detail::InsertLane<2>(v, t);
6467 return detail::InsertLane<3>(v, t);
6469 return detail::InsertLane<4>(v, t);
6471 return detail::InsertLane<5>(v, t);
6473 return detail::InsertLane<6>(v, t);
6475 return detail::InsertLane<7>(v, t);
6477 return detail::InsertLane<8>(v, t);
6479 return detail::InsertLane<9>(v, t);
6481 return detail::InsertLane<10>(v, t);
6483 return detail::InsertLane<11>(v, t);
6485 return detail::InsertLane<12>(v, t);
6487 return detail::InsertLane<13>(v, t);
6489 return detail::InsertLane<14>(v, t);
6491 return detail::InsertLane<15>(v, t);
6495 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6500#if HWY_TARGET == HWY_SSE2
6501template <
int kBytes,
class D, HWY_IF_V_SIZE_D(D, 16)>
6503 static_assert(0 < kBytes && kBytes < 16,
"kBytes invalid");
6504 return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
6506template <
int kBytes,
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6508 constexpr size_t kSize =
d.MaxBytes();
6509 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
6511 const Twice<
decltype(
d)> dt;
6515template <
int kBytes,
class D, HWY_IF_V_SIZE_D(D, 16)>
6518 return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
6522template <
int kBytes,
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6524 constexpr size_t kSize =
d.MaxBytes();
6525 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
6527 using V8 = Vec128<uint8_t>;
6528 const DFromV<V8> dfull8;
6529 const Repartition<TFromD<D>,
decltype(dfull8)> dfull;
6530 const V8 hi8{
BitCast(d8, hi).raw};
6540template <
int kLane,
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
6542 const DFromV<
decltype(v)> d;
6544 using VU =
VFromD<
decltype(du)>;
6546 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
6548 const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
6549 return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)});
6551 const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
6552 return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)});
6556template <
int kLane,
typename T,
size_t N, HWY_IF_UI32(T)>
6558 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
6559 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
6562template <
int kLane,
typename T,
size_t N, HWY_IF_UI64(T)>
6564 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
6565 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
6568template <
int kLane,
size_t N>
6570 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
6574template <
int kLane,
size_t N>
6576 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
6583template <
typename T,
size_t N = 16 /
sizeof(T)>
6588template <
class D,
typename T = TFromD<D>,
typename TI,
size_t kN,
6589 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)>
6591 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
6592#if HWY_IS_DEBUG_BUILD
6593 const Rebind<TI,
decltype(
d)> di;
6603template <
class D,
typename T = TFromD<D>,
typename TI,
size_t kN,
6604 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)>
6605HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
6606 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
6607#if HWY_IS_DEBUG_BUILD
6608 const Rebind<TI,
decltype(d)> di;
6610 AllTrue(di, Lt(vec, Set(di, kN * 2))));
6613#if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
6615 return Indices128<T, kN>{vec.raw};
6617 const Repartition<uint8_t,
decltype(d)> d8;
6618 using V8 =
VFromD<
decltype(d8)>;
6619 alignas(16)
static constexpr uint8_t kByteOffsets[16] = {
6620 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
6623 alignas(16)
static constexpr uint8_t kBroadcastLaneBytes[16] = {
6624 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
6629 const V8 byte_indices =
BitCast(d8, ShiftLeft<1>(
BitCast(d16, lane_indices)));
6631 return Indices128<T, kN>{
Add(byte_indices,
Load(d8, kByteOffsets)).raw};
6635template <
class D,
typename T = TFromD<D>,
typename TI,
size_t kN,
6636 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)>
6638 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
6639#if HWY_IS_DEBUG_BUILD
6640 const Rebind<TI,
decltype(
d)> di;
6645#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
6647 return Indices128<T, kN>{vec.raw};
6650 using V8 =
VFromD<
decltype(d8)>;
6651 alignas(16)
static constexpr uint8_t kByteOffsets[16] = {
6652 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
6655 alignas(16)
static constexpr uint8_t kBroadcastLaneBytes[16] = {
6656 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
6661 const V8 byte_indices =
BitCast(d8, ShiftLeft<2>(
BitCast(d16, lane_indices)));
6663 return Indices128<T, kN>{
Add(byte_indices,
Load(d8, kByteOffsets)).raw};
6667template <
class D,
typename T = TFromD<D>,
typename TI,
size_t kN,
6668 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)>
6670 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
6671#if HWY_IS_DEBUG_BUILD
6672 const Rebind<TI,
decltype(
d)> di;
6674 AllTrue(di,
Lt(vec,
Set(di,
static_cast<TI
>(kN * 2)))));
6680 return Indices128<T, kN>{vec.raw};
6683template <
class D, HWY_IF_V_SIZE_LE_D(D, 16),
typename TI>
6685 D d,
const TI* idx) {
6686 static_assert(
sizeof(TFromD<D>) ==
sizeof(TI),
"Index size must match lane");
6687 const Rebind<TI,
decltype(
d)> di;
6691template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
6696template <
typename T,
size_t N, HWY_IF_UI16(T)>
6697HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
6698#if HWY_TARGET <= HWY_AVX3
6699 return {_mm_permutexvar_epi16(idx.raw, v.raw)};
6700#elif HWY_TARGET == HWY_SSE2
6701#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6702 typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16)));
6703 return Vec128<T, N>{
reinterpret_cast<typename detail::Raw128<T>::type
>(
6704 __builtin_shuffle(
reinterpret_cast<GccU16RawVectType
>(v.raw),
6705 reinterpret_cast<GccU16RawVectType
>(idx.raw)))};
6707 const Full128<T> d_full;
6708 alignas(16) T src_lanes[8];
6709 alignas(16) uint16_t indices[8];
6710 alignas(16) T result_lanes[8];
6712 Store(Vec128<T>{v.raw}, d_full, src_lanes);
6713 _mm_store_si128(
reinterpret_cast<__m128i*
>(indices), idx.raw);
6715 for (
int i = 0; i < 8; i++) {
6716 result_lanes[i] = src_lanes[
indices[i] & 7u];
6719 return Vec128<T, N>{
Load(d_full, result_lanes).raw};
6727template <
size_t N, HWY_IF_V_SIZE_GT(
float16_t, N, 2)>
6729 Indices128<float16_t, N> idx) {
6730 return {_mm_permutexvar_ph(idx.raw, v.raw)};
6734template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 4)>
6736#if HWY_TARGET <= HWY_AVX2
6737 const DFromV<
decltype(v)> d;
6739 const Vec128<float, N> perm{_mm_permutevar_ps(
BitCast(df, v).raw, idx.raw)};
6741#elif HWY_TARGET == HWY_SSE2
6742#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6743 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
6744 return Vec128<T, N>{
reinterpret_cast<typename detail::Raw128<T>::type
>(
6745 __builtin_shuffle(
reinterpret_cast<GccU32RawVectType
>(v.raw),
6746 reinterpret_cast<GccU32RawVectType
>(idx.raw)))};
6748 const Full128<T> d_full;
6749 alignas(16) T src_lanes[4];
6750 alignas(16) uint32_t indices[4];
6751 alignas(16) T result_lanes[4];
6753 Store(Vec128<T>{v.raw}, d_full, src_lanes);
6754 _mm_store_si128(
reinterpret_cast<__m128i*
>(indices), idx.raw);
6756 for (
int i = 0; i < 4; i++) {
6757 result_lanes[i] = src_lanes[
indices[i] & 3u];
6760 return Vec128<T, N>{
Load(d_full, result_lanes).raw};
6767#if HWY_TARGET <= HWY_SSSE3
6768template <
size_t N, HWY_IF_V_SIZE_GT(
float, N, 4)>
6771#if HWY_TARGET <= HWY_AVX2
6774 const DFromV<
decltype(v)> df;
6783template <
typename T>
6784HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
6785 Indices128<T, 1> ) {
6789template <
typename T, HWY_IF_UI64(T)>
6793#if HWY_TARGET <= HWY_AVX2
6813#if HWY_TARGET <= HWY_AVX2
6831template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6839template <
class D, HWY_IF_LANES_D(D, 1)>
6845template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
6851template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
6857template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
6867 using VU =
VFromD<
decltype(du)>;
6870 if (kN == 1)
return v;
6872 return BitCast(
d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))});
6875 return BitCast(
d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
6878#if HWY_TARGET == HWY_SSE2
6880 _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
6881 _MM_SHUFFLE(0, 1, 2, 3))};
6882 return BitCast(
d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))});
6886 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
6894 constexpr int kN =
static_cast<int>(MaxLanes(d));
6895 if (kN == 1)
return v;
6896#if HWY_TARGET <= HWY_SSSE3
6898 alignas(16)
static constexpr int8_t kReverse[16] = {
6899 kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8,
6900 kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16};
6901 const RebindToSigned<
decltype(d)> di;
6902 const VFromD<
decltype(di)> idx = Load(di, kReverse);
6903 return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)};
6913template <
class D, HWY_IF_LANES_D(D, 1)>
6919template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
6921#if HWY_TARGET <= HWY_AVX3
6924#elif HWY_TARGET == HWY_SSE2
6926 using VU =
VFromD<
decltype(du)>;
6929 __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1));
6931 shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1));
6933 return BitCast(d, VU{shuf_result});
6937 di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C);
6943template <
class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)>
6949template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
6956template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
6959 using VU =
VFromD<
decltype(du)>;
6964 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
6967#if HWY_TARGET == HWY_SSE2
6968 return BitCast(d, VU{_mm_shufflehi_epi16(
6969 _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
6970 _MM_SHUFFLE(0, 1, 2, 3))});
6974 di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
6980template <
class D, HWY_IF_T_SIZE_D(D, 4)>
6985template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
6992template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
6994#if HWY_TARGET == HWY_SSE2
7000 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
7016template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
7018 return VFromD<D>{_mm_unpackhi_epi8(a.raw, b.raw)};
7020template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
7022 const DFromV<
decltype(a)> d;
7023 const RebindToUnsigned<
decltype(d)> du;
7024 using VU =
VFromD<
decltype(du)>;
7026 d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
7028template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
7030 return VFromD<D>{_mm_unpackhi_epi32(a.raw, b.raw)};
7032template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
7034 return VFromD<D>{_mm_unpackhi_epi64(a.raw, b.raw)};
7036template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
7038 return VFromD<D>{_mm_unpackhi_ps(a.raw, b.raw)};
7040template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
7042 return VFromD<D>{_mm_unpackhi_pd(a.raw, b.raw)};
7046template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7048 const Half<
decltype(
d)> d2;
7055template <
int kLane,
class T,
size_t N, HWY_IF_T_SIZE(T, 1)>
7057 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
7058 const DFromV<
decltype(v)> d;
7060#if HWY_TARGET == HWY_SSE2
7061 const Full128<T> d_full;
7062 const Vec128<T> v_full{v.raw};
7063 const auto v_interleaved = (kLane < 8)
7067 d, Broadcast<kLane & 7>(
BitCast(Full128<uint16_t>(), v_interleaved)));
7078template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
7082template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
7087template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
7095#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
7096#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
7098#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
7101template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7105 const uint32_t x0) {
7108 static_cast<int32_t
>(x3),
static_cast<int32_t
>(x2),
7109 static_cast<int32_t
>(x1),
static_cast<int32_t
>(x0))});
7112template <
size_t kIdx3210,
class V>
7119 VFromD<
decltype(du)>{_mm_shufflelo_epi16(
7120 BitCast(du, v).raw,
static_cast<int>(kIdx3210 & 0xFF))});
7123#if HWY_TARGET == HWY_SSE2
7124template <
size_t kIdx3210,
class V>
7130 constexpr int kShuffle =
static_cast<int>(kIdx3210 & 0xFF);
7132 d,
VFromD<
decltype(du)>{_mm_shufflehi_epi16(
7133 _mm_shufflelo_epi16(
BitCast(du, v).raw, kShuffle), kShuffle)});
7136template <
size_t kIdx3210,
size_t kVectSize,
class V,
7137 hwy::EnableIf<(kVectSize == 4 || kVectSize == 8)>* =
nullptr>
7142 const DFromV<
decltype(v)> d;
7144 const Rebind<uint16_t,
decltype(
d)> du16;
7148 const auto shuf16_result = Per4LaneBlockShuffle(
7153template <
size_t kIdx3210,
size_t kVectSize,
class V>
7162 const auto zero =
Zero(
d);
7172 BitCast(di16, hi_shuf_result)));
7176template <
size_t kIdx3210,
class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
7180 return V{_mm_shuffle_epi32(v.raw,
static_cast<int>(kIdx3210 & 0xFF))};
7183template <
size_t kIdx3210,
class V, HWY_IF_FLOAT(TFromV<V>)>
7187 return V{_mm_shuffle_ps(v.raw, v.raw,
static_cast<int>(kIdx3210 & 0xFF))};
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
  const DFromV<decltype(v)> d;
  const Full64<uint64_t> du64;
      d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));

#if HWY_TARGET <= HWY_SSSE3
template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));

template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
  constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
  const auto vu64 = BitCast(du64, v);
      BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
      BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64);
  const auto v_lo = ShiftLeftBytes<8>(du64, v_hi);
  const int shl_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
template <class D, HWY_IF_LANES_D(D, 1)>

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
  return detail::SlideUpLanes(v, amt);

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
        return ShiftLeftLanes<2>(d, v);
        return ShiftLeftLanes<3>(d, v);
  return detail::SlideUpLanes(v, amt);

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
        return ShiftLeftLanes<2>(d, v);
        return ShiftLeftLanes<3>(d, v);
        return ShiftLeftLanes<4>(d, v);
        return ShiftLeftLanes<5>(d, v);
        return ShiftLeftLanes<6>(d, v);
        return ShiftLeftLanes<7>(d, v);
  return detail::SlideUpLanes(v, amt);

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
        return ShiftLeftLanes<2>(d, v);
        return ShiftLeftLanes<3>(d, v);
        return ShiftLeftLanes<4>(d, v);
        return ShiftLeftLanes<5>(d, v);
        return ShiftLeftLanes<6>(d, v);
        return ShiftLeftLanes<7>(d, v);
        return ShiftLeftLanes<8>(d, v);
        return ShiftLeftLanes<9>(d, v);
        return ShiftLeftLanes<10>(d, v);
        return ShiftLeftLanes<11>(d, v);
        return ShiftLeftLanes<12>(d, v);
        return ShiftLeftLanes<13>(d, v);
        return ShiftLeftLanes<14>(d, v);
        return ShiftLeftLanes<15>(d, v);
  return detail::SlideUpLanes(v, amt);
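
// The dispatch above relies on GCC/Clang's __builtin_constant_p: when the
// caller passes a compile-time-constant `amt`, each case collapses to a
// single immediate-shift ShiftLeftLanes<k>; otherwise it falls through to
// the generic detail::SlideUpLanes. A minimal usage sketch (`hn` is an
// assumed namespace alias for this header's namespace):
//   const hn::Full128<uint32_t> d;
//   const auto v = hn::Iota(d, 0);             // {0, 1, 2, 3}
//   const auto r = hn::SlideUpLanes(d, v, 1);  // {0, 0, 1, 2}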
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
  const DFromV<decltype(v)> d;
                     static_cast<int>(amt * sizeof(TFromV<V>) * 8)));

#if HWY_TARGET <= HWY_SSSE3
template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));

template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
  constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
  const auto vu64 = BitCast(du64, v);
      BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
      BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64);
  const auto v_hi = ShiftRightBytes<8>(du64, v_lo);
  const int shr_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
template <class D, HWY_IF_LANES_D(D, 1)>

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
  return detail::SlideDownLanes(v, amt);

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
        return ShiftRightLanes<2>(d, v);
        return ShiftRightLanes<3>(d, v);
  return detail::SlideDownLanes(v, amt);

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
        return ShiftRightLanes<2>(d, v);
        return ShiftRightLanes<3>(d, v);
        return ShiftRightLanes<4>(d, v);
        return ShiftRightLanes<5>(d, v);
        return ShiftRightLanes<6>(d, v);
        return ShiftRightLanes<7>(d, v);
  return detail::SlideDownLanes(v, amt);

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
        return ShiftRightLanes<2>(d, v);
        return ShiftRightLanes<3>(d, v);
        return ShiftRightLanes<4>(d, v);
        return ShiftRightLanes<5>(d, v);
        return ShiftRightLanes<6>(d, v);
        return ShiftRightLanes<7>(d, v);
        return ShiftRightLanes<8>(d, v);
        return ShiftRightLanes<9>(d, v);
        return ShiftRightLanes<10>(d, v);
        return ShiftRightLanes<11>(d, v);
        return ShiftRightLanes<12>(d, v);
        return ShiftRightLanes<13>(d, v);
        return ShiftRightLanes<14>(d, v);
        return ShiftRightLanes<15>(d, v);
  return detail::SlideDownLanes(v, amt);
#if HWY_TARGET <= HWY_AVX2

#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

          (1 << 4) | (1 << 8))>
                    size_t max_lanes_to_store) {
  const size_t num_lanes_to_store =
#if HWY_COMPILER_MSVC
#if HWY_COMPILER_MSVC
  detail::MaybeUnpoison(p, num_lanes_to_store);

#if HWY_TARGET > HWY_AVX3
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 0) {
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store >= 1) {
    p[static_cast<size_t>(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v);

template <class D, HWY_IF_T_SIZE_D(D, 1)>
                                       size_t num_lanes_to_store) {
  const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing);
  if ((num_lanes_to_store & 2) != 0) {
    const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128));
    p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
    CopyBytes<sizeof(uint16_t)>(&u16_bits,
                                p + (num_lanes_to_store & ~size_t{3}));
    p[num_lanes_to_store - 1] = GetLane(v_full128);

template <class D, HWY_IF_T_SIZE_D(D, 2)>
                                       size_t num_lanes_to_store) {
    p[num_lanes_to_store - 1] = GetLane(v_trailing);

  const size_t num_lanes_to_store =
  const Repartition<int32_t, decltype(d_full)> di32_full;
  const auto i32_store_mask = BitCast(
#if HWY_COMPILER_MSVC
      reinterpret_cast<int32_t*>(p));
  constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>);
  constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1;
  const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask);
  if (trailing_n != 0) {
        num_lanes_to_store / kNumOfLanesPerI32));
    detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store);
#if HWY_COMPILER_MSVC
  detail::MaybeUnpoison(p, num_lanes_to_store);
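
// StoreN writes only the first min(max_lanes_to_store, Lanes(d)) lanes of v
// to p. On AVX2, 4/8-byte lanes use a masked store directly, while the
// 1/2-byte paths above store whole int32 chunks under a mask and then handle
// the trailing 1..3 lanes with scalar extracts. A hedged usage sketch
// (StoreN and Set are the public ops; `hn` is an assumed namespace alias):
//   float out[4] = {};
//   const hn::Full128<float> df;
//   hn::StoreN(hn::Set(df, 1.0f), df, out, 3);  // writes out[0..2] only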
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
  const Half<decltype(d)> dh;
  const VU lo{BitCast(duh, lo_half).raw};
  const VU hi{BitCast(duh, hi_half).raw};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
  const Half<decltype(du)> duh;

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
  const Half<decltype(du)> duh;

template <class D, HWY_X86_IF_EMULATED_D(D)>
  const Half<decltype(du)> duh;

template <class D, HWY_IF_V_SIZE_D(D, 16)>

template <class D, HWY_IF_V_SIZE_D(D, 16)>

template <class D, HWY_IF_V_SIZE_D(D, 16)>
  return CombineShiftRightBytes<8>(d, hi, lo);
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
#if HWY_TARGET >= HWY_SSSE3
                                          _MM_SHUFFLE2(1, 0))});

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
#if HWY_TARGET >= HWY_SSSE3

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
#if HWY_TARGET >= HWY_SSSE3

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
  return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
#if HWY_TARGET == HWY_SSE2
  const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
                                     _MM_SHUFFLE(2, 0, 2, 0))};
  alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
#if HWY_TARGET == HWY_SSE2
  const Twice<decltype(dw)> dw_2;
  const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
  const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
  return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
  alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
  const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
  return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)});

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
#if HWY_TARGET == HWY_SSE2
  const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
  return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw),
                                     _MM_SHUFFLE(2, 0, 2, 0))};
  alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
                                      _MM_SHUFFLE(3, 1, 3, 1))});
template <class D, HWY_IF_LANES_D(D, 2)>

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
  const Vec128<uint16_t> mask = Set(dw, 0x00FF);
  const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
  const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
  return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
#if HWY_TARGET == HWY_SSE2
  const Vec64<uint16_t> mask = Set(dw, 0x00FF);
  const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask);
  const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask);
  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
                                     _MM_SHUFFLE(2, 0, 2, 0))};
  alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
#if HWY_TARGET == HWY_SSE2
  const Twice<decltype(dw)> dw_2;
  const Vec32<uint16_t> mask = Set(dw, 0x00FF);
  const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask);
  const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask);
  const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
  return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
  alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
#if HWY_TARGET <= HWY_SSE4
  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
  return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)});
#elif HWY_TARGET == HWY_SSE2
  const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
#if HWY_TARGET == HWY_SSE2
  alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
                                      _MM_SHUFFLE(2, 0, 2, 0))});

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  return VFromD<D>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
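
// ConcatEven(d, hi, lo) gathers the even-indexed lanes: those of lo into the
// lower half of the result and those of hi into the upper half; ConcatOdd
// does the same for odd lanes. For u32 lanes with lo = {0,1,2,3} and
// hi = {4,5,6,7}, ConcatEven yields {0,2,4,6} and ConcatOdd yields {1,3,5,7}.
// The f32 case is a single shuffle because _MM_SHUFFLE(2,0,2,0) selects
// lanes 0 and 2 from each source.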
template <class D, HWY_IF_LANES_D(D, 2)>

template <typename T>

template <typename T>

template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 2)>
#if HWY_TARGET <= HWY_SSSE3
      du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);

template <typename T, HWY_IF_T_SIZE(T, 2)>
                        BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))});

template <class V, HWY_IF_T_SIZE_V(V, 2)>
#if HWY_TARGET <= HWY_SSSE3
      du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c);
      d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
             _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)),
             _MM_SHUFFLE(2, 2, 0, 0))});

template <typename T, HWY_IF_UI32(T)>
HWY_API Vec128<T> DupEven(Vec128<T> v) {
  return Vec128<T>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};

template <typename T, HWY_IF_T_SIZE(T, 1)>

template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 1)>
#if HWY_TARGET <= HWY_SSSE3
      du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
                        BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))});

template <typename V, HWY_IF_T_SIZE_V(V, 2), HWY_IF_V_SIZE_GT_V(V, 8)>
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_TARGET <= HWY_SSSE3
  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
      du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e);
  return TableLookupBytes(v, BitCast(d, shuffle));
      d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
             _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)),
             _MM_SHUFFLE(3, 3, 1, 1))});

template <typename T, size_t N, HWY_IF_UI32(T)>
  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
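
// DupEven/DupOdd copy each even/odd lane over its neighbor: for
// v = {a, b, c, d}, DupEven(v) = {a, a, c, c} and DupOdd(v) = {b, b, d, d}.
// SSSE3+ uses one byte-shuffle; the SSE2 16-bit fallback combines
// _mm_shufflelo_epi16 and _mm_shufflehi_epi16 with the same immediate.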
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
  const Indices128<T, N * 2> idx2{idx.raw};

template <typename T, HWY_IF_T_SIZE(T, 1)>
#if HWY_TARGET <= HWY_AVX3_DL
#if HWY_TARGET <= HWY_SSE4
  const auto sel_hi_mask =
  const auto sel_hi_mask =
#if HWY_TARGET <= HWY_AVX3
  const Vec128<T> lookup_result{_mm_mask_shuffle_epi8(
      lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)};
  return lookup_result;
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);

template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)};
#elif HWY_TARGET == HWY_SSE2
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const Vec128<T> idx_vec{idx.raw};
  const auto sel_hi_mask =
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
  const DFromV<decltype(a)> d;
                                            Indices128<uint8_t>{idx.raw}));

template <typename T, HWY_IF_UI32(T)>
                                       Indices128<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)};
  const DFromV<decltype(a)> d;
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
  const Vec128<T> idx_vec{idx.raw};
#if HWY_TARGET <= HWY_AVX2
  const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3});
      IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
                                            Indices128<uint8_t>{idx.raw}));

                                               Vec128<float16_t> b,
                                               Indices128<float16_t> idx) {
  return Vec128<float16_t>{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)};

#if HWY_TARGET <= HWY_AVX3
#elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
#if HWY_TARGET <= HWY_AVX2
  const auto sel_hi_mask =
  const auto sel_hi_mask =
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);

template <typename T, HWY_IF_UI64(T)>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)};
  const DFromV<decltype(a)> d;
  const Vec128<T> idx_vec{idx.raw};
  const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw};
#if HWY_TARGET <= HWY_SSE4
      Set(di32, int32_t{1}))));
      IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));

#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_SSE4
  const auto sel_hi_mask =
      Set(di32, int32_t{1}))));
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
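
// The pre-AVX3 fallbacks above share one idea: look up idx in each table
// separately, then blend with sel_hi_mask = (idx >= N). Lane i of the result
// is a[idx[i]] if idx[i] < N, else b[idx[i] - N]. For u32 (N = 4),
// idx = {0, 5, 2, 7} produces {a[0], b[1], a[2], b[3]}.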
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t mask[16] = {
      0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  const DFromV<decltype(a)> d;
#if HWY_TARGET >= HWY_SSSE3
  alignas(16) static constexpr uint8_t mask[16] = {
      0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};

template <typename T, size_t N, HWY_IF_UI32(T)>
#if HWY_TARGET >= HWY_SSSE3
  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
  const DFromV<decltype(a)> d;
  return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  const DFromV<decltype(a)> d;
#if HWY_TARGET >= HWY_SSSE3
      d, Vec128<double, N>{_mm_shuffle_pd(
  return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,

HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
#if HWY_TARGET >= HWY_SSSE3
  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
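
// OddEven(a, b) keeps the odd-indexed lanes of a and the even-indexed lanes
// of b: {b0, a1, b2, a3}. On SSE4+ this is a single blend (immediate 5 =
// 0b0101 selects b in lanes 0 and 2); SSE2/SSSE3 emulate it with a constant
// byte mask or a shuffle/unpack pair.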
template <class D, HWY_IF_LANES_LE_D(D, 2)>

template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>

template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>

#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
  return VFromD<D>{_mm_mask_shuffle_epi32(
      a.raw, static_cast<__mmask8>(0x0A), b.raw,
      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};

template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
  return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A),
                                       b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))};

template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
      d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw,
                                             _MM_SHUFFLE(3, 1, 2, 0))});

template <class D, HWY_IF_LANES_LE_D(D, 2)>

template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>

template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>

#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
  return VFromD<D>{_mm_mask_shuffle_epi32(
      b.raw, static_cast<__mmask8>(0x05), a.raw,
      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};

template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
  return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05),
                                       a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))};

template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
      d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw,
                                             _MM_SHUFFLE(3, 1, 2, 0))});

template <typename T, size_t N>
template <typename T, size_t N>
#if HWY_TARGET == HWY_AVX2
  const Rebind<uint32_t, decltype(d)> du32;
#elif HWY_TARGET > HWY_AVX2

template <typename T, HWY_IF_T_SIZE(T, 2)>
  const DFromV<decltype(v)> d;
  const Rebind<float, decltype(dw)> df;
  const auto zero = Zero(d);
  const auto upper = exp + Set(d, 0x3F80);
  const auto f0 = ZipLower(dw, zero, upper);
  const auto f1 = ZipUpper(dw, zero, upper);
  const VFromD<decltype(dw)> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
  const VFromD<decltype(dw)> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
#if HWY_TARGET <= HWY_SSE4
  return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Twice<decltype(du)> dt_u;
  const auto upper = exp + Set(d, 0x3F80);
  const VFromD<decltype(dt_w)> bits0{_mm_cvtps_epi32(BitCast(dt_f, f0).raw)};
#if HWY_TARGET <= HWY_SSE4
  return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)};
#elif HWY_TARGET == HWY_SSSE3
  const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
  const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0)));
  return VFromD<decltype(du)>{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto exp = ShiftLeft<23>(v);
  const auto f = exp + Set(d, 0x3F800000);
  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
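
// Pow2(v) computes 2^v per lane by building the IEEE-754 bit pattern
// directly: v is shifted into the exponent field and the bias is added
// (0x3F80 in the 16-bit paths, 0x3F800000 for f32), giving the float 2^v;
// converting that float back to integer recovers 2^v exactly. Worked example
// for 32-bit lanes: v = 5 -> (5 + 127) << 23 is the bit pattern of 32.0f ->
// _mm_cvtps_epi32 -> 32. This underlies the variable-shift emulation below,
// since x << v == x * 2^v.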
#if HWY_TARGET <= HWY_AVX3
#elif HWY_TARGET == HWY_AVX2
  return v * Pow2(bits);

#if HWY_TARGET > HWY_AVX3
#if HWY_TARGET <= HWY_SSE4
  const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
  return Vec16<uint16_t>{_mm_sll_epi16(v.raw, bits16.raw)};

#if HWY_TARGET <= HWY_AVX3
  const Rebind<uint16_t, decltype(d)> du16;
#elif HWY_TARGET <= HWY_AVX2
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
  const DFromV<decltype(v)> d;
  const Rebind<uint32_t, decltype(d)> du32;

template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
  const Half<decltype(d)> dh;
  const Rebind<uint16_t, decltype(d)> du16;
  const Rebind<uint32_t, decltype(dh)> dh_u32;
  const VFromD<decltype(dh_u32)> lo_shl_result =
      PromoteTo(dh_u32, LowerHalf(dh, v))
      << PromoteTo(dh_u32, LowerHalf(dh, bits));
  const VFromD<decltype(dh_u32)> hi_shl_result =
      PromoteTo(dh_u32, UpperHalf(dh, v))
      << PromoteTo(dh_u32, UpperHalf(dh, bits));
  const VFromD<decltype(du16)> u16_shl_result = ConcatEven(
      du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result));
  return TruncateTo(d, u16_shl_result);

#if HWY_TARGET <= HWY_AVX3_DL
  alignas(16) static constexpr uint8_t kMasks[16] = {
      0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
  alignas(16) static constexpr uint8_t kShl[16] = {1,    2,    4,    8,   0x10,
                                                   0x20, 0x40, 0x80, 0x00};
  const VFromD<decltype(d)> mul =
  return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)};
#elif HWY_TARGET <= HWY_AVX2
  using VW = VFromD<decltype(dw)>;
  const VW even_mask = Set(dw, 0x00FF);
  const VW odd_mask = Set(dw, 0xFF00);
  const VW bits16 = BitCast(dw, bits);
  const VW evens = Shl(tag, vw, And(bits16, even_mask));
  const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16));

#if HWY_TARGET <= HWY_SSE4
#if HWY_TARGET >= HWY_SSE4
  return v * Pow2(bits);
#if HWY_TARGET >= HWY_SSE4
#if HWY_TARGET == HWY_SSE4
#if HWY_TARGET >= HWY_SSE4
  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);

template <typename T, size_t N>
  const DFromV<decltype(v)> di;

template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
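
// operator<< dispatches via hwy::TypeTag so signed and unsigned lanes share
// the unsigned implementation. A minimal usage sketch (`hn` is an assumed
// namespace alias):
//   const hn::Full128<uint16_t> d;
//   const auto r = hn::Set(d, 3) << hn::Iota(d, 0);  // {3<<0, 3<<1, ...}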
#if HWY_TARGET <= HWY_AVX2
#if HWY_TARGET <= HWY_AVX3
  const Rebind<uint16_t, decltype(d)> du16;
  const DFromV<decltype(v)> d;
  const Rebind<uint32_t, decltype(d)> du32;

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
  const DFromV<decltype(v)> d;
  const Rebind<uint32_t, decltype(d)> du32;
  const RebindToSigned<decltype(du32)> di32;
      BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));

template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
  const Half<decltype(d)> dh;
  const Rebind<int16_t, decltype(d)> di16;
  const Rebind<uint16_t, decltype(d)> du16;
  const Rebind<int32_t, decltype(dh)> dh_i32;
  const Rebind<uint32_t, decltype(dh)> dh_u32;
  const auto lo_shr_result =
      BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >>
                          PromoteTo(dh_u32, LowerHalf(dh, bits)));
  const auto hi_shr_result =
      BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >>
                          PromoteTo(dh_u32, UpperHalf(dh, bits)));
  const auto i16_shr_result =
      BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result));
  return DemoteTo(d, i16_shr_result);

#if HWY_TARGET <= HWY_AVX3
#elif HWY_TARGET <= HWY_AVX2
  return detail::AVX2ShrU16Vec128(in, bits);
  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));

#if HWY_TARGET > HWY_AVX3
HWY_API Vec16<uint16_t> operator>>(const Vec16<uint16_t> in,
                                   const Vec16<uint16_t> bits) {
#if HWY_TARGET <= HWY_SSE4
  const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
  const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
  return Vec16<uint16_t>{_mm_srl_epi16(in.raw, bits16.raw)};

#if HWY_TARGET <= HWY_AVX2
  return detail::AVX2ShrU8Vec128(in, bits);
  using VW = VFromD<decltype(dw)>;
  const VW mask = Set(dw, 0x00FF);
  const VW vw = BitCast(dw, in);
  const VW bits16 = BitCast(dw, bits);
  const VW evens = And(vw, mask) >> And(bits16, mask);
  const VW odds = vw >> ShiftRight<8>(bits16);

#if HWY_TARGET <= HWY_SSE4
#if HWY_TARGET >= HWY_SSE4
  const DFromV<decltype(in)> d32;
  const auto mul = detail::Pow2(Set(d32, 32) - bits);
  const auto out20 = ShiftRight<32>(MulEven(in, mul));
#if HWY_TARGET >= HWY_SSE4
#if HWY_TARGET == HWY_SSE4

HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
                                    const Vec128<uint64_t> bits) {
#if HWY_TARGET >= HWY_SSE4
  const DFromV<decltype(v)> d;
  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};

                                  const Vec64<uint64_t> bits) {
  return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};

#if HWY_TARGET <= HWY_AVX3
  const Rebind<int16_t, decltype(d)> di16;
#elif HWY_TARGET <= HWY_AVX2
  const DFromV<decltype(v)> d;
  const Rebind<int32_t, decltype(d)> di32;

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
  const DFromV<decltype(v)> d;
  const Rebind<int32_t, decltype(d)> di32;
  return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));

template <class V, HWY_IF_V_SIZE_V(V, 16)>
  const DFromV<decltype(v)> d;
  const Half<decltype(d)> dh;
  const Rebind<int16_t, decltype(d)> di16;
  const Rebind<int32_t, decltype(dh)> dh_i32;
  const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >>
                             PromoteTo(dh_i32, LowerHalf(dh, bits));
  const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >>
                             PromoteTo(dh_i32, UpperHalf(dh, bits));
  const auto i16_shr_result =
      OrderedDemote2To(di16, lo_shr_result, hi_shr_result);
  return DemoteTo(d, i16_shr_result);
#if HWY_TARGET > HWY_AVX3

template <class DI, class V>
HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
  const RebindToUnsigned<DI> du;
  const auto count = BitCast(du, count_i);
  const auto abs = BitCast(du, v ^ sign);
  return BitCast(di, abs >> count) ^ sign;

#if HWY_TARGET <= HWY_AVX3
#elif HWY_TARGET <= HWY_AVX2
  return detail::AVX2ShrI16Vec128(v, bits);
  return detail::SignedShr(d, v, bits);

#if HWY_TARGET > HWY_AVX3
HWY_API Vec16<int16_t> operator>>(Vec16<int16_t> v, Vec16<int16_t> bits) {
#if HWY_TARGET <= HWY_SSE4
  const Vec16<int16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
  const auto bits16 = And(bits, Vec16<int16_t>{_mm_set_epi64x(0, 0xFFFF)});
  return Vec16<int16_t>{_mm_sra_epi16(v.raw, bits16.raw)};

#if HWY_TARGET <= HWY_AVX2
  return detail::AVX2ShrI8Vec128(v, bits);
  return detail::SignedShr(d, v, bits);

#if HWY_TARGET <= HWY_SSE4
  const Rebind<int16_t, decltype(d)> di16;
  const Twice<decltype(d)> dt;
  const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v)));

#if HWY_TARGET <= HWY_AVX2
  return detail::SignedShr(d, v, bits);

#if HWY_TARGET > HWY_AVX2
HWY_API Vec32<int32_t> operator>>(Vec32<int32_t> v, Vec32<int32_t> bits) {
#if HWY_TARGET == HWY_SSE4
  const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
  const auto bits32 = Combine(Full64<int32_t>(), Zero(Full32<int32_t>()), bits);
  return Vec32<int32_t>{_mm_sra_epi32(v.raw, bits32.raw)};

#if HWY_TARGET <= HWY_AVX3
  return detail::SignedShr(d, v, bits);

template <class V, HWY_IF_U64(TFromV<V>)>
  const DFromV<decltype(a)> du64;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;
  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);
  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);
  mulH = MulEven(aH, bH) + w1 + k;
  return ShiftLeft<32>(t) + w3;
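
// SSE2Mul128 assembles a full 64x64 -> 128-bit product from the 32x32 -> 64
// MulEven primitive. Writing a = aH*2^32 + aL and b = bH*2^32 + bL:
//   a*b = aH*bH*2^64 + (aH*bL + aL*bH)*2^32 + aL*bL
// w1/w2/w3 above are the 32-bit partial words and k is the carry out of the
// middle terms; mulH receives the high 64 bits and the return value the low.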
template <class V, HWY_IF_I64(TFromV<V>)>
static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
  const DFromV<decltype(a)> di64;
  using VU64 = VFromD<decltype(du64)>;
      di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH));

#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
template <class V, HWY_IF_UI64(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
  const V mulL = detail::SSE2Mul128(a, b, mulH);

template <class V, HWY_IF_UI64(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
  const DFromV<decltype(a)> du64;
  const V mulL = detail::SSE2Mul128(a, b, mulH);
  return InterleaveUpper(du64, mulL, mulH);

template <class V, HWY_IF_UI64(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))>
  detail::SSE2Mul128(a, b, mulH);

template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  alignas(16) T mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(d, mul);

template <class T, HWY_IF_UI64(T)>
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> d2;
  alignas(16) T mul[2];
  mul[0] = Mul128(a1, b1, &mul[1]);
  return Load(d, mul);

template <class T, HWY_IF_UI64(T)>
  return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))};

  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const auto p_lo = a * b;
  const auto p_hi = MulHigh(a, b);
  const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)),
                            ShiftRight<16>(BitCast(du32, p_lo)));
#if HWY_TARGET <= HWY_SSSE3

#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#else
#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#endif

template <class DI16, HWY_IF_I16_D(DI16), HWY_IF_V_SIZE_LE_D(DI16, 16)>

#if HWY_TARGET <= HWY_AVX3_DL

#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#else
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#endif

template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
  return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)};

  const RebindToUnsigned<decltype(df32)> du32;
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);

#if HWY_TARGET <= HWY_AVX3_DL
  return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)};

HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(
    const Vec128<int32_t, N> sum0, Vec128<int32_t, N> /*sum1*/) {

HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
    const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {

  return Add(sum0, sum1);

#if HWY_TARGET <= HWY_AVX3_DL

#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
  return VFromD<DI32>{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};

#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32)>
    VFromD<Repartition<int8_t, DI32>> a, VFromD<Repartition<int8_t, DI32>> b,
  const Repartition<uint8_t, decltype(di32)> du8;
  const auto a_u = BitCast(du8, a);
  const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum);
  const auto result_sum_1 = ShiftLeft<8>(
      SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32)));
  return result_sum_0 - result_sum_1;
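
// The signed-times-signed case reuses the unsigned-times-signed dot product:
// viewing signed a through unsigned a_u means a = a_u - 256 * (a_u >> 7)
// per byte, so sum(a*b) = sum(a_u*b) - 256 * sum((a_u >> 7) * b);
// result_sum_1 is that correction term, computed by a second dot product of
// the sign bits with b.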
#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif

template <class DU32, HWY_IF_U32_D(DU32)>
    DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
  const Repartition<uint8_t, decltype(du32)> du8;
  const RebindToSigned<decltype(du8)> di8;
  const RebindToSigned<decltype(du32)> di32;
  const auto b_i = BitCast(di8, b);
  const auto result_sum_0 =
      SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum));
  const auto result_sum_1 = ShiftLeft<8>(
      SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32)));
  return BitCast(du32, result_sum_0 - result_sum_1);

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const __m128i zero = _mm_setzero_si128();
  return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
  return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
  return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
  return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const __m128i zero = _mm_setzero_si128();
  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
  return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
  return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
#if HWY_TARGET > HWY_SSSE3
  const Rebind<uint32_t, decltype(d)> du32;
#elif HWY_TARGET == HWY_SSSE3
  alignas(16) static constexpr int8_t kShuffle[16] = {
      0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
  return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
#if HWY_TARGET > HWY_SSSE3
  const Rebind<uint32_t, decltype(d)> du32;
#elif HWY_TARGET == HWY_SSSE3
  alignas(16) static constexpr int8_t kShuffle[16] = {
      0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
  return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
  return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
  return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
  return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
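
// Pre-SSE4 sign extension above uses self-unpacking: _mm_unpacklo duplicates
// each lane into both halves of the widened lane, and the arithmetic shift
// right by the original lane width then replicates the sign bit. Example for
// i16 -> i32: lane 0x8001 unpacks to 0x80018001; ShiftRight<16> gives
// 0xFFFF8001, the correct sign extension.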
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
  return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const Half<decltype(di32)> dh_i32;
  const VFromD<decltype(di32)> s4{
      _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
  return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const Half<decltype(di32)> dh_i32;
  const VFromD<decltype(di32)> s2{
      _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
  return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)

#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
#define HWY_INLINE_F16 HWY_NOINLINE
#else
#define HWY_INLINE_F16 HWY_INLINE
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>

#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  const Rebind<uint16_t, decltype(df32)> du16;

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  return VFromD<D>{_mm_cvtepi32_pd(v.raw)};

#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  return VFromD<D>{_mm_cvtepu32_pd(v.raw)};

template <class D, HWY_IF_F64_D(D)>
  const Rebind<int32_t, decltype(df64)> di32;
      Set(df64, 4294967296.0),
#if HWY_TARGET > HWY_AVX3

template <class D, HWY_IF_LANES_D(D, 1)>

template <class D, HWY_IF_LANES_D(D, 2)>
                                 Vec128<int32_t> v) {
  const Repartition<int32_t, D> d_from;

template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
  const Repartition<int32_t, D> d_from;

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
  return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const Rebind<int32_t, D> di32;
  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
  const auto clamped = Or(zero_if_neg, too_big);
#if HWY_TARGET == HWY_SSE2
  const Rebind<uint16_t, decltype(di32)> du16;
  return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
  alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
  const auto lo2 = Load(du16, kLower2Bytes);
  return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  const DFromV<decltype(v)> du32;
#if HWY_TARGET >= HWY_SSSE3
  const auto too_big =
  const auto clamped = Or(BitCast(di32, v), too_big);
#if HWY_TARGET == HWY_SSE2
  return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
  const Repartition<uint16_t, decltype(di32)> du16_full;
  alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
  const auto lo2 = Load(du16_full, kLower2Bytes);

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  return VFromD<D>{_mm_packus_epi16(i16, i16)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  return VFromD<D>{_mm_packs_epi16(i16, i16)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
  return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
#if HWY_TARGET <= HWY_AVX3
  return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)};
  const DFromV<decltype(v)> du32;
  const auto max_i32 = Set(du32, 0x7FFFFFFFu);
#if HWY_TARGET >= HWY_SSSE3
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto clamped = BitCast(di32, Min(v, max_i32));

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  const DFromV<decltype(v)> du16;
  const auto max_i16 = Set(du16, 0x7FFF);
#if HWY_TARGET >= HWY_SSSE3
  const Repartition<uint8_t, decltype(du16)> du16_as_du8;
  const auto clamped = BitCast(di16, Min(v, max_i16));
#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
      df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});

#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>

#if HWY_AVX3_HAVE_F32_TO_BF16C
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
  __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
  __asm__("vcvtne2ps2bf16 %2, %1, %0"
          : "v"(b.raw), "v"(a.raw));
  return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))};

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
      detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)),
      _MM_SHUFFLE(2, 0, 2, 0))};

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
  const Twice<decltype(d)> dt;

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
                                     _MM_SHUFFLE(2, 0, 2, 0))};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec64<int32_t> a, Vec64<int32_t> b) {
#if HWY_TARGET >= HWY_SSSE3
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw),
                                     _MM_SHUFFLE(2, 0, 2, 0))};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
#if HWY_TARGET >= HWY_SSSE3
  const Half<decltype(dn)> dnh;
  const auto u16_a = DemoteTo(dnh, a);
  const auto u16_b = DemoteTo(dnh, b);
  return Combine(dn, u16_b, u16_a);
  return VFromD<D>{_mm_packus_epi32(a.raw, b.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
  const DFromV<decltype(a)> du32;
  const auto max_i32 = Set(du32, 0x7FFFFFFFu);
#if HWY_TARGET >= HWY_SSSE3
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto clamped_a = BitCast(
  const auto clamped_b = BitCast(
  const auto clamped_a = BitCast(di32, Min(a, max_i32));
  const auto clamped_b = BitCast(di32, Min(b, max_i32));

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
                                   VFromD<Repartition<uint32_t, D>> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
                                   VFromD<Repartition<int16_t, D>> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
                                     _MM_SHUFFLE(2, 0, 2, 0))};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
                                   VFromD<Repartition<int16_t, D>> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw),
                                     _MM_SHUFFLE(2, 0, 2, 0))};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
                                   Vec128<int16_t> b) {
  return VFromD<D>{_mm_packus_epi16(a.raw, b.raw)};
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
  const DFromV<decltype(a)> du16;
  const auto max_i16 = Set(du16, 0x7FFFu);
#if HWY_TARGET >= HWY_SSSE3
  const Repartition<uint8_t, decltype(du16)> du16_as_du8;
  const auto clamped_a = BitCast(
  const auto clamped_b = BitCast(
  const auto clamped_a = BitCast(di16, Min(a, max_i16));
  const auto clamped_b = BitCast(di16, Min(b, max_i16));

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
                                   VFromD<Repartition<uint16_t, D>> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));

template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
          HWY_IF_V_SIZE_LE_D(D, 16), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>

#if HWY_AVX3_HAVE_F32_TO_BF16C
template <class D, HWY_IF_BF16_D(D)>
                                   VFromD<Repartition<float, D>> b) {
  return ReorderDemote2To(dbf16, a, b);

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  return Min(v, Set(d, 2147483647.0));
#if HWY_COMPILER_GCC_ACTUAL

#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  return VFromD<D>{_mm_cvttpd_epi32(v.raw)};

template <class D, HWY_IF_I32_D(D)>
  const Rebind<double, decltype(di32)> df64;
  const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v);
  return DemoteInRangeTo(di32, clamped);

#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  return VFromD<D>{_mm_cvttpd_epu32(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
      _mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};

template <class D, HWY_IF_U32_D(D)>
  const Rebind<double, decltype(du32)> df64;
  const auto k2_31 = Set(df64, 2147483648.0);
  const auto v_is_ge_k2_31 = (v >= k2_31);
  const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31);
  const auto clamped_lo31_u32 =
  const auto clamped_u32_msb = ShiftLeft<31>(
  return Or(clamped_lo31_u32, clamped_u32_msb);

template <class D, HWY_IF_U32_D(D)>
  const Rebind<double, decltype(du32)> df64;
#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  return VFromD<D>{_mm_cvtepi64_ps(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  return VFromD<D>{_mm_cvtepu64_ps(v.raw)};

template <class D, HWY_IF_F32_D(D)>
  const Rebind<double, decltype(df32)> df64;
  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
  const auto f64_lo12 =
      Set(du32, uint32_t{0x00000FFF}))));
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
  const auto f64_bits_decrement =
      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
          f64_sum_is_inexact);
  const auto adj_f64_val = BitCast(
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
  return DemoteTo(df32, adj_f64_val);

template <class D, HWY_IF_F32_D(D)>
  const Rebind<double, decltype(df32)> df64;
  const auto k2p64 = Set(df64, 18446744073709551616.0);
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
      Set(du32, uint32_t{0x00000FFF}))));
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
  const auto adj_f64_val = BitCast(
      f64_sum_is_inexact));
  return DemoteTo(df32, adj_f64_val);
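
// Converting 64-bit integers to f32 via f64 would round twice, which can be
// off by one ULP. The code above therefore splits v into a high part that is
// exact in f64 plus the low 12 bits, computes the rounding error of their
// sum via the compensation term f64_carry, and nudges the f64 bit pattern
// (a round-to-odd style adjustment) so that the final f64 -> f32 rounding
// matches a single direct rounding.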
#if HWY_TARGET == HWY_SSE2
  const Rebind<uint8_t, decltype(di32)> du8;
  const DFromV<decltype(v)> d32;
  alignas(16) static constexpr uint32_t k8From32[4] = {
      0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};

#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
  const Rebind<float, decltype(di64)> df32;
  const RebindToFloat<decltype(di64)> df64;
  const Twice<decltype(df32)> dt_f32;
  return detail::FixConversionOverflow(
      BitCast(df64, InterleaveLower(ResizeBitCast(dt_f32, v),
                                    ResizeBitCast(dt_f32, v))),
      PromoteInRangeTo(di64, v));

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
  return VFromD<D>{_mm_cvttps_epi64(v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
      _mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
  return VFromD<D>{_mm_cvttps_epu64(v.raw)};

template <class D, HWY_IF_I64_D(D)>
  const Rebind<int32_t, decltype(di64)> di32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto exponent_adj = BitCast(
      BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
      BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  const auto f32_to_i32_result = ConvertTo(di32, adj_v);
      Set(di32, LimitsMax<int32_t>())))));

template <class D, HWY_IF_UI64_D(D)>
  const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
  const Repartition<uint8_t, decltype(d32)> du32_as_du8;
  const auto exponent_adj = BitCast(
      BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du}))));

template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)>
    DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
  const Rebind<int32_t, decltype(du64)> di32;
  const Twice<decltype(di32)> dt_i32;
  const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask);

template <class DU64, HWY_IF_V_SIZE_GT_D(DU64, 16)>
    DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {

template <class D, HWY_IF_U64_D(D)>
  const Rebind<int32_t, decltype(du64)> di32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto exponent_adj = BitCast(
      ShiftRight<23>(BitCast(du32, non_neg_v))),
      BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
      BitCast(du32_as_du8, Set(du32, uint32_t{33}))));
      BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj));
  const auto overflow_result =
      detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask);
#if HWY_TARGET == HWY_SSE2
                                        const Vec128<int16_t> b) {
  const DFromV<decltype(a)> d;
  auto lo_product = a * b;
  auto hi_product = MulHigh(a, b);
  const VFromD<decltype(di32)> i32_product_lo{
      _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
  const VFromD<decltype(di32)> i32_product_hi{
      _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)};
  const auto round_up_incr = Set(di32, 0x4000);
  return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr),
                          ShiftRight<15>(i32_product_hi + round_up_incr));

template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
                                           const Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<int32_t, decltype(d)> di32;
  const auto lo_product = a * b;
  const auto hi_product = MulHigh(a, b);
  const VFromD<decltype(di32)> i32_product{
      _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
  return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000)));

                                           const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
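
// MulFixedPoint15 is the Q15 rounding multiply: (a * b + 0x4000) >> 15.
// Example: a = 0x4000 (0.5 in Q15) and b = 0x2000 (0.25) give the 32-bit
// product 0x08000000; adding 0x4000 and shifting right by 15 yields 0x1000
// (0.125 in Q15). SSSE3+ provides this as a single pmulhrsw; the SSE2 paths
// above rebuild the 32-bit product from the mullo/mulhi halves.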
10409template <
typename From,
class DTo, HWY_IF_LANES_D(DTo, 1)>
10412 const Repartition<TFromD<DTo>,
DFromV<
decltype(v)>> dto;
10416template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
10418#if HWY_TARGET == HWY_SSE2
10425 alignas(16)
static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8,
10426 0, 8, 0, 8, 0, 8, 0, 8};
10432template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
10434#if HWY_TARGET == HWY_SSE2
10435 const Vec128<uint16_t, 1> lo{v.raw};
10436 const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
10441 alignas(16)
static constexpr uint16_t kIdx[8] = {
10442 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
10448template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
10450 return VFromD<D>{_mm_shuffle_epi32(v.raw, 0x88)};
10453template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
10455 const DFromV<decltype(v)> du32;
10456#if HWY_TARGET == HWY_SSE2
10458 const Rebind<uint8_t, decltype(di32)> du8;
10459 return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v))));
10462 alignas(16) static constexpr uint8_t kIdx[16] = {
10463 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
10464 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
10469template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
10471 const DFromV<decltype(v)> du32;
10472#if HWY_TARGET == HWY_SSE2
10474 const Rebind<uint16_t, decltype(di32)> du16;
10477 du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v)))));
10484template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
10486 const DFromV<decltype(v)> du16;
10487#if HWY_TARGET == HWY_SSE2
10489 const Rebind<uint8_t, decltype(di16)> du8;
10501#if HWY_TARGET <= HWY_AVX3
10502template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
10504 return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)};
10506template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
10508 return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)};
10510template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
10512 return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)};
10515template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10517 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
10518 return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
10520template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
10522 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
10523 return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
10525template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
10527 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
10528 return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
10531template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10533 return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)};
10535template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
10537 return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)};
10539template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
10541 return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)};
10554#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
10555#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8)
10558template <class D, HWY_IF_UNSIGNED_D(D)>
10560 D , VFromD<Rebind<uint64_t, D>> v) {
10564template <class D, HWY_IF_SIGNED_D(D)>
10566 D , VFromD<Rebind<uint64_t, D>> v) {
10567 const DFromV<decltype(v)> du64;
10574 D dn, VFromD<Rebind<uint64_t, D>> v) {
10575 const Rebind<uint64_t, D> du64;
10577 constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) -
10578 static_cast<int>(hwy::IsSigned<TFromD<D>>());
10580 const auto too_big = BitCast(
10582 di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
10586template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V>
10596 const DFromV<decltype(v)> di64;
10603 const auto saturated_vals = Xor(
10605 detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v))));
10612 const DFromV<decltype(v)> di64;
10616 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
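// These saturating demotes share one unsigned helper: if any bit above the
// target width is set, all-ones are OR'd in so that TruncateTo yields the
// maximum. Scalar sketch for u64 -> u32 (illustration only):
//   uint32_t DemoteU64ToU32Saturate(uint64_t v) {
//     const uint64_t too_big = (v >> 32) ? ~uint64_t{0} : uint64_t{0};
//     return static_cast<uint32_t>(v | too_big);  // saturates to 0xFFFFFFFF
//   }
// Signed inputs are first mapped onto this unsigned form via the Xor masks
// above, and negative inputs to unsigned targets are zeroed beforehand.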
10629#if HWY_TARGET == HWY_SSE2
10633 const Rebind<int32_t, decltype(dn)> di32;
10641 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v));
10648 VFromD<Repartition<int64_t, D>> b) {
10649 const DFromV<decltype(a)> d;
10650 const Twice<decltype(d)> dt;
10656 VFromD<Repartition<uint64_t, D>> b) {
10657 const DFromV<decltype(a)> d;
10658 const Twice<decltype(d)> dt;
10662#if HWY_TARGET > HWY_AVX3
10665 VFromD<Repartition<uint64_t, D>> b) {
10666 const DFromV<decltype(a)> d;
10667 const Twice<decltype(d)> dt;
10672#if HWY_TARGET > HWY_AVX2
10673template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
10675 Vec128<int64_t> b) {
10676 const DFromV<decltype(a)> di64;
10678 const Half<decltype(dn)> dnh;
10684 const auto saturated_a = Xor(
10686 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
10687 const auto saturated_b = Xor(
10689 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));
10694template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
10696 Vec128<int64_t> b) {
10697 const DFromV<decltype(a)> di64;
10699 const Half<decltype(dn)> dnh;
10701 const auto saturated_a = detail::DemoteFromU64Saturate(
10703 const auto saturated_b = detail::DemoteFromU64Saturate(
10709template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
10711 Vec128<uint64_t> b) {
10712 const Half<decltype(dn)> dnh;
10714 const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
10715 const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);
10723#if HWY_HAVE_FLOAT16
10724template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
10726 return VFromD<D>{_mm_cvtepu16_ph(v.raw)};
10728template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
10730 return VFromD<D>{_mm_cvtepi16_ph(v.raw)};
10734template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
10736 return VFromD<D>{_mm_cvtepi32_ps(v.raw)};
10739#if HWY_TARGET <= HWY_AVX3
10740template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
10742 return VFromD<D>{_mm_cvtepu32_ps(v.raw)};
10745template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
10747 return VFromD<D>{_mm_cvtepi64_pd(v.raw)};
10750template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
10752 return VFromD<D>{_mm_cvtepu64_pd(v.raw)};
10756template <class D, HWY_IF_F32_D(D)>
10759 const RebindToUnsigned<decltype(df)> du32;
10760 const RebindToSigned<decltype(df)> d32;
10762 const auto msk_lo = Set(du32, 0xFFFF);
10763 const auto cnst2_16_flt = Set(df, 65536.0f);
10766 const auto v_lo = BitCast(d32, And(v, msk_lo));
10767 const auto v_hi = BitCast(d32, ShiftRight<16>(v));
10768 return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
10772template <class D, HWY_IF_F64_D(D)>
10775 const Repartition<uint32_t, decltype(dd)> d32;
10776 const Repartition<uint64_t, decltype(dd)> d64;
10779 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
10780 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
10783 const auto k52 = Set(d32, 0x43300000);
10784 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
10786 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
10787 return (v_upper - k84_63_52) + v_lower;
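// The two magic constants implement the classic bias trick for i64 -> f64:
// the low 32 bits are deposited into a double whose exponent encodes 2^52
// (0x43300000), the high 32 bits (with a 2^63 sign offset from the XOR) into
// one encoding 2^84, and subtracting k84_63_52 = 2^84 + 2^63 + 2^52 cancels
// every bias:
//   (2^84 + 2^63 + hi*2^32 - (2^84 + 2^63 + 2^52)) + (2^52 + lo)
//     == hi * 2^32 + lo
// where hi is the signed upper word and lo the unsigned lower word of v.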
10793 const DFromV<decltype(w)> d64;
10795 const auto cnst2_52_dbl = Set(dd, 0x0010000000000000);
10801template <class D, HWY_IF_F64_D(D)>
10805 using VU = VFromD<decltype(d64)>;
10807 const VU msk_lo = Set(d64, 0xFFFFFFFF);
10808 const auto cnst2_32_dbl = Set(dd, 4294967296.0);
10811 const VU v_lo = And(v, msk_lo);
10812 const VU v_hi = ShiftRight<32>(v);
10814 const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo);
10815 return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl);
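// U64ToF64VecFast relies on w < 2^52: OR-ing w into a double whose exponent
// encodes 2^52 yields the exact value 2^52 + w, so subtracting 2^52 recovers
// w as a double. Scalar sketch (illustration only; assumes IEEE-754 binary64):
//   double U64Below2p52ToF64(uint64_t w) {
//     const uint64_t bits = w | 0x4330000000000000ULL;  // exponent of 2^52
//     double d;
//     memcpy(&d, &bits, sizeof(d));
//     return d - 4503599627370496.0;  // 2^52
//   }
// The full u64 conversion above then combines the two 32-bit halves via
// MulAdd with 2^32.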
10821#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
10822#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
10824#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
10827#if HWY_HAVE_FLOAT16
10828template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
10830 return VFromD<D>{_mm_cvttph_epi16(v.raw)};
10834template <class D, HWY_IF_I16_D(D)>
10839template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
10841 return VFromD<D>{_mm_cvttph_epu16(v.raw)};
10844template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
10847 _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10851template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
10853 return VFromD<D>{_mm_cvttps_epi32(v.raw)};
10857template <class D, HWY_IF_I32_D(D)>
10862#if HWY_TARGET <= HWY_AVX3
10863template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
10869template <class DI, HWY_IF_I64_D(DI)>
10874template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
10879template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
10882 _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10885template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
10890template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
10893 _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10900template <class DU32, HWY_IF_U32_D(DU32)>
10906 exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v));
10907 const auto scale_down_f32_val_mask =
10910 const auto v_scaled =
10911 BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask));
10912 const auto f32_to_u32_result =
10915 return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask);
10922template <class DU32, HWY_IF_U32_D(DU32)>
10924 VFromD<RebindToFloat<DU32>> v) {
10926 const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff);
10927 return f32_to_u32_result;
10932template <class DU32, HWY_IF_U32_D(DU32)>
10938 const auto f32_to_u32_result =
10939 detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff);
10941 return Or(f32_to_u32_result,
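// For inputs >= 2^31, cvttps would overflow signed int32, so lanes whose
// biased exponent is >= 158 (= 127 + 31) are halved by subtracting 1 from the
// exponent field, converted, and then doubled again. Scalar sketch
// (illustration only; assumes 0 <= f < 2^32):
//   uint32_t F32ToU32InRange(float f) {
//     uint32_t bits;
//     memcpy(&bits, &f, sizeof(bits));
//     const bool big = (bits >> 23) >= 158;  // f >= 2^31
//     if (big) {
//       bits -= 1u << 23;  // exponent - 1 halves the value exactly
//       memcpy(&f, &bits, sizeof(f));
//     }
//     const uint32_t r = static_cast<uint32_t>(static_cast<int32_t>(f));
//     return big ? r + r : r;  // matches result + (result & mask) above
//   }
// Halving such lanes is exact because any f >= 2^31 has an ulp of at least
// 2^8, so no mantissa bits are lost.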
10947template <class D64, HWY_IF_UI64_D(D64)>
10949 VFromD<Rebind<double, D64>> v,
10953 using VU64 = VFromD<decltype(du64)>;
10954 const Repartition<uint16_t, decltype(di64)> du16;
10955 const VU64 k1075 = Set(du64, 1075);
10960 biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF}));
10981 const VU64 shift_mnt = BitCast(
10983 const VU64 shift_int = BitCast(
10985 const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1);
10988 const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt;
10996 return BitCast(d64, int53 << shift_int);
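// ConvAbsInRangeF64ToUI64 decodes the double directly: the implicit-1
// mantissa is shifted right while the biased exponent is below 1075
// (= 1023 bias + 52 mantissa bits) and shifted left once it is above.
// Scalar sketch (illustration only; assumes 1.0 <= v < 2^64):
//   uint64_t AbsF64ToU64InRange(double v) {
//     uint64_t bits;
//     memcpy(&bits, &v, sizeof(bits));
//     const uint64_t biased_exp = (bits >> 52) & 0x7FF;
//     const uint64_t int53 = (bits & ((1ULL << 52) - 1)) | (1ULL << 52);
//     return (biased_exp <= 1075) ? (int53 >> (1075 - biased_exp))
//                                 : (int53 << (biased_exp - 1075));
//   }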
11002template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
11004 return VFromD<DI>{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
11006template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
11008 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
11009 const Full64<double> dd2;
11010 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
11011 return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
11014template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
11020#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
11027 const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
11031 return (shifted ^ sign_mask) - sign_mask;
11040 const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
11042#if HWY_TARGET <= HWY_SSE4
11043 const auto in_range = biased_exp < Set(di, 1086);
11053 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
11054 const VI magnitude = IfThenElse(in_range, shifted, limit);
11057 return (magnitude ^ sign_mask) - sign_mask;
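// (x ^ s) - s is a conditional negation: with s == 0 it is the identity, and
// with s == ~0 (the sign mask) it equals ~x + 1 == -x in two's complement, so
// the magnitude computed from |v| regains the sign of the original input.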
11062template <class DU, HWY_IF_U64_D(DU)>
11065 const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp);
11070template <class DU, HWY_IF_U64_D(DU)>
11072 const RebindToSigned<DU> di;
11077 detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp);
11080#if HWY_TARGET <= HWY_SSE4
11081 const VU out_of_range =
11085 const VU out_of_range = BitCast(
11090 return (shifted | out_of_range);
11097 return detail::FixConversionOverflow(
11098 di, v, VFromD<decltype(di)>{_mm_cvtps_epi32(v.raw)});
11103#if HWY_TARGET >= HWY_SSSE3
11106template <typename T, size_t N>
11108 static_assert(IsFloat<T>(), "Only for float");
11112 const DFromV<decltype(v)> df;
11113 const auto max = Set(df, MantissaEnd<T>());
11115 const auto added = large + v;
11116 const auto rounded = added - large;
11126template <typename T, size_t N>
11128 static_assert(IsFloat<T>(), "Only for float");
11130 return Abs(v) < Set(d, MantissaEnd<T>());
11136template <typename T, size_t N>
11137HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
11138 static_assert(IsFloat<T>(), "Only for float");
11139 const DFromV<decltype(v)> df;
11140 const RebindToSigned<decltype(df)> di;
11142 const auto integer = ConvertTo(di, v);
11143 const auto int_f = ConvertTo(df, integer);
11145 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
11149template <typename T, size_t N>
11151 static_assert(IsFloat<T>(), "Only for float");
11152 const DFromV<decltype(v)> df;
11156 const auto int_f = ConvertTo(df, integer);
11161 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
11165template <typename T, size_t N>
11167 static_assert(IsFloat<T>(), "Only for float");
11168 const DFromV<decltype(v)> df;
11172 const auto int_f = ConvertTo(df, integer);
11177 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
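// The "large" trick in Round: adding MantissaEnd (2^23 for float, 2^52 for
// double) forces the FPU to discard the fractional bits in round-to-nearest-
// even mode, and subtracting it back leaves the rounded integer. Scalar
// sketch (illustration only; assumes 0 <= f < 2^23 and the default rounding
// mode):
//   float RoundNearestEven(float f) {
//     const float large = 8388608.0f;  // 2^23
//     return (f + large) - large;      // rounding happens inside the add
//   }
// UseInt guards the Trunc/Ceil/Floor paths: lanes at or beyond MantissaEnd
// are already integers and pass through unchanged.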
11183#if HWY_HAVE_FLOAT16
11185HWY_API Vec128<float16_t, N> Round(const Vec128<float16_t, N> v) {
11186 return Vec128<float16_t, N>{
11187 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
11191HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
11192 return Vec128<float, N>{
11193 _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
11196HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
11197 return Vec128<double, N>{
11198 _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
11202#if HWY_HAVE_FLOAT16
11204HWY_API Vec128<float16_t, N> Trunc(const Vec128<float16_t, N> v) {
11205 return Vec128<float16_t, N>{
11206 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
11210HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
11211 return Vec128<float, N>{
11212 _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
11215HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
11216 return Vec128<double, N>{
11217 _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
11221#if HWY_HAVE_FLOAT16
11223HWY_API Vec128<float16_t, N> Ceil(const Vec128<float16_t, N> v) {
11224 return Vec128<float16_t, N>{
11225 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
11229HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
11230 return Vec128<float, N>{
11231 _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
11234HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
11235 return Vec128<double, N>{
11236 _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
11240#if HWY_HAVE_FLOAT16
11242HWY_API Vec128<float16_t, N> Floor(const Vec128<float16_t, N> v) {
11243 return Vec128<float16_t, N>{
11244 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
11248HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
11249 return Vec128<float, N>{
11250 _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
11253HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
11254 return Vec128<double, N>{
11255 _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
11262#define HWY_X86_FPCLASS_QNAN 0x01
11263#define HWY_X86_FPCLASS_POS0 0x02
11264#define HWY_X86_FPCLASS_NEG0 0x04
11265#define HWY_X86_FPCLASS_POS_INF 0x08
11266#define HWY_X86_FPCLASS_NEG_INF 0x10
11267#define HWY_X86_FPCLASS_SUBNORMAL 0x20
11268#define HWY_X86_FPCLASS_NEG 0x40
11269#define HWY_X86_FPCLASS_SNAN 0x80
11271#if HWY_HAVE_FLOAT16 || HWY_IDE
11274HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) {
11275 return Mask128<float16_t, N>{
11281 Vec128<float16_t, N> b) {
11285 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11290HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) {
11291 return Mask128<float16_t, N>{_mm_fpclass_ph_mask(
11296HWY_API Mask128<float16_t, N> IsFinite(const Vec128<float16_t, N> v) {
11299 return Not(Mask128<float16_t, N>{_mm_fpclass_ph_mask(
11308#if HWY_TARGET <= HWY_AVX3
11317#if HWY_TARGET <= HWY_AVX3
11325#ifdef HWY_NATIVE_IS_EITHER_NAN
11326#undef HWY_NATIVE_IS_EITHER_NAN
11328#define HWY_NATIVE_IS_EITHER_NAN
11333#if HWY_TARGET <= HWY_AVX3
11343#if HWY_TARGET <= HWY_AVX3
11350#if HWY_TARGET <= HWY_AVX3
11353#ifdef HWY_NATIVE_ISINF
11354#undef HWY_NATIVE_ISINF
11356#define HWY_NATIVE_ISINF
11390#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4
11393#ifdef HWY_NATIVE_AES
11394#undef HWY_NATIVE_AES
11396#define HWY_NATIVE_AES
11399HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
11400 Vec128<uint8_t> round_key) {
11401 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
11405 Vec128<uint8_t> round_key) {
11406 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
11410 return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)};
11414 Vec128<uint8_t> round_key) {
11415 return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)};
11419 Vec128<uint8_t> round_key) {
11420 return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)};
11423template <uint8_t kRcon>
11425 return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)};
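// Example of applying one middle round of AES encryption to a 16-byte block,
// given an already-expanded 16-byte round key (sketch only; key expansion and
// the differing final round, AESLastRound, are the caller's responsibility):
//   const Full128<uint8_t> d;
//   Vec128<uint8_t> state = LoadU(d, block);        // block: const uint8_t[16]
//   const Vec128<uint8_t> rk = LoadU(d, round_key); // round_key: uint8_t[16]
//   state = AESRound(state, rk);  // ShiftRows, SubBytes, MixColumns, XOR key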
11446#if HWY_TARGET > HWY_AVX3
11449template <class D, HWY_IF_T_SIZE_D(D, 1)>
11450HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
11451 const RebindToUnsigned<decltype(d)> du;
11454 const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
11456#if HWY_TARGET == HWY_SSE2
11458 __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw);
11460 unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits);
11463 const VFromD<decltype(du)> rep8{
11464 _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)};
11467 alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
11468 1, 1, 1, 1, 1, 1, 1, 1};
11472 du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
11476template <class D, HWY_IF_T_SIZE_D(D, 2)>
11479 alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
11480 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
11484template <class D, HWY_IF_T_SIZE_D(D, 4)>
11487 alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
11488 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
11492template <class D, HWY_IF_T_SIZE_D(D, 8)>
11495 alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
11503template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11505 constexpr size_t kN = MaxLanes(d);
11506#if HWY_TARGET <= HWY_AVX3
11508 uint64_t mask_bits = 0;
11509 constexpr size_t kNumBytes = (kN + 7) / 8;
11510 CopyBytes<kNumBytes>(bits, &mask_bits);
11512 mask_bits &= (1ull << kN) - 1;
11515 return MFromD<D>::FromBits(mask_bits);
11517 uint64_t mask_bits = 0;
11518 constexpr size_t kNumBytes = (kN + 7) / 8;
11519 CopyBytes<kNumBytes>(bits, &mask_bits);
11521 mask_bits &= (1ull << kN) - 1;
11524 return detail::LoadMaskBits128(d, mask_bits);
11530template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11532 constexpr size_t kN = MaxLanes(d);
11533 if (kN < 8) mask_bits &= (1u << kN) - 1;
11535#if HWY_TARGET <= HWY_AVX3
11536 return MFromD<D>::FromBits(mask_bits);
11538 return detail::LoadMaskBits128(d, mask_bits);
11542template <typename T>
11543struct CompressIsPartition {
11544#if HWY_TARGET <= HWY_AVX3
11550 enum { value = (sizeof(T) == 8) };
11553 enum { value = (sizeof(T) != 1) };
11557#if HWY_TARGET <= HWY_AVX3
11562template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11564 constexpr size_t kN = MaxLanes(d);
11565 constexpr size_t kNumBytes = (kN + 7) / 8;
11566 CopyBytes<kNumBytes>(&mask.raw, bits);
11570 const int mask_bits = (1 << kN) - 1;
11571 bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
11581template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11583 constexpr size_t kN = MaxLanes(d);
11584 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
11588template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11590 constexpr size_t kN = MaxLanes(d);
11591 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11595template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11597 constexpr size_t kN = MaxLanes(d);
11598 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11602template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11604 constexpr size_t kN = MaxLanes(d);
11605 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11609template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11611 constexpr size_t kN = MaxLanes(d);
11612 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11617template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11619 constexpr size_t kN = MaxLanes(d);
11620 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
11621 return mask_bits == 0;
11624template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11626 constexpr size_t kN = MaxLanes(d);
11627 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
11629 return mask_bits == (1ull << kN) - 1;
11637template <typename T>
11642template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
11647template <typename T, HWY_IF_T_SIZE(T, 8)>
11648HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
11652 alignas(16) static constexpr uint8_t u8_indices[64] = {
11653 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
11654 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
11655 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
11656 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
11658 const DFromV<decltype(v)> d;
11659 const Repartition<uint8_t, decltype(d)> d8;
11660 const auto index = Load(d8, u8_indices + 16 * mask.raw);
11661 return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
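// mask.raw (two bits for the two 64-bit lanes) selects one of the four
// byte-shuffle rows above: only raw == 2 (just the upper lane selected) needs
// a swap so that lane 1 moves to the front; the other rows are the identity.
// Example: v = {a, b} with mask = {false, true} compresses to {b, a}.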
11667template <typename T>
11672template <typename T, HWY_IF_T_SIZE(T, 8)>
11675 alignas(16) static constexpr uint64_t packed_array[16] = {
11676 0x00000010, 0x00000001, 0x00000010, 0x00000010};
11680 const DFromV<decltype(v)> d;
11682 const auto packed = Set(du64, packed_array[mask.raw]);
11683 alignas(16) static constexpr uint64_t shifts[2] = {0, 4};
11684 const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw};
11690 Mask128<uint64_t> ) {
11697template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
11705 constexpr size_t kN = MaxLanes(d);
11706 if (kN != 16 / sizeof(TFromD<D>)) {
11713#if HWY_MEM_OPS_MIGHT_FAULT
11716 alignas(16) TFromD<D> buf[MaxLanes(d)];
11717 Store(compressed, d, buf);
11718 CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
11722 detail::MaybeUnpoison(unaligned, count);
11735constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
11736 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
11739template <typename T, size_t N>
11741 const Mask128<T, N> mask) {
11742 const Simd<T, N, 0> d;
11744 return U64FromInt(_mm_movemask_epi8(sign_bits));
11747template <typename T, size_t N>
11749 const Mask128<T, N> mask) {
11751 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
11752 return U64FromInt(_mm_movemask_epi8(sign_bits));
11755template <typename T, size_t N>
11757 const Simd<T, N, 0> d;
11758 const Simd<float, N, 0> df;
11760 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
11763template <typename T, size_t N>
11765 const Simd<T, N, 0> d;
11766 const Simd<double, N, 0> df;
11768 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
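// Each BitsFromMask overload funnels the per-lane sign bits into an integer
// via a movemask instruction; bit i of the result is the MSB of lane i.
// Example: a 4 x u32 mask of {true, false, true, false} has sign bits
// 1, 0, 1, 0 and _mm_movemask_ps returns 0b0101 == 5.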
11771template <typename T, size_t N>
11779template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11781 constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
11782 const uint64_t mask_bits = detail::BitsFromMask(mask);
11783 CopyBytes<kNumBytes>(&mask_bits, bits);
11789template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11792 return detail::BitsFromMask(mask) == 0;
11795template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11797 constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
11798 return detail::BitsFromMask(mask) == kAllBits;
11801template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11803 return PopCount(detail::BitsFromMask(mask));
11806template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11809 static_cast<uint32_t>(detail::BitsFromMask(mask)));
11812template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11814 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
11818template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11821 static_cast<uint32_t>(detail::BitsFromMask(mask)));
11824template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11826 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
11836template <class D, HWY_IF_T_SIZE_D(D, 2)>
11839 const Rebind<uint8_t, decltype(d)> d8;
11840 const Twice<decltype(d8)> d8t;
11851 alignas(16) static constexpr uint8_t table[2048] = {
11853 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11854 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11855 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
11856 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11857 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
11858 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
11859 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
11860 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11861 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
11862 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
11863 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
11864 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
11865 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
11866 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
11867 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
11868 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11869 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
11870 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
11871 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
11872 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
11873 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
11874 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
11875 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
11876 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
11877 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
11878 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
11879 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
11880 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
11881 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
11882 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
11883 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
11884 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11885 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
11886 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
11887 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
11888 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
11889 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
11890 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
11891 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
11892 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
11893 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
11894 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
11895 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
11896 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
11897 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
11898 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
11899 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
11900 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
11901 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
11902 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
11903 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
11904 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
11905 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
11906 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
11907 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
11908 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
11909 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
11910 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
11911 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
11912 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
11913 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
11914 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
11915 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
11916 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
11917 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
11918 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
11919 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
11920 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
11921 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
11922 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
11923 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
11924 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
11925 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
11926 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
11927 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
11928 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
11929 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
11930 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
11931 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
11932 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
11933 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
11934 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
11935 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
11936 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
11937 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
11938 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
11939 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
11940 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
11941 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
11942 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
11943 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
11944 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
11945 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
11946 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
11947 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
11948 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
11949 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
11950 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
11951 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
11952 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
11953 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
11954 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
11955 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
11956 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
11957 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
11958 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
11959 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
11960 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
11961 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
11962 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
11963 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
11964 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
11965 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
11966 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
11967 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
11968 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
11969 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
11970 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
11971 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
11972 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
11973 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
11974 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
11975 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
11976 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
11977 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
11978 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
11979 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
11980 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
11982 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
11983 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
11987template <class D, HWY_IF_T_SIZE_D(D, 2)>
11990 const Rebind<uint8_t, decltype(d)> d8;
11991 const Twice<decltype(d8)> d8t;
12002 alignas(16) static constexpr uint8_t table[2048] = {
12004 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
12005 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
12006 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
12007 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
12008 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
12009 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
12010 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
12011 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
12012 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
12013 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
12014 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
12015 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
12016 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
12017 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
12018 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
12019 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
12020 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
12021 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
12022 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
12023 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
12024 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
12025 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
12026 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
12027 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
12028 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
12029 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
12030 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
12031 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
12032 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
12033 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
12034 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
12035 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
12036 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
12037 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
12038 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
12039 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
12040 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
12041 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
12042 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
12043 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
12044 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
12045 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
12046 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
12047 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
12048 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
12049 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
12050 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
12051 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
12052 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
12053 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
12054 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
12055 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
12056 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
12057 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
12058 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
12059 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
12060 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
12061 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
12062 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
12063 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
12064 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
12065 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
12066 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
12067 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
12068 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
12069 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
12070 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
12071 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
12072 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
12073 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
12074 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
12075 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
12076 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
12077 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
12078 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
12079 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
12080 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
12081 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
12082 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
12083 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
12084 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
12085 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
12086 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
12087 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
12088 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
12089 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
12090 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
12091 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
12092 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
12093 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
12094 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
12095 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
12096 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
12097 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
12098 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
12099 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
12100 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
12101 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
12102 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
12103 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
12104 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
12105 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
12106 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
12107 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
12108 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
12109 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
12110 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
12111 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
12112 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
12113 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
12114 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
12115 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
12116 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
12117 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
12118 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
12119 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
12120 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
12121 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
12122 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
12123 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
12124 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
12125 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
12126 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
12127 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
12128 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
12129 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
12130 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
12131 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
12133 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
12134 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
12138template <class D, HWY_IF_T_SIZE_D(D, 4)>
12143 alignas(16) static constexpr uint8_t u8_indices[256] = {
12145 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12146 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12147 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
12148 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12149 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
12150 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
12151 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
12152 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12153 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12154 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
12155 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
12156 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
12157 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
12158 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
12159 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
12160 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
12163 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12166template <class D, HWY_IF_T_SIZE_D(D, 4)>
12171 alignas(16) static constexpr uint8_t u8_indices[256] = {
12173 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
12174 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
12175 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
12176 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
12177 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
12178 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
12179 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
12180 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12181 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
12182 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
12183 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
12184 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
12185 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
12186 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12190 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12193template <class D, HWY_IF_T_SIZE_D(D, 8)>
12198 alignas(16) static constexpr uint8_t u8_indices[64] = {
12200 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12201 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12202 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
12203 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
12206 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12209template <class D, HWY_IF_T_SIZE_D(D, 8)>
12214 alignas(16) static constexpr uint8_t u8_indices[64] = {
12216 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12217 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
12218 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12219 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
12222 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12225template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
12227 const DFromV<decltype(v)> d;
12231 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12235template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
12237 const DFromV<decltype(v)> d;
12241 const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
12248template <typename T>
12254template <typename T, HWY_IF_T_SIZE(T, 8)>
12257 const DFromV<decltype(v)> d;
12259 const Vec128<T> maskL = DupEven(m);
12260 const Vec128<T> maskH = DupOdd(m);
12261 const Vec128<T> swap = AndNot(maskL, maskH);
12266template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
12267HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
12268 return detail::CompressBits(v, detail::BitsFromMask(mask));
12274template <typename T>
12280template <typename T, HWY_IF_T_SIZE(T, 8)>
12283 const DFromV<decltype(v)> d;
12285 const Vec128<T> maskL = DupEven(m);
12286 const Vec128<T> maskH = DupOdd(m);
12287 const Vec128<T> swap = AndNot(maskH, maskL);
12291template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
12292HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
12295 if (N < 16 / sizeof(T)) {
12296 return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
12298 return detail::CompressNotBits(v, detail::BitsFromMask(mask));
12302HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
12303 Mask128<uint64_t> ) {
12307template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
12310 uint64_t mask_bits = 0;
12311 constexpr size_t kNumBytes = (N + 7) / 8;
12312 CopyBytes<kNumBytes>(bits, &mask_bits);
12314 mask_bits &= (1ull << N) - 1;
12317 return detail::CompressBits(v, mask_bits);
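// Typical use of Compress: pack the lanes selected by a mask to the front.
// Sketch (illustration only; values chosen arbitrarily):
//   const Full128<int32_t> d;
//   alignas(16) const int32_t in[4] = {1, -2, 3, -4};
//   const auto v = Load(d, in);
//   const auto m = Lt(v, Zero(d));       // select the negative lanes
//   const auto packed = Compress(v, m);  // first CountTrue(d, m) lanes are
//                                        // {-2, -4}; the rest follow per
//                                        // CompressIsPartition above.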
12322template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
12327 const uint64_t mask_bits = detail::BitsFromMask(m);
12329 const size_t count = PopCount(mask_bits);
12332 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12334 StoreU(compressed, d, unaligned);
12335 detail::MaybeUnpoison(unaligned, count);
12339template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
12344 const uint64_t mask_bits = detail::BitsFromMask(m);
12346 const size_t count = PopCount(mask_bits);
12349 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12352 detail::MaybeUnpoison(unaligned, count);
12356template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
12361 uint64_t mask_bits = 0;
12362 constexpr size_t kN = MaxLanes(d);
12363 constexpr size_t kNumBytes = (kN + 7) / 8;
12364 CopyBytes<kNumBytes>(bits, &mask_bits);
12366 mask_bits &= (1ull << kN) - 1;
12368 const size_t count = PopCount(mask_bits);
12371 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12373 StoreU(compressed, d, unaligned);
12375 detail::MaybeUnpoison(unaligned, count);
12384#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
12389#ifdef HWY_NATIVE_EXPAND
12390#undef HWY_NATIVE_EXPAND
12392#define HWY_NATIVE_EXPAND
12397#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE
12411template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
12414 return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)};
12417template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
12420 return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)};
12437template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
12440 return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)};
12443template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
12446 return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)};
12452#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE
12454template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
12455HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
12456 const DFromV<decltype(v)> d;
12457 const RebindToUnsigned<decltype(d)> du;
12458 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12459 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
12464template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
12465HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
12466 const DFromV<decltype(v)> d;
12467 const RebindToUnsigned<decltype(d)> du;
12468 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12469 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
12474template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
12475 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
12476HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
12477 const TFromD<D>* HWY_RESTRICT unaligned) {
12478#if HWY_TARGET <= HWY_AVX3_DL
12479 const RebindToUnsigned<decltype(d)> du;
12480 using TU = TFromD<decltype(du)>;
12481 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
12482 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12483 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
12485 return Expand(LoadU(d, unaligned), mask);
12489template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
12490 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
12491HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
12492 const TFromD<D>* HWY_RESTRICT unaligned) {
12493#if HWY_TARGET <= HWY_AVX3
12494 const RebindToUnsigned<decltype(d)> du;
12495 using TU = TFromD<decltype(du)>;
12496 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
12497 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12498 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
12500 return Expand(LoadU(d, unaligned), mask);
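// Expand is the inverse of Compress: result lane i receives the next unread
// input lane if mask lane i is true, and zero otherwise. Example with four
// u32 lanes: v = {a, b, c, d} and mask = {1, 0, 1, 0} give {a, 0, b, 0};
// LoadExpand fuses the LoadU of v with this expansion where supported.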
12513#if HWY_TARGET <= HWY_AVX3
12516template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
12519 const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
12520#if HWY_COMPILER_CLANGCL
12521 return static_cast<uint32_t>(u32_val & (0u - u32_val));
12523 return static_cast<uint32_t>(_blsi_u32(u32_val));
12526template <class T, HWY_IF_T_SIZE(T, 8)>
12528 const auto u64_val = static_cast<uint64_t>(x);
12529#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
12530 return static_cast<uint64_t>(u64_val & (0ULL - u64_val));
12532 return static_cast<uint64_t>(_blsi_u64(u64_val));
12536template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
12539 const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
12540#if HWY_COMPILER_CLANGCL
12541 return static_cast<uint32_t>(u32_val ^ (u32_val - 1u));
12543 return static_cast<uint32_t>(_blsmsk_u32(u32_val));
12546template <class T, HWY_IF_T_SIZE(T, 8)>
12548 const auto u64_val = static_cast<uint64_t>(x);
12549#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
12550 return static_cast<uint64_t>(u64_val ^ (u64_val - 1ULL));
12552 return static_cast<uint64_t>(_blsmsk_u64(u64_val));
12558template <class T, size_t N>
12560 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
12562 (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)};
12564template <class T, size_t N>
12565HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
12566 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
12567 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
12568 (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)};
12570template <class T, size_t N>
12572 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
12573 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
12574 detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)};
12576template <class T, size_t N>
12578 return Mask128<T, N>{
12579 static_cast<typename Mask128<T, N>::Raw>(detail::AVX3Blsi(mask.raw))};
12588 const FixedTag<T, 2> d;
12592template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
12594 const Simd<T, N, 0> d;
12596 const auto neg_vmask =
12600template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
12602 const Full128<T> d;
12606 using VF = VFromD<decltype(df32)>;
12609 vmask = Or(vmask, Neg(vmask));
12614 _MM_SHUFFLE(1, 1, 0, 0))}));
12618template <class T, size_t N>
12629 const FixedTag<T, 2> d;
12633 const auto zero = Zero(di);
12637template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
12639 const Simd<T, N, 0> d;
12643 const auto only_first_vmask =
12647template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
12649 const Full128<T> d;
12653 const auto zero = Zero(di64);
12662 const FixedTag<T, 1> d;
12664 using TI = MakeSigned<T>;
12668template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
12670 const Simd<T, N, 0> d;
12680#undef HWY_IF_SUM_OF_LANES_D
12681#define HWY_IF_SUM_OF_LANES_D(D) \
12682 HWY_IF_LANES_GT_D(D, 1), \
12683 hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() || \
12684 (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
12687template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
12689 return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
12691template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
12696 return Broadcast<0>(BitCast(d, sums));
12699#if HWY_TARGET <= HWY_SSE4
12701#undef HWY_IF_MINMAX_OF_LANES_D
12702#define HWY_IF_MINMAX_OF_LANES_D(D) \
12703 HWY_IF_LANES_GT_D(D, 1), \
12704 hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() || \
12705 ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \
12706 (!hwy::IsSame<TFromD<D>, uint16_t>() || \
12707 (HWY_V_SIZE_D(D) != 16))>* = nullptr
12709template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
12714template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
12720template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
12722 const Rebind<uint16_t, decltype(d)> d16;
12725template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
12727 const Half<decltype(d)> dh;
12730 return Combine(d, result, result);
12733template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
12738template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
12751template <class D, HWY_IF_U64_D(D)>
12766 const auto eqHL = Eq(a, b);
12768 const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL);
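// 128-bit compare from 64-bit lanes: a < b iff aH < bH, or aH == bH and
// aL < bL. ltHL holds the per-lane Lt results and eqHL the equalities;
// ShiftLeftLanes<1> moves the low-lane result up so it can be substituted
// into the high lane wherever the high halves are equal.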
12774template <class D, HWY_IF_U64_D(D)>
12778 return And(eqHL, eqLH);
12781template <class D, HWY_IF_U64_D(D)>
12785 return Or(neHL, neLH);
12788template <class D, HWY_IF_U64_D(D)>
12796template <class D, HWY_IF_U64_D(D)>
12804template <class D, HWY_IF_U64_D(D)>
12814template <class D, HWY_IF_U64_D(D)>
12819template <class D, HWY_IF_U64_D(D)>
12824template <class D, HWY_IF_U64_D(D)>
12829template <class D, HWY_IF_U64_D(D)>
12834template <class D, HWY_IF_U64_D(D)>
12839template <class D, HWY_IF_U64_D(D)>
12847template <class D, HWY_IF_U64_D(D)>
12852template <class D, HWY_IF_U64_D(D)>
12857template <class D, HWY_IF_U64_D(D)>
12862template <class D, HWY_IF_U64_D(D)>
12869#if HWY_TARGET <= HWY_AVX3
12871#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
12872#undef HWY_NATIVE_LEADING_ZERO_COUNT
12874#define HWY_NATIVE_LEADING_ZERO_COUNT
12877template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
12879 return V{_mm_lzcnt_epi32(v.raw)};
12882template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
12883HWY_API V LeadingZeroCount(V v) {
12884 return V{_mm_lzcnt_epi64(v.raw)};
12897#undef HWY_X86_IF_EMULATED_D
Definition x86_128-inl.h:1269
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE VFromD< D > NativeLoadExpand(MFromD< D > mask, D, const uint8_t *HWY_RESTRICT unaligned)
Definition x86_128-inl.h:12412
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition x86_128-inl.h:1383
HWY_INLINE VFromD< D > PromoteOddTo(hwy::FloatTag to_type_tag, hwy::SizeTag< 4 > to_lane_size_tag, hwy::FloatTag from_type_tag, D d_to, svfloat16_t v)
Definition arm_sve-inl.h:4419
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MaxU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition x86_128-inl.h:5621
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, Mask128< T > mask)
Definition arm_neon-inl.h:8296
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6031
HWY_INLINE V Ne128UpperVec(D d, V a, V b)
Definition ppc_vsx-inl.h:7093
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, uint64_t mask_bits)
Definition arm_neon-inl.h:8851
HWY_INLINE Vec128< T, N > Mul(hwy::FloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:774
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag, hwy::SizeTag< 1 >, svint8_t v)
Definition arm_sve-inl.h:982
static HWY_INLINE V SSE2Mul128(V a, V b, V &mulH)
Definition x86_128-inl.h:9056
HWY_INLINE VFromD< D > PromoteEvenTo(hwy::SignedTag, hwy::SizeTag< 2 >, hwy::SignedTag, D d_to, svint8_t v)
Definition arm_sve-inl.h:4334
HWY_API Vec128< T > InterleaveUpper(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6086
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, uint64_t mask_bits)
Definition arm_neon-inl.h:8860
HWY_INLINE VFromD< Rebind< uint64_t, D > > DemoteFromU64MaskOutResult(D, VFromD< Rebind< uint64_t, D > > v)
Definition wasm_128-inl.h:4487
HWY_INLINE Vec128< T, N > NativeMaskedGatherOr128(Vec128< T, N > no, Mask128< T, N > m, const T *HWY_RESTRICT base, Vec128< int32_t, N > indices)
Definition x86_128-inl.h:5887
HWY_INLINE VFromD< D > BitCastFromByte(D, VFromD< D > v)
Definition arm_neon-inl.h:1441
HWY_API Vec128< T, N > CompressNotBits(Vec128< T, N > v, uint64_t mask_bits)
Definition ppc_vsx-inl.h:6017
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6076
HWY_INLINE VFromD< Rebind< uint64_t, D > > DemoteFromU64Saturate(D dn, VFromD< Rebind< uint64_t, D > > v)
Definition wasm_128-inl.h:4501
HWY_API void ScalarMaskedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition x86_128-inl.h:3547
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE V SlideDownLanes(V v, size_t amt)
Definition arm_neon-inl.h:6346
HWY_INLINE Vec128< uint8_t, N > NativeExpand(Vec128< uint8_t, N > v, Mask128< uint8_t, N > mask)
Definition x86_128-inl.h:12400
HWY_INLINE V AVX2ShrU8Vec128(V v, V bits)
Definition x86_128-inl.h:8750
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1556
HWY_INLINE VFromD< D > ClampF64ToI32Max(D d, VFromD< D > v)
Definition x86_128-inl.h:10050
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:2478
HWY_INLINE VFromD< D > IndicesFromNotBits128(D d, uint64_t mask_bits)
Definition ppc_vsx-inl.h:5771
HWY_INLINE V Lt128UpperVec(D d, V a, V b)
Definition ppc_vsx-inl.h:7081
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_INLINE VFromD< D > Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0)
Definition ppc_vsx-inl.h:2712
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MinU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition x86_128-inl.h:5528
HWY_API void ScatterOffset(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2624
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
Simd< T, 16/sizeof(T), 0 > Full128
Definition emu128-inl.h:31
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_API V MaskedMaxOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API V MaskedDivOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1512
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API VFromD< DI32 > SatWidenMulPairwiseAccumulate(DI32 di32, VFromD< Repartition< int16_t, DI32 > > a, VFromD< Repartition< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition generic_ops-inl.h:5179
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API VFromD< D > LoadNOr(VFromD< D > no, D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1362
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API void StoreN(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_store)
Definition emu128-inl.h:1398
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API svbool_t MaskFalse(const D)
Definition arm_sve-inl.h:372
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
RepartitionToWide< RepartitionToWideX2< D > > RepartitionToWideX3
Definition ops/shared-inl.h:483
HWY_API Mask< D > SlideMask1Up(D d, Mask< D > m)
Definition generic_ops-inl.h:7071
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
V Shl(V a, V b)
Definition generic_ops-inl.h:7322
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API Vec128< int64_t > SaturatedAbs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3288
HWY_API Vec128< uint8_t > AESLastRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7447
HWY_API V MaskedModOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:4666
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2902
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API V AddSub(V a, V b)
Definition generic_ops-inl.h:775
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API V Rol(V a, V b)
Definition generic_ops-inl.h:445
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API void ScatterIndex(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2643
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API svbool_t DemoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1420
HWY_API V ZeroIfNegative(V v)
Definition generic_ops-inl.h:266
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:858
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API MFromD< DTo > OrderedDemote2MasksTo(DTo d_to, DFrom, MFromD< DFrom > a, MFromD< DFrom > b)
Definition x86_128-inl.h:1107
HWY_API Vec128< uint16_t,(N+1)/2 > SumsOfAdjQuadAbsDiff(Vec128< uint8_t, N > a, Vec128< uint8_t, N > b)
Definition x86_128-inl.h:3901
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< DI > ConvertInRangeTo(DI di, VFromD< RebindToFloat< DI > > v)
Definition emu128-inl.h:1900
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< uint16_t,(N+1)/2 > SumsOfShuffledQuadAbsDiff(Vec128< uint8_t, N > a, Vec128< uint8_t, N > b)
Definition x86_128-inl.h:3943
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
Vec128< T, 2/sizeof(T)> Vec16
Definition arm_neon-inl.h:861
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v)
Definition generic_ops-inl.h:869
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API VFromD< D > PromoteInRangeTo(D d64, VFromD< Rebind< float, D > > v)
Definition arm_neon-inl.h:4497
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec< DI16 > SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b)
Definition generic_ops-inl.h:5153
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Mask< D > SlideMask1Down(D d, Mask< D > m)
Definition generic_ops-inl.h:7076
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API Vec< RepartitionToWideX3< DFromV< V > > > SumsOf8AbsDiff(V a, V b)
Definition generic_ops-inl.h:2820
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API VFromD< D > PromoteLowerTo(D d, V v)
Definition generic_ops-inl.h:2984
HWY_API V MulAddSub(V mul, V x, V sub_or_add)
Definition arm_sve-inl.h:4285
HWY_API V RotateRightSame(V v, int bits)
Definition generic_ops-inl.h:601
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:855
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API V Ror(V a, V b)
Definition generic_ops-inl.h:459
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API Mask< D > SlideMaskDownLanes(D d, Mask< D > m, size_t amt)
Definition generic_ops-inl.h:7086
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
unsigned int Shift64Count
Definition x86_128-inl.h:4535
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API VFromD< D32 > DemoteInRangeTo(D32 d32, VFromD< Rebind< double, D32 > > v)
Definition emu128-inl.h:1845
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_API MFromD< DFromV< V > > IsEitherNaN(V a, V b)
Definition generic_ops-inl.h:1177
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition ops/shared-inl.h:407
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
long long int GatherIndex64
Definition x86_128-inl.h:5737
HWY_API Mask< D > SlideMaskUpLanes(D d, Mask< D > m, size_t amt)
Definition generic_ops-inl.h:7081
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API MFromD< D > UpperHalfOfMask(D, MFromD< Twice< D > > m)
Definition x86_128-inl.h:1051
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API MFromD< D > CombineMasks(D, MFromD< Half< D > > hi, MFromD< Half< D > > lo)
Definition x86_128-inl.h:959
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V RotateLeftSame(V v, int bits)
Definition generic_ops-inl.h:588
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API V IfNegativeThenZeroElse(V v, V no)
Definition generic_ops-inl.h:256
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSigned()
Definition base.h:2134
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:346
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2577
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2540
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
HWY_API size_t PopCount(T x)
Definition base.h:2615
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue()
Definition base.h:2212
#define HWY_IF_U32_D(D)
Definition ops/shared-inl.h:579
#define HWY_IF_F16_D(D)
Definition ops/shared-inl.h:597
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_I64_D(D)
Definition ops/shared-inl.h:585
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_LANES_GT_D(D, lanes)
Definition ops/shared-inl.h:562
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_I32_D(D)
Definition ops/shared-inl.h:584
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_V_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:609
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_T_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:557
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_I8_D(D)
Definition ops/shared-inl.h:582
#define HWY_IF_NOT_UNSIGNED_V(V)
Definition ops/shared-inl.h:614
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:555
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_MAX_BYTES
Definition set_macros-inl.h:168
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition arm_neon-inl.h:5654
__m128i raw
Definition x86_128-inl.h:6585
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
Definition ops/shared-inl.h:198
HWY_INLINE __m128d operator()(__m128i v)
Definition x86_128-inl.h:295
HWY_INLINE __m128 operator()(__m128i v)
Definition x86_128-inl.h:291
Definition wasm_128-inl.h:179
HWY_INLINE __m128i operator()(__m128i v)
Definition x86_128-inl.h:281
HWY_INLINE __v128_u operator()(__v128_u v)
Definition wasm_128-inl.h:180
__f64x2 type
Definition wasm_128-inl.h:68
__f32x4 type
Definition wasm_128-inl.h:64
Definition x86_128-inl.h:67
__v128_u type
Definition wasm_128-inl.h:60
__mmask16 type
Definition x86_128-inl.h:143
__mmask8 type
Definition x86_128-inl.h:147
__mmask8 type
Definition x86_128-inl.h:151
__mmask8 type
Definition x86_128-inl.h:155
Definition x86_128-inl.h:140
int VFromD
Definition tuple-inl.h:25
#define HWY_X86_FPCLASS_NEG_INF
Definition x86_128-inl.h:11266
#define HWY_X86_FPCLASS_SNAN
Definition x86_128-inl.h:11269
#define HWY_X86_FPCLASS_POS_INF
Definition x86_128-inl.h:11265
#define HWY_X86_FPCLASS_QNAN
Definition x86_128-inl.h:11262