27#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
57 return Min(
Max(lo, v), hi);
62#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
64template <
size_t kLanes,
class D>
66 constexpr size_t kBytes = kLanes *
sizeof(TFromD<D>);
67 static_assert(kBytes < 16,
"Shift count is per-block");
68 return CombineShiftRightBytes<kBytes>(
d, hi, lo);
94 using TU =
TFromD<
decltype(du)>;
95 const TU max_x2 =
static_cast<TU
>(MaxExponentTimes2<T>());
104#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
108template <
size_t kFromVectSize,
size_t kToVectSize,
class DTo,
class DFrom>
113 const Repartition<uint8_t, DTo> d_to_u8;
121template <
size_t kFromVectSize,
size_t kToVectSize,
class DTo,
class DFrom,
132template <
size_t kFromVectSize,
size_t kToVectSize,
class DTo,
class DFrom,
138 const Twice<
decltype(d_from)> dt_from;
144template <
size_t kFromVectSize,
size_t kToVectSize,
class DTo,
class DFrom,
150 using TFrom = TFromD<DFrom>;
151 constexpr size_t kNumOfFromLanes = kFromVectSize /
sizeof(TFrom);
152 const Repartition<TFrom,
decltype(d_to)> d_resize_to;
161template <
class DTo,
class DFrom>
171template <
class D,
typename T = TFromD<D>>
174#if HWY_MEM_OPS_MIGHT_FAULT
176 for (
size_t i = 0; i < num; ++i) {
186template <
class D,
typename T = TFromD<D>>
189#if HWY_MEM_OPS_MIGHT_FAULT
191 for (
size_t i = 0; i < num; ++i) {
201#if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
202#ifdef HWY_NATIVE_IS_NEGATIVE
203#undef HWY_NATIVE_IS_NEGATIVE
205#define HWY_NATIVE_IS_NEGATIVE
208template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
218#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
219#ifdef HWY_NATIVE_MASK_FALSE
220#undef HWY_NATIVE_MASK_FALSE
222#define HWY_NATIVE_MASK_FALSE
233#if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
234#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
235#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
237#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
240template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
248#if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
249#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
250#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
252#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
255template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
265template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
271#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
272#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
273#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
275#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
287#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
288#ifdef HWY_NATIVE_PROMOTE_MASK_TO
289#undef HWY_NATIVE_PROMOTE_MASK_TO
291#define HWY_NATIVE_PROMOTE_MASK_TO
294template <
class DTo,
class DFrom>
298 "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
301 "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
314#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
315#ifdef HWY_NATIVE_DEMOTE_MASK_TO
316#undef HWY_NATIVE_DEMOTE_MASK_TO
318#define HWY_NATIVE_DEMOTE_MASK_TO
321template <
class DTo,
class DFrom>
324 "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
327 "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
340#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
341#ifdef HWY_NATIVE_COMBINE_MASKS
342#undef HWY_NATIVE_COMBINE_MASKS
344#define HWY_NATIVE_COMBINE_MASKS
347#if HWY_TARGET != HWY_SCALAR
350 const Half<
decltype(
d)> dh;
359#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
360#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
361#undef HWY_NATIVE_LOWER_HALF_OF_MASK
363#define HWY_NATIVE_LOWER_HALF_OF_MASK
368 const Twice<
decltype(
d)> dt;
376#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
377#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
378#undef HWY_NATIVE_UPPER_HALF_OF_MASK
380#define HWY_NATIVE_UPPER_HALF_OF_MASK
383#if HWY_TARGET != HWY_SCALAR
386 const Twice<
decltype(
d)> dt;
395#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
396 defined(HWY_TARGET_TOGGLE))
397#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
398#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
400#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
403#if HWY_TARGET != HWY_SCALAR
404template <
class DTo,
class DFrom>
408 sizeof(TFromD<DTo>) ==
sizeof(TFromD<DFrom>) / 2,
409 "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
410 static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
411 "Mask<DTo> must be the same type as "
412 "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
426template <
int kBits,
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
428 constexpr size_t kSizeInBits =
sizeof(
TFromV<V>) * 8;
429 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
431 constexpr int kRotateRightAmt =
432 (kBits == 0) ? 0 :
static_cast<int>(kSizeInBits) - kBits;
433 return RotateRight<kRotateRightAmt>(v);
437#if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
438#ifdef HWY_NATIVE_ROL_ROR_8
439#undef HWY_NATIVE_ROL_ROR_8
441#define HWY_NATIVE_ROL_ROR_8
444template <
class V, HWY_IF_UI8(TFromV<V>)>
450 const auto shift_amt_mask =
Set(du, uint8_t{7});
451 const auto shl_amt =
And(
BitCast(du, b), shift_amt_mask);
454 const auto vu =
BitCast(du, a);
458template <
class V, HWY_IF_UI8(TFromV<V>)>
464 const auto shift_amt_mask =
Set(du, uint8_t{7});
465 const auto shr_amt =
And(
BitCast(du, b), shift_amt_mask);
468 const auto vu =
BitCast(du, a);
474#if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
475#ifdef HWY_NATIVE_ROL_ROR_16
476#undef HWY_NATIVE_ROL_ROR_16
478#define HWY_NATIVE_ROL_ROR_16
481template <
class V, HWY_IF_UI16(TFromV<V>)>
487 const auto shift_amt_mask =
Set(du, uint16_t{15});
488 const auto shl_amt =
And(
BitCast(du, b), shift_amt_mask);
491 const auto vu =
BitCast(du, a);
495template <
class V, HWY_IF_UI16(TFromV<V>)>
501 const auto shift_amt_mask =
Set(du, uint16_t{15});
502 const auto shr_amt =
And(
BitCast(du, b), shift_amt_mask);
505 const auto vu =
BitCast(du, a);
511#if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
512#ifdef HWY_NATIVE_ROL_ROR_32_64
513#undef HWY_NATIVE_ROL_ROR_32_64
515#define HWY_NATIVE_ROL_ROR_32_64
518template <
class V, HWY_IF_UI32(TFromV<V>)>
524 const auto shift_amt_mask =
Set(du, uint32_t{31});
525 const auto shl_amt =
And(
BitCast(du, b), shift_amt_mask);
528 const auto vu =
BitCast(du, a);
532template <
class V, HWY_IF_UI32(TFromV<V>)>
538 const auto shift_amt_mask =
Set(du, uint32_t{31});
539 const auto shr_amt =
And(
BitCast(du, b), shift_amt_mask);
542 const auto vu =
BitCast(du, a);
546#if HWY_HAVE_INTEGER64
547template <
class V, HWY_IF_UI64(TFromV<V>)>
553 const auto shift_amt_mask =
Set(du, uint64_t{63});
554 const auto shl_amt =
And(
BitCast(du, b), shift_amt_mask);
557 const auto vu =
BitCast(du, a);
561template <
class V, HWY_IF_UI64(TFromV<V>)>
567 const auto shift_amt_mask =
Set(du, uint64_t{63});
568 const auto shr_amt =
And(
BitCast(du, b), shift_amt_mask);
571 const auto vu =
BitCast(du, a);
580#if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
581#ifdef HWY_NATIVE_ROL_ROR_SAME_8
582#undef HWY_NATIVE_ROL_ROR_SAME_8
584#define HWY_NATIVE_ROL_ROR_SAME_8
587template <
class V, HWY_IF_UI8(TFromV<V>)>
592 const int shl_amt = bits & 7;
593 const int shr_amt =
static_cast<int>((0u -
static_cast<unsigned>(bits)) & 7u);
595 const auto vu =
BitCast(du, v);
600template <
class V, HWY_IF_UI8(TFromV<V>)>
605 const int shr_amt = bits & 7;
606 const int shl_amt =
static_cast<int>((0u -
static_cast<unsigned>(bits)) & 7u);
608 const auto vu =
BitCast(du, v);
615#if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
616#ifdef HWY_NATIVE_ROL_ROR_SAME_16
617#undef HWY_NATIVE_ROL_ROR_SAME_16
619#define HWY_NATIVE_ROL_ROR_SAME_16
622template <
class V, HWY_IF_UI16(TFromV<V>)>
627 const int shl_amt = bits & 15;
629 static_cast<int>((0u -
static_cast<unsigned>(bits)) & 15u);
631 const auto vu =
BitCast(du, v);
636template <
class V, HWY_IF_UI16(TFromV<V>)>
641 const int shr_amt = bits & 15;
643 static_cast<int>((0u -
static_cast<unsigned>(bits)) & 15u);
645 const auto vu =
BitCast(du, v);
651#if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
652#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
653#undef HWY_NATIVE_ROL_ROR_SAME_32_64
655#define HWY_NATIVE_ROL_ROR_SAME_32_64
658template <
class V, HWY_IF_UI32(TFromV<V>)>
663 const int shl_amt = bits & 31;
665 static_cast<int>((0u -
static_cast<unsigned>(bits)) & 31u);
667 const auto vu =
BitCast(du, v);
672template <
class V, HWY_IF_UI32(TFromV<V>)>
677 const int shr_amt = bits & 31;
679 static_cast<int>((0u -
static_cast<unsigned>(bits)) & 31u);
681 const auto vu =
BitCast(du, v);
686#if HWY_HAVE_INTEGER64
687template <
class V, HWY_IF_UI64(TFromV<V>)>
692 const int shl_amt = bits & 63;
694 static_cast<int>((0u -
static_cast<unsigned>(bits)) & 63u);
696 const auto vu =
BitCast(du, v);
701template <
class V, HWY_IF_UI64(TFromV<V>)>
706 const int shr_amt = bits & 63;
708 static_cast<int>((0u -
static_cast<unsigned>(bits)) & 63u);
710 const auto vu =
BitCast(du, v);
719#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
720#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
721#undef HWY_NATIVE_INTERLEAVE_WHOLE
723#define HWY_NATIVE_INTERLEAVE_WHOLE
726#if HWY_TARGET != HWY_SCALAR
727template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
733template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
752#if HWY_TARGET != HWY_SCALAR
763#if HWY_TARGET != HWY_SCALAR
774template <
class V, HWY_IF_LANES_D(DFromV<V>, 1)>
789template <
class V, HWY_IF_ADDSUB_V(V)>
791 using D =
DFromV<
decltype(a)>;
796 const Rebind<TNegate, D> d_negate;
801 return Add(a, negated_even_b);
805#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
806#ifdef HWY_NATIVE_MASKED_ARITH
807#undef HWY_NATIVE_MASKED_ARITH
809#define HWY_NATIVE_MASKED_ARITH
812template <
class V,
class M>
817template <
class V,
class M>
822template <
class V,
class M>
827template <
class V,
class M>
832template <
class V,
class M>
837template <
class V,
class M>
842template <
class V,
class M>
847template <
class V,
class M>
852template <
class V,
class M>
860#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
861 defined(HWY_TARGET_TOGGLE))
862#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
863#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
865#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
868template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
870#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
881template <
class V, HWY_IF_FLOAT_V(V)>
888#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
889#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
890#undef HWY_NATIVE_SATURATED_NEG_8_16_32
892#define HWY_NATIVE_SATURATED_NEG_8_16_32
902template <
class V, HWY_IF_I32(TFromV<V>)>
906#if HWY_TARGET == HWY_RVV || \
907 (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
908 (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
921#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
922#ifdef HWY_NATIVE_SATURATED_NEG_64
923#undef HWY_NATIVE_SATURATED_NEG_64
925#define HWY_NATIVE_SATURATED_NEG_64
928template <
class V, HWY_IF_I64(TFromV<V>)>
930#if HWY_TARGET == HWY_RVV || \
931 (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
936 const auto neg_v =
Neg(v);
944#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
945#ifdef HWY_NATIVE_SATURATED_ABS
946#undef HWY_NATIVE_SATURATED_ABS
948#define HWY_NATIVE_SATURATED_ABS
951template <
class V, HWY_IF_SIGNED_V(V)>
965#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
966#ifdef HWY_NATIVE_REDUCE_SCALAR
967#undef HWY_NATIVE_REDUCE_SCALAR
969#define HWY_NATIVE_REDUCE_SCALAR
997template <
class D,
class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
1004template <
class D,
class Func, HWY_IF_V_SIZE_D(D, 32)>
1012template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
1017template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
1020 const VFromD<D> v03_12_12_03 = f(v3210, v0123);
1022 return f(v03_12_12_03, v12_03_03_12);
1025template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
1030 f(v34_25_16_07,
Reverse4(
d, v34_25_16_07));
1031 return f(v0347_1625_1625_0347,
Reverse2(
d, v0347_1625_1625_0347));
1034template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
1037 using VW =
VFromD<
decltype(dw)>;
1040 const VW even =
And(vw,
Set(dw, 0xFF));
1041 const VW odd = ShiftRight<8>(vw);
1043#if HWY_IS_LITTLE_ENDIAN
1050template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
1053 using VW =
VFromD<
decltype(dw)>;
1057 const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
1058 const VW odd = ShiftRight<8>(vw);
1060#if HWY_IS_LITTLE_ENDIAN
1069template <
class D, HWY_IF_SUM_OF_LANES_D(D)>
1071 const detail::AddFunc f;
1075template <
class D, HWY_IF_MINMAX_OF_LANES_D(D)>
1077 const detail::MinFunc f;
1081template <
class D, HWY_IF_MINMAX_OF_LANES_D(D)>
1083 const detail::MaxFunc f;
1088template <
class D, HWY_IF_REDUCE_D(D)>
1092template <
class D, HWY_IF_REDUCE_D(D)>
1096template <
class D, HWY_IF_REDUCE_D(D)>
1105template <
class D, HWY_IF_LANES_D(D, 1)>
1109template <
class D, HWY_IF_LANES_D(D, 1)>
1113template <
class D, HWY_IF_LANES_D(D, 1)>
1118template <
class D, HWY_IF_LANES_D(D, 1)>
1122template <
class D, HWY_IF_LANES_D(D, 1)>
1126template <
class D, HWY_IF_LANES_D(D, 1)>
1135#if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
1136#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
1137#undef HWY_NATIVE_REDUCE_SUM_4_UI8
1139#define HWY_NATIVE_REDUCE_SUM_4_UI8
1141template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
1150#if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
1151#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
1152#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
1154#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
1156template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
1161template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
1169#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
1170#ifdef HWY_NATIVE_IS_EITHER_NAN
1171#undef HWY_NATIVE_IS_EITHER_NAN
1173#define HWY_NATIVE_IS_EITHER_NAN
1176template <
class V, HWY_IF_FLOAT_V(V)>
1186#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
1187#ifdef HWY_NATIVE_ISINF
1188#undef HWY_NATIVE_ISINF
1190#define HWY_NATIVE_ISINF
1193template <
class V,
class D = DFromV<V>>
1207template <
class V,
class D = DFromV<V>>
1216#if HWY_COMPILER_MSVC
1217 const VFromD<
decltype(du)> shl = ShiftLeft<1>(vu);
1219 const VFromD<
decltype(du)> shl =
Add(vu, vu);
1225 const VFromD<
decltype(di)> exp =
1235 (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
1236#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1237#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1239#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1242template <
class D, HWY_IF_LANES_GT_D(D, 1)>
1251template <
class D, HWY_IF_LANES_D(D, 1)>
1254 v0 =
LoadU(
d, unaligned + 0);
1255 v1 =
LoadU(
d, unaligned + 1);
1278template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1284 A =
LoadU(
d, unaligned + 0 * kN);
1285 B =
LoadU(
d, unaligned + 1 * kN);
1286 C =
LoadU(
d, unaligned + 2 * kN);
1291template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
1296 using VU =
VFromD<
decltype(du)>;
1303 constexpr uint8_t Z = 0x80;
1305 Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1307 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
1309 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
1311 Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1313 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
1315 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
1317 Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1319 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
1321 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
1331 v0 =
Xor3(v0L, v0M, v0U);
1332 v1 =
Xor3(v1L, v1M, v1U);
1333 v2 =
Xor3(v2L, v2M, v2U);
1337template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
1342 using VU =
VFromD<
decltype(du)>;
1348 constexpr uint8_t Z = 0x80;
1350 Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1352 Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1354 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
1356 Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1358 Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1360 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
1362 Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1364 Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1366 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
1376 v0 =
Xor3(v0L, v0M, v0U);
1377 v1 =
Xor3(v1L, v1M, v1U);
1378 v2 =
Xor3(v2L, v2M, v2U);
1382template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
1388 using VU8 =
VFromD<
decltype(du8)>;
1395 constexpr uint8_t Z = 0x80;
1397 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1399 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
1400 const VU8 idx_v0C =
Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1401 Z, 0x04, 0x05, 0x0A, 0x0B);
1403 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1405 0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
1406 const VU8 idx_v1C =
Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1407 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
1409 Z, Z, Z, Z, Z, Z, Z, Z, Z);
1411 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
1412 const VU8 idx_v2C =
Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1413 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
1423 v0 =
Xor3(v0L, v0M, v0U);
1424 v1 =
Xor3(v1L, v1M, v1U);
1425 v2 =
Xor3(v2L, v2M, v2U);
1428template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
1437 const V vxx_02_03_xx =
OddEven(C, B);
1443 const V vxx_xx_10_11 =
OddEven(A, B);
1444 const V v12_13_xx_xx =
OddEven(B, C);
1447 const V vxx_20_21_xx =
OddEven(B, A);
1451template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
1463template <
class D,
typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
1466 v0 =
LoadU(
d, unaligned + 0);
1467 v1 =
LoadU(
d, unaligned + 1);
1468 v2 =
LoadU(
d, unaligned + 2);
1476template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1482 vA =
LoadU(
d, unaligned + 0 * kN);
1483 vB =
LoadU(
d, unaligned + 1 * kN);
1484 vC =
LoadU(
d, unaligned + 2 * kN);
1485 vD =
LoadU(
d, unaligned + 3 * kN);
1490template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
1495 using V64 =
VFromD<
decltype(d64)>;
1528template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
1536 using VW =
VFromD<
decltype(dw)>;
1566template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
1587template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
1600template <
class D,
typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
1604 v0 =
LoadU(
d, unaligned + 0);
1605 v1 =
LoadU(
d, unaligned + 1);
1606 v2 =
LoadU(
d, unaligned + 2);
1607 v3 =
LoadU(
d, unaligned + 3);
1615template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1619 StoreU(A,
d, unaligned + 0 * kN);
1620 StoreU(B,
d, unaligned + 1 * kN);
1626template <
class D, HWY_IF_V_SIZE_GT_D(D, 8)>
1635template <
class V,
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
1638 const Twice<
decltype(
d)> d2;
1642 StoreU(v10, d2, unaligned);
1651template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1655 StoreU(A,
d, unaligned + 0 * kN);
1656 StoreU(B,
d, unaligned + 1 * kN);
1657 StoreU(C,
d, unaligned + 2 * kN);
1663template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
1667 using TU =
TFromD<
decltype(du)>;
1668 using VU =
VFromD<
decltype(du)>;
1669 const VU k5 =
Set(du, TU{5});
1670 const VU k6 =
Set(du, TU{6});
1676 const VFromD<
decltype(du)> shuf_A0 =
1677 Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
1678 0x80, 0x80, 4, 0x80, 0x80, 5);
1680 const VFromD<
decltype(du)> shuf_A1 =
1681 Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
1682 3, 0x80, 0x80, 4, 0x80, 0x80);
1686 const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1693 const VU shuf_B0 = shuf_A2 + k6;
1694 const VU shuf_B1 = shuf_A0 + k5;
1695 const VU shuf_B2 = shuf_A1 + k5;
1702 const VU shuf_C0 = shuf_B2 + k6;
1703 const VU shuf_C1 = shuf_B0 + k5;
1704 const VU shuf_C2 = shuf_B1 + k5;
1714template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
1718 using VU8 =
VFromD<
decltype(du8)>;
1719 const VU8 k2 =
Set(du8, uint8_t{2 *
sizeof(TFromD<D>)});
1720 const VU8 k3 =
Set(du8, uint8_t{3 *
sizeof(TFromD<D>)});
1726 const VFromD<
decltype(du8)> shuf_A1 =
1727 Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
1728 0x80, 0x80, 0x80, 0x80, 4, 5);
1729 const VFromD<
decltype(du8)> shuf_A2 =
1730 Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
1731 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
1735 const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1743 const VU8 shuf_B0 = shuf_A1 + k3;
1744 const VU8 shuf_B1 = shuf_A2 + k3;
1745 const VU8 shuf_B2 = shuf_A0 + k2;
1752 const VU8 shuf_C0 = shuf_B1 + k3;
1753 const VU8 shuf_C1 = shuf_B2 + k3;
1754 const VU8 shuf_C2 = shuf_B0 + k2;
1764template <
class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
1775 const VFromD<D> v1_321 = ShiftRightLanes<1>(
d, v1);
1776 const VFromD<D> v0_32 = ShiftRightLanes<2>(
d, v0);
1794template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
1804template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
1809 constexpr size_t kFullN = 16 /
sizeof(TFromD<D>);
1810 const Full128<uint8_t> du;
1811 using VU =
VFromD<
decltype(du)>;
1812 const Full128<TFromD<D>> d_full;
1813 const VU k5 =
Set(du, uint8_t{5});
1814 const VU k6 =
Set(du, uint8_t{6});
1816 const VFromD<
decltype(d_full)> v0{part0.raw};
1817 const VFromD<
decltype(d_full)> v1{part1.raw};
1818 const VFromD<
decltype(d_full)> v2{part2.raw};
1823 alignas(16)
static constexpr uint8_t tbl_v0[16] = {
1824 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
1825 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
1826 alignas(16)
static constexpr uint8_t tbl_v1[16] = {
1827 0x80, 0, 0x80, 0x80, 1, 0x80,
1828 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
1831 const VU shuf_A0 =
Load(du, tbl_v0);
1832 const VU shuf_A1 =
Load(du, tbl_v1);
1833 const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1837 const auto A =
BitCast(d_full, A0 | A1 | A2);
1838 StoreU(A, d_full, unaligned + 0 * kFullN);
1841 const VU shuf_B0 = shuf_A2 + k6;
1842 const VU shuf_B1 = shuf_A0 + k5;
1843 const VU shuf_B2 = shuf_A1 + k5;
1848 StoreU(B,
d, unaligned + 1 * kFullN);
1852template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
1856 const Twice<D> d_full;
1857 const Full128<uint8_t> du8;
1858 using VU8 =
VFromD<
decltype(du8)>;
1859 const VU8 k2 =
Set(du8, uint8_t{2 *
sizeof(TFromD<D>)});
1860 const VU8 k3 =
Set(du8, uint8_t{3 *
sizeof(TFromD<D>)});
1862 const VFromD<
decltype(d_full)> v0{part0.raw};
1863 const VFromD<
decltype(d_full)> v1{part1.raw};
1864 const VFromD<
decltype(d_full)> v2{part2.raw};
1870 alignas(16)
static constexpr uint8_t tbl_v1[16] = {
1871 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
1872 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
1873 alignas(16)
static constexpr uint8_t tbl_v2[16] = {
1874 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
1875 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1879 const VU8 shuf_A1 =
Load(du8, tbl_v1);
1881 const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1882 const VU8 shuf_A2 =
Load(du8, tbl_v2);
1887 const VFromD<
decltype(d_full)> A =
BitCast(d_full, A0 | A1 | A2);
1888 StoreU(A, d_full, unaligned);
1891 const VU8 shuf_B0 = shuf_A1 + k3;
1892 const VU8 shuf_B1 = shuf_A2 + k3;
1893 const VU8 shuf_B2 = shuf_A0 + k2;
1897 const VFromD<
decltype(d_full)> B =
BitCast(d_full, vB0 | vB1 | vB2);
1902template <
class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
1910 StoreU(v10_v00,
d, unaligned + 0 * kN);
1911 StoreU(v01_v20,
d, unaligned + 1 * kN);
1912 StoreU(v21_v11,
d, unaligned + 2 * kN);
1925 using VU =
VFromD<
decltype(du)>;
1928 const VFromD<
decltype(d_full)> v0{part0.raw};
1929 const VFromD<
decltype(d_full)> v1{part1.raw};
1930 const VFromD<
decltype(d_full)> v2{part2.raw};
1935 alignas(16)
static constexpr uint8_t tbl_v0[16] = {
1936 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
1937 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1940 const VU shuf_A0 =
Load(du, tbl_v0);
1941 const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
1942 const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
1946 const VFromD<
decltype(d_full)> A =
BitCast(d_full, A0 | A1 | A2);
1953template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
1958 const Full128<uint8_t> du8;
1959 using VU8 =
VFromD<
decltype(du8)>;
1960 const Full128<TFromD<D>> d_full;
1962 const VFromD<
decltype(d_full)> v0{part0.raw};
1963 const VFromD<
decltype(d_full)> v1{part1.raw};
1964 const VFromD<
decltype(d_full)> v2{part2.raw};
1969 alignas(16)
static constexpr uint8_t tbl_v2[16] = {
1970 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
1971 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1974 const VU8 shuf_A2 =
Load(du8, tbl_v2);
1976 CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
1978 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
1982 const auto A =
BitCast(d_full, A0 | A1 | A2);
1983 alignas(16) TFromD<D> buf[
MaxLanes(d_full)];
1989template <
class D, HWY_IF_LANES_D(D, 1)>
2002template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
2007 StoreU(vA,
d, unaligned + 0 * kN);
2008 StoreU(vB,
d, unaligned + 1 * kN);
2009 StoreU(vC,
d, unaligned + 2 * kN);
2010 StoreU(vD,
d, unaligned + 3 * kN);
2016template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
2021 const auto v10L =
ZipLower(dw, v0, v1);
2022 const auto v32L =
ZipLower(dw, v2, v3);
2023 const auto v10U =
ZipUpper(dw, v0, v1);
2024 const auto v32U =
ZipUpper(dw, v2, v3);
2034template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
2047template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
2052 const Full128<TFromD<D>> d_full;
2054 const VFromD<
decltype(d_full)> v0{part0.raw};
2055 const VFromD<
decltype(d_full)> v1{part1.raw};
2056 const VFromD<
decltype(d_full)> v2{part2.raw};
2057 const VFromD<
decltype(d_full)> v3{part3.raw};
2058 const auto v10 =
ZipLower(dw, v0, v1);
2059 const auto v32 =
ZipLower(dw, v2, v3);
2062 StoreU(A, d_full, unaligned);
2067template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
2072 const Full128<TFromD<D>> d_full;
2073 const VFromD<
decltype(d_full)> v0{part0.raw};
2074 const VFromD<
decltype(d_full)> v1{part1.raw};
2075 const VFromD<
decltype(d_full)> v2{part2.raw};
2076 const VFromD<
decltype(d_full)> v3{part3.raw};
2079 StoreU(A, d_full, unaligned);
2084template <
class D, HWY_IF_V_SIZE_LE_D(D, 4)>
2091 const VFromD<
decltype(d_full)> v0{part0.raw};
2092 const VFromD<
decltype(d_full)> v1{part1.raw};
2093 const VFromD<
decltype(d_full)> v2{part2.raw};
2094 const VFromD<
decltype(d_full)> v3{part3.raw};
2095 const auto v10 =
ZipLower(dw, v0, v1);
2096 const auto v32 =
ZipLower(dw, v2, v3);
2099 StoreU(v3210, d_full, buf);
2107#if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
2109#ifdef HWY_NATIVE_LOAD_N
2110#undef HWY_NATIVE_LOAD_N
2112#define HWY_NATIVE_LOAD_N
2115#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
2118template <
class DTo,
class DFrom>
2121#if HWY_TARGET <= HWY_SSE2
2148 return (num_lanes > 0) ?
LoadU(
d,
p) : no;
2155 const FixedTag<TFromD<D>, 1> d1;
2157 if (num_lanes >= 2)
return LoadU(
d,
p);
2158 if (num_lanes == 0)
return Zero(
d);
2159 return detail::LoadNResizeBitCast(
d, d1,
LoadU(d1,
p));
2166 const FixedTag<TFromD<D>, 1> d1;
2168 if (num_lanes >= 2)
return LoadU(
d,
p);
2169 if (num_lanes == 0)
return no;
2177 const FixedTag<TFromD<D>, 2> d2;
2178 const Half<
decltype(d2)> d1;
2180 if (num_lanes >= 4)
return LoadU(
d,
p);
2181 if (num_lanes == 0)
return Zero(
d);
2182 if (num_lanes == 1)
return detail::LoadNResizeBitCast(
d, d1,
LoadU(d1,
p));
2186 return (num_lanes == 2) ? v_lo :
InsertLane(v_lo, 2,
p[2]);
2193 const FixedTag<TFromD<D>, 2> d2;
2195 if (num_lanes >= 4)
return LoadU(
d,
p);
2196 if (num_lanes == 0)
return no;
2197 if (num_lanes == 1)
return InsertLane(no, 0,
p[0]);
2202 return (num_lanes == 2) ? v_lo :
InsertLane(v_lo, 2,
p[2]);
2209 const FixedTag<TFromD<D>, 4> d4;
2210 const Half<
decltype(d4)> d2;
2211 const Half<
decltype(d2)> d1;
2213 if (num_lanes >= 8)
return LoadU(
d,
p);
2214 if (num_lanes == 0)
return Zero(
d);
2215 if (num_lanes == 1)
return detail::LoadNResizeBitCast(
d, d1,
LoadU(d1,
p));
2217 const size_t leading_len = num_lanes & 4;
2220 if ((num_lanes & 2) != 0) {
2221 const VFromD<
decltype(d2)> v_trailing_lo2 =
LoadU(d2,
p + leading_len);
2222 if ((num_lanes & 1) != 0) {
2225 detail::LoadNResizeBitCast(d2, d1,
LoadU(d1,
p + leading_len + 2)),
2228 v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
2230 }
else if ((num_lanes & 1) != 0) {
2231 v_trailing = detail::LoadNResizeBitCast(d4, d1,
LoadU(d1,
p + leading_len));
2234 if (leading_len != 0) {
2237 return detail::LoadNResizeBitCast(
d, d4, v_trailing);
2245 const FixedTag<TFromD<D>, 4> d4;
2246 const Half<
decltype(d4)> d2;
2247 const Half<
decltype(d2)> d1;
2249 if (num_lanes >= 8)
return LoadU(
d,
p);
2250 if (num_lanes == 0)
return no;
2251 if (num_lanes == 1)
return InsertLane(no, 0,
p[0]);
2253 const size_t leading_len = num_lanes & 4;
2256 if ((num_lanes & 2) != 0) {
2257 const VFromD<
decltype(d2)> v_trailing_lo2 =
LoadU(d2,
p + leading_len);
2258 if ((num_lanes & 1) != 0) {
2268 }
else if ((num_lanes & 1) != 0) {
2272 if (leading_len != 0) {
2283 const FixedTag<TFromD<D>, 8> d8;
2284 const Half<
decltype(d8)> d4;
2285 const Half<
decltype(d4)> d2;
2286 const Half<
decltype(d2)> d1;
2288 if (num_lanes >= 16)
return LoadU(
d,
p);
2289 if (num_lanes == 0)
return Zero(
d);
2290 if (num_lanes == 1)
return detail::LoadNResizeBitCast(
d, d1,
LoadU(d1,
p));
2292 const size_t leading_len = num_lanes & 12;
2295 if ((num_lanes & 2) != 0) {
2296 const VFromD<
decltype(d2)> v_trailing_lo2 =
LoadU(d2,
p + leading_len);
2297 if ((num_lanes & 1) != 0) {
2300 detail::LoadNResizeBitCast(d2, d1,
LoadU(d1,
p + leading_len + 2)),
2303 v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
2305 }
else if ((num_lanes & 1) != 0) {
2306 v_trailing = detail::LoadNResizeBitCast(d4, d1,
LoadU(d1,
p + leading_len));
2309 if (leading_len != 0) {
2310 if (leading_len >= 8) {
2311 const VFromD<
decltype(d8)> v_hi7 =
2312 ((leading_len & 4) != 0)
2314 : detail::LoadNResizeBitCast(d8, d4, v_trailing);
2317 return detail::LoadNResizeBitCast(
d, d8,
2321 return detail::LoadNResizeBitCast(
d, d4, v_trailing);
2329 const FixedTag<TFromD<D>, 8> d8;
2330 const Half<
decltype(d8)> d4;
2331 const Half<
decltype(d4)> d2;
2332 const Half<
decltype(d2)> d1;
2334 if (num_lanes >= 16)
return LoadU(
d,
p);
2335 if (num_lanes == 0)
return no;
2336 if (num_lanes == 1)
return InsertLane(no, 0,
p[0]);
2338 const size_t leading_len = num_lanes & 12;
2341 if ((num_lanes & 2) != 0) {
2342 const VFromD<
decltype(d2)> v_trailing_lo2 =
LoadU(d2,
p + leading_len);
2343 if ((num_lanes & 1) != 0) {
2353 }
else if ((num_lanes & 1) != 0) {
2357 if (leading_len != 0) {
2358 if (leading_len >= 8) {
2359 const VFromD<
decltype(d8)> v_hi7 =
2360 ((leading_len & 4) != 0)
2371 const Repartition<uint32_t, D> du32;
2373 const VFromD<
decltype(du32)> lo8 =
2379#if HWY_MAX_BYTES >= 32
2381template <
class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
2386 const Half<
decltype(
d)> dh;
2387 const size_t half_N =
Lanes(dh);
2388 if (num_lanes <= half_N) {
2392 const VFromD<
decltype(dh)> v_hi =
LoadN(dh,
p + half_N, num_lanes - half_N);
2397template <
class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
2402 const Half<
decltype(
d)> dh;
2403 const size_t half_N =
Lanes(dh);
2405 if (num_lanes <= half_N) {
2410 const VFromD<
decltype(dh)> v_hi =
2411 LoadNOr(no_h, dh,
p + half_N, num_lanes - half_N);
2418template <
class D, HWY_IF_BF16_D(D)>
2421 const RebindToUnsigned<D> du;
2425template <
class D, HWY_IF_BF16_D(D)>
2428 const RebindToUnsigned<D> du;
2439#if HWY_MEM_OPS_MIGHT_FAULT
2440 if (num_lanes <= 0)
return Zero(
d);
2449#if HWY_MEM_OPS_MIGHT_FAULT
2450 if (num_lanes <= 0)
return no;
2460#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
2461#ifdef HWY_NATIVE_STORE_N
2462#undef HWY_NATIVE_STORE_N
2464#define HWY_NATIVE_STORE_N
2467#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
2470template <
class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
2472 constexpr size_t kMinShrVectBytes =
2474 const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
2479template <
class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
2487 typename T = TFromD<D>>
2489 size_t max_lanes_to_store) {
2490 if (max_lanes_to_store > 0) {
2496 typename T = TFromD<D>>
2498 size_t max_lanes_to_store) {
2499 if (max_lanes_to_store > 1) {
2501 }
else if (max_lanes_to_store == 1) {
2502 const FixedTag<TFromD<D>, 1> d1;
2508 typename T = TFromD<D>>
2510 size_t max_lanes_to_store) {
2511 const FixedTag<TFromD<D>, 2> d2;
2512 const Half<
decltype(d2)> d1;
2514 if (max_lanes_to_store > 1) {
2515 if (max_lanes_to_store >= 4) {
2519 if (max_lanes_to_store == 3) {
2523 }
else if (max_lanes_to_store == 1) {
2529 typename T = TFromD<D>>
2531 size_t max_lanes_to_store) {
2532 const FixedTag<TFromD<D>, 4> d4;
2533 const Half<
decltype(d4)> d2;
2534 const Half<
decltype(d2)> d1;
2536 if (max_lanes_to_store <= 1) {
2537 if (max_lanes_to_store == 1) {
2540 }
else if (max_lanes_to_store >= 8) {
2542 }
else if (max_lanes_to_store >= 4) {
2544 StoreN(detail::StoreNGetUpperHalf(d4, v), d4,
p + 4,
2545 max_lanes_to_store - 4);
2552 typename T = TFromD<D>>
2554 size_t max_lanes_to_store) {
2555 const FixedTag<TFromD<D>, 8> d8;
2556 const Half<
decltype(d8)> d4;
2557 const Half<
decltype(d4)> d2;
2558 const Half<
decltype(d2)> d1;
2560 if (max_lanes_to_store <= 1) {
2561 if (max_lanes_to_store == 1) {
2564 }
else if (max_lanes_to_store >= 16) {
2566 }
else if (max_lanes_to_store >= 8) {
2568 StoreN(detail::StoreNGetUpperHalf(d8, v), d8,
p + 8,
2569 max_lanes_to_store - 8);
2575#if HWY_MAX_BYTES >= 32
2576template <
class D, HWY_IF_V_SIZE_GT_D(D, 16),
typename T = TFromD<D>>
2578 size_t max_lanes_to_store) {
2579 const size_t N =
Lanes(
d);
2580 if (max_lanes_to_store >= N) {
2585 const Half<
decltype(
d)> dh;
2586 const size_t half_N =
Lanes(dh);
2587 if (max_lanes_to_store <= half_N) {
2597template <
class D,
typename T = TFromD<D>>
2599 size_t max_lanes_to_store) {
2600 const size_t N =
Lanes(
d);
2601 const size_t clamped_max_lanes_to_store =
HWY_MIN(max_lanes_to_store, N);
2602#if HWY_MEM_OPS_MIGHT_FAULT
2603 if (clamped_max_lanes_to_store == 0)
return;
2616#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
2617#ifdef HWY_NATIVE_SCATTER
2618#undef HWY_NATIVE_SCATTER
2620#define HWY_NATIVE_SCATTER
2623template <
class D,
typename T = TFromD<D>>
2627 using TI =
TFromD<
decltype(di)>;
2628 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2634 Store(offset, di, offset_lanes);
2636 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
2637 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2638 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2642template <
class D,
typename T = TFromD<D>>
2646 using TI =
TFromD<
decltype(di)>;
2647 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2653 Store(index, di, index_lanes);
2655 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2656 base[index_lanes[i]] = lanes[i];
2660template <
class D,
typename T = TFromD<D>>
2665 using TI =
TFromD<
decltype(di)>;
2666 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2672 Store(index, di, index_lanes);
2677 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2678 if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
2686#if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
2687#ifdef HWY_NATIVE_GATHER
2688#undef HWY_NATIVE_GATHER
2690#define HWY_NATIVE_GATHER
2693template <
class D,
typename T = TFromD<D>>
2697 using TI =
TFromD<
decltype(di)>;
2698 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2701 Store(offset, di, offset_lanes);
2704 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
2705 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2707 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
2709 return Load(
d, lanes);
2712template <
class D,
typename T = TFromD<D>>
2716 using TI =
TFromD<
decltype(di)>;
2717 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2720 Store(index, di, index_lanes);
2723 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2725 lanes[i] = base[index_lanes[i]];
2727 return Load(
d, lanes);
2730template <
class D,
typename T = TFromD<D>>
2735 using TI =
TFromD<
decltype(di)>;
2736 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2739 Store(index, di, index_lanes);
2745 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2747 lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
2749 return Load(
d, lanes);
2752template <
class D,
typename T = TFromD<D>>
2757 using TI =
TFromD<
decltype(di)>;
2758 static_assert(
sizeof(T) ==
sizeof(TI),
"Index/lane size must match");
2761 Store(index, di, index_lanes);
2770 for (
size_t i = 0; i <
MaxLanes(
d); ++i) {
2772 lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
2774 return Load(
d, lanes);
2781template <
class D,
typename T = TFromD<D>>
2784 const size_t max_lanes_to_store) {
2788template <
class D,
typename T = TFromD<D>>
2791 const size_t max_lanes_to_load) {
2797#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
2798#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
2799#undef HWY_NATIVE_INTEGER_ABS_DIFF
2801#define HWY_NATIVE_INTEGER_ABS_DIFF
2804template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
2811#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
2812#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2813#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2815#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2818template <
class V, HWY_IF_UI8_D(DFromV<V>),
2819 HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
2832#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2833#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
2834#undef HWY_NATIVE_I32_SATURATED_ADDSUB
2836#define HWY_NATIVE_I32_SATURATED_ADDSUB
2839template <
class V, HWY_IF_I32_D(DFromV<V>)>
2842 const auto sum =
Add(a, b);
2843 const auto overflow_mask =
AndNot(
Xor(a, b),
Xor(a, sum));
2844 const auto overflow_result =
2849template <
class V, HWY_IF_I32_D(DFromV<V>)>
2852 const auto diff =
Sub(a, b);
2853 const auto overflow_mask =
And(
Xor(a, b),
Xor(a, diff));
2854 const auto overflow_result =
2861#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2862#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
2863#undef HWY_NATIVE_I64_SATURATED_ADDSUB
2865#define HWY_NATIVE_I64_SATURATED_ADDSUB
2868template <
class V, HWY_IF_I64_D(DFromV<V>)>
2871 const auto sum =
Add(a, b);
2872 const auto overflow_mask =
AndNot(
Xor(a, b),
Xor(a, sum));
2873 const auto overflow_result =
2878template <
class V, HWY_IF_I64_D(DFromV<V>)>
2881 const auto diff =
Sub(a, b);
2882 const auto overflow_mask =
And(
Xor(a, b),
Xor(a, diff));
2883 const auto overflow_result =
2890#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2891#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
2892#undef HWY_NATIVE_U32_SATURATED_ADDSUB
2894#define HWY_NATIVE_U32_SATURATED_ADDSUB
2897template <
class V, HWY_IF_U32_D(DFromV<V>)>
2902template <
class V, HWY_IF_U32_D(DFromV<V>)>
2904 return Sub(a,
Min(a, b));
2909#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2910#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
2911#undef HWY_NATIVE_U64_SATURATED_ADDSUB
2913#define HWY_NATIVE_U64_SATURATED_ADDSUB
2916template <
class V, HWY_IF_U64_D(DFromV<V>)>
2921template <
class V, HWY_IF_U64_D(DFromV<V>)>
2923 return Sub(a,
Min(a, b));
2933 hwy::EnableIf<(
sizeof(TFromD<DN>) <
sizeof(TFromV<V>))>* =
nullptr,
2953#if HWY_TARGET != HWY_SCALAR || HWY_IDE
2967 const auto i2i_demote_result =
2983template <
class D,
class V>
2993#if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
2994#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
2995#undef HWY_NATIVE_PROMOTE_UPPER_TO
2997#define HWY_NATIVE_PROMOTE_UPPER_TO
3001#if HWY_TARGET != HWY_SCALAR || HWY_IDE
3003template <
class D,
class V>
3007 const Rebind<TFromV<V>,
decltype(
d)> dh;
3016#if HWY_TARGET != HWY_SCALAR
3029template <
size_t kToLaneSize,
class D,
class V>
3034#if HWY_IS_LITTLE_ENDIAN
3038 const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(
BitCast(d_to, v));
3042 const auto even_in_hi =
BitCast(d_to, v);
3046 return ShiftRight<kToLaneSize * 4>(even_in_hi);
3049template <
size_t kToLaneSize,
class D,
class V>
3054#if HWY_IS_LITTLE_ENDIAN
3057 const auto odd_in_hi =
BitCast(d_to, v);
3062 const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(
BitCast(d_to, v));
3066 return ShiftRight<kToLaneSize * 4>(odd_in_hi);
3070template <
size_t kToLaneSize,
class D,
class V>
3075#if HWY_IS_LITTLE_ENDIAN
3082 Set(d_to,
static_cast<TFromD<D>
>(
LimitsMax<TFromV<V>>())));
3090 return ShiftRight<kToLaneSize * 4>(
BitCast(d_to, v));
3094template <
size_t kToLaneSize,
class D,
class V>
3099#if HWY_IS_LITTLE_ENDIAN
3106 return ShiftRight<kToLaneSize * 4>(
BitCast(d_to, v));
3114 Set(d_to,
static_cast<TFromD<D>
>(
LimitsMax<TFromV<V>>())));
3120template <
size_t kToLaneSize,
class D,
class V>
3131template <
size_t kToLaneSize,
class D,
class V>
3152template <
class FromTypeTag,
class DF32,
class VBF16,
3157 FromTypeTag , DF32 d_to,
3160#if HWY_IS_LITTLE_ENDIAN
3185template <
class FromTypeTag,
class DF32,
class VBF16,
3190 FromTypeTag , DF32 d_to,
3193#if HWY_IS_LITTLE_ENDIAN
3209template <
class ToTypeTag,
size_t kToLaneSize,
class FromTypeTag,
class D,
3213 FromTypeTag , D d_to, V v) {
3217template <
class ToTypeTag,
size_t kToLaneSize,
class FromTypeTag,
class D,
3221 FromTypeTag , D d_to, V v) {
3226template <
class ToTypeTag,
size_t kToLaneSize,
class FromTypeTag,
class D,
3230 FromTypeTag , D d_to, V v) {
3237template <
class D,
class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
3238 class V2 = VFromD<Repartition<TFromV<V>, D>>,
3239 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
3246template <
class D,
class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
3247 class V2 = VFromD<Repartition<TFromV<V>, D>>,
3248 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
3258#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
3259#ifdef HWY_NATIVE_F16C
3260#undef HWY_NATIVE_F16C
3262#define HWY_NATIVE_F16C
3265template <
class D, HWY_IF_F32_D(D)>
3269 const Rebind<uint16_t,
decltype(df32)> du16;
3270 using VU32 =
VFromD<
decltype(du32)>;
3273 const VU32 sign = ShiftRight<15>(bits16);
3274 const VU32 biased_exp =
And(ShiftRight<10>(bits16),
Set(du32, 0x1F));
3275 const VU32 mantissa =
And(bits16,
Set(du32, 0x3FF));
3276 const VU32 subnormal =
3278 Set(df32, 1.0f / 16384 / 1024)));
3280 const VU32 biased_exp32 =
Add(biased_exp,
Set(du32, 127 - 15));
3281 const VU32 mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3282 const VU32 normal =
Or(ShiftLeft<23>(biased_exp32), mantissa32);
3283 const VU32 bits32 =
IfThenElse(
Eq(biased_exp,
Zero(du32)), subnormal, normal);
3284 return BitCast(df32,
Or(ShiftLeft<31>(sign), bits32));
3287template <
class D, HWY_IF_F16_D(D)>
3290 const Rebind<int32_t,
decltype(df16)> di32;
3313 const auto hi9_bits = ShiftRight<23>(
BitCast(du32, v));
3315 const auto k13 =
Set(du32, uint32_t{13u});
3318 const auto k126 =
Set(du32, uint32_t{126u});
3324#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
3325 const auto k255 =
Set(du32, uint32_t{255u});
3327 k255,
Max(
Min(
Add(
And(hi9_bits, k255), k13), k255), k126), hi9_bits);
3338 const Repartition<uint8_t,
decltype(du32)> du32_as_u8;
3339 const auto round_incr_hi9_bits =
BitCast(
3349 const auto round_incr =
BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));
3355 const auto rounded_val =
Add(v, round_incr);
3358 const auto rounded_val_bits =
BitCast(du32, rounded_val);
3394#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
3396 Min(
Add(ShiftLeft<10>(
And(round_incr_hi9_bits, k255)),
3397 And(rounded_val_bits,
3398 Set(du32,
static_cast<uint32_t
>(uint32_t{0xFFu} << 10)))),
3399 Set(du32,
static_cast<uint32_t
>(uint32_t{157u} << 10)));
3401 auto f16_exp_bits = ShiftLeft<10>(
BitCast(
3404 BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
3405 BitCast(du32_as_u8,
Set(du32, uint32_t{157})))));
3409 Sub(f16_exp_bits,
Set(du32,
static_cast<uint32_t
>(uint32_t{126u} << 10)));
3411 const auto f16_unmasked_mant_bits =
3414 const auto f16_exp_mant_bits =
3415 OrAnd(
BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
3416 Set(di32, int32_t{0x03FF}));
3422 const auto f16_bits_as_i32 =
3423 OrAnd(f16_exp_mant_bits, ShiftRight<16>(
BitCast(di32, rounded_val_bits)),
3424 Set(di32,
static_cast<int32_t
>(0xFFFF8000u)));
3431#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
3432#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
3433#undef HWY_NATIVE_DEMOTE_F64_TO_F16
3435#define HWY_NATIVE_DEMOTE_F64_TO_F16
3439template <
class D, HWY_IF_F16_D(D)>
3441 const Rebind<double, D> df64;
3442 const Rebind<uint64_t, D> du64;
3443 const Rebind<float, D> df32;
3449 const auto vf64_rounded =
OrAnd(
3451 BitCast(df64,
Set(du64,
static_cast<uint64_t
>(0xFFFFFFFFE0000000u)))),
3453 Set(du64,
static_cast<uint64_t
>(0x000000001FFFFFFFu)))),
3454 BitCast(df64,
Set(du64,
static_cast<uint64_t
>(0x0000000020000000ULL))));
3463#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
3464#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
3465#undef HWY_NATIVE_PROMOTE_F16_TO_F64
3467#define HWY_NATIVE_PROMOTE_F16_TO_F64
3471template <
class D, HWY_IF_F64_D(D)>
3480#if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
3481#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
3482#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
3484#define HWY_NATIVE_DEMOTE_F32_TO_BF16
3494template <
class V, HWY_IF_F32(TFromV<V>)>
3499 const auto is_non_nan =
Not(
IsNaN(v));
3500 const auto bits32 =
BitCast(du32, v);
3502 const auto round_incr =
3503 Add(
And(ShiftRight<16>(bits32),
Set(du32, uint32_t{1})),
3504 Set(du32, uint32_t{0x7FFFu}));
3506 RebindMask(du32, is_non_nan), bits32, round_incr);
3511template <
class D, HWY_IF_BF16_D(D)>
3514 const Twice<
decltype(du16)> dt_u16;
3517#if HWY_IS_LITTLE_ENDIAN
3526template <
class D, HWY_IF_BF16_D(D)>
3531 const auto rounded_a_bits32 =
3533 const auto rounded_b_bits32 =
3535#if HWY_IS_LITTLE_ENDIAN
3537 BitCast(du16, rounded_a_bits32)));
3540 BitCast(du16, rounded_a_bits32)));
3544template <
class D, HWY_IF_BF16_D(D)>
3549#if HWY_IS_LITTLE_ENDIAN
3564#if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
3565 defined(HWY_TARGET_TOGGLE))
3566#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3567#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3569#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3572#if HWY_HAVE_INTEGER64
3573template <
class D64, HWY_IF_UI64_D(D64)>
3582#if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
3583#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3584#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3586#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3600#if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
3601 defined(HWY_TARGET_TOGGLE))
3602#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3603#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3605#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3609template <
class D32, HWY_IF_UI32_D(D32)>
3619template <
class D, HWY_IF_UI64_D(D),
class V, HWY_IF_F32(TFromV<V>)>
3627#if HWY_TARGET != HWY_SCALAR
3628template <
class D, HWY_IF_UI64_D(D),
class V, HWY_IF_F32(TFromV<V>)>
3630#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3631 ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
3638 const Rebind<TFromV<V>,
decltype(
d)> dh;
3651template <
class D, HWY_IF_UI64_D(D),
class V, HWY_IF_F32(TFromV<V>)>
3653#if HWY_TARGET == HWY_SCALAR
3655#elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3656 ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
3663 const DFromV<
decltype(v)> d_from;
3674#if HWY_TARGET != HWY_SCALAR
3675template <
class D, HWY_IF_UI64_D(D),
class V, HWY_IF_F32(TFromV<V>)>
3677#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3678 ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
3685 const DFromV<
decltype(v)> d_from;
3686 const Rebind<TFromV<V>,
decltype(
d)> dh;
3692 return PromoteOddTo(
d, v);
3699#if HWY_TARGET != HWY_SCALAR
3702template <
class TypeTag,
size_t kLaneSize,
class V>
3723template <
class TypeTag,
size_t kLaneSize,
class V>
3741 (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))
3743#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3744#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3746#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3750#if HWY_TARGET != HWY_SCALAR || HWY_IDE
3762#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
3763#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
3764#undef HWY_NATIVE_LEADING_ZERO_COUNT
3766#define HWY_NATIVE_LEADING_ZERO_COUNT
3771template <
class D, HWY_IF_U32_D(D)>
3774#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
3796 return BitCast(
d, ShiftRight<23>(f32_bits));
3800template <
class V, HWY_IF_U32_D(DFromV<V>)>
3806#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
3812 return ShiftRight<23>(f32_bits);
3817 const Rebind<uint32_t,
decltype(
d)> du32;
3818 const auto f32_biased_exp_as_u32 =
3823#if HWY_TARGET != HWY_SCALAR
3826 const Half<
decltype(
d)> dh;
3827 const Rebind<uint32_t,
decltype(dh)> du32;
3834#if HWY_TARGET <= HWY_SSE2
3839 BitCast(di32, hi_f32_biased_exp_as_u32)));
3842 hi_f32_biased_exp_as_u32);
3849 const Rebind<uint32_t,
decltype(
d)> du32;
3850 const auto f32_biased_exp_as_u32 =
3852 return U8FromU32(f32_biased_exp_as_u32);
3855#if HWY_TARGET != HWY_SCALAR
3859 const Half<
decltype(
d)> dh;
3860 const Rebind<uint32_t,
decltype(dh)> du32;
3869#if HWY_TARGET <= HWY_SSE2
3872 const auto f32_biased_exp_as_i16 =
3874 BitCast(di32, hi_f32_biased_exp_as_u32));
3875 return DemoteTo(
d, f32_biased_exp_as_i16);
3878 du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32);
3885 const Half<
decltype(
d)> dh;
3886 const Half<
decltype(dh)> dq;
3887 const Rebind<uint32_t,
decltype(dq)> du32;
3903#if HWY_TARGET <= HWY_SSE2
3907 const auto lo_f32_biased_exp_as_i16 =
3909 BitCast(di32, f32_biased_exp_as_u32_q1));
3910 const auto hi_f32_biased_exp_as_i16 =
3912 BitCast(di32, f32_biased_exp_as_u32_q3));
3914 hi_f32_biased_exp_as_i16);
3917 du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1);
3919 du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3);
3921 hi_f32_biased_exp_as_u16);
3926#if HWY_TARGET == HWY_SCALAR
3929#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
3948template <
class D, HWY_IF_U64_D(D)>
3950#if HWY_TARGET == HWY_SCALAR
3951 const uint64_t u64_val =
GetLane(v);
3952 const float f32_val =
static_cast<float>(u64_val);
3953 const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
3954 return Set(
d,
static_cast<uint64_t
>(f32_bits >> 23));
3958 const auto f32_biased_exp_adj =
3961 const auto adj_f32_biased_exp =
Add(f32_biased_exp, f32_biased_exp_adj);
3963 return ShiftRight<32>(
BitCast(
3969template <
class V, HWY_IF_UNSIGNED_V(V)>
3989 return AndNot(ShiftRight<24>(v), v);
3994template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3998 using TU =
TFromD<
decltype(du)>;
4005template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4009 using TU =
TFromD<
decltype(du)>;
4011 constexpr TU kNumOfBitsInT{
sizeof(TU) * 8};
4014 const auto lz_count =
Sub(
Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp);
4021template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4026 using TU =
TFromD<
decltype(du)>;
4028 const auto vi =
BitCast(di, v);
4031 constexpr TU kNumOfBitsInT{
sizeof(TU) * 8};
4033 const auto tz_count =
Sub(f32_biased_exp,
Set(du, TU{127}));
4044#if HWY_TARGET != HWY_SCALAR || HWY_IDE
4059HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
4062 const auto mask =
Set(du, uint8_t{0xF});
4066 const VFromD<
decltype(du)> basisL =
4068 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
4069 const VFromD<
decltype(du)> basisU =
4071 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
4072 const auto sL =
And(state, mask);
4073 const auto sU = ShiftRight<4>(state);
4076 state =
Xor(gf4L, gf4U);
4082 du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
4084 du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
4085 const auto sL =
And(state, mask);
4086 const auto sU = ShiftRight<4>(state);
4087 const auto sX =
Xor(sU, sL);
4096 return Xor(affL, affU);
4104 const VFromD<
decltype(du)> affineL =
4106 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
4107 const VFromD<
decltype(du)> affineU =
4109 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
4110 return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
4111 Set(du, uint8_t{0x63}));
4117 const VFromD<
decltype(du)> gF2P4InvToGF2P8InvL =
4119 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
4120 const VFromD<
decltype(du)> gF2P4InvToGF2P8InvU =
4122 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);
4125 const auto b =
Xor(
Xor3(
Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
4126 Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
4127 Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
4128 Set(du, uint8_t{0x05}));
4136 return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
4137 gF2P4InvToGF2P8InvU);
4144#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
4145#ifdef HWY_NATIVE_AES
4146#undef HWY_NATIVE_AES
4148#define HWY_NATIVE_AES
4152#if HWY_TARGET != HWY_SCALAR
4161 du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
4170 du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
4180 return Xor(
Add(v, v), overflow);
4192 du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
4194 du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
4195 const auto d = GF2P8Mod11BMulBy2(state);
4197 const auto d_s2301 =
Xor(
d, s2301);
4198 const auto t_s2301 =
Xor(state, d_s2301);
4200 return Xor(d_s2301, t1230_s3012);
4212 du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
4214 du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
4216 const auto sx2 = GF2P8Mod11BMulBy2(state);
4217 const auto sx4 = GF2P8Mod11BMulBy2(sx2);
4218 const auto sx8 = GF2P8Mod11BMulBy2(sx4);
4219 const auto sx9 =
Xor(sx8, state);
4220 const auto sx11 =
Xor(sx9, sx2);
4221 const auto sx13 =
Xor(sx9, sx4);
4222 const auto sx14 =
Xor3(sx8, sx4, sx2);
4226 const auto sx13_2301_sx9_3012 =
TableLookupBytes(sx13_0123_sx9_1230, v2301);
4227 return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
4236 state = detail::SubBytes(state);
4237 state = detail::ShiftRows(state);
4238 state = detail::MixColumns(state);
4239 state =
Xor(state, round_key);
4246 state = detail::SubBytes(state);
4247 state = detail::ShiftRows(state);
4248 state =
Xor(state, round_key);
4254 return detail::InvMixColumns(state);
4259 state = detail::InvSubBytes(state);
4260 state = detail::InvShiftRows(state);
4261 state = detail::InvMixColumns(state);
4262 state =
Xor(state, round_key);
4269 state = detail::InvSubBytes(state);
4270 state = detail::InvShiftRows(state);
4271 state =
Xor(state, round_key);
4275template <u
int8_t kRcon,
class V, HWY_IF_U8_D(DFromV<V>)>
4278 const V rconXorMask =
Dup128VecFromValues(
d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
4279 0, 0, kRcon, 0, 0, 0);
4280 const V rotWordShuffle =
Dup128VecFromValues(
d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
4281 13, 14, 15, 13, 14, 15, 12);
4282 const auto sub_word_result = detail::SubBytes(v);
4283 const auto rot_word_result =
4285 return Xor(rot_word_result, rconXorMask);
4294 static_assert(
IsSame<
TFromD<
decltype(
d)>, uint64_t>(),
"V must be u64");
4295 const auto k1 =
Set(
d, 0x1111111111111111ULL);
4296 const auto k2 =
Set(
d, 0x2222222222222222ULL);
4297 const auto k4 =
Set(
d, 0x4444444444444444ULL);
4298 const auto k8 =
Set(
d, 0x8888888888888888ULL);
4299 const auto a0 =
And(a, k1);
4300 const auto a1 =
And(a, k2);
4301 const auto a2 =
And(a, k4);
4302 const auto a3 =
And(a, k8);
4303 const auto b0 =
And(b, k1);
4304 const auto b1 =
And(b, k2);
4305 const auto b2 =
And(b, k4);
4306 const auto b3 =
And(b, k8);
4322 static_assert(
IsSame<
TFromD<
decltype(
d)>, uint64_t>(),
"V must be u64");
4323 const auto k1 =
Set(
d, 0x1111111111111111ULL);
4324 const auto k2 =
Set(
d, 0x2222222222222222ULL);
4325 const auto k4 =
Set(
d, 0x4444444444444444ULL);
4326 const auto k8 =
Set(
d, 0x8888888888888888ULL);
4327 const auto a0 =
And(a, k1);
4328 const auto a1 =
And(a, k2);
4329 const auto a2 =
And(a, k4);
4330 const auto a3 =
And(a, k8);
4331 const auto b0 =
And(b, k1);
4332 const auto b1 =
And(b, k2);
4333 const auto b2 =
And(b, k4);
4334 const auto b3 =
And(b, k8);
4352#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
4353#ifdef HWY_NATIVE_POPCNT
4354#undef HWY_NATIVE_POPCNT
4356#define HWY_NATIVE_POPCNT
4362#if HWY_TARGET == HWY_RVV
4363#define HWY_IF_POPCNT(D) \
4364 hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
4368#define HWY_IF_POPCNT(D) void* = nullptr
4371template <
class V,
class D = DFromV<V>, HWY_IF_U8_D(D),
4372 HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
4376 Dup128VecFromValues(
d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
4377 const auto lo =
And(v,
Set(
d, uint8_t{0xF}));
4378 const auto hi = ShiftRight<4>(v);
4383#if HWY_TARGET != HWY_RVV
4385template <
class V,
class D = DFromV<V>, HWY_IF_U8_D(D),
4386 HWY_IF_V_SIZE_LE_D(D, 8)>
4390 const V k33 =
Set(
d, uint8_t{0x33});
4391 v =
Sub(v,
And(ShiftRight<1>(v),
Set(
d, uint8_t{0x55})));
4392 v =
Add(
And(ShiftRight<2>(v), k33),
And(v, k33));
4393 return And(
Add(v, ShiftRight<4>(v)),
Set(
d, uint8_t{0x0F}));
4397template <
class V,
class D = DFromV<V>, HWY_IF_U16_D(D)>
4402 return Add(ShiftRight<8>(vals),
And(vals,
Set(
d, uint16_t{0xFF})));
4405template <
class V,
class D = DFromV<V>, HWY_IF_U32_D(D)>
4410 return Add(ShiftRight<16>(vals),
And(vals,
Set(
d, uint32_t{0xFF})));
4413#if HWY_HAVE_INTEGER64
4414template <
class V,
class D = DFromV<V>, HWY_IF_U64_D(D)>
4419 return Add(ShiftRight<32>(vals),
And(vals,
Set(
d, 0xFFULL)));
4427#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
4428#ifdef HWY_NATIVE_MUL_8
4429#undef HWY_NATIVE_MUL_8
4431#define HWY_NATIVE_MUL_8
4452 const Half<
decltype(
d)> dh;
4458 const VFromD<
decltype(dw)> m0 = a0 * b0;
4459 const VFromD<
decltype(dw)> m1 = a1 * b1;
4467#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
4468#ifdef HWY_NATIVE_MUL_64
4469#undef HWY_NATIVE_MUL_64
4471#define HWY_NATIVE_MUL_64
4479 using T =
TFromD<
decltype(
d)>;
4481 const TU xu =
static_cast<TU
>(
GetLane(x));
4482 const TU yu =
static_cast<TU
>(
GetLane(y));
4483 return Set(
d,
static_cast<T
>(xu * yu));
4486template <
class V,
class D64 = DFromV<V>, HWY_IF_U64_D(D64),
4487 HWY_IF_V_SIZE_GT_D(D64, 8)>
4489 RepartitionToNarrow<D64> d32;
4495 auto hi =
BitCast(d32, ShiftLeft<32>(
BitCast(D64{}, lohi + hilo)));
4496 return BitCast(D64{}, lolo + hi);
4498template <
class V,
class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
4499 HWY_IF_V_SIZE_GT_D(DI64, 8)>
4501 RebindToUnsigned<DI64> du64;
4509#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
4510#ifdef HWY_NATIVE_INT_FMA
4511#undef HWY_NATIVE_INT_FMA
4513#define HWY_NATIVE_INT_FMA
4516#ifdef HWY_NATIVE_INT_FMSUB
4517#undef HWY_NATIVE_INT_FMSUB
4519#define HWY_NATIVE_INT_FMSUB
4522template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4524 return Add(
Mul(mul, x), add);
4527template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4529 return Sub(add,
Mul(mul, x));
4532template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4534 return Sub(
Mul(mul, x), sub);
4539#if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
4540#ifdef HWY_NATIVE_INT_FMSUB
4541#undef HWY_NATIVE_INT_FMSUB
4543#define HWY_NATIVE_INT_FMSUB
4546template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4548 const DFromV<
decltype(mul)>
d;
4555template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4557 const DFromV<
decltype(mul)>
d;
4567template <
class V, HWY_IF_LANES_D(DFromV<V>, 1)>
4569 return MulSub(mul, x, sub_or_add);
4579template <
class V, HWY_IF_MULADDSUB_V(V)>
4581 using D = DFromV<V>;
4582 using T = TFromD<D>;
4586 const Rebind<TNegate, D> d_negate;
4590 return MulAdd(mul, x, add);
4594#if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
4595#ifdef HWY_NATIVE_INT_DIV
4596#undef HWY_NATIVE_INT_DIV
4598#define HWY_NATIVE_INT_DIV
4611template <
class D,
class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4616template <
class D,
class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4621#if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4622template <
class D,
class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
4630template <
class D,
class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
4632 const Twice<
decltype(df32)> dt_f32;
4637#if HWY_IS_LITTLE_ENDIAN
4653template <
class D,
class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
4655 const Twice<
decltype(df32)> dt_f32;
4660#if HWY_IS_LITTLE_ENDIAN
4701#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4712 if (kOrigLaneSize > 1) {
4714 Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4745 auto q1_negate_mask = r0;
4747 q1_negate_mask =
Xor(q1_negate_mask,
BitCast(di, b));
4782 const Rebind<TF,
decltype(
d)> df;
4784 if (!IsSigned<T>()) {
4800#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4804 Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4806 const auto flt_recip_b =
Div(
Set(df, TF(1.0)), flt_b);
4849#if !HWY_HAVE_FLOAT64
4852 if (
sizeof(T) == 8) {
4891 auto q4_negate_mask = r3;
4893 q4_negate_mask =
Xor(q4_negate_mask,
BitCast(di, b));
4904template <
size_t kOrigLaneSize,
class V,
4917 const Rebind<TW,
decltype(
d)> dw;
4919#if HWY_TARGET <= HWY_SSE2
4931 const decltype(dw) dw_i;
4932 const decltype(
d) d_demote_to;
4936 d,
DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
4940template <
size_t kOrigLaneSize,
class V,
4948#if HWY_TARGET <= HWY_SSE2
4960 const decltype(dw) dw_i;
4961 const decltype(
d) d_demote_to;
4966 IntDivUsingFloatDiv<kOrigLaneSize>(
4968 IntDivUsingFloatDiv<kOrigLaneSize>(
4972#if !HWY_HAVE_FLOAT16
4973template <
size_t kOrigLaneSize,
class V, HWY_IF_UI8(TFromV<V>),
4974 HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
4977 const Rebind<MakeWide<TFromV<V>>,
decltype(
d)> dw;
4979#if HWY_TARGET <= HWY_SSE2
4985 const decltype(dw) dw_i;
4991template <
size_t kOrigLaneSize,
class V, HWY_IF_UI8(TFromV<V>),
4992 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4997#if HWY_TARGET <= HWY_SSE2
5003 const decltype(dw) dw_i;
5012template <
size_t kOrigLaneSize,
class V,
5016 return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
5020template <
size_t kOrigLaneSize,
class V, HWY_IF_UI32(TFromV<V>),
5021 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
5024 const Rebind<double,
decltype(
d)> df64;
5033template <
size_t kOrigLaneSize,
class V, HWY_IF_UI32(TFromV<V>),
5034 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
5037 const Half<
decltype(
d)> dh;
5059 (1 << 2) | (1 << 4) | (1 << 8))>
5064#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
5065 HWY_TARGET == HWY_WASM_EMU256
5066template <
size_t kOrigLaneSize,
class V, HWY_IF_UI8(TFromV<V>),
5067 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
5074template <
size_t kOrigLaneSize,
class V, HWY_IF_UI8(TFromV<V>),
5075 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
5088#if HWY_TARGET == HWY_SCALAR
5090template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5092 return detail::IntDiv<sizeof(T)>(a, b);
5094template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5096 return detail::IntMod<sizeof(T)>(a, b);
5101template <
class T,
size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5103 return detail::IntDiv<sizeof(T)>(a, b);
5106template <
class T,
size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5108 return detail::IntMod<sizeof(T)>(a, b);
5112template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5114 return detail::IntDiv<sizeof(T)>(a, b);
5116template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5118 return detail::IntMod<sizeof(T)>(a, b);
5123template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5125 return detail::IntDiv<sizeof(T)>(a, b);
5127template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5129 return detail::IntMod<sizeof(T)>(a, b);
5139#if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
5140 defined(HWY_TARGET_TOGGLE))
5142#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
5143#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
5145#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
5148template <
class DI16,
class VU8,
class VI8,
5149 class VU8_2 = Vec<Repartition<uint8_t, DI16>>,
HWY_IF_I16_D(DI16),
5159 const auto a1 =
BitCast(di16, PromoteOddTo(du16, a));
5160 const auto b1 = PromoteOddTo(di16, b);
5169#if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
5170 defined(HWY_TARGET_TOGGLE))
5172#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5173#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5175#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5178template <
class DI32, HWY_IF_I32_D(DI32)>
5189 const auto mul_overflow =
5193 Add(product, mul_overflow));
5200#if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
5201 defined(HWY_TARGET_TOGGLE))
5203#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5204#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5206#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5209template <
class DI32, HWY_IF_I32_D(DI32)>
5211 VFromD<Rebind<int16_t, DI32>> a,
5212 VFromD<Rebind<int16_t, DI32>> b,
5214 const Repartition<int16_t, DI32> dt_i16;
5229#if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
5230 defined(HWY_TARGET_TOGGLE))
5232#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5233#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5235#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5238template <
class DI32, HWY_IF_I32_D(DI32)>
5248 const auto a1 = PromoteOddTo(di16, a);
5249 const auto b1 = PromoteOddTo(di16, b);
5257#if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
5258 defined(HWY_TARGET_TOGGLE))
5260#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5261#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5263#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5266template <
class DU32, HWY_IF_U32_D(DU32)>
5274 const auto lo8_mask =
Set(di16, int16_t{0x00FF});
5275 const auto a0 =
And(
BitCast(di16, a), lo8_mask);
5276 const auto b0 =
And(
BitCast(di16, b), lo8_mask);
5287#if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
5288 defined(HWY_TARGET_TOGGLE))
5290#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5291#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5293#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5296template <
class DI32, HWY_IF_I32_D(DI32)>
5303 const auto a0 =
And(
BitCast(di16, a_u),
Set(di16, int16_t{0x00FF}));
5304 const auto b0 = ShiftRight<8>(ShiftLeft<8>(
BitCast(di16, b_i)));
5306 const auto a1 =
BitCast(di16, ShiftRight<8>(
BitCast(du16, a_u)));
5307 const auto b1 = ShiftRight<8>(
BitCast(di16, b_i));
5321#if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
5322 defined(HWY_TARGET_TOGGLE))
5324#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5325#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5327#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5330#if HWY_HAVE_INTEGER64
5331template <
class DI64, HWY_IF_I64_D(DI64)>
5333 DI64 di64,
VFromD<Repartition<int16_t, DI64>> a,
5343 const auto i32_pairwise_sum_overflow =
5348 const auto hi32_mask =
Set(di64,
static_cast<int64_t
>(~int64_t{0xFFFFFFFF}));
5349 const auto p0_zero_out_mask =
5350 ShiftLeft<32>(
BitCast(di64, i32_pairwise_sum_overflow));
5351 const auto p1_zero_out_mask =
5352 And(
BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);
5356 ShiftRight<32>(ShiftLeft<32>(
BitCast(di64, i32_pairwise_sum))));
5358 AndNot(p1_zero_out_mask, ShiftRight<32>(
BitCast(di64, i32_pairwise_sum)));
5360 return Add(sum,
Add(p0, p1));
5365#if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
5366 defined(HWY_TARGET_TOGGLE))
5368#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5369#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5371#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5374#if HWY_HAVE_INTEGER64
5375template <
class DU64, HWY_IF_U64_D(DU64)>
5377 DU64 du64,
VFromD<Repartition<uint16_t, DU64>> a,
5379 const auto u32_even_prod =
MulEven(a, b);
5380 const auto u32_odd_prod =
MulOdd(a, b);
5385 Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));
5387 return Add(sum,
Add(p0, p1));
5394#if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
5395#ifdef HWY_NATIVE_F64_APPROX_RECIP
5396#undef HWY_NATIVE_F64_APPROX_RECIP
5398#define HWY_NATIVE_F64_APPROX_RECIP
5402template <
class V, HWY_IF_F64_D(DFromV<V>)>
5413#if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
5414#ifdef HWY_NATIVE_F64_APPROX_RSQRT
5415#undef HWY_NATIVE_F64_APPROX_RSQRT
5417#define HWY_NATIVE_F64_APPROX_RSQRT
5421template <
class V, HWY_IF_F64_D(DFromV<V>)>
5425 const auto half =
Mul(v,
Set(
d, 0.5));
5427 const auto guess =
BitCast(
d,
Sub(
Set(du, uint64_t{0x5FE6EB50C7B537A9u}),
5428 ShiftRight<1>(
BitCast(du, v))));
5438#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
5439#ifdef HWY_NATIVE_COMPRESS8
5440#undef HWY_NATIVE_COMPRESS8
5442#define HWY_NATIVE_COMPRESS8
5445template <
class V,
class D,
typename T, HWY_IF_T_SIZE(T, 1)>
5455 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5456 1, 0, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5457 2, 0, 1, 3, 4, 5, 6, 7, 0, 2, 1, 3, 4, 5, 6, 7,
5458 1, 2, 0, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5459 3, 0, 1, 2, 4, 5, 6, 7, 0, 3, 1, 2, 4, 5, 6, 7,
5460 1, 3, 0, 2, 4, 5, 6, 7, 0, 1, 3, 2, 4, 5, 6, 7,
5461 2, 3, 0, 1, 4, 5, 6, 7, 0, 2, 3, 1, 4, 5, 6, 7,
5462 1, 2, 3, 0, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5463 4, 0, 1, 2, 3, 5, 6, 7, 0, 4, 1, 2, 3, 5, 6, 7,
5464 1, 4, 0, 2, 3, 5, 6, 7, 0, 1, 4, 2, 3, 5, 6, 7,
5465 2, 4, 0, 1, 3, 5, 6, 7, 0, 2, 4, 1, 3, 5, 6, 7,
5466 1, 2, 4, 0, 3, 5, 6, 7, 0, 1, 2, 4, 3, 5, 6, 7,
5467 3, 4, 0, 1, 2, 5, 6, 7, 0, 3, 4, 1, 2, 5, 6, 7,
5468 1, 3, 4, 0, 2, 5, 6, 7, 0, 1, 3, 4, 2, 5, 6, 7,
5469 2, 3, 4, 0, 1, 5, 6, 7, 0, 2, 3, 4, 1, 5, 6, 7,
5470 1, 2, 3, 4, 0, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5471 5, 0, 1, 2, 3, 4, 6, 7, 0, 5, 1, 2, 3, 4, 6, 7,
5472 1, 5, 0, 2, 3, 4, 6, 7, 0, 1, 5, 2, 3, 4, 6, 7,
5473 2, 5, 0, 1, 3, 4, 6, 7, 0, 2, 5, 1, 3, 4, 6, 7,
5474 1, 2, 5, 0, 3, 4, 6, 7, 0, 1, 2, 5, 3, 4, 6, 7,
5475 3, 5, 0, 1, 2, 4, 6, 7, 0, 3, 5, 1, 2, 4, 6, 7,
5476 1, 3, 5, 0, 2, 4, 6, 7, 0, 1, 3, 5, 2, 4, 6, 7,
5477 2, 3, 5, 0, 1, 4, 6, 7, 0, 2, 3, 5, 1, 4, 6, 7,
5478 1, 2, 3, 5, 0, 4, 6, 7, 0, 1, 2, 3, 5, 4, 6, 7,
5479 4, 5, 0, 1, 2, 3, 6, 7, 0, 4, 5, 1, 2, 3, 6, 7,
5480 1, 4, 5, 0, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
5481 2, 4, 5, 0, 1, 3, 6, 7, 0, 2, 4, 5, 1, 3, 6, 7,
5482 1, 2, 4, 5, 0, 3, 6, 7, 0, 1, 2, 4, 5, 3, 6, 7,
5483 3, 4, 5, 0, 1, 2, 6, 7, 0, 3, 4, 5, 1, 2, 6, 7,
5484 1, 3, 4, 5, 0, 2, 6, 7, 0, 1, 3, 4, 5, 2, 6, 7,
5485 2, 3, 4, 5, 0, 1, 6, 7, 0, 2, 3, 4, 5, 1, 6, 7,
5486 1, 2, 3, 4, 5, 0, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5487 6, 0, 1, 2, 3, 4, 5, 7, 0, 6, 1, 2, 3, 4, 5, 7,
5488 1, 6, 0, 2, 3, 4, 5, 7, 0, 1, 6, 2, 3, 4, 5, 7,
5489 2, 6, 0, 1, 3, 4, 5, 7, 0, 2, 6, 1, 3, 4, 5, 7,
5490 1, 2, 6, 0, 3, 4, 5, 7, 0, 1, 2, 6, 3, 4, 5, 7,
5491 3, 6, 0, 1, 2, 4, 5, 7, 0, 3, 6, 1, 2, 4, 5, 7,
5492 1, 3, 6, 0, 2, 4, 5, 7, 0, 1, 3, 6, 2, 4, 5, 7,
5493 2, 3, 6, 0, 1, 4, 5, 7, 0, 2, 3, 6, 1, 4, 5, 7,
5494 1, 2, 3, 6, 0, 4, 5, 7, 0, 1, 2, 3, 6, 4, 5, 7,
5495 4, 6, 0, 1, 2, 3, 5, 7, 0, 4, 6, 1, 2, 3, 5, 7,
5496 1, 4, 6, 0, 2, 3, 5, 7, 0, 1, 4, 6, 2, 3, 5, 7,
5497 2, 4, 6, 0, 1, 3, 5, 7, 0, 2, 4, 6, 1, 3, 5, 7,
5498 1, 2, 4, 6, 0, 3, 5, 7, 0, 1, 2, 4, 6, 3, 5, 7,
5499 3, 4, 6, 0, 1, 2, 5, 7, 0, 3, 4, 6, 1, 2, 5, 7,
5500 1, 3, 4, 6, 0, 2, 5, 7, 0, 1, 3, 4, 6, 2, 5, 7,
5501 2, 3, 4, 6, 0, 1, 5, 7, 0, 2, 3, 4, 6, 1, 5, 7,
5502 1, 2, 3, 4, 6, 0, 5, 7, 0, 1, 2, 3, 4, 6, 5, 7,
5503 5, 6, 0, 1, 2, 3, 4, 7, 0, 5, 6, 1, 2, 3, 4, 7,
5504 1, 5, 6, 0, 2, 3, 4, 7, 0, 1, 5, 6, 2, 3, 4, 7,
5505 2, 5, 6, 0, 1, 3, 4, 7, 0, 2, 5, 6, 1, 3, 4, 7,
5506 1, 2, 5, 6, 0, 3, 4, 7, 0, 1, 2, 5, 6, 3, 4, 7,
5507 3, 5, 6, 0, 1, 2, 4, 7, 0, 3, 5, 6, 1, 2, 4, 7,
5508 1, 3, 5, 6, 0, 2, 4, 7, 0, 1, 3, 5, 6, 2, 4, 7,
5509 2, 3, 5, 6, 0, 1, 4, 7, 0, 2, 3, 5, 6, 1, 4, 7,
5510 1, 2, 3, 5, 6, 0, 4, 7, 0, 1, 2, 3, 5, 6, 4, 7,
5511 4, 5, 6, 0, 1, 2, 3, 7, 0, 4, 5, 6, 1, 2, 3, 7,
5512 1, 4, 5, 6, 0, 2, 3, 7, 0, 1, 4, 5, 6, 2, 3, 7,
5513 2, 4, 5, 6, 0, 1, 3, 7, 0, 2, 4, 5, 6, 1, 3, 7,
5514 1, 2, 4, 5, 6, 0, 3, 7, 0, 1, 2, 4, 5, 6, 3, 7,
5515 3, 4, 5, 6, 0, 1, 2, 7, 0, 3, 4, 5, 6, 1, 2, 7,
5516 1, 3, 4, 5, 6, 0, 2, 7, 0, 1, 3, 4, 5, 6, 2, 7,
5517 2, 3, 4, 5, 6, 0, 1, 7, 0, 2, 3, 4, 5, 6, 1, 7,
5518 1, 2, 3, 4, 5, 6, 0, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5519 7, 0, 1, 2, 3, 4, 5, 6, 0, 7, 1, 2, 3, 4, 5, 6,
5520 1, 7, 0, 2, 3, 4, 5, 6, 0, 1, 7, 2, 3, 4, 5, 6,
5521 2, 7, 0, 1, 3, 4, 5, 6, 0, 2, 7, 1, 3, 4, 5, 6,
5522 1, 2, 7, 0, 3, 4, 5, 6, 0, 1, 2, 7, 3, 4, 5, 6,
5523 3, 7, 0, 1, 2, 4, 5, 6, 0, 3, 7, 1, 2, 4, 5, 6,
5524 1, 3, 7, 0, 2, 4, 5, 6, 0, 1, 3, 7, 2, 4, 5, 6,
5525 2, 3, 7, 0, 1, 4, 5, 6, 0, 2, 3, 7, 1, 4, 5, 6,
5526 1, 2, 3, 7, 0, 4, 5, 6, 0, 1, 2, 3, 7, 4, 5, 6,
5527 4, 7, 0, 1, 2, 3, 5, 6, 0, 4, 7, 1, 2, 3, 5, 6,
5528 1, 4, 7, 0, 2, 3, 5, 6, 0, 1, 4, 7, 2, 3, 5, 6,
5529 2, 4, 7, 0, 1, 3, 5, 6, 0, 2, 4, 7, 1, 3, 5, 6,
5530 1, 2, 4, 7, 0, 3, 5, 6, 0, 1, 2, 4, 7, 3, 5, 6,
5531 3, 4, 7, 0, 1, 2, 5, 6, 0, 3, 4, 7, 1, 2, 5, 6,
5532 1, 3, 4, 7, 0, 2, 5, 6, 0, 1, 3, 4, 7, 2, 5, 6,
5533 2, 3, 4, 7, 0, 1, 5, 6, 0, 2, 3, 4, 7, 1, 5, 6,
5534 1, 2, 3, 4, 7, 0, 5, 6, 0, 1, 2, 3, 4, 7, 5, 6,
5535 5, 7, 0, 1, 2, 3, 4, 6, 0, 5, 7, 1, 2, 3, 4, 6,
5536 1, 5, 7, 0, 2, 3, 4, 6, 0, 1, 5, 7, 2, 3, 4, 6,
5537 2, 5, 7, 0, 1, 3, 4, 6, 0, 2, 5, 7, 1, 3, 4, 6,
5538 1, 2, 5, 7, 0, 3, 4, 6, 0, 1, 2, 5, 7, 3, 4, 6,
5539 3, 5, 7, 0, 1, 2, 4, 6, 0, 3, 5, 7, 1, 2, 4, 6,
5540 1, 3, 5, 7, 0, 2, 4, 6, 0, 1, 3, 5, 7, 2, 4, 6,
5541 2, 3, 5, 7, 0, 1, 4, 6, 0, 2, 3, 5, 7, 1, 4, 6,
5542 1, 2, 3, 5, 7, 0, 4, 6, 0, 1, 2, 3, 5, 7, 4, 6,
5543 4, 5, 7, 0, 1, 2, 3, 6, 0, 4, 5, 7, 1, 2, 3, 6,
5544 1, 4, 5, 7, 0, 2, 3, 6, 0, 1, 4, 5, 7, 2, 3, 6,
5545 2, 4, 5, 7, 0, 1, 3, 6, 0, 2, 4, 5, 7, 1, 3, 6,
5546 1, 2, 4, 5, 7, 0, 3, 6, 0, 1, 2, 4, 5, 7, 3, 6,
5547 3, 4, 5, 7, 0, 1, 2, 6, 0, 3, 4, 5, 7, 1, 2, 6,
5548 1, 3, 4, 5, 7, 0, 2, 6, 0, 1, 3, 4, 5, 7, 2, 6,
5549 2, 3, 4, 5, 7, 0, 1, 6, 0, 2, 3, 4, 5, 7, 1, 6,
5550 1, 2, 3, 4, 5, 7, 0, 6, 0, 1, 2, 3, 4, 5, 7, 6,
5551 6, 7, 0, 1, 2, 3, 4, 5, 0, 6, 7, 1, 2, 3, 4, 5,
5552 1, 6, 7, 0, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5,
5553 2, 6, 7, 0, 1, 3, 4, 5, 0, 2, 6, 7, 1, 3, 4, 5,
5554 1, 2, 6, 7, 0, 3, 4, 5, 0, 1, 2, 6, 7, 3, 4, 5,
5555 3, 6, 7, 0, 1, 2, 4, 5, 0, 3, 6, 7, 1, 2, 4, 5,
5556 1, 3, 6, 7, 0, 2, 4, 5, 0, 1, 3, 6, 7, 2, 4, 5,
5557 2, 3, 6, 7, 0, 1, 4, 5, 0, 2, 3, 6, 7, 1, 4, 5,
5558 1, 2, 3, 6, 7, 0, 4, 5, 0, 1, 2, 3, 6, 7, 4, 5,
5559 4, 6, 7, 0, 1, 2, 3, 5, 0, 4, 6, 7, 1, 2, 3, 5,
5560 1, 4, 6, 7, 0, 2, 3, 5, 0, 1, 4, 6, 7, 2, 3, 5,
5561 2, 4, 6, 7, 0, 1, 3, 5, 0, 2, 4, 6, 7, 1, 3, 5,
5562 1, 2, 4, 6, 7, 0, 3, 5, 0, 1, 2, 4, 6, 7, 3, 5,
5563 3, 4, 6, 7, 0, 1, 2, 5, 0, 3, 4, 6, 7, 1, 2, 5,
5564 1, 3, 4, 6, 7, 0, 2, 5, 0, 1, 3, 4, 6, 7, 2, 5,
5565 2, 3, 4, 6, 7, 0, 1, 5, 0, 2, 3, 4, 6, 7, 1, 5,
5566 1, 2, 3, 4, 6, 7, 0, 5, 0, 1, 2, 3, 4, 6, 7, 5,
5567 5, 6, 7, 0, 1, 2, 3, 4, 0, 5, 6, 7, 1, 2, 3, 4,
5568 1, 5, 6, 7, 0, 2, 3, 4, 0, 1, 5, 6, 7, 2, 3, 4,
5569 2, 5, 6, 7, 0, 1, 3, 4, 0, 2, 5, 6, 7, 1, 3, 4,
5570 1, 2, 5, 6, 7, 0, 3, 4, 0, 1, 2, 5, 6, 7, 3, 4,
5571 3, 5, 6, 7, 0, 1, 2, 4, 0, 3, 5, 6, 7, 1, 2, 4,
5572 1, 3, 5, 6, 7, 0, 2, 4, 0, 1, 3, 5, 6, 7, 2, 4,
5573 2, 3, 5, 6, 7, 0, 1, 4, 0, 2, 3, 5, 6, 7, 1, 4,
5574 1, 2, 3, 5, 6, 7, 0, 4, 0, 1, 2, 3, 5, 6, 7, 4,
5575 4, 5, 6, 7, 0, 1, 2, 3, 0, 4, 5, 6, 7, 1, 2, 3,
5576 1, 4, 5, 6, 7, 0, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3,
5577 2, 4, 5, 6, 7, 0, 1, 3, 0, 2, 4, 5, 6, 7, 1, 3,
5578 1, 2, 4, 5, 6, 7, 0, 3, 0, 1, 2, 4, 5, 6, 7, 3,
5579 3, 4, 5, 6, 7, 0, 1, 2, 0, 3, 4, 5, 6, 7, 1, 2,
5580 1, 3, 4, 5, 6, 7, 0, 2, 0, 1, 3, 4, 5, 6, 7, 2,
5581 2, 3, 4, 5, 6, 7, 0, 1, 0, 2, 3, 4, 5, 6, 7, 1,
5582 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
5584 for (
size_t i = 0; i <
Lanes(
d); i += 8) {
5587 const size_t bits8 = bits[i / 8];
5590 StoreU(compressed, d8, pos);
5593 return static_cast<size_t>(pos - unaligned);
5596template <
class V,
class M,
class D,
typename T, HWY_IF_T_SIZE(T, 1)>
5603template <
class V,
class M,
class D,
typename T, HWY_IF_T_SIZE(T, 1)>
5613template <
class V,
class M,
typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
5618 return Load(
d, lanes);
5621template <
class V,
typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
5626 return Load(
d, lanes);
5629template <
class V,
class M,
typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
5640#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
5641#ifdef HWY_NATIVE_EXPAND
5642#undef HWY_NATIVE_EXPAND
5644#define HWY_NATIVE_EXPAND
5658 static_assert(N <= 8,
"Should only be called for half-vectors");
5661 alignas(16)
static constexpr uint8_t table[2048] = {
5663 128, 128, 128, 128, 128, 128, 128, 128,
5664 0, 128, 128, 128, 128, 128, 128, 128,
5665 128, 0, 128, 128, 128, 128, 128, 128,
5666 0, 1, 128, 128, 128, 128, 128, 128,
5667 128, 128, 0, 128, 128, 128, 128, 128,
5668 0, 128, 1, 128, 128, 128, 128, 128,
5669 128, 0, 1, 128, 128, 128, 128, 128,
5670 0, 1, 2, 128, 128, 128, 128, 128,
5671 128, 128, 128, 0, 128, 128, 128, 128,
5672 0, 128, 128, 1, 128, 128, 128, 128,
5673 128, 0, 128, 1, 128, 128, 128, 128,
5674 0, 1, 128, 2, 128, 128, 128, 128,
5675 128, 128, 0, 1, 128, 128, 128, 128,
5676 0, 128, 1, 2, 128, 128, 128, 128,
5677 128, 0, 1, 2, 128, 128, 128, 128,
5678 0, 1, 2, 3, 128, 128, 128, 128,
5679 128, 128, 128, 128, 0, 128, 128, 128,
5680 0, 128, 128, 128, 1, 128, 128, 128,
5681 128, 0, 128, 128, 1, 128, 128, 128,
5682 0, 1, 128, 128, 2, 128, 128, 128,
5683 128, 128, 0, 128, 1, 128, 128, 128,
5684 0, 128, 1, 128, 2, 128, 128, 128,
5685 128, 0, 1, 128, 2, 128, 128, 128,
5686 0, 1, 2, 128, 3, 128, 128, 128,
5687 128, 128, 128, 0, 1, 128, 128, 128,
5688 0, 128, 128, 1, 2, 128, 128, 128,
5689 128, 0, 128, 1, 2, 128, 128, 128,
5690 0, 1, 128, 2, 3, 128, 128, 128,
5691 128, 128, 0, 1, 2, 128, 128, 128,
5692 0, 128, 1, 2, 3, 128, 128, 128,
5693 128, 0, 1, 2, 3, 128, 128, 128,
5694 0, 1, 2, 3, 4, 128, 128, 128,
5695 128, 128, 128, 128, 128, 0, 128, 128,
5696 0, 128, 128, 128, 128, 1, 128, 128,
5697 128, 0, 128, 128, 128, 1, 128, 128,
5698 0, 1, 128, 128, 128, 2, 128, 128,
5699 128, 128, 0, 128, 128, 1, 128, 128,
5700 0, 128, 1, 128, 128, 2, 128, 128,
5701 128, 0, 1, 128, 128, 2, 128, 128,
5702 0, 1, 2, 128, 128, 3, 128, 128,
5703 128, 128, 128, 0, 128, 1, 128, 128,
5704 0, 128, 128, 1, 128, 2, 128, 128,
5705 128, 0, 128, 1, 128, 2, 128, 128,
5706 0, 1, 128, 2, 128, 3, 128, 128,
5707 128, 128, 0, 1, 128, 2, 128, 128,
5708 0, 128, 1, 2, 128, 3, 128, 128,
5709 128, 0, 1, 2, 128, 3, 128, 128,
5710 0, 1, 2, 3, 128, 4, 128, 128,
5711 128, 128, 128, 128, 0, 1, 128, 128,
5712 0, 128, 128, 128, 1, 2, 128, 128,
5713 128, 0, 128, 128, 1, 2, 128, 128,
5714 0, 1, 128, 128, 2, 3, 128, 128,
5715 128, 128, 0, 128, 1, 2, 128, 128,
5716 0, 128, 1, 128, 2, 3, 128, 128,
5717 128, 0, 1, 128, 2, 3, 128, 128,
5718 0, 1, 2, 128, 3, 4, 128, 128,
5719 128, 128, 128, 0, 1, 2, 128, 128,
5720 0, 128, 128, 1, 2, 3, 128, 128,
5721 128, 0, 128, 1, 2, 3, 128, 128,
5722 0, 1, 128, 2, 3, 4, 128, 128,
5723 128, 128, 0, 1, 2, 3, 128, 128,
5724 0, 128, 1, 2, 3, 4, 128, 128,
5725 128, 0, 1, 2, 3, 4, 128, 128,
5726 0, 1, 2, 3, 4, 5, 128, 128,
5727 128, 128, 128, 128, 128, 128, 0, 128,
5728 0, 128, 128, 128, 128, 128, 1, 128,
5729 128, 0, 128, 128, 128, 128, 1, 128,
5730 0, 1, 128, 128, 128, 128, 2, 128,
5731 128, 128, 0, 128, 128, 128, 1, 128,
5732 0, 128, 1, 128, 128, 128, 2, 128,
5733 128, 0, 1, 128, 128, 128, 2, 128,
5734 0, 1, 2, 128, 128, 128, 3, 128,
5735 128, 128, 128, 0, 128, 128, 1, 128,
5736 0, 128, 128, 1, 128, 128, 2, 128,
5737 128, 0, 128, 1, 128, 128, 2, 128,
5738 0, 1, 128, 2, 128, 128, 3, 128,
5739 128, 128, 0, 1, 128, 128, 2, 128,
5740 0, 128, 1, 2, 128, 128, 3, 128,
5741 128, 0, 1, 2, 128, 128, 3, 128,
5742 0, 1, 2, 3, 128, 128, 4, 128,
5743 128, 128, 128, 128, 0, 128, 1, 128,
5744 0, 128, 128, 128, 1, 128, 2, 128,
5745 128, 0, 128, 128, 1, 128, 2, 128,
5746 0, 1, 128, 128, 2, 128, 3, 128,
5747 128, 128, 0, 128, 1, 128, 2, 128,
5748 0, 128, 1, 128, 2, 128, 3, 128,
5749 128, 0, 1, 128, 2, 128, 3, 128,
5750 0, 1, 2, 128, 3, 128, 4, 128,
5751 128, 128, 128, 0, 1, 128, 2, 128,
5752 0, 128, 128, 1, 2, 128, 3, 128,
5753 128, 0, 128, 1, 2, 128, 3, 128,
5754 0, 1, 128, 2, 3, 128, 4, 128,
5755 128, 128, 0, 1, 2, 128, 3, 128,
5756 0, 128, 1, 2, 3, 128, 4, 128,
5757 128, 0, 1, 2, 3, 128, 4, 128,
5758 0, 1, 2, 3, 4, 128, 5, 128,
5759 128, 128, 128, 128, 128, 0, 1, 128,
5760 0, 128, 128, 128, 128, 1, 2, 128,
5761 128, 0, 128, 128, 128, 1, 2, 128,
5762 0, 1, 128, 128, 128, 2, 3, 128,
5763 128, 128, 0, 128, 128, 1, 2, 128,
5764 0, 128, 1, 128, 128, 2, 3, 128,
5765 128, 0, 1, 128, 128, 2, 3, 128,
5766 0, 1, 2, 128, 128, 3, 4, 128,
5767 128, 128, 128, 0, 128, 1, 2, 128,
5768 0, 128, 128, 1, 128, 2, 3, 128,
5769 128, 0, 128, 1, 128, 2, 3, 128,
5770 0, 1, 128, 2, 128, 3, 4, 128,
5771 128, 128, 0, 1, 128, 2, 3, 128,
5772 0, 128, 1, 2, 128, 3, 4, 128,
5773 128, 0, 1, 2, 128, 3, 4, 128,
5774 0, 1, 2, 3, 128, 4, 5, 128,
5775 128, 128, 128, 128, 0, 1, 2, 128,
5776 0, 128, 128, 128, 1, 2, 3, 128,
5777 128, 0, 128, 128, 1, 2, 3, 128,
5778 0, 1, 128, 128, 2, 3, 4, 128,
5779 128, 128, 0, 128, 1, 2, 3, 128,
5780 0, 128, 1, 128, 2, 3, 4, 128,
5781 128, 0, 1, 128, 2, 3, 4, 128,
5782 0, 1, 2, 128, 3, 4, 5, 128,
5783 128, 128, 128, 0, 1, 2, 3, 128,
5784 0, 128, 128, 1, 2, 3, 4, 128,
5785 128, 0, 128, 1, 2, 3, 4, 128,
5786 0, 1, 128, 2, 3, 4, 5, 128,
5787 128, 128, 0, 1, 2, 3, 4, 128,
5788 0, 128, 1, 2, 3, 4, 5, 128,
5789 128, 0, 1, 2, 3, 4, 5, 128,
5790 0, 1, 2, 3, 4, 5, 6, 128,
5791 128, 128, 128, 128, 128, 128, 128, 0,
5792 0, 128, 128, 128, 128, 128, 128, 1,
5793 128, 0, 128, 128, 128, 128, 128, 1,
5794 0, 1, 128, 128, 128, 128, 128, 2,
5795 128, 128, 0, 128, 128, 128, 128, 1,
5796 0, 128, 1, 128, 128, 128, 128, 2,
5797 128, 0, 1, 128, 128, 128, 128, 2,
5798 0, 1, 2, 128, 128, 128, 128, 3,
5799 128, 128, 128, 0, 128, 128, 128, 1,
5800 0, 128, 128, 1, 128, 128, 128, 2,
5801 128, 0, 128, 1, 128, 128, 128, 2,
5802 0, 1, 128, 2, 128, 128, 128, 3,
5803 128, 128, 0, 1, 128, 128, 128, 2,
5804 0, 128, 1, 2, 128, 128, 128, 3,
5805 128, 0, 1, 2, 128, 128, 128, 3,
5806 0, 1, 2, 3, 128, 128, 128, 4,
5807 128, 128, 128, 128, 0, 128, 128, 1,
5808 0, 128, 128, 128, 1, 128, 128, 2,
5809 128, 0, 128, 128, 1, 128, 128, 2,
5810 0, 1, 128, 128, 2, 128, 128, 3,
5811 128, 128, 0, 128, 1, 128, 128, 2,
5812 0, 128, 1, 128, 2, 128, 128, 3,
5813 128, 0, 1, 128, 2, 128, 128, 3,
5814 0, 1, 2, 128, 3, 128, 128, 4,
5815 128, 128, 128, 0, 1, 128, 128, 2,
5816 0, 128, 128, 1, 2, 128, 128, 3,
5817 128, 0, 128, 1, 2, 128, 128, 3,
5818 0, 1, 128, 2, 3, 128, 128, 4,
5819 128, 128, 0, 1, 2, 128, 128, 3,
5820 0, 128, 1, 2, 3, 128, 128, 4,
5821 128, 0, 1, 2, 3, 128, 128, 4,
5822 0, 1, 2, 3, 4, 128, 128, 5,
5823 128, 128, 128, 128, 128, 0, 128, 1,
5824 0, 128, 128, 128, 128, 1, 128, 2,
5825 128, 0, 128, 128, 128, 1, 128, 2,
5826 0, 1, 128, 128, 128, 2, 128, 3,
5827 128, 128, 0, 128, 128, 1, 128, 2,
5828 0, 128, 1, 128, 128, 2, 128, 3,
5829 128, 0, 1, 128, 128, 2, 128, 3,
5830 0, 1, 2, 128, 128, 3, 128, 4,
5831 128, 128, 128, 0, 128, 1, 128, 2,
5832 0, 128, 128, 1, 128, 2, 128, 3,
5833 128, 0, 128, 1, 128, 2, 128, 3,
5834 0, 1, 128, 2, 128, 3, 128, 4,
5835 128, 128, 0, 1, 128, 2, 128, 3,
5836 0, 128, 1, 2, 128, 3, 128, 4,
5837 128, 0, 1, 2, 128, 3, 128, 4,
5838 0, 1, 2, 3, 128, 4, 128, 5,
5839 128, 128, 128, 128, 0, 1, 128, 2,
5840 0, 128, 128, 128, 1, 2, 128, 3,
5841 128, 0, 128, 128, 1, 2, 128, 3,
5842 0, 1, 128, 128, 2, 3, 128, 4,
5843 128, 128, 0, 128, 1, 2, 128, 3,
5844 0, 128, 1, 128, 2, 3, 128, 4,
5845 128, 0, 1, 128, 2, 3, 128, 4,
5846 0, 1, 2, 128, 3, 4, 128, 5,
5847 128, 128, 128, 0, 1, 2, 128, 3,
5848 0, 128, 128, 1, 2, 3, 128, 4,
5849 128, 0, 128, 1, 2, 3, 128, 4,
5850 0, 1, 128, 2, 3, 4, 128, 5,
5851 128, 128, 0, 1, 2, 3, 128, 4,
5852 0, 128, 1, 2, 3, 4, 128, 5,
5853 128, 0, 1, 2, 3, 4, 128, 5,
5854 0, 1, 2, 3, 4, 5, 128, 6,
5855 128, 128, 128, 128, 128, 128, 0, 1,
5856 0, 128, 128, 128, 128, 128, 1, 2,
5857 128, 0, 128, 128, 128, 128, 1, 2,
5858 0, 1, 128, 128, 128, 128, 2, 3,
5859 128, 128, 0, 128, 128, 128, 1, 2,
5860 0, 128, 1, 128, 128, 128, 2, 3,
5861 128, 0, 1, 128, 128, 128, 2, 3,
5862 0, 1, 2, 128, 128, 128, 3, 4,
5863 128, 128, 128, 0, 128, 128, 1, 2,
5864 0, 128, 128, 1, 128, 128, 2, 3,
5865 128, 0, 128, 1, 128, 128, 2, 3,
5866 0, 1, 128, 2, 128, 128, 3, 4,
5867 128, 128, 0, 1, 128, 128, 2, 3,
5868 0, 128, 1, 2, 128, 128, 3, 4,
5869 128, 0, 1, 2, 128, 128, 3, 4,
5870 0, 1, 2, 3, 128, 128, 4, 5,
5871 128, 128, 128, 128, 0, 128, 1, 2,
5872 0, 128, 128, 128, 1, 128, 2, 3,
5873 128, 0, 128, 128, 1, 128, 2, 3,
5874 0, 1, 128, 128, 2, 128, 3, 4,
5875 128, 128, 0, 128, 1, 128, 2, 3,
5876 0, 128, 1, 128, 2, 128, 3, 4,
5877 128, 0, 1, 128, 2, 128, 3, 4,
5878 0, 1, 2, 128, 3, 128, 4, 5,
5879 128, 128, 128, 0, 1, 128, 2, 3,
5880 0, 128, 128, 1, 2, 128, 3, 4,
5881 128, 0, 128, 1, 2, 128, 3, 4,
5882 0, 1, 128, 2, 3, 128, 4, 5,
5883 128, 128, 0, 1, 2, 128, 3, 4,
5884 0, 128, 1, 2, 3, 128, 4, 5,
5885 128, 0, 1, 2, 3, 128, 4, 5,
5886 0, 1, 2, 3, 4, 128, 5, 6,
5887 128, 128, 128, 128, 128, 0, 1, 2,
5888 0, 128, 128, 128, 128, 1, 2, 3,
5889 128, 0, 128, 128, 128, 1, 2, 3,
5890 0, 1, 128, 128, 128, 2, 3, 4,
5891 128, 128, 0, 128, 128, 1, 2, 3,
5892 0, 128, 1, 128, 128, 2, 3, 4,
5893 128, 0, 1, 128, 128, 2, 3, 4,
5894 0, 1, 2, 128, 128, 3, 4, 5,
5895 128, 128, 128, 0, 128, 1, 2, 3,
5896 0, 128, 128, 1, 128, 2, 3, 4,
5897 128, 0, 128, 1, 128, 2, 3, 4,
5898 0, 1, 128, 2, 128, 3, 4, 5,
5899 128, 128, 0, 1, 128, 2, 3, 4,
5900 0, 128, 1, 2, 128, 3, 4, 5,
5901 128, 0, 1, 2, 128, 3, 4, 5,
5902 0, 1, 2, 3, 128, 4, 5, 6,
5903 128, 128, 128, 128, 0, 1, 2, 3,
5904 0, 128, 128, 128, 1, 2, 3, 4,
5905 128, 0, 128, 128, 1, 2, 3, 4,
5906 0, 1, 128, 128, 2, 3, 4, 5,
5907 128, 128, 0, 128, 1, 2, 3, 4,
5908 0, 128, 1, 128, 2, 3, 4, 5,
5909 128, 0, 1, 128, 2, 3, 4, 5,
5910 0, 1, 2, 128, 3, 4, 5, 6,
5911 128, 128, 128, 0, 1, 2, 3, 4,
5912 0, 128, 128, 1, 2, 3, 4, 5,
5913 128, 0, 128, 1, 2, 3, 4, 5,
5914 0, 1, 128, 2, 3, 4, 5, 6,
5915 128, 128, 0, 1, 2, 3, 4, 5,
5916 0, 128, 1, 2, 3, 4, 5, 6,
5917 128, 0, 1, 2, 3, 4, 5, 6,
5918 0, 1, 2, 3, 4, 5, 6, 7};
5919 return LoadU(du8, table + mask_bits * 8);
5925template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
5931 detail::IndicesForExpandFromBits<N>(mask_bits);
5936template <
typename T, HWY_IF_T_SIZE(T, 1)>
5940 const Half<
decltype(du)> duh;
5944 const uint64_t maskL = mask_bits & 0xFF;
5945 const uint64_t maskH = mask_bits >> 8;
5952 alignas(16)
static constexpr uint8_t iota[32] = {
5953 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
5954 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128,
5955 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
5958 const VFromD<
decltype(duh)> vH =
5961 const VFromD<
decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL);
5962 const VFromD<
decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH);
5969template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 2)>
5974 const Rebind<uint8_t,
decltype(
d)> du8;
5979 alignas(16)
static constexpr uint8_t table[2048] = {
5981 128, 128, 128, 128, 128, 128, 128, 128,
5982 0, 128, 128, 128, 128, 128, 128, 128,
5983 128, 0, 128, 128, 128, 128, 128, 128,
5984 0, 2, 128, 128, 128, 128, 128, 128,
5985 128, 128, 0, 128, 128, 128, 128, 128,
5986 0, 128, 2, 128, 128, 128, 128, 128,
5987 128, 0, 2, 128, 128, 128, 128, 128,
5988 0, 2, 4, 128, 128, 128, 128, 128,
5989 128, 128, 128, 0, 128, 128, 128, 128,
5990 0, 128, 128, 2, 128, 128, 128, 128,
5991 128, 0, 128, 2, 128, 128, 128, 128,
5992 0, 2, 128, 4, 128, 128, 128, 128,
5993 128, 128, 0, 2, 128, 128, 128, 128,
5994 0, 128, 2, 4, 128, 128, 128, 128,
5995 128, 0, 2, 4, 128, 128, 128, 128,
5996 0, 2, 4, 6, 128, 128, 128, 128,
5997 128, 128, 128, 128, 0, 128, 128, 128,
5998 0, 128, 128, 128, 2, 128, 128, 128,
5999 128, 0, 128, 128, 2, 128, 128, 128,
6000 0, 2, 128, 128, 4, 128, 128, 128,
6001 128, 128, 0, 128, 2, 128, 128, 128,
6002 0, 128, 2, 128, 4, 128, 128, 128,
6003 128, 0, 2, 128, 4, 128, 128, 128,
6004 0, 2, 4, 128, 6, 128, 128, 128,
6005 128, 128, 128, 0, 2, 128, 128, 128,
6006 0, 128, 128, 2, 4, 128, 128, 128,
6007 128, 0, 128, 2, 4, 128, 128, 128,
6008 0, 2, 128, 4, 6, 128, 128, 128,
6009 128, 128, 0, 2, 4, 128, 128, 128,
6010 0, 128, 2, 4, 6, 128, 128, 128,
6011 128, 0, 2, 4, 6, 128, 128, 128,
6012 0, 2, 4, 6, 8, 128, 128, 128,
6013 128, 128, 128, 128, 128, 0, 128, 128,
6014 0, 128, 128, 128, 128, 2, 128, 128,
6015 128, 0, 128, 128, 128, 2, 128, 128,
6016 0, 2, 128, 128, 128, 4, 128, 128,
6017 128, 128, 0, 128, 128, 2, 128, 128,
6018 0, 128, 2, 128, 128, 4, 128, 128,
6019 128, 0, 2, 128, 128, 4, 128, 128,
6020 0, 2, 4, 128, 128, 6, 128, 128,
6021 128, 128, 128, 0, 128, 2, 128, 128,
6022 0, 128, 128, 2, 128, 4, 128, 128,
6023 128, 0, 128, 2, 128, 4, 128, 128,
6024 0, 2, 128, 4, 128, 6, 128, 128,
6025 128, 128, 0, 2, 128, 4, 128, 128,
6026 0, 128, 2, 4, 128, 6, 128, 128,
6027 128, 0, 2, 4, 128, 6, 128, 128,
6028 0, 2, 4, 6, 128, 8, 128, 128,
6029 128, 128, 128, 128, 0, 2, 128, 128,
6030 0, 128, 128, 128, 2, 4, 128, 128,
6031 128, 0, 128, 128, 2, 4, 128, 128,
6032 0, 2, 128, 128, 4, 6, 128, 128,
6033 128, 128, 0, 128, 2, 4, 128, 128,
6034 0, 128, 2, 128, 4, 6, 128, 128,
6035 128, 0, 2, 128, 4, 6, 128, 128,
6036 0, 2, 4, 128, 6, 8, 128, 128,
6037 128, 128, 128, 0, 2, 4, 128, 128,
6038 0, 128, 128, 2, 4, 6, 128, 128,
6039 128, 0, 128, 2, 4, 6, 128, 128,
6040 0, 2, 128, 4, 6, 8, 128, 128,
6041 128, 128, 0, 2, 4, 6, 128, 128,
6042 0, 128, 2, 4, 6, 8, 128, 128,
6043 128, 0, 2, 4, 6, 8, 128, 128,
6044 0, 2, 4, 6, 8, 10, 128, 128,
6045 128, 128, 128, 128, 128, 128, 0, 128,
6046 0, 128, 128, 128, 128, 128, 2, 128,
6047 128, 0, 128, 128, 128, 128, 2, 128,
6048 0, 2, 128, 128, 128, 128, 4, 128,
6049 128, 128, 0, 128, 128, 128, 2, 128,
6050 0, 128, 2, 128, 128, 128, 4, 128,
6051 128, 0, 2, 128, 128, 128, 4, 128,
6052 0, 2, 4, 128, 128, 128, 6, 128,
6053 128, 128, 128, 0, 128, 128, 2, 128,
6054 0, 128, 128, 2, 128, 128, 4, 128,
6055 128, 0, 128, 2, 128, 128, 4, 128,
6056 0, 2, 128, 4, 128, 128, 6, 128,
6057 128, 128, 0, 2, 128, 128, 4, 128,
6058 0, 128, 2, 4, 128, 128, 6, 128,
6059 128, 0, 2, 4, 128, 128, 6, 128,
6060 0, 2, 4, 6, 128, 128, 8, 128,
6061 128, 128, 128, 128, 0, 128, 2, 128,
6062 0, 128, 128, 128, 2, 128, 4, 128,
6063 128, 0, 128, 128, 2, 128, 4, 128,
6064 0, 2, 128, 128, 4, 128, 6, 128,
6065 128, 128, 0, 128, 2, 128, 4, 128,
6066 0, 128, 2, 128, 4, 128, 6, 128,
6067 128, 0, 2, 128, 4, 128, 6, 128,
6068 0, 2, 4, 128, 6, 128, 8, 128,
6069 128, 128, 128, 0, 2, 128, 4, 128,
6070 0, 128, 128, 2, 4, 128, 6, 128,
6071 128, 0, 128, 2, 4, 128, 6, 128,
6072 0, 2, 128, 4, 6, 128, 8, 128,
6073 128, 128, 0, 2, 4, 128, 6, 128,
6074 0, 128, 2, 4, 6, 128, 8, 128,
6075 128, 0, 2, 4, 6, 128, 8, 128,
6076 0, 2, 4, 6, 8, 128, 10, 128,
6077 128, 128, 128, 128, 128, 0, 2, 128,
6078 0, 128, 128, 128, 128, 2, 4, 128,
6079 128, 0, 128, 128, 128, 2, 4, 128,
6080 0, 2, 128, 128, 128, 4, 6, 128,
6081 128, 128, 0, 128, 128, 2, 4, 128,
6082 0, 128, 2, 128, 128, 4, 6, 128,
6083 128, 0, 2, 128, 128, 4, 6, 128,
6084 0, 2, 4, 128, 128, 6, 8, 128,
6085 128, 128, 128, 0, 128, 2, 4, 128,
6086 0, 128, 128, 2, 128, 4, 6, 128,
6087 128, 0, 128, 2, 128, 4, 6, 128,
6088 0, 2, 128, 4, 128, 6, 8, 128,
6089 128, 128, 0, 2, 128, 4, 6, 128,
6090 0, 128, 2, 4, 128, 6, 8, 128,
6091 128, 0, 2, 4, 128, 6, 8, 128,
6092 0, 2, 4, 6, 128, 8, 10, 128,
6093 128, 128, 128, 128, 0, 2, 4, 128,
6094 0, 128, 128, 128, 2, 4, 6, 128,
6095 128, 0, 128, 128, 2, 4, 6, 128,
6096 0, 2, 128, 128, 4, 6, 8, 128,
6097 128, 128, 0, 128, 2, 4, 6, 128,
6098 0, 128, 2, 128, 4, 6, 8, 128,
6099 128, 0, 2, 128, 4, 6, 8, 128,
6100 0, 2, 4, 128, 6, 8, 10, 128,
6101 128, 128, 128, 0, 2, 4, 6, 128,
6102 0, 128, 128, 2, 4, 6, 8, 128,
6103 128, 0, 128, 2, 4, 6, 8, 128,
6104 0, 2, 128, 4, 6, 8, 10, 128,
6105 128, 128, 0, 2, 4, 6, 8, 128,
6106 0, 128, 2, 4, 6, 8, 10, 128,
6107 128, 0, 2, 4, 6, 8, 10, 128,
6108 0, 2, 4, 6, 8, 10, 12, 128,
6109 128, 128, 128, 128, 128, 128, 128, 0,
6110 0, 128, 128, 128, 128, 128, 128, 2,
6111 128, 0, 128, 128, 128, 128, 128, 2,
6112 0, 2, 128, 128, 128, 128, 128, 4,
6113 128, 128, 0, 128, 128, 128, 128, 2,
6114 0, 128, 2, 128, 128, 128, 128, 4,
6115 128, 0, 2, 128, 128, 128, 128, 4,
6116 0, 2, 4, 128, 128, 128, 128, 6,
6117 128, 128, 128, 0, 128, 128, 128, 2,
6118 0, 128, 128, 2, 128, 128, 128, 4,
6119 128, 0, 128, 2, 128, 128, 128, 4,
6120 0, 2, 128, 4, 128, 128, 128, 6,
6121 128, 128, 0, 2, 128, 128, 128, 4,
6122 0, 128, 2, 4, 128, 128, 128, 6,
6123 128, 0, 2, 4, 128, 128, 128, 6,
6124 0, 2, 4, 6, 128, 128, 128, 8,
6125 128, 128, 128, 128, 0, 128, 128, 2,
6126 0, 128, 128, 128, 2, 128, 128, 4,
6127 128, 0, 128, 128, 2, 128, 128, 4,
6128 0, 2, 128, 128, 4, 128, 128, 6,
6129 128, 128, 0, 128, 2, 128, 128, 4,
6130 0, 128, 2, 128, 4, 128, 128, 6,
6131 128, 0, 2, 128, 4, 128, 128, 6,
6132 0, 2, 4, 128, 6, 128, 128, 8,
6133 128, 128, 128, 0, 2, 128, 128, 4,
6134 0, 128, 128, 2, 4, 128, 128, 6,
6135 128, 0, 128, 2, 4, 128, 128, 6,
6136 0, 2, 128, 4, 6, 128, 128, 8,
6137 128, 128, 0, 2, 4, 128, 128, 6,
6138 0, 128, 2, 4, 6, 128, 128, 8,
6139 128, 0, 2, 4, 6, 128, 128, 8,
6140 0, 2, 4, 6, 8, 128, 128, 10,
6141 128, 128, 128, 128, 128, 0, 128, 2,
6142 0, 128, 128, 128, 128, 2, 128, 4,
6143 128, 0, 128, 128, 128, 2, 128, 4,
6144 0, 2, 128, 128, 128, 4, 128, 6,
6145 128, 128, 0, 128, 128, 2, 128, 4,
6146 0, 128, 2, 128, 128, 4, 128, 6,
6147 128, 0, 2, 128, 128, 4, 128, 6,
6148 0, 2, 4, 128, 128, 6, 128, 8,
6149 128, 128, 128, 0, 128, 2, 128, 4,
6150 0, 128, 128, 2, 128, 4, 128, 6,
6151 128, 0, 128, 2, 128, 4, 128, 6,
6152 0, 2, 128, 4, 128, 6, 128, 8,
6153 128, 128, 0, 2, 128, 4, 128, 6,
6154 0, 128, 2, 4, 128, 6, 128, 8,
6155 128, 0, 2, 4, 128, 6, 128, 8,
6156 0, 2, 4, 6, 128, 8, 128, 10,
6157 128, 128, 128, 128, 0, 2, 128, 4,
6158 0, 128, 128, 128, 2, 4, 128, 6,
6159 128, 0, 128, 128, 2, 4, 128, 6,
6160 0, 2, 128, 128, 4, 6, 128, 8,
6161 128, 128, 0, 128, 2, 4, 128, 6,
6162 0, 128, 2, 128, 4, 6, 128, 8,
6163 128, 0, 2, 128, 4, 6, 128, 8,
6164 0, 2, 4, 128, 6, 8, 128, 10,
6165 128, 128, 128, 0, 2, 4, 128, 6,
6166 0, 128, 128, 2, 4, 6, 128, 8,
6167 128, 0, 128, 2, 4, 6, 128, 8,
6168 0, 2, 128, 4, 6, 8, 128, 10,
6169 128, 128, 0, 2, 4, 6, 128, 8,
6170 0, 128, 2, 4, 6, 8, 128, 10,
6171 128, 0, 2, 4, 6, 8, 128, 10,
6172 0, 2, 4, 6, 8, 10, 128, 12,
6173 128, 128, 128, 128, 128, 128, 0, 2,
6174 0, 128, 128, 128, 128, 128, 2, 4,
6175 128, 0, 128, 128, 128, 128, 2, 4,
6176 0, 2, 128, 128, 128, 128, 4, 6,
6177 128, 128, 0, 128, 128, 128, 2, 4,
6178 0, 128, 2, 128, 128, 128, 4, 6,
6179 128, 0, 2, 128, 128, 128, 4, 6,
6180 0, 2, 4, 128, 128, 128, 6, 8,
6181 128, 128, 128, 0, 128, 128, 2, 4,
6182 0, 128, 128, 2, 128, 128, 4, 6,
6183 128, 0, 128, 2, 128, 128, 4, 6,
6184 0, 2, 128, 4, 128, 128, 6, 8,
6185 128, 128, 0, 2, 128, 128, 4, 6,
6186 0, 128, 2, 4, 128, 128, 6, 8,
6187 128, 0, 2, 4, 128, 128, 6, 8,
6188 0, 2, 4, 6, 128, 128, 8, 10,
6189 128, 128, 128, 128, 0, 128, 2, 4,
6190 0, 128, 128, 128, 2, 128, 4, 6,
6191 128, 0, 128, 128, 2, 128, 4, 6,
6192 0, 2, 128, 128, 4, 128, 6, 8,
6193 128, 128, 0, 128, 2, 128, 4, 6,
6194 0, 128, 2, 128, 4, 128, 6, 8,
6195 128, 0, 2, 128, 4, 128, 6, 8,
6196 0, 2, 4, 128, 6, 128, 8, 10,
6197 128, 128, 128, 0, 2, 128, 4, 6,
6198 0, 128, 128, 2, 4, 128, 6, 8,
6199 128, 0, 128, 2, 4, 128, 6, 8,
6200 0, 2, 128, 4, 6, 128, 8, 10,
6201 128, 128, 0, 2, 4, 128, 6, 8,
6202 0, 128, 2, 4, 6, 128, 8, 10,
6203 128, 0, 2, 4, 6, 128, 8, 10,
6204 0, 2, 4, 6, 8, 128, 10, 12,
6205 128, 128, 128, 128, 128, 0, 2, 4,
6206 0, 128, 128, 128, 128, 2, 4, 6,
6207 128, 0, 128, 128, 128, 2, 4, 6,
6208 0, 2, 128, 128, 128, 4, 6, 8,
6209 128, 128, 0, 128, 128, 2, 4, 6,
6210 0, 128, 2, 128, 128, 4, 6, 8,
6211 128, 0, 2, 128, 128, 4, 6, 8,
6212 0, 2, 4, 128, 128, 6, 8, 10,
6213 128, 128, 128, 0, 128, 2, 4, 6,
6214 0, 128, 128, 2, 128, 4, 6, 8,
6215 128, 0, 128, 2, 128, 4, 6, 8,
6216 0, 2, 128, 4, 128, 6, 8, 10,
6217 128, 128, 0, 2, 128, 4, 6, 8,
6218 0, 128, 2, 4, 128, 6, 8, 10,
6219 128, 0, 2, 4, 128, 6, 8, 10,
6220 0, 2, 4, 6, 128, 8, 10, 12,
6221 128, 128, 128, 128, 0, 2, 4, 6,
6222 0, 128, 128, 128, 2, 4, 6, 8,
6223 128, 0, 128, 128, 2, 4, 6, 8,
6224 0, 2, 128, 128, 4, 6, 8, 10,
6225 128, 128, 0, 128, 2, 4, 6, 8,
6226 0, 128, 2, 128, 4, 6, 8, 10,
6227 128, 0, 2, 128, 4, 6, 8, 10,
6228 0, 2, 4, 128, 6, 8, 10, 12,
6229 128, 128, 128, 0, 2, 4, 6, 8,
6230 0, 128, 128, 2, 4, 6, 8, 10,
6231 128, 0, 128, 2, 4, 6, 8, 10,
6232 0, 2, 128, 4, 6, 8, 10, 12,
6233 128, 128, 0, 2, 4, 6, 8, 10,
6234 0, 128, 2, 4, 6, 8, 10, 12,
6235 128, 0, 2, 4, 6, 8, 10, 12,
6236 0, 2, 4, 6, 8, 10, 12, 14};
6239 const Twice<
decltype(du8)> du8x2;
6248 Set(du,
static_cast<uint16_t
>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
6252template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 4)>
6253HWY_API Vec128<T, N>
Expand(Vec128<T, N> v, Mask128<T, N> mask) {
6259 alignas(16)
static constexpr uint32_t packed_array[16] = {
6261 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
6262 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
6263 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
6266 const Vec128<uint32_t, N> packed =
Set(du, packed_array[mask_bits]);
6267 alignas(16)
static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
6268 Vec128<uint32_t, N>
indices = packed >>
Load(du, shifts);
6272 const Vec128<uint32_t, N> expand =
6278template <
typename T, HWY_IF_T_SIZE(T, 8)>
6285template <
typename T>
6291template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6306#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
6307 HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
6308 HWY_TARGET != HWY_SVE2_128
6318#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
6319#ifdef HWY_NATIVE_REVERSE2_8
6320#undef HWY_NATIVE_REVERSE2_8
6322#define HWY_NATIVE_REVERSE2_8
6325#undef HWY_PREFER_ROTATE
6328#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
6329 HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
6330#define HWY_PREFER_ROTATE 1
6332#define HWY_PREFER_ROTATE 0
6335template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6338#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
6342 const VFromD<D> shuffle =
Dup128VecFromValues(
d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
6343 11, 10, 13, 12, 15, 14);
6348template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6350#if HWY_PREFER_ROTATE
6356 du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
6361template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6363#if HWY_PREFER_ROTATE
6364 const Repartition<uint32_t, D> du32;
6369 du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
6378#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
6379#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
6380#undef HWY_NATIVE_REVERSE_LANE_BYTES
6382#define HWY_NATIVE_REVERSE_LANE_BYTES
6385template <
class V, HWY_IF_T_SIZE_V(V, 2)>
6392template <
class V, HWY_IF_T_SIZE_V(V, 4)>
6399template <
class V, HWY_IF_T_SIZE_V(V, 8)>
6415#undef HWY_REVERSE_BITS_MIN_BYTES
6416#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
6417 HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
6418#define HWY_REVERSE_BITS_MIN_BYTES 2
6420#define HWY_REVERSE_BITS_MIN_BYTES 1
6423#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
6424#ifdef HWY_NATIVE_REVERSE_BITS_UI8
6425#undef HWY_NATIVE_REVERSE_BITS_UI8
6427#define HWY_NATIVE_REVERSE_BITS_UI8
6432template <
int kShiftAmt,
int kShrResultMask,
class V,
6437#if HWY_REVERSE_BITS_MIN_BYTES == 2
6443 const auto v_to_shift =
BitCast(d_shift, v);
6444 const auto shl_result =
BitCast(
d, ShiftLeft<kShiftAmt>(v_to_shift));
6445 const auto shr_result =
BitCast(
d, ShiftRight<kShiftAmt>(v_to_shift));
6446 const auto shr_result_mask =
6447 BitCast(
d,
Set(du,
static_cast<uint8_t
>(kShrResultMask)));
6448 return Or(
And(shr_result, shr_result_mask),
6449 AndNot(shr_result_mask, shl_result));
6452#if HWY_REVERSE_BITS_MIN_BYTES == 2
6453template <
int kShiftAmt,
int kShrResultMask,
class V,
6463template <
class V, HWY_IF_T_SIZE_V(V, 1)>
6465 auto result = detail::UI8ReverseBitsStep<1, 0x55>(v);
6466 result = detail::UI8ReverseBitsStep<2, 0x33>(result);
6467 result = detail::UI8ReverseBitsStep<4, 0x0F>(result);
6473#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
6474#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
6475#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
6477#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
6491#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
6492#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6493#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6495#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6498#if HWY_TARGET != HWY_SCALAR
6505 const uint32_t x0) {
6506#if HWY_TARGET == HWY_RVV
6507 constexpr int kPow2 =
d.Pow2();
6508 constexpr int kLoadPow2 =
HWY_MAX(kPow2, -1);
6509 const ScalableTag<uint32_t, kLoadPow2> d_load;
6511 constexpr size_t kMaxBytes =
d.MaxBytes();
6512#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
6513 constexpr size_t kMinLanesToLoad = 2;
6515 constexpr size_t kMinLanesToLoad = 4;
6517 constexpr size_t kNumToLoad =
6518 HWY_MAX(kMaxBytes /
sizeof(uint32_t), kMinLanesToLoad);
6519 const CappedTag<uint32_t, kNumToLoad> d_load;
6529#if HWY_TARGET != HWY_SCALAR
6553HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(
const uint32_t idx3,
6554 const uint32_t idx2,
6555 const uint32_t idx1,
6556 const uint32_t idx0) {
6557#if HWY_IS_LITTLE_ENDIAN
6558 return static_cast<uint32_t
>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) |
6561 return static_cast<uint32_t
>(idx3 | (idx2 << 8) | (idx1 << 16) |
6567HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D
d,
const uint32_t idx3,
6568 const uint32_t idx2,
6569 const uint32_t idx1,
6570 const uint32_t idx0) {
6571#if HWY_TARGET == HWY_RVV
6572 const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
6574 const Repartition<uint32_t, D> du32;
6578 d,
Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
6581#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
6582 HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128
6583#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
6585#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
6587template <
class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
6588HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V
idx) {
6589 const DFromV<decltype(v)> d;
6590 const Repartition<u
int8_t, decltype(d)> du8;
6591 return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8,
idx)));
6594template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6595HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D
d,
const uint32_t idx3,
6596 const uint32_t idx2,
6597 const uint32_t idx1,
6598 const uint32_t idx0) {
6600 const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0);
6602 du32,
static_cast<uint32_t
>(idx3210 + 0x0C0C0C0C),
6603 static_cast<uint32_t
>(idx3210 + 0x08080808),
6604 static_cast<uint32_t
>(idx3210 + 0x04040404),
6605 static_cast<uint32_t
>(idx3210));
6609template <
class D, HWY_IF_T_SIZE_D(D, 2)>
6610HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D
d,
const uint32_t idx3,
6611 const uint32_t idx2,
6612 const uint32_t idx1,
6613 const uint32_t idx0) {
6615#if HWY_IS_LITTLE_ENDIAN
6616 const uint32_t idx10 =
static_cast<uint32_t
>((idx1 << 16) | idx0);
6617 const uint32_t idx32 =
static_cast<uint32_t
>((idx3 << 16) | idx2);
6618 constexpr uint32_t kLaneByteOffsets{0x01000100};
6620 const uint32_t idx10 =
static_cast<uint32_t
>(idx1 | (idx0 << 16));
6621 const uint32_t idx32 =
static_cast<uint32_t
>(idx3 | (idx2 << 16));
6622 constexpr uint32_t kLaneByteOffsets{0x00010001};
6624 constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u};
6627 du32,
static_cast<uint32_t
>(idx32 * 0x0202u + kHiLaneByteOffsets),
6628 static_cast<uint32_t
>(idx10 * 0x0202u + kHiLaneByteOffsets),
6629 static_cast<uint32_t
>(idx32 * 0x0202u + kLaneByteOffsets),
6630 static_cast<uint32_t
>(idx10 * 0x0202u + kLaneByteOffsets));
6634template <
class D, HWY_IF_T_SIZE_D(D, 4)>
6635HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D
d,
const uint32_t idx3,
6636 const uint32_t idx2,
6637 const uint32_t idx1,
6638 const uint32_t idx0) {
6640#if HWY_IS_LITTLE_ENDIAN
6641 constexpr uint32_t kLaneByteOffsets{0x03020100};
6643 constexpr uint32_t kLaneByteOffsets{0x00010203};
6647 du32,
static_cast<uint32_t
>(idx3 * 0x04040404u + kLaneByteOffsets),
6648 static_cast<uint32_t
>(idx2 * 0x04040404u + kLaneByteOffsets),
6649 static_cast<uint32_t
>(idx1 * 0x04040404u + kLaneByteOffsets),
6650 static_cast<uint32_t
>(idx0 * 0x04040404u + kLaneByteOffsets));
6655template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6657 const uint32_t idx2,
6658 const uint32_t idx1,
6659 const uint32_t idx0) {
6660 return TblLookupPer4LaneBlkU8IdxInBlk(
d, idx3, idx2, idx1, idx0);
6663#if HWY_TARGET == HWY_RVV
6664template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6666 const uint32_t idx2,
6667 const uint32_t idx1,
6668 const uint32_t idx0) {
6669 const Rebind<uint8_t,
decltype(
d)> du8;
6671 TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0));
6674template <
class D, HWY_IF_T_SIZE_D(D, 2)>
6676 const uint32_t idx2,
6677 const uint32_t idx1,
6678 const uint32_t idx0) {
6679 const uint16_t u16_idx0 =
static_cast<uint16_t
>(idx0);
6680 const uint16_t u16_idx1 =
static_cast<uint16_t
>(idx1);
6681 const uint16_t u16_idx2 =
static_cast<uint16_t
>(idx2);
6682 const uint16_t u16_idx3 =
static_cast<uint16_t
>(idx3);
6683#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
6684 constexpr size_t kMinLanesToLoad = 4;
6686 constexpr size_t kMinLanesToLoad = 8;
6689 const CappedTag<uint16_t, kNumToLoad> d_load;
6692 u16_idx0, u16_idx1, u16_idx2, u16_idx3));
6695template <
class D, HWY_IF_T_SIZE_D(D, 4)>
6697 const uint32_t idx2,
6698 const uint32_t idx1,
6699 const uint32_t idx0) {
6703template <
class D, HWY_IF_T_SIZE_D(D, 8)>
6705 const uint32_t idx2,
6706 const uint32_t idx1,
6707 const uint32_t idx0) {
6709 const Rebind<uint32_t,
decltype(
d)> du32;
6715template <
class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
6716HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D
d,
const uint32_t idx3,
6717 const uint32_t idx2,
6718 const uint32_t idx1,
6719 const uint32_t idx0) {
6721 using TU =
TFromD<
decltype(du)>;
6722 auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0);
6726 idx_in_blk =
And(idx_in_blk,
Set(du,
static_cast<TU
>(kN - 1)));
6729#if HWY_TARGET == HWY_RVV
6730 const auto blk_offsets = AndS(
Iota0(du),
static_cast<TU
>(~TU{3}));
6732 const auto blk_offsets =
6733 And(
Iota(du, TU{0}),
Set(du,
static_cast<TU
>(~TU{3})));
6738template <
class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
6743#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE
6746HWY_INLINE V TblLookupPer4LaneBlkShuf(V v,
size_t idx3210) {
6748 const uint32_t idx3 =
static_cast<uint32_t
>((idx3210 >> 6) & 3);
6749 const uint32_t idx2 =
static_cast<uint32_t
>((idx3210 >> 4) & 3);
6750 const uint32_t idx1 =
static_cast<uint32_t
>((idx3210 >> 2) & 3);
6751 const uint32_t idx0 =
static_cast<uint32_t
>(idx3210 & 3);
6752 const auto idx = TblLookupPer4LaneBlkShufIdx(
d, idx3, idx2, idx1, idx0);
6753 return Per4LaneBlkShufDoTblLookup(v, idx);
6759template <
size_t kIdx3210,
size_t kLaneSize,
size_t kVectSize,
class V>
6764 return TblLookupPer4LaneBlkShuf(v, kIdx3210);
6777template <
size_t kLaneSize,
class V>
6787template <
size_t kLaneSize,
class V>
6807 const auto vw = Per4LaneBlockShufCastToWide(
6817 const auto vw = Per4LaneBlockShufCastToWide(
6819 const DFromV<
decltype(vw)> dw;
6823#if HWY_MAX_BYTES >= 32
6824template <
class V, HWY_IF_T_SIZE_V(V, 8)>
6830template <
class V, HWY_IF_LANES_D(DFromV<V>, 4),
6831 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
6832HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> , V v) {
6833 const DFromV<decltype(v)> d;
6834 return InterleaveLower(d, v, v);
6837template <
class V, HWY_IF_T_SIZE_V(V, 4)>
6843template <
class V, HWY_IF_LANES_D(DFromV<V>, 4)>
6860template <
class V, HWY_IF_LANES_D(DFromV<V>, 4)>
6876 const auto vw = Per4LaneBlockShufCastToWide(
6886template <
class V, HWY_IF_T_SIZE_V(V, 4)>
6892template <
size_t kIdx3210,
class V>
6902template <
size_t kIdx3,
size_t kIdx2,
size_t kIdx1,
size_t kIdx0,
class V,
6905 static_assert(kIdx0 <= 3,
"kIdx0 <= 3 must be true");
6906 static_assert(kIdx1 <= 3,
"kIdx1 <= 3 must be true");
6907 static_assert(kIdx2 <= 3,
"kIdx2 <= 3 must be true");
6908 static_assert(kIdx3 <= 3,
"kIdx3 <= 3 must be true");
6913#if HWY_TARGET != HWY_SCALAR
6914template <
size_t kIdx3,
size_t kIdx2,
size_t kIdx1,
size_t kIdx0,
class V,
6917 static_assert(kIdx0 <= 3,
"kIdx0 <= 3 must be true");
6918 static_assert(kIdx1 <= 3,
"kIdx1 <= 3 must be true");
6919 static_assert(kIdx2 <= 3,
"kIdx2 <= 3 must be true");
6920 static_assert(kIdx3 <= 3,
"kIdx3 <= 3 must be true");
6922 constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1);
6923 constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0);
6924 constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1);
6926 constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0;
6927 static_assert(kIdx10 <= 3,
"kIdx10 <= 3 must be true");
6931template <
size_t kIdx3,
size_t kIdx2,
size_t kIdx1,
size_t kIdx0,
class V,
6934 static_assert(kIdx0 <= 3,
"kIdx0 <= 3 must be true");
6935 static_assert(kIdx1 <= 3,
"kIdx1 <= 3 must be true");
6936 static_assert(kIdx2 <= 3,
"kIdx2 <= 3 must be true");
6937 static_assert(kIdx3 <= 3,
"kIdx3 <= 3 must be true");
6939 constexpr size_t kIdx3210 =
6940 (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0;
6949 return (
d.MaxBytes() <= 16) ? 1 : ((
Lanes(
d) *
sizeof(
TFromD<D>) + 15) / 16);
6953#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
6954#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
6955#undef HWY_NATIVE_BLK_INSERT_EXTRACT
6957#define HWY_NATIVE_BLK_INSERT_EXTRACT
6960template <
int kBlockIdx,
class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6962 static_assert(kBlockIdx == 0,
"Invalid block index");
6963 return blk_to_insert;
6966template <
int kBlockIdx,
class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6968 static_assert(kBlockIdx == 0,
"Invalid block index");
6972template <
int kBlockIdx,
class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6974 static_assert(kBlockIdx == 0,
"Invalid block index");
6981#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
6982#ifdef HWY_NATIVE_BROADCASTLANE
6983#undef HWY_NATIVE_BROADCASTLANE
6985#define HWY_NATIVE_BROADCASTLANE
6988template <
int kLane,
class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6990 return Broadcast<kLane>(v);
6996#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
6997#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
6998#undef HWY_NATIVE_SLIDE1_UP_DOWN
7000#define HWY_NATIVE_SLIDE1_UP_DOWN
7003template <
class D, HWY_IF_LANES_D(D, 1)>
7007template <
class D, HWY_IF_LANES_D(D, 1)>
7012#if HWY_TARGET != HWY_SCALAR
7013template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
7015 return ShiftLeftLanes<1>(
d, v);
7017template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
7019 return ShiftRightLanes<1>(
d, v);
7027template <
int kBlocks,
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7029 static_assert(kBlocks == 0,
"kBlocks == 0 must be true");
7033#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
7034template <
int kBlocks,
class D, HWY_IF_V_SIZE_GT_D(D, 16)>
7036 static_assert(0 <= kBlocks &&
static_cast<size_t>(kBlocks) <
d.MaxBlocks(),
7037 "kBlocks must be between 0 and d.MaxBlocks() - 1");
7038 constexpr size_t kLanesPerBlock = 16 /
sizeof(TFromD<D>);
7039 return SlideUpLanes(
d, v,
static_cast<size_t>(kBlocks) * kLanesPerBlock);
7045template <
int kBlocks,
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7047 static_assert(kBlocks == 0,
"kBlocks == 0 must be true");
7051#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
7052template <
int kBlocks,
class D, HWY_IF_V_SIZE_GT_D(D, 16)>
7054 static_assert(0 <= kBlocks &&
static_cast<size_t>(kBlocks) <
d.MaxBlocks(),
7055 "kBlocks must be between 0 and d.MaxBlocks() - 1");
7056 constexpr size_t kLanesPerBlock = 16 /
sizeof(TFromD<D>);
7057 return SlideDownLanes(
d, v,
static_cast<size_t>(kBlocks) * kLanesPerBlock);
7062#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
7064#ifdef HWY_NATIVE_SLIDE_MASK
7065#undef HWY_NATIVE_SLIDE_MASK
7067#define HWY_NATIVE_SLIDE_MASK
7094#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
7095 defined(HWY_TARGET_TOGGLE))
7096#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7097#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7099#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7102#if HWY_TARGET != HWY_SCALAR
7103template <
int kAOffset,
int kBOffset,
class V8, HWY_IF_UI8_D(DFromV<V8>)>
7105 static_assert(0 <= kAOffset && kAOffset <= 1,
7106 "kAOffset must be between 0 and 1");
7107 static_assert(0 <= kBOffset && kBOffset <= 3,
7108 "kBOffset must be between 0 and 3");
7109 using D8 = DFromV<V8>;
7118#if HWY_TARGET == HWY_RVV
7124 constexpr int kInterleavePow2 =
HWY_MAX(d8.Pow2(), 0);
7125 const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
7126#elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
7127 HWY_TARGET == HWY_SVE2_128
7131 const D8 d8_interleave;
7135 constexpr size_t kInterleaveLanes =
7137 const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
7144 const auto a_to_interleave =
ResizeBitCast(d8_interleave, a);
7146 const auto a_interleaved_lo =
7148 const auto a_interleaved_hi =
7168 d8_interleave, a_interleaved_hi, a_interleaved_lo));
7171 d8_interleave, a_interleaved_hi, a_interleaved_lo));
7183 const V8 b01 =
BitCast(d8, Broadcast<kBOffset * 2>(
BitCast(d16, b)));
7184 const V8 b23 =
BitCast(d8, Broadcast<kBOffset * 2 + 1>(
BitCast(d16, b)));
7186 const VFromD<
decltype(du16)> absdiff_sum_01 =
7188 const VFromD<
decltype(du16)> absdiff_sum_23 =
7190 return BitCast(d16,
Add(absdiff_sum_01, absdiff_sum_23));
7198#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
7199 defined(HWY_TARGET_TOGGLE))
7200#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7201#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7203#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7206#if HWY_TARGET != HWY_SCALAR
7207template <
int kIdx3,
int kIdx2,
int kIdx1,
int kIdx0,
class V8,
7211 static_assert(0 <= kIdx0 && kIdx0 <= 3,
"kIdx0 must be between 0 and 3");
7212 static_assert(0 <= kIdx1 && kIdx1 <= 3,
"kIdx1 must be between 0 and 3");
7213 static_assert(0 <= kIdx2 && kIdx2 <= 3,
"kIdx2 must be between 0 and 3");
7214 static_assert(0 <= kIdx3 && kIdx3 <= 3,
"kIdx3 must be between 0 and 3");
7216#if HWY_TARGET == HWY_RVV
7228 const DFromV<
decltype(a)> d8;
7239 Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(
BitCast(d32, vA));
7248#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
7254 const auto a_0123_2345 =
BitCast(
7256 const auto a_1234_3456 =
7260 const auto a_0123_2345 =
7262 const auto a_1234_3456 =
BitCast(
7264 OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
7270#if HWY_IS_LITTLE_ENDIAN
7271 odd_sums = ShiftLeft<16>(odd_sums);
7273 even_sums = ShiftLeft<16>(even_sums);
7278#if HWY_TARGET == HWY_RVV
7292#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
7293#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
7294#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
7296#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V)
Definition arm_sve-inl.h:2568
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_IF_LANES(kN, lanes)
Definition base.h:616
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_IF_T_SIZE(T, bytes)
Definition base.h:639
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_IF_LANES_GT(kN, lanes)
Definition base.h:618
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_IF_T_SIZE_GT(T, bytes)
Definition base.h:649
#define HWY_IF_LANES_LE(kN, lanes)
Definition base.h:617
Definition arm_neon-inl.h:865
Definition arm_neon-inl.h:813
Raw raw
Definition arm_neon-inl.h:851
#define HWY_WASM_EMU256
Definition detect_targets.h:117
#define HWY_SSE2
Definition detect_targets.h:80
#define HWY_NEON
Definition detect_targets.h:93
#define HWY_TARGET
Definition detect_targets.h:543
#define HWY_WASM
Definition detect_targets.h:118
#define HWY_NEON_WITHOUT_AES
Definition detect_targets.h:94
#define HWY_REVERSE_BITS_MIN_BYTES
Definition generic_ops-inl.h:6418
HWY_API Vec128< T, N > Neg(hwy::NonFloatTag, Vec128< T, N > v)
Definition emu128-inl.h:744
HWY_API Vec32< T > ShuffleTwo1230(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:927
HWY_INLINE void MaybeUnpoison(T *HWY_RESTRICT unaligned, size_t count)
Definition ops/shared-inl.h:151
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition x86_128-inl.h:1334
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v)
Definition generic_ops-inl.h:3977
HWY_INLINE VFromD< RepartitionToWide< DFromV< V > > > SumsOf2(hwy::SignedTag, hwy::SizeTag< 1 >, V v)
Definition arm_neon-inl.h:1959
HWY_API VFromD< DTo > ConvertTo(hwy::FloatTag, DTo, Vec128< TFrom, HWY_MAX_LANES_D(DTo)> from)
Definition emu128-inl.h:1857
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:689
HWY_INLINE V IntDivUsingFloatDiv(V a, V b)
Definition generic_ops-inl.h:4674
HWY_INLINE Mask128< T > Not(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition x86_128-inl.h:1653
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:560
VFromD< F32ExpLzcntMinMaxRepartition< DFromV< V > > > F32ExpLzcntMinMaxCmpV
Definition generic_ops-inl.h:3939
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1445
HWY_INLINE void LoadTransposedBlocks4(D d, const TFromD< D > *HWY_RESTRICT unaligned, VFromD< D > &vA, VFromD< D > &vB, VFromD< D > &vC, VFromD< D > &vD)
Definition generic_ops-inl.h:1477
HWY_INLINE void StoreTransposedBlocks3(VFromD< D > A, VFromD< D > B, VFromD< D > C, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1652
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:681
static HWY_INLINE HWY_MAYBE_UNUSED TFromV< V > GetLane(V v)
Definition arm_neon-inl.h:1634
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, Mask128< T > mask)
Definition arm_neon-inl.h:8141
HWY_INLINE Vec128< float16_t, N > ConcatEven(Vec128< float16_t, N > hi, Vec128< float16_t, N > lo)
Definition arm_neon-inl.h:7002
HWY_INLINE V IntDiv(V a, V b)
Definition generic_ops-inl.h:4909
HWY_INLINE void StoreTransposedBlocks2(VFromD< D > A, VFromD< D > B, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1616
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1519
HWY_INLINE Vec128< float16_t, N > ConcatOdd(Vec128< float16_t, N > hi, Vec128< float16_t, N > lo)
Definition arm_neon-inl.h:7009
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:570
HWY_INLINE VFromD< DTo > ZeroExtendResizeBitCast(FromSizeTag, ToSizeTag, DTo d_to, DFrom, VFromD< DFrom > v)
Definition emu128-inl.h:140
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1482
HWY_INLINE F32ExpLzcntMinMaxCmpV< V > F32ExpLzcntMinMaxBitCast(V v)
Definition generic_ops-inl.h:3942
HWY_INLINE VFromD< D > ReduceWithinBlocks(D d, Func f, VFromD< D > v10)
Definition generic_ops-inl.h:1013
HWY_INLINE VFromD< D > Iota0(D d)
Definition arm_neon-inl.h:1239
HWY_INLINE If< IsConst< T >(), const uint16_t *, uint16_t * > U16LanePointer(T *p)
Definition ops/shared-inl.h:139
HWY_INLINE void StoreTransposedBlocks4(VFromD< D > vA, VFromD< D > vB, VFromD< D > vC, VFromD< D > vD, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:2003
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition x86_128-inl.h:1383
HWY_INLINE Vec< D > IntDivConvIntToFloat(D df, V vi)
Definition generic_ops-inl.h:4617
HWY_INLINE VFromD< D > PromoteOddTo(hwy::FloatTag to_type_tag, hwy::SizeTag< 4 > to_lane_size_tag, hwy::FloatTag from_type_tag, D d_to, svfloat16_t v)
Definition arm_sve-inl.h:4419
HWY_INLINE V IntMod(V a, V b)
Definition generic_ops-inl.h:5060
typename AdjustSimdTagToMinVecPow2_t< RemoveConst< D > >::type AdjustSimdTagToMinVecPow2
Definition rvv-inl.h:70
HWY_INLINE Vec128< T, N > Mul(hwy::FloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:774
HWY_INLINE svuint8_t IndicesForExpandFromBits(uint64_t mask_bits)
Definition arm_sve-inl.h:4933
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag, hwy::SizeTag< 1 >, svint8_t v)
Definition arm_sve-inl.h:982
HWY_INLINE VFromD< D > PromoteEvenTo(hwy::SignedTag, hwy::SizeTag< 2 >, hwy::SignedTag, D d_to, svint8_t v)
Definition arm_sve-inl.h:4334
HWY_API Vec128< T > InterleaveUpper(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6086
HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD< D > *HWY_RESTRICT unaligned, VFromD< D > &A, VFromD< D > &B, VFromD< D > &C)
Definition generic_ops-inl.h:1279
HWY_INLINE Vec< D > IntDivConvFloatToInt(D di, V vf)
Definition generic_ops-inl.h:4612
HWY_INLINE V I32RangeU32ToF32BiasedExp(V v)
Definition generic_ops-inl.h:3801
HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v)
Definition arm_sve-inl.h:2690
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE VFromD< D > ReduceAcrossBlocks(D, Func, VFromD< D > v)
Definition generic_ops-inl.h:998
HWY_INLINE V UI8ReverseBitsStep(V v)
Definition generic_ops-inl.h:6434
RebindToUnsigned< D > F32ExpLzcntMinMaxRepartition
Definition generic_ops-inl.h:3928
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1556
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_INLINE VFromD< D > Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0)
Definition ppc_vsx-inl.h:2712
HWY_INLINE VFromD< D > UIntToF32BiasedExp(D d, VFromD< D > v)
Definition generic_ops-inl.h:3772
HWY_API void LoadInterleaved4(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2, VFromD< D > &v3)
Definition arm_neon-inl.h:9128
HWY_API void ScatterOffset(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2624
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API V SaturatedNeg(V v)
Definition generic_ops-inl.h:897
HWY_API V MaskedMaxOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1489
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API V MaskedDivOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1512
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API VFromD< DI32 > SatWidenMulPairwiseAccumulate(DI32 di32, VFromD< Repartition< int16_t, DI32 > > a, VFromD< Repartition< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition generic_ops-inl.h:5179
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API VFromD< D > LoadNOr(VFromD< D > no, D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1362
HWY_API void StoreN(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_store)
Definition emu128-inl.h:1398
HWY_API svbool_t MaskFalse(const D)
Definition arm_sve-inl.h:372
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
RepartitionToWide< RepartitionToWideX2< D > > RepartitionToWideX3
Definition ops/shared-inl.h:483
HWY_API Mask< D > SlideMask1Up(D d, Mask< D > m)
Definition generic_ops-inl.h:7071
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
V Shl(V a, V b)
Definition generic_ops-inl.h:7322
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API Vec128< int64_t > SaturatedAbs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3288
HWY_API Vec< D > NaN(D d)
Definition generic_ops-inl.h:82
HWY_API VFromD< D > GatherIndexN(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index, const size_t max_lanes_to_load)
Definition generic_ops-inl.h:2789
HWY_API Vec128< uint8_t > AESLastRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7447
HWY_API V MaskedModOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:4666
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2902
HWY_API V AddSub(V a, V b)
Definition generic_ops-inl.h:775
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API VFromD< D > PromoteInRangeLowerTo(D d, V v)
Definition generic_ops-inl.h:3620
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API VFromD< DTo > ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD< DFrom > v)
Definition generic_ops-inl.h:162
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API TFromD< D > ReduceMax(D d, VFromD< D > v)
Definition arm_sve-inl.h:3213
HWY_API V Rol(V a, V b)
Definition generic_ops-inl.h:445
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API void ScatterIndex(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2643
HWY_API VFromD< DI32 > SatWidenMulAccumFixedPoint(DI32, VFromD< Rebind< int16_t, DI32 > > a, VFromD< Rebind< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition arm_neon-inl.h:6496
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API svbool_t DemoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1420
HWY_API V ZeroIfNegative(V v)
Definition generic_ops-inl.h:266
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API MFromD< DTo > OrderedDemote2MasksTo(DTo d_to, DFrom, MFromD< DFrom > a, MFromD< DFrom > b)
Definition x86_128-inl.h:1107
HWY_API Vec128< uint16_t,(N+1)/2 > SumsOfAdjQuadAbsDiff(Vec128< uint8_t, N > a, Vec128< uint8_t, N > b)
Definition x86_128-inl.h:3901
HWY_API VFromD< DI > ConvertInRangeTo(DI di, VFromD< RebindToFloat< DI > > v)
Definition emu128-inl.h:1900
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API VFromD< D > SlideDownBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7046
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > PromoteInRangeEvenTo(D d, V v)
Definition generic_ops-inl.h:3652
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< uint16_t,(N+1)/2 > SumsOfShuffledQuadAbsDiff(Vec128< uint8_t, N > a, Vec128< uint8_t, N > b)
Definition x86_128-inl.h:3943
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API VFromD< RepartitionToWideX2< DFromV< V > > > SumsOf4(V v)
Definition generic_ops-inl.h:3733
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v)
Definition generic_ops-inl.h:869
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API VFromD< D > PromoteInRangeTo(D d64, VFromD< Rebind< float, D > > v)
Definition arm_neon-inl.h:4497
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API Vec< DI16 > SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b)
Definition generic_ops-inl.h:5153
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Mask< D > SlideMask1Down(D d, Mask< D > m)
Definition generic_ops-inl.h:7076
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec< RepartitionToWideX3< DFromV< V > > > SumsOf8AbsDiff(V a, V b)
Definition generic_ops-inl.h:2820
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API V ReverseLaneBytes(V v)
Definition generic_ops-inl.h:6386
HWY_API VFromD< D > PromoteLowerTo(D d, V v)
Definition generic_ops-inl.h:2984
HWY_API V MulAddSub(V mul, V x, V sub_or_add)
Definition arm_sve-inl.h:4285
HWY_API V RotateRightSame(V v, int bits)
Definition generic_ops-inl.h:601
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< DTo > PromoteEvenTo(DTo d_to, Vec1< TFrom > v)
Definition scalar-inl.h:1478
HWY_API V Per4LaneBlockShuffle(V v)
Definition generic_ops-inl.h:6904
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_API Vec1< MakeWide< T > > SumsOf2(const Vec1< T > v)
Definition scalar-inl.h:549
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API void ScatterIndexN(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index, const size_t max_lanes_to_store)
Definition generic_ops-inl.h:2782
HWY_API Vec< D > Inf(D d)
Definition generic_ops-inl.h:91
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API V IfNegativeThenElseZero(V v, V yes)
Definition generic_ops-inl.h:241
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API V Ror(V a, V b)
Definition generic_ops-inl.h:459
HWY_API VFromD< DN > OrderedTruncate2To(DN dn, V a, V b)
Definition emu128-inl.h:1978
HWY_API Vec128< uint8_t > AESRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7418
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API V RotateLeft(V v)
Definition generic_ops-inl.h:427
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API Mask< D > SlideMaskDownLanes(D d, Mask< D > m, size_t amt)
Definition generic_ops-inl.h:7086
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API VFromD< D32 > DemoteInRangeTo(D32 d32, VFromD< Rebind< double, D32 > > v)
Definition emu128-inl.h:1845
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
VFromD< ScalableTag< bfloat16_t > > VBF16
Definition arm_sve-inl.h:410
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
decltype(MaskFromVec(Zero(D()))) Mask
Definition generic_ops-inl.h:52
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_API MFromD< DFromV< V > > IsEitherNaN(V a, V b)
Definition generic_ops-inl.h:1177
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API V Clamp(const V v, const V lo, const V hi)
Definition generic_ops-inl.h:56
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7353
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API size_t Blocks(D d)
Definition generic_ops-inl.h:6948
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API void SafeFillN(const size_t num, const T value, D d, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:172
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
V Shr(V a, V b)
Definition generic_ops-inl.h:7326
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API void SafeCopyN(const size_t num, D d, const T *HWY_RESTRICT from, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:187
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_API Mask< D > SlideMaskUpLanes(D d, Mask< D > m, size_t amt)
Definition generic_ops-inl.h:7081
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
decltype(GetLane(V())) LaneType
Definition generic_ops-inl.h:39
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API MFromD< D > UpperHalfOfMask(D, MFromD< Twice< D > > m)
Definition x86_128-inl.h:1051
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API V ReverseBits(V v)
Definition generic_ops-inl.h:6464
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_API MFromD< D > CombineMasks(D, MFromD< Half< D > > hi, MFromD< Half< D > > lo)
Definition x86_128-inl.h:959
HWY_API Vec1< T > operator%(Vec1< T > a, Vec1< T > b)
Definition generic_ops-inl.h:5095
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
HWY_API VFromD< D > SlideUpBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7028
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V RotateLeftSame(V v, int bits)
Definition generic_ops-inl.h:588
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition rvv-inl.h:3761
decltype(IndicesFromVec(D(), Zero(RebindToUnsigned< D >()))) IndicesFromD
Definition generic_ops-inl.h:6302
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API V IfNegativeThenZeroElse(V v, V no)
Definition generic_ops-inl.h:256
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
FuncOutput(*)(const void *, FuncInput) Func
Definition nanobenchmark.h:87
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSame()
Definition base.h:499
HWY_API constexpr bool IsSigned()
Definition base.h:2134
typename detail::Relations< T >::Float MakeFloat
Definition base.h:2082
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:2114
constexpr MakeUnsigned< T > SignMask()
Definition base.h:2287
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed+R::is_float+R::is_bf16)<< 8)>
Definition base.h:2105
HWY_API size_t PopCount(T x)
Definition base.h:2615
HWY_API constexpr T LimitsMax()
Definition base.h:2174
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue()
Definition base.h:2212
#define HWY_IF_U8_D(D)
Definition ops/shared-inl.h:577
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_U16_D(D)
Definition ops/shared-inl.h:578
#define HWY_IF_I16_D(D)
Definition ops/shared-inl.h:583
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_V_SIZE_GT_V(V, bytes)
Definition ops/shared-inl.h:636
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_NOT_BF16_D(D)
Definition ops/shared-inl.h:595
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_LANES_LE_D(D, lanes)
Definition ops/shared-inl.h:561
#define HWY_IF_LANES_GT_D(D, lanes)
Definition ops/shared-inl.h:562
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_NOT_FLOAT_V(V)
Definition ops/shared-inl.h:618
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_V_SIZE_V(V, bytes)
Definition ops/shared-inl.h:632
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_V_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:609
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_I8_D(D)
Definition ops/shared-inl.h:582
#define HWY_IF_UI8_D(D)
Definition ops/shared-inl.h:589
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_MAX_BYTES
Definition set_macros-inl.h:168
#define HWY_ALIGN
Definition set_macros-inl.h:167
#define HWY_HAVE_INTEGER64
Definition set_macros-inl.h:172
#define HWY_HAVE_FLOAT64
Definition set_macros-inl.h:174
#define HWY_HAVE_FLOAT16
Definition set_macros-inl.h:173
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition ops/shared-inl.h:198
Definition scalar-inl.h:36
Definition generic_ops-inl.h:975
V operator()(V a, V b) const
Definition generic_ops-inl.h:977
Definition generic_ops-inl.h:989
V operator()(V a, V b) const
Definition generic_ops-inl.h:991
Definition generic_ops-inl.h:982
V operator()(V a, V b) const
Definition generic_ops-inl.h:984
int VFromD
Definition tuple-inl.h:25