#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
#define HWY_S390X_HAVE_Z14 1
#else
#define HWY_S390X_HAVE_Z14 0
#endif
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
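// The AltiVec/VSX intrinsics headers define `vector`, `pixel` and `bool` as
// context-sensitive macros; they are saved above and restored here so user
// code that includes this header can still use those identifiers.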
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
    (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif
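// HWY_PPC_HAVE_9/10 and HWY_S390X_HAVE_Z14/Z15 gate the POWER9/POWER10 and
// z14/z15 specific paths below; when they are 0, the baseline VSX/AltiVec
// implementations are used instead.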
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                  \
  struct Raw128<LANE_TYPE> {                                                   \
    using type = __vector RAW_VECT_LANE_TYPE;                                  \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
    using RawT = RAW_VECT_LANE_TYPE;                                           \
    typedef LANE_TYPE AlignedRawVec                                            \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };
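// HWY_VSX_RAW128 specializes Raw128 for one lane type: the native __vector
// type, the matching __vector __bool type used for masks, and may_alias
// vector typedefs for aligned and unaligned memory access.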
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
 public:
  using PrivateT = T;
  static constexpr size_t kPrivateN = N;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  typename detail::Raw128<T>::type raw;
};
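// Vec128 is a thin wrapper over the raw vector; the compound-assignment
// operators simply forward to the corresponding non-member operators.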
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using PrivateT = T;
  static constexpr size_t kPrivateN = N;

  typename detail::Raw128<T>::RawBoolVec raw;
};

template <class V>
using TFromV = typename V::PrivateT;
template <class D, typename T = TFromD<D>>

template <class D, typename FromT>
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};

template <class D, typename FromV>
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};

template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>

#if HWY_COMPILER_GCC_ACTUAL
  typename detail::Raw128<TFromD<D>>::type raw;
  return VFromD<decltype(d)>{raw};

template <typename T, size_t N>
  return static_cast<T>(v.raw[0]);
template <class D, HWY_IF_T_SIZE_D(D, 1)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
  const typename detail::Raw128<TFromD<D>>::type raw = {
      t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};

template <class D, HWY_IF_UI16_D(D)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,

template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
      du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
      BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
      BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
      BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));

template <class D, HWY_IF_T_SIZE_D(D, 4)>
                                      TFromD<D> t2, TFromD<D> t3) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};

template <class D, HWY_IF_T_SIZE_D(D, 8)>
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  using VU = VFromD<decltype(du)>;

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N>
  using VU = VFromD<decltype(du)>;
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);

  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);

  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);

  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
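// These helpers report whether every lane of a raw vector is a compile-time
// constant, which lets the logical-op wrappers below keep a form the compiler
// can constant-fold instead of forcing a single fused instruction.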
template <uint8_t kTernLogOp, class V>
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;

#if HWY_COMPILER_GCC_ACTUAL
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
            "n"(static_cast<unsigned>(kTernLogOp)));
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif
  return BitCast(d, VU{raw_ternlog_result});
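// TernaryLogic evaluates an arbitrary three-input boolean function, selected
// by the 8-bit truth table kTernLogOp, via the POWER10 xxeval instruction
// (inline asm for GCC, the vec_ternarylogic intrinsic otherwise).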
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if defined(__OPTIMIZE__)
    return Xor(x1, Xor(x2, x3));
    return detail::TernaryLogic<0x69>(x1, x2, x3);
  return Xor(x1, Xor(x2, x3));

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if defined(__OPTIMIZE__)
    return Or(o1, Or(o2, o3));
    return detail::TernaryLogic<0x7F>(o1, o2, o3);
  return Or(o1, Or(o2, o3));

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if defined(__OPTIMIZE__)
    return Or(o, And(a1, a2));
    return detail::TernaryLogic<0x1F>(o, a1, a2);
  return Or(o, And(a1, a2));
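// Xor3/Or3/OrAnd use a single TernaryLogic (xxeval) op where available and
// otherwise fall back to two ordinary logical ops, e.g. Xor(x1, Xor(x2, x3)).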
template <typename T, size_t N>
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_SIGNED(T)>

template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{vec_neg(v.raw)};

template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
template <class T, size_t N, HWY_IF_SIGNED(T)>

template <class T, size_t N, HWY_IF_FLOAT3264(T)>
  return Vec128<T, N>{vec_abs(v.raw)};

#if HWY_S390X_HAVE_Z14
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;

                                  Vec128<float, N> sign) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
  return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
#else
  return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif

                                   Vec128<double, N> sign) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
  return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
#else
  return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>

template <typename T, size_t N>
  using Raw = typename detail::Raw128<T>::RawBoolVec;
  return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
template <typename T, size_t N>
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
  return Mask128<T, N>{vec_nor(m.raw, m.raw)};

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw & b.raw};
#else
  return Mask128<T, N>{vec_and(a.raw, b.raw)};
#endif
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_andc(b.raw, a.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw | b.raw};
#else
  return Mask128<T, N>{vec_or(a.raw, b.raw)};
#endif
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw ^ b.raw};
#else
  return Mask128<T, N>{vec_xor(a.raw, b.raw)};
#endif
}

template <typename T, size_t N>
  return Mask128<T, N>{vec_nor(a.raw, b.raw)};
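// Mask logic: masks are __vector __bool values, so And/Or/Xor/Not map
// directly to vec_and/vec_or/vec_xor/vec_nor; on Z14 the plain C operators
// are used instead.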
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  using TU = TFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
                               << Set(du, static_cast<TU>(bits)).raw});

      vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});

template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N, HWY_IF_SIGNED(T)>
#if HWY_S390X_HAVE_Z14
  using TI = typename detail::Raw128<T>::RawT;
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
#else
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");

template <typename T, size_t N, HWY_IF_SIGNED(T)>
template <typename T, size_t N, typename TI, size_t NI>
                                        Vec128<TI, NI> from) {
      vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};

template <class V, class VI>

template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
  return Vec128<T>{vec_reve(v.raw)};
template <typename T, size_t N>
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0, 1, 2,  3,
                                           12, 13, 14, 15, 8, 9, 10, 11};
  return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};

template <typename T, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kShuffle16 = {1, 0, 19, 18};

template <typename T, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};

template <typename T, HWY_IF_T_SIZE(T, 4)>
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0,  1,  2,  3,
                                           28, 29, 30, 31, 24, 25, 26, 27};

template <typename T, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kShuffle = {0, 3, 18, 17};

template <typename T, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};

template <typename T, HWY_IF_T_SIZE(T, 4)>
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  12, 13, 14, 15,
                                           24, 25, 26, 27, 20, 21, 22, 23};

template <typename T, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kShuffle = {2, 1, 16, 19};

template <typename T, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};

template <typename T, HWY_IF_T_SIZE(T, 4)>
  const __vector unsigned char kShuffle = {8,  9,  10, 11, 4,  5,  6,  7,
                                           16, 17, 18, 19, 28, 29, 30, 31};
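// The byte-index tables above feed vec_perm; indices 0..15 select bytes from
// the first source operand and 16..31 from the second.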
template <class T, HWY_IF_T_SIZE(T, 4)>

template <class T, HWY_IF_T_SIZE(T, 8)>

template <class T, HWY_IF_T_SIZE(T, 4)>
#if HWY_IS_LITTLE_ENDIAN

template <class T, HWY_IF_T_SIZE(T, 4)>
#if HWY_IS_LITTLE_ENDIAN

template <class T, HWY_IF_T_SIZE(T, 4)>

template <class DTo, typename TFrom, size_t NFrom>
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};

template <typename T, size_t N>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;

template <typename T, size_t N>
  return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>

template <typename T, size_t N, HWY_IF_FLOAT(T)>

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
  return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

template <class D, typename T = TFromD<D>>
                        size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
  const size_t num_of_bytes_to_load =
#if HWY_S390X_HAVE_Z14
  return (num_of_bytes_to_load > 0)
                     const_cast<unsigned char*>(
                         reinterpret_cast<const unsigned char*>(p)),
                     static_cast<unsigned>(num_of_bytes_to_load - 1))})
#else
                 VFromD<decltype(du8)>{vec_xl_len(
                     const_cast<unsigned char*>(
                         reinterpret_cast<const unsigned char*>(p)),
                     num_of_bytes_to_load)});
#endif

template <class D, typename T = TFromD<D>>
                          size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
                  LoadN(d, p, max_lanes_to_load), no);
template <class D, HWY_IF_T_SIZE_D(D, 1)>
  constexpr __vector unsigned char kU8Iota0 = {0, 1, 2,  3,  4,  5,  6,  7,
                                               8, 9, 10, 11, 12, 13, 14, 15};

template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
  constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};

template <class D, HWY_IF_UI32_D(D)>
  constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};

template <class D, HWY_IF_UI64_D(D)>
  constexpr __vector unsigned long long kU64Iota0 = {0, 1};

template <class D, HWY_IF_F32_D(D)>
  constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};

template <class D, HWY_IF_F64_D(D)>
  constexpr __vector double kF64Iota0 = {0.0, 1.0};
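// Iota0 supplies the {0, 1, 2, ...} starting vector for each lane type; Iota
// then offsets it by the requested first value.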
template <class D, typename T2>
  using TU = TFromD<decltype(du)>;

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

template <class D, typename T = TFromD<D>>
                      size_t max_lanes_to_store) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
  const size_t num_of_bytes_to_store =
#if HWY_S390X_HAVE_Z14
  if (num_of_bytes_to_store > 0) {
    vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
                  static_cast<unsigned>(num_of_bytes_to_store - 1));
#else
  vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
              num_of_bytes_to_store);
#endif

  using TI = TFromD<decltype(di)>;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
#if HWY_S390X_HAVE_Z14

template <class V, HWY_IF_U8(TFromV<V>)>

template <class V, HWY_IF_I8(TFromV<V>)>
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> di8;
      Set(di64, int64_t{-1024});

#if HWY_S390X_HAVE_Z14
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
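// Signed saturation check: for sum = a + b, overflow occurred iff a and b
// have the same sign but the sum's sign differs, i.e. the sign bit of
// AndNot(Xor(a, b), Xor(a, sum)) is set.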
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I64_D(DFromV<V>)>
  const auto sum = Add(a, b);
  const auto overflow_mask =
  const auto overflow_result =

#if HWY_S390X_HAVE_Z14
  return Sub(a, Min(a, b));

  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));

  return Vec128<T, N>{vec_subs(a.raw, b.raw)};

template <class V, HWY_IF_I64_D(DFromV<V>)>
  const auto diff = Sub(a, b);
  const auto overflow_mask =
  const auto overflow_result =

#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif

#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
#if HWY_S390X_HAVE_Z14
#if HWY_S390X_HAVE_Z14
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
#elif HWY_PPC_HAVE_10
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
#else
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#endif

#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
  return Vec128<T, N>{vec_mulh(a.raw, b.raw)};

  const auto p_even = MulEven(a, b);
#if HWY_IS_LITTLE_ENDIAN
      vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};

template <typename T, size_t N,
#if HWY_IS_LITTLE_ENDIAN

template <class T, HWY_IF_UI64(T)>
  return Set(Full64<T>(), p_hi);

template <class T, HWY_IF_UI64(T)>
  const Half<decltype(d)> dh;

#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH

template <typename T, size_t N,
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};

template <typename T, size_t N,
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
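// MulEven/MulOdd widen pairs of lanes: vec_mule multiplies the even-indexed
// lanes and vec_mulo the odd-indexed lanes, each producing double-width
// results.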
#ifdef HWY_NATIVE_ROL_ROR_8
#undef HWY_NATIVE_ROL_ROR_8
#else
#define HWY_NATIVE_ROL_ROR_8
#endif

#ifdef HWY_NATIVE_ROL_ROR_16
#undef HWY_NATIVE_ROL_ROR_16
#else
#define HWY_NATIVE_ROL_ROR_16
#endif

#ifdef HWY_NATIVE_ROL_ROR_32_64
#undef HWY_NATIVE_ROL_ROR_32_64
#else
#define HWY_NATIVE_ROL_ROR_32_64
#endif

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
             : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -

#ifdef HWY_NATIVE_ROL_ROR_SAME_8
#undef HWY_NATIVE_ROL_ROR_SAME_8
#else
#define HWY_NATIVE_ROL_ROR_SAME_8
#endif

#ifdef HWY_NATIVE_ROL_ROR_SAME_16
#undef HWY_NATIVE_ROL_ROR_SAME_16
#else
#define HWY_NATIVE_ROL_ROR_SAME_16
#endif

#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
#else
#define HWY_NATIVE_ROL_ROR_SAME_32_64
#endif

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
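// Right rotation is expressed as a left rotation: by (lane bits - kBits) for
// the compile-time form, and by the negated count (0u - bits, modulo the lane
// width) for the runtime *Same form.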
template <typename T, size_t N>
  static_assert(IsSigned<T>(), "Only works for signed/float");
      d, VFromD<decltype(du)>{vec_blendv(

#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#else
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#endif

#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#else
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>

template <typename T, size_t N, HWY_IF_FLOAT(T)>

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
  return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};

template <typename T, size_t N, HWY_IF_FLOAT(T)>
  return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
  return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};

template <typename T, size_t N, HWY_IF_FLOAT(T)>
  return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
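// FMA mapping: MulAdd -> vec_madd, NegMulAdd -> vec_nmsub, MulSub -> vec_msub
// and NegMulSub -> vec_nmadd, since PPC's "negative" forms negate the entire
// fused result.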
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

template <typename T, size_t N, HWY_IF_FLOAT(T)>
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{a.raw / b.raw};
#else
  return Vec128<T, N>{vec_div(a.raw, b.raw)};
#endif

template <typename T, size_t N, HWY_IF_FLOAT(T)>
#if HWY_S390X_HAVE_Z14
  return Set(d, T(1.0)) / v;

#if HWY_S390X_HAVE_Z14
  const auto half = v * Set(d, 0.5f);
      d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
  return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
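// Software reciprocal-sqrt estimate for the Z14 path: the 0x5F3759DF bit
// trick provides the initial guess, refined by one Newton-Raphson step
// guess * (1.5f - 0.5f * v * guess * guess).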
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

template <class T, size_t N, HWY_IF_FLOAT(T)>
  return Vec128<T, N>{vec_rsqrte(v.raw)};

template <class T, size_t N, HWY_IF_FLOAT(T)>
  return Vec128<T, N>{vec_sqrt(v.raw)};
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>

#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

  return V{vec_absd(a.raw, b.raw)};

template <class V, HWY_IF_U64_D(DFromV<V>)>

template <class V, HWY_IF_SIGNED_V(V)>

#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif
                                  Vec128<int32_t, N> b) {
  __vector signed int raw_result;
  __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};

                                   Vec128<uint32_t, N> b) {
  __vector unsigned int raw_result;
  __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};

                                  Vec128<int64_t, N> b) {
  __vector signed long long raw_result;
  __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};

                                   Vec128<uint64_t, N> b) {
  __vector unsigned long long raw_result;
  __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
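// 32/64-bit integer division uses the POWER10 vdivsw/vdivuw/vdivsd/vdivud
// instructions via inline asm (likely because not all supported compilers
// expose builtins for them).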
  const Rebind<MakeWide<T>, decltype(d)> dw;

                                  Vec128<int32_t, N> b) {
  __vector signed int raw_result;
  __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};

                                   Vec128<uint32_t, N> b) {
  __vector unsigned int raw_result;
  __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};

                                  Vec128<int64_t, N> b) {
  __vector signed long long raw_result;
  __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};

                                   Vec128<uint64_t, N> b) {
  __vector unsigned long long raw_result;
  __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};

  const Rebind<MakeWide<T>, decltype(d)> dw;
  __builtin_prefetch(aligned, 1, 0);

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>

template <typename T, size_t N>
  return Vec128<T, N / 2>{v.raw};

template <int kBytes, class D>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const auto zeros = Zero(d);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#else
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#endif

template <int kBytes, typename T, size_t N>
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);

template <int kLanes, class D, typename T = TFromD<D>>

template <int kLanes, typename T, size_t N>
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
template <int kBytes, class D>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    VFromD<decltype(dfull)> vfull{v.raw};
  const auto zeros = Zero(d);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#else
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#endif

template <int kLanes, class D>
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>

template <typename T, size_t N>
  return static_cast<T>(v.raw[i]);

template <typename T, size_t N>
#if HWY_IS_LITTLE_ENDIAN
  raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
  return Vec128<T, N>{raw_result};
#else
  alignas(16) T lanes[16 / sizeof(T)];
  return Load(d, lanes);
#endif
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  constexpr size_t kSize = 16;
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
#if HWY_IS_LITTLE_ENDIAN

template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;

template <int kLane, typename T, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{vec_splat(v.raw, kLane)};
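// Broadcast copies lane kLane into every lane via vec_splat.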
template <typename T, size_t N = 16 / sizeof(T)>

template <class D, HWY_IF_T_SIZE_D(D, 1)>

template <class D, HWY_IF_T_SIZE_D(D, 2)>
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};

template <class D, HWY_IF_T_SIZE_D(D, 4)>
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};

template <class D, HWY_IF_T_SIZE_D(D, 8)>
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};

template <class D, HWY_IF_T_SIZE_D(D, 1)>

template <class D, HWY_IF_T_SIZE_D(D, 2)>
  constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
                                                   0, 1, 0, 1, 0, 1, 0, 1};
  return VFromD<decltype(d8)>{kByteOffsets};

template <class D, HWY_IF_T_SIZE_D(D, 4)>
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
                                                   0, 1, 2, 3, 0, 1, 2, 3};
  return VFromD<decltype(d8)>{kByteOffsets};

template <class D, HWY_IF_T_SIZE_D(D, 8)>
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
                                                   0, 1, 2, 3, 4, 5, 6, 7};
  return VFromD<decltype(d8)>{kByteOffsets};
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;

template <class D, typename TI,
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;
  using V8 = VFromD<decltype(d8)>;
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};

template <class D, typename TI>
                         D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;

template <typename T, size_t N>

template <typename T>

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
                                    Indices128<T, N> idx) {
  const Twice<decltype(d)> dt;
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
  const Indices128<T, N * 2> idx2{idx.raw};
template <typename T>

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char kShuffle = {6,  7,  4,  5,  2,  3,  0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>

#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
    (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)

#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif

  return V{vec_revb(v.raw)};

#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>

template <class D, HWY_IF_T_SIZE_D(D, 2)>
  const __vector unsigned char kShuffle = {6,  7,  4,  5,  2,  3,  0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)};

template <class D, HWY_IF_T_SIZE_D(D, 4)>

template <class D, HWY_IF_T_SIZE_D(D, 8)>

template <class D, HWY_IF_T_SIZE_D(D, 2)>

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D, VFromD<D>) {
template <typename T, size_t N>
  return Vec128<T, N>{vec_mergeh(a.raw, b.raw)};

template <class D, typename T = TFromD<D>>

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;

template <class V, class DW = RepartitionToWide<DFromV<V>>>

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>

#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif
                                          const uint32_t x0) {
  const __vector unsigned int raw = {x0, x1, x2, x3};

  using VU8 = VFromD<decltype(du8)>;
  const auto v_shift_amt =
      Set(Full128<uint32_t>(),
          static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));

#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const auto v_shift_amt =
      Set(du, static_cast<TU>(amt * sizeof(TFromD<D>) * 8));
#if HWY_IS_LITTLE_ENDIAN

template <class D, HWY_IF_V_SIZE_D(D, 16)>
  using VU8 = VFromD<decltype(du8)>;
  const auto v_shift_amt =
      Set(Full128<uint32_t>(),
          static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));

#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif
#endif
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
  const Half<decltype(d)> dh;
  const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
  const VU hi{reinterpret_cast<Raw>(hi_half.raw)};

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
  return CombineShiftRightBytes<8>(d, hi, lo);

template <class D, typename T = TFromD<D>>
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  4,  5,  6,  7,
                                           24, 25, 26, 27, 28, 29, 30, 31};

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_pack(lo.raw, hi.raw)};
  return VFromD<D>{vec_pack(v.raw, v.raw)};

          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
                        Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, decltype(d)> d2;

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
#if HWY_IS_LITTLE_ENDIAN

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#else
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#endif

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  12, 13, 14, 15,
                                           20, 21, 22, 23, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
#if HWY_IS_LITTLE_ENDIAN

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#else
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#endif

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
#if HWY_IS_LITTLE_ENDIAN
  constexpr __vector unsigned char kShuffle = {0,  1,  2,  3,  8,  9,  10, 11,
                                               16, 17, 18, 19, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};

template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>

#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif
#if HWY_IS_LITTLE_ENDIAN

template <typename T>

template <typename T>

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  constexpr __vector unsigned char kShuffle = {0, 0, 2,  2,  4,  4,  6,  6,
                                               8, 8, 10, 10, 12, 12, 14, 14};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4,  5,  4,  5,
                                               8, 9, 8, 9, 12, 13, 12, 13};

template <typename T, HWY_IF_T_SIZE(T, 4)>
#if HWY_S390X_HAVE_Z14
      v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
                                        11, 8, 9, 10, 11)));

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  constexpr __vector unsigned char kShuffle = {1, 1, 3,  3,  5,  5,  7,  7,
                                               9, 9, 11, 11, 13, 13, 15, 15};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  constexpr __vector unsigned char kShuffle = {2,  3,  2,  3,  6,  7,  6,  7,
                                               10, 11, 10, 11, 14, 15, 14, 15};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
#if HWY_S390X_HAVE_Z14
      v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
                                        15, 12, 13, 14, 15)));
#else
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
#endif

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                       0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                       0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
                                       0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  const __vector unsigned char mask = {
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
template <class D, HWY_IF_T_SIZE_D(D, 1)>
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
                          10, 26, 12, 28, 14, 30)

template <class D, HWY_IF_T_SIZE_D(D, 2)>
  const Full128<TFromD<D>> d_full;
                          16, 17, 4, 5, 20, 21, 8,
                          9, 24, 25, 12, 13, 28, 29)

template <class D, HWY_IF_T_SIZE_D(D, 4)>
#if HWY_S390X_HAVE_Z14
  const Full128<TFromD<D>> d_full;
                          2, 3, 16, 17, 18, 19, 8,
                          9, 10, 11, 24, 25, 26, 27)
#else
  return VFromD<D>{vec_mergee(a.raw, b.raw)};
#endif

template <class D, HWY_IF_T_SIZE_D(D, 8)>

template <class D, HWY_IF_T_SIZE_D(D, 1)>
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
                          11, 27, 13, 29, 15, 31)

template <class D, HWY_IF_T_SIZE_D(D, 2)>
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
                          11, 26, 27, 14, 15, 30, 31)

template <class D, HWY_IF_T_SIZE_D(D, 4)>
#if HWY_S390X_HAVE_Z14
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
                          13, 14, 15, 28, 29, 30, 31)
#else
  return VFromD<D>{vec_mergeo(a.raw, b.raw)};
#endif

template <class D, HWY_IF_T_SIZE_D(D, 8)>

template <typename T, size_t N>
template <typename T, size_t N>
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(a)> di16;
  const auto round_up_incr = Set(di32, 0x4000);
  const auto i32_product = MulEven(a, b) + round_up_incr;

template <size_t N, HWY_IF_LANES_GT(N, 1)>
                                            Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> di16;
  const auto round_up_incr = Set(di32, 0x4000);
  const auto even_product = MulEven(a, b) + round_up_incr;
  const auto odd_product = MulOdd(a, b) + round_up_incr;
                  BitCast(di16, ShiftLeft<1>(even_product)));

                                            Vec128<int16_t, N> b) {
  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
template <typename T, size_t N>
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N>
  const DFromV<decltype(v)> di;

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>

template <typename T, size_t N>
#if HWY_S390X_HAVE_Z14

template <typename T, size_t N>
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> di;

template <typename T, size_t N>

template <class T, HWY_IF_UI64(T)>
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  alignas(16) T mul[2];
  return Load(Full128<T>(), mul);
#endif
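// On POWER10 with __int128 support, 64-bit MulEven/MulOdd use vec_mule/
// vec_mulo to form the full 128-bit product directly; otherwise the product
// is written to a 16-byte aligned buffer and reloaded as a vector.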
template <class T, HWY_IF_UI64(T)>
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  alignas(16) T mul[2];
  return Load(Full128<T>(), mul);
#endif

  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));

#if HWY_S390X_HAVE_Z14
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));

#if HWY_S390X_HAVE_Z14
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};

  return Add(sum0, sum1);
#if !HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#else
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#endif

template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
    DI32, VFromD<Repartition<int16_t, DI32>> a,

#if !HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif

template <class DU32, HWY_IF_U32_D(DU32)>
    DU32, VFromD<Repartition<uint8_t, DU32>> a,

#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
    DI32, VFromD<Repartition<uint8_t, DI32>> a_u,
  return VFromD<DI32>{vec_msum(b_i.raw, a_u.raw, sum.raw)};

#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32)>
    VFromD<Repartition<int8_t, DI32>> a,
    VFromD<Repartition<int8_t, DI32>> b,
  const auto result_sum_0 =
  return result_sum_0 - result_sum_1;
template <class D, typename FromT,
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
  const VFromD<decltype(d2)> twice{v.raw};
#if HWY_IS_LITTLE_ENDIAN

template <class D, typename FromT,
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};

                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const DFromV<decltype(v)> d8;
  const Rebind<MakeWide<FromT>, decltype(d8)> d16;
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>
  return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};

template <class D, HWY_IF_F32_D(D)>
  const Rebind<uint16_t, decltype(df32)> du16;

template <class D, HWY_IF_F64_D(D)>
#if HWY_IS_LITTLE_ENDIAN

template <class D, HWY_IF_F64_D(D)>
#if HWY_S390X_HAVE_Z14
#if HWY_IS_LITTLE_ENDIAN

template <class D, HWY_IF_F64_D(D)>
#if HWY_S390X_HAVE_Z14
#if HWY_IS_LITTLE_ENDIAN
#if !HWY_S390X_HAVE_Z14
static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)

template <class D, HWY_IF_I64_D(D)>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  const __vector float raw_v =
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};

template <class D, HWY_IF_U64_D(D)>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  const __vector float raw_v =
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(raw_v))};

#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif

#if HWY_IS_LITTLE_ENDIAN
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackl(v.raw))};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
#if HWY_IS_LITTLE_ENDIAN

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
#if HWY_S390X_HAVE_Z14
  const __vector signed int raw_v =
#if HWY_IS_LITTLE_ENDIAN

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
#if HWY_S390X_HAVE_Z14
  const __vector unsigned int raw_v =
#if HWY_IS_LITTLE_ENDIAN

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  const __vector float raw_v =
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  const __vector float raw_v =
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(raw_v))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>

#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)

#if HWY_IS_LITTLE_ENDIAN
template <class D, class V>

template <class D, class V>

template <class D, class V>

template <class D, class V>

#if HWY_S390X_HAVE_Z14
template <class D, class V>

template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
    FromTypeTag, D d_to, V v) {

template <class D, class V, class FromTypeTag>

template <class D, class V>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
      __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
#else
  return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
#endif

template <class D, class V>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
      reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
          vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
#else
  return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(normalized_v.raw))};
#endif

#if HWY_S390X_HAVE_Z14
template <class D, class V>
      d_to, V{vec_sld(v.raw, v.raw, 4)});

template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
    FromTypeTag, D d_to, V v) {

template <class D, class V, class FromTypeTag>
template <class D, class V>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
#else
      __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
#endif

template <class D, class V>
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(normalized_v.raw))};
#else
      reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
          vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
#endif

  return VFromD<D>{vec_packsu(v.raw, v.raw)};

                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};

                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;

          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;

          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;

#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp))

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
#if HWY_COMPILER_GCC_ACTUAL
  return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
#elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
  const Rebind<uint32_t, D> du;
  const VFromD<decltype(du)> bits16{
      reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
#else
#error "Only define the function if we have a native implementation"
#endif
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
  __vector unsigned long long raw_result;
  __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
  return Vec128<uint64_t>{raw_result};
}

template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
  const Rebind<uint64_t, decltype(df16)> du64;
  const Full128<double> df64_full;
#if HWY_IS_LITTLE_ENDIAN
  const auto bits16_as_u64 =
#else
  const auto bits16_as_u64 =
#endif

template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
  const Rebind<uint64_t, decltype(df16)> du64;
  const Rebind<double, decltype(df16)> df64;
#if HWY_IS_LITTLE_ENDIAN
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64 =
#else
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
  const auto bits64_as_u64 =
#endif

#elif HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif
template <class DF32, HWY_IF_F32_D(DF32)>
  __vector float raw_f32_in_even;
  __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
  const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
  const Rebind<float, decltype(df16)> df32;

#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)

#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

template <class D, HWY_IF_BF16_D(D)>
    D dbf16, VFromD<Rebind<float, D>> v) {
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  using VU32 = __vector unsigned int;
      __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
template <class D, HWY_IF_BF16_D(D)>

  const Twice<decltype(d)> dt;

  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);

  const Twice<decltype(d)> dt;

  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);

  const Twice<decltype(d)> dt;

  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
4397#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4398template <
class D,
class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
4399 HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
4402 const Half<
decltype(dbf16)> dh_bf16;
4405 detail::VsxXvcvspbf16(dh_bf16, b)));
4409template <
class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
class V,
4410 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4411 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
4412 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
4417#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4418template <
class D, HWY_IF_BF16_D(D),
class V, HWY_IF_F32(TFromV<V>),
4419 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
4425template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4430template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4432#if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
4438#if HWY_S390X_HAVE_Z14
4439 const Twice<
decltype(
d)> dt;
4449template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
4451#if HWY_S390X_HAVE_Z14
4452 const Rebind<int64_t,
decltype(di32)> di64;
4456 return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
4460template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
4462#if HWY_S390X_HAVE_Z14
4463 const Rebind<int64_t,
decltype(di32)> di64;
4468#if HWY_IS_LITTLE_ENDIAN
4470 vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
4473 vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
4482template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
4484#if HWY_S390X_HAVE_Z14
4485 const Rebind<uint64_t,
decltype(du32)> du64;
4489 return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
4493template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
4495#if HWY_S390X_HAVE_Z14
4496 const Rebind<uint64_t,
decltype(du32)> du64;
4500#if HWY_IS_LITTLE_ENDIAN
4502 vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
4505 vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
#if HWY_S390X_HAVE_Z14
template <class V, HWY_IF_I64(TFromV<V>)>
  __vector double raw_result;
  __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));

template <class V, HWY_IF_U64(TFromV<V>)>
  __vector double raw_result;
  __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
4536template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4538#if HWY_S390X_HAVE_Z14
4546template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4548#if HWY_S390X_HAVE_Z14
4551#if HWY_IS_LITTLE_ENDIAN
4558 const Rebind<uint64_t,
decltype(df32)> du64;
4564template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4566#if HWY_S390X_HAVE_Z14
4574template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4576#if HWY_S390X_HAVE_Z14
4579#if HWY_IS_LITTLE_ENDIAN
4586 const Rebind<uint64_t,
decltype(df32)> du64;
4596 const Rebind<uint8_t,
decltype(du16)> du8;
4604#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4609 const Rebind<double,
decltype(df32)> df64;
4622template <
class D,
typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
4624 Vec128<FromT, Rebind<FromT, D>().
MaxLanes()> v) {
4626#if HWY_COMPILER_CLANG
4630 return
VFromD<D>{vec_float(v.raw)};
4641 Vec128<FromT, Rebind<FromT, D>().
MaxLanes()> v) {
4646#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4647template <
class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4650 const Rebind<int64_t,
decltype(di32)> di64;
4653template <
class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4655 Vec128<
float, Rebind<float, D>().
MaxLanes()> v) {
template <class D, HWY_IF_I32_D(D)>
                          Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  constexpr int32_t kMinI32 = LimitsMin<int32_t>();
  constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
      (v.raw[0] >= -2147483648.0f)
          ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
                                        : kMaxI32)
          : ((v.raw[0] < 0) ? kMinI32 : 0),
      (v.raw[1] >= -2147483648.0f)
          ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
                                        : kMaxI32)
          : ((v.raw[1] < 0) ? kMinI32 : 0),
      (v.raw[2] >= -2147483648.0f)
          ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
                                        : kMaxI32)
          : ((v.raw[2] < 0) ? kMinI32 : 0),
      (v.raw[3] >= -2147483648.0f)
          ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
                                        : kMaxI32)
          : ((v.raw[3] < 0) ? kMinI32 : 0));

#if HWY_S390X_HAVE_Z15
  __vector signed int raw_result;
  __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#if HWY_COMPILER_CLANG
  return VFromD<D>{vec_cts(v.raw, 0)};
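// Illustrative note (not part of the original header): ConvertTo from float
// to signed integer saturates out-of-range inputs and maps NaN to zero, and
// the constant-folding branch above mirrors that contract per lane.
// Conceptually (hypothetical scalar helper):
//   int32_t ScalarF32ToI32(float x) {
//     if (!(x >= -2147483648.0f)) return (x < 0) ? INT32_MIN : 0;  // NaN -> 0
//     if (x >= 2147483648.0f) return INT32_MAX;
//     return static_cast<int32_t>(x);
//   }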
template <class D, HWY_IF_I64_D(D)>
#if defined(__OPTIMIZE__)
  constexpr int64_t kMinI64 = LimitsMin<int64_t>();
  constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
      (v.raw[0] >= -9223372036854775808.0)
          ? ((v.raw[0] < 9223372036854775808.0) ? static_cast<int64_t>(v.raw[0])
                                                : kMaxI64)
          : ((v.raw[0] < 0) ? kMinI64 : 0LL),
      (v.raw[1] >= -9223372036854775808.0)
          ? ((v.raw[1] < 9223372036854775808.0) ? static_cast<int64_t>(v.raw[1])
                                                : kMaxI64)
          : ((v.raw[1] < 0) ? kMinI64 : 0LL));

  __vector signed long long raw_result;
#if HWY_S390X_HAVE_Z14
  __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#else
  __asm__("xvcvdpsxds %x0,%x1"
          : "=wa"(raw_result)
          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
#endif
4740#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4741template <
class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4743 Vec128<
float, Rebind<float, D>().
MaxLanes()> v) {
4744 const Rebind<uint64_t,
decltype(du32)> du64;
4747template <
class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4749 Vec128<
float, Rebind<float, D>().
MaxLanes()> v) {
4755template <
class D, HWY_IF_U32_D(D)>
4757 Vec128<
float, Rebind<float, D>().
MaxLanes()> v) {
4758#if defined(__OPTIMIZE__)
4760 constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
4764 ? ((v.raw[0] < 4294967296.0f) ?
static_cast<uint32_t
>(v.raw[0])
4768 ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
4772 ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
4776 ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
4782#if HWY_S390X_HAVE_Z15
4785 __vector
unsigned int raw_result;
4786 __asm__(
"vclfeb %0,%1,0,5" :
"=v"(raw_result) :
"v"(v.raw));
4790#if HWY_COMPILER_CLANG
4793 VFromD<D> result{vec_ctu(v.raw, 0)};
4800template <
class D, HWY_IF_U64_D(D)>
4802 Vec128<
double, Rebind<double, D>().
MaxLanes()> v) {
4804#if HWY_COMPILER_CLANG
4808#if defined(__OPTIMIZE__)
4810 constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
4813 (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
4814 ?
static_cast<uint64_t
>(v.raw[0])
4817 (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
4818 ? static_cast<uint64_t>(v.raw[1])
4826 __vector
unsigned long long raw_result;
4827#if HWY_S390X_HAVE_Z14
4828 __asm__(
"vclgdb %0,%1,0,5" :
"=v"(raw_result) :
"v"(v.raw));
4830 __asm__(
"xvcvdpuxds %x0,%x1"
4832 :
"wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4842 return Vec128<float, N>{vec_round(v.raw)};
4847#if HWY_S390X_HAVE_Z14
4862template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
4868template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
4874template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
4881template <
typename T,
size_t N>
4883 static_assert(IsFloat<T>(),
"Only for float");
4887template <
typename T,
size_t N>
4889 static_assert(IsFloat<T>(),
"Only for float");
4897 Eq(
Add(vu, vu),
Set(du,
static_cast<TU
>(hwy::MaxExponentTimes2<T>()))));
4901template <
typename T,
size_t N>
4903 static_assert(IsFloat<T>(),
"Only for float");
4911 Lt(
Add(vu, vu),
Set(du,
static_cast<TU
>(hwy::MaxExponentTimes2<T>()))));
#if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
using CipherTag = Full128<uint64_t>;
#else
using CipherTag = Full128<uint8_t>;
#endif
4935 Vec128<uint8_t> round_key) {
4936 const detail::CipherTag dc;
4937 const Full128<uint8_t> du8;
4938#if HWY_IS_LITTLE_ENDIAN
4940 BitCast(du8, detail::CipherVec{vec_cipher_be(
4944 return BitCast(du8, detail::CipherVec{vec_cipher_be(
4950 Vec128<uint8_t> round_key) {
4951 const detail::CipherTag dc;
4952 const Full128<uint8_t> du8;
4953#if HWY_IS_LITTLE_ENDIAN
4955 BitCast(du8, detail::CipherVec{vec_cipherlast_be(
4959 return BitCast(du8, detail::CipherVec{vec_cipherlast_be(
4965 Vec128<uint8_t> round_key) {
4966 const detail::CipherTag dc;
4967 const Full128<uint8_t> du8;
4968#if HWY_IS_LITTLE_ENDIAN
4974 return Xor(
BitCast(du8, detail::CipherVec{vec_ncipher_be(
4981 Vec128<uint8_t> round_key) {
4982 const detail::CipherTag dc;
4983 const Full128<uint8_t> du8;
4984#if HWY_IS_LITTLE_ENDIAN
4986 BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
4990 return BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
4996 const Full128<uint8_t> du8;
4997 const auto zero =
Zero(du8);
template <uint8_t kRcon>
  constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0,
                                                   0, 0, 0, 0, kRcon, 0, 0, 0};
  constexpr __vector unsigned char kRotWordShuffle = {
      4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
  const auto sub_word_result =
      BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)});
  const auto rot_word_result =
  return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask});
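// Illustrative usage sketch (not part of the original header): the ops above
// back Highway's AESRound/AESLastRound/AESKeyGenAssist. One round over a
// 16-byte block could look like this (block/round_key are assumed
// caller-provided 16-byte buffers):
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint8_t> du8;
//   auto state = hn::LoadU(du8, block);
//   state = hn::AESRound(state, hn::LoadU(du8, round_key));
//   hn::StoreU(state, du8, block);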
                                      Vec128<uint64_t, N> b) {
  const auto zero = Zero(d);
  using VU64 = __vector unsigned long long;
  const VU64 pmsum_result = reinterpret_cast<VU64>(
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif

                                      Vec128<uint64_t, N> b) {
  const auto zero = Zero(d);
  using VU64 = __vector unsigned long long;
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
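// Illustrative note (not part of the original header): vec_pmsum_be performs
// carry-less (polynomial) multiplication over pairs of 64-bit lanes; merging
// each operand with zero isolates the lower or upper halves for
// CLMulLower/CLMulUpper. Hedged usage sketch:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint64_t> du64;
//   const auto prod = hn::CLMulLower(hn::Set(du64, 0x87u), hn::Set(du64, 2u));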
template <class D, HWY_IF_T_SIZE_D(D, 1)>
#if HWY_IS_LITTLE_ENDIAN
      BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));

#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0,
                                        1, 1, 1, 1, 1, 1, 1, 1};
#else
  const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1,
                                        0, 0, 0, 0, 0, 0, 0, 0};
#endif

  const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128,
                                       1, 2, 4, 8, 16, 32, 64, 128};

template <class D, HWY_IF_T_SIZE_D(D, 2)>
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits =
      Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw};

template <class D, HWY_IF_T_SIZE_D(D, 4)>
  const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned int kBit = {1, 2, 4, 8};
  const auto vmask_bits =
      Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw};

template <class D, HWY_IF_T_SIZE_D(D, 8)>
  const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned long long kBit = {1, 2};
  const auto vmask_bits =
      Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw};
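// Illustrative usage sketch (not part of the original header): these helpers
// back the public LoadMaskBits, which expands packed bits (bit 0 = lane 0)
// into a vector mask:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint32_t> d;            // 4 lanes
//   const uint8_t bits[8] = {0b0101};         // lanes 0 and 2 active
//   const auto m = hn::LoadMaskBits(d, bits);
//   // hn::CountTrue(d, m) == 2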
template <class D, HWY_IF_LANES_LE_D(D, 8)>
  uint64_t mask_bits = bits[0];
  if (kN < 8) mask_bits &= (1u << kN) - 1;

template <class D, HWY_IF_LANES_D(D, 16)>
  uint16_t u16_mask_bits;
  CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);
#if HWY_IS_LITTLE_ENDIAN
#if HWY_HAS_BUILTIN(__builtin_bswap16)
      d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));

template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};

  if (kN < 8) mask_bits &= (1u << kN) - 1;
#if !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
                                    __vector unsigned char bit_shuffle) {
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
}
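// Illustrative note (not part of the original header): vec_vbpermq gathers
// the bits of sign_bits selected by bit_shuffle and packs them into a 64-bit
// lane; shuffle entries of 128 read as zero, which is how the narrower lane
// types below pad their shuffle constants.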
5249#if HWY_S390X_HAVE_Z14
5250template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
5252 const DFromM<
decltype(mask)>
d;
5257 du8,
And(sign_bits,
Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128,
5258 1, 2, 4, 8, 16, 32, 64, 128)));
5261template <
typename T>
5263 const DFromM<
decltype(mask)>
d;
5268 const auto mask_bytes =
SumsOf8(
5269 And(sign_bits,
Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
5270 4, 8, 16, 32, 64, 128)));
5272 const Rebind<uint8_t,
decltype(du64)> du8_2;
5273 const Repartition<uint16_t,
decltype(du8_2)> du16_1;
template <typename T, size_t N>
  const DFromM<decltype(mask)> d;
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
#else
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
                                              56,  48,  40,  32, 24, 16, 8,  0};
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif

template <typename T, size_t N>
  const DFromM<decltype(mask)> d;
#if HWY_S390X_HAVE_Z14
      du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128)));
#else
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
#else
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {
      112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {
      128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
#endif
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif
#endif

template <typename T, size_t N>
  const DFromM<decltype(mask)> d;
#if HWY_S390X_HAVE_Z14
#else
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
#else
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {96,  64,  32,  0,   128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              96,  64,  32,  0};
#endif
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif
#endif

template <typename T, size_t N>
  const DFromM<decltype(mask)> d;
#if HWY_S390X_HAVE_Z14
#else
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
#else
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {64,  0,   128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 64,  0};
#endif
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif
#endif

template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
}
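// Illustrative note (not part of the original header): OnlyActive clears mask
// bits beyond the N lanes of a partial vector. For example, with T = uint32_t
// and N = 2 (an 8-byte vector), mask_bits 0b1111 becomes 0b0011, while a full
// 16-byte vector is returned unchanged.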
template <typename T, size_t N>

template <class D, HWY_IF_LANES_LE_D(D, 8)>
  return sizeof(uint8_t);

template <class D, HWY_IF_LANES_D(D, 16)>
#if HWY_IS_LITTLE_ENDIAN
  const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits);
#else
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  const uint16_t u16_mask_bits =
      __builtin_bswap16(static_cast<uint16_t>(mask_bits));
#else
  const uint16_t u16_mask_bits = static_cast<uint16_t>(
      (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8));
#endif
#endif
  CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits);
  return sizeof(uint16_t);
5440template <
class D, HWY_IF_V_SIZE_D(D, 16)>
5443 return static_cast<bool>(
5447template <
class D, HWY_IF_V_SIZE_D(D, 16)>
5450 using TU =
TFromD<
decltype(du)>;
5452 Set(du, hwy::LimitsMax<TU>()).raw));
5455template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
5457 const Full128<TFromD<D>> d_full;
5463template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
5465 const Full128<TFromD<D>> d_full;
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)

#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  return static_cast<size_t>(vec_cntlz_lsbb(v.raw));
#endif

#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  return static_cast<size_t>(vec_cnttz_lsbb(v.raw));
#endif

template <class D, typename T = TFromD<D>>
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  return detail::VsxCntlzLsbb(bytes) / sizeof(T);

template <class D, typename T = TFromD<D>>
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T);
  return idx == kN ? -1 : static_cast<intptr_t>(idx);

template <class D, typename T = TFromD<D>>
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
  return 16 / sizeof(T) - 1 - idx;

template <class D, typename T = TFromD<D>>
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
  return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
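// Illustrative usage sketch (not part of the original header): the
// vctzlsbb/vclzlsbb helpers above accelerate the public FindFirstTrue /
// FindKnownFirstTrue ops:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint8_t> d;
//   const auto m = hn::Eq(hn::Iota(d, 0), hn::Set(d, 3));
//   const intptr_t pos = hn::FindFirstTrue(d, m);       // 3, or -1 if none
//   const size_t known = hn::FindKnownFirstTrue(d, m);  // 3, m known nonempty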
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 1)>
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvbm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};

template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvhm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};

template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvwm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};

template <class D, HWY_IF_T_SIZE_D(D, 2)>
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;

  alignas(16) static constexpr uint8_t table[2048] = {
5633 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5634 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5635 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5636 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5637 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5638 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5639 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5640 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5641 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5642 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5643 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5644 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5645 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5646 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5647 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5648 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5649 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5650 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5651 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5652 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5653 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5654 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5655 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5656 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5657 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5658 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5659 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5660 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5661 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5662 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5663 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5664 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5665 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5666 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5667 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5668 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5669 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5670 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5671 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5672 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5673 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5674 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5675 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5676 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5677 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5678 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5679 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5680 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5681 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5682 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5683 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5684 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5685 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5686 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5687 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5688 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5689 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5690 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5691 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5692 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5693 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5694 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5695 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5696 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5697 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5698 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5699 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5700 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5701 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5702 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5703 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5704 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5705 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5706 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5707 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5708 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5709 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5710 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5711 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5712 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5713 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5714 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5715 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5716 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5717 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5718 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5719 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5720 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5721 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5722 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5723 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5724 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5725 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5726 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5727 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5728 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5729 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5730 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5731 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5732 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5733 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5734 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5735 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5736 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5737 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5738 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5739 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5740 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5741 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5742 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5743 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5744 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5745 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5746 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5747 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5748 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5749 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5750 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5751 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5752 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5753 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5754 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5755 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5756 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5757 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5758 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5759 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5760 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  constexpr uint16_t kPairIndexIncrement =
      HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
  return BitCast(d, pairs + Set(du, kPairIndexIncrement));
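// Illustrative note (not part of the original header): mask_bits (one bit per
// 16-bit lane, 256 combinations) selects an 8-byte row of the table above;
// each row lists, in compressed order, the offset of the first byte of every
// lane to keep. The row is then expanded to byte pairs, and adding
// kPairIndexIncrement turns each pair into consecutive byte indices
// (idx, idx + 1) so that whole 16-bit lanes are shuffled.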
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;

  alignas(16) static constexpr uint8_t table[2048] = {
5782 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
5783 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
5784 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
5785 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5786 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
5787 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
5788 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
5789 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
5790 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
5791 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
5792 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
5793 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
5794 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
5795 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
5796 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
5797 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
5798 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
5799 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
5800 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
5801 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
5802 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
5803 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
5804 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
5805 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
5806 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
5807 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
5808 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
5809 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
5810 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
5811 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
5812 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
5813 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
5814 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
5815 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
5816 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
5817 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
5818 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
5819 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
5820 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
5821 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
5822 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
5823 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
5824 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
5825 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
5826 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
5827 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
5828 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
5829 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
5830 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
5831 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
5832 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
5833 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
5834 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
5835 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
5836 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
5837 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
5838 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
5839 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
5840 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
5841 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
5842 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
5843 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
5844 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
5845 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
5846 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
5847 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
5848 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
5849 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
5850 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
5851 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
5852 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
5853 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
5854 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
5855 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
5856 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
5857 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
5858 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
5859 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
5860 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
5861 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
5862 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
5863 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
5864 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
5865 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
5866 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
5867 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
5868 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
5869 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
5870 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
5871 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
5872 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
5873 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
5874 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
5875 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
5876 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
5877 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
5878 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
5879 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
5880 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
5881 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
5882 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
5883 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
5884 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
5885 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
5886 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
5887 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
5888 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
5889 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
5890 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
5891 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
5892 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
5893 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
5894 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
5895 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
5896 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
5897 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
5898 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
5899 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
5900 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
5901 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
5902 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
5903 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
5904 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
5905 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
5906 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
5907 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
5908 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
5909 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  constexpr uint16_t kPairIndexIncrement =
      HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
  return BitCast(d, pairs + Set(du, kPairIndexIncrement));
template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint8_t u8_indices[256] = {
5926 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5927 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5928 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
5929 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5930 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
5931 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
5932 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
5933 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5934 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5935 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
5936 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
5937 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5938 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5939 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
5940 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5941 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));

template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint8_t u8_indices[256] = {
5954 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5955 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5956 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5957 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5958 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5959 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5960 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5961 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5962 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5963 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5964 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5965 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5966 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5967 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));

template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint8_t u8_indices[64] = {
5981 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5982 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5983 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5984 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));

template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint8_t u8_indices[64] = {
5997 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5998 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5999 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6000 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>

template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>

template <typename T>

template <typename T, HWY_IF_T_SIZE(T, 8)>

#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
      v, detail::CompressOrExpandIndicesFromMask<true>(d, mask));

template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  return detail::CompressBits(v, detail::BitsFromMask(mask));
}

template <typename T>

template <typename T, HWY_IF_T_SIZE(T, 8)>
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskH, maskL);

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
      v, detail::CompressOrExpandIndicesFromMask<true>(d, Not(mask)));

template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  if (N < 16 / sizeof(T)) {
    return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
  }
  return detail::CompressNotBits(v, detail::BitsFromMask(mask));
}

HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /*mask*/) {
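// Illustrative usage sketch (not part of the original header): Compress moves
// the lanes selected by the mask to the front; CompressNot keeps the
// unselected lanes instead:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<int32_t> d;
//   const auto v = hn::Iota(d, 0);                                  // 0 1 2 3
//   const auto odd = hn::Eq(hn::And(v, hn::Set(d, 1)), hn::Set(d, 1));
//   const auto packed = hn::Compress(v, odd);   // 1 3 in the first two lanes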
6116template <
typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
6124template <
typename T,
size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
6129 uint64_t mask_bits = bits[0];
6131 mask_bits &= (1ull << N) - 1;
6140template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6144 const auto indices = detail::CompressOrExpandIndicesFromMask<true>(
d,
m);
6146 StoreU(compressed,
d, unaligned);
6151template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6158 const size_t count =
PopCount(mask_bits);
6162 StoreU(compressed,
d, unaligned);
6167template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6171 const auto indices = detail::CompressOrExpandIndicesFromMask<true>(
d,
m);
6173 StoreN(compressed,
d, unaligned, count);
6178template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6185 const size_t count =
PopCount(mask_bits);
6189#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
6190 StoreN(compressed,
d, unaligned, count);
6198template <
class D, HWY_IF_T_SIZE_D(D, 1)>
6205template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6212 uint64_t mask_bits = bits[0];
6215 mask_bits &= (1ull << kN) - 1;
6217 const size_t count =
PopCount(mask_bits);
6221 StoreU(compressed,
d, unaligned);
6228#ifdef HWY_NATIVE_EXPAND
6229#undef HWY_NATIVE_EXPAND
6231#define HWY_NATIVE_EXPAND
6234template <
typename T,
size_t N,
6236HWY_API Vec128<T, N>
Expand(Vec128<T, N> v, Mask128<T, N> mask) {
6238 const auto idx = detail::CompressOrExpandIndicesFromMask<false>(
d, mask);
6242template <
typename T, HWY_IF_T_SIZE(T, 8)>
6249template <
typename T>
6250HWY_API Vec128<T, 1>
Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
6254template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
#if HWY_IS_LITTLE_ENDIAN
template <class V, HWY_IF_T_SIZE_V(V, 1)>

template <class V, HWY_IF_T_SIZE_V(V, 2)>

template <class V, HWY_IF_T_SIZE_V(V, 4)>

template <class V, HWY_IF_T_SIZE_V(V, 8)>

#if HWY_S390X_HAVE_Z14
      vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
                   reinterpret_cast<__vector unsigned char>(b.raw)))};
#elif defined(__SIZEOF_INT128__)
  using VU128 = __vector unsigned __int128;
      vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
#else
  const auto u64_a = BitCast(du64, a);
  const auto u64_b = BitCast(du64, b);
  const auto diff_u64 = u64_a - u64_b;
  const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b);
#if HWY_IS_LITTLE_ENDIAN
  const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64);
#else
  const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64);
#endif
  const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted);
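// Illustrative note (not part of the original header): the generic fallback
// above subtracts 128-bit blocks via two 64-bit lanes. borrow_u64 is all-ones
// in a lane whose subtraction wrapped (a < b there); shifting it into the
// upper-lane position adds -1 to the upper half, which is exactly the borrow.
// E.g. (1 << 64) - 1: the low lane wraps to 0xFFFFFFFFFFFFFFFF and the high
// lane becomes 1 + (-1) = 0, giving 2^64 - 1.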
6345 const FixedTag<T, 2>
d;
6349template <
class T,
size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
6351 const Simd<T, N, 0>
d;
6352 const Full64<T> d_full64;
6355 const auto vmask_le64 =
6358 const auto neg_vmask_le64 =
Neg(vmask_le64);
6364template <
class T, HWY_IF_NOT_T_SIZE(T, 8)>
6376template <
class T,
size_t N>
6387 const FixedTag<T, 2>
d;
6391 const auto zero =
Zero(di);
6395template <
class T,
size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
6397 const Simd<T, N, 0>
d;
6398 const Full64<T> d_full64;
6402 const auto vmask_le64 =
6405 const auto neg_vmask_le64 =
Neg(vmask_le64);
6409 const auto first_vmask =
BitCast(di,
And(vmask, neg_vmask));
6412template <
class T, HWY_IF_NOT_T_SIZE(T, 8)>
6427 const FixedTag<T, 1>
d;
6429 using TI = MakeSigned<T>;
6433template <
class T,
size_t N, HWY_IF_LANES_GT(N, 1)>
6435 const Simd<T, N, 0>
d;
6442#if !HWY_S390X_HAVE_Z14
6446 __vector
signed int b) {
6447 const Repartition<int32_t, D> di32;
6450 const int64_t sum0 =
6451 static_cast<int64_t
>(a[0]) +
static_cast<int64_t
>(a[1]) +
6452 static_cast<int64_t
>(a[2]) +
static_cast<int64_t
>(a[3]) +
6453 static_cast<int64_t
>(b[0]);
6454 const int64_t sum1 =
6455 static_cast<int64_t
>(a[4]) +
static_cast<int64_t
>(a[5]) +
6456 static_cast<int64_t
>(a[6]) +
static_cast<int64_t
>(a[7]) +
6457 static_cast<int64_t
>(b[1]);
6458 const int64_t sum2 =
6459 static_cast<int64_t
>(a[8]) +
static_cast<int64_t
>(a[9]) +
6460 static_cast<int64_t
>(a[10]) +
static_cast<int64_t
>(a[11]) +
6461 static_cast<int64_t
>(b[2]);
6462 const int64_t sum3 =
6463 static_cast<int64_t
>(a[12]) +
static_cast<int64_t
>(a[13]) +
6464 static_cast<int64_t
>(a[14]) +
static_cast<int64_t
>(a[15]) +
6465 static_cast<int64_t
>(b[3]);
6466 const int32_t sign0 =
static_cast<int32_t
>(sum0 >> 63);
6467 const int32_t sign1 =
static_cast<int32_t
>(sum1 >> 63);
6468 const int32_t sign2 =
static_cast<int32_t
>(sum2 >> 63);
6469 const int32_t sign3 =
static_cast<int32_t
>(sum3 >> 63);
6473 VFromD<
decltype(di32)>{Raw{
6474 (sign0 == (sum0 >> 31)) ?
static_cast<int32_t
>(sum0)
6475 : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
6476 (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
6477 : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
6478 (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
6479 : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
6480 (sign3 == (sum3 >> 31))
6481 ? static_cast<int32_t>(sum3)
6482 : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
6493 __vector
unsigned int b) {
6494 const Repartition<uint32_t, D> du32;
6497 const uint64_t sum0 =
6498 static_cast<uint64_t
>(a[0]) +
static_cast<uint64_t
>(a[1]) +
6499 static_cast<uint64_t
>(a[2]) +
static_cast<uint64_t
>(a[3]) +
6500 static_cast<uint64_t
>(b[0]);
6501 const uint64_t sum1 =
6502 static_cast<uint64_t
>(a[4]) +
static_cast<uint64_t
>(a[5]) +
6503 static_cast<uint64_t
>(a[6]) +
static_cast<uint64_t
>(a[7]) +
6504 static_cast<uint64_t
>(b[1]);
6505 const uint64_t sum2 =
6506 static_cast<uint64_t
>(a[8]) +
static_cast<uint64_t
>(a[9]) +
6507 static_cast<uint64_t
>(a[10]) +
static_cast<uint64_t
>(a[11]) +
6508 static_cast<uint64_t
>(b[2]);
6509 const uint64_t sum3 =
6510 static_cast<uint64_t
>(a[12]) +
static_cast<uint64_t
>(a[13]) +
6511 static_cast<uint64_t
>(a[14]) +
static_cast<uint64_t
>(a[15]) +
6512 static_cast<uint64_t
>(b[3]);
6515 VFromD<
decltype(du32)>{(__vector
unsigned int){
6516 static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
6517 static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
6518 static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
6519 static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
6531 __vector
signed int b) {
6532 const Repartition<int32_t, D> di32;
6534 const Repartition<uint64_t, D> du64;
6535 constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
6537 __builtin_constant_p(b[kDestLaneOffset + 2])) {
6538 const int64_t sum0 =
static_cast<int64_t
>(a[0]) +
6539 static_cast<int64_t
>(a[1]) +
6540 static_cast<int64_t
>(b[kDestLaneOffset]);
6541 const int64_t sum1 =
static_cast<int64_t
>(a[2]) +
6542 static_cast<int64_t
>(a[3]) +
6543 static_cast<int64_t
>(b[kDestLaneOffset + 2]);
6544 const int32_t sign0 =
static_cast<int32_t
>(sum0 >> 63);
6545 const int32_t sign1 =
static_cast<int32_t
>(sum1 >> 63);
6546 return BitCast(
d,
VFromD<
decltype(du64)>{(__vector
unsigned long long){
6547 (sign0 == (sum0 >> 31))
6548 ?
static_cast<uint32_t
>(sum0)
6549 : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
6550 (sign1 == (sum1 >> 31))
6551 ? static_cast<uint32_t>(sum1)
6552 : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
6556 __vector
signed int sum;
6562 __asm__(
"vsum2sws %0,%1,%2" :
"=v"(sum) :
"v"(a),
"v"(b));
6571 __vector
signed int b) {
6572 const Repartition<int32_t, D> di32;
6575 const int64_t sum0 =
static_cast<int64_t
>(a[0]) +
6576 static_cast<int64_t
>(a[1]) +
6577 static_cast<int64_t
>(b[0]);
6578 const int64_t sum1 =
static_cast<int64_t
>(a[2]) +
6579 static_cast<int64_t
>(a[3]) +
6580 static_cast<int64_t
>(b[1]);
6581 const int64_t sum2 =
static_cast<int64_t
>(a[4]) +
6582 static_cast<int64_t
>(a[5]) +
6583 static_cast<int64_t
>(b[2]);
6584 const int64_t sum3 =
static_cast<int64_t
>(a[6]) +
6585 static_cast<int64_t
>(a[7]) +
6586 static_cast<int64_t
>(b[3]);
6587 const int32_t sign0 =
static_cast<int32_t
>(sum0 >> 63);
6588 const int32_t sign1 =
static_cast<int32_t
>(sum1 >> 63);
6589 const int32_t sign2 =
static_cast<int32_t
>(sum2 >> 63);
6590 const int32_t sign3 =
static_cast<int32_t
>(sum3 >> 63);
6594 VFromD<
decltype(di32)>{Raw{
6595 (sign0 == (sum0 >> 31)) ?
static_cast<int32_t
>(sum0)
6596 : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
6597 (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
6598 : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
6599 (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
6600 : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
6601 (sign3 == (sum3 >> 31))
6602 ? static_cast<int32_t>(sum3)
6603 : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
6614 __vector
signed int b) {
6615 const Repartition<int32_t, D> di32;
6617 constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6620 static_cast<int64_t
>(a[0]) +
static_cast<int64_t
>(a[1]) +
6621 static_cast<int64_t
>(a[2]) +
static_cast<int64_t
>(a[3]) +
6622 static_cast<int64_t
>(b[kDestLaneOffset]);
6623 const int32_t sign =
static_cast<int32_t
>(sum >> 63);
6624#if HWY_IS_LITTLE_ENDIAN
6626 d,
VFromD<
decltype(di32)>{(__vector
signed int){
6627 (sign == (sum >> 31)) ?
static_cast<int32_t
>(sum)
6628 : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
6633 (sign == (sum >> 31))
6634 ?
static_cast<int32_t
>(sum)
6635 : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
6640 __vector
signed int sum;
6646 __asm__(
"vsumsws %0,%1,%2" :
"=v"(sum) :
"v"(a),
"v"(b));
HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
                         Set(di32, 65536).raw);
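// Illustrative note (not part of the original header): vsum4shs sums signed
// 16-bit lanes, so the u16 input is biased by XOR with -32768 (subtracting
// 32768 from each lane when reinterpreted as signed) and the accumulator
// starts at 65536 = 2 * 32768, which restores the unsigned pairwise sum:
// (a - 32768) + (b - 32768) + 65536 == a + b, with no risk of saturation.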
6668#if HWY_S390X_HAVE_Z14
6669 return VFromD<
decltype(dw)>{vec_sum4(v.raw,
Zero(
d).raw)};
6671 return BitCast(dw, AltivecU16SumsOf2(v));
6682#if HWY_S390X_HAVE_Z14
6686 Set(dw, int32_t{-65536});
6688 return AltivecVsum4shs(dw, v.raw,
Zero(dw).raw);
6692#if HWY_S390X_HAVE_Z14
6699 return VFromD<
decltype(dw)>{vec_sum2(v.raw,
Zero(
d).raw)};
6712 Set(dw, int64_t{-4294967296LL});
6723#if HWY_S390X_HAVE_Z14
6724 return VFromD<
decltype(dw2)>{vec_sum4(v.raw,
Zero(
d).raw)};
6726 return AltivecVsum4ubs(dw2, v.raw,
Zero(dw2).raw);
6737#if HWY_S390X_HAVE_Z14
6741 Set(dw2, int32_t{-512});
6743 return AltivecVsum4sbs(dw2, v.raw,
Zero(dw2).raw);
6755#if HWY_S390X_HAVE_Z14
6756 return VFromD<
decltype(dw2)>{vec_sum2(v.raw,
Zero(
d).raw)};
6771#if HWY_S390X_HAVE_Z14
6775 Set(dw2, int64_t{-131072});
6777 const auto sums_of_4_in_lo32 =
6778 AltivecVsum2sws(dw,
SumsOf2(v).raw,
Zero(dw).raw);
6780#if HWY_IS_LITTLE_ENDIAN
6794#undef HWY_IF_SUM_OF_LANES_D
6795#if HWY_S390X_HAVE_Z14
6796#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
6798#define HWY_IF_SUM_OF_LANES_D(D) \
6799 HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
6802#if HWY_S390X_HAVE_Z14
6816template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
6822template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
6824 constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
6825 return Broadcast<kSumLaneIdx>(
6829template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6831 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6832 return Broadcast<kSumLaneIdx>(
6836template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6838 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6839#if HWY_S390X_HAVE_Z14
6840 return Broadcast<kSumLaneIdx>(
6845 return Broadcast<kSumLaneIdx>(
6846 detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
6850template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
6852#if HWY_S390X_HAVE_Z14
6856 constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
6857 return Broadcast<kSumLaneIdx>(
6862template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
6864#if HWY_S390X_HAVE_Z14
6868 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6869 return Broadcast<kSumLaneIdx>(
6874template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
6876#if HWY_S390X_HAVE_Z14
6880 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6882 const auto zero =
Zero(di32);
6883 return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6884 di16, detail::AltivecVsum4shs(di32, v.
raw, zero.raw).raw, zero.raw));
6888template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6890 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6891 return Broadcast<kSumLaneIdx>(
6895template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
6897 const Twice<
decltype(du8)> dt_u8;
6901template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6903 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6907template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6909 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
6911#if HWY_S390X_HAVE_Z14
6912 return Broadcast<kSumLaneIdx>(
6919 return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6920 du8, detail::AltivecVsum4ubs(di32, v.
raw, zero.raw).raw,
6925template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
6927#if HWY_S390X_HAVE_Z14
6931 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6932 return Broadcast<kSumLaneIdx>(
6937template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
6939 const Twice<
decltype(di8)> dt_i8;
6943template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
6945#if HWY_S390X_HAVE_Z14
6949 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6954template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
6956#if HWY_S390X_HAVE_Z14
6960 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
6963 return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6964 di8, detail::AltivecVsum4sbs(di32, v.
raw, zero.
raw).raw, zero.
raw));
6968#if HWY_S390X_HAVE_Z14
6969template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
6972 return Broadcast<1>(
6977template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
6989#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
6990#undef HWY_NATIVE_REDUCE_SUM_4_UI8
6992#define HWY_NATIVE_REDUCE_SUM_4_UI8
6995template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
template <class D, class V = VFromD<D>>
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
#if HWY_IS_LITTLE_ENDIAN
  const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
  const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
#else
  const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
  const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
#endif
  return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
#else
  const auto eqHL = Eq(a, b);
  const V ltLX = ShiftLeftLanes<1>(ltHL);
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
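// Illustrative note (not part of the original header): in the generic Lt128
// path above, eqHL/ltHL hold per-64-bit-lane Eq/Lt results. The 128-bit
// "a < b" verdict is the upper-lane Lt unless the upper lanes are equal, in
// which case the lower-lane Lt decides; ShiftLeftLanes<1> moves that
// lower-lane result into the upper position and IfThenElse selects between
// them, after which the verdict is typically broadcast to both lanes of the
// block (not shown in the fragment above).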
template <class D, class V = VFromD<D>>
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  return And(eqHL, eqLH);
#endif

template <class D, class V = VFromD<D>>
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  return Or(neHL, neLH);
#endif
7080template <
class D,
class V = VFromD<D>>
7086template <
class D,
class V = VFromD<D>>
7092template <
class D,
class V = VFromD<D>>
7100template <
class D,
class V = VFromD<D>>
7105template <
class D,
class V = VFromD<D>>
7110template <
class D,
class V = VFromD<D>>
7115template <
class D,
class V = VFromD<D>>
7120template <
class D,
class V = VFromD<D>>
7125template <
class D,
class V = VFromD<D>>
7133template <
class D,
class V = VFromD<D>>
7138template <
class D,
class V = VFromD<D>>
7143template <
class D,
class V = VFromD<D>>
7148template <
class D,
class V = VFromD<D>>
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
#if HWY_S390X_HAVE_Z14
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  __asm__("" : "+v"(v.raw));
#endif
#endif
  return V{vec_cntlz(v.raw)};

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  using T = TFromD<decltype(d)>;

#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return V{vec_vctz(v.raw)};
#else
#if HWY_S390X_HAVE_Z14
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  __asm__("" : "+v"(v.raw));
#endif
#endif
  return V{vec_cnttz(v.raw)};
#endif
#else
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  using TI = TFromD<decltype(di)>;
  const auto vi = BitCast(di, v);
  const auto lowest_bit = And(vi, Neg(vi));
  constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
                     Set(di, kNumOfBitsInT), bit_idx));
#endif

#undef HWY_PPC_HAVE_9
#undef HWY_PPC_HAVE_10
#undef HWY_S390X_HAVE_Z14
#undef HWY_S390X_HAVE_Z15
Definition arm_neon-inl.h:1634
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, Mask128< T > mask)
Definition arm_neon-inl.h:8141
HWY_INLINE Vec128< float16_t, N > ConcatEven(Vec128< float16_t, N > hi, Vec128< float16_t, N > lo)
Definition arm_neon-inl.h:7002
HWY_INLINE VFromD< Repartition< uint8_t, D > > IndicesFromVecBroadcastLaneBytes(D d)
Definition arm_neon-inl.h:5661
HWY_INLINE V I128Subtract(V a, V b)
Definition ppc_vsx-inl.h:6306
HWY_INLINE VFromD< D > Iota0(D d)
Definition arm_neon-inl.h:1239
HWY_INLINE VFromD< Repartition< uint8_t, D > > IndicesFromVecByteOffsets(D d)
Definition arm_neon-inl.h:5695
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6088
static HWY_INLINE VFromD< DF32 > DemoteToF32WithRoundToOdd(DF32 df32, VFromD< Rebind< double, DF32 > > v)
Definition ppc_vsx-inl.h:4248
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:1269
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE VFromD< D > PromoteOddTo(hwy::FloatTag to_type_tag, hwy::SizeTag< 4 > to_lane_size_tag, hwy::FloatTag from_type_tag, D d_to, svfloat16_t v)
Definition arm_sve-inl.h:4419
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6031
HWY_INLINE V Ne128UpperVec(D d, V a, V b)
Definition ppc_vsx-inl.h:7093
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag, hwy::SizeTag< 1 >, svint8_t v)
Definition arm_sve-inl.h:982
HWY_INLINE VFromD< RebindToFloat< DFromV< V > > > ConvToF64WithRoundToOdd(V v)
Definition ppc_vsx-inl.h:4518
HWY_INLINE VFromD< D > PromoteEvenTo(hwy::SignedTag, hwy::SizeTag< 2 >, hwy::SignedTag, D d_to, svint8_t v)
Definition arm_sve-inl.h:4334
HWY_API Vec128< T > InterleaveUpper(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6086
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:8276
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition ops/shared-inl.h:325
HWY_API Vec128< T, N > CompressBits(Vec128< T, N > v, uint64_t mask_bits)
Definition ppc_vsx-inl.h:6007
HWY_API Vec128< T, N > CompressNotBits(Vec128< T, N > v, uint64_t mask_bits)
Definition ppc_vsx-inl.h:6017
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6076
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1556
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:2478
HWY_INLINE VFromD< D > IndicesFromNotBits128(D d, uint64_t mask_bits)
Definition ppc_vsx-inl.h:5771
HWY_INLINE V Lt128UpperVec(D d, V a, V b)
Definition ppc_vsx-inl.h:7081
HWY_INLINE VFromD< D > Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0)
Definition ppc_vsx-inl.h:2712
hwy::If<(!hwy::IsFloat< TFromD< D > >() &&!hwy::IsSpecialFloat< TFromD< D > >()), RebindToUnsigned< D >, D > RebindToUnsignedIfNotFloat
Definition ppc_vsx-inl.h:1390
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API VFromD< DI32 > SatWidenMulPairwiseAccumulate(DI32 di32, VFromD< Repartition< int16_t, DI32 > > a, VFromD< Repartition< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition generic_ops-inl.h:5179
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API VFromD< D > LoadNOr(VFromD< D > no, D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1362
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API void StoreN(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_store)
Definition emu128-inl.h:1398
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
RepartitionToWide< RepartitionToWideX2< D > > RepartitionToWideX3
Definition ops/shared-inl.h:483
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API Vec128< uint8_t > AESLastRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7447
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API V Rol(V a, V b)
Definition generic_ops-inl.h:445
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:858
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
Vec128< T, 2/sizeof(T)> Vec16
Definition arm_neon-inl.h:861
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API VFromD< RepartitionToWideX2< DFromV< V > > > SumsOf4(V v)
Definition generic_ops-inl.h:3733
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API V ReverseLaneBytes(V v)
Definition generic_ops-inl.h:6386
HWY_API VFromD< D > PromoteLowerTo(D d, V v)
Definition generic_ops-inl.h:2984
HWY_API V RotateRightSame(V v, int bits)
Definition generic_ops-inl.h:601
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:855
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API Vec1< MakeWide< T > > SumsOf2(const Vec1< T > v)
Definition scalar-inl.h:549
Simd< typename M::PrivateT, M::kPrivateN, 0 > DFromM
Definition arm_neon-inl.h:888
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API V IfNegativeThenElseZero(V v, V yes)
Definition generic_ops-inl.h:241
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API V Ror(V a, V b)
Definition generic_ops-inl.h:459
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API VFromD< DN > OrderedTruncate2To(DN dn, V a, V b)
Definition emu128-inl.h:1978
HWY_API Vec128< uint8_t > AESRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7418
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API Vec1< T > operator%(Vec1< T > a, Vec1< T > b)
Definition generic_ops-inl.h:5095
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V RotateLeftSame(V v, int bits)
Definition generic_ops-inl.h:588
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API V IfNegativeThenZeroElse(V v, V no)
Definition generic_ops-inl.h:256
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2551
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From &val)
Definition base.h:1024
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:346
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
HWY_API constexpr bool IsSpecialFloat()
Definition base.h:832
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2588
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
HWY_API size_t PopCount(T x)
Definition base.h:2615
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_UI32_D(D)
Definition ops/shared-inl.h:591
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_LANES_GT_D(D, lanes)
Definition ops/shared-inl.h:562
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_F64_D(D)
Definition ops/shared-inl.h:601
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_NOT_FLOAT_D(D)
Definition ops/shared-inl.h:536
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_S390X_HAVE_Z15
Definition ppc_vsx-inl.h:67
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE)
Definition ppc_vsx-inl.h:89
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T)
Definition ppc_vsx-inl.h:1607
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T)
Definition ppc_vsx-inl.h:1605
#define HWY_AFTER_NAMESPACE()
Definition set_macros-inl.h:633
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
@ value
Definition arm_neon-inl.h:8429
Definition arm_neon-inl.h:5654
__vector unsigned char raw
Definition ppc_vsx-inl.h:2322
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
Definition ops/shared-inl.h:198
Definition x86_128-inl.h:67
__v128_u type
Definition wasm_128-inl.h:60
int VFromD
Definition tuple-inl.h:25