26#ifndef HWY_SVE_IS_POW2
27#define HWY_SVE_IS_POW2 1
30#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
31#define HWY_SVE_HAVE_2 1
33#define HWY_SVE_HAVE_2 0
38#if HWY_ARM_HAVE_SCALAR_BF16_TYPE && defined(__ARM_FEATURE_SVE_BF16)
39#define HWY_SVE_HAVE_BF16_FEATURE 1
41#define HWY_SVE_HAVE_BF16_FEATURE 0
46#if HWY_SVE_HAVE_BF16_FEATURE || \
47 (HWY_COMPILER_CLANG >= 1200 && defined(__ARM_FEATURE_SVE_BF16)) || \
48 HWY_COMPILER_GCC_ACTUAL >= 1000
49#define HWY_SVE_HAVE_BF16_VEC 1
51#define HWY_SVE_HAVE_BF16_VEC 0
57#if HWY_SVE_HAVE_BF16_VEC && defined(__ARM_FEATURE_SVE_BF16)
58#define HWY_SVE_HAVE_F32_TO_BF16C 1
60#define HWY_SVE_HAVE_F32_TO_BF16C 0
// Iterator macros that expand X_MACRO exactly once per lane type, with the
// argument list (BASE, CHAR, BITS, HALF, NAME, OP):
//   BASE - type stem used to build names such as BASE##BITS##_t (uint32_t).
//   CHAR - intrinsic suffix letter(s): u, s, f, bf.
//   BITS - lane width in bits.
//   HALF - width in bits of the corresponding narrower ("half") lane type;
//          equals BITS/2 except for the narrowest types here (8-bit ints,
//          16-bit floats), which repeat their own width.
//   NAME, OP - forwarded unchanged; typically the generated function name and
//          the SVE intrinsic stem.
// (Fix: stray line-number digits fused into each directive made them invalid
// preprocessor lines; removed.)

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Floating-point:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

// bfloat16: always expands. Prefer HWY_SVE_FOREACH_BF16 (defined elsewhere in
// this file) for expansion that is gated on compiler bf16 support.
#define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP) \
  X_MACRO(bfloat, bf, 16, 16, NAME, OP)
109#if HWY_SVE_HAVE_BF16_FEATURE
110#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
111 HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
119#define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
120#define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
122#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
123#define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
124#define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
// Category-level iterators composed from the per-type FOREACH macros.
// (Fix: stray line-number digits fused into each directive made them invalid
// preprocessor lines; removed.)

// All unsigned integer lane types (u8, u16, u32, u64).
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

// All signed integer lane types (i8, i16, i32, i64).
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

// f32 and f64 only — for ops that have no f16 form.
#define HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)         \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// All floating-point lane types (f16, f32, f64).
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
// Commonly used combinations of the category iterators above.
// (Fix: stray line-number digits fused into each directive made them invalid
// preprocessor lines; removed.)

// Unsigned + signed of one width:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

// 32/64-bit integers plus f32/f64 (no 8/16-bit lanes, no f16/bf16).
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)

// All integer lane types, unsigned and signed.
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

// Signed integer and floating-point lane types.
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Every lane type handled by the per-type macros except bf16 (which is
// expanded separately, subject to compiler support).
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
// Name-building helpers used throughout the X-macro expansions:
//   HWY_SVE_T(uint, 32)          -> uint32_t        (scalar lane type)
//   HWY_SVE_D(uint, 32, N, P)    -> Simd<uint32_t, N, P>  (descriptor tag)
//   HWY_SVE_V(uint, 32)          -> svuint32_t      (SVE vector type)
//   HWY_SVE_TUPLE(uint, 32, 2)   -> svuint32x2_t    (SVE tuple of MUL vectors)
// (Fix: stray line-number digits fused into each directive made them invalid
// preprocessor lines; removed.)
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
#define HWY_SVE_TUPLE(BASE, BITS, MUL) sv##BASE##BITS##x##MUL##_t
195#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
197 struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
198 using type = ScalableTag<HWY_SVE_T(BASE, BITS)>; \
202#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
211#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP) \
212 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
213 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
215#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP) \
216 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
217 return sv##OP##_##CHAR##BITS(v); \
221#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
222 HWY_API HWY_SVE_V(BASE, BITS) \
223 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
224 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
226#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \
227 HWY_API HWY_SVE_V(BASE, BITS) \
228 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
229 return sv##OP##_##CHAR##BITS(a, b); \
233#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
234 HWY_API HWY_SVE_V(BASE, BITS) \
235 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
236 return sv##OP##_##CHAR##BITS(a, b); \
239#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
240 HWY_API HWY_SVE_V(BASE, BITS) \
241 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
242 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
247#define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP) \
248 HWY_API HWY_SVE_V(BASE, BITS) \
249 NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
250 return sv##OP##_##CHAR##BITS##_x(m, a, b); \
253#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
254 HWY_API HWY_SVE_V(BASE, BITS) \
255 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
256 HWY_SVE_V(BASE, BITS) c) { \
257 return sv##OP##_##CHAR##BITS(a, b, c); \
265template <
typename T, HWY_IF_T_SIZE(T, 1)>
267 return svcntb_pat(SV_ALL);
269template <
typename T, HWY_IF_T_SIZE(T, 2)>
271 return svcnth_pat(SV_ALL);
273template <
typename T, HWY_IF_T_SIZE(T, 4)>
275 return svcntw_pat(SV_ALL);
277template <
typename T, HWY_IF_T_SIZE(T, 8)>
279 return svcntd_pat(SV_ALL);
285#define HWY_SVE_ALL_PTRUE(BITS) svptrue_b##BITS()
286#define HWY_SVE_PTRUE(BITS) svptrue_b##BITS()
288#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
289#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
298template <
typename T,
size_t N,
int kPow2>
300 const size_t actual = detail::AllHardwareLanes<T>();
301 constexpr size_t kMaxLanes =
MaxLanes(
d);
302 constexpr int kClampedPow2 =
HWY_MIN(kPow2, 0);
315#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \
316 template <size_t N, int kPow2> \
317 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \
318 const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
319 return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
322#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
326template <
class D, HWY_SVE_IF_EMULATED_D(D)>
338#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
339 template <size_t N, int kPow2> \
340 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
341 return HWY_SVE_PTRUE(BITS); \
343 template <size_t N, int kPow2> \
344 HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
345 return HWY_SVE_ALL_PTRUE(BITS); \
350#undef HWY_SVE_WRAP_PTRUE
365#ifdef HWY_NATIVE_MASK_FALSE
366#undef HWY_NATIVE_MASK_FALSE
368#define HWY_NATIVE_MASK_FALSE
380#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
381 template <size_t N, int kPow2> \
382 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
383 HWY_SVE_T(BASE, BITS) arg) { \
384 return sv##OP##_##CHAR##BITS(arg); \
388#if HWY_SVE_HAVE_BF16_FEATURE
390#elif HWY_SVE_HAVE_BF16_VEC
392template <
class D, HWY_IF_BF16_D(D)>
394 return svreinterpret_bf16_u16(
399template <
class D, HWY_IF_BF16_D(D)>
402 return Set(du, BitCastScalar<uint16_t>(arg));
408using VFromD =
decltype(
Set(D(), TFromD<D>()));
426#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \
427 HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
430 template <size_t N, int kPow2> \
431 HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
432 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
437#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \
438 HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
439 return sv##OP##_u8_##CHAR##BITS(v); \
441 template <size_t N, int kPow2> \
442 HWY_INLINE HWY_SVE_V(BASE, BITS) \
443 BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) , svuint8_t v) { \
444 return sv##OP##_##CHAR##BITS##_u8(v); \
455#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
458template <
class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
464template <
class D, HWY_SVE_IF_EMULATED_D(D)>
471#undef HWY_SVE_CAST_NOP
476template <
class D,
class FromV>
483#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
484 template <size_t N, int kPow2> \
485 HWY_API HWY_SVE_V(BASE, BITS) \
486 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
487 return sv##OP##_##CHAR##BITS(); \
491#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
495template <
class D, HWY_SVE_IF_EMULATED_D(D)>
504#define HWY_SVE_CREATE(BASE, CHAR, BITS, HALF, NAME, OP) \
505 template <size_t N, int kPow2> \
506 HWY_API HWY_SVE_TUPLE(BASE, BITS, 2) \
507 NAME##2(HWY_SVE_D(BASE, BITS, N, kPow2) , \
508 HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1) { \
509 return sv##OP##2_##CHAR##BITS(v0, v1); \
511 template <size_t N, int kPow2> \
512 HWY_API HWY_SVE_TUPLE(BASE, BITS, 3) NAME##3( \
513 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v0, \
514 HWY_SVE_V(BASE, BITS) v1, HWY_SVE_V(BASE, BITS) v2) { \
515 return sv##OP##3_##CHAR##BITS(v0, v1, v2); \
517 template <size_t N, int kPow2> \
518 HWY_API HWY_SVE_TUPLE(BASE, BITS, 4) \
519 NAME##4(HWY_SVE_D(BASE, BITS, N, kPow2) , \
520 HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
521 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3) { \
522 return sv##OP##4_##CHAR##BITS(v0, v1, v2, v3); \
526#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
538#define HWY_SVE_GET(BASE, CHAR, BITS, HALF, NAME, OP) \
539 template <size_t kIndex> \
540 HWY_API HWY_SVE_V(BASE, BITS) NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple) { \
541 return sv##OP##2_##CHAR##BITS(tuple, kIndex); \
543 template <size_t kIndex> \
544 HWY_API HWY_SVE_V(BASE, BITS) NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple) { \
545 return sv##OP##3_##CHAR##BITS(tuple, kIndex); \
547 template <size_t kIndex> \
548 HWY_API HWY_SVE_V(BASE, BITS) NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple) { \
549 return sv##OP##4_##CHAR##BITS(tuple, kIndex); \
553#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
558#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
559 template <size_t kIndex> \
560 HWY_API HWY_SVE_TUPLE(BASE, BITS, 2) \
561 NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(BASE, BITS) vec) { \
562 return sv##OP##2_##CHAR##BITS(tuple, kIndex, vec); \
564 template <size_t kIndex> \
565 HWY_API HWY_SVE_TUPLE(BASE, BITS, 3) \
566 NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple, HWY_SVE_V(BASE, BITS) vec) { \
567 return sv##OP##3_##CHAR##BITS(tuple, kIndex, vec); \
569 template <size_t kIndex> \
570 HWY_API HWY_SVE_TUPLE(BASE, BITS, 4) \
571 NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple, HWY_SVE_V(BASE, BITS) vec) { \
572 return sv##OP##4_##CHAR##BITS(tuple, kIndex, vec); \
576#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
584template <
class D,
class FromV>
591template <
class D, HWY_IF_I8_D(D)>
599 return svdupq_n_s8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
603template <
class D, HWY_IF_U8_D(D)>
611 return svdupq_n_u8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
615template <
class D, HWY_IF_I16_D(D)>
620 return svdupq_n_s16(t0, t1, t2, t3, t4, t5, t6, t7);
623template <
class D, HWY_IF_U16_D(D)>
628 return svdupq_n_u16(t0, t1, t2, t3, t4, t5, t6, t7);
631template <
class D, HWY_IF_F16_D(D)>
636 return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
639template <
class D, HWY_IF_BF16_D(D)>
643#if HWY_SVE_HAVE_BF16_FEATURE
645 return svdupq_n_bf16(t0, t1, t2, t3, t4, t5, t6, t7);
650 du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
651 BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
652 BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
653 BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
657template <
class D, HWY_IF_I32_D(D)>
660 return svdupq_n_s32(t0, t1, t2, t3);
663template <
class D, HWY_IF_U32_D(D)>
666 return svdupq_n_u32(t0, t1, t2, t3);
669template <
class D, HWY_IF_F32_D(D)>
672 return svdupq_n_f32(t0, t1, t2, t3);
675template <
class D, HWY_IF_I64_D(D)>
677 return svdupq_n_s64(t0, t1);
680template <
class D, HWY_IF_U64_D(D)>
682 return svdupq_n_u64(t0, t1);
685template <
class D, HWY_IF_F64_D(D)>
687 return svdupq_n_f64(t0, t1);
705template <
class V, HWY_IF_FLOAT_V(V)>
720template <
class V, HWY_IF_FLOAT_V(V)>
735template <
class V, HWY_IF_FLOAT_V(V)>
745#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
746 HWY_API HWY_SVE_V(BASE, BITS) \
747 NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
748 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
752#undef HWY_SVE_RETV_ARGPVN_SWAP
755#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
756 HWY_API HWY_SVE_V(BASE, BITS) \
757 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
758 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
761#undef HWY_SVE_RETV_ARGPVV_SWAP
763template <
class V, HWY_IF_FLOAT_V(V)>
776template <
class V, HWY_IF_FLOAT_V(V)>
786 return Xor(x1,
Xor(x2, x3));
793 return Or(o1,
Or(o2, o3));
799 return Or(o,
And(a1, a2));
804#ifdef HWY_NATIVE_POPCNT
805#undef HWY_NATIVE_POPCNT
807#define HWY_NATIVE_POPCNT
811#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \
812 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
813 return BitCast(DFromV<decltype(v)>(), \
814 sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
827 using TU =
TFromD<
decltype(du)>;
833#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
834#undef HWY_NATIVE_SATURATED_NEG_8_16_32
836#define HWY_NATIVE_SATURATED_NEG_8_16_32
839#ifdef HWY_NATIVE_SATURATED_NEG_64
840#undef HWY_NATIVE_SATURATED_NEG_64
842#define HWY_NATIVE_SATURATED_NEG_64
853#ifdef HWY_NATIVE_SATURATED_ABS
854#undef HWY_NATIVE_SATURATED_ABS
856#define HWY_NATIVE_SATURATED_ABS
865#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
866#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
868#define HWY_NATIVE_OPERATOR_REPLACEMENTS
883#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \
884 HWY_API HWY_SVE_V(BASE, BITS) \
885 NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
886 return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
890#undef HWY_SVE_RETV_ARGPVN_MASK
899 const svbool_t pg = detail::PTrue(du64);
901 const svuint32_t sums_of_4 = svdot_n_u32(
Zero(du32), v, 1);
905 return svadalp_u64_x(pg,
Zero(du64), sums_of_4);
907 const svuint64_t hi = svlsr_n_u64_x(pg,
BitCast(du64, sums_of_4), 32);
909 const svuint64_t lo = svextw_u64_x(pg,
BitCast(du64, sums_of_4));
917 const svbool_t pg = detail::PTrue(di64);
919 const svint32_t sums_of_4 = svdot_n_s32(
Zero(di32), v, 1);
921 return svadalp_s64_x(pg,
Zero(di64), sums_of_4);
923 const svint64_t hi = svasr_n_s64_x(pg,
BitCast(di64, sums_of_4), 32);
925 const svint64_t lo = svextw_s64_x(pg,
BitCast(di64, sums_of_4));
937 const svbool_t pg = detail::PTrue(di16);
938 return svadalp_s16_x(pg,
Zero(di16), v);
944 const svbool_t pg = detail::PTrue(du16);
945 return svadalp_u16_x(pg,
Zero(du16), v);
951 const svbool_t pg = detail::PTrue(di32);
952 return svadalp_s32_x(pg,
Zero(di32), v);
958 const svbool_t pg = detail::PTrue(du32);
959 return svadalp_u32_x(pg,
Zero(du32), v);
965 const svbool_t pg = detail::PTrue(di64);
966 return svadalp_s64_x(pg,
Zero(di64), v);
972 const svbool_t pg = detail::PTrue(du64);
973 return svadalp_u64_x(pg,
Zero(du64), v);
1006#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
1007#undef HWY_NATIVE_I32_SATURATED_ADDSUB
1009#define HWY_NATIVE_I32_SATURATED_ADDSUB
1012#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
1013#undef HWY_NATIVE_U32_SATURATED_ADDSUB
1015#define HWY_NATIVE_U32_SATURATED_ADDSUB
1018#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
1019#undef HWY_NATIVE_I64_SATURATED_ADDSUB
1021#define HWY_NATIVE_I64_SATURATED_ADDSUB
1024#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
1025#undef HWY_NATIVE_U64_SATURATED_ADDSUB
1027#define HWY_NATIVE_U64_SATURATED_ADDSUB
1037#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
1038#undef HWY_NATIVE_INTEGER_ABS_DIFF
1040#define HWY_NATIVE_INTEGER_ABS_DIFF
1047#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1048 template <int kBits> \
1049 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1050 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
1052 HWY_API HWY_SVE_V(BASE, BITS) \
1053 NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) { \
1054 return sv##OP##_##CHAR##BITS##_x( \
1055 HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
1065#undef HWY_SVE_SHIFT_N
1071#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1072 template <int kBits> \
1073 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1074 if (kBits == 0) return v; \
1075 return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
1076 HWY_MAX(kBits, 1)); \
1082#undef HWY_SVE_ROTATE_RIGHT_N
1085template <
int kBits,
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
1090 constexpr size_t kSizeInBits =
sizeof(TFromV<V>) * 8;
1091 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
1092 if (kBits == 0)
return v;
1101#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
1102 HWY_API HWY_SVE_V(BASE, BITS) \
1103 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
1104 const RebindToUnsigned<DFromV<decltype(v)>> du; \
1105 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
1106 BitCast(du, bits)); \
1131#ifdef HWY_NATIVE_MUL_8
1132#undef HWY_NATIVE_MUL_8
1134#define HWY_NATIVE_MUL_8
1136#ifdef HWY_NATIVE_MUL_64
1137#undef HWY_NATIVE_MUL_64
1139#define HWY_NATIVE_MUL_64
1150 return svqrdmulh_s16(a, b);
1155 const svuint16_t lo =
BitCast(du,
Mul(a, b));
1156 const svint16_t hi =
MulHigh(a, b);
1160 const svuint16_t lo_top2 = ShiftRight<14>(lo);
1162 const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
1168#ifdef HWY_NATIVE_INT_DIV
1169#undef HWY_NATIVE_INT_DIV
1171#define HWY_NATIVE_INT_DIV
1179#ifdef HWY_NATIVE_F64_APPROX_RECIP
1180#undef HWY_NATIVE_F64_APPROX_RECIP
1182#define HWY_NATIVE_F64_APPROX_RECIP
1191#ifdef HWY_NATIVE_F64_APPROX_RSQRT
1192#undef HWY_NATIVE_F64_APPROX_RSQRT
1194#define HWY_NATIVE_F64_APPROX_RSQRT
1202#ifdef HWY_NATIVE_INT_FMA
1203#undef HWY_NATIVE_INT_FMA
1205#define HWY_NATIVE_INT_FMA
1208#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
1209 HWY_API HWY_SVE_V(BASE, BITS) \
1210 NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
1211 HWY_SVE_V(BASE, BITS) add) { \
1212 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
1238template <
class D,
typename MFrom>
1252 return svand_b_z(b, b, a);
1255 return svbic_b_z(b, b, a);
1258 return svsel_b(a, a, b);
1261 return svsel_b(a, svnand_b_z(a, a, b), b);
1270#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
1271 template <size_t N, int kPow2> \
1272 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
1273 return sv##OP##_b##BITS(detail::MakeMask(d), m); \
1277#undef HWY_SVE_COUNT_TRUE
1282#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \
1283 template <size_t N, int kPow2> \
1284 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
1285 return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
1289#undef HWY_SVE_COUNT_TRUE_FULL
1309 :
static_cast<intptr_t
>(
1320#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
1321 HWY_API HWY_SVE_V(BASE, BITS) \
1322 NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
1323 return sv##OP##_##CHAR##BITS(m, yes, no); \
1328#undef HWY_SVE_IF_THEN_ELSE
1330template <
class V,
class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1339template <
class V,
class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
1344template <
class V,
class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1346 const RebindToUnsigned<D> du;
1352template <
class V,
class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
1357template <
class V,
class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1359 const RebindToUnsigned<D> du;
1386#ifdef HWY_NATIVE_PROMOTE_MASK_TO
1387#undef HWY_NATIVE_PROMOTE_MASK_TO
1389#define HWY_NATIVE_PROMOTE_MASK_TO
1392template <
class DTo,
class DFrom,
1395 return svunpklo_b(
m);
1398template <
class DTo,
class DFrom,
1401 using TFrom = TFromD<DFrom>;
1403 static_assert(
sizeof(TWFrom) >
sizeof(TFrom),
1404 "sizeof(TWFrom) > sizeof(TFrom) must be true");
1406 const Rebind<TWFrom,
decltype(d_from)> dw_from;
1412#ifdef HWY_NATIVE_DEMOTE_MASK_TO
1413#undef HWY_NATIVE_DEMOTE_MASK_TO
1415#define HWY_NATIVE_DEMOTE_MASK_TO
1421 return svuzp1_b8(
m,
m);
1427 return svuzp1_b16(
m,
m);
1433 return svuzp1_b32(
m,
m);
1436template <
class DTo,
class DFrom,
1441 static_assert(
sizeof(TNFrom) <
sizeof(TFrom),
1442 "sizeof(TNFrom) < sizeof(TFrom) must be true");
1444 const Rebind<TNFrom,
decltype(d_from)> dn_from;
1449#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
1450#undef HWY_NATIVE_LOWER_HALF_OF_MASK
1452#define HWY_NATIVE_LOWER_HALF_OF_MASK
1462#ifdef HWY_NATIVE_MASKED_ARITH
1463#undef HWY_NATIVE_MASKED_ARITH
1465#define HWY_NATIVE_MASKED_ARITH
1483template <
class V,
class M>
1488template <
class V,
class M>
1493template <
class V,
class M>
1498template <
class V,
class M>
1503template <
class V,
class M>
1508template <
class V,
class M,
1511 (1 << 4) | (1 << 8))>
1519template <
class V,
class M>
1521 return IfThenElse(
m, detail::MaskedSatAdd(
m, a, b), no);
1524template <
class V,
class M>
1526 return IfThenElse(
m, detail::MaskedSatSub(
m, a, b), no);
1529template <
class V,
class M>
1534template <
class V,
class M>
1543#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \
1544 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
1545 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
1547#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1548 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
1549 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
1590#undef HWY_SVE_COMPARE
1591#undef HWY_SVE_COMPARE_N
1596 return detail::NeN(
And(a, bit), 0);
1603 return detail::NeN(v, ConvertScalarTo<T>(0));
1616#ifdef HWY_NATIVE_IS_NEGATIVE
1617#undef HWY_NATIVE_IS_NEGATIVE
1619#define HWY_NATIVE_IS_NEGATIVE
1622template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
1626 using TI =
TFromD<
decltype(di)>;
1628 return detail::LtN(
BitCast(di, v),
static_cast<TI
>(0));
1635#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \
1636 HWY_API HWY_SVE_V(BASE, BITS) \
1637 NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
1638 HWY_SVE_V(BASE, BITS) no) { \
1639 return sv##OP##_##CHAR##BITS(yes, no, mask); \
1643#undef HWY_SVE_IF_VEC
1645template <
class V, HWY_IF_FLOAT_V(V)>
1664#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
1665#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
1667#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
1678 const DFromV<
decltype(magn)>
d;
1702#ifdef HWY_NATIVE_ISINF
1703#undef HWY_NATIVE_ISINF
1705#define HWY_NATIVE_ISINF
1717 const VFromD<
decltype(du)> v2 =
Add(vu, vu);
1719 const VFromD<
decltype(di)> max2 =
Set(di, hwy::MaxExponentTimes2<T>());
1734 const VFromD<
decltype(di)> exp =
1736 return RebindMask(
d, detail::LtN(exp, hwy::MaxExponentField<T>()));
1743#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
1744 template <size_t N, int kPow2> \
1745 HWY_API HWY_SVE_V(BASE, BITS) \
1746 LoadU(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1747 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1748 return svld1_##CHAR##BITS(detail::MakeMask(d), \
1749 detail::NativeLanePointer(p)); \
1751 template <size_t N, int kPow2> \
1752 HWY_API HWY_SVE_V(BASE, BITS) \
1753 MaskedLoad(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) , \
1754 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1755 return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \
1757 template <size_t N, int kPow2> \
1758 HWY_API void StoreU(HWY_SVE_V(BASE, BITS) v, \
1759 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1760 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1761 svst1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), v); \
1763 template <size_t N, int kPow2> \
1764 HWY_API void Stream(HWY_SVE_V(BASE, BITS) v, \
1765 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1766 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1767 svstnt1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
1770 template <size_t N, int kPow2> \
1771 HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1772 HWY_SVE_D(BASE, BITS, N, kPow2) , \
1773 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1774 svst1_##CHAR##BITS(m, detail::NativeLanePointer(p), v); \
1780template <
class D, HWY_SVE_IF_EMULATED_D(D)>
1786template <
class D, HWY_SVE_IF_EMULATED_D(D)>
1792template <
class D, HWY_SVE_IF_EMULATED_D(D)>
1802template <
class D, HWY_SVE_IF_EMULATED_D(D)>
1812#if HWY_TARGET != HWY_SVE2_128
1814#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
1815 template <size_t N, int kPow2> \
1816 HWY_API HWY_SVE_V(BASE, BITS) \
1817 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
1818 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1820 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
1821 detail::NativeLanePointer(p)); \
1827template <
class D, HWY_SVE_IF_EMULATED_D(D)>
1836#if HWY_TARGET == HWY_SVE2_128
1845template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1851template <
class D, HWY_IF_V_SIZE_GT_D(D, 16)>
1853 return detail::LoadDupFull128(
d,
p);
1867template <
class V,
class D>
1883#ifdef HWY_NATIVE_SCATTER
1884#undef HWY_NATIVE_SCATTER
1886#define HWY_NATIVE_SCATTER
1889#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
1890 template <size_t N, int kPow2> \
1891 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
1892 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1893 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1894 HWY_SVE_V(int, BITS) offset) { \
1895 sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
1899#define HWY_SVE_MASKED_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1900 template <size_t N, int kPow2> \
1901 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1902 HWY_SVE_D(BASE, BITS, N, kPow2) , \
1903 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1904 HWY_SVE_V(int, BITS) indices) { \
1905 sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices, v); \
1911#undef HWY_SVE_SCATTER_OFFSET
1912#undef HWY_SVE_MASKED_SCATTER_INDEX
1922#ifdef HWY_NATIVE_GATHER
1923#undef HWY_NATIVE_GATHER
1925#define HWY_NATIVE_GATHER
1928#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
1929 template <size_t N, int kPow2> \
1930 HWY_API HWY_SVE_V(BASE, BITS) \
1931 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1932 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1933 HWY_SVE_V(int, BITS) offset) { \
1934 return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
1937#define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1938 template <size_t N, int kPow2> \
1939 HWY_API HWY_SVE_V(BASE, BITS) \
1940 NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1941 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1942 HWY_SVE_V(int, BITS) indices) { \
1943 const RebindToSigned<decltype(d)> di; \
1945 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
1946 return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices); \
1952#undef HWY_SVE_GATHER_OFFSET
1953#undef HWY_SVE_MASKED_GATHER_INDEX
1971#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1972#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1974#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1977#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \
1978 template <size_t N, int kPow2> \
1979 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1980 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1981 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
1982 const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = sv##OP##_##CHAR##BITS( \
1983 detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1984 v0 = svget2(tuple, 0); \
1985 v1 = svget2(tuple, 1); \
1993#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \
1994 template <size_t N, int kPow2> \
1995 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1996 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1997 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1998 HWY_SVE_V(BASE, BITS) & v2) { \
1999 const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = sv##OP##_##CHAR##BITS( \
2000 detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
2001 v0 = svget3(tuple, 0); \
2002 v1 = svget3(tuple, 1); \
2003 v2 = svget3(tuple, 2); \
2011#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \
2012 template <size_t N, int kPow2> \
2013 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2014 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
2015 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
2016 HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
2017 const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = sv##OP##_##CHAR##BITS( \
2018 detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
2019 v0 = svget4(tuple, 0); \
2020 v1 = svget4(tuple, 1); \
2021 v2 = svget4(tuple, 2); \
2022 v3 = svget4(tuple, 3); \
2030#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
2031 template <size_t N, int kPow2> \
2032 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2033 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2034 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2035 sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2036 detail::NativeLanePointer(unaligned), \
2037 Create2(d, v0, v1)); \
2041#undef HWY_SVE_STORE2
2045#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
2046 template <size_t N, int kPow2> \
2047 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2048 HWY_SVE_V(BASE, BITS) v2, \
2049 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2050 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2051 sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2052 detail::NativeLanePointer(unaligned), \
2053 Create3(d, v0, v1, v2)); \
2057#undef HWY_SVE_STORE3
2061#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
2062 template <size_t N, int kPow2> \
2063 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2064 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
2065 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2066 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2067 sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2068 detail::NativeLanePointer(unaligned), \
2069 Create4(d, v0, v1, v2, v3)); \
2073#undef HWY_SVE_STORE4
2080#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
2081 template <size_t N, int kPow2> \
2082 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
2083 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, HALF) v) { \
2084 return sv##OP##_##CHAR##BITS(v); \
2092template <
size_t N,
int kPow2>
2097template <
size_t N,
int kPow2>
2102template <
size_t N,
int kPow2>
2107template <
size_t N,
int kPow2>
2114template <
size_t N,
int kPow2>
2120template <
size_t N,
int kPow2>
2138#ifdef HWY_NATIVE_F16C
2139#undef HWY_NATIVE_F16C
2141#define HWY_NATIVE_F16C
2149template <
size_t N,
int kPow2>
2151 const svfloat16_t v) {
2154 const svfloat16_t vv = detail::ZipLowerSame(v, v);
2158#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
2159#undef HWY_NATIVE_PROMOTE_F16_TO_F64
2161#define HWY_NATIVE_PROMOTE_F16_TO_F64
2164template <
size_t N,
int kPow2>
2166 const svfloat16_t v) {
2169 const svfloat16_t vv = detail::ZipLowerSame(v, v);
2171 detail::ZipLowerSame(vv, vv));
2174template <
size_t N,
int kPow2>
2176 const svfloat32_t v) {
2177 const svfloat32_t vv = detail::ZipLowerSame(v, v);
2181template <
size_t N,
int kPow2>
2183 const svint32_t v) {
2184 const svint32_t vv = detail::ZipLowerSame(v, v);
2188template <
size_t N,
int kPow2>
2190 const svuint32_t v) {
2191 const svuint32_t vv = detail::ZipLowerSame(v, v);
2195template <
size_t N,
int kPow2>
2197 const svfloat32_t v) {
2198 const svfloat32_t vv = detail::ZipLowerSame(v, v);
2202template <
size_t N,
int kPow2>
2204 const svfloat32_t v) {
2205 const svfloat32_t vv = detail::ZipLowerSame(v, v);
2215#undef HWY_SVE_PROMOTE_TO
2218#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
2219#undef HWY_NATIVE_PROMOTE_UPPER_TO
2221#define HWY_NATIVE_PROMOTE_UPPER_TO
2225template <
class D,
class V,
typename TD = TFromD<D>,
typename TV = TFromV<V>,
2226 hwy::EnableIf<IsInteger<TD>() && IsInteger<TV>() &&
2227 (IsSigned<TD>() == IsSigned<TV>())>* =
nullptr>
2230 return detail::PromoteUpperTo(
d, v);
2237template <
class D,
class V,
typename TD = TFromD<D>,
typename TV = TFromV<V>,
2238 hwy::EnableIf<!IsInteger<TD>() || !IsInteger<TV>() ||
2239 (IsSigned<TD>() != IsSigned<TV>())>* =
nullptr>
2243 const Rebind<TFromV<V>,
decltype(
d)> dh;
2252template <
typename TN,
class VU>
2254 return detail::MinN(v,
static_cast<TFromV<VU>>(LimitsMax<TN>()));
2258template <
typename TN,
class VI>
2260 return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
2265template <
size_t N,
int kPow2>
2268 const svuint8_t vn =
BitCast(dn, svqxtunb_s16(v));
2270 const DFromV<
decltype(v)> di;
2272 using TN =
TFromD<
decltype(dn)>;
2274 const svuint16_t clamped =
BitCast(du, detail::MaxN(v, 0));
2276 const svuint8_t vn =
BitCast(dn, detail::SaturateU<TN>(clamped));
2278 return svuzp1_u8(vn, vn);
2281template <
size_t N,
int kPow2>
2284 const svuint16_t vn =
BitCast(dn, svqxtunb_s32(v));
2286 const DFromV<
decltype(v)> di;
2288 using TN =
TFromD<
decltype(dn)>;
2290 const svuint32_t clamped =
BitCast(du, detail::MaxN(v, 0));
2292 const svuint16_t vn =
BitCast(dn, detail::SaturateU<TN>(clamped));
2294 return svuzp1_u16(vn, vn);
2297template <
size_t N,
int kPow2>
2299 const DFromV<
decltype(v)> di;
2303 const svuint16_t cast16 =
BitCast(d2, svqxtnb_u16(svqxtunb_s32(v)));
2305 using TN =
TFromD<
decltype(dn)>;
2307 const svuint32_t clamped =
BitCast(du, detail::MaxN(v, 0));
2309 const svuint16_t cast16 =
BitCast(d2, detail::SaturateU<TN>(clamped));
2311 const svuint8_t x2 =
BitCast(dn, svuzp1_u16(cast16, cast16));
2312 return svuzp1_u8(x2, x2);
2320 const svuint16_t cast16 =
BitCast(du16, v);
2321 const svuint16_t x2 = svuzp1_u16(cast16, cast16);
2322 const svuint8_t cast8 =
BitCast(du8, x2);
2323 return svuzp1_u8(cast8, cast8);
2326template <
size_t N,
int kPow2>
2329 const svuint8_t vn =
BitCast(dn, svqxtnb_u16(v));
2331 using TN =
TFromD<
decltype(dn)>;
2332 const svuint8_t vn =
BitCast(dn, detail::SaturateU<TN>(v));
2334 return svuzp1_u8(vn, vn);
2337template <
size_t N,
int kPow2>
2340 const svuint16_t vn =
BitCast(dn, svqxtnb_u32(v));
2342 using TN =
TFromD<
decltype(dn)>;
2343 const svuint16_t vn =
BitCast(dn, detail::SaturateU<TN>(v));
2345 return svuzp1_u16(vn, vn);
2348template <
size_t N,
int kPow2>
2350 using TN =
TFromD<
decltype(dn)>;
2351 return U8FromU32(detail::SaturateU<TN>(v));
2356template <
size_t N,
int kPow2>
2358 const svuint64_t v) {
2360 const svuint8_t v1 =
BitCast(
d, v);
2361 const svuint8_t v2 = svuzp1_u8(v1, v1);
2362 const svuint8_t v3 = svuzp1_u8(v2, v2);
2363 return svuzp1_u8(v3, v3);
2366template <
size_t N,
int kPow2>
2368 const svuint64_t v) {
2370 const svuint16_t v1 =
BitCast(
d, v);
2371 const svuint16_t v2 = svuzp1_u16(v1, v1);
2372 return svuzp1_u16(v2, v2);
2375template <
size_t N,
int kPow2>
2377 const svuint64_t v) {
2379 const svuint32_t v1 =
BitCast(
d, v);
2380 return svuzp1_u32(v1, v1);
2383template <
size_t N,
int kPow2>
2385 const svuint32_t v) {
2387 const svuint8_t v1 =
BitCast(
d, v);
2388 const svuint8_t v2 = svuzp1_u8(v1, v1);
2389 return svuzp1_u8(v2, v2);
2392template <
size_t N,
int kPow2>
2394 const svuint32_t v) {
2396 const svuint16_t v1 =
BitCast(
d, v);
2397 return svuzp1_u16(v1, v1);
2400template <
size_t N,
int kPow2>
2402 const svuint16_t v) {
2404 const svuint8_t v1 =
BitCast(
d, v);
2405 return svuzp1_u8(v1, v1);
2410template <
size_t N,
int kPow2>
2413 const svint8_t vn =
BitCast(dn, svqxtnb_s16(v));
2415 using TN =
TFromD<
decltype(dn)>;
2416 const svint8_t vn =
BitCast(dn, detail::SaturateI<TN>(v));
2418 return svuzp1_s8(vn, vn);
2421template <
size_t N,
int kPow2>
2424 const svint16_t vn =
BitCast(dn, svqxtnb_s32(v));
2426 using TN =
TFromD<
decltype(dn)>;
2427 const svint16_t vn =
BitCast(dn, detail::SaturateI<TN>(v));
2429 return svuzp1_s16(vn, vn);
2432template <
size_t N,
int kPow2>
2436 const svint16_t cast16 =
BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
2438 using TN =
TFromD<
decltype(dn)>;
2439 const svint16_t cast16 =
BitCast(d2, detail::SaturateI<TN>(v));
2441 const svint8_t v2 =
BitCast(dn, svuzp1_s16(cast16, cast16));
2442 return BitCast(dn, svuzp1_s8(v2, v2));
2447template <
size_t N,
int kPow2>
2449 const Rebind<uint64_t,
decltype(dn)> du64;
2452 const svuint64_t vn =
BitCast(du64, svqxtnb_s64(v));
2454 using TN =
TFromD<
decltype(dn)>;
2455 const svuint64_t vn =
BitCast(du64, detail::SaturateI<TN>(v));
2460template <
size_t N,
int kPow2>
2462 const Rebind<uint64_t,
decltype(dn)> du64;
2465 const svuint64_t vn =
BitCast(du64, svqxtnb_s32(svqxtnb_s64(v)));
2467 using TN =
TFromD<
decltype(dn)>;
2468 const svuint64_t vn =
BitCast(du64, detail::SaturateI<TN>(v));
2473template <
size_t N,
int kPow2>
2475 const Rebind<uint64_t,
decltype(dn)> du64;
2477 using TN =
TFromD<
decltype(dn)>;
2478 const svuint64_t vn =
BitCast(du64, detail::SaturateI<TN>(v));
2482template <
size_t N,
int kPow2>
2484 const Rebind<uint64_t,
decltype(dn)> du64;
2486 const svuint64_t vn =
BitCast(du64, svqxtunb_s64(v));
2488 using TN =
TFromD<
decltype(dn)>;
2490 const svuint64_t clamped =
BitCast(du64, detail::MaxN(v, 0));
2492 const svuint64_t vn = detail::SaturateU<TN>(clamped);
2497template <
size_t N,
int kPow2>
2499 const Rebind<uint64_t,
decltype(dn)> du64;
2501 const svuint64_t vn =
BitCast(du64, svqxtnb_u32(svqxtunb_s64(v)));
2503 using TN =
TFromD<
decltype(dn)>;
2505 const svuint64_t clamped =
BitCast(du64, detail::MaxN(v, 0));
2507 const svuint64_t vn = detail::SaturateU<TN>(clamped);
2512template <
size_t N,
int kPow2>
2514 const Rebind<uint64_t,
decltype(dn)> du64;
2515 using TN =
TFromD<
decltype(dn)>;
2517 const svuint64_t clamped =
BitCast(du64, detail::MaxN(v, 0));
2519 const svuint64_t vn = detail::SaturateU<TN>(clamped);
2523template <
size_t N,
int kPow2>
2525 const Rebind<uint64_t,
decltype(dn)> du64;
2527 const svuint64_t vn =
BitCast(du64, svqxtnb_u64(v));
2529 using TN =
TFromD<
decltype(dn)>;
2530 const svuint64_t vn =
BitCast(du64, detail::SaturateU<TN>(v));
2535template <
size_t N,
int kPow2>
2537 const Rebind<uint64_t,
decltype(dn)> du64;
2539 const svuint64_t vn =
BitCast(du64, svqxtnb_u32(svqxtnb_u64(v)));
2541 using TN =
TFromD<
decltype(dn)>;
2542 const svuint64_t vn =
BitCast(du64, detail::SaturateU<TN>(v));
2547template <
size_t N,
int kPow2>
2549 const Rebind<uint64_t,
decltype(dn)> du64;
2550 using TN =
TFromD<
decltype(dn)>;
2551 const svuint64_t vn =
BitCast(du64, detail::SaturateU<TN>(v));
2567#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
2568#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
2569 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
2584#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
2585 HWY_INLINE HWY_SVE_V(BASE, BITS) \
2586 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
2587 return sv##OP##_##CHAR##BITS(lo, hi); \
2591#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2597#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
2600#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2602 ConcatEvenBlocks, uzp1q)
2607#undef HWY_SVE_CONCAT_EVERY_SECOND
2611#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
2612 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
2613 HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
2614 return sv##OP##_##CHAR##BITS(mask, lo, hi); \
2617#if HWY_SVE_HAVE_BF16_FEATURE
2620template <
class V, HWY_IF_BF16_D(DFromV<V>)>
2627#undef HWY_SVE_SPLICE
2636 const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
2637 const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
2646 const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
2647 const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
2655template <
size_t N,
int kPow2>
2657 const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(
d), v);
2658 return detail::ConcatEvenFull(in_even,
2662#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
2663#undef HWY_NATIVE_DEMOTE_F64_TO_F16
2665#define HWY_NATIVE_DEMOTE_F64_TO_F16
2668template <
size_t N,
int kPow2>
2670 const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(
d), v);
2671 const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
2672 return detail::ConcatEvenFull(in_even,
2676#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2677#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2679#define HWY_NATIVE_DEMOTE_F32_TO_BF16
2682#if !HWY_SVE_HAVE_F32_TO_BF16C
2691 const DFromV<
decltype(v)> df32;
2694 const auto is_non_nan =
Eq(v, v);
2695 const auto bits32 =
BitCast(du32, v);
2697 const auto round_incr =
2698 detail::AddN(detail::AndN(ShiftRight<16>(bits32), 1u), 0x7FFFu);
2699 return MaskedAddOr(detail::OrN(bits32, 0x00400000u), is_non_nan, bits32,
2706template <
size_t N,
int kPow2>
2708#if HWY_SVE_HAVE_F32_TO_BF16C
2709 const VBF16 in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), v);
2710 return detail::ConcatEvenFull(in_even, in_even);
2712 const svuint16_t in_odd =
2714 return BitCast(dbf16, detail::ConcatOddFull(in_odd, in_odd));
2718template <
size_t N,
int kPow2>
2720 const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(
d), v);
2721 return detail::ConcatEvenFull(in_even,
2725template <
size_t N,
int kPow2>
2727 const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(
d), v);
2728 return detail::ConcatEvenFull(in_even,
2732template <
size_t N,
int kPow2>
2734 const svuint32_t in_even = svcvt_u32_f64_x(detail::PTrue(
d), v);
2735 return detail::ConcatEvenFull(in_even,
2739template <
size_t N,
int kPow2>
2741 const svfloat32_t in_even = svcvt_f32_s64_x(detail::PTrue(
d), v);
2742 return detail::ConcatEvenFull(in_even,
2746template <
size_t N,
int kPow2>
2748 const svfloat32_t in_even = svcvt_f32_u64_x(detail::PTrue(
d), v);
2749 return detail::ConcatEvenFull(in_even,
2755#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
2757 template <size_t N, int kPow2> \
2758 HWY_API HWY_SVE_V(BASE, BITS) \
2759 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(int, BITS) v) { \
2760 return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2763 template <size_t N, int kPow2> \
2764 HWY_API HWY_SVE_V(BASE, BITS) \
2765 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(uint, BITS) v) { \
2766 return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2769 template <size_t N, int kPow2> \
2770 HWY_API HWY_SVE_V(int, BITS) \
2771 NAME(HWY_SVE_D(int, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
2772 return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2775 template <size_t N, int kPow2> \
2776 HWY_API HWY_SVE_V(uint, BITS) \
2777 NAME(HWY_SVE_D(uint, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
2778 return sv##OP##_u##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2783#undef HWY_SVE_CONVERT
2786template <
class VF,
class DI = RebindToSigned<DFromV<VF>>>
2794#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
2795 template <size_t N, int kPow2, typename T2> \
2796 HWY_API HWY_SVE_V(BASE, BITS) \
2797 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , T2 first) { \
2798 return sv##OP##_##CHAR##BITS( \
2799 ConvertScalarTo<HWY_SVE_T(BASE, BITS)>(first), 1); \
2805template <
class D,
typename T2, HWY_IF_FLOAT_D(D)>
2814template <
class D,
class V>
2816 static_assert(IsSame<TFromD<D>,
TFromV<V>>(),
"D/V mismatch");
2817#if HWY_TARGET == HWY_SVE2_128
2819 return detail::ZipLowerSame(a, b);
2823 const auto a64 =
BitCast(d64, a);
2824 const auto b64 =
BitCast(d64, b);
2825 const auto a_blocks = detail::ConcatEvenFull(a64, a64);
2826 const auto b_blocks = detail::ConcatEvenFull(b64, b64);
2846template <
class D,
class V = VFromD<D>,
2847 hwy::EnableIf<detail::IsFull(D())>* =
nullptr>
2849#if HWY_TARGET == HWY_SVE2_128
2851 return detail::ZipUpperSame(a, b);
2855 const auto a64 =
BitCast(d64, a);
2856 const auto b64 =
BitCast(d64, b);
2857 const auto a_blocks = detail::ConcatOddFull(a64, a64);
2858 const auto b_blocks = detail::ConcatOddFull(b64, b64);
2864template <
class D,
class V = VFromD<D>,
2865 hwy::EnableIf<!detail::IsFull(D())>* =
nullptr>
2868 if (
Lanes(
d) *
sizeof(TFromD<D>) < 16) {
2869 const Half<
decltype(
d)> d2;
2876#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
2877#undef HWY_NATIVE_INTERLEAVE_WHOLE
2879#define HWY_NATIVE_INTERLEAVE_WHOLE
2884 return detail::ZipLowerSame(a, b);
2892 return detail::ZipUpperSame(a, b);
2895 const Half<
decltype(
d)> d2;
2903template <
size_t kLaneSize,
size_t kVectSize,
class V,
2913 const auto evens =
BitCast(dw, ConcatEvenFull(v, v));
2914 return BitCast(
d, ZipLowerSame(evens, evens));
2917template <
size_t kLaneSize,
size_t kVectSize,
class V,
2927 const auto odds =
BitCast(dw, ConcatOddFull(v, v));
2928 return BitCast(
d, ZipLowerSame(odds, odds));
2937#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2938template <
class D, HWY_IF_T_SIZE_D(D, 1)>
2942 return svptrue_pat_b8(SV_VL16);
2944 return svptrue_pat_b8(SV_VL8);
2946 return svptrue_pat_b8(SV_VL4);
2948 return svptrue_pat_b8(SV_VL2);
2950 return svptrue_pat_b8(SV_VL1);
2953template <
class D, HWY_IF_T_SIZE_D(D, 2)>
2957 return svptrue_pat_b16(SV_VL8);
2959 return svptrue_pat_b16(SV_VL4);
2961 return svptrue_pat_b16(SV_VL2);
2963 return svptrue_pat_b16(SV_VL1);
2966template <
class D, HWY_IF_T_SIZE_D(D, 4)>
2970 return svptrue_pat_b32(SV_VL4);
2972 return svptrue_pat_b32(SV_VL2);
2974 return svptrue_pat_b32(SV_VL1);
2977template <
class D, HWY_IF_T_SIZE_D(D, 8)>
2981 return svptrue_pat_b64(SV_VL2);
2983 return svptrue_pat_b64(SV_VL1);
2987#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2988template <
class D, HWY_IF_T_SIZE_D(D, 1)>
2992 return svptrue_pat_b8(SV_VL8);
2994 return svptrue_pat_b8(SV_VL4);
2996 return svptrue_pat_b8(SV_VL2);
3000 return svptrue_pat_b8(SV_VL1);
3003template <
class D, HWY_IF_T_SIZE_D(D, 2)>
3007 return svptrue_pat_b16(SV_VL4);
3009 return svptrue_pat_b16(SV_VL2);
3013 return svptrue_pat_b16(SV_VL1);
3016template <
class D, HWY_IF_T_SIZE_D(D, 4)>
3018 return svptrue_pat_b32(
Lanes(
d) == 4 ? SV_VL2 : SV_VL1);
3020template <
class D, HWY_IF_T_SIZE_D(D, 8)>
3022 return svptrue_pat_b64(SV_VL1);
3025#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
3035 if (HWY_SVE_IS_POW2 &&
IsFull(
d)) {
3045#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \
3046 template <size_t kIndex> \
3047 HWY_API HWY_SVE_V(BASE, BITS) \
3048 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
3049 return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
3057template <
class D,
class V>
3063template <
class D,
class V>
3066#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
3067 return detail::ConcatEvenBlocks(hi, lo);
3069#if HWY_TARGET == HWY_SVE2_128
3071 const auto lo64 =
BitCast(du64, lo);
3079template <
class D,
class V>
3081#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
3090template <
class D,
class V>
3093#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
3094 return detail::ConcatOddBlocks(hi, lo);
3096#if HWY_TARGET == HWY_SVE2_128
3098 const auto lo64 =
BitCast(du64, lo);
3108template <
class D,
class V2>
3114template <
class D,
class V>
3121template <
class D2,
class V>
3131template <
class DH,
class V>
3133 const Twice<
decltype(dh)>
d;
3137#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
3147#ifdef HWY_NATIVE_REDUCE_SCALAR
3148#undef HWY_NATIVE_REDUCE_SCALAR
3150#define HWY_NATIVE_REDUCE_SCALAR
3155#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
3156 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
3158 using T = HWY_SVE_T(BASE, BITS); \
3159 using TU = MakeUnsigned<T>; \
3160 constexpr uint64_t kMask = LimitsMax<TU>(); \
3161 return static_cast<T>(static_cast<TU>( \
3162 static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
3165#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
3166 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
3167 return sv##OP##_##CHAR##BITS(pg, v); \
3179#undef HWY_SVE_REDUCE
3180#undef HWY_SVE_REDUCE_ADD
3187#undef HWY_IF_REDUCE_D
3188#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
3190#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
3191#undef HWY_NATIVE_REDUCE_SUM_4_UI8
3193#define HWY_NATIVE_REDUCE_SUM_4_UI8
3196#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3197#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3199#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
3202template <
class D, HWY_IF_REDUCE_D(D)>
3207template <
class D, HWY_IF_REDUCE_D(D)>
3212template <
class D, HWY_IF_REDUCE_D(D)>
3219template <
class D, HWY_IF_LANES_GT_D(D, 1)>
3223template <
class D, HWY_IF_LANES_GT_D(D, 1)>
3227template <
class D, HWY_IF_LANES_GT_D(D, 1)>
3237#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
3238 HWY_INLINE HWY_SVE_T(BASE, BITS) \
3239 NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
3240 return sv##OP##_##CHAR##BITS(mask, v); \
3245#undef HWY_SVE_GET_LANE
3264 using TI =
TFromD<
decltype(di)>;
3265 const svbool_t is_i = detail::EqN(
Iota(di, 0),
static_cast<TI
>(i));
3277 return detail::InterleaveEven(v, v);
3288 return detail::InterleaveOdd(v, v);
3295#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
3296 HWY_API HWY_SVE_V(BASE, BITS) \
3297 NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
3298 return sv##OP##_##CHAR##BITS(even, odd, 0); \
3302#undef HWY_SVE_ODD_EVEN
3304template <
class V, HWY_IF_FLOAT_V(V)>
3315 const auto odd_in_even = detail::Ext<1>(odd, odd);
3316 return detail::InterleaveEven(even, odd_in_even);
3324 return detail::InterleaveEven(a, b);
3330 return detail::InterleaveOdd(a, b);
3337#if HWY_TARGET == HWY_SVE_256
3339#elif HWY_TARGET == HWY_SVE2_128
3345 using TU =
TFromD<
decltype(du)>;
3346 constexpr size_t kShift =
CeilLog2(16 /
sizeof(TU));
3347 const auto idx_block = ShiftRight<kShift>(
Iota(du, 0));
3348 const auto lsb = detail::AndN(idx_block,
static_cast<TU
>(1));
3349 const svbool_t is_even = detail::EqN(lsb,
static_cast<TU
>(0));
3356template <
class D,
class VI>
3359 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index/lane size mismatch");
3362#if HWY_IS_DEBUG_BUILD
3364 const size_t twice_max_lanes =
Lanes(
d) * 2;
3367 detail::AndN(
indices,
static_cast<TU
>(twice_max_lanes - 1)))));
3374template <
class D,
typename TI>
3376 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index size must match lane");
3380#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
3381 HWY_API HWY_SVE_V(BASE, BITS) \
3382 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
3383 return sv##OP##_##CHAR##BITS(v, idx); \
3387#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3394#define HWY_SVE_TABLE2(BASE, CHAR, BITS, HALF, NAME, OP) \
3395 HWY_API HWY_SVE_V(BASE, BITS) \
3396 NAME(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(uint, BITS) idx) { \
3397 return sv##OP##_##CHAR##BITS(tuple, idx); \
3401#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3413#if HWY_SVE_HAVE_2 && HWY_SVE_IS_POW2
3415 return detail::NativeTwoTableLookupLanes(
Create2(
d, a, b), idx);
3419 using TU =
TFromD<
decltype(du)>;
3421 const size_t num_of_lanes =
Lanes(
d);
3422 const auto idx_mod = detail::AndN(idx,
static_cast<TU
>(num_of_lanes - 1));
3423 const auto sel_a_mask =
Eq(idx, idx_mod);
3427 return IfThenElse(sel_a_mask, a_lookup_result, b_lookup_result);
3441template <
typename T,
size_t N,
int kPow2>
3452#if HWY_TARGET == HWY_SVE_256
3454#elif HWY_TARGET == HWY_SVE2_128
3459 constexpr auto kLanesPerBlock =
3461 const VFromD<
decltype(du)> idx = detail::XorN(
Iota(du, 0), kLanesPerBlock);
3470#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
3471 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
3472 return sv##OP##_##CHAR##BITS(v); \
3476#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3479#undef HWY_SVE_REVERSE
3483template <
class D,
class V>
3486 const auto reversed = detail::ReverseFull(v);
3493 const svbool_t all_true = detail::AllPTrue(dfull);
3494 const size_t all_lanes = detail::AllHardwareLanes<T>();
3495 const size_t want_lanes =
Lanes(
d);
3497 const svbool_t mask =
3498 svnot_b_z(all_true,
FirstN(dfull, all_lanes - want_lanes));
3505#ifdef HWY_NATIVE_REVERSE2_8
3506#undef HWY_NATIVE_REVERSE2_8
3508#define HWY_NATIVE_REVERSE2_8
3511template <
class D, HWY_IF_T_SIZE_D(D, 1)>
3518template <
class D, HWY_IF_T_SIZE_D(D, 2)>
3525template <
class D, HWY_IF_T_SIZE_D(D, 4)>
3532template <
class D, HWY_IF_T_SIZE_D(D, 8)>
3534#if HWY_TARGET == HWY_SVE2_128
3536 return detail::Ext<1>(v, v);
3540 const auto odd_in_even = detail::Ext<1>(v, v);
3541 return detail::InterleaveEven(odd_in_even, v);
3546template <
class D, HWY_IF_T_SIZE_D(D, 1)>
3553template <
class D, HWY_IF_T_SIZE_D(D, 2)>
3560template <
class D, HWY_IF_T_SIZE_D(D, 4)>
3563 return detail::ReverseFull(v);
3567 const auto idx = detail::XorN(
Iota(du, 0), 3);
3571template <
class D, HWY_IF_T_SIZE_D(D, 8)>
3574 return detail::ReverseFull(v);
3578 const auto idx = detail::XorN(
Iota(du, 0), 3);
3584template <
class D, HWY_IF_T_SIZE_D(D, 1)>
3590template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3593 const auto idx = detail::XorN(
Iota(du, 0), 7);
3599#ifdef HWY_NATIVE_REVERSE_BITS_UI8
3600#undef HWY_NATIVE_REVERSE_BITS_UI8
3602#define HWY_NATIVE_REVERSE_BITS_UI8
3605#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
3606#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
3608#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
3611#define HWY_SVE_REVERSE_BITS(BASE, CHAR, BITS, HALF, NAME, OP) \
3612 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
3613 const DFromV<decltype(v)> d; \
3614 return sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v); \
3618#undef HWY_SVE_REVERSE_BITS
3629#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
3630#undef HWY_NATIVE_SLIDE1_UP_DOWN
3632#define HWY_NATIVE_SLIDE1_UP_DOWN
3645 using TU =
TFromD<
decltype(du)>;
3646 const auto idx =
Iota(du,
static_cast<TU
>(amt));
3658#if HWY_TARGET != HWY_SVE2_128
3660#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
3661#undef HWY_NATIVE_BLK_INSERT_EXTRACT
3663#define HWY_NATIVE_BLK_INSERT_EXTRACT
3666template <
int kBlockIdx,
class V>
3669 static_assert(0 <= kBlockIdx && kBlockIdx <
d.MaxBlocks(),
3670 "Invalid block index");
3672#if HWY_TARGET == HWY_SVE_256
3678 constexpr size_t kBlockOffset =
3679 static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
3680 const auto splice_mask =
FirstN(
d, kBlockOffset);
3681 const auto sel_lo_mask =
FirstN(
d, kBlockOffset + kLanesPerBlock);
3683 const auto splice_result =
detail::Splice(blk_to_insert, v, splice_mask);
3684 return IfThenElse(sel_lo_mask, splice_result, v);
3688template <
int kBlockIdx,
class V>
3691 static_assert(0 <= kBlockIdx && kBlockIdx <
d.MaxBlocks(),
3692 "Invalid block index");
3694 if (kBlockIdx == 0)
return v;
3696#if HWY_TARGET == HWY_SVE_256
3700 using TU =
TFromD<
decltype(du)>;
3702 constexpr size_t kBlockOffset =
3703 static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
3704 const auto splice_mask =
3705 RebindMask(
d, detail::LtN(
Iota(du,
static_cast<TU
>(0u - kBlockOffset)),
3706 static_cast<TU
>(kLanesPerBlock)));
3711template <
int kBlockIdx,
class V>
3714 static_assert(0 <= kBlockIdx && kBlockIdx <
d.MaxBlocks(),
3715 "Invalid block index");
3718 using VU =
VFromD<
decltype(du)>;
3721#if HWY_TARGET == HWY_SVE_256
3725 using TU =
TFromD<
decltype(du)>;
3727 constexpr size_t kBlockOffset =
3728 static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
3730 const VU idx = detail::AddN(
3731 detail::AndN(
Iota(du, TU{0}),
static_cast<TU
>(kLanesPerBlock - 1)),
3732 static_cast<TU
>(kBlockOffset));
3741template <
typename T>
3742struct CompressIsPartition {
3743#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
3746 enum {
value = (
sizeof(T) == 8) };
3752#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
3753 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
3754 return sv##OP##_##CHAR##BITS(mask, v); \
3757#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
3763#undef HWY_SVE_COMPRESS
3765#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3766template <
class V, HWY_IF_T_SIZE_V(V, 8)>
3774 const svuint64_t bits =
Shl(
Set(du64, 1),
Iota(du64, 2));
3775 const size_t offset = detail::SumOfLanesM(mask, bits);
3778 alignas(16)
static constexpr uint64_t table[4 * 16] = {
3780 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
3781 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
3782 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
3787#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
3788template <
class V, HWY_IF_T_SIZE_V(V, 8)>
3795 const svbool_t maskLL = svzip1_b64(mask, mask);
3801template <
class V, HWY_IF_T_SIZE_V(V, 2)>
3803 static_assert(!IsSame<V, svfloat16_t>(),
"Must use overload");
3804 const DFromV<V> d16;
3809 const auto v32H = detail::PromoteUpperTo(dw, v);
3810 const svbool_t mask32L = svunpklo_b(mask16);
3811 const svbool_t mask32H = svunpkhi_b(mask16);
3813 const auto compressedL =
Compress(v32L, mask32L);
3814 const auto compressedH =
Compress(v32H, mask32H);
3817 const V evenL =
BitCast(d16, compressedL);
3818 const V evenH =
BitCast(d16, compressedH);
3819 const V v16L = detail::ConcatEvenFull(evenL, evenL);
3820 const V v16H = detail::ConcatEvenFull(evenH, evenH);
3825 const size_t countL = detail::CountTrueFull(dw, mask32L);
3826 const auto compressed_maskL =
FirstN(d16, countL);
3832 const DFromV<
decltype(v)> df;
3840template <
class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4))>
3841HWY_API V CompressNot(V v, const sv
bool_t mask) {
3842 return Compress(v, Not(mask));
3845template <
class V, HWY_IF_T_SIZE_V(V, 8)>
3847#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
3853 const svbool_t maskLL = svzip1_b64(mask, mask);
3856#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3863 const svuint64_t bits =
Shl(
Set(du64, 1),
Iota(du64, 2));
3864 const size_t offset = detail::SumOfLanesM(mask, bits);
3867 alignas(16)
static constexpr uint64_t table[4 * 16] = {
3869 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
3870 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
3871 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
3880#if HWY_TARGET == HWY_SVE2_128
3884#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3886 CopyBytes<4>(&mask, &bits);
3888 const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
3890 alignas(16)
static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
3891 0, 1, 2, 3, 0, 1, 2, 3};
3900template <
class V,
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3908template <
class V,
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3912 const svbool_t store_mask =
FirstN(
d, count);
3923 return static_cast<size_t>(detail::ExtractLastMatchingLaneM(
3940#if HWY_TARGET != HWY_SVE2_128
3945template <
class D,
class V>
3946HWY_INLINE V OffsetsOf128BitBlocks(
const D
d,
const V iota0) {
3948 return detail::AndNotN(
static_cast<T
>(LanesPerBlock(
d) - 1), iota0);
3951template <
size_t kLanes,
class D, HWY_IF_T_SIZE_D(D, 1)>
3952svbool_t FirstNPerBlock(D
d) {
3955 const svuint8_t idx_mod =
3956 svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
3957 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
3958 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
3959 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
3960 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
3961 15 % kLanesPerBlock);
3962 return detail::LtN(
BitCast(du, idx_mod), kLanes);
3964template <
size_t kLanes,
class D, HWY_IF_T_SIZE_D(D, 2)>
3968 const svuint16_t idx_mod =
3969 svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
3970 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
3971 6 % kLanesPerBlock, 7 % kLanesPerBlock);
3972 return detail::LtN(
BitCast(du, idx_mod), kLanes);
3974template <
size_t kLanes,
class D, HWY_IF_T_SIZE_D(D, 4)>
3978 const svuint32_t idx_mod =
3979 svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
3980 3 % kLanesPerBlock);
3981 return detail::LtN(
BitCast(du, idx_mod), kLanes);
3983template <
size_t kLanes,
class D, HWY_IF_T_SIZE_D(D, 8)>
3987 const svuint64_t idx_mod =
3988 svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
3989 return detail::LtN(
BitCast(du, idx_mod), kLanes);
3995template <
size_t kBytes,
class D,
class V = VFromD<D>>
3998 const auto hi8 =
BitCast(d8, hi);
3999 const auto lo8 =
BitCast(d8, lo);
4000#if HWY_TARGET == HWY_SVE2_128
4001 return BitCast(
d, detail::Ext<kBytes>(hi8, lo8));
4004 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
4014 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
4023 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
4024 const svuint8_t v8 =
BitCast(d8, v);
4025 return BitCast(
d, CombineShiftRightBytes<12>(d8, v8, v8));
4033 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
4034 const svuint8_t v8 =
BitCast(d8, v);
4035 return BitCast(
d, CombineShiftRightBytes<4>(d8, v8, v8));
4043 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
4044 const svuint8_t v8 =
BitCast(d8, v);
4045 return BitCast(
d, CombineShiftRightBytes<8>(d8, v8, v8));
4053 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 8,
"Defined for 64-bit types");
4054 const svuint8_t v8 =
BitCast(d8, v);
4055 return BitCast(
d, CombineShiftRightBytes<8>(d8, v8, v8));
4065template <
class D,
class V = VFromD<D>>
4067#if HWY_TARGET == HWY_SVE_256
4073#elif HWY_TARGET == HWY_SVE2_128
4083template <
class V,
class VI>
4087#if HWY_TARGET == HWY_SVE2_128
4091 const auto idx8 =
Add(
BitCast(du8, idx), offsets128);
4096template <
class V,
class VI>
4102 auto idx8 =
BitCast(di8, idx);
4103 const auto msb = detail::LtN(idx8, 0);
4111#ifdef HWY_NATIVE_BROADCASTLANE
4112#undef HWY_NATIVE_BROADCASTLANE
4114#define HWY_NATIVE_BROADCASTLANE
4118#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
4119 template <int kLane> \
4120 HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
4121 return sv##OP##_##CHAR##BITS(v, kLane); \
4125#undef HWY_SVE_BROADCAST
4128template <
int kLane,
class V>
4133 static_assert(0 <= kLane && kLane < kLanesPerBlock,
"Invalid lane");
4134#if HWY_TARGET == HWY_SVE2_128
4135 return detail::BroadcastLane<kLane>(v);
4139 idx = detail::AddN(idx, kLane);
4145template <
int kLane,
class V>
4147 static_assert(0 <= kLane && kLane <
HWY_MAX_LANES_V(V),
"Invalid lane");
4148 return detail::BroadcastLane<kLane>(v);
4153template <
size_t kLanes,
class D,
class V = VFromD<D>>
4155 const auto zero =
Zero(
d);
4157#if HWY_TARGET == HWY_SVE2_128
4161 return IfThenElse(detail::FirstNPerBlock<kLanes>(
d), zero, shifted);
4165template <
size_t kLanes,
class V>
4167 return ShiftLeftLanes<kLanes>(
DFromV<V>(), v);
4171template <
size_t kLanes,
class D,
class V = VFromD<D>>
4178#if HWY_TARGET == HWY_SVE2_128
4179 return detail::Ext<kLanes>(
Zero(
d), v);
4181 const auto shifted = detail::Ext<kLanes>(v, v);
4191template <
int kBytes,
class D,
class V = VFromD<D>>
4197template <
int kBytes,
class V>
4199 return ShiftLeftBytes<kBytes>(
DFromV<V>(), v);
4203template <
int kBytes,
class D,
class V = VFromD<D>>
4211template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4217template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4223template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4237#define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP) \
4238 HWY_API HWY_SVE_V(BASE, BITS) \
4239 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4240 const DFromV<decltype(b)> d; \
4241 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, Reverse2(d, b), \
4247#undef HWY_SVE_ADDSUB_F
4253#define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP) \
4254 HWY_API HWY_SVE_V(BASE, BITS) \
4255 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4256 const DFromV<decltype(b)> d; \
4257 return sv##OP##_##CHAR##BITS(a, Reverse2(d, b), 90); \
4262#undef HWY_SVE_ADDSUB_UI
// Redefine HWY_IF_ADDSUB_V so that it can never match:
// hwy::IsSame<V, V>() is always true, so EnableIf<!...> is always false.
// This disables the generic AddSub overload constrained by this macro,
// presumably because a native overload (see HWY_SVE_ADDSUB_* above,
// which uses Reverse2 plus an SVE intrinsic) is provided on this target
// -- TODO confirm which #if branch encloses this definition.
4265#undef HWY_IF_ADDSUB_V
4266#define HWY_IF_ADDSUB_V(V)                           \
4267  HWY_IF_LANES_GT_D(DFromV<V>, 1),                   \
4268  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
// Redefine HWY_IF_ADDSUB_V to match only non-float/non-special lane types
// with more than one lane: for those types the generic AddSub overload is
// still used here, while float types take a different path (see the
// HWY_IF_FLOAT_V overload below) -- the enclosing #if arm is not visible
// in this chunk; verify against the full file.
4276#undef HWY_IF_ADDSUB_V
4277#define HWY_IF_ADDSUB_V(V)                           \
4278  HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4284template <
class V, HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_FLOAT_V(V)>
4289 const T neg_zero = ConvertScalarTo<T>(-0.0f);
// Redefine HWY_IF_MULADDSUB_V so that it can never match:
// hwy::IsSame<V, V>() is always true, hence EnableIf<!...> is always false.
// This disables the generic MulAddSub overload constrained by this macro,
// presumably because a native implementation exists on this target
// -- TODO confirm the enclosing preprocessor branch.
4297#undef HWY_IF_MULADDSUB_V
4298#define HWY_IF_MULADDSUB_V(V)                        \
4299  HWY_IF_LANES_GT_D(DFromV<V>, 1),                  \
4300  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4302template <
class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
4303 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
// Redefine HWY_IF_MULADDSUB_V to match only non-float/non-special lane
// types with more than one lane, so the generic MulAddSub overload applies
// to integers while floats are handled elsewhere -- the enclosing #if arm
// is outside this chunk; verify against the full file.
4315#undef HWY_IF_MULADDSUB_V
4316#define HWY_IF_MULADDSUB_V(V)                        \
4317  HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4322template <
size_t N,
int kPow2>
4325 return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0),
BitCast(du16, v)));
4338 return svextb_s16_x(detail::PTrue(d_to),
BitCast(d_to, v));
4346 return svexth_s32_x(detail::PTrue(d_to),
BitCast(d_to, v));
4354 return svextw_s64_x(detail::PTrue(d_to),
BitCast(d_to, v));
4364 return svcvt_f32_f16_x(detail::PTrue(d_from), v);
4374 return svcvt_f64_f32_x(detail::PTrue(d_from), v);
4384 return svcvt_f64_s32_x(detail::PTrue(d_from), v);
4394 return svcvt_f64_u32_x(detail::PTrue(d_from), v);
4404 return svcvt_s64_f32_x(detail::PTrue(d_from), v);
4414 return svcvt_u64_f32_x(detail::PTrue(d_from), v);
4423 return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4428template <
class FromTypeTag,
class D,
class V>
4431 FromTypeTag from_type_tag, D d_to, V v) {
4432 return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4437template <
class ToTypeTag,
class D, HWY_IF_UI64_D(D)>
4442 return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4450template <
size_t N,
int kPow2>
4453#if HWY_SVE_HAVE_F32_TO_BF16C
4454 const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4455 return svcvtnt_bf16_f32_x(b_in_even, detail::PTrue(dbf16), a);
4458 const auto a_in_odd =
4460 const auto b_in_odd =
4462 return BitCast(dbf16, detail::InterleaveOdd(b_in_odd, a_in_odd));
4466template <
size_t N,
int kPow2>
4471 const svint16_t a_in_even = svqxtnb_s32(a);
4472 return svqxtnt_s32(a_in_even, b);
4474 const svint16_t a16 =
BitCast(d16, detail::SaturateI<int16_t>(a));
4475 const svint16_t b16 =
BitCast(d16, detail::SaturateI<int16_t>(b));
4476 return detail::InterleaveEven(a16, b16);
4480template <
size_t N,
int kPow2>
4485 const svuint16_t a_in_even = svqxtunb_s32(a);
4486 return svqxtunt_s32(a_in_even, b);
4489 const svuint32_t clamped_a =
BitCast(du32, detail::MaxN(a, 0));
4490 const svuint32_t clamped_b =
BitCast(du32, detail::MaxN(b, 0));
4491 const svuint16_t a16 =
BitCast(d16, detail::SaturateU<uint16_t>(clamped_a));
4492 const svuint16_t b16 =
BitCast(d16, detail::SaturateU<uint16_t>(clamped_b));
4493 return detail::InterleaveEven(a16, b16);
4497template <
size_t N,
int kPow2>
4502 const svuint16_t a_in_even = svqxtnb_u32(a);
4503 return svqxtnt_u32(a_in_even, b);
4505 const svuint16_t a16 =
BitCast(d16, detail::SaturateU<uint16_t>(a));
4506 const svuint16_t b16 =
BitCast(d16, detail::SaturateU<uint16_t>(b));
4507 return detail::InterleaveEven(a16, b16);
4511template <
size_t N,
int kPow2>
4516 const svint8_t a_in_even = svqxtnb_s16(a);
4517 return svqxtnt_s16(a_in_even, b);
4519 const svint8_t a8 =
BitCast(d8, detail::SaturateI<int8_t>(a));
4520 const svint8_t b8 =
BitCast(d8, detail::SaturateI<int8_t>(b));
4521 return detail::InterleaveEven(a8, b8);
4525template <
size_t N,
int kPow2>
4530 const svuint8_t a_in_even = svqxtunb_s16(a);
4531 return svqxtunt_s16(a_in_even, b);
4534 const svuint16_t clamped_a =
BitCast(du16, detail::MaxN(a, 0));
4535 const svuint16_t clamped_b =
BitCast(du16, detail::MaxN(b, 0));
4536 const svuint8_t a8 =
BitCast(d8, detail::SaturateU<uint8_t>(clamped_a));
4537 const svuint8_t b8 =
BitCast(d8, detail::SaturateU<uint8_t>(clamped_b));
4538 return detail::InterleaveEven(a8, b8);
4542template <
size_t N,
int kPow2>
4547 const svuint8_t a_in_even = svqxtnb_u16(a);
4548 return svqxtnt_u16(a_in_even, b);
4550 const svuint8_t a8 =
BitCast(d8, detail::SaturateU<uint8_t>(a));
4551 const svuint8_t b8 =
BitCast(d8, detail::SaturateU<uint8_t>(b));
4552 return detail::InterleaveEven(a8, b8);
4556template <
size_t N,
int kPow2>
4561 const svint32_t a_in_even = svqxtnb_s64(a);
4562 return svqxtnt_s64(a_in_even, b);
4564 const svint32_t a32 =
BitCast(d32, detail::SaturateI<int32_t>(a));
4565 const svint32_t b32 =
BitCast(d32, detail::SaturateI<int32_t>(b));
4566 return detail::InterleaveEven(a32, b32);
4570template <
size_t N,
int kPow2>
4575 const svuint32_t a_in_even = svqxtunb_s64(a);
4576 return svqxtunt_s64(a_in_even, b);
4579 const svuint64_t clamped_a =
BitCast(du64, detail::MaxN(a, 0));
4580 const svuint64_t clamped_b =
BitCast(du64, detail::MaxN(b, 0));
4581 const svuint32_t a32 =
BitCast(d32, detail::SaturateU<uint32_t>(clamped_a));
4582 const svuint32_t b32 =
BitCast(d32, detail::SaturateU<uint32_t>(clamped_b));
4583 return detail::InterleaveEven(a32, b32);
4587template <
size_t N,
int kPow2>
4592 const svuint32_t a_in_even = svqxtnb_u64(a);
4593 return svqxtnt_u64(a_in_even, b);
4595 const svuint32_t a32 =
BitCast(d32, detail::SaturateU<uint32_t>(a));
4596 const svuint32_t b32 =
BitCast(d32, detail::SaturateU<uint32_t>(b));
4597 return detail::InterleaveEven(a32, b32);
4606 return detail::InterleaveEven(clamped_a, clamped_b);
4609template <
class D,
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
4610 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4611 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)>
4613 const Half<
decltype(dn)> dnh;
4614 const auto demoted_a =
DemoteTo(dnh, a);
4615 const auto demoted_b =
DemoteTo(dnh, b);
4616 return Combine(dn, demoted_b, demoted_a);
4619template <
size_t N,
int kPow2>
4622#if HWY_SVE_HAVE_F32_TO_BF16C
4624 const VBF16 a_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), a);
4625 const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4626 return ConcatEven(dbf16, b_in_even, a_in_even);
4641 const Half<
decltype(
d)> dh;
4665template <
class V,
class M>
4679 static_assert(IsSigned<TFromV<V>>(),
"Only works for signed/float");
4691 return ShiftRight<1>(detail::AddN(
Add(a, b), 1));
4698template <
class D, HWY_IF_T_SIZE_D(D, 1)>
4702 const svuint8_t iota =
Iota(du, 0);
4705 const svuint8_t bytes =
BitCast(du, svld1ub_u64(detail::PTrue(
d), bits));
4707 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
4709 const svuint8_t bit =
4710 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4714template <
class D, HWY_IF_T_SIZE_D(D, 2)>
4717 const RebindToUnsigned<D> du;
4718 const Repartition<uint8_t, D> du8;
4721 const svuint8_t bytes = svld1(
FirstN(du8, (
Lanes(du) + 7) / 8), bits);
4724 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(
Iota(du8, 0)));
4726 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
4730template <
class D, HWY_IF_T_SIZE_D(D, 4)>
4733 const RebindToUnsigned<D> du;
4734 const Repartition<uint8_t, D> du8;
4738 const svuint8_t bytes = svld1(
FirstN(du8, 8), bits);
4741 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(
Iota(du8, 0)));
4744 const svuint32_t bit =
Shl(
Set(du, 1), detail::AndN(
Iota(du, 0), 7));
4749template <
class D, HWY_IF_T_SIZE_D(D, 8)>
4752 const RebindToUnsigned<D> du;
4757 CopyBytes<4>(bits, &mask_bits);
4758 const auto vbits =
Set(du, mask_bits);
4761 const svuint64_t bit =
Shl(
Set(du, 1),
Iota(du, 0));
4768template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
4773 if (kN < 8) mask_bits &= (1u << kN) - 1;
4776 const svuint8_t bytes =
BitCast(du,
Set(du,
static_cast<uint8_t
>(mask_bits)));
4778 const svuint8_t bit =
4779 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4783template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
4790 const svuint8_t bytes =
4791 BitCast(du,
Set(du16,
static_cast<uint16_t
>(mask_bits)));
4793 const svuint8_t rep8 = svtbl_u8(bytes, ShiftRight<3>(
Iota(du, 0)));
4795 const svuint8_t bit =
4796 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4800template <
class D, HWY_IF_T_SIZE_D(D, 2)>
4806 if (kN < 8) mask_bits &= (1u << kN) - 1;
4809 const svuint8_t bytes =
Set(du8,
static_cast<uint8_t
>(mask_bits));
4811 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
4815template <
class D, HWY_IF_T_SIZE_D(D, 4)>
4821 if (kN < 4) mask_bits &= (1u << kN) - 1;
4824 const svuint8_t bytes =
Set(du8,
static_cast<uint8_t
>(mask_bits));
4826 const svuint32_t bit = svdupq_n_u32(1, 2, 4, 8);
4830template <
class D, HWY_IF_T_SIZE_D(D, 8)>
4838 const svuint8_t bytes =
Set(du8,
static_cast<uint8_t
>(mask_bits));
4840 const svuint64_t bit = svdupq_n_u64(1, 2);
4849template <
class T, HWY_IF_T_SIZE(T, 1)>
4851 return svdup_n_u8_z(
m, 1);
4853template <
class T, HWY_IF_T_SIZE(T, 2)>
4856 const svuint8_t b16 =
BitCast(d8, svdup_n_u16_z(
m, 1));
4857 return detail::ConcatEvenFull(b16, b16);
4859template <
class T, HWY_IF_T_SIZE(T, 4)>
4863template <
class T, HWY_IF_T_SIZE(T, 8)>
4865 const ScalableTag<uint32_t> d32;
4866 const svuint32_t b64 =
BitCast(d32, svdup_n_u64_z(
m, 1));
4867 return U8FromU32(detail::ConcatEvenFull(b64, b64));
4890 svuint64_t bits_in_u64 =
4893 const size_t num_bits =
Lanes(
d);
4894 const size_t num_bytes = (num_bits + 8 - 1) / 8;
4902 const int mask =
static_cast<int>((1ull << num_bits) - 1);
4903 bits[0] =
static_cast<uint8_t
>(bits[0] & mask);
4911template <
class V, HWY_IF_NOT_T_SIZE_V(V, 1)>
4917template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
4925#ifdef HWY_NATIVE_EXPAND
4926#undef HWY_NATIVE_EXPAND
4928#define HWY_NATIVE_EXPAND
4935 alignas(16)
static constexpr uint8_t table[8 * 256] = {
4937 128, 128, 128, 128, 128, 128, 128, 128,
4938 0, 128, 128, 128, 128, 128, 128, 128,
4939 128, 0, 128, 128, 128, 128, 128, 128,
4940 0, 1, 128, 128, 128, 128, 128, 128,
4941 128, 128, 0, 128, 128, 128, 128, 128,
4942 0, 128, 1, 128, 128, 128, 128, 128,
4943 128, 0, 1, 128, 128, 128, 128, 128,
4944 0, 1, 2, 128, 128, 128, 128, 128,
4945 128, 128, 128, 0, 128, 128, 128, 128,
4946 0, 128, 128, 1, 128, 128, 128, 128,
4947 128, 0, 128, 1, 128, 128, 128, 128,
4948 0, 1, 128, 2, 128, 128, 128, 128,
4949 128, 128, 0, 1, 128, 128, 128, 128,
4950 0, 128, 1, 2, 128, 128, 128, 128,
4951 128, 0, 1, 2, 128, 128, 128, 128,
4952 0, 1, 2, 3, 128, 128, 128, 128,
4953 128, 128, 128, 128, 0, 128, 128, 128,
4954 0, 128, 128, 128, 1, 128, 128, 128,
4955 128, 0, 128, 128, 1, 128, 128, 128,
4956 0, 1, 128, 128, 2, 128, 128, 128,
4957 128, 128, 0, 128, 1, 128, 128, 128,
4958 0, 128, 1, 128, 2, 128, 128, 128,
4959 128, 0, 1, 128, 2, 128, 128, 128,
4960 0, 1, 2, 128, 3, 128, 128, 128,
4961 128, 128, 128, 0, 1, 128, 128, 128,
4962 0, 128, 128, 1, 2, 128, 128, 128,
4963 128, 0, 128, 1, 2, 128, 128, 128,
4964 0, 1, 128, 2, 3, 128, 128, 128,
4965 128, 128, 0, 1, 2, 128, 128, 128,
4966 0, 128, 1, 2, 3, 128, 128, 128,
4967 128, 0, 1, 2, 3, 128, 128, 128,
4968 0, 1, 2, 3, 4, 128, 128, 128,
4969 128, 128, 128, 128, 128, 0, 128, 128,
4970 0, 128, 128, 128, 128, 1, 128, 128,
4971 128, 0, 128, 128, 128, 1, 128, 128,
4972 0, 1, 128, 128, 128, 2, 128, 128,
4973 128, 128, 0, 128, 128, 1, 128, 128,
4974 0, 128, 1, 128, 128, 2, 128, 128,
4975 128, 0, 1, 128, 128, 2, 128, 128,
4976 0, 1, 2, 128, 128, 3, 128, 128,
4977 128, 128, 128, 0, 128, 1, 128, 128,
4978 0, 128, 128, 1, 128, 2, 128, 128,
4979 128, 0, 128, 1, 128, 2, 128, 128,
4980 0, 1, 128, 2, 128, 3, 128, 128,
4981 128, 128, 0, 1, 128, 2, 128, 128,
4982 0, 128, 1, 2, 128, 3, 128, 128,
4983 128, 0, 1, 2, 128, 3, 128, 128,
4984 0, 1, 2, 3, 128, 4, 128, 128,
4985 128, 128, 128, 128, 0, 1, 128, 128,
4986 0, 128, 128, 128, 1, 2, 128, 128,
4987 128, 0, 128, 128, 1, 2, 128, 128,
4988 0, 1, 128, 128, 2, 3, 128, 128,
4989 128, 128, 0, 128, 1, 2, 128, 128,
4990 0, 128, 1, 128, 2, 3, 128, 128,
4991 128, 0, 1, 128, 2, 3, 128, 128,
4992 0, 1, 2, 128, 3, 4, 128, 128,
4993 128, 128, 128, 0, 1, 2, 128, 128,
4994 0, 128, 128, 1, 2, 3, 128, 128,
4995 128, 0, 128, 1, 2, 3, 128, 128,
4996 0, 1, 128, 2, 3, 4, 128, 128,
4997 128, 128, 0, 1, 2, 3, 128, 128,
4998 0, 128, 1, 2, 3, 4, 128, 128,
4999 128, 0, 1, 2, 3, 4, 128, 128,
5000 0, 1, 2, 3, 4, 5, 128, 128,
5001 128, 128, 128, 128, 128, 128, 0, 128,
5002 0, 128, 128, 128, 128, 128, 1, 128,
5003 128, 0, 128, 128, 128, 128, 1, 128,
5004 0, 1, 128, 128, 128, 128, 2, 128,
5005 128, 128, 0, 128, 128, 128, 1, 128,
5006 0, 128, 1, 128, 128, 128, 2, 128,
5007 128, 0, 1, 128, 128, 128, 2, 128,
5008 0, 1, 2, 128, 128, 128, 3, 128,
5009 128, 128, 128, 0, 128, 128, 1, 128,
5010 0, 128, 128, 1, 128, 128, 2, 128,
5011 128, 0, 128, 1, 128, 128, 2, 128,
5012 0, 1, 128, 2, 128, 128, 3, 128,
5013 128, 128, 0, 1, 128, 128, 2, 128,
5014 0, 128, 1, 2, 128, 128, 3, 128,
5015 128, 0, 1, 2, 128, 128, 3, 128,
5016 0, 1, 2, 3, 128, 128, 4, 128,
5017 128, 128, 128, 128, 0, 128, 1, 128,
5018 0, 128, 128, 128, 1, 128, 2, 128,
5019 128, 0, 128, 128, 1, 128, 2, 128,
5020 0, 1, 128, 128, 2, 128, 3, 128,
5021 128, 128, 0, 128, 1, 128, 2, 128,
5022 0, 128, 1, 128, 2, 128, 3, 128,
5023 128, 0, 1, 128, 2, 128, 3, 128,
5024 0, 1, 2, 128, 3, 128, 4, 128,
5025 128, 128, 128, 0, 1, 128, 2, 128,
5026 0, 128, 128, 1, 2, 128, 3, 128,
5027 128, 0, 128, 1, 2, 128, 3, 128,
5028 0, 1, 128, 2, 3, 128, 4, 128,
5029 128, 128, 0, 1, 2, 128, 3, 128,
5030 0, 128, 1, 2, 3, 128, 4, 128,
5031 128, 0, 1, 2, 3, 128, 4, 128,
5032 0, 1, 2, 3, 4, 128, 5, 128,
5033 128, 128, 128, 128, 128, 0, 1, 128,
5034 0, 128, 128, 128, 128, 1, 2, 128,
5035 128, 0, 128, 128, 128, 1, 2, 128,
5036 0, 1, 128, 128, 128, 2, 3, 128,
5037 128, 128, 0, 128, 128, 1, 2, 128,
5038 0, 128, 1, 128, 128, 2, 3, 128,
5039 128, 0, 1, 128, 128, 2, 3, 128,
5040 0, 1, 2, 128, 128, 3, 4, 128,
5041 128, 128, 128, 0, 128, 1, 2, 128,
5042 0, 128, 128, 1, 128, 2, 3, 128,
5043 128, 0, 128, 1, 128, 2, 3, 128,
5044 0, 1, 128, 2, 128, 3, 4, 128,
5045 128, 128, 0, 1, 128, 2, 3, 128,
5046 0, 128, 1, 2, 128, 3, 4, 128,
5047 128, 0, 1, 2, 128, 3, 4, 128,
5048 0, 1, 2, 3, 128, 4, 5, 128,
5049 128, 128, 128, 128, 0, 1, 2, 128,
5050 0, 128, 128, 128, 1, 2, 3, 128,
5051 128, 0, 128, 128, 1, 2, 3, 128,
5052 0, 1, 128, 128, 2, 3, 4, 128,
5053 128, 128, 0, 128, 1, 2, 3, 128,
5054 0, 128, 1, 128, 2, 3, 4, 128,
5055 128, 0, 1, 128, 2, 3, 4, 128,
5056 0, 1, 2, 128, 3, 4, 5, 128,
5057 128, 128, 128, 0, 1, 2, 3, 128,
5058 0, 128, 128, 1, 2, 3, 4, 128,
5059 128, 0, 128, 1, 2, 3, 4, 128,
5060 0, 1, 128, 2, 3, 4, 5, 128,
5061 128, 128, 0, 1, 2, 3, 4, 128,
5062 0, 128, 1, 2, 3, 4, 5, 128,
5063 128, 0, 1, 2, 3, 4, 5, 128,
5064 0, 1, 2, 3, 4, 5, 6, 128,
5065 128, 128, 128, 128, 128, 128, 128, 0,
5066 0, 128, 128, 128, 128, 128, 128, 1,
5067 128, 0, 128, 128, 128, 128, 128, 1,
5068 0, 1, 128, 128, 128, 128, 128, 2,
5069 128, 128, 0, 128, 128, 128, 128, 1,
5070 0, 128, 1, 128, 128, 128, 128, 2,
5071 128, 0, 1, 128, 128, 128, 128, 2,
5072 0, 1, 2, 128, 128, 128, 128, 3,
5073 128, 128, 128, 0, 128, 128, 128, 1,
5074 0, 128, 128, 1, 128, 128, 128, 2,
5075 128, 0, 128, 1, 128, 128, 128, 2,
5076 0, 1, 128, 2, 128, 128, 128, 3,
5077 128, 128, 0, 1, 128, 128, 128, 2,
5078 0, 128, 1, 2, 128, 128, 128, 3,
5079 128, 0, 1, 2, 128, 128, 128, 3,
5080 0, 1, 2, 3, 128, 128, 128, 4,
5081 128, 128, 128, 128, 0, 128, 128, 1,
5082 0, 128, 128, 128, 1, 128, 128, 2,
5083 128, 0, 128, 128, 1, 128, 128, 2,
5084 0, 1, 128, 128, 2, 128, 128, 3,
5085 128, 128, 0, 128, 1, 128, 128, 2,
5086 0, 128, 1, 128, 2, 128, 128, 3,
5087 128, 0, 1, 128, 2, 128, 128, 3,
5088 0, 1, 2, 128, 3, 128, 128, 4,
5089 128, 128, 128, 0, 1, 128, 128, 2,
5090 0, 128, 128, 1, 2, 128, 128, 3,
5091 128, 0, 128, 1, 2, 128, 128, 3,
5092 0, 1, 128, 2, 3, 128, 128, 4,
5093 128, 128, 0, 1, 2, 128, 128, 3,
5094 0, 128, 1, 2, 3, 128, 128, 4,
5095 128, 0, 1, 2, 3, 128, 128, 4,
5096 0, 1, 2, 3, 4, 128, 128, 5,
5097 128, 128, 128, 128, 128, 0, 128, 1,
5098 0, 128, 128, 128, 128, 1, 128, 2,
5099 128, 0, 128, 128, 128, 1, 128, 2,
5100 0, 1, 128, 128, 128, 2, 128, 3,
5101 128, 128, 0, 128, 128, 1, 128, 2,
5102 0, 128, 1, 128, 128, 2, 128, 3,
5103 128, 0, 1, 128, 128, 2, 128, 3,
5104 0, 1, 2, 128, 128, 3, 128, 4,
5105 128, 128, 128, 0, 128, 1, 128, 2,
5106 0, 128, 128, 1, 128, 2, 128, 3,
5107 128, 0, 128, 1, 128, 2, 128, 3,
5108 0, 1, 128, 2, 128, 3, 128, 4,
5109 128, 128, 0, 1, 128, 2, 128, 3,
5110 0, 128, 1, 2, 128, 3, 128, 4,
5111 128, 0, 1, 2, 128, 3, 128, 4,
5112 0, 1, 2, 3, 128, 4, 128, 5,
5113 128, 128, 128, 128, 0, 1, 128, 2,
5114 0, 128, 128, 128, 1, 2, 128, 3,
5115 128, 0, 128, 128, 1, 2, 128, 3,
5116 0, 1, 128, 128, 2, 3, 128, 4,
5117 128, 128, 0, 128, 1, 2, 128, 3,
5118 0, 128, 1, 128, 2, 3, 128, 4,
5119 128, 0, 1, 128, 2, 3, 128, 4,
5120 0, 1, 2, 128, 3, 4, 128, 5,
5121 128, 128, 128, 0, 1, 2, 128, 3,
5122 0, 128, 128, 1, 2, 3, 128, 4,
5123 128, 0, 128, 1, 2, 3, 128, 4,
5124 0, 1, 128, 2, 3, 4, 128, 5,
5125 128, 128, 0, 1, 2, 3, 128, 4,
5126 0, 128, 1, 2, 3, 4, 128, 5,
5127 128, 0, 1, 2, 3, 4, 128, 5,
5128 0, 1, 2, 3, 4, 5, 128, 6,
5129 128, 128, 128, 128, 128, 128, 0, 1,
5130 0, 128, 128, 128, 128, 128, 1, 2,
5131 128, 0, 128, 128, 128, 128, 1, 2,
5132 0, 1, 128, 128, 128, 128, 2, 3,
5133 128, 128, 0, 128, 128, 128, 1, 2,
5134 0, 128, 1, 128, 128, 128, 2, 3,
5135 128, 0, 1, 128, 128, 128, 2, 3,
5136 0, 1, 2, 128, 128, 128, 3, 4,
5137 128, 128, 128, 0, 128, 128, 1, 2,
5138 0, 128, 128, 1, 128, 128, 2, 3,
5139 128, 0, 128, 1, 128, 128, 2, 3,
5140 0, 1, 128, 2, 128, 128, 3, 4,
5141 128, 128, 0, 1, 128, 128, 2, 3,
5142 0, 128, 1, 2, 128, 128, 3, 4,
5143 128, 0, 1, 2, 128, 128, 3, 4,
5144 0, 1, 2, 3, 128, 128, 4, 5,
5145 128, 128, 128, 128, 0, 128, 1, 2,
5146 0, 128, 128, 128, 1, 128, 2, 3,
5147 128, 0, 128, 128, 1, 128, 2, 3,
5148 0, 1, 128, 128, 2, 128, 3, 4,
5149 128, 128, 0, 128, 1, 128, 2, 3,
5150 0, 128, 1, 128, 2, 128, 3, 4,
5151 128, 0, 1, 128, 2, 128, 3, 4,
5152 0, 1, 2, 128, 3, 128, 4, 5,
5153 128, 128, 128, 0, 1, 128, 2, 3,
5154 0, 128, 128, 1, 2, 128, 3, 4,
5155 128, 0, 128, 1, 2, 128, 3, 4,
5156 0, 1, 128, 2, 3, 128, 4, 5,
5157 128, 128, 0, 1, 2, 128, 3, 4,
5158 0, 128, 1, 2, 3, 128, 4, 5,
5159 128, 0, 1, 2, 3, 128, 4, 5,
5160 0, 1, 2, 3, 4, 128, 5, 6,
5161 128, 128, 128, 128, 128, 0, 1, 2,
5162 0, 128, 128, 128, 128, 1, 2, 3,
5163 128, 0, 128, 128, 128, 1, 2, 3,
5164 0, 1, 128, 128, 128, 2, 3, 4,
5165 128, 128, 0, 128, 128, 1, 2, 3,
5166 0, 128, 1, 128, 128, 2, 3, 4,
5167 128, 0, 1, 128, 128, 2, 3, 4,
5168 0, 1, 2, 128, 128, 3, 4, 5,
5169 128, 128, 128, 0, 128, 1, 2, 3,
5170 0, 128, 128, 1, 128, 2, 3, 4,
5171 128, 0, 128, 1, 128, 2, 3, 4,
5172 0, 1, 128, 2, 128, 3, 4, 5,
5173 128, 128, 0, 1, 128, 2, 3, 4,
5174 0, 128, 1, 2, 128, 3, 4, 5,
5175 128, 0, 1, 2, 128, 3, 4, 5,
5176 0, 1, 2, 3, 128, 4, 5, 6,
5177 128, 128, 128, 128, 0, 1, 2, 3,
5178 0, 128, 128, 128, 1, 2, 3, 4,
5179 128, 0, 128, 128, 1, 2, 3, 4,
5180 0, 1, 128, 128, 2, 3, 4, 5,
5181 128, 128, 0, 128, 1, 2, 3, 4,
5182 0, 128, 1, 128, 2, 3, 4, 5,
5183 128, 0, 1, 128, 2, 3, 4, 5,
5184 0, 1, 2, 128, 3, 4, 5, 6,
5185 128, 128, 128, 0, 1, 2, 3, 4,
5186 0, 128, 128, 1, 2, 3, 4, 5,
5187 128, 0, 128, 1, 2, 3, 4, 5,
5188 0, 1, 128, 2, 3, 4, 5, 6,
5189 128, 128, 0, 1, 2, 3, 4, 5,
5190 0, 128, 1, 2, 3, 4, 5, 6,
5191 128, 0, 1, 2, 3, 4, 5, 6,
5192 0, 1, 2, 3, 4, 5, 6, 7};
5193 return Load(du8, table + mask_bits * 8);
5196template <
class D, HWY_IF_T_SIZE_D(D, 1)>
5200template <
class D,
class DU = RebindToUn
signed<D>, HWY_IF_NOT_T_SIZE_D(D, 1)>
5210 uint8_t mask_bytes[256 / 8];
5217 svbool_t next = svpfalse_b();
5218 size_t input_consumed = 0;
5219 const V iota =
Iota(
d, 0);
5220 for (
size_t i = 0; i <
Lanes(
d); i += 8) {
5221 uint64_t mask_bits = mask_bytes[i / 8];
5226 input_consumed +=
PopCount(mask_bits);
5227 next = detail::GeN(iota, ConvertScalarTo<T>(input_consumed));
5239template <
class V, HWY_IF_T_SIZE_V(V, 1)>
5241#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
5243 uint8_t mask_bytes[256 / 8];
5245 const uint64_t maskL = mask_bytes[0];
5246 const uint64_t maskH = mask_bytes[1];
5252 const T countL =
static_cast<T
>(
PopCount(maskL));
5263template <
class V, HWY_IF_T_SIZE_V(V, 2)>
5265#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
5268 const Rebind<uint8_t,
decltype(
d)> du8;
5271 const svuint16_t bits =
Shl(
Set(du16, 1),
Iota(du16, 3));
5272 const size_t offset = detail::SumOfLanesM(mask, bits);
5276 alignas(16)
static constexpr uint8_t table[8 * 256] = {
5278 255, 255, 255, 255, 255, 255, 255, 255,
5279 0, 255, 255, 255, 255, 255, 255, 255,
5280 255, 0, 255, 255, 255, 255, 255, 255,
5281 0, 1, 255, 255, 255, 255, 255, 255,
5282 255, 255, 0, 255, 255, 255, 255, 255,
5283 0, 255, 1, 255, 255, 255, 255, 255,
5284 255, 0, 1, 255, 255, 255, 255, 255,
5285 0, 1, 2, 255, 255, 255, 255, 255,
5286 255, 255, 255, 0, 255, 255, 255, 255,
5287 0, 255, 255, 1, 255, 255, 255, 255,
5288 255, 0, 255, 1, 255, 255, 255, 255,
5289 0, 1, 255, 2, 255, 255, 255, 255,
5290 255, 255, 0, 1, 255, 255, 255, 255,
5291 0, 255, 1, 2, 255, 255, 255, 255,
5292 255, 0, 1, 2, 255, 255, 255, 255,
5293 0, 1, 2, 3, 255, 255, 255, 255,
5294 255, 255, 255, 255, 0, 255, 255, 255,
5295 0, 255, 255, 255, 1, 255, 255, 255,
5296 255, 0, 255, 255, 1, 255, 255, 255,
5297 0, 1, 255, 255, 2, 255, 255, 255,
5298 255, 255, 0, 255, 1, 255, 255, 255,
5299 0, 255, 1, 255, 2, 255, 255, 255,
5300 255, 0, 1, 255, 2, 255, 255, 255,
5301 0, 1, 2, 255, 3, 255, 255, 255,
5302 255, 255, 255, 0, 1, 255, 255, 255,
5303 0, 255, 255, 1, 2, 255, 255, 255,
5304 255, 0, 255, 1, 2, 255, 255, 255,
5305 0, 1, 255, 2, 3, 255, 255, 255,
5306 255, 255, 0, 1, 2, 255, 255, 255,
5307 0, 255, 1, 2, 3, 255, 255, 255,
5308 255, 0, 1, 2, 3, 255, 255, 255,
5309 0, 1, 2, 3, 4, 255, 255, 255,
5310 255, 255, 255, 255, 255, 0, 255, 255,
5311 0, 255, 255, 255, 255, 1, 255, 255,
5312 255, 0, 255, 255, 255, 1, 255, 255,
5313 0, 1, 255, 255, 255, 2, 255, 255,
5314 255, 255, 0, 255, 255, 1, 255, 255,
5315 0, 255, 1, 255, 255, 2, 255, 255,
5316 255, 0, 1, 255, 255, 2, 255, 255,
5317 0, 1, 2, 255, 255, 3, 255, 255,
5318 255, 255, 255, 0, 255, 1, 255, 255,
5319 0, 255, 255, 1, 255, 2, 255, 255,
5320 255, 0, 255, 1, 255, 2, 255, 255,
5321 0, 1, 255, 2, 255, 3, 255, 255,
5322 255, 255, 0, 1, 255, 2, 255, 255,
5323 0, 255, 1, 2, 255, 3, 255, 255,
5324 255, 0, 1, 2, 255, 3, 255, 255,
5325 0, 1, 2, 3, 255, 4, 255, 255,
5326 255, 255, 255, 255, 0, 1, 255, 255,
5327 0, 255, 255, 255, 1, 2, 255, 255,
5328 255, 0, 255, 255, 1, 2, 255, 255,
5329 0, 1, 255, 255, 2, 3, 255, 255,
5330 255, 255, 0, 255, 1, 2, 255, 255,
5331 0, 255, 1, 255, 2, 3, 255, 255,
5332 255, 0, 1, 255, 2, 3, 255, 255,
5333 0, 1, 2, 255, 3, 4, 255, 255,
5334 255, 255, 255, 0, 1, 2, 255, 255,
5335 0, 255, 255, 1, 2, 3, 255, 255,
5336 255, 0, 255, 1, 2, 3, 255, 255,
5337 0, 1, 255, 2, 3, 4, 255, 255,
5338 255, 255, 0, 1, 2, 3, 255, 255,
5339 0, 255, 1, 2, 3, 4, 255, 255,
5340 255, 0, 1, 2, 3, 4, 255, 255,
5341 0, 1, 2, 3, 4, 5, 255, 255,
5342 255, 255, 255, 255, 255, 255, 0, 255,
5343 0, 255, 255, 255, 255, 255, 1, 255,
5344 255, 0, 255, 255, 255, 255, 1, 255,
5345 0, 1, 255, 255, 255, 255, 2, 255,
5346 255, 255, 0, 255, 255, 255, 1, 255,
5347 0, 255, 1, 255, 255, 255, 2, 255,
5348 255, 0, 1, 255, 255, 255, 2, 255,
5349 0, 1, 2, 255, 255, 255, 3, 255,
5350 255, 255, 255, 0, 255, 255, 1, 255,
5351 0, 255, 255, 1, 255, 255, 2, 255,
5352 255, 0, 255, 1, 255, 255, 2, 255,
5353 0, 1, 255, 2, 255, 255, 3, 255,
5354 255, 255, 0, 1, 255, 255, 2, 255,
5355 0, 255, 1, 2, 255, 255, 3, 255,
5356 255, 0, 1, 2, 255, 255, 3, 255,
5357 0, 1, 2, 3, 255, 255, 4, 255,
5358 255, 255, 255, 255, 0, 255, 1, 255,
5359 0, 255, 255, 255, 1, 255, 2, 255,
5360 255, 0, 255, 255, 1, 255, 2, 255,
5361 0, 1, 255, 255, 2, 255, 3, 255,
5362 255, 255, 0, 255, 1, 255, 2, 255,
5363 0, 255, 1, 255, 2, 255, 3, 255,
5364 255, 0, 1, 255, 2, 255, 3, 255,
5365 0, 1, 2, 255, 3, 255, 4, 255,
5366 255, 255, 255, 0, 1, 255, 2, 255,
5367 0, 255, 255, 1, 2, 255, 3, 255,
5368 255, 0, 255, 1, 2, 255, 3, 255,
5369 0, 1, 255, 2, 3, 255, 4, 255,
5370 255, 255, 0, 1, 2, 255, 3, 255,
5371 0, 255, 1, 2, 3, 255, 4, 255,
5372 255, 0, 1, 2, 3, 255, 4, 255,
5373 0, 1, 2, 3, 4, 255, 5, 255,
5374 255, 255, 255, 255, 255, 0, 1, 255,
5375 0, 255, 255, 255, 255, 1, 2, 255,
5376 255, 0, 255, 255, 255, 1, 2, 255,
5377 0, 1, 255, 255, 255, 2, 3, 255,
5378 255, 255, 0, 255, 255, 1, 2, 255,
5379 0, 255, 1, 255, 255, 2, 3, 255,
5380 255, 0, 1, 255, 255, 2, 3, 255,
5381 0, 1, 2, 255, 255, 3, 4, 255,
5382 255, 255, 255, 0, 255, 1, 2, 255,
5383 0, 255, 255, 1, 255, 2, 3, 255,
5384 255, 0, 255, 1, 255, 2, 3, 255,
5385 0, 1, 255, 2, 255, 3, 4, 255,
5386 255, 255, 0, 1, 255, 2, 3, 255,
5387 0, 255, 1, 2, 255, 3, 4, 255,
5388 255, 0, 1, 2, 255, 3, 4, 255,
5389 0, 1, 2, 3, 255, 4, 5, 255,
5390 255, 255, 255, 255, 0, 1, 2, 255,
5391 0, 255, 255, 255, 1, 2, 3, 255,
5392 255, 0, 255, 255, 1, 2, 3, 255,
5393 0, 1, 255, 255, 2, 3, 4, 255,
5394 255, 255, 0, 255, 1, 2, 3, 255,
5395 0, 255, 1, 255, 2, 3, 4, 255,
5396 255, 0, 1, 255, 2, 3, 4, 255,
5397 0, 1, 2, 255, 3, 4, 5, 255,
5398 255, 255, 255, 0, 1, 2, 3, 255,
5399 0, 255, 255, 1, 2, 3, 4, 255,
5400 255, 0, 255, 1, 2, 3, 4, 255,
5401 0, 1, 255, 2, 3, 4, 5, 255,
5402 255, 255, 0, 1, 2, 3, 4, 255,
5403 0, 255, 1, 2, 3, 4, 5, 255,
5404 255, 0, 1, 2, 3, 4, 5, 255,
5405 0, 1, 2, 3, 4, 5, 6, 255,
5406 255, 255, 255, 255, 255, 255, 255, 0,
5407 0, 255, 255, 255, 255, 255, 255, 1,
5408 255, 0, 255, 255, 255, 255, 255, 1,
5409 0, 1, 255, 255, 255, 255, 255, 2,
5410 255, 255, 0, 255, 255, 255, 255, 1,
5411 0, 255, 1, 255, 255, 255, 255, 2,
5412 255, 0, 1, 255, 255, 255, 255, 2,
5413 0, 1, 2, 255, 255, 255, 255, 3,
5414 255, 255, 255, 0, 255, 255, 255, 1,
5415 0, 255, 255, 1, 255, 255, 255, 2,
5416 255, 0, 255, 1, 255, 255, 255, 2,
5417 0, 1, 255, 2, 255, 255, 255, 3,
5418 255, 255, 0, 1, 255, 255, 255, 2,
5419 0, 255, 1, 2, 255, 255, 255, 3,
5420 255, 0, 1, 2, 255, 255, 255, 3,
5421 0, 1, 2, 3, 255, 255, 255, 4,
5422 255, 255, 255, 255, 0, 255, 255, 1,
5423 0, 255, 255, 255, 1, 255, 255, 2,
5424 255, 0, 255, 255, 1, 255, 255, 2,
5425 0, 1, 255, 255, 2, 255, 255, 3,
5426 255, 255, 0, 255, 1, 255, 255, 2,
5427 0, 255, 1, 255, 2, 255, 255, 3,
5428 255, 0, 1, 255, 2, 255, 255, 3,
5429 0, 1, 2, 255, 3, 255, 255, 4,
5430 255, 255, 255, 0, 1, 255, 255, 2,
5431 0, 255, 255, 1, 2, 255, 255, 3,
5432 255, 0, 255, 1, 2, 255, 255, 3,
5433 0, 1, 255, 2, 3, 255, 255, 4,
5434 255, 255, 0, 1, 2, 255, 255, 3,
5435 0, 255, 1, 2, 3, 255, 255, 4,
5436 255, 0, 1, 2, 3, 255, 255, 4,
5437 0, 1, 2, 3, 4, 255, 255, 5,
5438 255, 255, 255, 255, 255, 0, 255, 1,
5439 0, 255, 255, 255, 255, 1, 255, 2,
5440 255, 0, 255, 255, 255, 1, 255, 2,
5441 0, 1, 255, 255, 255, 2, 255, 3,
5442 255, 255, 0, 255, 255, 1, 255, 2,
5443 0, 255, 1, 255, 255, 2, 255, 3,
5444 255, 0, 1, 255, 255, 2, 255, 3,
5445 0, 1, 2, 255, 255, 3, 255, 4,
5446 255, 255, 255, 0, 255, 1, 255, 2,
5447 0, 255, 255, 1, 255, 2, 255, 3,
5448 255, 0, 255, 1, 255, 2, 255, 3,
5449 0, 1, 255, 2, 255, 3, 255, 4,
5450 255, 255, 0, 1, 255, 2, 255, 3,
5451 0, 255, 1, 2, 255, 3, 255, 4,
5452 255, 0, 1, 2, 255, 3, 255, 4,
5453 0, 1, 2, 3, 255, 4, 255, 5,
5454 255, 255, 255, 255, 0, 1, 255, 2,
5455 0, 255, 255, 255, 1, 2, 255, 3,
5456 255, 0, 255, 255, 1, 2, 255, 3,
5457 0, 1, 255, 255, 2, 3, 255, 4,
5458 255, 255, 0, 255, 1, 2, 255, 3,
5459 0, 255, 1, 255, 2, 3, 255, 4,
5460 255, 0, 1, 255, 2, 3, 255, 4,
5461 0, 1, 2, 255, 3, 4, 255, 5,
5462 255, 255, 255, 0, 1, 2, 255, 3,
5463 0, 255, 255, 1, 2, 3, 255, 4,
5464 255, 0, 255, 1, 2, 3, 255, 4,
5465 0, 1, 255, 2, 3, 4, 255, 5,
5466 255, 255, 0, 1, 2, 3, 255, 4,
5467 0, 255, 1, 2, 3, 4, 255, 5,
5468 255, 0, 1, 2, 3, 4, 255, 5,
5469 0, 1, 2, 3, 4, 5, 255, 6,
5470 255, 255, 255, 255, 255, 255, 0, 1,
5471 0, 255, 255, 255, 255, 255, 1, 2,
5472 255, 0, 255, 255, 255, 255, 1, 2,
5473 0, 1, 255, 255, 255, 255, 2, 3,
5474 255, 255, 0, 255, 255, 255, 1, 2,
5475 0, 255, 1, 255, 255, 255, 2, 3,
5476 255, 0, 1, 255, 255, 255, 2, 3,
5477 0, 1, 2, 255, 255, 255, 3, 4,
5478 255, 255, 255, 0, 255, 255, 1, 2,
5479 0, 255, 255, 1, 255, 255, 2, 3,
5480 255, 0, 255, 1, 255, 255, 2, 3,
5481 0, 1, 255, 2, 255, 255, 3, 4,
5482 255, 255, 0, 1, 255, 255, 2, 3,
5483 0, 255, 1, 2, 255, 255, 3, 4,
5484 255, 0, 1, 2, 255, 255, 3, 4,
5485 0, 1, 2, 3, 255, 255, 4, 5,
5486 255, 255, 255, 255, 0, 255, 1, 2,
5487 0, 255, 255, 255, 1, 255, 2, 3,
5488 255, 0, 255, 255, 1, 255, 2, 3,
5489 0, 1, 255, 255, 2, 255, 3, 4,
5490 255, 255, 0, 255, 1, 255, 2, 3,
5491 0, 255, 1, 255, 2, 255, 3, 4,
5492 255, 0, 1, 255, 2, 255, 3, 4,
5493 0, 1, 2, 255, 3, 255, 4, 5,
5494 255, 255, 255, 0, 1, 255, 2, 3,
5495 0, 255, 255, 1, 2, 255, 3, 4,
5496 255, 0, 255, 1, 2, 255, 3, 4,
5497 0, 1, 255, 2, 3, 255, 4, 5,
5498 255, 255, 0, 1, 2, 255, 3, 4,
5499 0, 255, 1, 2, 3, 255, 4, 5,
5500 255, 0, 1, 2, 3, 255, 4, 5,
5501 0, 1, 2, 3, 4, 255, 5, 6,
5502 255, 255, 255, 255, 255, 0, 1, 2,
5503 0, 255, 255, 255, 255, 1, 2, 3,
5504 255, 0, 255, 255, 255, 1, 2, 3,
5505 0, 1, 255, 255, 255, 2, 3, 4,
5506 255, 255, 0, 255, 255, 1, 2, 3,
5507 0, 255, 1, 255, 255, 2, 3, 4,
5508 255, 0, 1, 255, 255, 2, 3, 4,
5509 0, 1, 2, 255, 255, 3, 4, 5,
5510 255, 255, 255, 0, 255, 1, 2, 3,
5511 0, 255, 255, 1, 255, 2, 3, 4,
5512 255, 0, 255, 1, 255, 2, 3, 4,
5513 0, 1, 255, 2, 255, 3, 4, 5,
5514 255, 255, 0, 1, 255, 2, 3, 4,
5515 0, 255, 1, 2, 255, 3, 4, 5,
5516 255, 0, 1, 2, 255, 3, 4, 5,
5517 0, 1, 2, 3, 255, 4, 5, 6,
5518 255, 255, 255, 255, 0, 1, 2, 3,
5519 0, 255, 255, 255, 1, 2, 3, 4,
5520 255, 0, 255, 255, 1, 2, 3, 4,
5521 0, 1, 255, 255, 2, 3, 4, 5,
5522 255, 255, 0, 255, 1, 2, 3, 4,
5523 0, 255, 1, 255, 2, 3, 4, 5,
5524 255, 0, 1, 255, 2, 3, 4, 5,
5525 0, 1, 2, 255, 3, 4, 5, 6,
5526 255, 255, 255, 0, 1, 2, 3, 4,
5527 0, 255, 255, 1, 2, 3, 4, 5,
5528 255, 0, 255, 1, 2, 3, 4, 5,
5529 0, 1, 255, 2, 3, 4, 5, 6,
5530 255, 255, 0, 1, 2, 3, 4, 5,
5531 0, 255, 1, 2, 3, 4, 5, 6,
5532 255, 0, 1, 2, 3, 4, 5, 6,
5533 0, 1, 2, 3, 4, 5, 6, 7};
5541template <
class V, HWY_IF_T_SIZE_V(V, 4)>
5543#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
5547 const svuint32_t bits =
Shl(
Set(du32, 1),
Iota(du32, 0));
5548 const size_t code = detail::SumOfLanesM(mask, bits);
5550 alignas(16)
constexpr uint32_t packed_array[256] = {
5552 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
5553 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
5554 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
5555 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
5556 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
5557 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
5558 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
5559 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
5560 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
5561 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
5562 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
5563 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
5564 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
5565 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
5566 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
5567 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
5568 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
5569 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
5570 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
5571 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
5572 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
5573 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
5574 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
5575 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
5576 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
5577 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
5578 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
5579 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
5580 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
5581 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
5582 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
5583 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
5584 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
5585 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
5586 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
5587 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
5588 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
5589 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
5590 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
5591 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
5592 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
5593 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
5594 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210};
5598 const svuint32_t packed =
Set(du32, packed_array[code]);
5599 const svuint32_t
indices = detail::AndN(
Shr(packed, svindex_u32(0, 4)), 0xF);
5601#elif HWY_TARGET == HWY_SVE2_128
5605 const svuint32_t bits =
Shl(
Set(du32, 1),
Iota(du32, 0));
5606 const size_t offset = detail::SumOfLanesM(mask, bits);
5608 alignas(16)
constexpr uint32_t packed_array[16] = {
5610 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
5611 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
5612 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
5616 const svuint32_t packed =
Set(du32, packed_array[offset]);
5617 const svuint32_t
indices = detail::AndN(
Shr(packed, svindex_u32(0, 4)), 0xF);
5624template <
class V, HWY_IF_T_SIZE_V(V, 8)>
5626#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
5633 const svuint64_t bits =
Shl(
Set(du64, 1),
Iota(du64, 2));
5634 const size_t offset = detail::SumOfLanesM(mask, bits);
5636 alignas(16)
static constexpr uint64_t table[4 * 16] = {
5638 255, 255, 255, 255, 0, 255, 255, 255, 255, 0, 255, 255, 0, 1, 255, 255,
5639 255, 255, 0, 255, 0, 255, 1, 255, 255, 0, 1, 255, 0, 1, 2, 255,
5640 255, 255, 255, 0, 0, 255, 255, 1, 255, 0, 255, 1, 0, 1, 255, 2,
5641 255, 255, 0, 1, 0, 255, 1, 2, 255, 0, 1, 2, 0, 1, 2, 3};
5644#elif HWY_TARGET == HWY_SVE2_128
5664#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
5665 HWY_API HWY_SVE_V(BASE, BITS) \
5666 NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
5667 return sv##OP##_##CHAR##BITS(a, b); \
5676#undef HWY_SVE_MUL_EVEN
5680template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>,
5681 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
5682HWY_API VFromD<DW> MulEven(const V a, const V b) {
5684 return BitCast(DW(), detail::MulEvenNative(a, b));
5686 const auto lo = Mul(a, b);
5687 const auto hi = MulHigh(a, b);
5688 return BitCast(DW(), detail::InterleaveEven(lo, hi));
5692template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>,
5693 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
5694HWY_API VFromD<DW> MulOdd(const V a, const V b) {
5696 return BitCast(DW(), detail::MulOddNative(a, b));
5698 const auto lo = Mul(a, b);
5699 const auto hi = MulHigh(a, b);
5700 return BitCast(DW(), detail::InterleaveOdd(lo, hi));
5704HWY_API sv
int64_t MulEven(const sv
int64_t a, const sv
int64_t b) {
5705 const auto lo = Mul(a, b);
5706 const auto hi = MulHigh(a, b);
5707 return detail::InterleaveEven(lo, hi);
5710HWY_API svu
int64_t MulEven(const svu
int64_t a, const svu
int64_t b) {
5711 const auto lo = Mul(a, b);
5712 const auto hi = MulHigh(a, b);
5713 return detail::InterleaveEven(lo, hi);
5716HWY_API sv
int64_t MulOdd(const sv
int64_t a, const sv
int64_t b) {
5717 const auto lo = Mul(a, b);
5718 const auto hi = MulHigh(a, b);
5719 return detail::InterleaveOdd(lo, hi);
5722HWY_API svu
int64_t MulOdd(const svu
int64_t a, const svu
int64_t b) {
5723 const auto lo = Mul(a, b);
5724 const auto hi = MulHigh(a, b);
5725 return detail::InterleaveOdd(lo, hi);
5730template <
size_t N,
int kPow2>
5733#if HWY_SVE_HAVE_F32_TO_BF16C
5734 const svfloat32_t even = svbfmlalb_f32(
Zero(df32), a, b);
5735 return svbfmlalt_f32(even, a, b);
5740 using VU32 =
VFromD<
decltype(du32)>;
5741 const VU32 odd =
Set(du32, 0xFFFF0000u);
5742 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
5744 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
5751template <
size_t N,
int kPow2>
5756 return svmlalt_s32(svmullb_s32(a, b), a, b);
5758 const svbool_t pg = detail::PTrue(d32);
5761 const svint32_t ae = svexth_s32_x(pg,
BitCast(d32, a));
5762 const svint32_t be = svexth_s32_x(pg,
BitCast(d32, b));
5763 const svint32_t ao = ShiftRight<16>(
BitCast(d32, a));
5764 const svint32_t bo = ShiftRight<16>(
BitCast(d32, b));
5765 return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be);
5769template <
size_t N,
int kPow2>
5771 svuint16_t a, svuint16_t b) {
5774 return svmlalt_u32(svmullb_u32(a, b), a, b);
5776 const svbool_t pg = detail::PTrue(d32);
5779 const svuint32_t ae = svexth_u32_x(pg,
BitCast(d32, a));
5780 const svuint32_t be = svexth_u32_x(pg,
BitCast(d32, b));
5781 const svuint32_t ao = ShiftRight<16>(
BitCast(d32, a));
5782 const svuint32_t bo = ShiftRight<16>(
BitCast(d32, b));
5783 return svmla_u32_x(pg, svmul_u32_x(pg, ao, bo), ae, be);
5791#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5792#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5794#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5797template <
class DI32, HWY_IF_I32_D(DI32)>
5802 return svqdmlalb_s32(sum, detail::ZipLowerSame(a, a),
5803 detail::ZipLowerSame(b, b));
5810template <
size_t N,
int kPow2>
5813 const svfloat32_t sum0,
5814 svfloat32_t& sum1) {
5815#if HWY_SVE_HAVE_BF16_FEATURE
5817 sum1 = svbfmlalt_f32(sum1, a, b);
5818 return svbfmlalb_f32(sum0, a, b);
5823 using VU32 =
VFromD<
decltype(du32)>;
5824 const VU32 odd =
Set(du32, 0xFFFF0000u);
5825 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
5827 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
5834template <
size_t N,
int kPow2>
5836 svint16_t a, svint16_t b,
5837 const svint32_t sum0,
5841 sum1 = svmlalt_s32(sum1, a, b);
5842 return svmlalb_s32(sum0, a, b);
5844 const svbool_t pg = detail::PTrue(d32);
5847 const svint32_t ae = svexth_s32_x(pg,
BitCast(d32, a));
5848 const svint32_t be = svexth_s32_x(pg,
BitCast(d32, b));
5849 const svint32_t ao = ShiftRight<16>(
BitCast(d32, a));
5850 const svint32_t bo = ShiftRight<16>(
BitCast(d32, b));
5851 sum1 = svmla_s32_x(pg, sum1, ao, bo);
5852 return svmla_s32_x(pg, sum0, ae, be);
5856template <
size_t N,
int kPow2>
5858 svuint16_t a, svuint16_t b,
5859 const svuint32_t sum0,
5863 sum1 = svmlalt_u32(sum1, a, b);
5864 return svmlalb_u32(sum0, a, b);
5866 const svbool_t pg = detail::PTrue(d32);
5869 const svuint32_t ae = svexth_u32_x(pg,
BitCast(d32, a));
5870 const svuint32_t be = svexth_u32_x(pg,
BitCast(d32, b));
5871 const svuint32_t ao = ShiftRight<16>(
BitCast(d32, a));
5872 const svuint32_t bo = ShiftRight<16>(
BitCast(d32, b));
5873 sum1 = svmla_u32_x(pg, sum1, ao, bo);
5874 return svmla_u32_x(pg, sum0, ae, be);
5882 return Add(sum0, sum1);
5887#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5888#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5890#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5893template <
class DI32, HWY_IF_I32_D(DI32)>
5895 svint8_t b, svint32_t sum) {
5896 return svdot_s32(sum, a, b);
5899#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5900#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5902#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5905template <
class DU32, HWY_IF_U32_D(DU32)>
5907 svuint8_t b, svuint32_t sum) {
5908 return svdot_u32(sum, a, b);
5911#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5912#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5914#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5917template <
class DI32, HWY_IF_I32_D(DI32)>
5919 svint8_t b_i, svint32_t sum) {
5926 const auto b_u =
BitCast(du8, b_i);
5927 const auto result_sum0 = svdot_u32(
BitCast(du32, sum), a_u, b_u);
5928 const auto result_sum1 =
5929 ShiftLeft<8>(svdot_u32(
Zero(du32), a_u, ShiftRight<7>(b_u)));
5931 return BitCast(di32,
Sub(result_sum0, result_sum1));
5934#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5935#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5937#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5940template <
class DI64, HWY_IF_I64_D(DI64)>
5942 svint16_t b, svint64_t sum) {
5943 return svdot_s64(sum, a, b);
5946#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5947#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5949#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5952template <
class DU64, HWY_IF_U64_D(DU64)>
5954 svuint16_t b, svuint64_t sum) {
5955 return svdot_u64(sum, a, b);
5962#if defined(__ARM_FEATURE_SVE2_AES) || \
5963 (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH && HWY_BASELINE_SVE2 == 0)
5966#ifdef HWY_NATIVE_AES
5967#undef HWY_NATIVE_AES
5969#define HWY_NATIVE_AES
5974 return Xor(svaesmc_u8(svaese_u8(state, svdup_n_u8(0))), round_key);
5978 return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
5982 return svaesimc_u8(state);
5986 return Xor(svaesimc_u8(svaesd_u8(state, svdup_n_u8(0))), round_key);
5990 return Xor(svaesd_u8(state, svdup_n_u8(0)), round_key);
5993template <u
int8_t kRcon>
5995 alignas(16)
static constexpr uint8_t kRconXorMask[16] = {
5996 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
5997 alignas(16)
static constexpr uint8_t kRotWordShuffle[16] = {
5998 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
6007 return svpmullb_pair(a, b);
6011 return svpmullt_pair(a, b);
6019#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
6020 template <size_t N, int kPow2> \
6021 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
6022 return sv##OP##_b##BITS(m, m); \
6029#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
6032 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6033 const svbool_t eqHx =
Eq(a, b);
6048#if HWY_TARGET == HWY_SVE_256
6051 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6052 const svbool_t eqHx =
Eq(a, b);
6053 const svbool_t ltHL =
Lt(a, b);
6055 const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(
d, ltHL), ltHL);
6057 return detail::DupOddB(
d, ltHx);
6065 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6066 const svbool_t ltHL =
Lt(a, b);
6067 return detail::DupOddB(
d, ltHL);
6072#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
6077 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6082 const svuint64_t eqHH =
DupOdd(eqHL);
6083 const svuint64_t eqLL =
DupEven(eqHL);
6084 return And(eqLL, eqHH);
6089 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6094 const svuint64_t neHH =
DupOdd(neHL);
6095 const svuint64_t neLL =
DupEven(neHL);
6096 return Or(neLL, neHH);
6104#if HWY_TARGET == HWY_SVE_256
6107 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6108 const svbool_t eqHL =
Eq(a, b);
6109 const svbool_t eqHH = detail::DupOddB(
d, eqHL);
6110 const svbool_t eqLL = detail::DupEvenB(
d, eqHL);
6111 return And(eqLL, eqHH);
6117#if HWY_TARGET == HWY_SVE_256
6120 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6121 const svbool_t neHL =
Ne(a, b);
6122 const svbool_t neHH = detail::DupOddB(
d, neHL);
6123 const svbool_t neLL = detail::DupEvenB(
d, neHL);
6124 return Or(neLL, neHH);
6132 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6133 const svbool_t eqHL =
Eq(a, b);
6134 return detail::DupOddB(
d, eqHL);
6139 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
6140 const svbool_t neHL =
Ne(a, b);
6141 return detail::DupOddB(
d, neHL);
6148#if HWY_TARGET == HWY_SVE_256
6157#if HWY_TARGET == HWY_SVE_256
6176#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
6177#undef HWY_NATIVE_LEADING_ZERO_COUNT
6179#define HWY_NATIVE_LEADING_ZERO_COUNT
6182#define HWY_SVE_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP) \
6183 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
6184 const DFromV<decltype(v)> d; \
6185 return BitCast(d, sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v)); \
6189#undef HWY_SVE_LEADING_ZERO_COUNT
6191template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
6196template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
6199 using T =
TFromD<
decltype(
d)>;
6204#undef HWY_SVE_ALL_PTRUE
6206#undef HWY_SVE_FOREACH
6207#undef HWY_SVE_FOREACH_BF16
6208#undef HWY_SVE_FOREACH_BF16_UNCONDITIONAL
6209#undef HWY_SVE_FOREACH_F
6210#undef HWY_SVE_FOREACH_F16
6211#undef HWY_SVE_FOREACH_F32
6212#undef HWY_SVE_FOREACH_F3264
6213#undef HWY_SVE_FOREACH_F64
6214#undef HWY_SVE_FOREACH_I
6215#undef HWY_SVE_FOREACH_I08
6216#undef HWY_SVE_FOREACH_I16
6217#undef HWY_SVE_FOREACH_I32
6218#undef HWY_SVE_FOREACH_I64
6219#undef HWY_SVE_FOREACH_IF
6220#undef HWY_SVE_FOREACH_U
6221#undef HWY_SVE_FOREACH_U08
6222#undef HWY_SVE_FOREACH_U16
6223#undef HWY_SVE_FOREACH_U32
6224#undef HWY_SVE_FOREACH_U64
6225#undef HWY_SVE_FOREACH_UI
6226#undef HWY_SVE_FOREACH_UI08
6227#undef HWY_SVE_FOREACH_UI16
6228#undef HWY_SVE_FOREACH_UI32
6229#undef HWY_SVE_FOREACH_UI64
6230#undef HWY_SVE_FOREACH_UIF3264
6231#undef HWY_SVE_HAVE_2
6232#undef HWY_SVE_IF_EMULATED_D
6233#undef HWY_SVE_IF_NOT_EMULATED_D
6235#undef HWY_SVE_RETV_ARGMVV
6236#undef HWY_SVE_RETV_ARGPV
6237#undef HWY_SVE_RETV_ARGPVN
6238#undef HWY_SVE_RETV_ARGPVV
6239#undef HWY_SVE_RETV_ARGV
6240#undef HWY_SVE_RETV_ARGVN
6241#undef HWY_SVE_RETV_ARGVV
6242#undef HWY_SVE_RETV_ARGVVV
6244#undef HWY_SVE_UNDEFINED
#define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:247
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:160
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1977
#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1071
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:101
#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:6019
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1270
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2611
#define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:106
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2584
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:85
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2794
#define HWY_SVE_REVERSE_BITS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3611
#define HWY_SVE_MASKED_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1899
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2011
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:437
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1208
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:426
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:122
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:182
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:315
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3470
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:883
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1928
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:380
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:253
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3295
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:174
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:4118
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:168
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:215
#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3155
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:93
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:483
#define HWY_SVE_PTRUE(BITS)
Definition arm_sve-inl.h:289
#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1993
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3045
#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2061
#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2045
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1282
#define HWY_SVE_GET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:538
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:147
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:221
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3380
#define HWY_SVE_TABLE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3394
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:233
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1101
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2080
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3752
#define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:4253
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1547
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:811
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:134
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:195
#define HWY_SVE_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:6182
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2755
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1320
#define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1937
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:745
#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1743
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:178
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:86
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:128
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1543
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1635
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:338
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:5664
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2030
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:164
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1889
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1047
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3165
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:156
#define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:4237
#define HWY_SVE_CREATE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:504
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:239
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:211
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:755
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3237
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_IF_LANES_GT(kN, lanes)
Definition base.h:618
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_TARGET
Definition detect_targets.h:543
#define HWY_SVE_256
Definition detect_targets.h:90
#define HWY_SVE2_128
Definition detect_targets.h:89
HWY_INLINE V ExpandLoop(V v, svbool_t mask)
Definition arm_sve-inl.h:5207
HWY_INLINE Vec256< T > BroadcastLane(hwy::SizeTag< 0 >, Vec256< T > v)
Definition x86_256-inl.h:4186
HWY_INLINE svuint8_t BoolFromMask(svbool_t m)
Definition arm_sve-inl.h:4850
HWY_INLINE VFromD< RepartitionToWide< DFromV< V > > > SumsOf2(hwy::SignedTag, hwy::SizeTag< 1 >, V v)
Definition arm_neon-inl.h:1959
HWY_INLINE svuint8_t LaneIndicesFromByteIndices(D, svuint8_t idx)
Definition arm_sve-inl.h:5197
HWY_INLINE svuint64_t BitsFromBool(svuint8_t x)
Definition arm_sve-inl.h:4871
svbool_t MaskLowerHalf(D d)
Definition arm_sve-inl.h:2939
HWY_INLINE Mask128< T > Not(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition x86_128-inl.h:1653
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition rvv-inl.h:2966
svbool_t MakeMask(D d)
Definition arm_sve-inl.h:359
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1445
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 > d)
Definition arm_sve-inl.h:3442
VI SaturateI(VI v)
Definition arm_sve-inl.h:2259
HWY_API svbool_t PFalse()
Definition arm_sve-inl.h:352
svbool_t MaskUpperHalf(D d)
Definition arm_sve-inl.h:3033
VFromD< D > Ext(D d, VFromD< Half< D > > v)
Definition rvv-inl.h:764
VU SaturateU(VU v)
Definition arm_sve-inl.h:2253
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1519
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1482
HWY_INLINE If< IsConst< T >(), const uint16_t *, uint16_t * > U16LanePointer(T *p)
Definition ops/shared-inl.h:139
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:1402
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6088
HWY_INLINE VFromD< D > PromoteOddTo(hwy::FloatTag to_type_tag, hwy::SizeTag< 4 > to_lane_size_tag, hwy::FloatTag from_type_tag, D d_to, svfloat16_t v)
Definition arm_sve-inl.h:4419
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6031
HWY_INLINE svuint8_t IndicesForExpandFromBits(uint64_t mask_bits)
Definition arm_sve-inl.h:4933
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag, hwy::SizeTag< 1 >, svint8_t v)
Definition arm_sve-inl.h:982
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition ops/shared-inl.h:146
HWY_INLINE VFromD< D > PromoteEvenTo(hwy::SignedTag, hwy::SizeTag< 2 >, hwy::SignedTag, D d_to, svint8_t v)
Definition arm_sve-inl.h:4334
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition ops/shared-inl.h:325
HWY_INLINE VFromD< D > BitCastFromByte(D, VFromD< D > v)
Definition arm_neon-inl.h:1441
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition rvv-inl.h:2972
HWY_INLINE V Splice(V hi, V lo, svbool_t mask)
Definition arm_sve-inl.h:2621
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6076
HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v)
Definition arm_sve-inl.h:2690
HWY_INLINE size_t AllHardwareLanes()
Definition arm_sve-inl.h:266
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_API void LoadInterleaved4(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2, VFromD< D > &v3)
Definition arm_neon-inl.h:9128
HWY_API void ScatterOffset(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2624
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API V SaturatedNeg(V v)
Definition generic_ops-inl.h:897
HWY_API V MaskedMaxOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API V MaskedDivOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1512
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API svbool_t MaskFalse(const D)
Definition arm_sve-inl.h:372
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
V Shl(V a, V b)
Definition generic_ops-inl.h:7322
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API Vec128< int64_t > SaturatedAbs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3288
HWY_API Vec128< uint8_t > AESLastRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7447
HWY_API V MaskedModOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:4666
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2902
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API V AddSub(V a, V b)
Definition generic_ops-inl.h:775
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API TFromD< D > ReduceMax(D d, VFromD< D > v)
Definition arm_sve-inl.h:3213
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API void ScatterIndex(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2643
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
HWY_API VFromD< DI32 > SatWidenMulAccumFixedPoint(DI32, VFromD< Rebind< int16_t, DI32 > > a, VFromD< Rebind< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition arm_neon-inl.h:6496
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API svbool_t DemoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1420
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API V MulAddSub(V mul, V x, V sub_or_add)
Definition arm_sve-inl.h:4285
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< uint8_t > AESRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7418
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
VFromD< ScalableTag< bfloat16_t > > VBF16
Definition arm_sve-inl.h:410
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7353
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
V Shr(V a, V b)
Definition generic_ops-inl.h:7326
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API V ReverseBits(V v)
Definition generic_ops-inl.h:6464
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSame()
Definition base.h:499
constexpr size_t CeilLog2(TI x)
Definition base.h:2669
typename detail::Relations< T >::Narrow MakeNarrow
Definition base.h:2088
HWY_API size_t PopCount(T x)
Definition base.h:2615
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_NOT_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:626
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_T_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:557
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_T_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:555
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition tuple-inl.h:30
Definition tuple-inl.h:36
Definition tuple-inl.h:43
@ value
Definition arm_neon-inl.h:8429
Definition arm_sve-inl.h:68
Definition ops/shared-inl.h:198
int VFromD
Definition tuple-inl.h:25
HWY_API Vec2< D > Create2(D, VFromD< D > v0, VFromD< D > v1)
Definition tuple-inl.h:52
HWY_API Vec4< D > Create4(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3)
Definition tuple-inl.h:62
HWY_API Vec3< D > Create3(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2)
Definition tuple-inl.h:57