#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                          \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b,   \
      const Vec128<type##_t, size> c

#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }
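// Sketch of a single instantiation (illustrative only; the actual ops are
// defined further below via the per-type macros):
//   HWY_NEON_DEF_FUNCTION(uint8, 16, Add, vaddq, _, u8, 2)
// expands to roughly
//   HWY_API Vec128<uint8_t, 16> Add(const Vec128<uint8_t, 16> a,
//                                   const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }
// i.e. "args" (1/2/3) selects the template/return/parameter/argument
// fragments and prefix##infix##suffix names the underlying NEON intrinsic.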
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)       \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)     \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)     \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)     \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)  \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
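// Note on the pattern above (explanatory comment, not from upstream): the full
// 128-bit case uses the "q"-suffixed intrinsic via prefix##q, while all
// partial vectors (<= 8 bytes) share the 64-bit D-register intrinsic. E.g. a
// hypothetical HWY_NEON_DEF_FUNCTION_UINT_32(Add, vadd, _, 2) would use
// vaddq_u32 for 4 lanes and vadd_u32 for 2 or 1 lanes.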
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && HWY_HAVE_SCALAR_BF16_TYPE
#define HWY_NEON_HAVE_BFLOAT16 1
#else
#define HWY_NEON_HAVE_BFLOAT16 0
#endif

#if HWY_NEON_HAVE_BFLOAT16 ||                         \
    (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
     (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
#define HWY_NEON_HAVE_F32_TO_BF16C 1
#else
#define HWY_NEON_HAVE_F32_TO_BF16C 0
#endif
#if HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args) \
  HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args)    \
  HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args)    \
  HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args)
#else
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
#endif

#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
                                                     args)                \
  HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args)    \
  HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args)

#if HWY_HAVE_FLOAT16
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
#endif
#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
// Neither type is emulated: the predicate below never matches.
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
#else
#error "Logic error, handled all four cases"
#endif
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

#if HWY_HAVE_FLOAT64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)          \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
#define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)

#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)
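// Illustrative usage sketch (the real op definitions appear later in this
// file): something like HWY_NEON_DEF_FUNCTION_ALL_TYPES(Add, vadd, _, 2)
// defines Add() for every supported lane type and vector size, dispatching to
// vadd_u8 / vaddq_u8 / ... / vaddq_f64 as appropriate.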
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
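// Explanatory note (not from upstream): vuzp1/vuzp2/vzip1/vzip2 only exist on
// AArch64; the macros above emulate them on Armv7 using the two-result
// vuzp/vzip forms, e.g. vzip1_u8(a, b) is the .val[0] half of vzip_u8(a, b).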
template <typename T, size_t N>
struct Tuple2;
template <typename T, size_t N>
struct Tuple3;
template <typename T, size_t N>
struct Tuple4;

template <typename T, size_t N>
struct Raw128;
  using type = float64x2_t;
};
template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};
708#if HWY_NEON_HAVE_F16C
715struct Tuple2<float16_t, N> {
720struct Tuple3<float16_t, 8> {
724struct Tuple3<float16_t, N> {
729struct Tuple4<float16_t, 8> {
733struct Tuple4<float16_t, N> {
738struct Raw128<float16_t, 8> {
739 using type = float16x8_t;
742struct Raw128<float16_t, N> {
743 using type = float16x4_t;
759#if HWY_NEON_HAVE_BFLOAT16
775struct Tuple3<bfloat16_t, N> {
780struct Tuple4<bfloat16_t, 8> {
784struct Tuple4<bfloat16_t, N> {
789struct Raw128<bfloat16_t, 8> {
790 using type = bfloat16x8_t;
793struct Raw128<bfloat16_t, N> {
794 using type = bfloat16x4_t;
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
 public:
  using PrivateT = T;
  static constexpr size_t kPrivateN = N;

  // Compound assignment; usable only if the corresponding binary operator
  // overload exists for this lane type.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
 public:
  using PrivateT = T;
  static constexpr size_t kPrivateN = N;
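// Usage sketch (illustrative, not part of this header): user code normally
// reaches these classes through descriptor tags rather than naming them:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   const auto v = hn::Set(d, 1.0f);        // Vec128<float, 4>
//   const auto m = hn::Eq(v, hn::Zero(d));  // Mask128<float, 4>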
#define HWY_NEON_BUILD_TPL_HWY_SET
#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \
  Simd<type##_t, size, 0> /* tag */, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET t

#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C

template <class D, HWY_NEON_IF_EMULATED_D(D)>
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, Set(RebindToUnsigned<decltype(d)>(), tu));
}

#undef HWY_NEON_BUILD_TPL_HWY_SET
#undef HWY_NEON_BUILD_RET_HWY_SET
#undef HWY_NEON_BUILD_PARAM_HWY_SET
#undef HWY_NEON_BUILD_ARG_HWY_SET
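// Illustrative usage (not part of this header): Set broadcasts a scalar to
// every lane, e.g.
//   const hn::Full64<uint16_t> d;   // hn = hwy::HWY_NAMESPACE
//   const auto v = hn::Set(d, uint16_t{0x7FFF});  // 4 lanes of 0x7FFF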
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>

#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
      static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
      static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
              {t0, t1, t2, t3, t4, t5, t6, t7}})));
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
          BitCastScalar<uint64_t>(
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
  const GccI32RawVectType raw = {static_cast<int32_t>(t0),
                                 static_cast<int32_t>(t1)};
          BitCastScalar<uint64_t>(
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
                                      TFromD<D>, TFromD<D>) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
  const GccF32RawVectType raw = {t0, t1};
      Set(Full64<uint64_t>(),
          BitCastScalar<uint64_t>(
              detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
  const Half<decltype(d)> dh;
                     t8, t9, t10, t11, t12, t13, t14, t15),
             Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
                                 t2, t3, t4, t5, t6, t7));
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
  const GccI32RawVectType raw = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  const Half<decltype(d)> dh;
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  const GccF32RawVectType raw = {t0, t1, t2, t3};
  const Half<decltype(d)> dh;
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
                                 static_cast<int64_t>(t1)};
  const Half<decltype(d)> dh;
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  const GccF64RawVectType raw = {t0, t1};
  const Half<decltype(d)> dh;
template <class D, HWY_IF_BF16_D(D)>
      di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
      BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
      BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
      BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D>, TFromD<D>,
                                      TFromD<D>, TFromD<D>) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3)};

template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
      static_cast<__fp16>(t6), static_cast<__fp16>(t7)};

template <class D, HWY_IF_F16_D(D)>
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
      di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
      BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
      BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
      BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
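// Illustrative note (not from upstream): Dup128VecFromValues fills one 128-bit
// block with the given lane values (on targets with longer vectors the block
// is repeated). A hypothetical call, for illustration only:
//   const hn::Full128<int32_t> d;   // hn = hwy::HWY_NAMESPACE
//   const auto v = hn::Dup128VecFromValues(d, 1, 2, 3, 4);  // lanes 1,2,3,4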
template <class D, HWY_IF_T_SIZE_D(D, 1)>

template <class D, HWY_IF_UI16_D(D)>
                              TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
                              TFromD<D>{6}, TFromD<D>{7});

template <class D, HWY_IF_F16_D(D)>
                              uint16_t{0x4000}, uint16_t{0x4200},
                              uint16_t{0x4400}, uint16_t{0x4500},
                              uint16_t{0x4600}, uint16_t{0x4700}));

template <class D, HWY_IF_T_SIZE_D(D, 4)>

template <class D, HWY_IF_T_SIZE_D(D, 8)>
#if HWY_COMPILER_MSVC
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
  constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>);
  constexpr uint64_t kU64MaskOutMask =
      hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>();
  using VU8 = VFromD<decltype(du8)>;
  const auto mask_out_mask =
      BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask))));
  return v & mask_out_mask;

template <class V, HWY_IF_V_SIZE_GT_V(V, 4)>
template <class D, typename T2>
  const auto result_iota =
#if HWY_COMPILER_MSVC
  return detail::MaskOutIota(result_iota);
template <class D, HWY_IF_U8_D(D)>
template <class D, HWY_IF_U16_D(D)>
template <class D, HWY_IF_U32_D(D)>
template <class D, HWY_IF_U64_D(D)>
template <class D, HWY_IF_I8_D(D)>
template <class D, HWY_IF_I16_D(D)>
template <class D, HWY_IF_I32_D(D)>
template <class D, HWY_IF_I64_D(D)>

template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
  return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));

#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
  return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));

template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
  const Half<decltype(du)> duh;

template <class D, HWY_IF_F32_D(D)>

template <class D, HWY_IF_F64_D(D)>
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
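// Explanatory note (not from upstream): Combine(d, hi, lo) concatenates two
// half-width vectors with "lo" supplying the lower lanes, which on NEON maps
// directly to vcombine_*(lo.raw, hi.raw) as above.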
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

#if !HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_F16C

#if !HWY_NEON_HAVE_BFLOAT16

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
1440template <
class D, HWY_IF_U8_D(D)>
1447template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
1450 return VFromD<D>(vreinterpret_s8_u8(v.raw));
1452template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
1455 return VFromD<D>(vreinterpret_u16_u8(v.raw));
1457template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
1460 return VFromD<D>(vreinterpret_s16_u8(v.raw));
1462template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
1464 VFromD<Repartition<uint8_t, D>> v) {
1465 return VFromD<D>(vreinterpret_u32_u8(v.raw));
1467template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
1469 VFromD<Repartition<uint8_t, D>> v) {
1470 return VFromD<D>(vreinterpret_s32_u8(v.raw));
1473template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
1477template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
1483template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
1485#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1486 return VFromD<D>(vreinterpret_f16_u8(v.raw));
1493template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
1495#if HWY_NEON_HAVE_BFLOAT16
1496 return VFromD<D>(vreinterpret_bf16_u8(v.raw));
1498 const RebindToUnsigned<D> du;
1503template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
1505 VFromD<Repartition<uint8_t, D>> v) {
1506 return VFromD<D>(vreinterpret_f32_u8(v.raw));
1510template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
1512 return Vec64<double>(vreinterpret_f64_u8(v.raw));
1518template <
class D, HWY_IF_I8_D(D)>
1522template <
class D, HWY_IF_U16_D(D)>
1526template <
class D, HWY_IF_I16_D(D)>
1530template <
class D, HWY_IF_U32_D(D)>
1534template <
class D, HWY_IF_I32_D(D)>
1538template <
class D, HWY_IF_U64_D(D)>
1542template <
class D, HWY_IF_I64_D(D)>
1547template <
class D, HWY_IF_F32_D(D)>
1553template <
class D, HWY_IF_F64_D(D)>
1560template <
class D, HWY_IF_F16_D(D)>
1562#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1569template <
class D, HWY_IF_BF16_D(D)>
1571#if HWY_NEON_HAVE_BFLOAT16
1580template <
class D,
class FromT>
1607 const DFromV<
decltype(v)> d_from;
1608 const Half<
decltype(d_from)> dh_from;
1616 const Full64<TFromV<FromV>> d_full64_from;
1617 const Full128<TFromV<FromV>> d_full128_from;
#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
  return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));

#undef HWY_NEON_BUILD_TPL_HWY_GET
#undef HWY_NEON_BUILD_RET_HWY_GET
#undef HWY_NEON_BUILD_PARAM_HWY_GET
#undef HWY_NEON_BUILD_ARG_HWY_GET
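// Explanatory note (not from upstream): the HWY_GET fragments above generate
// detail::GetLane<kLane>(v), a wrapper over vget(q)_lane_* returning a scalar;
// the public GetLane(v) below simply forwards to detail::GetLane<0>(v).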
1649 return detail::GetLane<0>(v);
1656template <
typename T>
1660 return detail::GetLane<0>(v);
1663template <
typename T>
1665#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1666 if (__builtin_constant_p(i)) {
1669 return detail::GetLane<0>(v);
1671 return detail::GetLane<1>(v);
1675 alignas(16) T lanes[2];
1680template <
typename T>
1682#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1683 if (__builtin_constant_p(i)) {
1686 return detail::GetLane<0>(v);
1688 return detail::GetLane<1>(v);
1690 return detail::GetLane<2>(v);
1692 return detail::GetLane<3>(v);
1696 alignas(16) T lanes[4];
1701template <
typename T>
1703#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1704 if (__builtin_constant_p(i)) {
1707 return detail::GetLane<0>(v);
1709 return detail::GetLane<1>(v);
1711 return detail::GetLane<2>(v);
1713 return detail::GetLane<3>(v);
1715 return detail::GetLane<4>(v);
1717 return detail::GetLane<5>(v);
1719 return detail::GetLane<6>(v);
1721 return detail::GetLane<7>(v);
1725 alignas(16) T lanes[8];
1730template <
typename T>
1732#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1733 if (__builtin_constant_p(i)) {
1736 return detail::GetLane<0>(v);
1738 return detail::GetLane<1>(v);
1740 return detail::GetLane<2>(v);
1742 return detail::GetLane<3>(v);
1744 return detail::GetLane<4>(v);
1746 return detail::GetLane<5>(v);
1748 return detail::GetLane<6>(v);
1750 return detail::GetLane<7>(v);
1752 return detail::GetLane<8>(v);
1754 return detail::GetLane<9>(v);
1756 return detail::GetLane<10>(v);
1758 return detail::GetLane<11>(v);
1760 return detail::GetLane<12>(v);
1762 return detail::GetLane<13>(v);
1764 return detail::GetLane<14>(v);
1766 return detail::GetLane<15>(v);
1770 alignas(16) T lanes[16];
1778#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1779#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1780#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1781 Vec128<type##_t, size> v, type##_t t
1782#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1787#undef HWY_NEON_BUILD_TPL_HWY_INSERT
1788#undef HWY_NEON_BUILD_RET_HWY_INSERT
1789#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1790#undef HWY_NEON_BUILD_ARG_HWY_INSERT
1792template <
size_t kLane,
class V,
class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
1796 const uint16_t tu = BitCastScalar<uint16_t>(t);
1805template <
typename T>
1812template <
typename T>
1814#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1815 if (__builtin_constant_p(i)) {
1818 return detail::InsertLane<0>(v, t);
1820 return detail::InsertLane<1>(v, t);
1825 alignas(16) T lanes[2];
1828 return Load(
d, lanes);
1831template <
typename T>
1833#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1834 if (__builtin_constant_p(i)) {
1837 return detail::InsertLane<0>(v, t);
1839 return detail::InsertLane<1>(v, t);
1841 return detail::InsertLane<2>(v, t);
1843 return detail::InsertLane<3>(v, t);
1848 alignas(16) T lanes[4];
1851 return Load(
d, lanes);
1854template <
typename T>
1856#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1857 if (__builtin_constant_p(i)) {
1860 return detail::InsertLane<0>(v, t);
1862 return detail::InsertLane<1>(v, t);
1864 return detail::InsertLane<2>(v, t);
1866 return detail::InsertLane<3>(v, t);
1868 return detail::InsertLane<4>(v, t);
1870 return detail::InsertLane<5>(v, t);
1872 return detail::InsertLane<6>(v, t);
1874 return detail::InsertLane<7>(v, t);
1879 alignas(16) T lanes[8];
1882 return Load(
d, lanes);
1885template <
typename T>
1887#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1888 if (__builtin_constant_p(i)) {
1891 return detail::InsertLane<0>(v, t);
1893 return detail::InsertLane<1>(v, t);
1895 return detail::InsertLane<2>(v, t);
1897 return detail::InsertLane<3>(v, t);
1899 return detail::InsertLane<4>(v, t);
1901 return detail::InsertLane<5>(v, t);
1903 return detail::InsertLane<6>(v, t);
1905 return detail::InsertLane<7>(v, t);
1907 return detail::InsertLane<8>(v, t);
1909 return detail::InsertLane<9>(v, t);
1911 return detail::InsertLane<10>(v, t);
1913 return detail::InsertLane<11>(v, t);
1915 return detail::InsertLane<12>(v, t);
1917 return detail::InsertLane<13>(v, t);
1919 return detail::InsertLane<14>(v, t);
1921 return detail::InsertLane<15>(v, t);
1926 alignas(16) T lanes[16];
1929 return Load(
d, lanes);
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
template <class V, HWY_IF_V_SIZE_V(V, 16)>
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
template <class V, HWY_IF_V_SIZE_V(V, 16)>
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
template <class V, HWY_IF_V_SIZE_V(V, 16)>
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
template <class V, HWY_IF_V_SIZE_V(V, 16)>
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
template <class V, HWY_IF_V_SIZE_V(V, 16)>
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
template <class V, HWY_IF_V_SIZE_V(V, 16)>
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif
#if !HWY_HAVE_FLOAT16
  using TU = TFromD<decltype(du)>;

#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

  return Vec64<int64_t>(vqneg_s64(v.raw));

  return Vec128<int64_t>(vqnegq_s64(v.raw));
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
  template <int kBits>                                                         \
  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) {        \
    return kBits == 0 ? v                                                      \
                      : Vec128<type##_t, size>(HWY_NEON_EVAL(                  \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
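// Explanatory note (not from upstream): the temporary redefinition above turns
// the HWY_NEON_DEF_FUNCTION machinery into compile-time shifts such as
// ShiftLeft<3>(v). HWY_MAX(1, kBits) keeps the intrinsic's immediate operand
// in its valid range; when kBits == 0 the ternary returns v before that
// operand is ever used.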
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>

  const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
  const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
  const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
  const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>

template <typename T, size_t N>
  return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits));
template <typename T, size_t N>
  return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
  int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
  int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
  int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
      vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));

  uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
  uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
  uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
      vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));

template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
  int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
  uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));

  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));

  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));

template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));

  int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
  int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
  int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
      vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));

  uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
  uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
  uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
      vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));

template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
  int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
  uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));

template <class T, HWY_IF_UI64(T)>
  Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
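// Explanatory note (not from upstream): the widening-multiply fragments above
// implement MulHigh by forming full-width products with vmull_* (and
// vmull_high_* on AArch64), then keeping the upper half of each product via
// vuzp2q_*.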
2466template <
class T, HWY_IF_UI64(T)>
2476template <
size_t N, HWY_IF_V_SIZE_LE(
int16_t, N, 8)>
2485#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
2486HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) {
2487 const CappedTag<double, 1>
d;
2488 const Twice<
decltype(
d)> dt;
2489 using VT =
VFromD<
decltype(dt)>;
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif
2510template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2513 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2514 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2515 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif
2549template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
2555template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
2562template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
2563HWY_API Vec128<T, N>
MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
2565 return Add(
Mul(mul, x), add);
2568template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
2571 return Sub(add,
Mul(mul, x));
2587 return mul * x + add;
2593 return add - mul * x;
2599template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2605template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2611template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2617template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2626#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 490
2627HWY_INLINE float64x1_t vrsqrte_f64(float64x1_t raw) {
2628 const CappedTag<double, 1>
d;
2629 const Twice<
decltype(
d)> dt;
2630 using VT =
VFromD<
decltype(dt)>;
2631 const VFromD<
decltype(
d)> v(raw);
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif
2653template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2657 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
2658 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
2659 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
2661 const auto root = v * recip;
2671template <
typename T>
2677template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2681 using V8 =
decltype(
Zero(d8));
2704template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
2707 return detail::reversed_andnot(mask, not_mask);
2711template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2713 const Vec128<T, N> mask) {
2714 const DFromV<
decltype(mask)>
d;
2716 VFromD<
decltype(du)> ret =
2717 detail::reversed_andnot(
BitCast(du, mask),
BitCast(du, not_mask));
2746#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
2752HWY_API Vec128<T, N>
Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
2753 return Xor(x1,
Xor(x2, x3));
2756template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2757HWY_API Vec128<T, N>
Xor3(
const Vec128<T, N> x1,
const Vec128<T, N> x2,
2758 const Vec128<T, N> x3) {
2765template <
typename T,
size_t N>
2767 return Xor(x1,
Xor(x2, x3));
2772template <
typename T,
size_t N>
2774 return Or(o1,
Or(o2, o3));
2778template <
typename T,
size_t N>
2780 return Or(o,
And(a1, a2));
2784template <
typename T,
size_t N>
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif
2805template <
typename T,
size_t N>
2810template <
typename T,
size_t N>
2815template <
typename T,
size_t N>
2825 return Max(a, b) -
Min(a, b);
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
2844template <
typename T>
2849template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2857template <
typename T>
2860 const uint8x16_t bytes = vcntq_u8(
BitCast(d8, v).raw);
2863template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2867 const uint8x8_t bytes = vcnt_u8(
BitCast(d8, v).raw);
2871template <
typename T>
2874 const uint8x16_t bytes = vcntq_u8(
BitCast(d8, v).raw);
2875 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2877template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2881 const uint8x8_t bytes = vcnt_u8(
BitCast(d8, v).raw);
2885template <
typename T>
2888 const uint8x16_t bytes = vcntq_u8(
BitCast(d8, v).raw);
2889 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2891template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2895 const uint8x8_t bytes = vcnt_u8(
BitCast(d8, v).raw);
2896 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2901template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif
2923template <typename T,
size_t N>
2925 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2926 const DFromV<
decltype(magn)>
d;
2931template <
typename T,
size_t N>
2933 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2934 const DFromV<
decltype(abs)>
d;
2940template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
2950template <
typename T,
size_t N>
2968template <
typename TFrom,
size_t NFrom,
class DTo>
2970 static_assert(
sizeof(TFrom) ==
sizeof(
TFromD<DTo>),
"Must have same size");
2976#define HWY_NEON_BUILD_TPL_HWY_IF
2977#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2978#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2979 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2980 const Vec128<type##_t, size> no
2981#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2986#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
2988#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
2991template <
class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
2993 const DFromV<
decltype(yes)>
d;
2999#undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
3000#undef HWY_NEON_BUILD_TPL_HWY_IF
3001#undef HWY_NEON_BUILD_RET_HWY_IF
3002#undef HWY_NEON_BUILD_PARAM_HWY_IF
3003#undef HWY_NEON_BUILD_ARG_HWY_IF
3006template <
typename T,
size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
3010template <
typename T,
size_t N, HWY_IF_SPECIAL_FLOAT(T)>
3012 const DFromV<
decltype(yes)>
d;
3018template <
typename T,
size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
3022template <
typename T,
size_t N, HWY_IF_SPECIAL_FLOAT(T)>
3029template <
typename T,
size_t N>
3032 static_assert(IsSigned<T>(),
"Only works for signed/float");
3042template <
typename T,
size_t N>
3047template <
typename T,
size_t N>
3053template <
typename T,
size_t N>
3059template <
typename T,
size_t N>
3065template <
typename T,
size_t N>
3071template <
typename T,
size_t N>
3103#define HWY_NEON_BUILD_TPL_HWY_COMPARE
3104#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
3105#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
3106 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
3107#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
3137#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
3138#undef HWY_NEON_BUILD_RET_HWY_COMPARE
3139#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
3140#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
3148 const Vec128<int64_t, N> b) {
3149 const Simd<int32_t, N * 2, 0> d32;
3150 const Simd<int64_t, N, 0> d64;
3158 const Vec128<uint64_t, N> b) {
3159 const Simd<uint32_t, N * 2, 0> d32;
3160 const Simd<uint64_t, N, 0> d64;
3167 const Vec128<int64_t> b) {
3168 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
3172 const Vec64<int64_t> b) {
3173 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
3179 const Vec128<uint64_t, N> b) {
3180 const DFromV<
decltype(a)> du;
3182 const Vec128<uint64_t, N> msb =
AndNot(a, b) |
AndNot(a ^ b, a - b);
3188 const Vec128<int64_t, N> b) {
3194 const Vec128<uint64_t, N> b) {
3203#pragma push_macro("HWY_NEON_DEF_FUNCTION")
3204#undef HWY_NEON_DEF_FUNCTION
3208#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
3209 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
3210 Vec128<type##_t, size> b) { \
3211 return Not(a == b); \
3216#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
3220template <
typename T,
size_t N>
3224template <
typename T,
size_t N>
3234 using TI =
TFromD<
decltype(di)>;
3240#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
3241#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
3242#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
3243 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
3244#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
3256 return (v & bit) == bit;
3261 return (v & bit) == bit;
3265#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
3266#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
3267#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
3268#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
3275 const auto zero =
Zero(
DFromV<
decltype(v)>());
3283 const auto zero =
Zero(
DFromV<
decltype(v)>());
3292 const auto zero =
Zero(
DFromV<
decltype(v)>());
3300 const auto zero =
Zero(
DFromV<
decltype(v)>());
3315 const DFromV<
decltype(a)> du;
3341#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3344template <
class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
3347 const Twice<
decltype(
d)> dt;
HWY_API Vec64<double> Min(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Min(a, b);
#else
  return Vec64<double>(vminnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Min(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vminnmq_f64(a.raw, b.raw));
}
3381 const DFromV<
decltype(a)> du;
3407#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3410template <
class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
3413 const Twice<
decltype(
d)> dt;
HWY_API Vec64<double> Max(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Max(a, b);
#else
  return Vec64<double>(vmaxnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vmaxnmq_f64(a.raw, b.raw));
}
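// Note (explanatory, not from upstream): vminnm/vmaxnm follow IEEE-754
// minNum/maxNum semantics, i.e. when exactly one operand is NaN the numeric
// operand is returned rather than the NaN.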
3441template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
3446template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
3451template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
3456template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3461template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
3466template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
3471template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
3476template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3482template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
3488#if HWY_NEON_HAVE_BFLOAT16
3489template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
3495template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3500template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3503 return Vec128<double>(vld1q_f64(unaligned));
3509template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
3513template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
3517template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
3521template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
3525template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
3529template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
3533template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
3537template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
3542template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
3547#if HWY_NEON_HAVE_BFLOAT16
3548template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
3553template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3558template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
3560 return Vec64<double>(vld1_f64(
p));
3568template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
3572template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
3576template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3587 CopyBytes<4>(
p, &buf);
3592template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
3596 CopyBytes<4>(
p, &buf);
3600#if HWY_NEON_HAVE_BFLOAT16
3601template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
3605 CopyBytes<4>(
p, &buf);
3614template <
class D, HWY_IF_LANES_D(D, 1), HWY_IF_U16_D(D)>
3618template <
class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
3623template <
class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
3628#if HWY_NEON_HAVE_BFLOAT16
3629template <
class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
3636template <
class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
3640 CopyBytes<2>(
p, &buf);
3645template <
class D, HWY_IF_LANES_D(D, 1), HWY_IF_U8_D(D)>
3649template <
class D, HWY_IF_LANES_D(D, 1), HWY_IF_I8_D(D)>
3656template <
class D, HWY_NEON_IF_EMULATED_D(D)>
3681template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3688template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
3691 vst1q_u8(unaligned, v.
raw);
3693template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
3696 vst1q_u16(unaligned, v.
raw);
3698template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
3701 vst1q_u32(unaligned, v.
raw);
3703template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3706 vst1q_u64(unaligned, v.
raw);
3708template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
3711 vst1q_s8(unaligned, v.
raw);
3713template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
3716 vst1q_s16(unaligned, v.
raw);
3718template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
3721 vst1q_s32(unaligned, v.
raw);
3723template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3726 vst1q_s64(unaligned, v.
raw);
3729template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
3735#if HWY_NEON_HAVE_BFLOAT16
3736template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
3742template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3745 vst1q_f32(unaligned, v.
raw);
3748template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3751 vst1q_f64(unaligned, v.raw);
3757template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
3761template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
3765template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
3769template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
3773template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
3777template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
3781template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
3785template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
3790template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
3796#if HWY_NEON_HAVE_BFLOAT16
3797template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
3803template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3808template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
3816template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
3818 vst1_lane_u32(
p, v.
raw, 0);
3820template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
3822 vst1_lane_s32(
p, v.
raw, 0);
3824template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3826 vst1_lane_f32(
p, v.
raw, 0);
3835 CopyBytes<4>(&buf,
p);
3839template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
3843 CopyBytes<4>(&buf,
p);
3846#if HWY_NEON_HAVE_BFLOAT16
3847template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
3851 CopyBytes<4>(&buf,
p);
3857template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U16_D(D)>
3859 vst1_lane_u16(
p, v.
raw, 0);
3861template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
3863 vst1_lane_s16(
p, v.
raw, 0);
3866template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
3871#if HWY_NEON_HAVE_BFLOAT16
3872template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
3878template <
class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
3882 CopyBytes<2>(&buf,
p);
3887template <
class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_U8_D(D)>
3889 vst1_lane_u8(
p, v.
raw, 0);
3891template <
class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_I8_D(D)>
3893 vst1_lane_s8(
p, v.
raw, 0);
3898template <
class D, HWY_NEON_IF_EMULATED_D(D)>
3905#if HWY_COMPILER_GCC_ACTUAL
3922 const auto blended =
3935 __builtin_prefetch(aligned, 1, 0);
3936#elif HWY_COMPILER_MSVC
3937 __prefetch2(aligned, 0x11);
3947#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
3950template <
class D, HWY_IF_F16_D(D)>
3952 return Vec128<float16_t>(vcvtq_f16_s16(v.raw));
3954template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
3959template <
class D, HWY_IF_F16_D(D)>
3961 return Vec128<float16_t>(vcvtq_f16_u16(v.raw));
3963template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
3970template <
class D, HWY_IF_F32_D(D)>
3974template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
3979template <
class D, HWY_IF_F32_D(D)>
3983template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
3990template <
class D, HWY_IF_F64_D(D)>
3992 return Vec128<double>(vcvtq_f64_s64(v.raw));
3994template <
class D, HWY_IF_F64_D(D)>
3997#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3998 return Set(Full64<double>(),
static_cast<double>(
GetLane(v)));
4000 return Vec64<double>(vcvt_f64_s64(v.raw));
4004template <
class D, HWY_IF_F64_D(D)>
4006 return Vec128<double>(vcvtq_f64_u64(v.raw));
4008template <
class D, HWY_IF_F64_D(D)>
4011#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
4012 return Set(Full64<double>(),
static_cast<double>(
GetLane(v)));
4014 return Vec64<double>(vcvt_f64_u64(v.raw));
4022template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4024#if HWY_COMPILER_CLANG && \
4025 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4030 int32x4_t raw_result;
4033 "fcvtzs %0.4s, %1.4s"
4035 "vcvt.s32.f32 %0, %1"
4044template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4046#if HWY_COMPILER_CLANG && \
4047 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4052 int32x2_t raw_result;
4055 "fcvtzs %0.2s, %1.2s"
4057 "vcvt.s32.f32 %0, %1"
4066template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
4068#if HWY_COMPILER_CLANG && \
4069 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4074 uint32x4_t raw_result;
4077 "fcvtzu %0.4s, %1.4s"
4079 "vcvt.u32.f32 %0, %1"
4088template <
class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
4090#if HWY_COMPILER_CLANG && \
4091 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4096 uint32x2_t raw_result;
4099 "fcvtzu %0.2s, %1.2s"
4101 "vcvt.u32.f32 %0, %1"
4114template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
4116#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
4119 int64x2_t raw_result;
4120 __asm__(
"fcvtzs %0.2d, %1.2d" :
"=w"(raw_result) :
"w"(v.raw));
4126template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
4128#if HWY_ARCH_ARM_A64 && \
4129 ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
4130 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
4135 int64x1_t raw_result;
4136 __asm__(
"fcvtzs %d0, %d1" :
"=w"(raw_result) :
"w"(v.raw));
4137 return Vec64<int64_t>(raw_result);
4139 return Vec64<int64_t>(vcvt_s64_f64(v.raw));
4142template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
4144#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
4147 uint64x2_t raw_result;
4148 __asm__(
"fcvtzu %0.2d, %1.2d" :
"=w"(raw_result) :
"w"(v.raw));
4149 return Vec128<uint64_t>(raw_result);
4151 return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
4154template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
4156#if HWY_ARCH_ARM_A64 && \
4157 ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
4158 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
4164 uint64x1_t raw_result;
4165 __asm__(
"fcvtzu %d0, %d1" :
"=w"(raw_result) :
"w"(v.raw));
4166 return Vec64<uint64_t>(raw_result);
4168 return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  int16x8_t raw_result;
  __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<int16_t>(raw_result);
#else
  return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
#endif
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  int16x4_t raw_result;
  __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  uint16x8_t raw_result;
  __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<uint16_t>(raw_result);
#else
  return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
#endif
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  uint16x4_t raw_result;
  __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
template <class D, HWY_IF_U16_D(D)>
template <class D, HWY_IF_U32_D(D)>
  uint16x8_t a = vmovl_u8(v.raw);
template <class D, HWY_IF_U32_D(D)>
template <class D, HWY_IF_U64_D(D)>
template <class D, HWY_IF_I16_D(D)>
template <class D, HWY_IF_I32_D(D)>
  uint16x8_t a = vmovl_u8(v.raw);
template <class D, HWY_IF_I32_D(D)>
template <class D, HWY_IF_I64_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  return VFromD<D>(vget_low_u16(vmovl_u8(v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  return VFromD<D>(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw)))));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  return VFromD<D>(vget_low_u32(vmovl_u16(v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
  return VFromD<D>(vget_low_u64(vmovl_u32(v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
  return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw))));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw)));
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(u32)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw))));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
  using DU = RebindToUnsigned<D>;
  const Rebind<uint32_t, decltype(d)> du32;
template <class D, HWY_IF_I16_D(D)>
template <class D, HWY_IF_I32_D(D)>
  int16x8_t a = vmovl_s8(v.raw);
template <class D, HWY_IF_I32_D(D)>
template <class D, HWY_IF_I64_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
  return VFromD<D>(vget_low_s16(vmovl_s8(v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  return VFromD<D>(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw)))));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  return VFromD<D>(vget_low_s32(vmovl_s16(v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
  return VFromD<D>(vget_low_s64(vmovl_s32(v.raw)));
  const Rebind<int32_t, decltype(d)> di32;
#if HWY_NEON_HAVE_F16C
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif
template <class D, HWY_IF_F32_D(D)>
  return Vec128<float>(vcvt_f32_f16(v.raw));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
template <class D, HWY_IF_F64_D(D)>
  return Vec128<double>(vcvt_f64_f32(v.raw));
template <class D, HWY_IF_F64_D(D)>
  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
template <class D, HWY_IF_F64_D(D)>
  const int64x2_t i64 = vmovl_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
template <class D, HWY_IF_F64_D(D)>
  return ConvertTo(d, Vec64<int64_t>(vget_low_s64(vmovl_s32(v.raw))));
template <class D, HWY_IF_F64_D(D)>
  const uint64x2_t u64 = vmovl_u32(v.raw);
  return Vec128<double>(vcvtq_f64_u64(u64));
template <class D, HWY_IF_F64_D(D)>
  return ConvertTo(d, Vec64<uint64_t>(vget_low_u64(vmovl_u32(v.raw))));
template <class D, HWY_IF_UI64_D(D)>
template <class D, HWY_IF_I64_D(D)>
  const Rebind<int32_t, decltype(di64)> di32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto exponent_adj = BitCast(
      BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
      BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  const auto f32_to_i32_result = ConvertTo(di32, adj_v);
      Set(di32, LimitsMax<int32_t>())))));
template <class D, HWY_IF_U64_D(D)>
  const Rebind<uint32_t, decltype(du64)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto exponent_adj = BitCast(
      BitCast(du32_as_du8, Set(du32, uint32_t{158}))),
      BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  const auto f32_to_u32_result = ConvertTo(du32, adj_v);
      VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>())));
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif
template <class D, HWY_IF_UI64_D(D)>
  const Repartition<uint8_t, decltype(d32)> du32_as_du8;
  constexpr uint32_t kExpAdjDecr =
      0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());
  const auto exponent_adj = BitCast(
      BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif
template <class D, HWY_IF_U16_D(D)>
  return Vec128<uint16_t>(vmovl_high_u8(v.raw));
template <class D, HWY_IF_U32_D(D)>
  return Vec128<uint32_t>(vmovl_high_u16(v.raw));
template <class D, HWY_IF_U64_D(D)>
  return Vec128<uint64_t>(vmovl_high_u32(v.raw));
template <class D, HWY_IF_I16_D(D)>
  return BitCast(d, Vec128<uint16_t>(vmovl_high_u8(v.raw)));
template <class D, HWY_IF_I32_D(D)>
  return BitCast(d, Vec128<uint32_t>(vmovl_high_u16(v.raw)));
template <class D, HWY_IF_I64_D(D)>
  return BitCast(d, Vec128<uint64_t>(vmovl_high_u32(v.raw)));
template <class D, HWY_IF_I16_D(D)>
  return Vec128<int16_t>(vmovl_high_s8(v.raw));
template <class D, HWY_IF_I32_D(D)>
  return Vec128<int32_t>(vmovl_high_s16(v.raw));
template <class D, HWY_IF_I64_D(D)>
  return Vec128<int64_t>(vmovl_high_s32(v.raw));
#if HWY_NEON_HAVE_F16C
template <class D, HWY_IF_F32_D(D)>
  return Vec128<float>(vcvt_high_f32_f16(v.raw));
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_F64_D(D)>
  return Vec128<double>(vcvt_high_f64_f32(v.raw));
template <class D, HWY_IF_F64_D(D)>
  const int64x2_t i64 = vmovl_high_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
template <class D, HWY_IF_F64_D(D)>
  const uint64x2_t u64 = vmovl_high_u32(v.raw);
  return Vec128<double>(vcvtq_f64_u64(u64));
template <class D, HWY_IF_UI64_D(D)>
  const Rebind<float, decltype(d)> dh;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
  const Rebind<TFromV<V>, decltype(d)> dh;
template <class D, HWY_IF_U16_D(D)>
template <class D, HWY_IF_I16_D(D)>
template <class D, HWY_IF_U8_D(D)>
  const uint16x4_t a = vqmovun_s32(v.raw);
template <class D, HWY_IF_U8_D(D)>
template <class D, HWY_IF_I8_D(D)>
  const int16x4_t a = vqmovn_s32(v.raw);
template <class D, HWY_IF_I8_D(D)>
template <class D, HWY_IF_U16_D(D)>
template <class D, HWY_IF_U8_D(D)>
  const uint16x4_t a = vqmovn_u32(v.raw);
template <class D, HWY_IF_U8_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
  return VFromD<D>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
  return VFromD<D>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  return VFromD<D>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_s16(vcombine_s16(a, a)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
  return VFromD<D>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
  return VFromD<D>(vqmovn_u32(vcombine_u32(v.raw, v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
  const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  return VFromD<D>(vqmovn_u16(vcombine_u16(v.raw, v.raw)));
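// Illustrative usage sketch (not part of the library): the overloads above
// narrow via the saturating vqmovn/vqmovun family, so out-of-range lanes
// clamp instead of wrapping. For example:
//   const Full128<int32_t> d32;
//   const Full64<int16_t> d16;
//   // 70000 exceeds the int16_t range and saturates to 32767.
//   const Vec64<int16_t> narrow = DemoteTo(d16, Set(d32, 70000));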
template <class D, HWY_IF_I32_D(D)>
template <class D, HWY_IF_U32_D(D)>
template <class D, HWY_IF_U32_D(D)>
  const Rebind<uint32_t, D> du32;
template <class D, HWY_IF_I32_D(D)>
template <class D, HWY_IF_U32_D(D)>
template <class D, HWY_IF_U32_D(D)>
  const Rebind<uint32_t, D> du32;
#if HWY_NEON_HAVE_F16C
template <class D, HWY_IF_F16_D(D)>
  return Vec64<float16_t>{vcvt_f16_f32(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
  return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
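// Illustrative usage sketch (not part of the library): with HWY_NEON_HAVE_F16C
// the f32 -> f16 demotion above maps directly onto vcvt_f16_f32, e.g.:
//   const Full128<float> df;
//   const Full64<float16_t> dh;
//   const Vec64<float16_t> h = DemoteTo(dh, Set(df, 1.5f));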
#if HWY_NEON_HAVE_F32_TO_BF16C
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif
#if HWY_NEON_HAVE_BFLOAT16
static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
  return vreinterpret_u16_bf16(raw);
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
  return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
  return VFromD<D>(detail::BitCastFromRawNeonBF16(
      vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
template <class D, HWY_IF_F32_D(D)>
  return Vec64<float>(vcvt_f32_f64(v.raw));
template <class D, HWY_IF_F32_D(D)>
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
template <class D, HWY_IF_UI32_D(D)>
  const Rebind<MakeWide<TFromD<D>>, D> d64;
template <class D, HWY_IF_F32_D(D)>
  const Rebind<int64_t, decltype(df32)> di64;
  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
  const auto f64_lo12 =
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
  const auto f64_bits_decrement =
      f64_sum_is_inexact);
  const auto adj_f64_val = BitCast(
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
  return DemoteTo(df32, adj_f64_val);
      Set(du32, uint32_t{0x007FFFFFu}));
  const auto k2p41_f32 = Set(df32, 2199023255552.0f);
  const auto k2p64_63_f32 = Set(df32, 27670116110564327424.0f);
  const auto hi23_f32 =
  const auto mid23_f32 =
  const auto lo18_f32 = ConvertTo(df32, lo18);
  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;
  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;
  const auto s_lo_inexact_mask =
  const auto s_lo_mag_adj = ShiftRight<31>(
  return s_hi46 + s_lo;
template <class D, HWY_IF_F32_D(D)>
  const Rebind<uint64_t, decltype(df32)> du64;
  const auto k2p64 = Set(df64, 18446744073709551616.0);
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
  const auto adj_f64_val = BitCast(
      f64_sum_is_inexact));
  return DemoteTo(df32, adj_f64_val);
  const auto hi23 = TruncateTo(du32, ShiftRight<41>(v));
      Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 = And(TruncateTo(du32, v), Set(du32, uint32_t{0x0003FFFFu}));
  const auto k2p41_f32 = Set(df32, 2199023255552.0f);
  const auto k2p64_f32 = Set(df32, 18446744073709551616.0f);
  const auto hi23_f32 =
  const auto mid23_f32 =
  const auto lo18_f32 = ConvertTo(df32, lo18);
  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;
  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;
  const auto s_lo_inexact_mask =
  const auto s_lo_mag_adj = ShiftRight<31>(
  return s_hi46 + s_lo;
  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
  const uint8x8_t w = vuzp1_u8(org_v, org_v);
  const DFromV<decltype(v)> df;
  const auto int_f = ConvertTo(df, integer);
  const DFromV<decltype(v)> df;
  const auto added = large + v;
  const auto rounded = added - large;
  const DFromV<decltype(v)> df;
  const auto int_f = ConvertTo(df, integer);
  const DFromV<decltype(v)> df;
  const auto int_f = ConvertTo(df, integer);
  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
  return Vec64<float16_t>(vget_low_f16(v.raw));
#if HWY_NEON_HAVE_BFLOAT16
  return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
  return Vec64<double>(vget_low_f64(v.raw));
template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
template <int kBytes, class D, typename T = TFromD<D>>
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
template <int kBytes, class D, typename T = TFromD<D>>
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
template <int kBytes>
  template <class T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
    const auto zero64 = Zero(d64);
    const decltype(zero64) v64(v.raw);
        CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
  template <class T, size_t N>
  template <class T, size_t N>
template <int kBytes>
  template <class T, size_t N>
    if (d.MaxBytes() < 8) {
      constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8;
      const Simd<T, kReg / sizeof(T), 0> dreg;
    return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
  template <class T, size_t N>
  template <class T, size_t N>
template <int kBytes, class D>
template <int kBytes, typename T, size_t N>
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
template <int kLanes, class D>
template <int kLanes, typename T, size_t N>
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
template <int kBytes, class D>
template <int kLanes, class D>
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 4)>
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  using V64 = VFromD<decltype(d_full8)>;
  const V64 hi64(BitCast(d8, hi).raw);
template <class D, HWY_IF_U8_D(D)>
template <class D, HWY_IF_U16_D(D)>
template <class D, HWY_IF_U32_D(D)>
template <class D, HWY_IF_U64_D(D)>
template <class D, HWY_IF_I8_D(D)>
template <class D, HWY_IF_I16_D(D)>
template <class D, HWY_IF_I32_D(D)>
template <class D, HWY_IF_I64_D(D)>
template <class D, HWY_IF_F16_D(D)>
  return Vec64<float16_t>(vget_high_f16(v.raw));
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
  return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
template <class D, HWY_IF_F32_D(D)>
template <class D, HWY_IF_F64_D(D)>
  return Vec64<double>(vget_high_f64(v.raw));
template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
  const Half<decltype(du)> duh;
template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
  const VFromD<decltype(du)> upper =
template <int kLane, typename T>
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<uint8_t>(vdupq_laneq_u8(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<int8_t>(vdupq_laneq_s8(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_laneq_f16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
#if HWY_NEON_HAVE_BFLOAT16
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
#if HWY_NEON_HAVE_BFLOAT16
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
          HWY_IF_LANES_GT_D(DFromV<V>, 1)>
template <typename T, size_t N>
template <class D, HWY_IF_T_SIZE_D(D, 1)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
template <class D, HWY_IF_T_SIZE_D(D, 1)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
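// Illustrative sketch (not part of the library): the two tables above expand a
// lane index into per-byte shuffle indices. For 4-byte lanes, index i is first
// broadcast to all four bytes of its lane, shifted left by log2(sizeof(T)),
// then the per-byte offsets 0,1,2,3 are added, i.e.
//   byte_index[4 * i + k] = 4 * lane_index[i] + k   for k in [0, 4).
// Typical use goes through the public API, for example:
//   const Full128<uint32_t> d;
//   const int32_t idx[4] = {3, 2, 1, 0};  // reverse the lanes
//   const auto rev = TableLookupLanes(Iota(d, 0), SetTableIndices(d, idx));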
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;
template <class D, typename TI,
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;
  using V8 = VFromD<decltype(d8)>;
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
template <class D, typename TI>
  const Rebind<TI, decltype(d)> di;
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
  const Twice<decltype(d)> dt;
template <typename T>
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const Twice<decltype(du8)> dt_u8;
  detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
template <typename T>
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
  const Half<decltype(d)> dh;
  const auto a_lo_u8 = LowerHalf(dh_u8, a_u8);
  const auto a_hi_u8 = UpperHalf(dh_u8, a_u8);
  const auto b_lo_u8 = LowerHalf(dh_u8, b_u8);
  const auto b_hi_u8 = UpperHalf(dh_u8, b_u8);
  const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8);
  const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8);
      {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}};
  const auto lo_result =
  const auto hi_result =
  return Combine(d, hi_result, lo_result);
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
template <class D, HWY_IF_T_SIZE_D(D, 8)>
  return CombineShiftRightBytes<8>(d, v, v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
template <class D, HWY_IF_T_SIZE_D(D, 4)>
template <class D, HWY_IF_T_SIZE_D(D, 8)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D, VFromD<D>) {
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 4)>
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 8)>
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 16)>
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif
template <typename T>
  return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
template <typename T>
  return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
template <typename T>
  return CombineShiftRightBytes<4>(DFromV<decltype(v)>(), v, v);
template <typename T>
  return CombineShiftRightBytes<12>(DFromV<decltype(v)>(), v, v);
template <typename T>
template <typename T, HWY_IF_T_SIZE(T, 8)>
  return CombineShiftRightBytes<8>(d, b, Shuffle01(a));
#if !HWY_HAVE_FLOAT16
template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
template <typename T, HWY_IF_T_SIZE(T, 8)>
  return CombineShiftRightBytes<8>(d, Shuffle01(b), a);
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
  const Half<decltype(d)> d2;
template <class V, class DW = RepartitionToWide<DFromV<V>>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
                                    const uint32_t x0) {
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8)));
  const GccU32RawVectType raw = {x0, x1};
  return ResizeBitCast(d, Vec64<uint32_t>(reinterpret_cast<uint32x2_t>(raw)));
template <class D, HWY_IF_V_SIZE_D(D, 16)>
                                    const uint32_t x0) {
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
  const GccU32RawVectType raw = {x0, x1, x2, x3};
  return ResizeBitCast(
      d, Vec128<uint32_t>(reinterpret_cast<uint32x4_t>(raw)));
template <size_t kLaneSize, size_t kVectSize, class V,
template <size_t kLaneSize, size_t kVectSize, class V,
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
      du, static_cast<TU>(amt * sizeof(TFromV<V>) * 8)));
template <class V, HWY_IF_V_SIZE_V(V, 16)>
      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
6220template <
class D, HWY_IF_LANES_D(D, 1)>
6225template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
6227#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6228 if (__builtin_constant_p(amt)) {
6233 return ShiftLeftLanes<1>(
d, v);
6243template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
6245#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6246 if (__builtin_constant_p(amt)) {
6251 return ShiftLeftLanes<1>(
d, v);
6253 return ShiftLeftLanes<2>(
d, v);
6255 return ShiftLeftLanes<3>(
d, v);
6265template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
6267#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6268 if (__builtin_constant_p(amt)) {
6273 return ShiftLeftLanes<1>(
d, v);
6275 return ShiftLeftLanes<2>(
d, v);
6277 return ShiftLeftLanes<3>(
d, v);
6279 return ShiftLeftLanes<4>(
d, v);
6281 return ShiftLeftLanes<5>(
d, v);
6283 return ShiftLeftLanes<6>(
d, v);
6285 return ShiftLeftLanes<7>(
d, v);
6295template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
6297#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6298 if (__builtin_constant_p(amt)) {
6303 return ShiftLeftLanes<1>(
d, v);
6305 return ShiftLeftLanes<2>(
d, v);
6307 return ShiftLeftLanes<3>(
d, v);
6309 return ShiftLeftLanes<4>(
d, v);
6311 return ShiftLeftLanes<5>(
d, v);
6313 return ShiftLeftLanes<6>(
d, v);
6315 return ShiftLeftLanes<7>(
d, v);
6317 return ShiftLeftLanes<8>(
d, v);
6319 return ShiftLeftLanes<9>(
d, v);
6321 return ShiftLeftLanes<10>(
d, v);
6323 return ShiftLeftLanes<11>(
d, v);
6325 return ShiftLeftLanes<12>(
d, v);
6327 return ShiftLeftLanes<13>(
d, v);
6329 return ShiftLeftLanes<14>(
d, v);
6331 return ShiftLeftLanes<15>(
d, v);
6345template <
class V, HWY_IF_V_SIZE_LE_V(V, 8)>
6352 du,
static_cast<TU
>(TU{0} - amt *
sizeof(
TFromV<V>) * 8)));
6355template <
class V, HWY_IF_V_SIZE_V(V, 16)>
6359 auto idx =
Iota(di8,
static_cast<int8_t
>(amt *
sizeof(
TFromV<V>)));
6366template <
class D, HWY_IF_LANES_D(D, 1)>
6371template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
6373#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6374 if (__builtin_constant_p(amt)) {
6379 return ShiftRightLanes<1>(
d, v);
6389template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
6391#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6392 if (__builtin_constant_p(amt)) {
6397 return ShiftRightLanes<1>(
d, v);
6399 return ShiftRightLanes<2>(
d, v);
6401 return ShiftRightLanes<3>(
d, v);
6411template <
class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
6413#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6414 if (__builtin_constant_p(amt)) {
6419 return ShiftRightLanes<1>(
d, v);
6421 return ShiftRightLanes<2>(
d, v);
6423 return ShiftRightLanes<3>(
d, v);
6425 return ShiftRightLanes<4>(
d, v);
6427 return ShiftRightLanes<5>(
d, v);
6429 return ShiftRightLanes<6>(
d, v);
6431 return ShiftRightLanes<7>(
d, v);
6441template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
6443#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
6444 if (__builtin_constant_p(amt)) {
6449 return ShiftRightLanes<1>(
d, v);
6451 return ShiftRightLanes<2>(
d, v);
6453 return ShiftRightLanes<3>(
d, v);
6455 return ShiftRightLanes<4>(
d, v);
6457 return ShiftRightLanes<5>(
d, v);
6459 return ShiftRightLanes<6>(
d, v);
6461 return ShiftRightLanes<7>(
d, v);
6463 return ShiftRightLanes<8>(
d, v);
6465 return ShiftRightLanes<9>(
d, v);
6467 return ShiftRightLanes<10>(
d, v);
6469 return ShiftRightLanes<11>(
d, v);
6471 return ShiftRightLanes<12>(
d, v);
6473 return ShiftRightLanes<13>(
d, v);
6475 return ShiftRightLanes<14>(
d, v);
6477 return ShiftRightLanes<15>(
d, v);
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#else
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#endif
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
  return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
                                              VFromD<Rebind<int16_t, DI32>> a,
                                              VFromD<Rebind<int16_t, DI32>> b,
  const Full128<TFromD<DI32>> di32_full;
  const Rebind<int16_t, decltype(di32_full)> di16_full64;
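// Note (not from the original source): vqdmlal_s16 computes
// sum + saturate(2 * a * b) with the i16 products widened to i32, i.e. a Q15
// fixed-point multiply-accumulate. For example, with a = b = 0x4000 (0.5 in
// Q15), each lane adds 0x20000000 (0.25 in Q31) to the accumulator.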
#if HWY_NEON_HAVE_F32_TO_BF16C
#if HWY_NEON_HAVE_BFLOAT16
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
  return vreinterpret_bf16_u16(raw);
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
  return vreinterpretq_bf16_u16(raw);
template <class D, HWY_IF_V_SIZE_D(D, 16)>
                                              Vec128<bfloat16_t> b,
                                              const Vec128<float> sum0,
  return Vec128<float>(vbfdotq_f32(sum0.raw,
                                   detail::BitCastToRawNeonBF16(a.raw),
                                   detail::BitCastToRawNeonBF16(b.raw)));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
    D, VFromD<Repartition<bfloat16_t, D>> a,
  return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
                              detail::BitCastToRawNeonBF16(b.raw)));
6575 using VU32 =
VFromD<
decltype(du32)>;
6576 const VU32 odd =
Set(du32, 0xFFFF0000u);
6577 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
6579 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
6587template <
class D, HWY_IF_I32_D(D)>
6603template <
class D, HWY_IF_I32_D(D)>
6616template <
class D, HWY_IF_I32_D(D)>
6629template <
class D, HWY_IF_U32_D(D)>
6646template <
class D, HWY_IF_U32_D(D)>
6659template <
class D, HWY_IF_U32_D(D)>
6674template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6689#if HWY_NEON_HAVE_BFLOAT16
6693 return Add(sum0, sum1);
6704 const Half<
decltype(
d)> d64;
6732 const Half<
decltype(
d)> d64;
6755#if HWY_NEON_HAVE_F32_TO_BF16C
6757template <
class D, HWY_IF_V_SIZE_D(D, 16)>
6759 Vec128<bfloat16_t> b) {
6760 return Vec128<float>(vbfdotq_f32(
Zero(d32).raw,
6761 detail::BitCastToRawNeonBF16(a.raw),
6762 detail::BitCastToRawNeonBF16(b.raw)));
6765template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6767 VFromD<Repartition<bfloat16_t, D>> a,
6768 VFromD<Repartition<bfloat16_t, D>> b) {
6770 detail::BitCastToRawNeonBF16(a.raw),
6771 detail::BitCastToRawNeonBF16(b.raw)));
6775template <
class D32, HWY_IF_F32_D(D32)>
6780 using VU32 =
VFromD<
decltype(du32)>;
6781 const VU32 odd =
Set(du32, 0xFFFF0000u);
6782 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
6784 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
6791template <
class D, HWY_IF_I32_D(D)>
6806template <
class D, HWY_IF_I32_D(D)>
6817template <
class D, HWY_IF_I32_D(D)>
6827template <
class D, HWY_IF_U32_D(D)>
6843template <
class D, HWY_IF_U32_D(D)>
6854template <
class D, HWY_IF_U32_D(D)>
6874template <
class D, HWY_IF_V_SIZE_GT_D(D, 4)>
6888#define HWY_NEON_BUILD_TPL_HWY_TRN
6889#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
6892#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
6893 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
6894#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
6913#undef HWY_NEON_BUILD_TPL_HWY_TRN
6914#undef HWY_NEON_BUILD_RET_HWY_TRN
6915#undef HWY_NEON_BUILD_PARAM_HWY_TRN
6916#undef HWY_NEON_BUILD_ARG_HWY_TRN
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
  using VU = VFromD<decltype(du)>;
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
  using VU = VFromD<decltype(du)>;
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
  constexpr size_t kSize = d.MaxBytes();
  const Full64<uint8_t> d8x8;
  const Full64<TFromD<D>> d64;
  using V8x8 = VFromD<decltype(d8x8)>;
  const V8x8 hi8x8(BitCast(d8, hi).raw);
#if !HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  const Twice<decltype(d)> d2;
template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  const Twice<decltype(d)> d2;
template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
template <typename T, size_t N,
  return detail::InterleaveEven(v, v);
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
template <typename T, size_t N,
  return detail::InterleaveOdd(v, v);
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
template <typename T, size_t N>
  alignas(16) static constexpr uint8_t kBytes[16] = {
      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API VFromD<D> InterleaveEven(D, VFromD<D> a, VFromD<D> b) {
  return detail::InterleaveEven(a, b);
  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
template <class D, HWY_IF_T_SIZE_D(D, 8)>
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API VFromD<D> InterleaveOdd(D, VFromD<D> a, VFromD<D> b) {
  return detail::InterleaveOdd(a, b);
  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
template <class D, HWY_IF_T_SIZE_D(D, 8)>
7155template <
typename T,
size_t N>
7161template <
typename T,
size_t N>
7168template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7175#if HWY_NEON_HAVE_F32_TO_BF16C
7176template <
class D, HWY_IF_BF16_D(D)>
7178 VFromD<Repartition<float, D>> b) {
7179 const Half<
decltype(dbf16)> dh_bf16;
7184template <
class D, HWY_IF_I32_D(D)>
7193 return Combine(d32, b32, a32);
7197template <
class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7200 const Rebind<int64_t,
decltype(d32)> dt;
7204template <
class D, HWY_IF_U32_D(D)>
7213 return Combine(d32, b32, a32);
7217template <
class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7219 VFromD<Repartition<int64_t, D>> b) {
7220 const Rebind<int64_t,
decltype(d32)> dt;
7224template <
class D, HWY_IF_U32_D(D)>
7233 return Combine(d32, b32, a32);
7237template <
class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7240 const Rebind<uint64_t,
decltype(d32)> dt;
7244template <
class D, HWY_IF_I16_D(D)>
7253 return Combine(d16, b16, a16);
7257template <
class D, HWY_IF_I16_D(D)>
7265template <
class D, HWY_IF_I16_D(D)>
7273template <
class D, HWY_IF_U16_D(D)>
7282 return Combine(d16, b16, a16);
7286template <
class D, HWY_IF_U16_D(D)>
7294template <
class D, HWY_IF_U16_D(D)>
7302template <
class D, HWY_IF_U16_D(D)>
7311 return Combine(d16, b16, a16);
7315template <
class D, HWY_IF_U16_D(D)>
7323template <
class D, HWY_IF_U16_D(D)>
7331template <
class D, HWY_IF_I8_D(D)>
7344template <
class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7347 const Rebind<int16_t,
decltype(d8)> dt;
7351template <
class D, HWY_IF_U8_D(D)>
7364template <
class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7366 VFromD<Repartition<int16_t, D>> b) {
7367 const Rebind<int16_t,
decltype(d8)> dt;
7371template <
class D, HWY_IF_U8_D(D)>
7384template <
class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7387 const Rebind<uint16_t,
decltype(d8)> dt;
7398#if HWY_NEON_HAVE_F32_TO_BF16C
7399template <
class D, HWY_IF_BF16_D(D)>
7401 VFromD<Repartition<float, D>> b) {
7410#if HWY_TARGET == HWY_NEON
7412#ifdef HWY_NATIVE_AES
7413#undef HWY_NATIVE_AES
7415#define HWY_NATIVE_AES
7458 (uint64x2_t)vmull_high_p64((poly64x2_t)a.
raw, (poly64x2_t)b.
raw));
7465template <
class D, HWY_IF_F32_D(D)>
7467 const Rebind<uint16_t,
decltype(df32)> du16;
7474template <
class DTo,
typename TTo = TFromD<DTo>,
typename TFrom,
7475 HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo),
7476 hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* =
nullptr>
7477HWY_API Vec128<TTo, 1> TruncateTo(DTo , Vec128<TFrom, 1> v) {
7478 const Repartition<TTo, DFromV<decltype(v)>> d;
7479 return Vec128<TTo, 1>{BitCast(d, v).raw};
7482template <
class D, HWY_IF_U8_D(D)>
7492template <
class D, HWY_IF_U16_D(D)>
7501template <
class D, HWY_IF_U32_D(D)>
7509template <
class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
7518template <
class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 1)>
7526template <
class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
7543 vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
7550 vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
7557 vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
7564 vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
7571 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
7578 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
7587 return Vec128<int16_t, (N + 1) / 2>(
7588 vget_low_s16(vmull_s8(a_packed, b_packed)));
7596 return Vec128<uint16_t, (N + 1) / 2>(
7597 vget_low_u16(vmull_u8(a_packed, b_packed)));
7605 return Vec128<int32_t, (N + 1) / 2>(
7606 vget_low_s32(vmull_s16(a_packed, b_packed)));
7614 return Vec128<uint32_t, (N + 1) / 2>(
7615 vget_low_u32(vmull_u16(a_packed, b_packed)));
7623 return Vec128<int64_t, (N + 1) / 2>(
7624 vget_low_s64(vmull_s32(a_packed, b_packed)));
7632 return Vec128<uint64_t, (N + 1) / 2>(
7633 vget_low_u64(vmull_u32(a_packed, b_packed)));
7636template <
class T, HWY_IF_UI64(T)>
7650 vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
7654 uint8x16_t a_packed =
ConcatOdd(
d, a, a).raw;
7655 uint8x16_t b_packed =
ConcatOdd(
d, b, b).raw;
7657 vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
7664 vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
7668 uint16x8_t a_packed =
ConcatOdd(
d, a, a).raw;
7669 uint16x8_t b_packed =
ConcatOdd(
d, b, b).raw;
7671 vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
7678 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
7682 uint32x4_t a_packed =
ConcatOdd(
d, a, a).raw;
7683 uint32x4_t b_packed =
ConcatOdd(
d, b, b).raw;
7685 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
7694 return Vec128<int16_t, (N + 1) / 2>(
7695 vget_low_s16(vmull_s8(a_packed, b_packed)));
7703 return Vec128<uint16_t, (N + 1) / 2>(
7704 vget_low_u16(vmull_u8(a_packed, b_packed)));
7712 return Vec128<int32_t, (N + 1) / 2>(
7713 vget_low_s32(vmull_s16(a_packed, b_packed)));
7719 uint16x4_t a_packed =
ConcatOdd(
d, a, a).raw;
7720 uint16x4_t b_packed =
ConcatOdd(
d, b, b).raw;
7721 return Vec128<uint32_t, (N + 1) / 2>(
7722 vget_low_u32(vmull_u16(a_packed, b_packed)));
7730 return Vec128<int64_t, (N + 1) / 2>(
7731 vget_low_s64(vmull_s32(a_packed, b_packed)));
7737 uint32x2_t a_packed =
ConcatOdd(
d, a, a).raw;
7738 uint32x2_t b_packed =
ConcatOdd(
d, b, b).raw;
7739 return Vec128<uint64_t, (N + 1) / 2>(
7740 vget_low_u64(vmull_u32(a_packed, b_packed)));
7743template <
class T, HWY_IF_UI64(T)>
7746 T lo =
Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
template <typename T, typename TI>
  const DFromV<decltype(from)> d;
  uint8x16_t table0 = BitCast(d8, bytes).raw;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  uint8x16_t idx = BitCast(d8, from).raw;
  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
template <typename T, typename TI, size_t NI, HWY_IF_V_SIZE_LE(TI, NI, 8)>
  const auto idx_full = Combine(d_full, from64, from64);
template <typename T, size_t N, typename TI, HWY_IF_V_SIZE_LE(T, N, 8)>
template <typename T, size_t N, typename TI, size_t NI,
  const DFromV<decltype(bytes)> d;
  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
  const auto from8 = BitCast(d_idx8, from);
  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
template <class V, class VI>
#if HWY_TARGET == HWY_NEON
template <uint8_t kRcon>
  alignas(16) static constexpr uint8_t kRconXorMask[16] = {
      0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
  alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
      0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
7835#ifdef HWY_NATIVE_REDUCE_SCALAR
7836#undef HWY_NATIVE_REDUCE_SCALAR
7838#define HWY_NATIVE_REDUCE_SCALAR
7842#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
7843 template <class D, HWY_IF_LANES_D(D, size)> \
7844 HWY_API type##_t name(D , Vec128<type##_t, size> v) { \
7845 return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \
7849#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
7850 HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8) \
7851 HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8) \
7852 HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16) \
7853 HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16) \
7854 HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32) \
7855 HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32) \
7856 HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8) \
7857 HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8) \
7858 HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16) \
7859 HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16) \
7860 HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32) \
7861 HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32) \
7862 HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32) \
7863 HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \
7864 HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64)
7867#define HWY_NEON_DEF_REDUCTION_UI64(name, prefix) \
7868 HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \
7869 HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64)
7872#define HWY_NEON_DEF_REDUCTION_F16(name, prefix) \
7873 HWY_NEON_DEF_REDUCTION(float16, 4, name, prefix, _, f16) \
7874 HWY_NEON_DEF_REDUCTION(float16, 8, name, prefix##q, _, f16)
7876#define HWY_NEON_DEF_REDUCTION_F16(name, prefix)
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv)
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv)
HWY_NEON_DEF_REDUCTION_F16(ReduceMin, vminv)
HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
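// Illustrative sketch (not part of the library): one expansion of the macros
// above, HWY_NEON_DEF_REDUCTION(uint32, 4, ReduceSum, vaddvq, _, u32),
// is roughly:
//   template <class D, HWY_IF_LANES_D(D, 4)>
//   HWY_API uint32_t ReduceSum(D, Vec128<uint32_t, 4> v) {
//     return vaddvq_u32(v.raw);
//   }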
7907template <
class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
7912template <
class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
7917template <
class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
7919 const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
7922template <
class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
7924 const Half<
decltype(
d)> dh;
7929#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
7930#undef HWY_NEON_DEF_REDUCTION_F16
7931#undef HWY_NEON_DEF_REDUCTION_UI64
7932#undef HWY_NEON_DEF_REDUCTION
7936template <
class D, HWY_IF_LANES_GT_D(D, 1)>
7940template <
class D, HWY_IF_LANES_GT_D(D, 1)>
7944template <
class D, HWY_IF_LANES_GT_D(D, 1)>
7953#undef HWY_IF_SUM_OF_LANES_D
7954#define HWY_IF_SUM_OF_LANES_D(D) \
7955 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
7956 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
7958#undef HWY_IF_MINMAX_OF_LANES_D
7959#define HWY_IF_MINMAX_OF_LANES_D(D) \
7960 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
7961 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
7966#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
7967#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
7968 template <class D, HWY_IF_LANES_D(D, size)> \
7969 HWY_API Vec128<type##_t, size> name##OfLanes(D , \
7970 Vec128<type##_t, size> v) { \
7971 HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
7972 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7973 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7974 return Vec128<type##_t, size>(tmp); \
7979#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
7981 template <class D, HWY_IF_LANES_D(D, size)> \
7982 HWY_API Vec128<type##_t, size> name##OfLanes(D , \
7983 Vec128<type##_t, size> v) { \
7984 HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
7985 tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
7986 vget_low_##suffix(v.raw)); \
7987 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7988 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7989 if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7990 return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp)); \
7993#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
7994 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32) \
7995 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
7996 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
7997 HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32) \
7998 HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
7999 HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
8000 HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32) \
8001 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \
8002 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
8003 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
8004 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32) \
8005 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
8006 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8) \
8007 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32)
8013#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
8014#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
8015#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
8016#undef HWY_NEON_BUILD_TYPE_T
8020#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
8021#undef HWY_NATIVE_REDUCE_SUM_4_UI8
8023#define HWY_NATIVE_REDUCE_SUM_4_UI8
8026template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
8040template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8045template <
typename T>
8050template <
class D, HWY_IF_T_SIZE_D(D, 1)>
8055 const auto vmask_bits =
Set64(du, mask_bits);
8058 alignas(16)
static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
8059 1, 1, 1, 1, 1, 1, 1, 1};
8062 alignas(16)
static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
8063 1, 2, 4, 8, 16, 32, 64, 128};
8067template <
class D, HWY_IF_T_SIZE_D(D, 2)>
8070 alignas(16)
static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
8071 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
8075template <
class D, HWY_IF_T_SIZE_D(D, 4)>
8078 alignas(16)
static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
8079 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
8083template <
class D, HWY_IF_T_SIZE_D(D, 8)>
8086 alignas(16)
static constexpr uint64_t kBit[8] = {1, 2};
8093template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
8095 uint64_t mask_bits = 0;
8096 CopyBytes<(
d.MaxLanes() + 7) / 8>(bits, &mask_bits);
8105 if (kN < 8) mask_bits &= (1u << kN) - 1;
template <class D, HWY_IF_V_SIZE_D(D, 16)>
template <class D, HWY_IF_V_SIZE_D(D, 8)>
  const Twice<decltype(d)> d2;
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
  const Mask64<TFromD<D>> mask64(mask.raw);
  return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
template <typename T>
  alignas(16) static constexpr uint8_t kSliceLanes[16] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
  const uint8x8_t x4 = vpadd_u8(x2, x2);
  const uint8x8_t x8 = vpadd_u8(x4, x4);
  return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF;
  const uint16x8_t x2 = vpaddlq_u8(values.raw);
  const uint32x4_t x4 = vpaddlq_u16(x2);
  const uint64x2_t x8 = vpaddlq_u32(x4);
  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
  alignas(8) static constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
                                                        0x10, 0x20, 0x40, 0x80};
  const DFromM<decltype(mask)> d;
  return vaddv_u8(values.raw);
  const uint16x4_t x2 = vpaddl_u8(values.raw);
  const uint32x2_t x4 = vpaddl_u16(x2);
  const uint64x1_t x8 = vpaddl_u32(x4);
  return vget_lane_u64(x8, 0);
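// Note (not from the original source): the kSliceLanes pattern above ANDs the
// all-ones/all-zeros mask lanes with per-lane bit weights (1, 2, 4, ...) and
// then horizontally adds them, so a u8 mask {FF, 00, FF, 00, ...} yields the
// bit pattern 0b...0101.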
8185template <
typename T>
8187 alignas(16)
static constexpr uint16_t kSliceLanes[8] = {
8188 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
8194 return vaddvq_u16(values.
raw);
8196 const uint32x4_t x2 = vpaddlq_u16(values.
raw);
8197 const uint64x2_t x4 = vpaddlq_u32(x2);
8198 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
8202template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8206 alignas(8)
static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
8207 const DFromM<
decltype(mask)>
d;
8212 return vaddv_u16(values.
raw);
8214 const uint32x2_t x2 = vpaddl_u16(values.
raw);
8215 const uint64x1_t x4 = vpaddl_u32(x2);
8216 return vget_lane_u64(x4, 0);
8220template <
typename T>
8222 alignas(16)
static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
8228 return vaddvq_u32(values.
raw);
8230 const uint64x2_t x2 = vpaddlq_u32(values.
raw);
8231 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
8235template <
typename T,
size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8239 alignas(8)
static constexpr uint32_t kSliceLanes[2] = {1, 2};
8240 const DFromM<
decltype(mask)>
d;
8245 return vaddv_u32(values.
raw);
8247 const uint64x1_t x2 = vpaddl_u32(values.
raw);
8248 return vget_lane_u64(x2, 0);
8252template <
typename T>
8254 alignas(16)
static constexpr uint64_t kSliceLanes[2] = {1, 2};
8260 return vaddvq_u64(values.
raw);
8262 return vgetq_lane_u64(values.
raw, 0) + vgetq_lane_u64(values.
raw, 1);
8266template <
typename T>
8271 return vget_lane_u64(values.
raw, 0);
8275template <
typename T,
size_t N>
8277 return ((N *
sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
8280template <
typename T,
size_t N>
8295template <
typename T>
8298 const int8x16_t ones =
8302 return static_cast<size_t>(vaddvq_s8(ones));
8304 const int16x8_t x2 = vpaddlq_s8(ones);
8305 const int32x4_t x4 = vpaddlq_s16(x2);
8306 const int64x2_t x8 = vpaddlq_s32(x4);
8307 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
8310template <
typename T>
8313 const int16x8_t ones =
8317 return static_cast<size_t>(vaddvq_s16(ones));
8319 const int32x4_t x2 = vpaddlq_s16(ones);
8320 const int64x2_t x4 = vpaddlq_s32(x2);
8321 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
8325template <
typename T>
8328 const int32x4_t ones =
8332 return static_cast<size_t>(vaddvq_s32(ones));
8334 const int64x2_t x2 = vpaddlq_s32(ones);
8335 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
8339template <
typename T>
8343 const int64x2_t ones =
8345 return static_cast<size_t>(vaddvq_s64(ones));
8349 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
8350 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
8357template <
class D,
typename T = TFromD<D>>
8363template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8365 constexpr int kDiv = 4 *
sizeof(
TFromD<D>);
8372 constexpr size_t kDiv = 4 *
sizeof(
TFromD<D>);
8379 if (nib == 0)
return -1;
8380 constexpr size_t kDiv = 4 *
sizeof(
TFromD<D>);
8387 constexpr size_t kDiv = 4 *
sizeof(
TFromD<D>);
8394 if (nib == 0)
return -1;
8395 constexpr size_t kDiv = 4 *
sizeof(
TFromD<D>);
8404 const size_t kNumBytes = (
d.MaxLanes() + 7) / 8;
8405 CopyBytes<kNumBytes>(&mask_bits, bits);
8415template <
class D,
typename T = TFromD<D>>
8420template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8427template <
typename T>
8435template <
class D, HWY_IF_V_SIZE_D(D, 16)>
8442template <
class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8444 return Load(
d, bytes);
8447template <
typename T,
size_t N>
8449 uint64_t mask_bits) {
8463 alignas(16)
static constexpr uint8_t table[256 * 8] = {
8465 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8466 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8467 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
8468 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8469 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
8470 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
8471 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
8472 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8473 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
8474 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
8475 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
8476 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
8477 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
8478 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
8479 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
8480 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8481 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
8482 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
8483 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
8484 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
8485 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
8486 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
8487 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
8488 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
8489 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
8490 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
8491 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
8492 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
8493 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
8494 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
8495 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
8496 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8497 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
8498 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
8499 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
8500 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
8501 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
8502 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
8503 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
8504 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
8505 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
8506 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
8507 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
8508 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
8509 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
8510 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
8511 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
8512 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
8513 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
8514 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
8515 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
8516 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
8517 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
8518 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
8519 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
8520 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
8521 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
8522 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
8523 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
8524 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
8525 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
8526 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
8527 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
8528 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
8529 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
8530 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
8531 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
8532 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
8533 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
8534 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
8535 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
8536 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
8537 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
8538 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
8539 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
8540 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
8541 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
8542 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
8543 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
8544 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
8545 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
8546 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
8547 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
8548 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
8549 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
8550 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
8551 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
8552 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
8553 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
8554 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
8555 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
8556 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
8557 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
8558 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
8559 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
8560 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
8561 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
8562 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
8563 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
8564 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
8565 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
8566 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
8567 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
8568 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
8569 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
8570 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
8571 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
8572 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
8573 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
8574 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
8575 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
8576 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
8577 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
8578 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
8579 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
8580 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
8581 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
8582 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
8583 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
8584 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
8585 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
8586 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
8587 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
8588 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
8589 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
8590 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
8591 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
8592 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
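// Added commentary (not part of the original header): each of the 256 rows
// above stores eight byte offsets, one per uint16 lane, listing the starting
// byte of every lane whose mask bit is set before those of the cleared
// lanes. mask_bits selects the row; e.g. mask_bits == 0b00000101 (lanes 0
// and 2 active) selects {0, 4, 2, 6, 8, 10, 12, 14}, so the subsequent
// byte shuffle gathers lanes 0 and 2 into the first two output lanes.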
8599template <typename T, size_t N>
8601 uint64_t mask_bits) {
8615 alignas(16) static constexpr uint8_t table[256 * 8] = {
8617 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
8618 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
8619 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
8620 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
8621 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
8622 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
8623 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
8624 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
8625 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
8626 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
8627 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
8628 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
8629 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
8630 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
8631 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
8632 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
8633 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
8634 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
8635 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
8636 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
8637 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
8638 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
8639 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
8640 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
8641 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
8642 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
8643 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
8644 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
8645 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
8646 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
8647 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
8648 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
8649 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
8650 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
8651 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
8652 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
8653 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
8654 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
8655 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
8656 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
8657 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
8658 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
8659 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
8660 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
8661 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
8662 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
8663 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
8664 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
8665 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
8666 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
8667 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
8668 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
8669 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
8670 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
8671 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
8672 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
8673 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
8674 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
8675 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
8676 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
8677 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
8678 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
8679 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
8680 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
8681 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
8682 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
8683 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
8684 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
8685 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
8686 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
8687 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
8688 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
8689 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
8690 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
8691 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
8692 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
8693 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
8694 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
8695 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
8696 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
8697 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
8698 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
8699 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
8700 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
8701 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
8702 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
8703 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
8704 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
8705 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
8706 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
8707 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
8708 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
8709 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
8710 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
8711 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
8712 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
8713 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
8714 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
8715 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
8716 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
8717 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
8718 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
8719 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
8720 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
8721 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
8722 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
8723 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
8724 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
8725 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
8726 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
8727 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
8728 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
8729 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
8730 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
8731 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
8732 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
8733 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
8734 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
8735 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
8736 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
8737 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
8738 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
8739 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
8740 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
8741 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
8742 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
8743 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
8744 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
8751template <typename T, size_t N>
8753 uint64_t mask_bits) {
8757 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
8759 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8760 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8761 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
8762 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8763 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
8764 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
8765 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
8766 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8767 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
8768 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
8769 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
8770 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
8771 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
8772 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
8773 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
8774 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
8777 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8780template <typename T, size_t N>
8782 uint64_t mask_bits) {
8786 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
8788 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
8789 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
8790 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
8791 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
8792 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
8793 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
8794 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
8795 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8796 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
8797 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
8798 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
8799 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
8800 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
8801 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
8805 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8808#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
8810template <typename T, size_t N>
8812 uint64_t mask_bits) {
8816 alignas(16) static constexpr uint8_t u8_indices[64] = {
8818 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8819 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8820 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
8821 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
8823 const Simd<T, N, 0> d;
8825 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8828template <typename T, size_t N>
8830 uint64_t mask_bits) {
8834 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
8836 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8837 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
8838 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8839 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
8841 const Simd<T, N, 0> d;
8843 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8850template <typename T, size_t N>
8853 detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
8854 using D = DFromV<decltype(v)>;
8859template <typename T, size_t N>
8862 detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
8863 using D = DFromV<decltype(v)>;
8871template <typename T>
8877template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
8889template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
8890HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
8891 return detail::Compress(v, detail::BitsFromMask(mask));
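// Added usage sketch (not part of the original header): Compress moves the
// lanes whose mask bit is set to the front of the vector, preserving their
// order; the remaining lanes are unspecified. For example:
//
//   const FixedTag<uint32_t, 4> d;
//   const auto v = Iota(d, 1);           // {1, 2, 3, 4}
//   const auto m = Ne(v, Set(d, 2u));    // drop the lane equal to 2
//   const auto packed = Compress(v, m);  // {1, 3, 4, <unspecified>}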
8895template <typename T>
8901template <typename T, HWY_IF_T_SIZE(T, 8)>
8913template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
8914HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
8917 if (N < 16 / sizeof(T)) {
8918 return detail::Compress(v, detail::BitsFromMask(Not(mask)));
8920 return detail::CompressNot(v, detail::BitsFromMask(mask));
8924HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
8925 Mask128<uint64_t> ) {
8931template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
8934 uint64_t mask_bits = 0;
8935 constexpr size_t kNumBytes = (N + 7) / 8;
8936 CopyBytes<kNumBytes>(bits, &mask_bits);
8938 mask_bits &= (1ull << N) - 1;
8945template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
8954template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
8959 const size_t count = PopCount(mask_bits);
8961 const VFromD<decltype(du)> compressed =
8969template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
8972 uint64_t mask_bits = 0;
8973 constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8;
8974 CopyBytes<kNumBytes>(bits, &mask_bits);
8975 if (d.MaxLanes() < 8) {
8976 mask_bits &= (1ull << d.MaxLanes()) - 1;
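// Added usage sketch (not part of the original header): CompressStore (and
// CompressBitsStore for a pre-packed bit mask) place the selected lanes at
// the front of the destination and return how many are valid; positions
// after those may also be overwritten, hence the full-vector buffer:
//
//   const FixedTag<uint32_t, 4> d;
//   uint32_t out[4];  // large enough for a whole vector
//   const auto v = Iota(d, 0);                                // {0, 1, 2, 3}
//   const size_t n = CompressStore(v, FirstN(d, 2), d, out);  // n == 2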
8986#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
8987#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
8989#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
8994#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
8995#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
8998#define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
8999#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
9002#define HWY_IF_LOAD_INT(D) \
9003 HWY_IF_V_SIZE_GT_D(D, 4), \
9004 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9006#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
9007 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9008 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
9009 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9010 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9011 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
9017#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9018 decltype(Tuple2<type##_t, size>().raw)
9020#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9021 const NativeLaneType<type##_t>*from, Tuple2<type##_t, size>
9023#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9024#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9026#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9027 decltype(Tuple3<type##_t, size>().raw)
9028#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9029 const NativeLaneType<type##_t>*from, Tuple3<type##_t, size>
9031#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9032#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9034#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9035 decltype(Tuple4<type##_t, size>().raw)
9036#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9037 const NativeLaneType<type##_t>*from, Tuple4<type##_t, size>
9039#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9040#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9042#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
9043#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
9044#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
9048template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9058template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9062 alignas(16) T buf[2 * 8 / sizeof(T)] = {};
9065 detail::Tuple2<T, d.MaxLanes()>());
9072template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9075 const Half<decltype(d)> dh;
9076 VFromD<decltype(dh)> v00, v10, v01, v11;
9086template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9097template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9101 alignas(16) T buf[3 * 8 / sizeof(T)] = {};
9104 detail::Tuple3<T, d.MaxLanes()>());
9112template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9114 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
9115 const Half<decltype(d)> dh;
9116 VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
9127template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9140template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9144 alignas(16) T buf[4 * 8 / sizeof(T)] = {};
9147 detail::Tuple4<T, d.MaxLanes()>());
9156template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9158 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
9160 const Half<decltype(d)> dh;
9161 VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
9173#undef HWY_IF_LOAD_INT
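// Added usage sketch (not part of the original header): the LoadInterleaved*
// wrappers above de-interleave array-of-structures data into one vector per
// field (vld2/vld3/vld4 on the native path, a stack buffer for the tiny
// partial vectors). For example, for packed RGB bytes (`rgb` being a
// hypothetical pointer to at least 3 * 8 interleaved bytes):
//
//   const FixedTag<uint8_t, 8> d;
//   VFromD<decltype(d)> r, g, b;
//   LoadInterleaved3(d, rgb, r, g, b);  // r = {R0..R7}, g = {G0..G7}, ...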
9178#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
9179#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
9180#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
9183#define HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
9184#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
9187#define HWY_IF_STORE_INT(D) \
9188 HWY_IF_V_SIZE_GT_D(D, 4), \
9189 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9191#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
9192 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9193 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
9194 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9195 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9196 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
9199#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
9200 Tuple2<type##_t, size> tup, NativeLaneType<type##_t>*to
9202#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
9204#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
9205 Tuple3<type##_t, size> tup, NativeLaneType<type##_t>*to
9207#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
9209#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
9210 Tuple4<type##_t, size> tup, NativeLaneType<type##_t>*to
9212#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
9214#undef HWY_NEON_DEF_FUNCTION_STORE_INT
9215#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
9216#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
9217#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
9220template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
9228template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9231 alignas(16) T buf[2 * 8 / sizeof(T)];
9232 detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
9239template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9242 const Half<decltype(d)> dh;
9252template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
9260template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9263 alignas(16) T buf[3 * 8 / sizeof(T)];
9264 detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
9271template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9274 const Half<decltype(d)> dh;
9284template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
9287 detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
9292template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9295 alignas(16) T buf[4 * 8 / sizeof(T)];
9296 detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
9303template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9306 const Half<decltype(d)> dh;
9316#undef HWY_IF_STORE_INT
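// Added usage sketch (not part of the original header): StoreInterleaved* is
// the inverse, writing v0[0], v1[0], v0[1], v1[1], ... back to memory
// (vst2/vst3/vst4 natively, or via the stack-buffer paths above for small
// vectors). E.g. interleaving real/imaginary parts into a hypothetical
// `out` buffer of 16 uint16_t:
//
//   const FixedTag<uint16_t, 8> d;
//   StoreInterleaved2(re, im, d, out);  // re/im are VFromD<decltype(d)>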
9329template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
9333 const auto neg_vmask =
9337template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
9343 vmask = Or(vmask, Neg(vmask));
9350template <class T, size_t N>
9365 const auto zero = Zero(di);
9369template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
9375 const auto only_first_vmask =
9379template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
9385 const auto zero = Zero(di64);
9400template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
9408template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9410 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
9435template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9443template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9445 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
9450template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9458template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9460 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
9465template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
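// Added usage sketch (not part of the original header): Lt128/Eq128/Min128/
// Max128 treat each adjacent pair of u64 lanes as one unsigned 128-bit
// number, with the low 64 bits in the even (lower-indexed) lane, and set or
// select both lanes of a pair together:
//
//   const FixedTag<uint64_t, 2> d;
//   const auto a = Set(d, uint64_t{5});  // the 128-bit value 5 + (5 << 64)
//   const auto b = Set(d, uint64_t{9});  // the 128-bit value 9 + (9 << 64)
//   const auto m = Lt128(d, a, b);       // both lanes of the pair are set
//   const auto lo = Min128(d, a, b);     // == a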
9496#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
9497#undef HWY_NATIVE_LEADING_ZERO_COUNT
9499#define HWY_NATIVE_LEADING_ZERO_COUNT
9511 const auto v_k32 = BitCast(du32, Set(du, 32));
9513 const auto v_u32_lo_lzcnt =
9515 const auto v_u32_hi_lzcnt =
9519 d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt));
9522template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
9525 using T = TFromD<decltype(d)>;
9529template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, 1)>
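// Added usage sketch (not part of the original header): these bit-scan
// helpers operate per lane; the u64 path above appears to combine two 32-bit
// vclz results, taking the high half's count unless it is 32. For example:
//
//   const FixedTag<uint32_t, 4> d;
//   const auto v = Set(d, 1u);              // 0x00000001 in every lane
//   const auto lz = LeadingZeroCount(v);    // 31 per lane
//   const auto hi = HighestSetBitIndex(v);  // 0 per lane
//   const auto tz = TrailingZeroCount(v);   // 0 per lane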
9603#undef HWY_NEON_BUILD_ARG_1
9604#undef HWY_NEON_BUILD_ARG_2
9605#undef HWY_NEON_BUILD_ARG_3
9606#undef HWY_NEON_BUILD_PARAM_1
9607#undef HWY_NEON_BUILD_PARAM_2
9608#undef HWY_NEON_BUILD_PARAM_3
9609#undef HWY_NEON_BUILD_RET_1
9610#undef HWY_NEON_BUILD_RET_2
9611#undef HWY_NEON_BUILD_RET_3
9612#undef HWY_NEON_BUILD_TPL_1
9613#undef HWY_NEON_BUILD_TPL_2
9614#undef HWY_NEON_BUILD_TPL_3
9615#undef HWY_NEON_DEF_FUNCTION
9616#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
9617#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
9618#undef HWY_NEON_DEF_FUNCTION_BFLOAT_16
9619#undef HWY_NEON_DEF_FUNCTION_FLOAT_16
9620#undef HWY_NEON_DEF_FUNCTION_FLOAT_16_32
9621#undef HWY_NEON_DEF_FUNCTION_FLOAT_32
9622#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
9623#undef HWY_NEON_DEF_FUNCTION_FULL_UI
9624#undef HWY_NEON_DEF_FUNCTION_FULL_UI_64
9625#undef HWY_NEON_DEF_FUNCTION_FULL_UIF_64
9626#undef HWY_NEON_DEF_FUNCTION_INT_16
9627#undef HWY_NEON_DEF_FUNCTION_INT_32
9628#undef HWY_NEON_DEF_FUNCTION_INT_64
9629#undef HWY_NEON_DEF_FUNCTION_INT_8
9630#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
9631#undef HWY_NEON_DEF_FUNCTION_INTS
9632#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
9633#undef HWY_NEON_DEF_FUNCTION_UI_8_16_32
9634#undef HWY_NEON_DEF_FUNCTION_UIF_64
9635#undef HWY_NEON_DEF_FUNCTION_UIF_8_16_32
9636#undef HWY_NEON_DEF_FUNCTION_UINT_16
9637#undef HWY_NEON_DEF_FUNCTION_UINT_32
9638#undef HWY_NEON_DEF_FUNCTION_UINT_64
9639#undef HWY_NEON_DEF_FUNCTION_UINT_8
9640#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
9641#undef HWY_NEON_DEF_FUNCTION_UINTS
9643#undef HWY_NEON_IF_EMULATED_D
Definition base.h:2588
HWY_API size_t PopCount(T x)
Definition base.h:2615
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080