arm_neon-inl.h
1// Copyright 2019 Google LLC
2// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
3// SPDX-License-Identifier: Apache-2.0
4// SPDX-License-Identifier: BSD-3-Clause
5//
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9//
10// http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18// 128-bit Arm NEON vectors and operations.
19// External include guard in highway.h - see comment there.
20
21// Arm NEON intrinsics are documented at:
22// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
23
24#include "hwy/ops/shared-inl.h"
25
26HWY_DIAGNOSTICS(push)
27HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
28#include <arm_neon.h> // NOLINT(build/include_order)
29HWY_DIAGNOSTICS(pop)
30
31HWY_BEFORE_NAMESPACE();
32namespace hwy {
33namespace HWY_NAMESPACE {
34
35namespace detail { // for code folding and Raw128
36
37// Macros used to define single and double function calls for multiple types
38// for full and half vectors. These macros are undefined at the end of the file.
39
40// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
41#define HWY_NEON_BUILD_TPL_1
42#define HWY_NEON_BUILD_TPL_2
43#define HWY_NEON_BUILD_TPL_3
44
45// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
46// extend it to int32x4x2_t packs.
47#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
48#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
49#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
50
51// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
52#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
53#define HWY_NEON_BUILD_PARAM_2(type, size) \
54 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
55#define HWY_NEON_BUILD_PARAM_3(type, size) \
56 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
57 const Vec128<type##_t, size> c
58
59// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
60// function.
61#define HWY_NEON_BUILD_ARG_1 a.raw
62#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
63#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
64
65// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
66// the __VA_ARGS__ have been expanded. This allows "func" itself to be a macro,
67// as is the case for some of the library "functions" such as vshlq_u8. For
68// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
69// "a, b" (without the quotes) will end up expanding to "vshlq_u8(a, b)".
70// Directly writing vshlq_u8(MY_PARAMS) would fail because the vshlq_u8()
71// macro expects two arguments.
72#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
73
74// Main macro definition that defines a single function for the given type and
75// size of vector, using the underlying (prefix##infix##suffix) function and
76// the template, return type, parameters and arguments defined by the "args"
77// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
78#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
79 HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
80 HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
81 name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
82 return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
83 HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
84 }
85
86// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
87// called "name" using the set of neon functions starting with the given
88// "prefix" for all the variants of certain types, as specified next to each
89// macro. For example, the prefix "vsub" can be used to define the operator-
90// using args=2.
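//
// Illustrative expansion sketch (added for exposition; not part of the original
// header, and the name "Add" is hypothetical): invoking
//   HWY_NEON_DEF_FUNCTION(uint8, 16, Add, vaddq, _, u8, 2)
// would generate roughly the following, with the return type, parameter list
// and argument list taken from the HWY_NEON_BUILD_*_2 macros above:
//
//   HWY_API Vec128<uint8_t, 16> Add(const Vec128<uint8_t, 16> a,
//                                   const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }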
91
92// uint8_t
93#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
94 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
95 HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
96 HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
97 HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
98 HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
99
100// int8_t
101#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
102 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
103 HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
104 HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
105 HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
106 HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
107
108// uint16_t
109#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
110 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
111 HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
112 HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
113 HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
114
115// int16_t
116#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
117 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
118 HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
119 HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
120 HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
121
122// uint32_t
123#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
124 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
125 HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
126 HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
127
128// int32_t
129#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
130 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
131 HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
132 HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
133
134// uint64_t
135#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
136 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
137 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
138
139// int64_t
140#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
141 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
142 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
143
144#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && HWY_HAVE_SCALAR_BF16_TYPE
145#define HWY_NEON_HAVE_BFLOAT16 1
146#else
147#define HWY_NEON_HAVE_BFLOAT16 0
148#endif
149
150// HWY_NEON_HAVE_F32_TO_BF16C is defined if the NEON vcvt_bf16_f32 intrinsic
151// is available, even if the __bf16 type is disabled due to GCC/Clang bugs
152#if HWY_NEON_HAVE_BFLOAT16 || \
153 (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
154 (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
155#define HWY_NEON_HAVE_F32_TO_BF16C 1
156#else
157#define HWY_NEON_HAVE_F32_TO_BF16C 0
158#endif
159
160// bfloat16_t
161#if HWY_NEON_HAVE_BFLOAT16
162#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
163 HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args) \
164 HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args) \
165 HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args) \
166 HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args)
167#else
168#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
169#endif
170
171// Used for conversion instructions if HWY_NEON_HAVE_F16C.
172#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
173 args) \
174 HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args) \
175 HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args) \
176 HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args) \
177 HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args)
178
179// float16_t
180#if HWY_HAVE_FLOAT16
181#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
182 HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args)
183#else
184#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
185#endif
186
187// Enable generic functions for whichever of (f16, bf16) are not supported.
188#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
189#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
190#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
191#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
192#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
193#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
194#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
195// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
196// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
197// !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
198// SFINAE to occur instead of a hard error due to a dependency on the D template
199// argument
200#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
201#else
202#error "Logic error, handled all four cases"
203#endif
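// Usage sketch (added for exposition; not part of the original header): an
// overload constrained to the emulated special-float types is declared as
//   template <class D, HWY_NEON_IF_EMULATED_D(D)>
//   HWY_API VFromD<D> SomeOp(D d, VFromD<D> v);  // hypothetical op name
// When both f16 and bf16 are native, the always-false SFINAE condition above
// removes such overloads from overload resolution without a hard error.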
204
205// float
206#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
207 HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
208 HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
209 HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
210
211// double
212#if HWY_HAVE_FLOAT64
213#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
214 HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
215 HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
216#else
217#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
218#endif
219
220// Helper macros to define for more than one type.
221// uint8_t, uint16_t and uint32_t
222#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
223 HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
224 HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
225 HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
226
227// int8_t, int16_t and int32_t
228#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
229 HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
230 HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
231 HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
232
233// uint8_t, uint16_t, uint32_t and uint64_t
234#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
235 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
236 HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
237
238// int8_t, int16_t, int32_t and int64_t
239#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
240 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
241 HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
242
243// All int*_t and uint*_t up to 64
244#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
245 HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
246 HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
247
248#define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
249 HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
250 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
251
252#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
253 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
254 HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
255
256// All previous types.
257#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
258 HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
259 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
260
261#define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
262 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
263 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)
264
265#define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \
266 HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
267 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)
268
269#define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \
270 HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
271 HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
272 HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
273
274// For vzip1/2
275#define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args) \
276 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
277 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
278#define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \
279 HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args) \
280 HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)
281
282// For eor3q, which is only defined for full vectors.
283#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args) \
284 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
285 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
286 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
287 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
288 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
289 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
290 HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)
291// Emulation of some intrinsics on armv7.
292#if HWY_ARCH_ARM_V7
293#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
294#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
295#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
296#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
297#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
298#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
299#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
300#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
301#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
302#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
303#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
304#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
305#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
306#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
307#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
308#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
309#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
310#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
311#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
312#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
313#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
314#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
315#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
316#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
317#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
318#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
319#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
320#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
321#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
322#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
323#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
324#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
325#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
326#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
327#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
328#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
329#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
330#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
331#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
332#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
333#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
334#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
335#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
336#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
337#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
338#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
339#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
340#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
341#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
342#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
343#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
344#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
345#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
346#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
347#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
348#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
349#endif
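// Note (added for exposition; not part of the original header): on Armv8,
// vuzp1q_u8(a, b) returns the even-indexed lanes of a and b, and vzip1q_u8
// interleaves their lower halves. The Armv7 macros above obtain the same
// result from the two-register vuzpq_*/vzipq_* intrinsics by selecting
// .val[0] (or .val[1] for the vuzp2/vzip2 variants).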
350
351// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2
352// overloads for all vector types, even those (bfloat16_t) where the
353// underlying vector is the same as others (uint16_t).
354template <typename T, size_t N>
355struct Tuple2;
356template <typename T, size_t N>
357struct Tuple3;
358template <typename T, size_t N>
359struct Tuple4;
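// Note (added for exposition; not part of the original header): for example,
// Tuple2<uint8_t, 16> below wraps the native uint8x16x2_t used by vld2q_u8 /
// vst2q_u8, while the float16_t/bfloat16_t specializations fall back to the
// corresponding uint16 tuples when the compiler lacks native f16/bf16 vectors.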
360
361template <>
362struct Tuple2<uint8_t, 16> {
363 uint8x16x2_t raw;
364};
365template <size_t N>
366struct Tuple2<uint8_t, N> {
367 uint8x8x2_t raw;
368};
369template <>
370struct Tuple2<int8_t, 16> {
371 int8x16x2_t raw;
372};
373template <size_t N>
374struct Tuple2<int8_t, N> {
375 int8x8x2_t raw;
376};
377template <>
378struct Tuple2<uint16_t, 8> {
379 uint16x8x2_t raw;
380};
381template <size_t N>
382struct Tuple2<uint16_t, N> {
383 uint16x4x2_t raw;
384};
385template <>
386struct Tuple2<int16_t, 8> {
387 int16x8x2_t raw;
388};
389template <size_t N>
390struct Tuple2<int16_t, N> {
391 int16x4x2_t raw;
392};
393template <>
394struct Tuple2<uint32_t, 4> {
395 uint32x4x2_t raw;
396};
397template <size_t N>
398struct Tuple2<uint32_t, N> {
399 uint32x2x2_t raw;
400};
401template <>
402struct Tuple2<int32_t, 4> {
403 int32x4x2_t raw;
404};
405template <size_t N>
406struct Tuple2<int32_t, N> {
407 int32x2x2_t raw;
408};
409template <>
410struct Tuple2<uint64_t, 2> {
411 uint64x2x2_t raw;
412};
413template <size_t N>
414struct Tuple2<uint64_t, N> {
415 uint64x1x2_t raw;
416};
417template <>
418struct Tuple2<int64_t, 2> {
419 int64x2x2_t raw;
420};
421template <size_t N>
422struct Tuple2<int64_t, N> {
423 int64x1x2_t raw;
424};
425
426template <>
427struct Tuple2<float32_t, 4> {
428 float32x4x2_t raw;
429};
430template <size_t N>
431struct Tuple2<float32_t, N> {
432 float32x2x2_t raw;
433};
434#if HWY_HAVE_FLOAT64
435template <>
436struct Tuple2<float64_t, 2> {
437 float64x2x2_t raw;
438};
439template <size_t N>
440struct Tuple2<float64_t, N> {
441 float64x1x2_t raw;
442};
443#endif // HWY_HAVE_FLOAT64
444
445template <>
446struct Tuple3<uint8_t, 16> {
447 uint8x16x3_t raw;
448};
449template <size_t N>
450struct Tuple3<uint8_t, N> {
451 uint8x8x3_t raw;
452};
453template <>
454struct Tuple3<int8_t, 16> {
455 int8x16x3_t raw;
456};
457template <size_t N>
458struct Tuple3<int8_t, N> {
459 int8x8x3_t raw;
460};
461template <>
462struct Tuple3<uint16_t, 8> {
463 uint16x8x3_t raw;
464};
465template <size_t N>
466struct Tuple3<uint16_t, N> {
467 uint16x4x3_t raw;
468};
469template <>
470struct Tuple3<int16_t, 8> {
471 int16x8x3_t raw;
472};
473template <size_t N>
474struct Tuple3<int16_t, N> {
475 int16x4x3_t raw;
476};
477template <>
478struct Tuple3<uint32_t, 4> {
479 uint32x4x3_t raw;
480};
481template <size_t N>
482struct Tuple3<uint32_t, N> {
483 uint32x2x3_t raw;
484};
485template <>
486struct Tuple3<int32_t, 4> {
487 int32x4x3_t raw;
488};
489template <size_t N>
490struct Tuple3<int32_t, N> {
491 int32x2x3_t raw;
492};
493template <>
494struct Tuple3<uint64_t, 2> {
495 uint64x2x3_t raw;
496};
497template <size_t N>
498struct Tuple3<uint64_t, N> {
499 uint64x1x3_t raw;
500};
501template <>
502struct Tuple3<int64_t, 2> {
503 int64x2x3_t raw;
504};
505template <size_t N>
506struct Tuple3<int64_t, N> {
507 int64x1x3_t raw;
508};
509
510template <>
511struct Tuple3<float32_t, 4> {
512 float32x4x3_t raw;
513};
514template <size_t N>
515struct Tuple3<float32_t, N> {
516 float32x2x3_t raw;
517};
518#if HWY_HAVE_FLOAT64
519template <>
520struct Tuple3<float64_t, 2> {
521 float64x2x3_t raw;
522};
523template <size_t N>
524struct Tuple3<float64_t, N> {
525 float64x1x3_t raw;
526};
527#endif // HWY_HAVE_FLOAT64
528
529template <>
530struct Tuple4<uint8_t, 16> {
531 uint8x16x4_t raw;
532};
533template <size_t N>
534struct Tuple4<uint8_t, N> {
535 uint8x8x4_t raw;
536};
537template <>
538struct Tuple4<int8_t, 16> {
539 int8x16x4_t raw;
540};
541template <size_t N>
542struct Tuple4<int8_t, N> {
543 int8x8x4_t raw;
544};
545template <>
546struct Tuple4<uint16_t, 8> {
547 uint16x8x4_t raw;
548};
549template <size_t N>
550struct Tuple4<uint16_t, N> {
551 uint16x4x4_t raw;
552};
553template <>
554struct Tuple4<int16_t, 8> {
555 int16x8x4_t raw;
556};
557template <size_t N>
558struct Tuple4<int16_t, N> {
559 int16x4x4_t raw;
560};
561template <>
562struct Tuple4<uint32_t, 4> {
563 uint32x4x4_t raw;
564};
565template <size_t N>
566struct Tuple4<uint32_t, N> {
567 uint32x2x4_t raw;
568};
569template <>
570struct Tuple4<int32_t, 4> {
571 int32x4x4_t raw;
572};
573template <size_t N>
574struct Tuple4<int32_t, N> {
575 int32x2x4_t raw;
576};
577template <>
578struct Tuple4<uint64_t, 2> {
579 uint64x2x4_t raw;
580};
581template <size_t N>
582struct Tuple4<uint64_t, N> {
583 uint64x1x4_t raw;
584};
585template <>
586struct Tuple4<int64_t, 2> {
587 int64x2x4_t raw;
588};
589template <size_t N>
590struct Tuple4<int64_t, N> {
591 int64x1x4_t raw;
592};
593
594template <>
595struct Tuple4<float32_t, 4> {
596 float32x4x4_t raw;
597};
598template <size_t N>
599struct Tuple4<float32_t, N> {
600 float32x2x4_t raw;
601};
602#if HWY_HAVE_FLOAT64
603template <>
604struct Tuple4<float64_t, 2> {
605 float64x2x4_t raw;
606};
607template <size_t N>
608struct Tuple4<float64_t, N> {
609 float64x1x4_t raw;
610};
611#endif // HWY_HAVE_FLOAT64
612
613template <typename T, size_t N>
614struct Raw128;
615
616template <>
617struct Raw128<uint8_t, 16> {
618 using type = uint8x16_t;
619};
620template <size_t N>
621struct Raw128<uint8_t, N> {
622 using type = uint8x8_t;
623};
624
625template <>
626struct Raw128<uint16_t, 8> {
627 using type = uint16x8_t;
628};
629template <size_t N>
630struct Raw128<uint16_t, N> {
631 using type = uint16x4_t;
632};
633
634template <>
635struct Raw128<uint32_t, 4> {
636 using type = uint32x4_t;
637};
638template <size_t N>
639struct Raw128<uint32_t, N> {
640 using type = uint32x2_t;
641};
642
643template <>
644struct Raw128<uint64_t, 2> {
645 using type = uint64x2_t;
646};
647template <>
648struct Raw128<uint64_t, 1> {
649 using type = uint64x1_t;
650};
651
652template <>
653struct Raw128<int8_t, 16> {
654 using type = int8x16_t;
655};
656template <size_t N>
657struct Raw128<int8_t, N> {
658 using type = int8x8_t;
659};
660
661template <>
662struct Raw128<int16_t, 8> {
663 using type = int16x8_t;
664};
665template <size_t N>
666struct Raw128<int16_t, N> {
667 using type = int16x4_t;
668};
669
670template <>
671struct Raw128<int32_t, 4> {
672 using type = int32x4_t;
673};
674template <size_t N>
675struct Raw128<int32_t, N> {
676 using type = int32x2_t;
677};
678
679template <>
680struct Raw128<int64_t, 2> {
681 using type = int64x2_t;
682};
683template <>
684struct Raw128<int64_t, 1> {
685 using type = int64x1_t;
686};
687
688template <>
689struct Raw128<float, 4> {
690 using type = float32x4_t;
691};
692template <size_t N>
693struct Raw128<float, N> {
694 using type = float32x2_t;
695};
696
697#if HWY_HAVE_FLOAT64
698template <>
699struct Raw128<double, 2> {
700 using type = float64x2_t;
701};
702template <>
703struct Raw128<double, 1> {
704 using type = float64x1_t;
705};
706#endif // HWY_HAVE_FLOAT64
707
708#if HWY_NEON_HAVE_F16C
709
710template <>
711struct Tuple2<float16_t, 8> {
712 float16x8x2_t raw;
713};
714template <size_t N>
715struct Tuple2<float16_t, N> {
716 float16x4x2_t raw;
717};
718
719template <>
720struct Tuple3<float16_t, 8> {
721 float16x8x3_t raw;
722};
723template <size_t N>
724struct Tuple3<float16_t, N> {
725 float16x4x3_t raw;
726};
727
728template <>
729struct Tuple4<float16_t, 8> {
730 float16x8x4_t raw;
731};
732template <size_t N>
733struct Tuple4<float16_t, N> {
734 float16x4x4_t raw;
735};
736
737template <>
738struct Raw128<float16_t, 8> {
739 using type = float16x8_t;
740};
741template <size_t N>
742struct Raw128<float16_t, N> {
743 using type = float16x4_t;
744};
745
746#else // !HWY_NEON_HAVE_F16C
747
748template <size_t N>
749struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
750template <size_t N>
751struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
752template <size_t N>
753struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
754template <size_t N>
755struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};
756
757#endif // HWY_NEON_HAVE_F16C
758
759#if HWY_NEON_HAVE_BFLOAT16
760
761template <>
762struct Tuple2<bfloat16_t, 8> {
763 bfloat16x8x2_t raw;
764};
765template <size_t N>
766struct Tuple2<bfloat16_t, N> {
767 bfloat16x4x2_t raw;
768};
769
770template <>
771struct Tuple3<bfloat16_t, 8> {
772 bfloat16x8x3_t raw;
773};
774template <size_t N>
775struct Tuple3<bfloat16_t, N> {
776 bfloat16x4x3_t raw;
777};
778
779template <>
780struct Tuple4<bfloat16_t, 8> {
781 bfloat16x8x4_t raw;
782};
783template <size_t N>
784struct Tuple4<bfloat16_t, N> {
785 bfloat16x4x4_t raw;
786};
787
788template <>
789struct Raw128<bfloat16_t, 8> {
790 using type = bfloat16x8_t;
791};
792template <size_t N>
793struct Raw128<bfloat16_t, N> {
794 using type = bfloat16x4_t;
795};
796
797#else // !HWY_NEON_HAVE_BFLOAT16
798
799template <size_t N>
800struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
801template <size_t N>
802struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
803template <size_t N>
804struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
805template <size_t N>
806struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};
807
808#endif // HWY_NEON_HAVE_BFLOAT16
809
810} // namespace detail
811
812template <typename T, size_t N = 16 / sizeof(T)>
813class Vec128 {
814 public:
815 using Raw = typename detail::Raw128<T, N>::type;
816 using PrivateT = T; // only for DFromV
817 static constexpr size_t kPrivateN = N; // only for DFromV
818
819 HWY_INLINE Vec128() {}
820 Vec128(const Vec128&) = default;
821 Vec128& operator=(const Vec128&) = default;
822 HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
823
824 // Compound assignment. Only usable if there is a corresponding non-member
825 // binary operator overload. For example, only f32 and f64 support division.
826 HWY_INLINE Vec128& operator*=(const Vec128 other) {
827 return *this = (*this * other);
828 }
829 HWY_INLINE Vec128& operator/=(const Vec128 other) {
830 return *this = (*this / other);
831 }
832 HWY_INLINE Vec128& operator+=(const Vec128 other) {
833 return *this = (*this + other);
834 }
835 HWY_INLINE Vec128& operator-=(const Vec128 other) {
836 return *this = (*this - other);
837 }
838 HWY_INLINE Vec128& operator%=(const Vec128 other) {
839 return *this = (*this % other);
840 }
841 HWY_INLINE Vec128& operator&=(const Vec128 other) {
842 return *this = (*this & other);
843 }
844 HWY_INLINE Vec128& operator|=(const Vec128 other) {
845 return *this = (*this | other);
846 }
847 HWY_INLINE Vec128& operator^=(const Vec128 other) {
848 return *this = (*this ^ other);
849 }
850
851 Raw raw;
852};
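// Note (added for exposition; not part of the original header): Vec128<T, N>
// is a thin wrapper around the native register, e.g. Vec128<float, 4> holds a
// float32x4_t (Q register) and Vec128<float, 2> holds a float32x2_t (D
// register) in .raw. The asserts below only document the expected sizes.
static_assert(sizeof(detail::Raw128<float, 4>::type) == 16,
              "full vectors wrap a 128-bit NEON register");
static_assert(sizeof(detail::Raw128<float, 2>::type) == 8,
              "partial vectors wrap a 64-bit NEON register");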
853
854template <typename T>
855using Vec64 = Vec128<T, 8 / sizeof(T)>;
856
857template <typename T>
858using Vec32 = Vec128<T, 4 / sizeof(T)>;
859
860template <typename T>
861using Vec16 = Vec128<T, 2 / sizeof(T)>;
862
863// FF..FF or 0.
864template <typename T, size_t N = 16 / sizeof(T)>
865class Mask128 {
866 // Arm C Language Extensions return and expect unsigned type.
867 using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
868
869 public:
870 using PrivateT = T; // only for DFromM
871 static constexpr size_t kPrivateN = N; // only for DFromM
872
873 HWY_INLINE Mask128() {}
874 Mask128(const Mask128&) = default;
875 Mask128& operator=(const Mask128&) = default;
876 HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
877
878 Raw raw;
879};
880
881template <typename T>
882using Mask64 = Mask128<T, 8 / sizeof(T)>;
883
884template <class V>
885using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
886
887template <class M>
888using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
889
890template <class V>
891using TFromV = typename V::PrivateT;
892
893// ------------------------------ Set
894
895namespace detail {
896// We want to route any combination of N/kPow2 to the intrinsics depending on
897// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is
898// unconditional and currently does not accept inputs (such as whether the
899// vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for
900// SFINAE. We instead define a private NativeSet which receives a Simd<> whose
901// kPow2 has already been folded into its N.
902#define HWY_NEON_BUILD_TPL_HWY_SET
903#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size>
904#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \
905 Simd<type##_t, size, 0> /* tag */, type##_t t
906#define HWY_NEON_BUILD_ARG_HWY_SET t
907
909#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
911#endif
913
914template <class D, HWY_NEON_IF_EMULATED_D(D)>
915HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
916 const uint16_t tu = BitCastScalar<uint16_t>(t);
917 return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
918}
919
920#undef HWY_NEON_BUILD_TPL_HWY_SET
921#undef HWY_NEON_BUILD_RET_HWY_SET
922#undef HWY_NEON_BUILD_PARAM_HWY_SET
923#undef HWY_NEON_BUILD_ARG_HWY_SET
924
925} // namespace detail
926
927// Full vector. Cannot yet use VFromD because that is defined in terms of Set.
928// Do not use a typename T = TFromD<D> argument because T will be deduced from
929// the actual argument type, which can differ from TFromD<D>.
930template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
931HWY_INLINE Vec128<TFromD<D>> Set(D /* tag */, T t) {
932 return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
933}
934
935// Partial vector: create 64-bit and return wrapper.
936template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
937HWY_API Vec128<TFromD<D>, MaxLanes(D())> Set(D /* tag */, T t) {
938 const Full64<TFromD<D>> dfull;
939 return Vec128<TFromD<D>, MaxLanes(D())>(
940 detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
941}
942
943template <class D>
944using VFromD = decltype(Set(D(), TFromD<D>()));
945
946template <class D>
947HWY_API VFromD<D> Zero(D d) {
948 // Default ctor also works for bfloat16_t and float16_t.
949 return Set(d, TFromD<D>{});
950}
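// Usage sketch (added for exposition; not part of the original header): Set
// and Zero are the building blocks for lane-type-agnostic code. The helper
// name ExampleSplatOrZero is hypothetical.
template <class D>
HWY_API VFromD<D> ExampleSplatOrZero(D d, TFromD<D> t, bool use_zero) {
  return use_zero ? Zero(d) : Set(d, t);
}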
951
952HWY_DIAGNOSTICS(push)
953HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
954#if HWY_COMPILER_GCC_ACTUAL
955HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
956#endif
957
958template <class D>
959HWY_API VFromD<D> Undefined(D /*tag*/) {
960 VFromD<D> v;
961 return v;
962}
963
964HWY_DIAGNOSTICS(pop)
965
966#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
967namespace detail {
968
969#pragma pack(push, 1)
970
971template <class T>
972struct alignas(8) Vec64ValsWrapper {
973 static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
974 static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
975 T vals[8 / sizeof(T)];
976};
977
978#pragma pack(pop)
979
980} // namespace detail
981#endif // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
982
983template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
984HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
985 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
986 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
987 TFromD<D> /*t8*/, TFromD<D> /*t9*/,
988 TFromD<D> /*t10*/, TFromD<D> /*t11*/,
989 TFromD<D> /*t12*/, TFromD<D> /*t13*/,
990 TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
991#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
992 typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
993 (void)d;
994 const GccI8RawVectType raw = {
995 static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
996 static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
997 static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
998 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
999#else
1000 return ResizeBitCast(
1001 d, Set(Full64<uint64_t>(),
1002 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
1003 {t0, t1, t2, t3, t4, t5, t6, t7}})));
1004#endif
1005}
1006
1007template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
1008HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1009 TFromD<D> t2, TFromD<D> t3,
1010 TFromD<D> /*t4*/, TFromD<D> /*t5*/,
1011 TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
1012#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1013 typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
1014 (void)d;
1015 const GccI16RawVectType raw = {
1016 static_cast<int16_t>(t0), static_cast<int16_t>(t1),
1017 static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
1018 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1019#else
1020 return ResizeBitCast(
1021 d, Set(Full64<uint64_t>(),
1022 BitCastScalar<uint64_t>(
1023 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
1024#endif
1025}
1026
1027template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
1028HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1029 TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
1030#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1031 typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
1032 (void)d;
1033 const GccI32RawVectType raw = {static_cast<int32_t>(t0),
1034 static_cast<int32_t>(t1)};
1035 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1036#else
1037 return ResizeBitCast(d,
1038 Set(Full64<uint64_t>(),
1039 BitCastScalar<uint64_t>(
1040 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
1041#endif
1042}
1043
1044template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
1045HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1046 TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
1047#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1048 typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
1049 (void)d;
1050 const GccF32RawVectType raw = {t0, t1};
1051 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1052#else
1053 return ResizeBitCast(d,
1054 Set(Full64<uint64_t>(),
1055 BitCastScalar<uint64_t>(
1056 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
1057#endif
1058}
1059
1060template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
1061HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
1062 return Set(d, t0);
1063}
1064
1065template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
1066HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1067 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
1068 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
1069 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
1070 TFromD<D> t11, TFromD<D> t12,
1071 TFromD<D> t13, TFromD<D> t14,
1072 TFromD<D> t15) {
1073#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1074 typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
1075 (void)d;
1076 const GccI8RawVectType raw = {
1077 static_cast<int8_t>(t0), static_cast<int8_t>(t1),
1078 static_cast<int8_t>(t2), static_cast<int8_t>(t3),
1079 static_cast<int8_t>(t4), static_cast<int8_t>(t5),
1080 static_cast<int8_t>(t6), static_cast<int8_t>(t7),
1081 static_cast<int8_t>(t8), static_cast<int8_t>(t9),
1082 static_cast<int8_t>(t10), static_cast<int8_t>(t11),
1083 static_cast<int8_t>(t12), static_cast<int8_t>(t13),
1084 static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
1085 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1086#else
1087 const Half<decltype(d)> dh;
1088 return Combine(d,
1089 Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
1090 t8, t9, t10, t11, t12, t13, t14, t15),
1091 Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
1092 t2, t3, t4, t5, t6, t7));
1093#endif
1094}
1095
1096template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
1097HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1098 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
1099 TFromD<D> t5, TFromD<D> t6,
1100 TFromD<D> t7) {
1101#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1102 typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
1103 (void)d;
1104 const GccI16RawVectType raw = {
1105 static_cast<int16_t>(t0), static_cast<int16_t>(t1),
1106 static_cast<int16_t>(t2), static_cast<int16_t>(t3),
1107 static_cast<int16_t>(t4), static_cast<int16_t>(t5),
1108 static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
1109 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1110#else
1111 const Half<decltype(d)> dh;
1112 return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
1113 Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
1114#endif
1115}
1116
1117template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
1118HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1119 TFromD<D> t2, TFromD<D> t3) {
1120#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1121 typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
1122 (void)d;
1123 const GccI32RawVectType raw = {
1124 static_cast<int32_t>(t0), static_cast<int32_t>(t1),
1125 static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
1126 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1127#else
1128 const Half<decltype(d)> dh;
1129 return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
1130 Dup128VecFromValues(dh, t0, t1, t0, t1));
1131#endif
1132}
1133
1134template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
1135HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1136 TFromD<D> t2, TFromD<D> t3) {
1137#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1138 typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
1139 (void)d;
1140 const GccF32RawVectType raw = {t0, t1, t2, t3};
1141 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1142#else
1143 const Half<decltype(d)> dh;
1144 return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
1145 Dup128VecFromValues(dh, t0, t1, t0, t1));
1146#endif
1147}
1148
1149template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
1150HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
1151#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1152 typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
1153 (void)d;
1154 const GccI64RawVectType raw = {static_cast<int64_t>(t0),
1155 static_cast<int64_t>(t1)};
1156 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1157#else
1158 const Half<decltype(d)> dh;
1159 return Combine(d, Set(dh, t1), Set(dh, t0));
1160#endif
1161}
1162
1163#if HWY_HAVE_FLOAT64
1164template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
1165HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
1166#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
1167 typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
1168 (void)d;
1169 const GccF64RawVectType raw = {t0, t1};
1170 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1171#else
1172 const Half<decltype(d)> dh;
1173 return Combine(d, Set(dh, t1), Set(dh, t0));
1174#endif
1175}
1176#endif
1177
1178// Generic for all vector lengths
1179template <class D, HWY_IF_BF16_D(D)>
1180HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1181 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
1182 TFromD<D> t5, TFromD<D> t6,
1183 TFromD<D> t7) {
1184 const RebindToSigned<decltype(d)> di;
1185 return BitCast(d,
1186 Dup128VecFromValues(
1187 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
1188 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
1189 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
1190 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
1191}
1192
1193#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
1194template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
1195HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1196 TFromD<D> t2, TFromD<D> t3,
1197 TFromD<D> /*t4*/, TFromD<D> /*t5*/,
1198 TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
1199 typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
1200 (void)d;
1201 const GccF16RawVectType raw = {
1202 static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
1203 static_cast<__fp16>(t3)};
1204 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1205}
1206template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
1207HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1208 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
1209 TFromD<D> t5, TFromD<D> t6,
1210 TFromD<D> t7) {
1211 typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
1212 (void)d;
1213 const GccF16RawVectType raw = {
1214 static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
1215 static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
1216 static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
1217 return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
1218}
1219#else
1220// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
1221template <class D, HWY_IF_F16_D(D)>
1222HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1223 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
1224 TFromD<D> t5, TFromD<D> t6,
1225 TFromD<D> t7) {
1226 const RebindToSigned<decltype(d)> di;
1227 return BitCast(d,
1228 Dup128VecFromValues(
1229 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
1230 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
1231 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
1232 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
1233}
1234#endif // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
1235
1236namespace detail {
1237
1238template <class D, HWY_IF_T_SIZE_D(D, 1)>
1239HWY_INLINE VFromD<D> Iota0(D d) {
1240 return Dup128VecFromValues(
1241 d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
1242 TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
1243 TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
1244 TFromD<D>{15});
1245}
1246
1247template <class D, HWY_IF_UI16_D(D)>
1248HWY_INLINE VFromD<D> Iota0(D d) {
1249 return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
1250 TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
1251 TFromD<D>{6}, TFromD<D>{7});
1252}
1253
1254template <class D, HWY_IF_F16_D(D)>
1255HWY_INLINE VFromD<D> Iota0(D d) {
1256 const RebindToUnsigned<decltype(d)> du;
1257 return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
1258 uint16_t{0x4000}, uint16_t{0x4200},
1259 uint16_t{0x4400}, uint16_t{0x4500},
1260 uint16_t{0x4600}, uint16_t{0x4700}));
1261}
1262
1263template <class D, HWY_IF_T_SIZE_D(D, 4)>
1264HWY_INLINE VFromD<D> Iota0(D d) {
1265 return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
1266 TFromD<D>{3});
1267}
1268
1269template <class D, HWY_IF_T_SIZE_D(D, 8)>
1270HWY_INLINE VFromD<D> Iota0(D d) {
1271 return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
1272}
1273
1274#if HWY_COMPILER_MSVC
1275template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
1276static HWY_INLINE V MaskOutIota(V v) {
1277 constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>);
1278 constexpr uint64_t kU64MaskOutMask =
1279 hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>();
1280
1281 const DFromV<decltype(v)> d;
1282 const Repartition<uint8_t, decltype(d)> du8;
1283 using VU8 = VFromD<decltype(du8)>;
1284 const auto mask_out_mask =
1285 BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask))));
1286 return v & mask_out_mask;
1287}
1288template <class V, HWY_IF_V_SIZE_GT_V(V, 4)>
1289static HWY_INLINE V MaskOutIota(V v) {
1290 return v;
1291}
1292#endif
1293
1294} // namespace detail
1295
1296template <class D, typename T2>
1297HWY_API VFromD<D> Iota(D d, const T2 first) {
1298 const auto result_iota =
1299 detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
1300#if HWY_COMPILER_MSVC
1301 return detail::MaskOutIota(result_iota);
1302#else
1303 return result_iota;
1304#endif
1305}
1306
1307// ------------------------------ Tuple (VFromD)
1308#include "hwy/ops/tuple-inl.h"
1309
1310// ------------------------------ Combine
1311
1312// Full result
1313template <class D, HWY_IF_U8_D(D)>
1314HWY_API Vec128<uint8_t> Combine(D /* tag */, Vec64<uint8_t> hi,
1315 Vec64<uint8_t> lo) {
1316 return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
1317}
1318template <class D, HWY_IF_U16_D(D)>
1319HWY_API Vec128<uint16_t> Combine(D /* tag */, Vec64<uint16_t> hi,
1320 Vec64<uint16_t> lo) {
1321 return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
1322}
1323template <class D, HWY_IF_U32_D(D)>
1324HWY_API Vec128<uint32_t> Combine(D /* tag */, Vec64<uint32_t> hi,
1325 Vec64<uint32_t> lo) {
1326 return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
1327}
1328template <class D, HWY_IF_U64_D(D)>
1329HWY_API Vec128<uint64_t> Combine(D /* tag */, Vec64<uint64_t> hi,
1330 Vec64<uint64_t> lo) {
1331 return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
1332}
1333
1334template <class D, HWY_IF_I8_D(D)>
1335HWY_API Vec128<int8_t> Combine(D /* tag */, Vec64<int8_t> hi,
1336 Vec64<int8_t> lo) {
1337 return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
1338}
1339template <class D, HWY_IF_I16_D(D)>
1340HWY_API Vec128<int16_t> Combine(D /* tag */, Vec64<int16_t> hi,
1341 Vec64<int16_t> lo) {
1342 return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
1343}
1344template <class D, HWY_IF_I32_D(D)>
1345HWY_API Vec128<int32_t> Combine(D /* tag */, Vec64<int32_t> hi,
1346 Vec64<int32_t> lo) {
1347 return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
1348}
1349template <class D, HWY_IF_I64_D(D)>
1350HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
1351 Vec64<int64_t> lo) {
1352 return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
1353}
1354
1355#if HWY_HAVE_FLOAT16
1356template <class D, HWY_IF_F16_D(D)>
1357HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
1358 return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
1359}
1360#endif // HWY_HAVE_FLOAT16
1361
1362#if HWY_NEON_HAVE_BFLOAT16
1363template <class D, HWY_IF_BF16_D(D)>
1364HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
1365 return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
1366}
1367#endif // HWY_NEON_HAVE_BFLOAT16
1368
1369template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
1370HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
1371 const RebindToUnsigned<D> du;
1372 const Half<decltype(du)> duh;
1373 return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
1374}
1375
1376template <class D, HWY_IF_F32_D(D)>
1377HWY_API Vec128<float> Combine(D /* tag */, Vec64<float> hi, Vec64<float> lo) {
1378 return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
1379}
1380#if HWY_HAVE_FLOAT64
1381template <class D, HWY_IF_F64_D(D)>
1382HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
1383 Vec64<double> lo) {
1384 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
1385}
1386#endif // HWY_HAVE_FLOAT64
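// Usage sketch (added for exposition; not part of the original header):
// Combine concatenates two 64-bit halves into a full 128-bit vector, so a
// ZeroExtendVector-style helper can be written as follows. The helper name
// ExampleZeroExtend is hypothetical.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ExampleZeroExtend(D d, VFromD<Half<D>> lo) {
  const Half<D> dh;
  return Combine(d, Zero(dh), lo);  // upper half cleared, lower half kept
}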
1387
1388// ------------------------------ BitCast
1389
1390namespace detail {
1391
1392// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
1393// vreinterpret*_u8_*() set of functions.
1394#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
1395#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
1396 Vec128<uint8_t, size * sizeof(type##_t)>
1397#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
1398#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
1399
1400// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
1401template <size_t N>
1405
1407 HWY_CAST_TO_U8)
1409 HWY_CAST_TO_U8)
1410
1415
1416#if !HWY_HAVE_FLOAT16
1417#if HWY_NEON_HAVE_F16C
1419 HWY_CAST_TO_U8)
1420#else
1421template <size_t N>
1425#endif // HWY_NEON_HAVE_F16C
1426#endif // !HWY_HAVE_FLOAT16
1427
1428#if !HWY_NEON_HAVE_BFLOAT16
1429template <size_t N>
1433#endif // !HWY_NEON_HAVE_BFLOAT16
1434
1435#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
1436#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
1437#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
1438#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
1439
1440template <class D, HWY_IF_U8_D(D)>
1441HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
1442 return v;
1443}
1444
1445// 64-bit or less:
1446
1447template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
1448HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1449 VFromD<Repartition<uint8_t, D>> v) {
1450 return VFromD<D>(vreinterpret_s8_u8(v.raw));
1451}
1452template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
1453HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1454 VFromD<Repartition<uint8_t, D>> v) {
1455 return VFromD<D>(vreinterpret_u16_u8(v.raw));
1456}
1457template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
1458HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1459 VFromD<Repartition<uint8_t, D>> v) {
1460 return VFromD<D>(vreinterpret_s16_u8(v.raw));
1461}
1462template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
1463HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1464 VFromD<Repartition<uint8_t, D>> v) {
1465 return VFromD<D>(vreinterpret_u32_u8(v.raw));
1466}
1467template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
1468HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1469 VFromD<Repartition<uint8_t, D>> v) {
1470 return VFromD<D>(vreinterpret_s32_u8(v.raw));
1471}
1472
1473template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
1474HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1475 return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
1476}
1477template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
1478HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1479 return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
1480}
1481
1482// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
1483template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
1484HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1485#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1486 return VFromD<D>(vreinterpret_f16_u8(v.raw));
1487#else
1488 const RebindToUnsigned<D> du;
1489 return VFromD<D>(BitCastFromByte(du, v).raw);
1490#endif
1491}
1492
1493template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
1494HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1495#if HWY_NEON_HAVE_BFLOAT16
1496 return VFromD<D>(vreinterpret_bf16_u8(v.raw));
1497#else
1498 const RebindToUnsigned<D> du;
1499 return VFromD<D>(BitCastFromByte(du, v).raw);
1500#endif
1501}
1502
1503template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
1504HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1505 VFromD<Repartition<uint8_t, D>> v) {
1506 return VFromD<D>(vreinterpret_f32_u8(v.raw));
1507}
1508
1509#if HWY_HAVE_FLOAT64
1510template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
1511HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1512 return Vec64<double>(vreinterpret_f64_u8(v.raw));
1513}
1514#endif // HWY_HAVE_FLOAT64
1515
1516// 128-bit full:
1517
1518template <class D, HWY_IF_I8_D(D)>
1519HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1520 return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
1521}
1522template <class D, HWY_IF_U16_D(D)>
1523HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1524 return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
1525}
1526template <class D, HWY_IF_I16_D(D)>
1527HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1528 return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
1529}
1530template <class D, HWY_IF_U32_D(D)>
1531HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1532 return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
1533}
1534template <class D, HWY_IF_I32_D(D)>
1535HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1536 return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
1537}
1538template <class D, HWY_IF_U64_D(D)>
1539HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1540 return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
1541}
1542template <class D, HWY_IF_I64_D(D)>
1543HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1544 return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
1545}
1546
1547template <class D, HWY_IF_F32_D(D)>
1548HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1549 return Vec128<float>(vreinterpretq_f32_u8(v.raw));
1550}
1551
1552#if HWY_HAVE_FLOAT64
1553template <class D, HWY_IF_F64_D(D)>
1554HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1555 return Vec128<double>(vreinterpretq_f64_u8(v.raw));
1556}
1557#endif // HWY_HAVE_FLOAT64
1558
1559// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
1560template <class D, HWY_IF_F16_D(D)>
1561HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1562#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1563 return VFromD<D>(vreinterpretq_f16_u8(v.raw));
1564#else
1565 return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
1566#endif
1567}
1568
1569template <class D, HWY_IF_BF16_D(D)>
1570HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1571#if HWY_NEON_HAVE_BFLOAT16
1572 return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
1573#else
1574 return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
1575#endif
1576}
1577
1578} // namespace detail
1579
1580template <class D, class FromT>
1581HWY_API VFromD<D> BitCast(D d,
1582 Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
1583 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
1584}
1585
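// Usage sketch (added for exposition; not part of the original header):
// BitCast reinterprets the register bits under a new lane type of the same
// overall size, e.g. viewing float lanes as uint32_t. The helper name
// ExampleFloatBitsToU32 is hypothetical.
template <class D, HWY_IF_U32_D(D)>
HWY_API VFromD<D> ExampleFloatBitsToU32(D du32, VFromD<RebindToFloat<D>> vf) {
  return BitCast(du32, vf);  // no lane conversion, only reinterpretation
}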
1586// ------------------------------ ResizeBitCast
1587
1588// <= 8 byte vector to <= 8 byte vector
1589template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
1590 HWY_IF_V_SIZE_LE_D(D, 8)>
1591HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1592 const Repartition<uint8_t, decltype(d)> du8;
1593 return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
1594}
1595
1596// 16-byte vector to 16-byte vector: same as BitCast
1597template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
1598 HWY_IF_V_SIZE_D(D, 16)>
1599HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1600 return BitCast(d, v);
1601}
1602
1603// 16-byte vector to <= 8-byte vector
1604template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
1605 HWY_IF_V_SIZE_LE_D(D, 8)>
1606HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1607 const DFromV<decltype(v)> d_from;
1608 const Half<decltype(d_from)> dh_from;
1609 return ResizeBitCast(d, LowerHalf(dh_from, v));
1610}
1611
1612// <= 8-byte vector to 16-byte vector
1613template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
1614 HWY_IF_V_SIZE_D(D, 16)>
1615HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1616 const Full64<TFromV<FromV>> d_full64_from;
1617 const Full128<TFromV<FromV>> d_full128_from;
1618 return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
1619 ResizeBitCast(d_full64_from, v)));
1620}
1621
1622// ------------------------------ GetLane
1623
1624namespace detail {
1625#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
1626#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
1627#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
1628#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
1629
1630HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
1631HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)
1632
1633template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
1634HWY_API TFromV<V> GetLane(V v) {
1635 const DFromV<decltype(v)> d;
1636 const RebindToUnsigned<decltype(d)> du;
1637 return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
1638}
1639
1640#undef HWY_NEON_BUILD_TPL_HWY_GET
1641#undef HWY_NEON_BUILD_RET_HWY_GET
1642#undef HWY_NEON_BUILD_PARAM_HWY_GET
1643#undef HWY_NEON_BUILD_ARG_HWY_GET
1644
1645} // namespace detail
1646
1647template <class V>
1648HWY_API TFromV<V> GetLane(const V v) {
1649 return detail::GetLane<0>(v);
1650}
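// Usage sketch (added for exposition; not part of the original header):
// GetLane returns lane 0 as a scalar; ExtractLane below additionally accepts
// a runtime lane index. The helper name ExampleFirstLane is hypothetical.
template <class V>
HWY_API TFromV<V> ExampleFirstLane(V v) {
  return GetLane(v);
}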
1651
1652// ------------------------------ ExtractLane
1653
1654// Requires one overload per vector length because GetLane<3> is a compile error
1655// if v is a uint32x2_t.
1656template <typename T>
1657HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
1658 HWY_DASSERT(i == 0);
1659 (void)i;
1660 return detail::GetLane<0>(v);
1661}
1662
1663template <typename T>
1664HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
1665#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1666 if (__builtin_constant_p(i)) {
1667 switch (i) {
1668 case 0:
1669 return detail::GetLane<0>(v);
1670 case 1:
1671 return detail::GetLane<1>(v);
1672 }
1673 }
1674#endif
1675 alignas(16) T lanes[2];
1676 Store(v, DFromV<decltype(v)>(), lanes);
1677 return lanes[i];
1678}
1679
1680template <typename T>
1681HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
1682#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1683 if (__builtin_constant_p(i)) {
1684 switch (i) {
1685 case 0:
1686 return detail::GetLane<0>(v);
1687 case 1:
1688 return detail::GetLane<1>(v);
1689 case 2:
1690 return detail::GetLane<2>(v);
1691 case 3:
1692 return detail::GetLane<3>(v);
1693 }
1694 }
1695#endif
1696 alignas(16) T lanes[4];
1697 Store(v, DFromV<decltype(v)>(), lanes);
1698 return lanes[i];
1699}
1700
1701template <typename T>
1702HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
1703#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1704 if (__builtin_constant_p(i)) {
1705 switch (i) {
1706 case 0:
1707 return detail::GetLane<0>(v);
1708 case 1:
1709 return detail::GetLane<1>(v);
1710 case 2:
1711 return detail::GetLane<2>(v);
1712 case 3:
1713 return detail::GetLane<3>(v);
1714 case 4:
1715 return detail::GetLane<4>(v);
1716 case 5:
1717 return detail::GetLane<5>(v);
1718 case 6:
1719 return detail::GetLane<6>(v);
1720 case 7:
1721 return detail::GetLane<7>(v);
1722 }
1723 }
1724#endif
1725 alignas(16) T lanes[8];
1726 Store(v, DFromV<decltype(v)>(), lanes);
1727 return lanes[i];
1728}
1729
1730template <typename T>
1731HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
1732#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1733 if (__builtin_constant_p(i)) {
1734 switch (i) {
1735 case 0:
1736 return detail::GetLane<0>(v);
1737 case 1:
1738 return detail::GetLane<1>(v);
1739 case 2:
1740 return detail::GetLane<2>(v);
1741 case 3:
1742 return detail::GetLane<3>(v);
1743 case 4:
1744 return detail::GetLane<4>(v);
1745 case 5:
1746 return detail::GetLane<5>(v);
1747 case 6:
1748 return detail::GetLane<6>(v);
1749 case 7:
1750 return detail::GetLane<7>(v);
1751 case 8:
1752 return detail::GetLane<8>(v);
1753 case 9:
1754 return detail::GetLane<9>(v);
1755 case 10:
1756 return detail::GetLane<10>(v);
1757 case 11:
1758 return detail::GetLane<11>(v);
1759 case 12:
1760 return detail::GetLane<12>(v);
1761 case 13:
1762 return detail::GetLane<13>(v);
1763 case 14:
1764 return detail::GetLane<14>(v);
1765 case 15:
1766 return detail::GetLane<15>(v);
1767 }
1768 }
1769#endif
1770 alignas(16) T lanes[16];
1771 Store(v, DFromV<decltype(v)>(), lanes);
1772 return lanes[i];
1773}
1774
1775// ------------------------------ InsertLane
1776
1777namespace detail {
1778#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1779#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1780#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1781 Vec128<type##_t, size> v, type##_t t
1782#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1783
1784HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
1785HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT)
1786
1787#undef HWY_NEON_BUILD_TPL_HWY_INSERT
1788#undef HWY_NEON_BUILD_RET_HWY_INSERT
1789#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1790#undef HWY_NEON_BUILD_ARG_HWY_INSERT
1791
1792template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
1793HWY_API V InsertLane(V v, TFromV<V> t) {
1794 const D d;
1795 const RebindToUnsigned<D> du;
1796 const uint16_t tu = BitCastScalar<uint16_t>(t);
1797 return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
1798}
1799
1800} // namespace detail
1801
1802// Requires one overload per vector length because InsertLane<3> may be a
1803// compile error.
1804
1805template <typename T>
1806HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
1807 HWY_DASSERT(i == 0);
1808 (void)i;
1809 return Set(DFromV<decltype(v)>(), t);
1810}
1811
1812template <typename T>
1813HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
1814#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1815 if (__builtin_constant_p(i)) {
1816 switch (i) {
1817 case 0:
1818 return detail::InsertLane<0>(v, t);
1819 case 1:
1820 return detail::InsertLane<1>(v, t);
1821 }
1822 }
1823#endif
1824 const DFromV<decltype(v)> d;
1825 alignas(16) T lanes[2];
1826 Store(v, d, lanes);
1827 lanes[i] = t;
1828 return Load(d, lanes);
1829}
1830
1831template <typename T>
1832HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
1834#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1834 if (__builtin_constant_p(i)) {
1835 switch (i) {
1836 case 0:
1837 return detail::InsertLane<0>(v, t);
1838 case 1:
1839 return detail::InsertLane<1>(v, t);
1840 case 2:
1841 return detail::InsertLane<2>(v, t);
1842 case 3:
1843 return detail::InsertLane<3>(v, t);
1844 }
1845 }
1846#endif
1847 const DFromV<decltype(v)> d;
1848 alignas(16) T lanes[4];
1849 Store(v, d, lanes);
1850 lanes[i] = t;
1851 return Load(d, lanes);
1852}
1853
1854template <typename T>
1855HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
1856#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1857 if (__builtin_constant_p(i)) {
1858 switch (i) {
1859 case 0:
1860 return detail::InsertLane<0>(v, t);
1861 case 1:
1862 return detail::InsertLane<1>(v, t);
1863 case 2:
1864 return detail::InsertLane<2>(v, t);
1865 case 3:
1866 return detail::InsertLane<3>(v, t);
1867 case 4:
1868 return detail::InsertLane<4>(v, t);
1869 case 5:
1870 return detail::InsertLane<5>(v, t);
1871 case 6:
1872 return detail::InsertLane<6>(v, t);
1873 case 7:
1874 return detail::InsertLane<7>(v, t);
1875 }
1876 }
1877#endif
1878 const DFromV<decltype(v)> d;
1879 alignas(16) T lanes[8];
1880 Store(v, d, lanes);
1881 lanes[i] = t;
1882 return Load(d, lanes);
1883}
1884
1885template <typename T>
1886HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
1887#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
1888 if (__builtin_constant_p(i)) {
1889 switch (i) {
1890 case 0:
1891 return detail::InsertLane<0>(v, t);
1892 case 1:
1893 return detail::InsertLane<1>(v, t);
1894 case 2:
1895 return detail::InsertLane<2>(v, t);
1896 case 3:
1897 return detail::InsertLane<3>(v, t);
1898 case 4:
1899 return detail::InsertLane<4>(v, t);
1900 case 5:
1901 return detail::InsertLane<5>(v, t);
1902 case 6:
1903 return detail::InsertLane<6>(v, t);
1904 case 7:
1905 return detail::InsertLane<7>(v, t);
1906 case 8:
1907 return detail::InsertLane<8>(v, t);
1908 case 9:
1909 return detail::InsertLane<9>(v, t);
1910 case 10:
1911 return detail::InsertLane<10>(v, t);
1912 case 11:
1913 return detail::InsertLane<11>(v, t);
1914 case 12:
1915 return detail::InsertLane<12>(v, t);
1916 case 13:
1917 return detail::InsertLane<13>(v, t);
1918 case 14:
1919 return detail::InsertLane<14>(v, t);
1920 case 15:
1921 return detail::InsertLane<15>(v, t);
1922 }
1923 }
1924#endif
1925 const DFromV<decltype(v)> d;
1926 alignas(16) T lanes[16];
1927 Store(v, d, lanes);
1928 lanes[i] = t;
1929 return Load(d, lanes);
1930}
1931
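// Usage sketch (illustrative only): replacing a single lane. A constant index
// uses the vset_lane path in detail::InsertLane; a runtime index goes through
// a store, scalar write and reload.
//   const Full128<uint8_t> d;                // 16 x uint8 lanes
//   Vec128<uint8_t> v = Set(d, 0);
//   v = InsertLane(v, 3, uint8_t{0xFF});     // lane 3 is now 0xFF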
1932// ================================================== ARITHMETIC
1933
1934// ------------------------------ Addition
1935HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
1936
1937// ------------------------------ Subtraction
1938HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
1939
1940// ------------------------------ SumsOf8
1941
1942HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
1943 return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
1944}
1945HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
1946 return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
1947}
1948HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
1949 return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
1950}
1951HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
1952 return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
1953}
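// Usage sketch (illustrative only): each u64 result lane is the sum of the
// eight corresponding u8 input lanes, so 16 lanes of 1 become {8, 8}:
//   const Full128<uint8_t> d8;
//   const Vec128<uint64_t> sums = SumsOf8(Set(d8, uint8_t{1}));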
1954
1955// ------------------------------ SumsOf2
1956namespace detail {
1957
1958template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1959HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1960 hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1961 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
1962}
1963
1964template <class V, HWY_IF_V_SIZE_V(V, 16)>
1965HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1966 hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1967 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
1968}
1969
1970template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1971HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1972 hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1973 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
1974}
1975
1976template <class V, HWY_IF_V_SIZE_V(V, 16)>
1977HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1978 hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1979 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
1980}
1981
1982template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1983HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1984 hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
1985 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
1986}
1987
1988template <class V, HWY_IF_V_SIZE_V(V, 16)>
1989HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1990 hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
1991 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
1992}
1993
1994template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1995HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1996 hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
1997 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw));
1998}
1999
2000template <class V, HWY_IF_V_SIZE_V(V, 16)>
2001HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2002 hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
2003 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw));
2004}
2005
2006template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
2007HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2008 hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2009 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw));
2010}
2011
2012template <class V, HWY_IF_V_SIZE_V(V, 16)>
2013HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2014 hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2015 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw));
2016}
2017
2018template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
2019HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2020 hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2021 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw));
2022}
2023
2024template <class V, HWY_IF_V_SIZE_V(V, 16)>
2025HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2026 hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2027 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw));
2028}
2029
2030} // namespace detail
2031
2032// ------------------------------ SaturatedAdd
2033
2034#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
2035#undef HWY_NATIVE_I32_SATURATED_ADDSUB
2036#else
2037#define HWY_NATIVE_I32_SATURATED_ADDSUB
2038#endif
2039
2040#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
2041#undef HWY_NATIVE_U32_SATURATED_ADDSUB
2042#else
2043#define HWY_NATIVE_U32_SATURATED_ADDSUB
2044#endif
2045
2046#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
2047#undef HWY_NATIVE_I64_SATURATED_ADDSUB
2048#else
2049#define HWY_NATIVE_I64_SATURATED_ADDSUB
2050#endif
2051
2052#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
2053#undef HWY_NATIVE_U64_SATURATED_ADDSUB
2054#else
2055#define HWY_NATIVE_U64_SATURATED_ADDSUB
2056#endif
2057
2058// Returns a + b clamped to the destination range.
2059HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2)
2060
2061// ------------------------------ SaturatedSub
2062
2063// Returns a - b clamped to the destination range.
2064HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2)
2065
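// Usage sketch (illustrative only): saturating ops clamp instead of wrapping,
// e.g. for int8_t lanes 127 + 1 stays 127 and -128 - 1 stays -128:
//   const Full128<int8_t> d;
//   const auto hi = SaturatedAdd(Set(d, int8_t{127}), Set(d, int8_t{1}));
//   const auto lo = SaturatedSub(Set(d, int8_t{-128}), Set(d, int8_t{1}));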
2066// ------------------------------ Average
2067
2068// Returns (a + b + 1) / 2
2071
2072// ------------------------------ Neg
2073
2074HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
2075HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
2076
2077#if !HWY_HAVE_FLOAT16
2078template <size_t N>
2080 const DFromV<decltype(v)> d;
2081 const RebindToUnsigned<decltype(d)> du;
2082 using TU = TFromD<decltype(du)>;
2083 return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
2084}
2085#endif // !HWY_HAVE_FLOAT16
2086
2087// There is no vneg for bf16, but we can cast to f16 (emulated or native).
2088template <size_t N>
2090 const DFromV<decltype(v)> d;
2091 const Rebind<float16_t, decltype(d)> df16;
2092 return BitCast(d, Neg(BitCast(df16, v)));
2093}
2094
2096#if HWY_ARCH_ARM_A64
2097 return Vec64<int64_t>(vneg_s64(v.raw));
2098#else
2099 return Zero(DFromV<decltype(v)>()) - v;
2100#endif
2101}
2102
2104#if HWY_ARCH_ARM_A64
2105 return Vec128<int64_t>(vnegq_s64(v.raw));
2106#else
2107 return Zero(DFromV<decltype(v)>()) - v;
2108#endif
2109}
2110
2111// ------------------------------ SaturatedNeg
2112#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
2113#undef HWY_NATIVE_SATURATED_NEG_8_16_32
2114#else
2115#define HWY_NATIVE_SATURATED_NEG_8_16_32
2116#endif
2117
2118HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)
2119
2120#if HWY_ARCH_ARM_A64
2121#ifdef HWY_NATIVE_SATURATED_NEG_64
2122#undef HWY_NATIVE_SATURATED_NEG_64
2123#else
2124#define HWY_NATIVE_SATURATED_NEG_64
2125#endif
2126
2127HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) {
2128 return Vec64<int64_t>(vqneg_s64(v.raw));
2129}
2130
2131HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
2132 return Vec128<int64_t>(vqnegq_s64(v.raw));
2133}
2134#endif
2135
2136// ------------------------------ ShiftLeft
2137
2138// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
2139#pragma push_macro("HWY_NEON_DEF_FUNCTION")
2140#undef HWY_NEON_DEF_FUNCTION
2141#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2142 template <int kBits> \
2143 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
2144 return kBits == 0 ? v \
2145 : Vec128<type##_t, size>(HWY_NEON_EVAL( \
2146 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
2147 }
2148
2149HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)
2150
2151HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
2152HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
2153
2154#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2155
2156// ------------------------------ RotateRight (ShiftRight, Or)
2157template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
2159 const DFromV<decltype(v)> d;
2160 const RebindToUnsigned<decltype(d)> du;
2161
2162 constexpr size_t kSizeInBits = sizeof(T) * 8;
2163 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
2164 if (kBits == 0) return v;
2165
2166 return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
2167 ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
2168}
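// Usage sketch (illustrative only): bits shifted out on the right re-enter on
// the left, e.g. rotating each u32 lane right by 8:
//   const Full128<uint32_t> d;
//   const auto r = RotateRight<8>(Set(d, 0x11223344u));  // lanes = 0x44112233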
2169
2170// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
2171// mechanism for checking for extensions to Armv8.
2172
2173// ------------------------------ Shl
2174
2176 return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
2177}
2178template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
2180 Vec128<uint8_t, N> bits) {
2181 return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
2182}
2183
2185 return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
2186}
2187template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
2189 Vec128<uint16_t, N> bits) {
2190 return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
2191}
2192
2194 return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
2195}
2196template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
2198 Vec128<uint32_t, N> bits) {
2199 return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
2200}
2201
2203 return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
2204}
2206 return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
2207}
2208
2210 return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
2211}
2212template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
2217
2221template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
2226
2230template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
2235
2240 return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
2241}
2242
2243// ------------------------------ Shr (Neg)
2244
2246 const RebindToSigned<DFromV<decltype(v)>> di;
2247 const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
2248 return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
2249}
2250template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
2252 Vec128<uint8_t, N> bits) {
2253 const RebindToSigned<DFromV<decltype(v)>> di;
2254 const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
2255 return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
2256}
2257
2259 const RebindToSigned<DFromV<decltype(v)>> di;
2260 const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
2261 return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
2262}
2263template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
2265 Vec128<uint16_t, N> bits) {
2266 const RebindToSigned<DFromV<decltype(v)>> di;
2267 const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
2268 return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
2269}
2270
2272 const RebindToSigned<DFromV<decltype(v)>> di;
2273 const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
2274 return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
2275}
2276template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
2278 Vec128<uint32_t, N> bits) {
2279 const RebindToSigned<DFromV<decltype(v)>> di;
2280 const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
2281 return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
2282}
2283
2285 const RebindToSigned<DFromV<decltype(v)>> di;
2286 const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
2287 return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
2288}
2290 const RebindToSigned<DFromV<decltype(v)>> di;
2291 const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
2292 return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
2293}
2294
2296 return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
2297}
2298template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
2300 Vec128<int8_t, N> bits) {
2301 return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
2302}
2303
2305 return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
2306}
2307template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
2309 Vec128<int16_t, N> bits) {
2310 return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
2311}
2312
2314 return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
2315}
2316template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
2318 Vec128<int32_t, N> bits) {
2319 return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
2320}
2321
2323 return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
2324}
2326 return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
2327}
2328
2329// ------------------------------ ShiftLeftSame (Shl)
2330
2331template <typename T, size_t N>
2332HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
2333 return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits));
2334}
2335template <typename T, size_t N>
2336HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
2337 return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
2338}
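// Usage sketch (illustrative only): shifting every lane by an amount that is
// only known at runtime (`n` is a plain int here), via the variable shifts:
//   const Full128<uint16_t> d;
//   const auto v = Set(d, uint16_t{0x0F0F});
//   const auto l = ShiftLeftSame(v, n);
//   const auto r = ShiftRightSame(v, n);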
2339
2340// ------------------------------ Int/float multiplication
2341
2342// Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*.
2343#ifdef HWY_NATIVE_MUL_8
2344#undef HWY_NATIVE_MUL_8
2345#else
2346#define HWY_NATIVE_MUL_8
2347#endif
2348
2349// All except ui64
2350HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2)
2351HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2)
2352HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
2353
2354// ------------------------------ Integer multiplication
2355
2356// Returns the upper sizeof(T)*8 bits of a * b in each lane.
2357HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) {
2358 int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
2359#if HWY_ARCH_ARM_A64
2360 int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
2361#else
2362 int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
2363#endif
2364 return Vec128<int8_t>(
2365 vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));
2366}
2368 uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
2369#if HWY_ARCH_ARM_A64
2370 uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
2371#else
2372 uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
2373#endif
2374 return Vec128<uint8_t>(
2375 vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));
2376}
2377
2378template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
2380 int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
2381 return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo)));
2382}
2383template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
2385 uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));
2386 return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo)));
2387}
2388
2390 int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
2391#if HWY_ARCH_ARM_A64
2392 int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
2393#else
2394 int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
2395#endif
2396 return Vec128<int16_t>(
2397 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
2398}
2400 uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
2401#if HWY_ARCH_ARM_A64
2402 uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
2403#else
2404 uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
2405#endif
2406 return Vec128<uint16_t>(
2407 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
2408}
2409
2410template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
2412 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
2413 return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
2414}
2415template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
2418 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
2419 return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
2420}
2421
2423 int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
2424#if HWY_ARCH_ARM_A64
2425 int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
2426#else
2427 int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
2428#endif
2429 return Vec128<int32_t>(
2430 vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));
2431}
2433 uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
2434#if HWY_ARCH_ARM_A64
2435 uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
2436#else
2437 uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
2438#endif
2439 return Vec128<uint32_t>(
2440 vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
2441}
2442
2443template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
2445 int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
2446 return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
2447}
2448template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
2451 uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
2452 return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
2453}
2454
2455template <class T, HWY_IF_UI64(T)>
2457 T hi_0;
2458 T hi_1;
2459
2460 Mul128(GetLane(a), GetLane(b), &hi_0);
2461 Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
2462
2463 return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
2464}
2465
2466template <class T, HWY_IF_UI64(T)>
2468 T hi;
2469 Mul128(GetLane(a), GetLane(b), &hi);
2470 return Set(Full64<T>(), hi);
2471}
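// Usage sketch (illustrative only): MulHigh keeps the upper half of the
// double-width product, e.g. for int16_t, 0x4000 * 4 = 0x10000 -> 1:
//   const Full128<int16_t> d;
//   const auto hi = MulHigh(Set(d, int16_t{0x4000}), Set(d, int16_t{4}));
//   // every lane of hi is 1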
2472
2476template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
2481
2482// ------------------------------ Floating-point division
2483
2484// Emulate missing intrinsic
2485#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
2486HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) {
2487 const CappedTag<double, 1> d;
2488 const Twice<decltype(d)> dt;
2489 using VT = VFromD<decltype(dt)>;
2490 return LowerHalf(d, VT(vrecpeq_f64(Combine(dt, VFromD<decltype(d)>(raw), VFromD<decltype(d)>(raw)).raw))).raw;
2491}
2492#endif
2493
2494// Approximate reciprocal
2495HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocal, vrecpe, _, 1)
2496
2497#if HWY_HAVE_FLOAT64
2498#ifdef HWY_NATIVE_F64_APPROX_RECIP
2499#undef HWY_NATIVE_F64_APPROX_RECIP
2500#else
2501#define HWY_NATIVE_F64_APPROX_RECIP
2502#endif
2503
2504HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
2505#else // !HWY_HAVE_FLOAT64
2506namespace detail {
2507HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalNewtonRaphsonStep, vrecps, _, 2)
2508} // namespace detail
2509
2510template <typename T, size_t N, HWY_IF_FLOAT(T)>
2512 auto x = ApproximateReciprocal(b);
2513 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2514 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2515 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2516 return a * x;
2517}
2518#endif // HWY_HAVE_FLOAT64
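// Usage sketch (illustrative only): lane-wise float division; on targets
// without vdiv it is computed from the refined reciprocal above:
//   const Full128<float> d;
//   const auto q = Set(d, 1.0f) / Set(d, 3.0f);   // each lane ~0.3333333f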
2519
2520// ------------------------------ Absolute value of difference.
2521
2522HWY_NEON_DEF_FUNCTION_ALL_FLOATS(AbsDiff, vabd, _, 2)
2523HWY_NEON_DEF_FUNCTION_UI_8_16_32(AbsDiff, vabd, _, 2) // no UI64
2524
2525#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
2526#undef HWY_NATIVE_INTEGER_ABS_DIFF
2527#else
2528#define HWY_NATIVE_INTEGER_ABS_DIFF
2529#endif
2530
2531// ------------------------------ Integer multiply-add
2532
2533// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
2534#ifdef HWY_NATIVE_INT_FMA
2535#undef HWY_NATIVE_INT_FMA
2536#else
2537#define HWY_NATIVE_INT_FMA
2538#endif
2539
2540// Wrappers for changing argument order to what intrinsics expect.
2541namespace detail {
2542// All except ui64
2543HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3)
2544HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3)
2545HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3)
2546HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3)
2547} // namespace detail
2548
2549template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
2551 Vec128<T, N> add) {
2552 return detail::MulAdd(add, mul, x);
2553}
2554
2555template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
2557 Vec128<T, N> add) {
2558 return detail::NegMulAdd(add, mul, x);
2559}
2560
2561// 64-bit integer
2562template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
2563HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
2564 Vec128<T, N> add) {
2565 return Add(Mul(mul, x), add);
2566}
2567
2568template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
2569HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
2570 Vec128<T, N> add) {
2571 return Sub(add, Mul(mul, x));
2572}
2573
2574// ------------------------------ Floating-point multiply-add variants
2575
2576namespace detail {
2577
2578#if HWY_NATIVE_FMA
2579// Wrappers for changing argument order to what intrinsics expect.
2580HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3)
2581HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3)
2582#else
2583// Emulate. Matches intrinsics arg order.
2584template <size_t N>
2586 Vec128<float, N> x) {
2587 return mul * x + add;
2588}
2589
2590template <size_t N>
2592 Vec128<float, N> x) {
2593 return add - mul * x;
2594}
2595
2596#endif // HWY_NATIVE_FMA
2597} // namespace detail
2598
2599template <typename T, size_t N, HWY_IF_FLOAT(T)>
2601 Vec128<T, N> add) {
2602 return detail::MulAdd(add, mul, x);
2603}
2604
2605template <typename T, size_t N, HWY_IF_FLOAT(T)>
2607 Vec128<T, N> add) {
2608 return detail::NegMulAdd(add, mul, x);
2609}
2610
2611template <typename T, size_t N, HWY_IF_FLOAT(T)>
2613 Vec128<T, N> sub) {
2614 return MulAdd(mul, x, Neg(sub));
2615}
2616
2617template <typename T, size_t N, HWY_IF_FLOAT(T)>
2619 Vec128<T, N> sub) {
2620 return Neg(MulAdd(mul, x, sub));
2621}
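// Usage sketch (illustrative only) of the four variants, with per-lane values
// mul = 2, x = 3, add/sub = 10 (e.g. Set(Full128<float>(), 2.0f) and so on):
//   MulAdd(mul, x, add)     // mul * x + add    =  16
//   NegMulAdd(mul, x, add)  // add - mul * x    =   4
//   MulSub(mul, x, sub)     // mul * x - sub    =  -4
//   NegMulSub(mul, x, sub)  // -(mul * x) - sub = -16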
2622
2623// ------------------------------ Floating-point square root (IfThenZeroElse)
2624
2625// Emulate missing intrinsic
2626#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 490
2627HWY_INLINE float64x1_t vrsqrte_f64(float64x1_t raw) {
2628 const CappedTag<double, 1> d;
2629 const Twice<decltype(d)> dt;
2630 using VT = VFromD<decltype(dt)>;
2631 const VFromD<decltype(d)> v(raw);
2632 return LowerHalf(d, VT(vrsqrteq_f64(Combine(dt, v, v).raw))).raw;
2633}
2634#endif
2635
2636// Approximate reciprocal square root
2637HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocalSqrt, vrsqrte, _, 1)
2638
2639#if HWY_HAVE_FLOAT64
2640#ifdef HWY_NATIVE_F64_APPROX_RSQRT
2641#undef HWY_NATIVE_F64_APPROX_RSQRT
2642#else
2643#define HWY_NATIVE_F64_APPROX_RSQRT
2644#endif
2645
2646// Full precision square root
2647HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
2648#else // !HWY_HAVE_FLOAT64
2649namespace detail {
2650HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalSqrtStep, vrsqrts, _, 2)
2651} // namespace detail
2652
2653template <typename T, size_t N, HWY_IF_FLOAT(T)>
2655 auto recip = ApproximateReciprocalSqrt(v);
2656
2657 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
2658 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
2659 recip *= detail::ReciprocalSqrtStep(v * recip, recip);
2660
2661 const auto root = v * recip;
2662 return IfThenZeroElse(v == Zero(Simd<T, N, 0>()), root);
2663}
2664#endif // HWY_HAVE_FLOAT64
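// Usage sketch (illustrative only): Sqrt uses vsqrt where available and the
// Newton-Raphson refinement above otherwise, with Sqrt(0) forced to zero:
//   const Full128<float> d;
//   const auto root = Sqrt(Set(d, 9.0f));   // each lane 3.0f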
2665
2666// ================================================== LOGICAL
2667
2668// ------------------------------ Not
2669
2670// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
2671template <typename T>
2673 const DFromV<decltype(v)> d;
2674 const Repartition<uint8_t, decltype(d)> d8;
2675 return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
2676}
2677template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2679 const DFromV<decltype(v)> d;
2680 const Repartition<uint8_t, decltype(d)> d8;
2681 using V8 = decltype(Zero(d8));
2682 return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
2683}
2684
2685// ------------------------------ And
2686HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)
2687
2688// Uses the u32/64 defined above.
2689template <typename T, size_t N, HWY_IF_FLOAT(T)>
2690HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
2691 const DFromV<decltype(a)> d;
2692 const RebindToUnsigned<decltype(d)> du;
2693 return BitCast(d, BitCast(du, a) & BitCast(du, b));
2694}
2695
2696// ------------------------------ AndNot
2697
2698namespace detail {
2699// reversed_andnot returns a & ~b.
2700HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
2701} // namespace detail
2702
2703// Returns ~not_mask & mask.
2704template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
2706 const Vec128<T, N> mask) {
2707 return detail::reversed_andnot(mask, not_mask);
2708}
2709
2710// Uses the u32/64 defined above.
2711template <typename T, size_t N, HWY_IF_FLOAT(T)>
2712HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
2713 const Vec128<T, N> mask) {
2714 const DFromV<decltype(mask)> d;
2715 const RebindToUnsigned<decltype(d)> du;
2716 VFromD<decltype(du)> ret =
2717 detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
2718 return BitCast(d, ret);
2719}
2720
2721// ------------------------------ Or
2722
2723HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)
2724
2725// Uses the u32/64 defined above.
2726template <typename T, size_t N, HWY_IF_FLOAT(T)>
2727HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
2728 const DFromV<decltype(a)> d;
2729 const RebindToUnsigned<decltype(d)> du;
2730 return BitCast(d, BitCast(du, a) | BitCast(du, b));
2731}
2732
2733// ------------------------------ Xor
2734
2735HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)
2736
2737// Uses the u32/64 defined above.
2738template <typename T, size_t N, HWY_IF_FLOAT(T)>
2739HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
2740 const DFromV<decltype(a)> d;
2741 const RebindToUnsigned<decltype(d)> du;
2742 return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
2743}
2744
2745// ------------------------------ Xor3
2746#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
2748
2749// Half vectors are not natively supported. Two Xor are likely more efficient
2750// than Combine to 128-bit.
2751template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_NOT_FLOAT(T)>
2752HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
2753 return Xor(x1, Xor(x2, x3));
2754}
2755
2756template <typename T, size_t N, HWY_IF_FLOAT(T)>
2757HWY_API Vec128<T, N> Xor3(const Vec128<T, N> x1, const Vec128<T, N> x2,
2758 const Vec128<T, N> x3) {
2759 const DFromV<decltype(x1)> d;
2760 const RebindToUnsigned<decltype(d)> du;
2761 return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
2762}
2763
2764#else
2765template <typename T, size_t N>
2767 return Xor(x1, Xor(x2, x3));
2768}
2769#endif
2770
2771// ------------------------------ Or3
2772template <typename T, size_t N>
2774 return Or(o1, Or(o2, o3));
2775}
2776
2777// ------------------------------ OrAnd
2778template <typename T, size_t N>
2780 return Or(o, And(a1, a2));
2781}
2782
2783// ------------------------------ IfVecThenElse
2784template <typename T, size_t N>
2786 Vec128<T, N> no) {
2787 return IfThenElse(MaskFromVec(mask), yes, no);
2788}
2789
2790// ------------------------------ BitwiseIfThenElse
2791
2792#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
2793#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
2794#else
2795#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
2796#endif
2797
2798template <class V>
2799HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
2800 return IfVecThenElse(mask, yes, no);
2801}
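// Usage sketch (illustrative only): bits set in `mask` select from `yes`,
// clear bits select from `no` (per bit, not per lane):
//   const Full128<uint32_t> d;
//   const auto sel = BitwiseIfThenElse(Set(d, 0xFFFF0000u),
//                                      Set(d, 0xAAAAAAAAu),
//                                      Set(d, 0x55555555u));
//   // each lane of sel is 0xAAAA5555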
2802
2803// ------------------------------ Operator overloads (internal-only if float)
2804
2805template <typename T, size_t N>
2807 return And(a, b);
2808}
2809
2810template <typename T, size_t N>
2812 return Or(a, b);
2813}
2814
2815template <typename T, size_t N>
2817 return Xor(a, b);
2818}
2819
2820// ------------------------------ I64/U64 AbsDiff
2821
2822template <size_t N>
2824 const Vec128<int64_t, N> b) {
2825 return Max(a, b) - Min(a, b);
2826}
2827
2828template <size_t N>
2833
2834// ------------------------------ PopulationCount
2835
2836#ifdef HWY_NATIVE_POPCNT
2837#undef HWY_NATIVE_POPCNT
2838#else
2839#define HWY_NATIVE_POPCNT
2840#endif
2841
2842namespace detail {
2843
2844template <typename T>
2846 const Full128<uint8_t> d8;
2847 return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
2848}
2849template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2851 Vec128<T, N> v) {
2852 const Simd<uint8_t, N, 0> d8;
2853 return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
2854}
2855
2856// NEON lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
2857template <typename T>
2859 const Full128<uint8_t> d8;
2860 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2861 return Vec128<T>(vpaddlq_u8(bytes));
2862}
2863template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2865 Vec128<T, N> v) {
2866 const Repartition<uint8_t, DFromV<decltype(v)>> d8;
2867 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2868 return Vec128<T, N>(vpaddl_u8(bytes));
2869}
2870
2871template <typename T>
2873 const Full128<uint8_t> d8;
2874 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2875 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2876}
2877template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2879 Vec128<T, N> v) {
2880 const Repartition<uint8_t, DFromV<decltype(v)>> d8;
2881 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2882 return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
2883}
2884
2885template <typename T>
2887 const Full128<uint8_t> d8;
2888 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
2889 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2890}
2891template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2893 Vec128<T, N> v) {
2894 const Repartition<uint8_t, DFromV<decltype(v)>> d8;
2895 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
2896 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2897}
2898
2899} // namespace detail
2900
2901template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
2902HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
2903 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
2904}
2905
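// Usage sketch (illustrative only): per-lane count of set bits:
//   const Full128<uint8_t> d;
//   const auto bits = PopulationCount(Set(d, uint8_t{0xF0}));  // all lanes 4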
2906// ================================================== SIGN
2907
2908// ------------------------------ Abs
2909// i64 is implemented after BroadcastSignBit.
2910HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1)
2911HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)
2912
2913// ------------------------------ SaturatedAbs
2914#ifdef HWY_NATIVE_SATURATED_ABS
2915#undef HWY_NATIVE_SATURATED_ABS
2916#else
2917#define HWY_NATIVE_SATURATED_ABS
2918#endif
2919
2920HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
2921
2922// ------------------------------ CopySign
2923template <typename T, size_t N>
2924HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
2925 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2926 const DFromV<decltype(magn)> d;
2927 return BitwiseIfThenElse(SignBit(d), sign, magn);
2928}
2929
2930// ------------------------------ CopySignToAbs
2931template <typename T, size_t N>
2933 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2934 const DFromV<decltype(abs)> d;
2935 return OrAnd(abs, SignBit(d), sign);
2936}
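// Usage sketch (illustrative only): the magnitude of the first argument with
// the sign of the second:
//   const Full128<float> d;
//   const auto r = CopySign(Set(d, 2.0f), Set(d, -0.0f));   // each lane -2.0f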
2937
2938// ------------------------------ BroadcastSignBit
2939
2940template <typename T, size_t N, HWY_IF_SIGNED(T)>
2942 return ShiftRight<sizeof(T) * 8 - 1>(v);
2943}
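// Usage sketch (illustrative only): an arithmetic shift by (lane bits - 1),
// so each lane becomes 0 if non-negative and all ones (-1) if negative:
//   const Full128<int32_t> d;
//   const auto s = BroadcastSignBit(Set(d, -5));   // each lane is -1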
2944
2945// ================================================== MASK
2946
2947// ------------------------------ To/from vector
2948
2949// Mask and Vec have the same representation (true = FF..FF).
2950template <typename T, size_t N>
2952 const Simd<MakeUnsigned<T>, N, 0> du;
2953 return Mask128<T, N>(BitCast(du, v).raw);
2954}
2955
2956template <class D>
2957using MFromD = decltype(MaskFromVec(VFromD<D>()));
2958
2959template <class D>
2961 // Raw type of masks is unsigned.
2962 const RebindToUnsigned<D> du;
2963 return BitCast(d, VFromD<decltype(du)>(m.raw));
2964}
2965
2966// ------------------------------ RebindMask (MaskFromVec)
2967
2968template <typename TFrom, size_t NFrom, class DTo>
2970 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
2971 return MFromD<DTo>(m.raw);
2972}
2973
2974// ------------------------------ IfThenElse
2975
2976#define HWY_NEON_BUILD_TPL_HWY_IF
2977#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2978#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2979 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2980 const Vec128<type##_t, size> no
2981#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
2982
2984
2985#if HWY_HAVE_FLOAT16
2986#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
2987#else
2988#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
2989#endif
2990
2991template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
2992HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) {
2993 const DFromV<decltype(yes)> d;
2994 const RebindToUnsigned<decltype(d)> du;
2995 return BitCast(
2996 d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
2997}
2998
2999#undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
3000#undef HWY_NEON_BUILD_TPL_HWY_IF
3001#undef HWY_NEON_BUILD_RET_HWY_IF
3002#undef HWY_NEON_BUILD_PARAM_HWY_IF
3003#undef HWY_NEON_BUILD_ARG_HWY_IF
3004
3005// mask ? yes : 0
3006template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
3008 return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
3009}
3010template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
3011HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
3012 const DFromV<decltype(yes)> d;
3013 const RebindToUnsigned<decltype(d)> du;
3014 return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
3015}
3016
3017// mask ? 0 : no
3018template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
3020 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
3021}
3022template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
3023HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
3024 const DFromV<decltype(no)> d;
3025 const RebindToUnsigned<decltype(d)> du;
3026 return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
3027}
3028
3029template <typename T, size_t N>
3031 Vec128<T, N> no) {
3032 static_assert(IsSigned<T>(), "Only works for signed/float");
3033 const DFromV<decltype(no)> d;
3034 const RebindToSigned<decltype(d)> di;
3035
3037 return IfThenElse(m, yes, no);
3038}
3039
3040// ------------------------------ Mask logical
3041
3042template <typename T, size_t N>
3044 return MaskFromVec(Not(VecFromMask(DFromM<decltype(m)>(), m)));
3045}
3046
3047template <typename T, size_t N>
3049 const DFromM<decltype(a)> d;
3050 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
3051}
3052
3053template <typename T, size_t N>
3055 const DFromM<decltype(a)> d;
3056 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
3057}
3058
3059template <typename T, size_t N>
3061 const DFromM<decltype(a)> d;
3062 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
3063}
3064
3065template <typename T, size_t N>
3067 const DFromM<decltype(a)> d;
3068 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
3069}
3070
3071template <typename T, size_t N>
3073 const DFromM<decltype(a)> d;
3074 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
3075}
3076
3077// ================================================== COMPARE
3078
3079// Comparisons fill a lane with 1-bits if the condition is true, else 0.
3080
3081// ------------------------------ Shuffle2301 (for i64 compares)
3082
3083// Swap 32-bit halves in 64-bits
3085 return Vec64<uint32_t>(vrev64_u32(v.raw));
3086}
3088 return Vec64<int32_t>(vrev64_s32(v.raw));
3089}
3091 return Vec64<float>(vrev64_f32(v.raw));
3092}
3094 return Vec128<uint32_t>(vrev64q_u32(v.raw));
3095}
3097 return Vec128<int32_t>(vrev64q_s32(v.raw));
3098}
3100 return Vec128<float>(vrev64q_f32(v.raw));
3101}
3102
3103#define HWY_NEON_BUILD_TPL_HWY_COMPARE
3104#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
3105#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
3106 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
3107#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
3108
3109// ------------------------------ Equality
3110HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
3111#if HWY_ARCH_ARM_A64
3112HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
3113#else
3114// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
3115HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
3116HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
3117#endif
3118
3119// ------------------------------ Strict inequality (int, float)
3120#if HWY_ARCH_ARM_A64
3121HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
3122#else
3123HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
3124HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
3125#endif
3126HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
3127
3128// ------------------------------ Weak inequality (int, float)
3129#if HWY_ARCH_ARM_A64
3130HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE)
3131#else
3132HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
3133HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
3134#endif
3135HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
3136
3137#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
3138#undef HWY_NEON_BUILD_RET_HWY_COMPARE
3139#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
3140#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
3141
3142// ------------------------------ Armv7 i64 compare (Shuffle2301, Eq)
3143
3144#if HWY_ARCH_ARM_V7
3145
3146template <size_t N>
3147HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
3148 const Vec128<int64_t, N> b) {
3149 const Simd<int32_t, N * 2, 0> d32;
3150 const Simd<int64_t, N, 0> d64;
3151 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
3152 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
3153 return MaskFromVec(BitCast(d64, cmp64));
3154}
3155
3156template <size_t N>
3157HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
3158 const Vec128<uint64_t, N> b) {
3159 const Simd<uint32_t, N * 2, 0> d32;
3160 const Simd<uint64_t, N, 0> d64;
3161 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
3162 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
3163 return MaskFromVec(BitCast(d64, cmp64));
3164}
3165
3166HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
3167 const Vec128<int64_t> b) {
3168 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
3169 return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
3170}
3171HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
3172 const Vec64<int64_t> b) {
3173 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
3174 return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
3175}
3176
3177template <size_t N>
3178HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
3179 const Vec128<uint64_t, N> b) {
3180 const DFromV<decltype(a)> du;
3181 const RebindToSigned<decltype(du)> di;
3182 const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
3183 return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
3184}
3185
3186template <size_t N>
3187HWY_API Mask128<int64_t, N> operator<=(const Vec128<int64_t, N> a,
3188 const Vec128<int64_t, N> b) {
3189 return Not(b < a);
3190}
3191
3192template <size_t N>
3193HWY_API Mask128<uint64_t, N> operator<=(const Vec128<uint64_t, N> a,
3194 const Vec128<uint64_t, N> b) {
3195 return Not(b < a);
3196}
3197
3198#endif
3199
3200// ------------------------------ operator!= (operator==)
3201
3202// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
3203#pragma push_macro("HWY_NEON_DEF_FUNCTION")
3204#undef HWY_NEON_DEF_FUNCTION
3205// This cannot have _any_ template argument (in x86_128 we can at least have N
3206// as an argument), otherwise it is not more specialized than rewritten
3207// operator== in C++20, leading to compile errors.
3208#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
3209 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
3210 Vec128<type##_t, size> b) { \
3211 return Not(a == b); \
3212 }
3213
3214HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)
3215
3216#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
3217
3218// ------------------------------ Reversed comparisons
3219
3220template <typename T, size_t N>
3224template <typename T, size_t N>
3228
3229// ------------------------------ FirstN (Iota, Lt)
3230
3231template <class D>
3232HWY_API MFromD<D> FirstN(D d, size_t num) {
3233 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
3234 using TI = TFromD<decltype(di)>;
3235 return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
3236}
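// Usage sketch (illustrative only): a mask that is true for the first `num`
// lanes, e.g. for masking off the tail of a loop:
//   const Full128<int32_t> d;                       // 4 lanes
//   const auto m = FirstN(d, 3);                    // {true, true, true, false}
//   const auto v = IfThenElseZero(m, Set(d, 7));    // {7, 7, 7, 0}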
3237
3238// ------------------------------ TestBit (Eq)
3239
3240#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
3241#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
3242#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
3243 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
3244#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
3245
3246#if HWY_ARCH_ARM_A64
3247HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
3248#else
3249// No 64-bit versions on armv7
3250HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
3251HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
3252
3253template <size_t N>
3255 Vec128<uint64_t, N> bit) {
3256 return (v & bit) == bit;
3257}
3258template <size_t N>
3260 Vec128<int64_t, N> bit) {
3261 return (v & bit) == bit;
3262}
3263
3264#endif
3265#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
3266#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
3267#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
3268#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
3269
3270// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
3272#if HWY_ARCH_ARM_A64
3273 return Vec128<int64_t>(vabsq_s64(v.raw));
3274#else
3275 const auto zero = Zero(DFromV<decltype(v)>());
3276 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
3277#endif
3278}
3280#if HWY_ARCH_ARM_A64
3281 return Vec64<int64_t>(vabs_s64(v.raw));
3282#else
3283 const auto zero = Zero(DFromV<decltype(v)>());
3284 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
3285#endif
3286}
3287
3289#if HWY_ARCH_ARM_A64
3290 return Vec128<int64_t>(vqabsq_s64(v.raw));
3291#else
3292 const auto zero = Zero(DFromV<decltype(v)>());
3293 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
3294#endif
3295}
3297#if HWY_ARCH_ARM_A64
3298 return Vec64<int64_t>(vqabs_s64(v.raw));
3299#else
3300 const auto zero = Zero(DFromV<decltype(v)>());
3301 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
3302#endif
3303}
3304
3305// ------------------------------ Min (IfThenElse, BroadcastSignBit)
3306
3307// Unsigned
3309
3310template <size_t N>
3311HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
3312#if HWY_ARCH_ARM_A64
3313 return IfThenElse(b < a, b, a);
3314#else
3315 const DFromV<decltype(a)> du;
3316 const RebindToSigned<decltype(du)> di;
3317 return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b)));
3318#endif
3319}
3320
3321// Signed
3323
3324template <size_t N>
3325HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
3326#if HWY_ARCH_ARM_A64
3327 return IfThenElse(b < a, b, a);
3328#else
3329 const Vec128<int64_t, N> sign = SaturatedSub(a, b);
3330 return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
3331#endif
3332}
3333
3334// Float: IEEE minimumNumber on v8
3335#if HWY_ARCH_ARM_A64
3336
3338
3339// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
3340// in terms of the 128-bit intrinsic.
3341#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3342namespace detail {
3343
3344template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
3345HWY_INLINE V F64Vec64Min(V a, V b) {
3346 const DFromV<decltype(a)> d;
3347 const Twice<decltype(d)> dt;
3348 return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
3349}
3350
3351} // namespace detail
3352#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3353
3354HWY_API Vec64<double> Min(Vec64<double> a, Vec64<double> b) {
3355#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3356 return detail::F64Vec64Min(a, b);
3357#else
3358 return Vec64<double>(vminnm_f64(a.raw, b.raw));
3359#endif
3360}
3361
3362HWY_API Vec128<double> Min(Vec128<double> a, Vec128<double> b) {
3363 return Vec128<double>(vminnmq_f64(a.raw, b.raw));
3364}
3365
3366#else
3367// Armv7: NaN if any is NaN.
3369#endif // HWY_ARCH_ARM_A64
3370
3371// ------------------------------ Max (IfThenElse, BroadcastSignBit)
3372
3373// Unsigned (no u64)
3375
3376template <size_t N>
3377HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
3378#if HWY_ARCH_ARM_A64
3379 return IfThenElse(b < a, a, b);
3380#else
3381 const DFromV<decltype(a)> du;
3382 const RebindToSigned<decltype(du)> di;
3383 return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b)));
3384#endif
3385}
3386
3387// Signed (no i64)
3389
3390template <size_t N>
3391HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
3392#if HWY_ARCH_ARM_A64
3393 return IfThenElse(b < a, a, b);
3394#else
3395 const Vec128<int64_t, N> sign = SaturatedSub(a, b);
3396 return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
3397#endif
3398}
3399
3400// Float: IEEE maximumNumber on v8
3401#if HWY_ARCH_ARM_A64
3402
3404
3405// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
3406// in terms of the 128-bit intrinsic.
3407#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3408namespace detail {
3409
3410template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
3411HWY_INLINE V F64Vec64Max(V a, V b) {
3412 const DFromV<decltype(a)> d;
3413 const Twice<decltype(d)> dt;
3414 return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
3415}
3416
3417} // namespace detail
3418#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3419
3420HWY_API Vec64<double> Max(Vec64<double> a, Vec64<double> b) {
3421#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3422 return detail::F64Vec64Max(a, b);
3423#else
3424 return Vec64<double>(vmaxnm_f64(a.raw, b.raw));
3425#endif
3426}
3427
3428HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
3429 return Vec128<double>(vmaxnmq_f64(a.raw, b.raw));
3430}
3431
3432#else
3433// Armv7: NaN if any is NaN.
3435#endif // HWY_ARCH_ARM_A64
3436
3437// ================================================== MEMORY
3438
3439// ------------------------------ Load 128
3440
3441template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
3443 const uint8_t* HWY_RESTRICT unaligned) {
3444 return Vec128<uint8_t>(vld1q_u8(unaligned));
3445}
3446template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
3448 const uint16_t* HWY_RESTRICT unaligned) {
3449 return Vec128<uint16_t>(vld1q_u16(unaligned));
3450}
3451template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
3453 const uint32_t* HWY_RESTRICT unaligned) {
3454 return Vec128<uint32_t>(vld1q_u32(unaligned));
3455}
3456template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3458 const uint64_t* HWY_RESTRICT unaligned) {
3459 return Vec128<uint64_t>(vld1q_u64(unaligned));
3460}
3461template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
3463 const int8_t* HWY_RESTRICT unaligned) {
3464 return Vec128<int8_t>(vld1q_s8(unaligned));
3465}
3466template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
3468 const int16_t* HWY_RESTRICT unaligned) {
3469 return Vec128<int16_t>(vld1q_s16(unaligned));
3470}
3471template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
3473 const int32_t* HWY_RESTRICT unaligned) {
3474 return Vec128<int32_t>(vld1q_s32(unaligned));
3475}
3476template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3478 const int64_t* HWY_RESTRICT unaligned) {
3479 return Vec128<int64_t>(vld1q_s64(unaligned));
3480}
3481#if HWY_HAVE_FLOAT16
3482template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
3483HWY_API Vec128<float16_t> LoadU(D /* tag */,
3484 const float16_t* HWY_RESTRICT unaligned) {
3485 return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
3486}
3487#endif // HWY_HAVE_FLOAT16
3488#if HWY_NEON_HAVE_BFLOAT16
3489template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
3490HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
3491 const bfloat16_t* HWY_RESTRICT unaligned) {
3492 return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
3493}
3494#endif // HWY_NEON_HAVE_BFLOAT16
3495template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3496HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
3497 return Vec128<float>(vld1q_f32(unaligned));
3498}
3499#if HWY_HAVE_FLOAT64
3500template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3501HWY_API Vec128<double> LoadU(D /* tag */,
3502 const double* HWY_RESTRICT unaligned) {
3503 return Vec128<double>(vld1q_f64(unaligned));
3504}
3505#endif // HWY_HAVE_FLOAT64
3506
3507// ------------------------------ Load 64
3508
3509template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
3510HWY_API Vec64<uint8_t> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
3511 return Vec64<uint8_t>(vld1_u8(p));
3512}
3513template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
3514HWY_API Vec64<uint16_t> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
3515 return Vec64<uint16_t>(vld1_u16(p));
3516}
3517template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
3518HWY_API Vec64<uint32_t> LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) {
3519 return Vec64<uint32_t>(vld1_u32(p));
3520}
3521template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
3522HWY_API Vec64<uint64_t> LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) {
3523 return Vec64<uint64_t>(vld1_u64(p));
3524}
3525template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
3526HWY_API Vec64<int8_t> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
3527 return Vec64<int8_t>(vld1_s8(p));
3528}
3529template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
3530HWY_API Vec64<int16_t> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
3531 return Vec64<int16_t>(vld1_s16(p));
3532}
3533template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
3534HWY_API Vec64<int32_t> LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) {
3535 return Vec64<int32_t>(vld1_s32(p));
3536}
3537template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
3538HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
3539 return Vec64<int64_t>(vld1_s64(p));
3540}
3541#if HWY_HAVE_FLOAT16
3542template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
3543HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
3544 return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
3545}
3546#endif // HWY_HAVE_FLOAT16
3547#if HWY_NEON_HAVE_BFLOAT16
3548template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
3549HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
3550 return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
3551}
3552#endif // HWY_NEON_HAVE_BFLOAT16
3553template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3554HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
3555 return Vec64<float>(vld1_f32(p));
3556}
3557#if HWY_HAVE_FLOAT64
3558template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
3559HWY_API Vec64<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
3560 return Vec64<double>(vld1_f64(p));
3561}
3562#endif // HWY_HAVE_FLOAT64
3563
3564// ------------------------------ Load 32
3565
3566// Actual 32-bit broadcast load - used to implement the other lane types
3567// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
3568template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
3569HWY_API Vec32<uint32_t> LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) {
3570 return Vec32<uint32_t>(vld1_dup_u32(p));
3571}
3572template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
3573HWY_API Vec32<int32_t> LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) {
3574 return Vec32<int32_t>(vld1_dup_s32(p));
3575}
3576template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3577HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
3578 return Vec32<float>(vld1_dup_f32(p));
3579}
3580
3581// {u,i}{8,16}
3582template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
3585 const Repartition<uint32_t, decltype(d)> d32;
3586 uint32_t buf;
3587 CopyBytes<4>(p, &buf);
3588 return BitCast(d, LoadU(d32, &buf));
3589}
3590
3591#if HWY_HAVE_FLOAT16
3592template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
3593HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
3594 const Repartition<uint32_t, decltype(d)> d32;
3595 uint32_t buf;
3596 CopyBytes<4>(p, &buf);
3597 return BitCast(d, LoadU(d32, &buf));
3598}
3599#endif // HWY_HAVE_FLOAT16
3600#if HWY_NEON_HAVE_BFLOAT16
3601template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
3602HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
3603 const Repartition<uint32_t, decltype(d)> d32;
3604 uint32_t buf;
3605 CopyBytes<4>(p, &buf);
3606 return BitCast(d, LoadU(d32, &buf));
3607}
3608#endif // HWY_NEON_HAVE_BFLOAT16
3609
3610// ------------------------------ Load 16
3611
3612// Actual 16-bit broadcast load - used to implement the other lane types
3613// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
3614template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U16_D(D)>
3615HWY_API VFromD<D> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
3616 return VFromD<D>(vld1_dup_u16(p));
3617}
3618template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
3619HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
3620 return VFromD<D>(vld1_dup_s16(p));
3621}
3622#if HWY_HAVE_FLOAT16
3623template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
3624HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
3625 return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
3626}
3627#endif // HWY_HAVE_FLOAT16
3628#if HWY_NEON_HAVE_BFLOAT16
3629template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
3630HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
3631 return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
3632}
3633#endif // HWY_NEON_HAVE_BFLOAT16
3634
3635// 8-bit x2
3636template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
3638 const Repartition<uint16_t, decltype(d)> d16;
3639 uint16_t buf;
3640 CopyBytes<2>(p, &buf);
3641 return BitCast(d, LoadU(d16, &buf));
3642}
3643
3644// ------------------------------ Load 8
3645template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U8_D(D)>
3646HWY_API VFromD<D> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
3647 return VFromD<D>(vld1_dup_u8(p));
3648}
3649template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I8_D(D)>
3650HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
3651 return VFromD<D>(vld1_dup_s8(p));
3652}
3653
3654// ------------------------------ Load misc
3655
3656template <class D, HWY_NEON_IF_EMULATED_D(D)>
3658 const RebindToUnsigned<decltype(d)> du;
3659 return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
3660}
3661
3662// On Arm, Load is the same as LoadU.
3663template <class D>
3665 return LoadU(d, p);
3666}
3667
3668template <class D>
3670 const TFromD<D>* HWY_RESTRICT aligned) {
3671 return IfThenElseZero(m, Load(d, aligned));
3672}
3673
3674template <class D>
3676 const TFromD<D>* HWY_RESTRICT aligned) {
3677 return IfThenElse(m, Load(d, aligned), v);
3678}
3679
3680// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
3681template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3683 return LoadU(d, p);
3684}
3685
3686// ------------------------------ Store 128
3687
3688template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
3689HWY_API void StoreU(Vec128<uint8_t> v, D /* tag */,
3690 uint8_t* HWY_RESTRICT unaligned) {
3691 vst1q_u8(unaligned, v.raw);
3692}
3693template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
3694HWY_API void StoreU(Vec128<uint16_t> v, D /* tag */,
3695 uint16_t* HWY_RESTRICT unaligned) {
3696 vst1q_u16(unaligned, v.raw);
3697}
3698template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
3699HWY_API void StoreU(Vec128<uint32_t> v, D /* tag */,
3700 uint32_t* HWY_RESTRICT unaligned) {
3701 vst1q_u32(unaligned, v.raw);
3702}
3703template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3704HWY_API void StoreU(Vec128<uint64_t> v, D /* tag */,
3705 uint64_t* HWY_RESTRICT unaligned) {
3706 vst1q_u64(unaligned, v.raw);
3707}
3708template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
3709HWY_API void StoreU(Vec128<int8_t> v, D /* tag */,
3710 int8_t* HWY_RESTRICT unaligned) {
3711 vst1q_s8(unaligned, v.raw);
3712}
3713template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
3714HWY_API void StoreU(Vec128<int16_t> v, D /* tag */,
3715 int16_t* HWY_RESTRICT unaligned) {
3716 vst1q_s16(unaligned, v.raw);
3717}
3718template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
3719HWY_API void StoreU(Vec128<int32_t> v, D /* tag */,
3720 int32_t* HWY_RESTRICT unaligned) {
3721 vst1q_s32(unaligned, v.raw);
3722}
3723template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3724HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
3725 int64_t* HWY_RESTRICT unaligned) {
3726 vst1q_s64(unaligned, v.raw);
3727}
3728#if HWY_HAVE_FLOAT16
3729template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
3730HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
3731 float16_t* HWY_RESTRICT unaligned) {
3732 vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
3733}
3734#endif // HWY_HAVE_FLOAT16
3735#if HWY_NEON_HAVE_BFLOAT16
3736template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
3737HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
3738 bfloat16_t* HWY_RESTRICT unaligned) {
3739 vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
3740}
3741#endif // HWY_NEON_HAVE_BFLOAT16
3742template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3743HWY_API void StoreU(Vec128<float> v, D /* tag */,
3744 float* HWY_RESTRICT unaligned) {
3745 vst1q_f32(unaligned, v.raw);
3746}
3747#if HWY_HAVE_FLOAT64
3748template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3749HWY_API void StoreU(Vec128<double> v, D /* tag */,
3750 double* HWY_RESTRICT unaligned) {
3751 vst1q_f64(unaligned, v.raw);
3752}
3753#endif // HWY_HAVE_FLOAT64
3754
3755// ------------------------------ Store 64
3756
3757template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
3758HWY_API void StoreU(Vec64<uint8_t> v, D /* tag */, uint8_t* HWY_RESTRICT p) {
3759 vst1_u8(p, v.raw);
3760}
3761template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
3762HWY_API void StoreU(Vec64<uint16_t> v, D /* tag */, uint16_t* HWY_RESTRICT p) {
3763 vst1_u16(p, v.raw);
3764}
3765template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
3766HWY_API void StoreU(Vec64<uint32_t> v, D /* tag */, uint32_t* HWY_RESTRICT p) {
3767 vst1_u32(p, v.raw);
3768}
3769template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
3770HWY_API void StoreU(Vec64<uint64_t> v, D /* tag */, uint64_t* HWY_RESTRICT p) {
3771 vst1_u64(p, v.raw);
3772}
3773template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
3774HWY_API void StoreU(Vec64<int8_t> v, D /* tag */, int8_t* HWY_RESTRICT p) {
3775 vst1_s8(p, v.raw);
3776}
3777template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
3778HWY_API void StoreU(Vec64<int16_t> v, D /* tag */, int16_t* HWY_RESTRICT p) {
3779 vst1_s16(p, v.raw);
3780}
3781template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
3782HWY_API void StoreU(Vec64<int32_t> v, D /* tag */, int32_t* HWY_RESTRICT p) {
3783 vst1_s32(p, v.raw);
3784}
3785template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
3786HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
3787 vst1_s64(p, v.raw);
3788}
3789#if HWY_HAVE_FLOAT16
3790template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
3791HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
3792                    float16_t* HWY_RESTRICT p) {
3793 vst1_f16(detail::NativeLanePointer(p), v.raw);
3794}
3795#endif // HWY_HAVE_FLOAT16
3796#if HWY_NEON_HAVE_BFLOAT16
3797template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
3798HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
3799 bfloat16_t* HWY_RESTRICT p) {
3800 vst1_bf16(detail::NativeLanePointer(p), v.raw);
3801}
3802#endif // HWY_NEON_HAVE_BFLOAT16
3803template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3804HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
3805 vst1_f32(p, v.raw);
3806}
3807#if HWY_HAVE_FLOAT64
3808template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
3809HWY_API void StoreU(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
3810 vst1_f64(p, v.raw);
3811}
3812#endif // HWY_HAVE_FLOAT64
3813
3814// ------------------------------ Store 32
3815
3816template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
3817HWY_API void StoreU(Vec32<uint32_t> v, D, uint32_t* HWY_RESTRICT p) {
3818 vst1_lane_u32(p, v.raw, 0);
3819}
3820template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
3821HWY_API void StoreU(Vec32<int32_t> v, D, int32_t* HWY_RESTRICT p) {
3822 vst1_lane_s32(p, v.raw, 0);
3823}
3824template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3825HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
3826 vst1_lane_f32(p, v.raw, 0);
3827}
3828
3829// {u,i}{8,16}
3830template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
3831          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
3832HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
3833 Repartition<uint32_t, decltype(d)> d32;
3834 uint32_t buf = GetLane(BitCast(d32, v));
3835 CopyBytes<4>(&buf, p);
3836}
3837
3838#if HWY_HAVE_FLOAT16
3839template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
3840HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
3841 Repartition<uint32_t, decltype(d)> d32;
3842 uint32_t buf = GetLane(BitCast(d32, v));
3843 CopyBytes<4>(&buf, p);
3844}
3845#endif
3846#if HWY_NEON_HAVE_BFLOAT16
3847template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
3848HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
3849 Repartition<uint32_t, decltype(d)> d32;
3850 uint32_t buf = GetLane(BitCast(d32, v));
3851 CopyBytes<4>(&buf, p);
3852}
3853#endif // HWY_NEON_HAVE_BFLOAT16
3854
3855// ------------------------------ Store 16
3856
3857template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U16_D(D)>
3858HWY_API void StoreU(Vec16<uint16_t> v, D, uint16_t* HWY_RESTRICT p) {
3859 vst1_lane_u16(p, v.raw, 0);
3860}
3861template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
3862HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
3863 vst1_lane_s16(p, v.raw, 0);
3864}
3865#if HWY_HAVE_FLOAT16
3866template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
3867HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
3868 vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
3869}
3870#endif // HWY_HAVE_FLOAT16
3871#if HWY_NEON_HAVE_BFLOAT16
3872template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
3873HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
3874 vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
3875}
3876#endif // HWY_NEON_HAVE_BFLOAT16
3877
3878template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
3879HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
3880 const Repartition<uint16_t, decltype(d)> d16;
3881 const uint16_t buf = GetLane(BitCast(d16, v));
3882 CopyBytes<2>(&buf, p);
3883}
3884
3885// ------------------------------ Store 8
3886
3887template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_U8_D(D)>
3888HWY_API void StoreU(Vec128<uint8_t, 1> v, D, uint8_t* HWY_RESTRICT p) {
3889 vst1_lane_u8(p, v.raw, 0);
3890}
3891template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_I8_D(D)>
3892HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
3893 vst1_lane_s8(p, v.raw, 0);
3894}
3895
3896// ------------------------------ Store misc
3897
3898template <class D, HWY_NEON_IF_EMULATED_D(D)>
3899HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
3900 const RebindToUnsigned<decltype(d)> du;
3901 return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
3902}
3903
3904HWY_DIAGNOSTICS(push)
3905#if HWY_COMPILER_GCC_ACTUAL
3906HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
3907#endif
3908
3909// On Arm, Store is the same as StoreU.
3910template <class D>
3911HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
3912 StoreU(v, d, aligned);
3913}
3914
3915HWY_DIAGNOSTICS(pop)
3916
3917template <class D>
3918HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3919                          TFromD<D>* HWY_RESTRICT p) {
3920 // Treat as unsigned so that we correctly support float16.
3921 const RebindToUnsigned<decltype(d)> du;
3922 const auto blended =
3923 IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
3924 StoreU(BitCast(d, blended), d, p);
3925}
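// Example (illustrative sketch): BlendedStore overwrites only the lanes
// selected by the mask; the remaining memory keeps its previous contents. The
// tag and values below are assumptions for the example.
//   const Full128<float> d;                 // 4 lanes
//   float out[4] = {0.f, 0.f, 0.f, 0.f};
//   BlendedStore(Set(d, 1.f), FirstN(d, 2), d, out);  // out == {1, 1, 0, 0}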
3926
3927// ------------------------------ Non-temporal stores
3928
3929// Same as aligned stores on non-x86.
3930
3931template <class D>
3932HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
3933#if HWY_ARCH_ARM_A64
3934#if HWY_COMPILER_GCC
3935 __builtin_prefetch(aligned, 1, 0);
3936#elif HWY_COMPILER_MSVC
3937 __prefetch2(aligned, 0x11);
3938#endif
3939#endif
3940 Store(v, d, aligned);
3941}
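// Example (illustrative sketch): on Arm, Stream is an ordinary store preceded
// by a prefetch hint, so it may be used anywhere Store is valid. The output
// array is hypothetical.
//   const Full128<int32_t> d;
//   HWY_ALIGN int32_t out[4];     // alignment macro provided by Highway
//   Stream(Iota(d, 0), d, out);   // writes 0, 1, 2, 3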
3942
3943// ================================================== CONVERT
3944
3945// ------------------------------ ConvertTo
3946
3947#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
3948
3949// TODO(janwas): use macro generator instead of handwritten
3950template <class D, HWY_IF_F16_D(D)>
3951HWY_API Vec128<float16_t> ConvertTo(D /* tag */, Vec128<int16_t> v) {
3952 return Vec128<float16_t>(vcvtq_f16_s16(v.raw));
3953}
3954template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
3955HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
3956 return VFromD<D>(vcvt_f16_s16(v.raw));
3957}
3958
3959template <class D, HWY_IF_F16_D(D)>
3960HWY_API Vec128<float16_t> ConvertTo(D /* tag */, Vec128<uint16_t> v) {
3961 return Vec128<float16_t>(vcvtq_f16_u16(v.raw));
3962}
3963template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
3964HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
3965 return VFromD<D>(vcvt_f16_u16(v.raw));
3966}
3967
3968#endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
3969
3970template <class D, HWY_IF_F32_D(D)>
3971HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<int32_t> v) {
3972 return Vec128<float>(vcvtq_f32_s32(v.raw));
3973}
3974template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
3975HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
3976 return VFromD<D>(vcvt_f32_s32(v.raw));
3977}
3978
3979template <class D, HWY_IF_F32_D(D)>
3980HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<uint32_t> v) {
3981 return Vec128<float>(vcvtq_f32_u32(v.raw));
3982}
3983template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
3984HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
3985 return VFromD<D>(vcvt_f32_u32(v.raw));
3986}
3987
3988#if HWY_HAVE_FLOAT64
3989
3990template <class D, HWY_IF_F64_D(D)>
3991HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<int64_t> v) {
3992 return Vec128<double>(vcvtq_f64_s64(v.raw));
3993}
3994template <class D, HWY_IF_F64_D(D)>
3995HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {
3996// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
3997#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3998 return Set(Full64<double>(), static_cast<double>(GetLane(v)));
3999#else
4000 return Vec64<double>(vcvt_f64_s64(v.raw));
4001#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
4002}
4003
4004template <class D, HWY_IF_F64_D(D)>
4005HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
4006 return Vec128<double>(vcvtq_f64_u64(v.raw));
4007}
4008template <class D, HWY_IF_F64_D(D)>
4009HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
4010 // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
4011#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
4012 return Set(Full64<double>(), static_cast<double>(GetLane(v)));
4013#else
4014 return Vec64<double>(vcvt_f64_u64(v.raw));
4015#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
4016}
4017
4018#endif // HWY_HAVE_FLOAT64
4019
4020namespace detail {
4021// Truncates (rounds toward zero).
4022template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4023HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
4024#if HWY_COMPILER_CLANG && \
4025 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4026 // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
4027 // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
4028 // outside of the range of an int32_t.
4029
4030 int32x4_t raw_result;
4031 __asm__(
4032#if HWY_ARCH_ARM_A64
4033      "fcvtzs %0.4s, %1.4s"
4034#else
4035 "vcvt.s32.f32 %0, %1"
4036#endif
4037 : "=w"(raw_result)
4038 : "w"(v.raw));
4039 return Vec128<int32_t>(raw_result);
4040#else
4041 return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
4042#endif
4043}
4044template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4045HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float, D>> v) {
4046#if HWY_COMPILER_CLANG && \
4047 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4048 // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
4049 // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
4050 // outside of the range of an int32_t.
4051
4052 int32x2_t raw_result;
4053 __asm__(
4054#if HWY_ARCH_ARM_A64
4055      "fcvtzs %0.2s, %1.2s"
4056#else
4057 "vcvt.s32.f32 %0, %1"
4058#endif
4059 : "=w"(raw_result)
4060 : "w"(v.raw));
4061 return VFromD<D>(raw_result);
4062#else
4063 return VFromD<D>(vcvt_s32_f32(v.raw));
4064#endif
4065}
4066template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
4067HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
4068#if HWY_COMPILER_CLANG && \
4069 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4070 // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
4071 // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
4072 // outside of the range of an uint32_t.
4073
4074 uint32x4_t raw_result;
4075 __asm__(
4076#if HWY_ARCH_ARM_A64
4077      "fcvtzu %0.4s, %1.4s"
4078#else
4079 "vcvt.u32.f32 %0, %1"
4080#endif
4081 : "=w"(raw_result)
4082 : "w"(v.raw));
4083 return Vec128<uint32_t>(raw_result);
4084#else
4085 return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
4086#endif
4087}
4088template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
4089HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float, D>> v) {
4090#if HWY_COMPILER_CLANG && \
4091 ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
4092 // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
4093 // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
4094 // outside of the range of an uint32_t.
4095
4096 uint32x2_t raw_result;
4097 __asm__(
4098#if HWY_ARCH_ARM_A64
4099      "fcvtzu %0.2s, %1.2s"
4100#else
4101 "vcvt.u32.f32 %0, %1"
4102#endif
4103 : "=w"(raw_result)
4104 : "w"(v.raw));
4105 return VFromD<D>(raw_result);
4106#else
4107 return VFromD<D>(vcvt_u32_f32(v.raw));
4108#endif
4109}
4110
4111#if HWY_HAVE_FLOAT64
4112
4113// Truncates (rounds toward zero).
4114template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
4115HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
4116#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
4117 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4118 // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
4119 int64x2_t raw_result;
4120 __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
4121 return Vec128<int64_t>(raw_result);
4122#else
4123 return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
4124#endif
4125}
4126template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
4127HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
4128#if HWY_ARCH_ARM_A64 && \
4129 ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
4130 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
4131 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4132 // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
4133 // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
4134 // work around the missing vcvt_s64_f64 intrinsic.
4135 int64x1_t raw_result;
4136 __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
4137 return Vec64<int64_t>(raw_result);
4138#else
4139 return Vec64<int64_t>(vcvt_s64_f64(v.raw));
4140#endif
4141}
4142template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
4143HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
4144#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
4145 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4146 // to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
4147 uint64x2_t raw_result;
4148 __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
4149 return Vec128<uint64_t>(raw_result);
4150#else
4151 return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
4152#endif
4153}
4154template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
4155HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
4156#if HWY_ARCH_ARM_A64 && \
4157 ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
4158 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
4159 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4160 // to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
4161
4162 // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
4163 // earlier to work around the issue of the missing vcvt_u64_f64 intrinsic.
4164 uint64x1_t raw_result;
4165 __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
4166 return Vec64<uint64_t>(raw_result);
4167#else
4168 return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
4169#endif
4170}
4171
4172#endif // HWY_HAVE_FLOAT64
4173
4174#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
4175
4176// Truncates (rounds toward zero).
4177template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
4178HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
4179#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
4180 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4181 // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
4182 int16x8_t raw_result;
4183 __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
4184 return Vec128<int16_t>(raw_result);
4185#else
4186 return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
4187#endif
4188}
4189template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
4190HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
4191#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
4192 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4193 // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
4194 int16x4_t raw_result;
4195 __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
4196 return VFromD<D>(raw_result);
4197#else
4198 return VFromD<D>(vcvt_s16_f16(v.raw));
4199#endif
4200}
4201
4202template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
4203HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
4204#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
4205 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4206 // to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
4207 uint16x8_t raw_result;
4208 __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
4209 return Vec128<uint16_t>(raw_result);
4210#else
4211 return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
4212#endif
4213}
4214template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
4215HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
4216#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
4217 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
4218 // to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
4219 uint16x4_t raw_result;
4220 __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
4221 return VFromD<D>(raw_result);
4222#else
4223 return VFromD<D>(vcvt_u16_f16(v.raw));
4224#endif
4225}
4226
4227#endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
4228} // namespace detail
4229
4230template <class D, HWY_IF_SIGNED_D(D),
4231          HWY_IF_T_SIZE_ONE_OF_D(
4232              D, (1 << 4) |
4233                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
4234                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
4235HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
4236 return detail::ConvertFToI(di, v);
4237}
4238
4239template <class D, HWY_IF_UNSIGNED_D(D),
4240          HWY_IF_T_SIZE_ONE_OF_D(
4241              D, (1 << 4) |
4242                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
4243                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
4244HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
4245 return detail::ConvertFToU(du, v);
4246}
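// Example (illustrative sketch): float-to-integer ConvertTo truncates toward
// zero, matching the fcvtzs/fcvtzu paths in detail:: above. Values are
// hypothetical.
//   const Full128<float> df;
//   const RebindToSigned<decltype(df)> di;
//   const auto i = ConvertTo(di, Set(df, -1.7f));   // each lane becomes -1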
4247
4248// ------------------------------ PromoteTo (ConvertTo)
4249
4250// Unsigned: zero-extend to full vector.
4251template <class D, HWY_IF_U16_D(D)>
4252HWY_API Vec128<uint16_t> PromoteTo(D /* tag */, Vec64<uint8_t> v) {
4253 return Vec128<uint16_t>(vmovl_u8(v.raw));
4254}
4255template <class D, HWY_IF_U32_D(D)>
4256HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
4257 uint16x8_t a = vmovl_u8(v.raw);
4258 return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
4259}
4260template <class D, HWY_IF_U32_D(D)>
4261HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
4262 return Vec128<uint32_t>(vmovl_u16(v.raw));
4263}
4264template <class D, HWY_IF_U64_D(D)>
4265HWY_API Vec128<uint64_t> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
4266 return Vec128<uint64_t>(vmovl_u32(v.raw));
4267}
4268template <class D, HWY_IF_I16_D(D)>
4269HWY_API Vec128<int16_t> PromoteTo(D d, Vec64<uint8_t> v) {
4270 return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
4271}
4272template <class D, HWY_IF_I32_D(D)>
4273HWY_API Vec128<int32_t> PromoteTo(D d, Vec32<uint8_t> v) {
4274 uint16x8_t a = vmovl_u8(v.raw);
4275 return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
4276}
4277template <class D, HWY_IF_I32_D(D)>
4278HWY_API Vec128<int32_t> PromoteTo(D d, Vec64<uint16_t> v) {
4279 return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
4280}
4281template <class D, HWY_IF_I64_D(D)>
4282HWY_API Vec128<int64_t> PromoteTo(D d, Vec64<uint32_t> v) {
4283 return BitCast(d, Vec128<uint64_t>(vmovl_u32(v.raw)));
4284}
4285
4286// Unsigned: zero-extend to half vector.
4287template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
4288HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
4289 return VFromD<D>(vget_low_u16(vmovl_u8(v.raw)));
4290}
4291template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
4292HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
4293 return VFromD<D>(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw)))));
4294}
4295template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
4296HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
4297 return VFromD<D>(vget_low_u32(vmovl_u16(v.raw)));
4298}
4299template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
4300HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4301 return VFromD<D>(vget_low_u64(vmovl_u32(v.raw)));
4302}
4303template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
4304HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
4305 using VU16 = VFromD<RebindToUnsigned<D>>;
4306 return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw))));
4307}
4308template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4309HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
4310 const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw)));
4311 return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(u32)));
4312}
4313template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4314HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
4315 return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw))));
4316}
4317template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
4318HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
4319 using DU = RebindToUnsigned<D>;
4320 return BitCast(d, VFromD<DU>(vget_low_u64(vmovl_u32(v.raw))));
4321}
4322
4323// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to
4324// TFromD<D>
4325template <class D, class V, HWY_IF_UI64_D(D),
4327          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4328HWY_API VFromD<D> PromoteTo(D d, V v) {
4329 const Rebind<uint32_t, decltype(d)> du32;
4330 return PromoteTo(d, PromoteTo(du32, v));
4331}
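// Example (illustrative sketch): thanks to the two-step widening above, u8
// lanes can be promoted straight to u64 in a single call. Tags and values are
// hypothetical.
//   const Full128<uint64_t> d64;                     // 2 lanes
//   const Rebind<uint8_t, decltype(d64)> d8;         // 2 matching u8 lanes
//   const auto wide = PromoteTo(d64, Set(d8, 255));  // lanes become 255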
4332
4333// Signed: replicate sign bit to full vector.
4334template <class D, HWY_IF_I16_D(D)>
4335HWY_API Vec128<int16_t> PromoteTo(D /* tag */, Vec64<int8_t> v) {
4336 return Vec128<int16_t>(vmovl_s8(v.raw));
4337}
4338template <class D, HWY_IF_I32_D(D)>
4339HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec32<int8_t> v) {
4340 int16x8_t a = vmovl_s8(v.raw);
4341 return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
4342}
4343template <class D, HWY_IF_I32_D(D)>
4344HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec64<int16_t> v) {
4345 return Vec128<int32_t>(vmovl_s16(v.raw));
4346}
4347template <class D, HWY_IF_I64_D(D)>
4348HWY_API Vec128<int64_t> PromoteTo(D /* tag */, Vec64<int32_t> v) {
4349 return Vec128<int64_t>(vmovl_s32(v.raw));
4350}
4351
4352// Signed: replicate sign bit to half vector.
4353template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
4354HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
4355 return VFromD<D>(vget_low_s16(vmovl_s8(v.raw)));
4356}
4357template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4358HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
4359 return VFromD<D>(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw)))));
4360}
4361template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4362HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
4363 return VFromD<D>(vget_low_s32(vmovl_s16(v.raw)));
4364}
4365template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
4366HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4367 return VFromD<D>(vget_low_s64(vmovl_s32(v.raw)));
4368}
4369
4370// I8/I16 to I64: First, promote to I32, and then promote to I64
4371template <class D, class V, HWY_IF_I64_D(D),
4373 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4374HWY_API VFromD<D> PromoteTo(D d, V v) {
4375 const Rebind<int32_t, decltype(d)> di32;
4376 return PromoteTo(d, PromoteTo(di32, v));
4377}
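// Example (illustrative sketch): the signed chain keeps the sign through the
// intermediate i32 step. Tags and values are hypothetical.
//   const Full128<int64_t> d64;
//   const Rebind<int8_t, decltype(d64)> d8;
//   const auto wide = PromoteTo(d64, Set(d8, -1));   // lanes become -1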
4378
4379#if HWY_NEON_HAVE_F16C
4380
4381// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
4382#ifdef HWY_NATIVE_F16C
4383#undef HWY_NATIVE_F16C
4384#else
4385#define HWY_NATIVE_F16C
4386#endif
4387
4388template <class D, HWY_IF_F32_D(D)>
4389HWY_API Vec128<float> PromoteTo(D /* tag */, Vec64<float16_t> v) {
4390 return Vec128<float>(vcvt_f32_f16(v.raw));
4391}
4392template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
4393HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
4394 return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
4395}
4396
4397#endif // HWY_NEON_HAVE_F16C
4398
4399#if HWY_HAVE_FLOAT64
4400
4401template <class D, HWY_IF_F64_D(D)>
4402HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<float> v) {
4403 return Vec128<double>(vcvt_f64_f32(v.raw));
4404}
4405
4406template <class D, HWY_IF_F64_D(D)>
4407HWY_API Vec64<double> PromoteTo(D /* tag */, Vec32<float> v) {
4408 return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
4409}
4410
4411template <class D, HWY_IF_F64_D(D)>
4412HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<int32_t> v) {
4413 const int64x2_t i64 = vmovl_s32(v.raw);
4414 return Vec128<double>(vcvtq_f64_s64(i64));
4415}
4416
4417template <class D, HWY_IF_F64_D(D)>
4418HWY_API Vec64<double> PromoteTo(D d, Vec32<int32_t> v) {
4419 return ConvertTo(d, Vec64<int64_t>(vget_low_s64(vmovl_s32(v.raw))));
4420}
4421
4422template <class D, HWY_IF_F64_D(D)>
4423HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
4424 const uint64x2_t u64 = vmovl_u32(v.raw);
4425 return Vec128<double>(vcvtq_f64_u64(u64));
4426}
4427
4428template <class D, HWY_IF_F64_D(D)>
4429HWY_API Vec64<double> PromoteTo(D d, Vec32<uint32_t> v) {
4430 return ConvertTo(d, Vec64<uint64_t>(vget_low_u64(vmovl_u32(v.raw))));
4431}
4432
4433template <class D, HWY_IF_UI64_D(D)>
4434HWY_API VFromD<D> PromoteTo(D d64, VFromD<Rebind<float, D>> v) {
4435 const RebindToFloat<decltype(d64)> df64;
4436 return ConvertTo(d64, PromoteTo(df64, v));
4437}
4438
4439#else // !HWY_HAVE_FLOAT64
4440
4441template <class D, HWY_IF_I64_D(D)>
4442HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
4443 const Rebind<int32_t, decltype(di64)> di32;
4444 const RebindToFloat<decltype(di32)> df32;
4445 const RebindToUnsigned<decltype(di32)> du32;
4446 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
4447
4448 const auto exponent_adj = BitCast(
4449 du32,
4450 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
4451 BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
4452 BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
4453 const auto adj_v =
4454 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
4455
4456 const auto f32_to_i32_result = ConvertTo(di32, adj_v);
4457 const auto lo64_or_mask = PromoteTo(
4458 di64,
4459 BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
4460 Set(di32, LimitsMax<int32_t>())))));
4461
4462 return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
4463 << PromoteTo(di64, exponent_adj),
4464 lo64_or_mask);
4465}
4466
4467template <class D, HWY_IF_U64_D(D)>
4468HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
4469 const Rebind<uint32_t, decltype(du64)> du32;
4470 const RebindToFloat<decltype(du32)> df32;
4471 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
4472
4473 const auto exponent_adj = BitCast(
4474 du32,
4475 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
4476 BitCast(du32_as_du8, Set(du32, uint32_t{158}))),
4477 BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
4478
4479 const auto adj_v =
4480 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
4481 const auto f32_to_u32_result = ConvertTo(du32, adj_v);
4482 const auto lo32_or_mask = PromoteTo(
4483 du64,
4484 VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>())));
4485
4486 return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj),
4487 lo32_or_mask);
4488}
4489
4490#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
4491#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
4492#else
4493#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
4494#endif
4495
4496template <class D, HWY_IF_UI64_D(D)>
4497HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
4498 const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
4499 const RebindToFloat<decltype(d32)> df32;
4500 const RebindToUnsigned<decltype(d32)> du32;
4501 const Repartition<uint8_t, decltype(d32)> du32_as_du8;
4502
4503 constexpr uint32_t kExpAdjDecr =
4504 0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());
4505
4506 const auto exponent_adj = BitCast(
4507 du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
4508 BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
4509 const auto adj_v =
4510 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
4511
4512 return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
4513}
4514
4515#endif // HWY_HAVE_FLOAT64
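// Example (illustrative sketch): promoting float to 64-bit integers uses the
// f64 intrinsics when available and the exponent-adjustment fallback above
// otherwise; the in-range variant additionally assumes every lane fits the
// destination type. Tags and values are hypothetical.
//   const Full128<int64_t> d64;
//   const Rebind<float, decltype(d64)> df;
//   const auto i64 = PromoteTo(d64, Set(df, 3.9f));  // lanes become 3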
4516
4517// ------------------------------ PromoteUpperTo
4518
4519#if HWY_ARCH_ARM_A64
4520
4521// Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo.
4522#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
4523#undef HWY_NATIVE_PROMOTE_UPPER_TO
4524#else
4525#define HWY_NATIVE_PROMOTE_UPPER_TO
4526#endif
4527
4528// Unsigned: zero-extend to full vector.
4529template <class D, HWY_IF_U16_D(D)>
4530HWY_API Vec128<uint16_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) {
4531 return Vec128<uint16_t>(vmovl_high_u8(v.raw));
4532}
4533template <class D, HWY_IF_U32_D(D)>
4534HWY_API Vec128<uint32_t> PromoteUpperTo(D /* tag */, Vec128<uint16_t> v) {
4535 return Vec128<uint32_t>(vmovl_high_u16(v.raw));
4536}
4537template <class D, HWY_IF_U64_D(D)>
4538HWY_API Vec128<uint64_t> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) {
4539 return Vec128<uint64_t>(vmovl_high_u32(v.raw));
4540}
4541template <class D, HWY_IF_I16_D(D)>
4542HWY_API Vec128<int16_t> PromoteUpperTo(D d, Vec128<uint8_t> v) {
4543 return BitCast(d, Vec128<uint16_t>(vmovl_high_u8(v.raw)));
4544}
4545template <class D, HWY_IF_I32_D(D)>
4546HWY_API Vec128<int32_t> PromoteUpperTo(D d, Vec128<uint16_t> v) {
4547 return BitCast(d, Vec128<uint32_t>(vmovl_high_u16(v.raw)));
4548}
4549template <class D, HWY_IF_I64_D(D)>
4550HWY_API Vec128<int64_t> PromoteUpperTo(D d, Vec128<uint32_t> v) {
4551 return BitCast(d, Vec128<uint64_t>(vmovl_high_u32(v.raw)));
4552}
4553
4554// Signed: replicate sign bit to full vector.
4555template <class D, HWY_IF_I16_D(D)>
4556HWY_API Vec128<int16_t> PromoteUpperTo(D /* tag */, Vec128<int8_t> v) {
4557 return Vec128<int16_t>(vmovl_high_s8(v.raw));
4558}
4559template <class D, HWY_IF_I32_D(D)>
4560HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<int16_t> v) {
4561 return Vec128<int32_t>(vmovl_high_s16(v.raw));
4562}
4563template <class D, HWY_IF_I64_D(D)>
4564HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
4565 return Vec128<int64_t>(vmovl_high_s32(v.raw));
4566}
4567
4568#if HWY_NEON_HAVE_F16C
4569
4570template <class D, HWY_IF_F32_D(D)>
4571HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
4572 return Vec128<float>(vcvt_high_f32_f16(v.raw));
4573}
4574
4575#endif // HWY_NEON_HAVE_F16C
4576
4577template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
4578HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
4579 const Repartition<uint16_t, decltype(df32)> du16;
4580 const RebindToSigned<decltype(df32)> di32;
4581 return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
4582}
4583
4584#if HWY_HAVE_FLOAT64
4585
4586template <class D, HWY_IF_F64_D(D)>
4587HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<float> v) {
4588 return Vec128<double>(vcvt_high_f64_f32(v.raw));
4589}
4590
4591template <class D, HWY_IF_F64_D(D)>
4592HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
4593 const int64x2_t i64 = vmovl_high_s32(v.raw);
4594 return Vec128<double>(vcvtq_f64_s64(i64));
4595}
4596
4597template <class D, HWY_IF_F64_D(D)>
4598HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) {
4599 const uint64x2_t u64 = vmovl_high_u32(v.raw);
4600 return Vec128<double>(vcvtq_f64_u64(u64));
4601}
4602
4603#endif // HWY_HAVE_FLOAT64
4604
4605template <class D, HWY_IF_UI64_D(D)>
4606HWY_API VFromD<D> PromoteUpperTo(D d64, Vec128<float> v) {
4607#if HWY_HAVE_FLOAT64
4608 const RebindToFloat<decltype(d64)> df64;
4609 return ConvertTo(d64, PromoteUpperTo(df64, v));
4610#else
4611 const Rebind<float, decltype(d64)> dh;
4612 return PromoteTo(d64, UpperHalf(dh, v));
4613#endif
4614}
4615
4616// Generic version for <=64 bit input/output (_high is only for full vectors).
4617template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
4618HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
4619 const Rebind<TFromV<V>, decltype(d)> dh;
4620 return PromoteTo(d, UpperHalf(dh, v));
4621}
4622
4623#endif // HWY_ARCH_ARM_A64
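// Example (illustrative sketch): PromoteUpperTo widens the upper half of a
// full vector without a separate UpperHalf call. Tags and values are
// hypothetical.
//   const Full128<int32_t> d32;
//   const Repartition<int16_t, decltype(d32)> d16;      // 8 x i16
//   const auto hi = PromoteUpperTo(d32, Iota(d16, 0));  // 4, 5, 6, 7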
4624
4625// ------------------------------ DemoteTo (ConvertTo)
4626
4627// From full vector to half or quarter
4628template <class D, HWY_IF_U16_D(D)>
4629HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
4630 return Vec64<uint16_t>(vqmovun_s32(v.raw));
4631}
4632template <class D, HWY_IF_I16_D(D)>
4633HWY_API Vec64<int16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
4634 return Vec64<int16_t>(vqmovn_s32(v.raw));
4635}
4636template <class D, HWY_IF_U8_D(D)>
4637HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
4638 const uint16x4_t a = vqmovun_s32(v.raw);
4639 return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
4640}
4641template <class D, HWY_IF_U8_D(D)>
4642HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
4643 return Vec64<uint8_t>(vqmovun_s16(v.raw));
4644}
4645template <class D, HWY_IF_I8_D(D)>
4646HWY_API Vec32<int8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
4647 const int16x4_t a = vqmovn_s32(v.raw);
4648 return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
4649}
4650template <class D, HWY_IF_I8_D(D)>
4651HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
4652 return Vec64<int8_t>(vqmovn_s16(v.raw));
4653}
4654template <class D, HWY_IF_U16_D(D)>
4655HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
4656 return Vec64<uint16_t>(vqmovn_u32(v.raw));
4657}
4658template <class D, HWY_IF_U8_D(D)>
4659HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
4660 const uint16x4_t a = vqmovn_u32(v.raw);
4661 return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
4662}
4663template <class D, HWY_IF_U8_D(D)>
4664HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<uint16_t> v) {
4665 return Vec64<uint8_t>(vqmovn_u16(v.raw));
4666}
4667
4668// From half vector to partial half
4669template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
4670HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4671 return VFromD<D>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
4672}
4673template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
4674HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4675 return VFromD<D>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
4676}
4677template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
4678HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4679 const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
4680 return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
4681}
4682template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
4683HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
4684 return VFromD<D>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
4685}
4686template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
4687HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4688 const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
4689 return VFromD<D>(vqmovn_s16(vcombine_s16(a, a)));
4690}
4691template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
4692HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
4693 return VFromD<D>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
4694}
4695template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
4696HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4697 return VFromD<D>(vqmovn_u32(vcombine_u32(v.raw, v.raw)));
4698}
4699template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
4700HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4701 const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw));
4702 return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
4703}
4704template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
4705HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
4706 return VFromD<D>(vqmovn_u16(vcombine_u16(v.raw, v.raw)));
4707}
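// Example (illustrative sketch): the vqmovn/vqmovun-based overloads above
// saturate instead of wrapping. Tags and values are hypothetical.
//   const Full128<int32_t> d32;
//   const Rebind<int16_t, decltype(d32)> d16;
//   const auto n = DemoteTo(d16, Set(d32, 100000));  // lanes become 32767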
4708
4709template <class D, HWY_IF_I32_D(D)>
4710HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
4711 return Vec64<int32_t>(vqmovn_s64(v.raw));
4712}
4713template <class D, HWY_IF_U32_D(D)>
4714HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
4715 return Vec64<uint32_t>(vqmovun_s64(v.raw));
4716}
4717template <class D, HWY_IF_U32_D(D)>
4718HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<uint64_t> v) {
4719 return Vec64<uint32_t>(vqmovn_u64(v.raw));
4720}
4721template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
4722          HWY_IF_SIGNED_D(D)>
4723HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
4724 const Rebind<int32_t, D> di32;
4725 return DemoteTo(d, DemoteTo(di32, v));
4726}
4727template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
4728          HWY_IF_UNSIGNED_D(D)>
4729HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
4730 const Rebind<uint32_t, D> du32;
4731 return DemoteTo(d, DemoteTo(du32, v));
4732}
4733template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
4734          HWY_IF_UNSIGNED_D(D)>
4735HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) {
4736 const Rebind<uint32_t, D> du32;
4737 return DemoteTo(d, DemoteTo(du32, v));
4738}
4739
4740template <class D, HWY_IF_I32_D(D)>
4741HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
4742 return Vec32<int32_t>(vqmovn_s64(vcombine_s64(v.raw, v.raw)));
4743}
4744template <class D, HWY_IF_U32_D(D)>
4745HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
4746 return Vec32<uint32_t>(vqmovun_s64(vcombine_s64(v.raw, v.raw)));
4747}
4748template <class D, HWY_IF_U32_D(D)>
4749HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<uint64_t> v) {
4750 return Vec32<uint32_t>(vqmovn_u64(vcombine_u64(v.raw, v.raw)));
4751}
4752template <class D, HWY_IF_SIGNED_D(D),
4753          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
4754HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
4755 const Rebind<int32_t, D> di32;
4756 return DemoteTo(d, DemoteTo(di32, v));
4757}
4758template <class D, HWY_IF_UNSIGNED_D(D),
4759 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
4760HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
4761 const Rebind<uint32_t, D> du32;
4762 return DemoteTo(d, DemoteTo(du32, v));
4763}
4764template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_UNSIGNED_D(D),
4765          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
4766HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
4767 const Rebind<uint32_t, D> du32;
4768 return DemoteTo(d, DemoteTo(du32, v));
4769}
4770
4771#if HWY_NEON_HAVE_F16C
4772
4773// We already toggled HWY_NATIVE_F16C above.
4774
4775template <class D, HWY_IF_F16_D(D)>
4776HWY_API Vec64<float16_t> DemoteTo(D /* tag */, Vec128<float> v) {
4777 return Vec64<float16_t>{vcvt_f16_f32(v.raw)};
4778}
4779template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
4780HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
4781 return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
4782}
4783
4784#endif // HWY_NEON_HAVE_F16C
4785
4786#if HWY_NEON_HAVE_F32_TO_BF16C
4787#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
4788#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
4789#else
4790#define HWY_NATIVE_DEMOTE_F32_TO_BF16
4791#endif
4792
4793namespace detail {
4794#if HWY_NEON_HAVE_BFLOAT16
4795// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
4796// bfloat16x4_t or bfloat16x8_t.
4797static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
4798 return raw;
4799}
4800#else
4801// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
4802// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
4803// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
4804// or earlier on AArch64.
4805
4806// The bfloat16x4_t vector returned by vcvt_bf16_f32 needs to be bitcasted to
4807// an uint16x4_t vector if HWY_NEON_HAVE_F32_TO_BF16C &&
4808// !HWY_NEON_HAVE_BFLOAT16 is true.
4809static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
4810 return vreinterpret_u16_bf16(raw);
4811}
4812#endif
4813} // namespace detail
4814
4815template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
4816HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
4817 return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
4818}
4819template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
4820HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
4821 return VFromD<D>(detail::BitCastFromRawNeonBF16(
4822 vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
4823}
4824#endif // HWY_NEON_HAVE_F32_TO_BF16C
4825
4826#if HWY_HAVE_FLOAT64
4827
4828template <class D, HWY_IF_F32_D(D)>
4829HWY_API Vec64<float> DemoteTo(D /* tag */, Vec128<double> v) {
4830 return Vec64<float>(vcvt_f32_f64(v.raw));
4831}
4832template <class D, HWY_IF_F32_D(D)>
4833HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
4834 return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
4835}
4836
4837template <class D, HWY_IF_UI32_D(D)>
4838HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
4839 const Rebind<MakeWide<TFromD<D>>, D> d64;
4840 return DemoteTo(d32, ConvertTo(d64, v));
4841}
4842
4843#endif // HWY_HAVE_FLOAT64
4844
4845template <class D, HWY_IF_F32_D(D)>
4846HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
4847 const Rebind<int64_t, decltype(df32)> di64;
4848 const RebindToUnsigned<decltype(di64)> du64;
4849
4850#if HWY_ARCH_ARM_A64
4851 const RebindToFloat<decltype(du64)> df64;
4852
4853 const auto k2p64_63 = Set(df64, 27670116110564327424.0);
4854 const auto f64_hi52 =
4855 Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
4856 const auto f64_lo12 =
4857 ConvertTo(df64, And(BitCast(du64, v), Set(du64, uint64_t{0x00000FFF})));
4858
4859 const auto f64_sum = f64_hi52 + f64_lo12;
4860 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
4861
4862 const auto f64_sum_is_inexact =
4863 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
4864 const auto f64_bits_decrement =
4865 And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
4866 f64_sum_is_inexact);
4867
4868 const auto adj_f64_val = BitCast(
4869 df64,
4870 Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
4871
4872 return DemoteTo(df32, adj_f64_val);
4873#else
4874 const RebindToUnsigned<decltype(df32)> du32;
4875 const auto hi23 = TruncateTo(du32, ShiftRight<41>(BitCast(du64, v)));
4876 const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(BitCast(du64, v))),
4877 Set(du32, uint32_t{0x007FFFFFu}));
4878 const auto lo18 =
4879 And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x0003FFFFu}));
4880
4881 const auto k2p41_f32 = Set(df32, 2199023255552.0f);
4882 const auto k2p64_63_f32 = Set(df32, 27670116110564327424.0f);
4883
4884 const auto hi23_f32 =
4885 BitCast(df32, Xor(hi23, BitCast(du32, k2p64_63_f32))) - k2p64_63_f32;
4886 const auto mid23_f32 =
4887 BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
4888 const auto lo18_f32 = ConvertTo(df32, lo18);
4889
4890 const auto s_hi46 = hi23_f32 + mid23_f32;
4891 const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;
4892
4893 auto s_lo = c_hi46 + lo18_f32;
4894 const auto c_lo = (c_hi46 - s_lo) + lo18_f32;
4895
4896 const auto s_lo_inexact_mask =
4897 VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
4898 const auto s_lo_mag_adj = ShiftRight<31>(
4899 And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));
4900
4901 s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
4902 s_lo =
4903 BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
4904 return s_hi46 + s_lo;
4905#endif
4906}
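// Note (illustrative): the split into high and low parts above, together with
// the carry and sticky-bit adjustment, is there so the final f32 result is
// effectively rounded once rather than twice. A hypothetical call:
//   const Full64<float> df32;                        // 2 lanes
//   const Rebind<int64_t, decltype(df32)> di64;
//   const auto f = DemoteTo(df32, Set(di64, int64_t{1} << 40));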
4907
4908template <class D, HWY_IF_F32_D(D)>
4909HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
4910#if HWY_ARCH_ARM_A64
4911 const Rebind<uint64_t, decltype(df32)> du64;
4912 const RebindToFloat<decltype(du64)> df64;
4913
4914 const auto k2p64 = Set(df64, 18446744073709551616.0);
4915 const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
4916 const auto f64_lo12 =
4917 ConvertTo(df64, And(v, Set(du64, uint64_t{0x00000FFF})));
4918
4919 const auto f64_sum = f64_hi52 + f64_lo12;
4920 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
4921 const auto f64_sum_is_inexact =
4922 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
4923
4924 const auto adj_f64_val = BitCast(
4925 df64,
4926 Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
4927 f64_sum_is_inexact));
4928
4929 return DemoteTo(df32, adj_f64_val);
4930#else
4931 const RebindToUnsigned<decltype(df32)> du32;
4932
4933 const auto hi23 = TruncateTo(du32, ShiftRight<41>(v));
4934 const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(v)),
4935 Set(du32, uint32_t{0x007FFFFFu}));
4936 const auto lo18 = And(TruncateTo(du32, v), Set(du32, uint32_t{0x0003FFFFu}));
4937
4938 const auto k2p41_f32 = Set(df32, 2199023255552.0f);
4939 const auto k2p64_f32 = Set(df32, 18446744073709551616.0f);
4940
4941 const auto hi23_f32 =
4942 BitCast(df32, Or(hi23, BitCast(du32, k2p64_f32))) - k2p64_f32;
4943 const auto mid23_f32 =
4944 BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
4945 const auto lo18_f32 = ConvertTo(df32, lo18);
4946
4947 const auto s_hi46 = hi23_f32 + mid23_f32;
4948 const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;
4949
4950 auto s_lo = c_hi46 + lo18_f32;
4951 const auto c_lo = (c_hi46 - s_lo) + lo18_f32;
4952
4953 const auto s_lo_inexact_mask =
4954 VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
4955 const auto s_lo_mag_adj = ShiftRight<31>(
4956 And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));
4957
4958 s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
4959 s_lo =
4960 BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
4961 return s_hi46 + s_lo;
4962#endif
4963}
4964
4965HWY_API Vec32<uint8_t> U8FromU32(Vec128<uint32_t> v) {
4966 const uint8x16_t org_v = detail::BitCastToByte(v).raw;
4967 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
4968 return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
4969}
4970template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
4971HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
4972 const uint8x8_t org_v = detail::BitCastToByte(v).raw;
4973 const uint8x8_t w = vuzp1_u8(org_v, org_v);
4974 return Vec128<uint8_t, N>(vuzp1_u8(w, w));
4975}
4976
4977// ------------------------------ Round (IfThenElse, mask, logical)
4978
4979#if HWY_ARCH_ARM_A64
4980// Toward nearest integer
4981HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
4982
4983// Toward zero, aka truncate
4984HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
4985
4986// Toward +infinity, aka ceiling
4987HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
4988
4989// Toward -infinity, aka floor
4990HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
4991#else
4992
4993// ------------------------------ Trunc
4994
4995// Armv7 only supports truncation to integer. We can either convert back to
4996// float (3 floating-point and 2 logic operations) or manipulate the binary32
4997// representation, clearing the lowest 23-exp mantissa bits. This requires 9
4998// integer operations and 3 constants, which is likely more expensive.
4999
5000namespace detail {
5001
5002// The original value is already the desired result if NaN or the magnitude is
5003// large (i.e. the value is already an integer).
5004template <size_t N>
5005HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
5006 return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
5007}
5008
5009} // namespace detail
5010
5011template <size_t N>
5012HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
5013 const DFromV<decltype(v)> df;
5014 const RebindToSigned<decltype(df)> di;
5015
5016 const auto integer = ConvertTo(di, v); // round toward 0
5017 const auto int_f = ConvertTo(df, integer);
5018
5019 return IfThenElse(detail::UseInt(v), int_f, v);
5020}
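// Example (illustrative sketch): on either code path, Trunc rounds toward
// zero and returns NaN and large-magnitude inputs unchanged. Values are
// hypothetical.
//   const Full128<float> df;
//   const auto t = Trunc(Set(df, -2.5f));   // lanes become -2.0f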
5021
5022template <size_t N>
5023HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
5024 const DFromV<decltype(v)> df;
5025
5026 // Armv7 also lacks a native NearestInt, but we can instead rely on rounding
5027 // (we assume the current mode is nearest-even) after addition with a large
5028 // value such that no mantissa bits remain. We may need a compiler flag for
5029 // precise floating-point to prevent this from being "optimized" out.
5030 const auto max = Set(df, MantissaEnd<float>());
5031 const auto large = CopySignToAbs(max, v);
5032 const auto added = large + v;
5033 const auto rounded = added - large;
5034
5035 // Keep original if NaN or the magnitude is large (already an int).
5036 return IfThenElse(Abs(v) < max, rounded, v);
5037}
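// Worked example (illustrative) of the large-constant trick above, assuming
// round-to-nearest-even: with v = 2.5f and large = 8388608.0f
// (MantissaEnd<float>()), v + large rounds to 8388610.0f, and subtracting
// large yields 2.0f, i.e. ties go to even.
//   const Full128<float> df;
//   const auto r = Round(Set(df, 2.5f));   // lanes become 2.0f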
5038
5039template <size_t N>
5040HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5041 const DFromV<decltype(v)> df;
5042 const RebindToSigned<decltype(df)> di;
5043
5044 const auto integer = ConvertTo(di, v); // round toward 0
5045 const auto int_f = ConvertTo(df, integer);
5046
5047 // Truncating a positive non-integer ends up smaller; if so, add 1.
5048 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
5049
5050 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
5051}
5052
5053template <size_t N>
5054HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
5055 const DFromV<decltype(v)> df;
5056 const RebindToSigned<decltype(df)> di;
5057
5058 const auto integer = ConvertTo(di, v); // round toward 0
5059 const auto int_f = ConvertTo(df, integer);
5060
5061 // Truncating a negative non-integer ends up larger; if so, subtract 1.
5062 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
5063
5064 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
5065}
5066
5067#endif
5068
5069// ------------------------------ NearestInt (Round)
5070
5071#if HWY_ARCH_ARM_A64
5072
5073HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
5074 return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
5075}
5076template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
5077HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5078 return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
5079}
5080
5081#else
5082
5083template <size_t N>
5084HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5085 const RebindToSigned<DFromV<decltype(v)>> di;
5086 return ConvertTo(di, Round(v));
5087}
5088
5089#endif
5090
5091// ------------------------------ Floating-point classification
5092template <typename T, size_t N>
5093HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5094 return v != v;
5095}
5096
5097// ================================================== SWIZZLE
5098
5099// ------------------------------ LowerHalf
5100
5101// <= 64 bit: just return different type
5102template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
5103HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
5104 return Vec128<T, N / 2>(v.raw);
5105}
5106
5107HWY_API Vec64<uint8_t> LowerHalf(Vec128<uint8_t> v) {
5108 return Vec64<uint8_t>(vget_low_u8(v.raw));
5109}
5110HWY_API Vec64<uint16_t> LowerHalf(Vec128<uint16_t> v) {
5111 return Vec64<uint16_t>(vget_low_u16(v.raw));
5112}
5113HWY_API Vec64<uint32_t> LowerHalf(Vec128<uint32_t> v) {
5114 return Vec64<uint32_t>(vget_low_u32(v.raw));
5115}
5116HWY_API Vec64<uint64_t> LowerHalf(Vec128<uint64_t> v) {
5117 return Vec64<uint64_t>(vget_low_u64(v.raw));
5118}
5119HWY_API Vec64<int8_t> LowerHalf(Vec128<int8_t> v) {
5120 return Vec64<int8_t>(vget_low_s8(v.raw));
5121}
5122HWY_API Vec64<int16_t> LowerHalf(Vec128<int16_t> v) {
5123 return Vec64<int16_t>(vget_low_s16(v.raw));
5124}
5125HWY_API Vec64<int32_t> LowerHalf(Vec128<int32_t> v) {
5126 return Vec64<int32_t>(vget_low_s32(v.raw));
5127}
5128HWY_API Vec64<int64_t> LowerHalf(Vec128<int64_t> v) {
5129 return Vec64<int64_t>(vget_low_s64(v.raw));
5130}
5131HWY_API Vec64<float> LowerHalf(Vec128<float> v) {
5132 return Vec64<float>(vget_low_f32(v.raw));
5133}
5134#if HWY_HAVE_FLOAT16
5135HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
5136 return Vec64<float16_t>(vget_low_f16(v.raw));
5137}
5138#endif // HWY_HAVE_FLOAT16
5139#if HWY_NEON_HAVE_BFLOAT16
5140HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
5141 return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
5142}
5143#endif // HWY_NEON_HAVE_BFLOAT16
5144#if HWY_HAVE_FLOAT64
5145HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
5146 return Vec64<double>(vget_low_f64(v.raw));
5147}
5148#endif // HWY_HAVE_FLOAT64
5149
5150template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
5151HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
5152 const Full128<uint16_t> du;
5153 const Half<DFromV<V>> dh;
5154 return BitCast(dh, LowerHalf(BitCast(du, v)));
5155}
5156
5157template <class DH>
5158HWY_API VFromD<DH> LowerHalf(DH /* tag */, VFromD<Twice<DH>> v) {
5159 return LowerHalf(v);
5160}
5161
5162// ------------------------------ CombineShiftRightBytes
5163
5164// 128-bit
5165template <int kBytes, class D, typename T = TFromD<D>>
5166HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) {
5167 static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
5168 const Repartition<uint8_t, decltype(d)> d8;
5169 uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
5170 return BitCast(d, Vec128<uint8_t>(v8));
5171}
5172
5173// 64-bit
5174template <int kBytes, class D, typename T = TFromD<D>>
5175HWY_API Vec64<T> CombineShiftRightBytes(D d, Vec64<T> hi, Vec64<T> lo) {
5176 static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
5177 const Repartition<uint8_t, decltype(d)> d8;
5178 uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
5179 return BitCast(d, VFromD<decltype(d8)>(v8));
5180}
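// Example (illustrative sketch): CombineShiftRightBytes<k> returns the
// full-width byte window starting k bytes into the concatenation {lo, hi}.
// Values are hypothetical.
//   const Full128<uint8_t> d;
//   const auto lo = Iota(d, 0);                           // bytes 0..15
//   const auto hi = Iota(d, 16);                          // bytes 16..31
//   const auto w = CombineShiftRightBytes<4>(d, hi, lo);  // bytes 4..19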
5181
5182// <= 32-bit defined after ShiftLeftBytes.
5183
5184// ------------------------------ Shift vector by constant #bytes
5185
5186namespace detail {
5187
5188// Partially specialize because kBytes = 0 and >= size are compile errors;
5189// callers replace the latter with 0xFF for easier specialization.
5190template <int kBytes>
5191struct ShiftLeftBytesT {
5192 // Full
5193 template <class T>
5194 HWY_INLINE Vec128<T> operator()(const Vec128<T> v) {
5195 const Full128<T> d;
5196 return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
5197 }
5198
5199 // Partial
5200 template <class T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
5201 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
5202 // Expand to 64-bit so we only use the native EXT instruction.
5203 const Full64<T> d64;
5204 const auto zero64 = Zero(d64);
5205 const decltype(zero64) v64(v.raw);
5206 return Vec128<T, N>(
5207 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
5208 }
5209};
5210template <>
5211struct ShiftLeftBytesT<0> {
5212 template <class T, size_t N>
5213 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
5214 return v;
5215 }
5216};
5217template <>
5218struct ShiftLeftBytesT<0xFF> {
5219 template <class T, size_t N>
5220 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
5221 return Xor(v, v);
5222 }
5223};
5224
5225template <int kBytes>
5226struct ShiftRightBytesT {
5227 template <class T, size_t N>
5228 HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) {
5229 const DFromV<decltype(v)> d;
5230 // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
5231 if (d.MaxBytes() < 8) {
5232 constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8;
5233 const Simd<T, kReg / sizeof(T), 0> dreg;
5234 v = Vec128<T, N>(
5235 IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
5236 }
5237 return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
5238 }
5239};
5240template <>
5241struct ShiftRightBytesT<0> {
5242 template <class T, size_t N>
5243 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
5244 return v;
5245 }
5246};
5247template <>
5248struct ShiftRightBytesT<0xFF> {
5249 template <class T, size_t N>
5250 HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
5251 return Xor(v, v);
5252 }
5253};
5254
5255} // namespace detail
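// Example (illustrative sketch): the public wrappers below shift in zeros,
// and a shift by at least the vector size is routed to the 0xFF
// specializations above, which return zero. Values are hypothetical.
//   const Full64<uint8_t> d;                 // 8 bytes
//   const auto v = Iota(d, 1);               // 1..8
//   const auto l = ShiftLeftBytes<2>(d, v);  // 0, 0, 1, 2, 3, 4, 5, 6
//   const auto r = ShiftRightBytes<2>(d, v); // 3..8, 0, 0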
5256
5257template <int kBytes, class D>
5258HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
5259 return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(v);
5260}
5261
5262template <int kBytes, typename T, size_t N>
5263HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
5264 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
5265}
5266
5267template <int kLanes, class D>
5268HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
5269 const Repartition<uint8_t, decltype(d)> d8;
5270 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
5271}
5272
5273template <int kLanes, typename T, size_t N>
5274HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
5275 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
5276}
5277
5278// 0x01..0F, kBytes = 1 => 0x0001..0E
5279template <int kBytes, class D>
5280HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
5281 return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(
5282 v);
5283}
5284
5285template <int kLanes, class D>
5286HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
5287 const Repartition<uint8_t, decltype(d)> d8;
5288 return BitCast(
5289 d, ShiftRightBytes<kLanes * sizeof(TFromD<D>)>(d8, BitCast(d8, v)));
5290}
5291
5292// Calls ShiftLeftBytes
5293template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 4)>
5294HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
5295 constexpr size_t kSize = d.MaxBytes();
5296 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
5297 const Repartition<uint8_t, decltype(d)> d8;
5298 const Full64<uint8_t> d_full8;
5299 const Repartition<TFromD<D>, decltype(d_full8)> d_full;
5300 using V64 = VFromD<decltype(d_full8)>;
5301 const V64 hi64(BitCast(d8, hi).raw);
5302 // Move into most-significant bytes
5303 const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
5304 const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
5305 // After casting to full 64-bit vector of correct type, shrink to 32-bit
5306 return VFromD<D>(BitCast(d_full, r).raw);
5307}
5308
5309// ------------------------------ UpperHalf (ShiftRightBytes)
5310
5311// Full input
5312template <class D, HWY_IF_U8_D(D)>
5313HWY_API Vec64<uint8_t> UpperHalf(D /* tag */, Vec128<uint8_t> v) {
5314 return Vec64<uint8_t>(vget_high_u8(v.raw));
5315}
5316template <class D, HWY_IF_U16_D(D)>
5317HWY_API Vec64<uint16_t> UpperHalf(D /* tag */, Vec128<uint16_t> v) {
5318 return Vec64<uint16_t>(vget_high_u16(v.raw));
5319}
5320template <class D, HWY_IF_U32_D(D)>
5321HWY_API Vec64<uint32_t> UpperHalf(D /* tag */, Vec128<uint32_t> v) {
5322 return Vec64<uint32_t>(vget_high_u32(v.raw));
5323}
5324template <class D, HWY_IF_U64_D(D)>
5325HWY_API Vec64<uint64_t> UpperHalf(D /* tag */, Vec128<uint64_t> v) {
5326 return Vec64<uint64_t>(vget_high_u64(v.raw));
5327}
5328template <class D, HWY_IF_I8_D(D)>
5329HWY_API Vec64<int8_t> UpperHalf(D /* tag */, Vec128<int8_t> v) {
5330 return Vec64<int8_t>(vget_high_s8(v.raw));
5331}
5332template <class D, HWY_IF_I16_D(D)>
5333HWY_API Vec64<int16_t> UpperHalf(D /* tag */, Vec128<int16_t> v) {
5334 return Vec64<int16_t>(vget_high_s16(v.raw));
5335}
5336template <class D, HWY_IF_I32_D(D)>
5337HWY_API Vec64<int32_t> UpperHalf(D /* tag */, Vec128<int32_t> v) {
5338 return Vec64<int32_t>(vget_high_s32(v.raw));
5339}
5340template <class D, HWY_IF_I64_D(D)>
5341HWY_API Vec64<int64_t> UpperHalf(D /* tag */, Vec128<int64_t> v) {
5342 return Vec64<int64_t>(vget_high_s64(v.raw));
5343}
5344#if HWY_HAVE_FLOAT16
5345template <class D, HWY_IF_F16_D(D)>
5346HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
5347 return Vec64<float16_t>(vget_high_f16(v.raw));
5348}
5349#endif
5350#if HWY_NEON_HAVE_BFLOAT16
5351template <class D, HWY_IF_BF16_D(D)>
5352HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
5353 return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
5354}
5355#endif // HWY_NEON_HAVE_BFLOAT16
5356template <class D, HWY_IF_F32_D(D)>
5357HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
5358 return Vec64<float>(vget_high_f32(v.raw));
5359}
5360#if HWY_HAVE_FLOAT64
5361template <class D, HWY_IF_F64_D(D)>
5362HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
5363 return Vec64<double>(vget_high_f64(v.raw));
5364}
5365#endif // HWY_HAVE_FLOAT64
5366
5367template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
5368HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
5369 const RebindToUnsigned<Twice<decltype(dh)>> du;
5370 const Half<decltype(du)> duh;
5371 return BitCast(dh, UpperHalf(duh, BitCast(du, v)));
5372}
5373
5374// Partial
5375template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
5376HWY_API VFromD<DH> UpperHalf(DH dh, VFromD<Twice<DH>> v) {
5377 const Twice<DH> d;
5378 const RebindToUnsigned<decltype(d)> du;
5379 const VFromD<decltype(du)> upper =
5380 ShiftRightBytes<dh.MaxBytes()>(du, BitCast(du, v));
5381 return VFromD<DH>(BitCast(d, upper).raw);
5382}
5383
5384// ------------------------------ Broadcast/splat any lane
5385
5386template <int kLane, typename T>
5387HWY_API Vec128<T, 1> Broadcast(Vec128<T, 1> v) {
5388 return v;
5389}
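// Example (illustrative sketch): Broadcast replicates the lane selected at
// compile time to every lane. Tag and values are hypothetical.
//   const Full128<uint32_t> d;
//   const auto b = Broadcast<2>(Iota(d, 0));   // all lanes become 2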
5390
5391#if HWY_ARCH_ARM_A64
5392// Unsigned
5393template <int kLane>
5394HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
5395 static_assert(0 <= kLane && kLane < 16, "Invalid lane");
5396 return Vec128<uint8_t>(vdupq_laneq_u8(v.raw, kLane));
5397}
5398template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
5399 HWY_IF_LANES_GT(N, 1)>
5400HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
5401 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5402 return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
5403}
5404template <int kLane>
5405HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
5406 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5407 return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
5408}
5409template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
5410 HWY_IF_LANES_GT(N, 1)>
5411HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
5412 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5413 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
5414}
5415template <int kLane>
5416HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
5417 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
5418 return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
5419}
5420template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
5421 HWY_IF_LANES_GT(N, 1)>
5422HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
5423 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5424 return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
5425}
5426template <int kLane>
5427HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
5428 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
5429 return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
5430}
5431
5432// Signed
5433template <int kLane>
5434HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
5435 static_assert(0 <= kLane && kLane < 16, "Invalid lane");
5436 return Vec128<int8_t>(vdupq_laneq_s8(v.raw, kLane));
5437}
5438template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
5439 HWY_IF_LANES_GT(N, 1)>
5440HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
5441 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5442 return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
5443}
5444template <int kLane>
5445HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
5446 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5447 return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
5448}
5449template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
5450 HWY_IF_LANES_GT(N, 1)>
5451HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
5452 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5453 return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
5454}
5455template <int kLane>
5456HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
5457 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
5458 return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
5459}
5460template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
5461 HWY_IF_LANES_GT(N, 1)>
5462HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
5463 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5464 return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
5465}
5466template <int kLane>
5467HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
5468 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
5469 return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
5470}
5471
5472// Float
5473#if HWY_HAVE_FLOAT16
5474template <int kLane>
5475HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
5476 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5477 return Vec128<float16_t>(vdupq_laneq_f16(v.raw, kLane));
5478}
5479template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
5480 HWY_IF_LANES_GT(N, 1)>
5481HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
5482 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5483 return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
5484}
5485#endif // HWY_HAVE_FLOAT16
5486
5487#if HWY_NEON_HAVE_BFLOAT16
5488template <int kLane>
5489HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
5490 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5491 return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
5492}
5493template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
5494 HWY_IF_LANES_GT(N, 1)>
5495HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
5496 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5497 return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
5498}
5499#endif // HWY_NEON_HAVE_BFLOAT16
5500
5501template <int kLane>
5502HWY_API Vec128<float> Broadcast(Vec128<float> v) {
5503 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
5504 return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
5505}
5506template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
5507 HWY_IF_LANES_GT(N, 1)>
5508HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
5509 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5510 return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
5511}
5512template <int kLane>
5513HWY_API Vec128<double> Broadcast(Vec128<double> v) {
5514 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
5515 return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
5516}
5517
5518#else // !HWY_ARCH_ARM_A64
5519// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
5520
5521// Unsigned
5522template <int kLane>
5524 static_assert(0 <= kLane && kLane < 16, "Invalid lane");
5525 return Vec128<uint8_t>(vdupq_n_u8(vgetq_lane_u8(v.raw, kLane)));
5526}
5527template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
5528 HWY_IF_LANES_GT(N, 1)>
5530 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5531 return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
5532}
5533template <int kLane>
5535 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5536 return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
5537}
5538template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
5539 HWY_IF_LANES_GT(N, 1)>
5541 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5542 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
5543}
5544template <int kLane>
5546 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
5547 return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
5548}
5549template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
5550 HWY_IF_LANES_GT(N, 1)>
5552 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5553 return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
5554}
5555template <int kLane>
5557 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
5558 return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
5559}
5560
5561// Signed
5562template <int kLane>
5564 static_assert(0 <= kLane && kLane < 16, "Invalid lane");
5565 return Vec128<int8_t>(vdupq_n_s8(vgetq_lane_s8(v.raw, kLane)));
5566}
5567template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
5568 HWY_IF_LANES_GT(N, 1)>
5570 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5571 return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
5572}
5573template <int kLane>
5575 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5576 return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
5577}
5578template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
5579 HWY_IF_LANES_GT(N, 1)>
5581 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5582 return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
5583}
5584template <int kLane>
5586 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
5587 return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
5588}
5589template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
5590 HWY_IF_LANES_GT(N, 1)>
5592 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5593 return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
5594}
5595template <int kLane>
5597 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
5598 return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
5599}
5600
5601// Float
5602#if HWY_HAVE_FLOAT16
5603template <int kLane>
5604HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
5605 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5606 return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
5607}
5608template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
5609 HWY_IF_LANES_GT(N, 1)>
5610HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
5611 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5612 return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
5613}
5614#endif // HWY_HAVE_FLOAT16
5615#if HWY_NEON_HAVE_BFLOAT16
5616template <int kLane>
5617HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
5618 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5619 return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
5620}
5621template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
5622 HWY_IF_LANES_GT(N, 1)>
5623HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
5624 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5625 return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
5626}
5627#endif // HWY_NEON_HAVE_BFLOAT16
5628template <int kLane>
5630 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
5631 return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
5632}
5633template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
5634 HWY_IF_LANES_GT(N, 1)>
5636 static_assert(0 <= kLane && kLane < N, "Invalid lane");
5637 return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
5638}
5639
5640#endif // HWY_ARCH_ARM_A64
5641
5642template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
5643 HWY_IF_LANES_GT_D(DFromV<V>, 1)>
5644HWY_API V Broadcast(V v) {
5645 const DFromV<V> d;
5646 const RebindToUnsigned<decltype(d)> du;
5647 return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
5648}
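
// Usage sketch (illustrative only): splatting lane 2 of an int32 vector to
// all lanes, assuming the usual Highway setup.
inline Vec128<int32_t> BroadcastUsageSketch() {
  const Full128<int32_t> d;
  const Vec128<int32_t> v = Iota(d, 0);  // lanes 0,1,2,3
  return Broadcast<2>(v);                // lanes 2,2,2,2
}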
5649
5650// ------------------------------ TableLookupLanes
5651
5652// Returned by SetTableIndices for use by TableLookupLanes.
5653template <typename T, size_t N>
5654struct Indices128 {
5655 typename detail::Raw128<T, N>::type raw;
5656};
5657
5658namespace detail {
5659
5660template <class D, HWY_IF_T_SIZE_D(D, 1)>
5662 D d) {
5663 const Repartition<uint8_t, decltype(d)> d8;
5664 return Iota(d8, 0);
5665}
5666
5667template <class D, HWY_IF_T_SIZE_D(D, 2)>
5669 D d) {
5670 const Repartition<uint8_t, decltype(d)> d8;
5671 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
5672 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
5673 return Load(d8, kBroadcastLaneBytes);
5674}
5675
5676template <class D, HWY_IF_T_SIZE_D(D, 4)>
5678 D d) {
5679 const Repartition<uint8_t, decltype(d)> d8;
5680 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
5681 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
5682 return Load(d8, kBroadcastLaneBytes);
5683}
5684
5685template <class D, HWY_IF_T_SIZE_D(D, 8)>
5687 D d) {
5688 const Repartition<uint8_t, decltype(d)> d8;
5689 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
5690 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
5691 return Load(d8, kBroadcastLaneBytes);
5692}
5693
5694template <class D, HWY_IF_T_SIZE_D(D, 1)>
5696 const Repartition<uint8_t, decltype(d)> d8;
5697 return Zero(d8);
5698}
5699
5700template <class D, HWY_IF_T_SIZE_D(D, 2)>
5702 const Repartition<uint8_t, decltype(d)> d8;
5703 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
5704 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
5705 return Load(d8, kByteOffsets);
5706}
5707
5708template <class D, HWY_IF_T_SIZE_D(D, 4)>
5710 const Repartition<uint8_t, decltype(d)> d8;
5711 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
5712 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
5713 return Load(d8, kByteOffsets);
5714}
5715
5716template <class D, HWY_IF_T_SIZE_D(D, 8)>
5718 const Repartition<uint8_t, decltype(d)> d8;
5719 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
5720 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
5721 return Load(d8, kByteOffsets);
5722}
5723
5724} // namespace detail
5725
5726template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
5727HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
5728 D d, Vec128<TI, MaxLanes(D())> vec) {
5729 using T = TFromD<D>;
5730 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
5731#if HWY_IS_DEBUG_BUILD
5732 const RebindToUnsigned<decltype(d)> du;
5733 using TU = TFromD<decltype(du)>;
5734 HWY_DASSERT(AllTrue(
5735 du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
5736#endif
5737
5738 (void)d;
5739 return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw};
5740}
5741
5742template <class D, typename TI,
5743 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
5744HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
5745 D d, Vec128<TI, MaxLanes(D())> vec) {
5746 using T = TFromD<D>;
5747 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
5748#if HWY_IS_DEBUG_BUILD
5749 const RebindToUnsigned<decltype(d)> du;
5750 using TU = TFromD<decltype(du)>;
5751 HWY_DASSERT(AllTrue(
5752 du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
5753#endif
5754
5755 const Repartition<uint8_t, decltype(d)> d8;
5756 using V8 = VFromD<decltype(d8)>;
5757
5758 // Broadcast each lane index to all bytes of T and shift to bytes
5759 const V8 lane_indices = TableLookupBytes(
5761 constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
5762 const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
5763 const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
5764 return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw};
5765}
5766
5767template <class D, typename TI>
5768HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d,
5769 const TI* idx) {
5770 const Rebind<TI, decltype(d)> di;
5771 return IndicesFromVec(d, LoadU(di, idx));
5772}
5773
5774template <typename T, size_t N>
5775HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
5776 const DFromV<decltype(v)> d;
5777 const RebindToSigned<decltype(d)> di;
5778 return BitCast(
5779 d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
5780}
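
// Usage sketch (illustrative only): runtime lane permutation via
// SetTableIndices + TableLookupLanes; indices must be smaller than Lanes(d).
// Assumes the usual Highway setup.
inline Vec128<float> TableLookupLanesUsageSketch() {
  const Full128<float> d;
  const Vec128<float> v = Iota(d, 1.0f);  // lanes 1,2,3,4
  static constexpr int32_t kIdx[4] = {3, 2, 1, 0};
  const auto idx = SetTableIndices(d, kIdx);
  return TableLookupLanes(v, idx);        // lanes 4,3,2,1
}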
5781
5782template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
5783HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
5784 Indices128<T, N> idx) {
5785 const DFromV<decltype(a)> d;
5786 const Twice<decltype(d)> dt;
5787// TableLookupLanes currently requires table and index vectors to be the same
5788// size, though a half-length index vector would be sufficient here.
5789#if HWY_IS_MSAN
5790 const Vec128<T, N> idx_vec{idx.raw};
5791 const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
5792#else
5793 // We only keep LowerHalf of the result, which is valid in idx.
5794 const Indices128<T, N * 2> idx2{idx.raw};
5795#endif
5796 return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
5797}
5798
5799template <typename T>
5800HWY_API Vec64<T> TwoTablesLookupLanes(Vec64<T> a, Vec64<T> b,
5801 Indices128<T, 8 / sizeof(T)> idx) {
5802 const DFromV<decltype(a)> d;
5803 const Repartition<uint8_t, decltype(d)> du8;
5804 const auto a_u8 = BitCast(du8, a);
5805 const auto b_u8 = BitCast(du8, b);
5806 const auto idx_u8 = BitCast(du8, Vec64<T>{idx.raw});
5807
5808#if HWY_ARCH_ARM_A64
5809 const Twice<decltype(du8)> dt_u8;
5810 return BitCast(
5811 d, Vec64<uint8_t>{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)});
5812#else
5813 detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
5814 return BitCast(d, Vec64<uint8_t>{vtbl2_u8(tup.raw, idx_u8.raw)});
5815#endif
5816}
5817
5818template <typename T>
5819HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
5820 Indices128<T, 16 / sizeof(T)> idx) {
5821 const DFromV<decltype(a)> d;
5822 const Repartition<uint8_t, decltype(d)> du8;
5823 const auto a_u8 = BitCast(du8, a);
5824 const auto b_u8 = BitCast(du8, b);
5825 const auto idx_u8 = BitCast(du8, Vec128<T>{idx.raw});
5826
5827#if HWY_ARCH_ARM_A64
5828 detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
5829 return BitCast(d, Vec128<uint8_t>{vqtbl2q_u8(tup.raw, idx_u8.raw)});
5830#else
5831 const Half<decltype(d)> dh;
5832 const Repartition<uint8_t, decltype(dh)> dh_u8;
5833 const auto a_lo_u8 = LowerHalf(dh_u8, a_u8);
5834 const auto a_hi_u8 = UpperHalf(dh_u8, a_u8);
5835 const auto b_lo_u8 = LowerHalf(dh_u8, b_u8);
5836 const auto b_hi_u8 = UpperHalf(dh_u8, b_u8);
5837 const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8);
5838 const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8);
5839
5840 detail::Tuple4<uint8_t, dh_u8.MaxLanes()> tup = {
5841 {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}};
5842 const auto lo_result =
5843 BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_lo_u8.raw)});
5844 const auto hi_result =
5845 BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_hi_u8.raw)});
5846 return Combine(d, hi_result, lo_result);
5847#endif
5848}
5849
5850// ------------------------------ Reverse2 (CombineShiftRightBytes)
5851
5852// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
5853#ifdef HWY_NATIVE_REVERSE2_8
5854#undef HWY_NATIVE_REVERSE2_8
5855#else
5856#define HWY_NATIVE_REVERSE2_8
5857#endif
5858
5859template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
5860HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
5861 const RebindToUnsigned<decltype(d)> du;
5862 return BitCast(d, VFromD<decltype(du)>(vrev16_u8(BitCast(du, v).raw)));
5863}
5864template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
5865HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
5866 const RebindToUnsigned<decltype(d)> du;
5867 return BitCast(d, Vec128<uint8_t>(vrev16q_u8(BitCast(du, v).raw)));
5868}
5869
5870template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
5871HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
5872 const RebindToUnsigned<decltype(d)> du;
5873 return BitCast(d, VFromD<decltype(du)>(vrev32_u16(BitCast(du, v).raw)));
5874}
5875template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
5876HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
5877 const RebindToUnsigned<decltype(d)> du;
5878 return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
5879}
5880
5881template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5882HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
5883 const RebindToUnsigned<decltype(d)> du;
5884 return BitCast(d, VFromD<decltype(du)>(vrev64_u32(BitCast(du, v).raw)));
5885}
5886template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
5887HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
5888 const RebindToUnsigned<decltype(d)> du;
5889 return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
5890}
5891
5892template <class D, HWY_IF_T_SIZE_D(D, 8)>
5893HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
5894 return CombineShiftRightBytes<8>(d, v, v);
5895}
5896
5897// ------------------------------ Reverse4 (Reverse2)
5898
5899template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
5900HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
5901 const RebindToUnsigned<decltype(d)> du;
5902 return BitCast(d, VFromD<decltype(du)>(vrev32_u8(BitCast(du, v).raw)));
5903}
5904template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
5905HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
5906 const RebindToUnsigned<decltype(d)> du;
5907 return BitCast(d, Vec128<uint8_t>(vrev32q_u8(BitCast(du, v).raw)));
5908}
5909
5910template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
5911HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
5912 const RebindToUnsigned<decltype(d)> du;
5913 return BitCast(d, VFromD<decltype(du)>(vrev64_u16(BitCast(du, v).raw)));
5914}
5915template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
5916HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
5917 const RebindToUnsigned<decltype(d)> du;
5918 return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
5919}
5920
5921template <class D, HWY_IF_T_SIZE_D(D, 4)>
5922HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
5923 const RepartitionToWide<RebindToUnsigned<decltype(d)>> duw;
5924 return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v))));
5925}
5926
5927template <class D, HWY_IF_T_SIZE_D(D, 8)>
5928HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D>) {
5929 HWY_ASSERT(0); // don't have 8 u64 lanes
5930}
5931
5932// ------------------------------ Reverse8 (Reverse2, Reverse4)
5933
5934template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
5935HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
5936 const RebindToUnsigned<decltype(d)> du;
5937 return BitCast(d, VFromD<decltype(du)>(vrev64_u8(BitCast(du, v).raw)));
5938}
5939template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
5940HWY_API Vec128<T> Reverse8(D d, Vec128<T> v) {
5941 const RebindToUnsigned<decltype(d)> du;
5942 return BitCast(d, Vec128<uint8_t>(vrev64q_u8(BitCast(du, v).raw)));
5943}
5944
5945template <class D, HWY_IF_T_SIZE_D(D, 2)>
5946HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
5947 const Repartition<uint64_t, decltype(d)> du64;
5948 return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v))));
5949}
5950
5951template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
5952HWY_API VFromD<D> Reverse8(D, VFromD<D>) {
5953 HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit
5954}
5955
5956// ------------------------------ Reverse (Reverse2, Reverse4, Reverse8)
5957
5958template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
5959HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
5960 return v;
5961}
5962
5963template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
5964HWY_API Vec128<T, 2> Reverse(D d, Vec128<T, 2> v) {
5965 return Reverse2(d, v);
5966}
5967
5968template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 4)>
5969HWY_API Vec128<T, 4> Reverse(D d, Vec128<T, 4> v) {
5970 return Reverse4(d, v);
5971}
5972
5973template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 8)>
5974HWY_API Vec128<T, 8> Reverse(D d, Vec128<T, 8> v) {
5975 return Reverse8(d, v);
5976}
5977
5978template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 16)>
5979HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
5980 const Repartition<uint64_t, decltype(d)> du64;
5981 return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v))));
5982}
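
// Usage sketch (illustrative only): Reverse flips all lanes, whereas
// Reverse2/Reverse4/Reverse8 reverse within groups of 2/4/8 lanes. Assumes
// the usual Highway setup.
inline Vec128<uint16_t> ReverseUsageSketch() {
  const Full128<uint16_t> d;
  const Vec128<uint16_t> v = Iota(d, 0);  // lane i = i
  // Reverse2(d, v) would yield 1,0,3,2,5,4,7,6.
  return Reverse(d, v);                   // 7,6,5,4,3,2,1,0
}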
5983
5984// ------------------------------ ReverseBits
5985
5986#if HWY_ARCH_ARM_A64
5987
5988#ifdef HWY_NATIVE_REVERSE_BITS_UI8
5989#undef HWY_NATIVE_REVERSE_BITS_UI8
5990#else
5991#define HWY_NATIVE_REVERSE_BITS_UI8
5992#endif
5993
5994HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1)
5995HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1)
5996
5997#endif // HWY_ARCH_ARM_A64
5998
5999// ------------------------------ Other shuffles (TableLookupBytes)
6000
6001// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
6002// Shuffle0321 rotates one lane to the right (the previous least-significant
6003// lane is now most-significant). These could also be implemented via
6004// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
6005
6006// Swap 64-bit halves
6007template <typename T>
6008HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
6009 return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
6010}
6011template <typename T>
6012HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
6013 return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
6014}
6015
6016// Rotate right 32 bits
6017template <typename T>
6018HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
6019 return CombineShiftRightBytes<4>(DFromV<decltype(v)>(), v, v);
6020}
6021
6022// Rotate left 32 bits
6023template <typename T>
6024HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
6025 return CombineShiftRightBytes<12>(DFromV<decltype(v)>(), v, v);
6026}
6027
6028// Reverse
6029template <typename T>
6030HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
6031 return Reverse4(DFromV<decltype(v)>(), v);
6032}
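
// Usage sketch (illustrative only) of the shuffle_abcd notation: with lanes
// written as (lane3,lane2,lane1,lane0), Shuffle0321 rotates one lane to the
// right. Assumes the usual Highway setup.
inline Vec128<uint32_t> Shuffle0321UsageSketch() {
  const Full128<uint32_t> d;
  const Vec128<uint32_t> v = Iota(d, 0);  // lane i = i
  // v is (3,2,1,0); the result, read in the same order, is (0,3,2,1).
  return Shuffle0321(v);
}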
6033
6034// ------------------------------ InterleaveLower
6035
6036// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
6037// the least-significant lane) and "b". To concatenate two half-width integers
6038// into one, use ZipLower/Upper instead (also works with scalar).
6040#if HWY_ARCH_ARM_A64
6041// N=1 makes no sense (in that case, there would be no upper/lower).
6043#else
6044// Emulated version for Armv7.
6045template <typename T, HWY_IF_T_SIZE(T, 8)>
6046HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
6047 const DFromV<decltype(a)> d;
6048 return CombineShiftRightBytes<8>(d, b, Shuffle01(a));
6049}
6050#endif
6051
6052#if !HWY_HAVE_FLOAT16
6053template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
6056 const DFromV<decltype(a)> d;
6057 const RebindToUnsigned<decltype(d)> du;
6058 return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
6059}
6060#endif // !HWY_HAVE_FLOAT16
6061
6062// < 64 bit parts
6063template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
6067
6068// Additional overload for the optional Simd<> tag.
6069template <class D>
6070HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
6071 return InterleaveLower(a, b);
6072}
6073
6074// ------------------------------ InterleaveUpper (UpperHalf)
6075
6076// All functions inside detail lack the required D parameter.
6077namespace detail {
6079
6080#if HWY_ARCH_ARM_A64
6081// N=1 makes no sense (in that case, there would be no upper/lower).
6083#else
6084// Emulated version for Armv7.
6085template <typename T, HWY_IF_T_SIZE(T, 8)>
6086HWY_API Vec128<T> InterleaveUpper(Vec128<T> a, Vec128<T> b) {
6087 const DFromV<decltype(a)> d;
6088 return CombineShiftRightBytes<8>(d, Shuffle01(b), a);
6089}
6090#endif
6091} // namespace detail
6092
6093// Full register
6094template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
6095HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
6096 return detail::InterleaveUpper(a, b);
6097}
6098
6099// Partial
6100template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
6101HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
6102 const Half<decltype(d)> d2;
6103 const VFromD<D> a2(UpperHalf(d2, a).raw);
6104 const VFromD<D> b2(UpperHalf(d2, b).raw);
6105 return InterleaveLower(d, a2, b2);
6106}
6107
6108// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
6109
6110// Same as Interleave*, except that the return lanes are double-width integers;
6111// this is necessary because the single-lane scalar cannot return two values.
6112template <class V, class DW = RepartitionToWide<DFromV<V>>>
6113HWY_API VFromD<DW> ZipLower(V a, V b) {
6114 return BitCast(DW(), InterleaveLower(a, b));
6115}
6116template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
6117HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
6118 return BitCast(dw, InterleaveLower(D(), a, b));
6119}
6120
6121template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
6122HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
6123 return BitCast(dw, InterleaveUpper(D(), a, b));
6124}
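
// Usage sketch (illustrative only): ZipLower packs pairs of u16 lanes from
// the lower halves of a and b into u32 lanes, with a providing the lower 16
// bits of each result lane. Assumes the usual Highway setup.
inline Vec128<uint32_t> ZipLowerUsageSketch(Vec128<uint16_t> a,
                                            Vec128<uint16_t> b) {
  const Full128<uint32_t> dw;
  // Result lane i = (b[i] << 16) | a[i], for i in the lower half of a/b.
  return ZipLower(dw, a, b);
}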
6125
6126// ------------------------------ Per4LaneBlockShuffle
6127namespace detail {
6128
6129#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
6130
6131#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6132#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6133#else
6134#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6135#endif
6136
6137template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6138HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t /*x3*/,
6139 const uint32_t /*x2*/,
6140 const uint32_t x1,
6141 const uint32_t x0) {
6142 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8)));
6143 const GccU32RawVectType raw = {x0, x1};
6144 return ResizeBitCast(d, Vec64<uint32_t>(reinterpret_cast<uint32x2_t>(raw)));
6145}
6146
6147template <class D, HWY_IF_V_SIZE_D(D, 16)>
6149 const uint32_t x2,
6150 const uint32_t x1,
6151 const uint32_t x0) {
6152 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
6153 const GccU32RawVectType raw = {x0, x1, x2, x3};
6154 return ResizeBitCast(d, Vec128<uint32_t>(reinterpret_cast<uint32x4_t>(raw)));
6155}
6156#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
6157
6158template <size_t kLaneSize, size_t kVectSize, class V,
6159 HWY_IF_LANES_GT_D(DFromV<V>, 4)>
6161 hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
6162 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
6163 V v) {
6164 const DFromV<decltype(v)> d;
6165 const RebindToUnsigned<decltype(d)> du;
6166 const RepartitionToWide<decltype(du)> dw;
6167
6168 const auto evens = BitCast(dw, ConcatEven(d, v, v));
6169 return BitCast(d, InterleaveLower(dw, evens, evens));
6170}
6171
6172template <size_t kLaneSize, size_t kVectSize, class V,
6175 hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
6176 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
6177 V v) {
6178 const DFromV<decltype(v)> d;
6179 const RebindToUnsigned<decltype(d)> du;
6180 const RepartitionToWide<decltype(du)> dw;
6181
6182 const auto odds = BitCast(dw, ConcatOdd(d, v, v));
6183 return BitCast(d, InterleaveLower(dw, odds, odds));
6184}
6185
6186template <class V>
6188 hwy::SizeTag<2> /*lane_size_tag*/,
6189 hwy::SizeTag<8> /*vect_size_tag*/, V v) {
6190 const DFromV<decltype(v)> d;
6191 return InterleaveUpper(d, v, v);
6192}
6193
6194} // namespace detail
6195
6196// ------------------------------ SlideUpLanes
6197
6198namespace detail {
6199
6200template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
6201HWY_INLINE V SlideUpLanes(V v, size_t amt) {
6202 const DFromV<decltype(v)> d;
6203 using TU = UnsignedFromSize<d.MaxBytes()>;
6204 const Repartition<TU, decltype(d)> du;
6205 return BitCast(d, BitCast(du, v) << Set(
6206 du, static_cast<TU>(amt * sizeof(TFromV<V>) * 8)));
6207}
6208
6209template <class V, HWY_IF_V_SIZE_V(V, 16)>
6210HWY_INLINE V SlideUpLanes(V v, size_t amt) {
6211 const DFromV<decltype(v)> d;
6212 const Repartition<uint8_t, decltype(d)> du8;
6213 const auto idx =
6214 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
6215 return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
6216}
6217
6218} // namespace detail
6219
6220template <class D, HWY_IF_LANES_D(D, 1)>
6221HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
6222 return v;
6223}
6224
6225template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
6226HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
6227#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6228 if (__builtin_constant_p(amt)) {
6229 switch (amt) {
6230 case 0:
6231 return v;
6232 case 1:
6233 return ShiftLeftLanes<1>(d, v);
6234 }
6235 }
6236#else
6237 (void)d;
6238#endif
6239
6240 return detail::SlideUpLanes(v, amt);
6241}
6242
6243template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
6244HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
6245#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6246 if (__builtin_constant_p(amt)) {
6247 switch (amt) {
6248 case 0:
6249 return v;
6250 case 1:
6251 return ShiftLeftLanes<1>(d, v);
6252 case 2:
6253 return ShiftLeftLanes<2>(d, v);
6254 case 3:
6255 return ShiftLeftLanes<3>(d, v);
6256 }
6257 }
6258#else
6259 (void)d;
6260#endif
6261
6262 return detail::SlideUpLanes(v, amt);
6263}
6264
6265template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
6266HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
6267#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6268 if (__builtin_constant_p(amt)) {
6269 switch (amt) {
6270 case 0:
6271 return v;
6272 case 1:
6273 return ShiftLeftLanes<1>(d, v);
6274 case 2:
6275 return ShiftLeftLanes<2>(d, v);
6276 case 3:
6277 return ShiftLeftLanes<3>(d, v);
6278 case 4:
6279 return ShiftLeftLanes<4>(d, v);
6280 case 5:
6281 return ShiftLeftLanes<5>(d, v);
6282 case 6:
6283 return ShiftLeftLanes<6>(d, v);
6284 case 7:
6285 return ShiftLeftLanes<7>(d, v);
6286 }
6287 }
6288#else
6289 (void)d;
6290#endif
6291
6292 return detail::SlideUpLanes(v, amt);
6293}
6294
6295template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
6296HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
6297#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6298 if (__builtin_constant_p(amt)) {
6299 switch (amt) {
6300 case 0:
6301 return v;
6302 case 1:
6303 return ShiftLeftLanes<1>(d, v);
6304 case 2:
6305 return ShiftLeftLanes<2>(d, v);
6306 case 3:
6307 return ShiftLeftLanes<3>(d, v);
6308 case 4:
6309 return ShiftLeftLanes<4>(d, v);
6310 case 5:
6311 return ShiftLeftLanes<5>(d, v);
6312 case 6:
6313 return ShiftLeftLanes<6>(d, v);
6314 case 7:
6315 return ShiftLeftLanes<7>(d, v);
6316 case 8:
6317 return ShiftLeftLanes<8>(d, v);
6318 case 9:
6319 return ShiftLeftLanes<9>(d, v);
6320 case 10:
6321 return ShiftLeftLanes<10>(d, v);
6322 case 11:
6323 return ShiftLeftLanes<11>(d, v);
6324 case 12:
6325 return ShiftLeftLanes<12>(d, v);
6326 case 13:
6327 return ShiftLeftLanes<13>(d, v);
6328 case 14:
6329 return ShiftLeftLanes<14>(d, v);
6330 case 15:
6331 return ShiftLeftLanes<15>(d, v);
6332 }
6333 }
6334#else
6335 (void)d;
6336#endif
6337
6338 return detail::SlideUpLanes(v, amt);
6339}
6340
6341// ------------------------------ SlideDownLanes
6342
6343namespace detail {
6344
6345template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
6346HWY_INLINE V SlideDownLanes(V v, size_t amt) {
6347 const DFromV<decltype(v)> d;
6348 using TU = UnsignedFromSize<d.MaxBytes()>;
6349 const Repartition<TU, decltype(d)> du;
6350 return BitCast(d,
6351 BitCast(du, v) << Set(
6352 du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8)));
6353}
6354
6355template <class V, HWY_IF_V_SIZE_V(V, 16)>
6356HWY_INLINE V SlideDownLanes(V v, size_t amt) {
6357 const DFromV<decltype(v)> d;
6358 const Repartition<int8_t, decltype(d)> di8;
6359 auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
6360 idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
6361 return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
6362}
6363
6364} // namespace detail
6365
6366template <class D, HWY_IF_LANES_D(D, 1)>
6367HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
6368 return v;
6369}
6370
6371template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
6372HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
6373#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6374 if (__builtin_constant_p(amt)) {
6375 switch (amt) {
6376 case 0:
6377 return v;
6378 case 1:
6379 return ShiftRightLanes<1>(d, v);
6380 }
6381 }
6382#else
6383 (void)d;
6384#endif
6385
6386 return detail::SlideDownLanes(v, amt);
6387}
6388
6389template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
6390HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
6391#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6392 if (__builtin_constant_p(amt)) {
6393 switch (amt) {
6394 case 0:
6395 return v;
6396 case 1:
6397 return ShiftRightLanes<1>(d, v);
6398 case 2:
6399 return ShiftRightLanes<2>(d, v);
6400 case 3:
6401 return ShiftRightLanes<3>(d, v);
6402 }
6403 }
6404#else
6405 (void)d;
6406#endif
6407
6408 return detail::SlideDownLanes(v, amt);
6409}
6410
6411template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
6412HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
6413#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6414 if (__builtin_constant_p(amt)) {
6415 switch (amt) {
6416 case 0:
6417 return v;
6418 case 1:
6419 return ShiftRightLanes<1>(d, v);
6420 case 2:
6421 return ShiftRightLanes<2>(d, v);
6422 case 3:
6423 return ShiftRightLanes<3>(d, v);
6424 case 4:
6425 return ShiftRightLanes<4>(d, v);
6426 case 5:
6427 return ShiftRightLanes<5>(d, v);
6428 case 6:
6429 return ShiftRightLanes<6>(d, v);
6430 case 7:
6431 return ShiftRightLanes<7>(d, v);
6432 }
6433 }
6434#else
6435 (void)d;
6436#endif
6437
6438 return detail::SlideDownLanes(v, amt);
6439}
6440
6441template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
6442HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
6443#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6444 if (__builtin_constant_p(amt)) {
6445 switch (amt) {
6446 case 0:
6447 return v;
6448 case 1:
6449 return ShiftRightLanes<1>(d, v);
6450 case 2:
6451 return ShiftRightLanes<2>(d, v);
6452 case 3:
6453 return ShiftRightLanes<3>(d, v);
6454 case 4:
6455 return ShiftRightLanes<4>(d, v);
6456 case 5:
6457 return ShiftRightLanes<5>(d, v);
6458 case 6:
6459 return ShiftRightLanes<6>(d, v);
6460 case 7:
6461 return ShiftRightLanes<7>(d, v);
6462 case 8:
6463 return ShiftRightLanes<8>(d, v);
6464 case 9:
6465 return ShiftRightLanes<9>(d, v);
6466 case 10:
6467 return ShiftRightLanes<10>(d, v);
6468 case 11:
6469 return ShiftRightLanes<11>(d, v);
6470 case 12:
6471 return ShiftRightLanes<12>(d, v);
6472 case 13:
6473 return ShiftRightLanes<13>(d, v);
6474 case 14:
6475 return ShiftRightLanes<14>(d, v);
6476 case 15:
6477 return ShiftRightLanes<15>(d, v);
6478 }
6479 }
6480#else
6481 (void)d;
6482#endif
6483
6484 return detail::SlideDownLanes(v, amt);
6485}
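
// Usage sketch (illustrative only): sliding lanes by a runtime amount (here
// assumed to be smaller than Lanes(d)); vacated lanes become zero. Assumes
// the usual Highway setup.
inline Vec128<int32_t> SlideUsageSketch(Vec128<int32_t> v, size_t amt) {
  const Full128<int32_t> d;
  const Vec128<int32_t> up = SlideUpLanes(d, v, amt);  // v[0] now at lane amt
  // Sliding back down restores the low lanes and zeroes the upper amt lanes.
  return SlideDownLanes(d, up, amt);
}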
6486
6487// ------------------------------ SatWidenMulAccumFixedPoint
6488
6489#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
6490#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
6491#else
6492#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
6493#endif
6494
6495template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
6499 VFromD<DI32> sum) {
6500 return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
6501}
6502
6503template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
6505 VFromD<Rebind<int16_t, DI32>> a,
6506 VFromD<Rebind<int16_t, DI32>> b,
6507 VFromD<DI32> sum) {
6508 const Full128<TFromD<DI32>> di32_full;
6509 const Rebind<int16_t, decltype(di32_full)> di16_full64;
6510 return ResizeBitCast(
6511 di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
6512 ResizeBitCast(di16_full64, b),
6513 ResizeBitCast(di32_full, sum)));
6514}
6515
6516// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
6517
6518#if HWY_NEON_HAVE_F32_TO_BF16C
6519
6520namespace detail {
6521#if HWY_NEON_HAVE_BFLOAT16
6522// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
6523// bfloat16x4_t or bfloat16x8_t.
6524static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
6525 return raw;
6526}
6527static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
6528 return raw;
6529}
6530#else
6531// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
6532// detail::Vec128<bfloat16_t, N>::type is a uint16x4_t or uint16x8_t vector,
6533// which works around compiler bugs present in GCC 13 or earlier and Clang 16
6534// or earlier on AArch64.
6535
6536// In that case, the uint16x4_t or uint16x8_t vector needs to be bit-cast to a
6537// bfloat16x4_t or bfloat16x8_t vector before being passed to the vbfdot_f32
6538// and vbfdotq_f32 intrinsics.
6539static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
6540 return vreinterpret_bf16_u16(raw);
6541}
6542static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
6543 return vreinterpretq_bf16_u16(raw);
6544}
6545#endif
6546} // namespace detail
6547
6548template <class D, HWY_IF_V_SIZE_D(D, 16)>
6549HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
6550 Vec128<bfloat16_t> b,
6551 const Vec128<float> sum0,
6552 Vec128<float>& /*sum1*/) {
6553 return Vec128<float>(vbfdotq_f32(sum0.raw,
6554 detail::BitCastToRawNeonBF16(a.raw),
6555 detail::BitCastToRawNeonBF16(b.raw)));
6556}
6557
6558template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6560 D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
6561 VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
6562 VFromD<D>& /*sum1*/) {
6563 return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
6564 detail::BitCastToRawNeonBF16(b.raw)));
6565}
6566
6567#else
6568
6569template <class D32, HWY_IF_F32_D(D32),
6572 const VFromD<D32> sum0,
6573 VFromD<D32>& sum1) {
6574 const RebindToUnsigned<decltype(df32)> du32;
6575 using VU32 = VFromD<decltype(du32)>;
6576 const VU32 odd = Set(du32, 0xFFFF0000u);
6577 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
6578 const VU32 ao = And(BitCast(du32, a), odd);
6579 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
6580 const VU32 bo = And(BitCast(du32, b), odd);
6581 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
6582 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
6583}
6584
6585#endif // HWY_NEON_HAVE_F32_TO_BF16C
6586
6587template <class D, HWY_IF_I32_D(D)>
6590 const Vec128<int32_t> sum0,
6591 Vec128<int32_t>& sum1) {
6592#if HWY_ARCH_ARM_A64
6593 sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
6594#else
6595 const Full64<int16_t> dh;
6596 sum1 = Vec128<int32_t>(
6597 vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
6598#endif
6599 return Vec128<int32_t>(
6600 vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
6601}
6602
6603template <class D, HWY_IF_I32_D(D)>
6606 const Vec64<int32_t> sum0,
6607 Vec64<int32_t>& sum1) {
6608 // vmlal writes into the upper half, which the caller cannot use, so
6609 // split into two halves.
6610 const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
6611 const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
6612 sum1 += mul_32;
6613 return sum0 + LowerHalf(mul_3210);
6614}
6615
6616template <class D, HWY_IF_I32_D(D)>
6619 const Vec32<int32_t> sum0,
6620 Vec32<int32_t>& sum1) {
6621 const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
6622 const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
6623 const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
6624 const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
6625 sum1 += mul1;
6626 return sum0 + mul0;
6627}
6628
6629template <class D, HWY_IF_U32_D(D)>
6633 const Vec128<uint32_t> sum0,
6634 Vec128<uint32_t>& sum1) {
6635#if HWY_ARCH_ARM_A64
6636 sum1 = Vec128<uint32_t>(vmlal_high_u16(sum1.raw, a.raw, b.raw));
6637#else
6638 const Full64<uint16_t> dh;
6639 sum1 = Vec128<uint32_t>(
6640 vmlal_u16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
6641#endif
6642 return Vec128<uint32_t>(
6643 vmlal_u16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
6644}
6645
6646template <class D, HWY_IF_U32_D(D)>
6649 const Vec64<uint32_t> sum0,
6650 Vec64<uint32_t>& sum1) {
6651 // vmlal writes into the upper half, which the caller cannot use, so
6652 // split into two halves.
6653 const Vec128<uint32_t> mul_3210(vmull_u16(a.raw, b.raw));
6654 const Vec64<uint32_t> mul_32 = UpperHalf(d32, mul_3210);
6655 sum1 += mul_32;
6656 return sum0 + LowerHalf(mul_3210);
6657}
6658
6659template <class D, HWY_IF_U32_D(D)>
6662 const Vec32<uint32_t> sum0,
6663 Vec32<uint32_t>& sum1) {
6664 const Vec128<uint32_t> mul_xx10(vmull_u16(a.raw, b.raw));
6665 const Vec64<uint32_t> mul_10(LowerHalf(mul_xx10));
6666 const Vec32<uint32_t> mul0 = LowerHalf(du32, mul_10);
6667 const Vec32<uint32_t> mul1 = UpperHalf(du32, mul_10);
6668 sum1 += mul1;
6669 return sum0 + mul0;
6670}
6671
6672// ------------------------------ Combine partial (InterleaveLower)
6673// < 64-bit input, <= 64-bit result
6674template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6675HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
6676 // First double N (only lower halves will be used).
6677 const VFromD<D> hi2(hi.raw);
6678 const VFromD<D> lo2(lo.raw);
6679 // Repartition to two unsigned lanes (each the size of the valid input).
6680 const Simd<UnsignedFromSize<d.MaxBytes() / 2>, 2, 0> du;
6681 return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
6682}
6683
6684// ------------------------------ RearrangeToOddPlusEven (Combine)
6685
6686template <size_t N>
6688 Vec128<float, N> sum1) {
6689#if HWY_NEON_HAVE_BFLOAT16
6690 (void)sum1; // unused by bf16 ReorderWidenMulAccumulate
6691 return sum0;
6692#else
6693 return Add(sum0, sum1);
6694#endif
6695}
6696
6698 Vec128<int32_t> sum1) {
6699// vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
6700#if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want
6701 return Vec128<int32_t>(vpaddq_s32(sum0.raw, sum1.raw));
6702#else
6703 const Full128<int32_t> d;
6704 const Half<decltype(d)> d64;
6705 const Vec64<int32_t> hi(
6706 vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
6707 const Vec64<int32_t> lo(
6708 vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
6709 return Combine(Full128<int32_t>(), hi, lo);
6710#endif
6711}
6712
6714 Vec64<int32_t> sum1) {
6715 // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
6716 return Vec64<int32_t>(vpadd_s32(sum0.raw, sum1.raw));
6717}
6718
6720 Vec32<int32_t> sum1) {
6721 // Only one widened sum per register, so add them for sum of odd and even.
6722 return sum0 + sum1;
6723}
6724
6726 Vec128<uint32_t> sum1) {
6727// vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
6728#if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want
6729 return Vec128<uint32_t>(vpaddq_u32(sum0.raw, sum1.raw));
6730#else
6731 const Full128<uint32_t> d;
6732 const Half<decltype(d)> d64;
6733 const Vec64<uint32_t> hi(
6734 vpadd_u32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
6735 const Vec64<uint32_t> lo(
6736 vpadd_u32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
6737 return Combine(Full128<uint32_t>(), hi, lo);
6738#endif
6739}
6740
6742 Vec64<uint32_t> sum1) {
6743 // vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
6744 return Vec64<uint32_t>(vpadd_u32(sum0.raw, sum1.raw));
6745}
6746
6748 Vec32<uint32_t> sum1) {
6749 // Only one widened sum per register, so add them for sum of odd and even.
6750 return sum0 + sum1;
6751}
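
// Usage sketch (illustrative only): the usual pairing of
// ReorderWidenMulAccumulate with RearrangeToOddPlusEven. The result holds
// sums of adjacent i16 products, possibly in a target-dependent lane order,
// which is fine for a subsequent reduction. Assumes the usual Highway setup.
inline Vec128<int32_t> WidenMulAccumulateUsageSketch(Vec128<int16_t> a,
                                                     Vec128<int16_t> b) {
  const Full128<int32_t> d32;
  Vec128<int32_t> sum1 = Zero(d32);
  const Vec128<int32_t> sum0 =
      ReorderWidenMulAccumulate(d32, a, b, Zero(d32), sum1);
  return RearrangeToOddPlusEven(sum0, sum1);
}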
6752
6753// ------------------------------ WidenMulPairwiseAdd
6754
6755#if HWY_NEON_HAVE_F32_TO_BF16C
6756
6757template <class D, HWY_IF_V_SIZE_D(D, 16)>
6758HWY_API Vec128<float> WidenMulPairwiseAdd(D d32, Vec128<bfloat16_t> a,
6759 Vec128<bfloat16_t> b) {
6760 return Vec128<float>(vbfdotq_f32(Zero(d32).raw,
6761 detail::BitCastToRawNeonBF16(a.raw),
6762 detail::BitCastToRawNeonBF16(b.raw)));
6763}
6764
6765template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6767 VFromD<Repartition<bfloat16_t, D>> a,
6768 VFromD<Repartition<bfloat16_t, D>> b) {
6769 return VFromD<D>(vbfdot_f32(Zero(d32).raw,
6770 detail::BitCastToRawNeonBF16(a.raw),
6771 detail::BitCastToRawNeonBF16(b.raw)));
6772}
6773
6774#else
6775template <class D32, HWY_IF_F32_D(D32)>
6779 const RebindToUnsigned<decltype(df32)> du32;
6780 using VU32 = VFromD<decltype(du32)>;
6781 const VU32 odd = Set(du32, 0xFFFF0000u);
6782 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
6783 const VU32 ao = And(BitCast(du32, a), odd);
6784 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
6785 const VU32 bo = And(BitCast(du32, b), odd);
6786 return MulAdd(BitCast(df32, ae), BitCast(df32, be),
6787 Mul(BitCast(df32, ao), BitCast(df32, bo)));
6788}
6789#endif // HWY_NEON_HAVE_F32_TO_BF16C
6790
6791template <class D, HWY_IF_I32_D(D)>
6793 Vec128<int16_t> b) {
6794 Vec128<int32_t> sum1;
6795#if HWY_ARCH_ARM_A64
6796 sum1 = Vec128<int32_t>(vmull_high_s16(a.raw, b.raw));
6797#else
6798 const Full64<int16_t> dh;
6799 sum1 = Vec128<int32_t>(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
6800#endif
6801 Vec128<int32_t> sum0 =
6802 Vec128<int32_t>(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw));
6803 return RearrangeToOddPlusEven(sum0, sum1);
6804}
6805
6806template <class D, HWY_IF_I32_D(D)>
6808 Vec64<int16_t> b) {
6809 // vmlal writes into the upper half, which the caller cannot use, so
6810 // split into two halves.
6811 const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
6812 const Vec64<int32_t> mul0 = LowerHalf(mul_3210);
6813 const Vec64<int32_t> mul1 = UpperHalf(d32, mul_3210);
6814 return RearrangeToOddPlusEven(mul0, mul1);
6815}
6816
6817template <class D, HWY_IF_I32_D(D)>
6819 Vec32<int16_t> b) {
6820 const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
6821 const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
6822 const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
6823 const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
6824 return RearrangeToOddPlusEven(mul0, mul1);
6825}
6826
6827template <class D, HWY_IF_U32_D(D)>
6829 Vec128<uint16_t> b) {
6830 Vec128<uint32_t> sum1;
6831#if HWY_ARCH_ARM_A64
6832 sum1 = Vec128<uint32_t>(vmull_high_u16(a.raw, b.raw));
6833#else
6834 const Full64<uint16_t> dh;
6835 sum1 =
6836 Vec128<uint32_t>(vmull_u16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
6837#endif
6838 Vec128<uint32_t> sum0 =
6839 Vec128<uint32_t>(vmull_u16(LowerHalf(a).raw, LowerHalf(b).raw));
6840 return RearrangeToOddPlusEven(sum0, sum1);
6841}
6842
6843template <class D, HWY_IF_U32_D(D)>
6845 Vec64<uint16_t> b) {
6846 // vmlal writes into the upper half, which the caller cannot use, so
6847 // split into two halves.
6848 const Vec128<uint32_t> mul_3210(vmull_u16(a.raw, b.raw));
6849 const Vec64<uint32_t> mul0 = LowerHalf(mul_3210);
6850 const Vec64<uint32_t> mul1 = UpperHalf(d32, mul_3210);
6851 return RearrangeToOddPlusEven(mul0, mul1);
6852}
6853
6854template <class D, HWY_IF_U32_D(D)>
6856 Vec32<uint16_t> b) {
6857 const Vec128<uint32_t> mul_xx10(vmull_u16(a.raw, b.raw));
6858 const Vec64<uint32_t> mul_10(LowerHalf(mul_xx10));
6859 const Vec32<uint32_t> mul0 = LowerHalf(d32, mul_10);
6860 const Vec32<uint32_t> mul1 = UpperHalf(d32, mul_10);
6861 return RearrangeToOddPlusEven(mul0, mul1);
6862}
6863
6864// ------------------------------ ZeroExtendVector (Combine)
6865
6866template <class D>
6867HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
6868 return Combine(d, Zero(Half<decltype(d)>()), lo);
6869}
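
// Usage sketch (illustrative only): widening a 64-bit vector to 128 bits
// with a zero upper half, assuming the usual Highway setup.
inline Vec128<uint8_t> ZeroExtendUsageSketch(Vec64<uint8_t> lo) {
  const Full128<uint8_t> d;
  return ZeroExtendVector(d, lo);  // lanes 0..7 = lo, lanes 8..15 = 0
}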
6870
6871// ------------------------------ ConcatLowerLower
6872
6873// 64 or 128-bit input: just interleave
6874template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
6875HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
6876 // Treat half-width input as a single lane and interleave them.
6877 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
6878 return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
6879}
6880
6881namespace detail {
6882#if HWY_ARCH_ARM_A64
6885#else
6886
6887// vtrn returns a struct with even and odd result.
6888#define HWY_NEON_BUILD_TPL_HWY_TRN
6889#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
6890// Pass raw args so we can accept uint16x2 args, for which there is no
6891// corresponding uint16x2x2 return type.
6892#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
6893 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
6894#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
6895
6896// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
6897// for full and half vectors.
6898HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
6899HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
6900HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
6901HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
6902HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
6903HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
6904HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
6905HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
6906HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
6907HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
6908HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
6909HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
6910HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
6911HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
6912
6913#undef HWY_NEON_BUILD_TPL_HWY_TRN
6914#undef HWY_NEON_BUILD_RET_HWY_TRN
6915#undef HWY_NEON_BUILD_PARAM_HWY_TRN
6916#undef HWY_NEON_BUILD_ARG_HWY_TRN
6917
6918#endif // HWY_ARCH_ARM_A64
6919} // namespace detail
6920
6921// <= 32-bit input/output
6922template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
6923HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
6924 // Treat half-width input as two lanes and take every second one.
6925 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
6926#if HWY_ARCH_ARM_A64
6927 return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
6928#else
6929 using VU = VFromD<decltype(du)>;
6930 return BitCast(
6931 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
6932 .val[0]));
6933#endif
6934}
6935
6936// ------------------------------ ConcatUpperUpper
6937
6938// 64 or 128-bit input: just interleave
6939template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
6940HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
6941 // Treat half-width input as a single lane and interleave them.
6942 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
6943 return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
6944}
6945
6946// <= 32-bit input/output
6947template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
6948HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
6949 // Treat half-width input as two lanes and take every second one.
6950 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
6951#if HWY_ARCH_ARM_A64
6952 return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
6953#else
6954 using VU = VFromD<decltype(du)>;
6955 return BitCast(
6956 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
6957 .val[1]));
6958#endif
6959}
6960
6961// ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
6962
6963// 64 or 128-bit input: extract from concatenated
6964template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
6965HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
6966 return CombineShiftRightBytes<d.MaxBytes() / 2>(d, hi, lo);
6967}
6968
6969// <= 32-bit input/output
6970template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
6971HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
6972 constexpr size_t kSize = d.MaxBytes();
6973 const Repartition<uint8_t, decltype(d)> d8;
6974 const Full64<uint8_t> d8x8;
6975 const Full64<TFromD<D>> d64;
6976 using V8x8 = VFromD<decltype(d8x8)>;
6977 const V8x8 hi8x8(BitCast(d8, hi).raw);
6978 // Move into most-significant bytes
6979 const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
6980 const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
6981 // Back to original lane type, then shrink N.
6982 return VFromD<D>(BitCast(d64, r).raw);
6983}
6984
6985// ------------------------------ ConcatUpperLower
6986
6987// Works for all N.
6988template <class D>
6989HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
6990 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
6991}
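
// Usage sketch (illustrative only): ConcatUpperLower keeps the lower half of
// lo and the upper half of hi, each in its original position. Assumes the
// usual Highway setup.
inline Vec128<uint32_t> ConcatUpperLowerUsageSketch(Vec128<uint32_t> hi,
                                                    Vec128<uint32_t> lo) {
  const Full128<uint32_t> d;
  return ConcatUpperLower(d, hi, lo);  // lanes (3,2,1,0) = hi3,hi2,lo1,lo0
}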
6992
6993// ------------------------------ ConcatOdd (InterleaveUpper)
6994
6995namespace detail {
6996// There is no vuzpq_u64.
6999
7000#if !HWY_HAVE_FLOAT16
7001template <size_t N>
7004 const DFromV<decltype(hi)> d;
7005 const RebindToUnsigned<decltype(d)> du;
7006 return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
7007}
7008template <size_t N>
7011 const DFromV<decltype(hi)> d;
7012 const RebindToUnsigned<decltype(d)> du;
7013 return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
7014}
7015#endif // !HWY_HAVE_FLOAT16
7016} // namespace detail
7017
7018// Full/half vector
7019template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
7021 return detail::ConcatOdd(lo, hi);
7022}
7023
7024// 8-bit x4
7025template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
7026HWY_API Vec32<T> ConcatOdd(D d, Vec32<T> hi, Vec32<T> lo) {
7027 const Twice<decltype(d)> d2;
7028 const Repartition<uint16_t, decltype(d2)> dw2;
7029 const VFromD<decltype(d2)> hi2(hi.raw);
7030 const VFromD<decltype(d2)> lo2(lo.raw);
7031 const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
7032 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
7033 // vcopy_lane_u16, but that's A64-only.
7034 return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
7035}
7036
7037// Any type x2
7038template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
7042
7043// ------------------------------ ConcatEven (InterleaveLower)
7044
7045// Full/half vector
7046template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
7048 return detail::ConcatEven(lo, hi);
7049}
7050
7051// 8-bit x4
7052template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
7053HWY_API Vec32<T> ConcatEven(D d, Vec32<T> hi, Vec32<T> lo) {
7054 const Twice<decltype(d)> d2;
7055 const Repartition<uint16_t, decltype(d2)> dw2;
7056 const VFromD<decltype(d2)> hi2(hi.raw);
7057 const VFromD<decltype(d2)> lo2(lo.raw);
7058 const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
7059 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
7060 // vcopy_lane_u16, but that's A64-only.
7061 return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
7062}
7063
7064// Any type x2
7065template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
7069
7070// ------------------------------ DupEven (InterleaveLower)
7071
7072template <typename T, size_t N,
7073 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
7074HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
7075#if HWY_ARCH_ARM_A64
7076 return detail::InterleaveEven(v, v);
7077#else
7078 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
7079#endif
7080}
7081
7082template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
7083HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
7084 return InterleaveLower(DFromV<decltype(v)>(), v, v);
7085}
7086
7087// ------------------------------ DupOdd (InterleaveUpper)
7088
7089template <typename T, size_t N,
7090 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
7091HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
7092#if HWY_ARCH_ARM_A64
7093 return detail::InterleaveOdd(v, v);
7094#else
7095 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
7096#endif
7097}
7098
7099template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
7100HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
7101 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
7102}
7103
7104// ------------------------------ OddEven (IfThenElse)
7105
7106template <typename T, size_t N>
7107HWY_API Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
7108 const DFromV<decltype(a)> d;
7109 const Repartition<uint8_t, decltype(d)> d8;
7110 alignas(16) static constexpr uint8_t kBytes[16] = {
7111 ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
7112 ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
7113 ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
7114 ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
7115 ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
7116 ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
7117 ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
7118 ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
7119 };
7120 const auto vec = BitCast(d, Load(d8, kBytes));
7121 return IfThenElse(MaskFromVec(vec), b, a);
7122}
7123
7124// ------------------------------ InterleaveEven
7125template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
7126HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
7127#if HWY_ARCH_ARM_A64
7128 return detail::InterleaveEven(a, b);
7129#else
7130 return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
7131#endif
7132}
7133
7134template <class D, HWY_IF_T_SIZE_D(D, 8)>
7135HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
7136 return InterleaveLower(a, b);
7137}
7138
7139// ------------------------------ InterleaveOdd
7140template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
7141HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
7142#if HWY_ARCH_ARM_A64
7143 return detail::InterleaveOdd(a, b);
7144#else
7145 return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
7146#endif
7147}
7148
7149template <class D, HWY_IF_T_SIZE_D(D, 8)>
7151 return InterleaveUpper(d, a, b);
7152}
7153
7154// ------------------------------ OddEvenBlocks
7155template <typename T, size_t N>
7157 return even;
7158}
7159
7160// ------------------------------ SwapAdjacentBlocks
7161template <typename T, size_t N>
7165
7166// ------------------------------ ReverseBlocks
7167// Single block: no change
7168template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7170 return v;
7171}
7172
7173// ------------------------------ ReorderDemote2To (OddEven)
7174
7175#if HWY_NEON_HAVE_F32_TO_BF16C
7176template <class D, HWY_IF_BF16_D(D)>
7177HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
7178 VFromD<Repartition<float, D>> b) {
7179 const Half<decltype(dbf16)> dh_bf16;
7180 return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a));
7181}
7182#endif // HWY_NEON_HAVE_F32_TO_BF16C
7183
7184template <class D, HWY_IF_I32_D(D)>
7186 Vec128<int64_t> b) {
7187 const Vec64<int32_t> a32(vqmovn_s64(a.raw));
7188#if HWY_ARCH_ARM_A64
7189 (void)d32;
7190 return Vec128<int32_t>(vqmovn_high_s64(a32.raw, b.raw));
7191#else
7192 const Vec64<int32_t> b32(vqmovn_s64(b.raw));
7193 return Combine(d32, b32, a32);
7194#endif
7195}
7196
7197template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7200 const Rebind<int64_t, decltype(d32)> dt;
7201 return DemoteTo(d32, Combine(dt, b, a));
7202}
7203
7204template <class D, HWY_IF_U32_D(D)>
7206 Vec128<int64_t> b) {
7207 const Vec64<uint32_t> a32(vqmovun_s64(a.raw));
7208#if HWY_ARCH_ARM_A64
7209 (void)d32;
7210 return Vec128<uint32_t>(vqmovun_high_s64(a32.raw, b.raw));
7211#else
7212 const Vec64<uint32_t> b32(vqmovun_s64(b.raw));
7213 return Combine(d32, b32, a32);
7214#endif
7215}
7216
7217template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7218HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a,
7219 VFromD<Repartition<int64_t, D>> b) {
7220 const Rebind<int64_t, decltype(d32)> dt;
7221 return DemoteTo(d32, Combine(dt, b, a));
7222}
7223
7224template <class D, HWY_IF_U32_D(D)>
7226 Vec128<uint64_t> b) {
7227 const Vec64<uint32_t> a32(vqmovn_u64(a.raw));
7228#if HWY_ARCH_ARM_A64
7229 (void)d32;
7230 return Vec128<uint32_t>(vqmovn_high_u64(a32.raw, b.raw));
7231#else
7232 const Vec64<uint32_t> b32(vqmovn_u64(b.raw));
7233 return Combine(d32, b32, a32);
7234#endif
7235}
7236
7237template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7240 const Rebind<uint64_t, decltype(d32)> dt;
7241 return DemoteTo(d32, Combine(dt, b, a));
7242}
7243
7244template <class D, HWY_IF_I16_D(D)>
7246 Vec128<int32_t> b) {
7247 const Vec64<int16_t> a16(vqmovn_s32(a.raw));
7248#if HWY_ARCH_ARM_A64
7249 (void)d16;
7250 return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
7251#else
7252 const Vec64<int16_t> b16(vqmovn_s32(b.raw));
7253 return Combine(d16, b16, a16);
7254#endif
7255}
7256
7257template <class D, HWY_IF_I16_D(D)>
7259 Vec64<int32_t> b) {
7260 const Full128<int32_t> d32;
7261 const Vec128<int32_t> ab = Combine(d32, b, a);
7262 return Vec64<int16_t>(vqmovn_s32(ab.raw));
7263}
7264
7265template <class D, HWY_IF_I16_D(D)>
7267 Vec32<int32_t> b) {
7268 const Full128<int32_t> d32;
7269 const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
7270 return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
7271}
7272
7273template <class D, HWY_IF_U16_D(D)>
7275 Vec128<int32_t> b) {
7276 const Vec64<uint16_t> a16(vqmovun_s32(a.raw));
7277#if HWY_ARCH_ARM_A64
7278 (void)d16;
7279 return Vec128<uint16_t>(vqmovun_high_s32(a16.raw, b.raw));
7280#else
7281 const Vec64<uint16_t> b16(vqmovun_s32(b.raw));
7282 return Combine(d16, b16, a16);
7283#endif
7284}
7285
7286template <class D, HWY_IF_U16_D(D)>
7288 Vec64<int32_t> b) {
7289 const Full128<int32_t> d32;
7290 const Vec128<int32_t> ab = Combine(d32, b, a);
7291 return Vec64<uint16_t>(vqmovun_s32(ab.raw));
7292}
7293
7294template <class D, HWY_IF_U16_D(D)>
7296 Vec32<int32_t> b) {
7297 const Full128<int32_t> d32;
7298 const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
7299 return Vec32<uint16_t>(vqmovun_s32(Combine(d32, ab, ab).raw));
7300}
7301
7302template <class D, HWY_IF_U16_D(D)>
7304 Vec128<uint32_t> b) {
7305 const Vec64<uint16_t> a16(vqmovn_u32(a.raw));
7306#if HWY_ARCH_ARM_A64
7307 (void)d16;
7308 return Vec128<uint16_t>(vqmovn_high_u32(a16.raw, b.raw));
7309#else
7310 const Vec64<uint16_t> b16(vqmovn_u32(b.raw));
7311 return Combine(d16, b16, a16);
7312#endif
7313}
7314
7315template <class D, HWY_IF_U16_D(D)>
7317 Vec64<uint32_t> b) {
7318 const Full128<uint32_t> d32;
7319 const Vec128<uint32_t> ab = Combine(d32, b, a);
7320 return Vec64<uint16_t>(vqmovn_u32(ab.raw));
7321}
7322
7323template <class D, HWY_IF_U16_D(D)>
7325 Vec32<uint32_t> b) {
7326 const Full128<uint32_t> d32;
7327 const Vec64<uint32_t> ab(vzip1_u32(a.raw, b.raw));
7328 return Vec32<uint16_t>(vqmovn_u32(Combine(d32, ab, ab).raw));
7329}
7330
7331template <class D, HWY_IF_I8_D(D)>
7333 Vec128<int16_t> b) {
7334 const Vec64<int8_t> a8(vqmovn_s16(a.raw));
7335#if HWY_ARCH_ARM_A64
7336 (void)d8;
7337 return Vec128<int8_t>(vqmovn_high_s16(a8.raw, b.raw));
7338#else
7339 const Vec64<int8_t> b8(vqmovn_s16(b.raw));
7340 return Combine(d8, b8, a8);
7341#endif
7342}
7343
7344template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7347 const Rebind<int16_t, decltype(d8)> dt;
7348 return DemoteTo(d8, Combine(dt, b, a));
7349}
7350
7351template <class D, HWY_IF_U8_D(D)>
7353 Vec128<int16_t> b) {
7354 const Vec64<uint8_t> a8(vqmovun_s16(a.raw));
7355#if HWY_ARCH_ARM_A64
7356 (void)d8;
7357 return Vec128<uint8_t>(vqmovun_high_s16(a8.raw, b.raw));
7358#else
7359 const Vec64<uint8_t> b8(vqmovun_s16(b.raw));
7360 return Combine(d8, b8, a8);
7361#endif
7362}
7363
7364template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7365HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a,
7366 VFromD<Repartition<int16_t, D>> b) {
7367 const Rebind<int16_t, decltype(d8)> dt;
7368 return DemoteTo(d8, Combine(dt, b, a));
7369}
7370
7371template <class D, HWY_IF_U8_D(D)>
7373 Vec128<uint16_t> b) {
7374 const Vec64<uint8_t> a8(vqmovn_u16(a.raw));
7375#if HWY_ARCH_ARM_A64
7376 (void)d8;
7377 return Vec128<uint8_t>(vqmovn_high_u16(a8.raw, b.raw));
7378#else
7379 const Vec64<uint8_t> b8(vqmovn_u16(b.raw));
7380 return Combine(d8, b8, a8);
7381#endif
7382}
7383
7384template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
7387 const Rebind<uint16_t, decltype(d8)> dt;
7388 return DemoteTo(d8, Combine(dt, b, a));
7389}
7390
7391template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
7393 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
7395 return ReorderDemote2To(d, a, b);
7396}
7397
7398#if HWY_NEON_HAVE_F32_TO_BF16C
7399template <class D, HWY_IF_BF16_D(D)>
7400HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
7401 VFromD<Repartition<float, D>> b) {
7402 return ReorderDemote2To(dbf16, a, b);
7403}
7404#endif // HWY_NEON_HAVE_F32_TO_BF16C
7405
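// [Editor's sketch, not part of the original header] ReorderDemote2To packs
// two saturating-demoted i32 vectors into one i16 vector; on A64 the
// vqmovn_high form above writes both halves directly. ReorderDemote2Example
// is a hypothetical name.
inline Vec128<int16_t> ReorderDemote2Example(Vec128<int32_t> a,
                                             Vec128<int32_t> b) {
  const Full128<int16_t> d16;
  return ReorderDemote2To(d16, a, b);  // lower half from a, upper half from b
}
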
7406// ================================================== CRYPTO
7407
7408 // Enabled when (aarch64 or Armv7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
7409// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
7410#if HWY_TARGET == HWY_NEON
7411
7412#ifdef HWY_NATIVE_AES
7413#undef HWY_NATIVE_AES
7414#else
7415#define HWY_NATIVE_AES
7416#endif
7417
7419 Vec128<uint8_t> round_key) {
7420 // NOTE: it is important that AESE and AESMC be consecutive instructions so
7421 // they can be fused. AESE includes AddRoundKey, which is a different ordering
7422 // than the AES-NI semantics we adopted, so we XOR with zero here and with the
7423 // actual round key afterwards (the compiler will hopefully optimize this for multiple rounds).
7424 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
7425 round_key;
7426}
7427
7429 Vec128<uint8_t> round_key) {
7430 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
7431}
7432
7434 return Vec128<uint8_t>{vaesimcq_u8(state.raw)};
7435}
7436
7438 Vec128<uint8_t> round_key) {
7439 // NOTE: it is important that AESD and AESIMC be consecutive instructions so
7440 // they can be fused. AESD includes AddRoundKey, which is a different ordering
7441 // than the AES-NI semantics we adopted, so we XOR with zero here and with the
7442 // actual round key afterwards (the compiler will hopefully optimize this for multiple rounds).
7443 return Vec128<uint8_t>(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^
7444 round_key;
7445}
7446
7448 Vec128<uint8_t> round_key) {
7449 return Vec128<uint8_t>(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
7450}
7451
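// [Editor's sketch, not part of the original header] One full AES round via
// the AESRound defined above: the zero-key AESE+AESMC pair is what lets the
// two instructions fuse, and the real round key is applied by the trailing
// XOR. AesRoundExample is a hypothetical name.
inline Vec128<uint8_t> AesRoundExample(Vec128<uint8_t> state,
                                       Vec128<uint8_t> round_key) {
  // Equivalent to MixColumns(ShiftRows(SubBytes(state))) ^ round_key.
  return AESRound(state, round_key);
}
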
7455
7457 return Vec128<uint64_t>(
7458 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
7459}
7460
7461#endif // HWY_TARGET == HWY_NEON
7462
7463// ================================================== MISC
7464
7465template <class D, HWY_IF_F32_D(D)>
7467 const Rebind<uint16_t, decltype(df32)> du16;
7468 const RebindToSigned<decltype(df32)> di32;
7469 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
7470}
7471
7472// ------------------------------ Truncations
7473
7474template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
7475 HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo),
7476 hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr>
7477HWY_API Vec128<TTo, 1> TruncateTo(DTo /* tag */, Vec128<TFrom, 1> v) {
7478 const Repartition<TTo, DFromV<decltype(v)>> d;
7479 return Vec128<TTo, 1>{BitCast(d, v).raw};
7480}
7481
7482template <class D, HWY_IF_U8_D(D)>
7484 const Repartition<uint8_t, DFromV<decltype(v)>> d;
7485 const auto v1 = BitCast(d, v);
7486 const auto v2 = detail::ConcatEven(v1, v1);
7487 const auto v3 = detail::ConcatEven(v2, v2);
7488 const auto v4 = detail::ConcatEven(v3, v3);
7489 return LowerHalf(LowerHalf(LowerHalf(v4)));
7490}
7491
7492template <class D, HWY_IF_U16_D(D)>
7494 const Repartition<uint16_t, DFromV<decltype(v)>> d;
7495 const auto v1 = BitCast(d, v);
7496 const auto v2 = detail::ConcatEven(v1, v1);
7497 const auto v3 = detail::ConcatEven(v2, v2);
7498 return LowerHalf(LowerHalf(v3));
7499}
7500
7501template <class D, HWY_IF_U32_D(D)>
7503 const Repartition<uint32_t, DFromV<decltype(v)>> d;
7504 const auto v1 = BitCast(d, v);
7505 const auto v2 = detail::ConcatEven(v1, v1);
7506 return LowerHalf(v2);
7507}
7508
7509template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
7511 const Repartition<uint8_t, DFromV<decltype(v)>> d;
7512 const auto v1 = BitCast(d, v);
7513 const auto v2 = detail::ConcatEven(v1, v1);
7514 const auto v3 = detail::ConcatEven(v2, v2);
7515 return LowerHalf(LowerHalf(v3));
7516}
7517
7518template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 1)>
7519HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
7520 const Repartition<uint16_t, DFromV<decltype(v)>> d;
7521 const auto v1 = BitCast(d, v);
7522 const auto v2 = detail::ConcatEven(v1, v1);
7523 return LowerHalf(v2);
7524}
7525
7526template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
7528 const Repartition<uint8_t, DFromV<decltype(v)>> d;
7529 const auto v1 = BitCast(d, v);
7530 const auto v2 = detail::ConcatEven(v1, v1);
7531 return LowerHalf(v2);
7532}
7533
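// [Editor's sketch, not part of the original header] TruncateTo drops the
// upper bytes of each lane; here four u32 lanes become four u8 lanes,
// assuming the usual Full32/Vec32 aliases. TruncateExample is a hypothetical
// name.
inline Vec32<uint8_t> TruncateExample(Vec128<uint32_t> v) {
  const Full32<uint8_t> d8;
  return TruncateTo(d8, v);  // keeps the low byte of each u32 lane
}
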
7534// ------------------------------ MulEven (ConcatEven)
7535
7536 // Multiplies even lanes (0, 2, ..) and places the lower half of the
7537 // double-wide result into the even lane and the upper half into its odd neighbor lane.
7539 const DFromV<decltype(a)> d;
7540 int8x16_t a_packed = ConcatEven(d, a, a).raw;
7541 int8x16_t b_packed = ConcatEven(d, b, b).raw;
7542 return Vec128<int16_t>(
7543 vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
7544}
7546 const DFromV<decltype(a)> d;
7547 uint8x16_t a_packed = ConcatEven(d, a, a).raw;
7548 uint8x16_t b_packed = ConcatEven(d, b, b).raw;
7549 return Vec128<uint16_t>(
7550 vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
7551}
7553 const DFromV<decltype(a)> d;
7554 int16x8_t a_packed = ConcatEven(d, a, a).raw;
7555 int16x8_t b_packed = ConcatEven(d, b, b).raw;
7556 return Vec128<int32_t>(
7557 vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
7558}
7560 const DFromV<decltype(a)> d;
7561 uint16x8_t a_packed = ConcatEven(d, a, a).raw;
7562 uint16x8_t b_packed = ConcatEven(d, b, b).raw;
7563 return Vec128<uint32_t>(
7564 vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
7565}
7567 const DFromV<decltype(a)> d;
7568 int32x4_t a_packed = ConcatEven(d, a, a).raw;
7569 int32x4_t b_packed = ConcatEven(d, b, b).raw;
7570 return Vec128<int64_t>(
7571 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
7572}
7574 const DFromV<decltype(a)> d;
7575 uint32x4_t a_packed = ConcatEven(d, a, a).raw;
7576 uint32x4_t b_packed = ConcatEven(d, b, b).raw;
7577 return Vec128<uint64_t>(
7578 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
7579}
7580
7581template <size_t N>
7582HWY_API Vec128<int16_t, (N + 1) / 2> MulEven(Vec128<int8_t, N> a,
7584 const DFromV<decltype(a)> d;
7585 int8x8_t a_packed = ConcatEven(d, a, a).raw;
7586 int8x8_t b_packed = ConcatEven(d, b, b).raw;
7587 return Vec128<int16_t, (N + 1) / 2>(
7588 vget_low_s16(vmull_s8(a_packed, b_packed)));
7589}
7590template <size_t N>
7591HWY_API Vec128<uint16_t, (N + 1) / 2> MulEven(Vec128<uint8_t, N> a,
7593 const DFromV<decltype(a)> d;
7594 uint8x8_t a_packed = ConcatEven(d, a, a).raw;
7595 uint8x8_t b_packed = ConcatEven(d, b, b).raw;
7596 return Vec128<uint16_t, (N + 1) / 2>(
7597 vget_low_u16(vmull_u8(a_packed, b_packed)));
7598}
7599template <size_t N>
7602 const DFromV<decltype(a)> d;
7603 int16x4_t a_packed = ConcatEven(d, a, a).raw;
7604 int16x4_t b_packed = ConcatEven(d, b, b).raw;
7605 return Vec128<int32_t, (N + 1) / 2>(
7606 vget_low_s32(vmull_s16(a_packed, b_packed)));
7607}
7608template <size_t N>
7611 const DFromV<decltype(a)> d;
7612 uint16x4_t a_packed = ConcatEven(d, a, a).raw;
7613 uint16x4_t b_packed = ConcatEven(d, b, b).raw;
7614 return Vec128<uint32_t, (N + 1) / 2>(
7615 vget_low_u32(vmull_u16(a_packed, b_packed)));
7616}
7617template <size_t N>
7620 const DFromV<decltype(a)> d;
7621 int32x2_t a_packed = ConcatEven(d, a, a).raw;
7622 int32x2_t b_packed = ConcatEven(d, b, b).raw;
7623 return Vec128<int64_t, (N + 1) / 2>(
7624 vget_low_s64(vmull_s32(a_packed, b_packed)));
7625}
7626template <size_t N>
7629 const DFromV<decltype(a)> d;
7630 uint32x2_t a_packed = ConcatEven(d, a, a).raw;
7631 uint32x2_t b_packed = ConcatEven(d, b, b).raw;
7632 return Vec128<uint64_t, (N + 1) / 2>(
7633 vget_low_u64(vmull_u32(a_packed, b_packed)));
7634}
7635
7636template <class T, HWY_IF_UI64(T)>
7638 T hi;
7639 T lo = Mul128(GetLane(a), GetLane(b), &hi);
7640 return Dup128VecFromValues(Full128<T>(), lo, hi);
7641}
7642
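// [Editor's sketch, not part of the original header] Worked example for the
// 32-bit MulEven above: even lanes 0 and 2 are multiplied into two 64-bit
// products that occupy lane pairs {0,1} and {2,3}. MulEvenExample is a
// hypothetical name.
inline Vec128<int64_t> MulEvenExample() {
  const Full128<int32_t> d32;
  const Vec128<int32_t> a = Dup128VecFromValues(d32, 1, 2, 3, 4);
  const Vec128<int32_t> b = Dup128VecFromValues(d32, 5, 6, 7, 8);
  return MulEven(a, b);  // 64-bit lanes: {1 * 5, 3 * 7} = {5, 21}
}
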
7643 // Multiplies odd lanes (1, 3, ..) and places the lower half of the
7644 // double-wide result into the even lane and the upper half into its odd neighbor lane.
7646 const DFromV<decltype(a)> d;
7647 int8x16_t a_packed = ConcatOdd(d, a, a).raw;
7648 int8x16_t b_packed = ConcatOdd(d, b, b).raw;
7649 return Vec128<int16_t>(
7650 vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
7651}
7653 const DFromV<decltype(a)> d;
7654 uint8x16_t a_packed = ConcatOdd(d, a, a).raw;
7655 uint8x16_t b_packed = ConcatOdd(d, b, b).raw;
7656 return Vec128<uint16_t>(
7657 vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
7658}
7660 const DFromV<decltype(a)> d;
7661 int16x8_t a_packed = ConcatOdd(d, a, a).raw;
7662 int16x8_t b_packed = ConcatOdd(d, b, b).raw;
7663 return Vec128<int32_t>(
7664 vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
7665}
7667 const DFromV<decltype(a)> d;
7668 uint16x8_t a_packed = ConcatOdd(d, a, a).raw;
7669 uint16x8_t b_packed = ConcatOdd(d, b, b).raw;
7670 return Vec128<uint32_t>(
7671 vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
7672}
7674 const DFromV<decltype(a)> d;
7675 int32x4_t a_packed = ConcatOdd(d, a, a).raw;
7676 int32x4_t b_packed = ConcatOdd(d, b, b).raw;
7677 return Vec128<int64_t>(
7678 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
7679}
7681 const DFromV<decltype(a)> d;
7682 uint32x4_t a_packed = ConcatOdd(d, a, a).raw;
7683 uint32x4_t b_packed = ConcatOdd(d, b, b).raw;
7684 return Vec128<uint64_t>(
7685 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
7686}
7687
7688template <size_t N>
7689HWY_API Vec128<int16_t, (N + 1) / 2> MulOdd(Vec128<int8_t, N> a,
7691 const DFromV<decltype(a)> d;
7692 int8x8_t a_packed = ConcatOdd(d, a, a).raw;
7693 int8x8_t b_packed = ConcatOdd(d, b, b).raw;
7694 return Vec128<int16_t, (N + 1) / 2>(
7695 vget_low_s16(vmull_s8(a_packed, b_packed)));
7696}
7697template <size_t N>
7698HWY_API Vec128<uint16_t, (N + 1) / 2> MulOdd(Vec128<uint8_t, N> a,
7700 const DFromV<decltype(a)> d;
7701 uint8x8_t a_packed = ConcatOdd(d, a, a).raw;
7702 uint8x8_t b_packed = ConcatOdd(d, b, b).raw;
7703 return Vec128<uint16_t, (N + 1) / 2>(
7704 vget_low_u16(vmull_u8(a_packed, b_packed)));
7705}
7706template <size_t N>
7707HWY_API Vec128<int32_t, (N + 1) / 2> MulOdd(Vec128<int16_t, N> a,
7709 const DFromV<decltype(a)> d;
7710 int16x4_t a_packed = ConcatOdd(d, a, a).raw;
7711 int16x4_t b_packed = ConcatOdd(d, b, b).raw;
7712 return Vec128<int32_t, (N + 1) / 2>(
7713 vget_low_s32(vmull_s16(a_packed, b_packed)));
7714}
7715template <size_t N>
7716HWY_API Vec128<uint32_t, (N + 1) / 2> MulOdd(Vec128<uint16_t, N> a,
7718 const DFromV<decltype(a)> d;
7719 uint16x4_t a_packed = ConcatOdd(d, a, a).raw;
7720 uint16x4_t b_packed = ConcatOdd(d, b, b).raw;
7721 return Vec128<uint32_t, (N + 1) / 2>(
7722 vget_low_u32(vmull_u16(a_packed, b_packed)));
7723}
7724template <size_t N>
7725HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(Vec128<int32_t, N> a,
7727 const DFromV<decltype(a)> d;
7728 int32x2_t a_packed = ConcatOdd(d, a, a).raw;
7729 int32x2_t b_packed = ConcatOdd(d, b, b).raw;
7730 return Vec128<int64_t, (N + 1) / 2>(
7731 vget_low_s64(vmull_s32(a_packed, b_packed)));
7732}
7733template <size_t N>
7734HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
7736 const DFromV<decltype(a)> d;
7737 uint32x2_t a_packed = ConcatOdd(d, a, a).raw;
7738 uint32x2_t b_packed = ConcatOdd(d, b, b).raw;
7739 return Vec128<uint64_t, (N + 1) / 2>(
7740 vget_low_u64(vmull_u32(a_packed, b_packed)));
7741}
7742
7743template <class T, HWY_IF_UI64(T)>
7745 T hi;
7746 T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
7747 return Dup128VecFromValues(Full128<T>(), lo, hi);
7748}
7749
7750// ------------------------------ TableLookupBytes (Combine, LowerHalf)
7751
7752// Both full
7753template <typename T, typename TI>
7755 const DFromV<decltype(from)> d;
7756 const Repartition<uint8_t, decltype(d)> d8;
7757#if HWY_ARCH_ARM_A64
7758 return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
7759 BitCast(d8, from).raw)));
7760#else
7761 uint8x16_t table0 = BitCast(d8, bytes).raw;
7762 uint8x8x2_t table;
7763 table.val[0] = vget_low_u8(table0);
7764 table.val[1] = vget_high_u8(table0);
7765 uint8x16_t idx = BitCast(d8, from).raw;
7766 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
7767 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
7768 return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
7769#endif
7770}
7771
7772// Partial index vector
7773template <typename T, typename TI, size_t NI, HWY_IF_V_SIZE_LE(TI, NI, 8)>
7775 const Full128<TI> d_full;
7776 const Vec64<TI> from64(from.raw);
7777 const auto idx_full = Combine(d_full, from64, from64);
7778 const auto out_full = TableLookupBytes(bytes, idx_full);
7779 return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
7780}
7781
7782// Partial table vector
7783template <typename T, size_t N, typename TI, HWY_IF_V_SIZE_LE(T, N, 8)>
7785 const Full128<T> d_full;
7786 return TableLookupBytes(Combine(d_full, bytes, bytes), from);
7787}
7788
7789// Partial both
7790template <typename T, size_t N, typename TI, size_t NI,
7791 HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_V_SIZE_LE(TI, NI, 8)>
7793 Vec128<TI, NI> from) {
7794 const DFromV<decltype(bytes)> d;
7795 const Simd<TI, NI, 0> d_idx;
7796 const Repartition<uint8_t, decltype(d_idx)> d_idx8;
7797 // uint8x8
7798 const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
7799 const auto from8 = BitCast(d_idx8, from);
7800 const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
7801 return BitCast(d_idx, v8);
7802}
7803
7804 // For all vector widths; Arm TBL already returns zero for indices >= 0x10.
7805template <class V, class VI>
7806HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
7807 return TableLookupBytes(bytes, from);
7808}
7809
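// [Editor's sketch, not part of the original header] TableLookupBytes gathers
// bytes of `bytes` at the positions given by the index vector; as noted above,
// Arm TBL already yields zero for out-of-range indices, so TableLookupBytesOr0
// needs no extra masking. TableLookupBytesExample is a hypothetical name.
inline Vec128<uint8_t> TableLookupBytesExample(Vec128<uint8_t> bytes) {
  const Full128<uint8_t> d;
  const Vec128<uint8_t> idx = Set(d, uint8_t{3});  // gather byte 3 everywhere
  return TableLookupBytes(bytes, idx);             // broadcasts bytes[3]
}
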
7810// ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)
7811
7812#if HWY_TARGET == HWY_NEON
7813template <uint8_t kRcon>
7815 alignas(16) static constexpr uint8_t kRconXorMask[16] = {
7816 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
7817 alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
7818 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
7819 const DFromV<decltype(v)> d;
7820 const Repartition<uint32_t, decltype(d)> du32;
7821 const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
7822 const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
7823 return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
7824}
7825#endif // HWY_TARGET == HWY_NEON
7826
7827// ------------------------------ Scatter in generic_ops-inl.h
7828// ------------------------------ Gather in generic_ops-inl.h
7829
7830// ------------------------------ Reductions
7831
7832// On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
7833#if HWY_ARCH_ARM_A64
7834
7835#ifdef HWY_NATIVE_REDUCE_SCALAR
7836#undef HWY_NATIVE_REDUCE_SCALAR
7837#else
7838#define HWY_NATIVE_REDUCE_SCALAR
7839#endif
7840
7841// TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
7842#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
7843 template <class D, HWY_IF_LANES_D(D, size)> \
7844 HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) { \
7845 return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \
7846 }
7847
7848// Excludes u64/s64 (missing minv/maxv) and f16 (missing addv).
7849#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
7850 HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8) \
7851 HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8) \
7852 HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16) \
7853 HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16) \
7854 HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32) \
7855 HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32) \
7856 HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8) \
7857 HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8) \
7858 HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16) \
7859 HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16) \
7860 HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32) \
7861 HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32) \
7862 HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32) \
7863 HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \
7864 HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64)
7865
7866// Different interface than HWY_NEON_DEF_FUNCTION_FULL_UI_64.
7867#define HWY_NEON_DEF_REDUCTION_UI64(name, prefix) \
7868 HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \
7869 HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64)
7870
7871#if HWY_HAVE_FLOAT16
7872#define HWY_NEON_DEF_REDUCTION_F16(name, prefix) \
7873 HWY_NEON_DEF_REDUCTION(float16, 4, name, prefix, _, f16) \
7874 HWY_NEON_DEF_REDUCTION(float16, 8, name, prefix##q, _, f16)
7875#else
7876#define HWY_NEON_DEF_REDUCTION_F16(name, prefix)
7877#endif
7878
7879HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv)
7880HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv)
7881HWY_NEON_DEF_REDUCTION_F16(ReduceMin, vminv)
7882HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)
7883
7884HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
7885HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
7886
7887// Emulate missing UI64 and partial N=2.
7888template <class D, HWY_IF_LANES_D(D, 2),
7889 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
7890HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
7891 return GetLane(v10) + ExtractLane(v10, 1);
7892}
7893
7894template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
7895 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
7896HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
7897 return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
7898}
7899
7900template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
7901 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
7902HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
7903 return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
7904}
7905
7906#if HWY_HAVE_FLOAT16
7907template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
7908HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
7909 return GetLane(Min(v10, Reverse2(d, v10)));
7910}
7911
7912template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
7913HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
7914 return GetLane(Max(v10, Reverse2(d, v10)));
7915}
7916
7917template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
7918HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
7919 const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
7920 return GetLane(VFromD<D>(vpadd_f16(x2, x2)));
7921}
7922template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
7923HWY_API float16_t ReduceSum(D d, VFromD<D> v) {
7924 const Half<decltype(d)> dh;
7925 return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw))));
7926}
7927#endif // HWY_HAVE_FLOAT16
7928
7929#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
7930#undef HWY_NEON_DEF_REDUCTION_F16
7931#undef HWY_NEON_DEF_REDUCTION_UI64
7932#undef HWY_NEON_DEF_REDUCTION
7933
7934// ------------------------------ SumOfLanes
7935
7936template <class D, HWY_IF_LANES_GT_D(D, 1)>
7938 return Set(d, ReduceSum(d, v));
7939}
7940template <class D, HWY_IF_LANES_GT_D(D, 1)>
7942 return Set(d, ReduceMin(d, v));
7943}
7944template <class D, HWY_IF_LANES_GT_D(D, 1)>
7946 return Set(d, ReduceMax(d, v));
7947}
7948
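// [Editor's sketch, not part of the original header] ReduceSum returns a
// scalar, whereas the SumOfLanes wrapper above broadcasts that scalar back
// into every lane. ReduceSumExample is a hypothetical name.
inline int32_t ReduceSumExample() {
  const Full128<int32_t> d;
  const Vec128<int32_t> v = Dup128VecFromValues(d, 1, 2, 3, 4);
  return ReduceSum(d, v);  // 10; SumOfLanes(d, v) would be {10, 10, 10, 10}
}
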
7949// On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
7950#else // !HWY_ARCH_ARM_A64
7951
7952// Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
7953#undef HWY_IF_SUM_OF_LANES_D
7954#define HWY_IF_SUM_OF_LANES_D(D) \
7955 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
7956 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
7957 nullptr
7958#undef HWY_IF_MINMAX_OF_LANES_D
7959#define HWY_IF_MINMAX_OF_LANES_D(D) \
7960 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
7961 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
7962 nullptr
7963
7964 // For Armv7, we implement reductions using a series of pairwise operations.
7965 // This produces a full vector result, so we express Reduce* in terms of *OfLanes.
7966#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
7967#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
7968 template <class D, HWY_IF_LANES_D(D, size)> \
7969 HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
7970 Vec128<type##_t, size> v) { \
7971 HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
7972 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7973 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7974 return Vec128<type##_t, size>(tmp); \
7975 }
7976
7977// For the wide versions, the pairwise operations produce a half-length vector.
7978// We produce that `tmp` and then Combine.
7979#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
7980 suffix) \
7981 template <class D, HWY_IF_LANES_D(D, size)> \
7982 HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
7983 Vec128<type##_t, size> v) { \
7984 HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
7985 tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
7986 vget_low_##suffix(v.raw)); \
7987 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7988 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7989 if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
7990 return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp)); \
7991 }
7992
7993#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
7994 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32) \
7995 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
7996 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
7997 HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32) \
7998 HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
7999 HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
8000 HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32) \
8001 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \
8002 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
8003 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
8004 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32) \
8005 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
8006 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8) \
8007 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32)
8008
8012
8013#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
8014#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
8015#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
8016#undef HWY_NEON_BUILD_TYPE_T
8017
8018 // GetLane(SumsOf4(v)) is more efficient on Armv7 NEON than the default
8019 // N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h.
8020#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
8021#undef HWY_NATIVE_REDUCE_SUM_4_UI8
8022#else
8023#define HWY_NATIVE_REDUCE_SUM_4_UI8
8024#endif
8025
8026template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
8028 return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
8029}
8030
8031#endif // HWY_ARCH_ARM_A64
8032
8033// ------------------------------ LoadMaskBits (TestBit)
8034
8035namespace detail {
8036
8037// Helper function to set 64 bits and potentially return a smaller vector. The
8038// overload is required to call the q vs non-q intrinsics. Note that 8-bit
8039// LoadMaskBits only requires 16 bits, but 64 avoids casting.
8040template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8041HWY_INLINE VFromD<D> Set64(D /* tag */, uint64_t mask_bits) {
8042 const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
8043 return VFromD<D>(BitCast(Full64<TFromD<D>>(), v64).raw);
8044}
8045template <typename T>
8046HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
8047 return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
8048}
8049
8050template <class D, HWY_IF_T_SIZE_D(D, 1)>
8051HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
8052 const RebindToUnsigned<decltype(d)> du;
8053 // Easier than Set(), which would require a type wider than 8 bits and would
8054 // not compile for T=uint8_t, N=1.
8055 const auto vmask_bits = Set64(du, mask_bits);
8056
8057 // Replicate bytes 8x such that each byte contains the bit that governs it.
8058 alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
8059 1, 1, 1, 1, 1, 1, 1, 1};
8060 const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
8061
8062 alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
8063 1, 2, 4, 8, 16, 32, 64, 128};
8064 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
8065}
8066
8067template <class D, HWY_IF_T_SIZE_D(D, 2)>
8068HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
8069 const RebindToUnsigned<decltype(d)> du;
8070 alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
8071 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
8072 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
8073}
8074
8075template <class D, HWY_IF_T_SIZE_D(D, 4)>
8076HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
8077 const RebindToUnsigned<decltype(d)> du;
8078 alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
8079 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
8080 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
8081}
8082
8083template <class D, HWY_IF_T_SIZE_D(D, 8)>
8084HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
8085 const RebindToUnsigned<decltype(d)> du;
8086 alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
8087 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
8088}
8089
8090} // namespace detail
8091
8092 // `bits` points to at least 8 readable bytes, not all of which need be valid.
8093template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
8095 uint64_t mask_bits = 0;
8096 CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits);
8097 return detail::LoadMaskBits(d, mask_bits);
8098}
8099
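// [Editor's sketch, not part of the original header] Bit i of the loaded bits
// governs lane i: 0x05 (binary 0101) selects lanes 0 and 2 of a four-lane
// vector. LoadMaskBitsExample is a hypothetical name.
inline Mask128<uint32_t> LoadMaskBitsExample() {
  const Full128<uint32_t> d;
  alignas(8) const uint8_t bits[8] = {0x05, 0, 0, 0, 0, 0, 0, 0};
  return LoadMaskBits(d, bits);  // mask = {true, false, true, false}
}
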
8100// ------------------------------ Dup128MaskFromMaskBits
8101
8102template <class D>
8104 constexpr size_t kN = MaxLanes(d);
8105 if (kN < 8) mask_bits &= (1u << kN) - 1;
8106 return detail::LoadMaskBits(d, mask_bits);
8107}
8108
8109// ------------------------------ Mask
8110
8111namespace detail {
8112
8113// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
8114// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
8115template <class D, HWY_IF_V_SIZE_D(D, 16)>
8117 const Full128<uint16_t> du16;
8118 const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
8119 const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
8120 return GetLane(BitCast(Full64<uint64_t>(), nib));
8121}
8122
8123template <class D, HWY_IF_V_SIZE_D(D, 8)>
8124HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
8125 // There is no vshrn_n_u16 for uint16x4, so zero-extend.
8126 const Twice<decltype(d)> d2;
8127 const VFromD<decltype(d2)> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
8128 // No need to mask, upper half is zero thanks to ZeroExtendVector.
8129 return NibblesFromMask(d2, MaskFromVec(v128));
8130}
8131
8132template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
8133HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
8134 const Mask64<TFromD<D>> mask64(mask.raw);
8135 const uint64_t nib = NibblesFromMask(Full64<TFromD<D>>(), mask64);
8136 // Clear nibbles from upper half of 64-bits
8137 return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
8138}
8139
8140template <typename T>
8142 alignas(16) static constexpr uint8_t kSliceLanes[16] = {
8143 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
8144 };
8145 const Full128<uint8_t> du;
8146 const Vec128<uint8_t> values =
8147 BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
8148
8149#if HWY_ARCH_ARM_A64
8150 // Can't vaddv - we need two separate bytes (16 bits).
8151 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
8152 const uint8x8_t x4 = vpadd_u8(x2, x2);
8153 const uint8x8_t x8 = vpadd_u8(x4, x4);
8154 return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF;
8155#else
8156 // Don't have vpaddq, so keep doubling lane size.
8157 const uint16x8_t x2 = vpaddlq_u8(values.raw);
8158 const uint32x4_t x4 = vpaddlq_u16(x2);
8159 const uint64x2_t x8 = vpaddlq_u32(x4);
8160 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
8161#endif
8162}
8163
8164template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8166 // Upper lanes of partial loads are undefined. OnlyActive will fix this,
8167 // provided we load all kSliceLanes so the upper lanes do not pollute the valid bits.
8168 alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
8169 0x10, 0x20, 0x40, 0x80};
8170 const DFromM<decltype(mask)> d;
8171 const RebindToUnsigned<decltype(d)> du;
8172 const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
8173 const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
8174
8175#if HWY_ARCH_ARM_A64
8176 return vaddv_u8(values.raw);
8177#else
8178 const uint16x4_t x2 = vpaddl_u8(values.raw);
8179 const uint32x2_t x4 = vpaddl_u16(x2);
8180 const uint64x1_t x8 = vpaddl_u32(x4);
8181 return vget_lane_u64(x8, 0);
8182#endif
8183}
8184
8185template <typename T>
8187 alignas(16) static constexpr uint16_t kSliceLanes[8] = {
8188 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
8189 const Full128<T> d;
8190 const Full128<uint16_t> du;
8191 const Vec128<uint16_t> values =
8192 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
8193#if HWY_ARCH_ARM_A64
8194 return vaddvq_u16(values.raw);
8195#else
8196 const uint32x4_t x2 = vpaddlq_u16(values.raw);
8197 const uint64x2_t x4 = vpaddlq_u32(x2);
8198 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
8199#endif
8200}
8201
8202template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8204 // Upper lanes of partial loads are undefined. OnlyActive will fix this,
8205 // provided we load all kSliceLanes so the upper lanes do not pollute the valid bits.
8206 alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
8207 const DFromM<decltype(mask)> d;
8208 const RebindToUnsigned<decltype(d)> du;
8209 const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
8210 const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
8211#if HWY_ARCH_ARM_A64
8212 return vaddv_u16(values.raw);
8213#else
8214 const uint32x2_t x2 = vpaddl_u16(values.raw);
8215 const uint64x1_t x4 = vpaddl_u32(x2);
8216 return vget_lane_u64(x4, 0);
8217#endif
8218}
8219
8220template <typename T>
8222 alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
8223 const Full128<T> d;
8224 const Full128<uint32_t> du;
8225 const Vec128<uint32_t> values =
8226 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
8227#if HWY_ARCH_ARM_A64
8228 return vaddvq_u32(values.raw);
8229#else
8230 const uint64x2_t x2 = vpaddlq_u32(values.raw);
8231 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
8232#endif
8233}
8234
8235template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8237 // Upper lanes of partial loads are undefined. OnlyActive will fix this,
8238 // provided we load all kSliceLanes so the upper lanes do not pollute the valid bits.
8239 alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2};
8240 const DFromM<decltype(mask)> d;
8241 const RebindToUnsigned<decltype(d)> du;
8242 const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
8243 const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
8244#if HWY_ARCH_ARM_A64
8245 return vaddv_u32(values.raw);
8246#else
8247 const uint64x1_t x2 = vpaddl_u32(values.raw);
8248 return vget_lane_u64(x2, 0);
8249#endif
8250}
8251
8252template <typename T>
8254 alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2};
8255 const Full128<T> d;
8256 const Full128<uint64_t> du;
8257 const Vec128<uint64_t> values =
8258 BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
8259#if HWY_ARCH_ARM_A64
8260 return vaddvq_u64(values.raw);
8261#else
8262 return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
8263#endif
8264}
8265
8266template <typename T>
8268 const Full64<T> d;
8269 const Full64<uint64_t> du;
8270 const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
8271 return vget_lane_u64(values.raw, 0);
8272}
8273
8274 // Keeps only the lowest N bits of the BitsFromMask result.
8275template <typename T, size_t N>
8276constexpr uint64_t OnlyActive(uint64_t bits) {
8277 return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
8278}
8279
8280template <typename T, size_t N>
8282 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
8283}
8284
8285// Returns number of lanes whose mask is set.
8286//
8287// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
8288// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
8289// changes each lane to 1 (if mask set) or 0.
8290// NOTE: PopCount also operates on vectors, so we still have to do horizontal
8291// sums separately. We specialize CountTrue for full vectors (negating instead
8292// of PopCount because it avoids an extra shift), and use PopCount of
8293// NibblesFromMask for partial vectors.
8294
8295template <typename T>
8297 const Full128<int8_t> di;
8298 const int8x16_t ones =
8299 vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
8300
8301#if HWY_ARCH_ARM_A64
8302 return static_cast<size_t>(vaddvq_s8(ones));
8303#else
8304 const int16x8_t x2 = vpaddlq_s8(ones);
8305 const int32x4_t x4 = vpaddlq_s16(x2);
8306 const int64x2_t x8 = vpaddlq_s32(x4);
8307 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
8308#endif
8309}
8310template <typename T>
8312 const Full128<int16_t> di;
8313 const int16x8_t ones =
8314 vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
8315
8316#if HWY_ARCH_ARM_A64
8317 return static_cast<size_t>(vaddvq_s16(ones));
8318#else
8319 const int32x4_t x2 = vpaddlq_s16(ones);
8320 const int64x2_t x4 = vpaddlq_s32(x2);
8321 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
8322#endif
8323}
8324
8325template <typename T>
8327 const Full128<int32_t> di;
8328 const int32x4_t ones =
8329 vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
8330
8331#if HWY_ARCH_ARM_A64
8332 return static_cast<size_t>(vaddvq_s32(ones));
8333#else
8334 const int64x2_t x2 = vpaddlq_s32(ones);
8335 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
8336#endif
8337}
8338
8339template <typename T>
8341#if HWY_ARCH_ARM_A64
8342 const Full128<int64_t> di;
8343 const int64x2_t ones =
8344 vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
8345 return static_cast<size_t>(vaddvq_s64(ones));
8346#else
8347 const Full128<uint64_t> du;
8348 const auto mask_u = VecFromMask(du, RebindMask(du, mask));
8349 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
8350 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
8351#endif
8352}
8353
8354} // namespace detail
8355
8356// Full
8357template <class D, typename T = TFromD<D>>
8358HWY_API size_t CountTrue(D /* tag */, Mask128<T> mask) {
8359 return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
8360}
8361
8362// Partial
8363template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8364HWY_API size_t CountTrue(D d, MFromD<D> mask) {
8365 constexpr int kDiv = 4 * sizeof(TFromD<D>);
8366 return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
8367}
8368
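// [Editor's sketch, not part of the original header] For partial vectors,
// each active lane contributes one 0xF nibble, so popcount / (4 * sizeof(T))
// recovers the lane count. CountEqualExample is a hypothetical name.
inline size_t CountEqualExample(Vec64<uint16_t> a, Vec64<uint16_t> b) {
  const Full64<uint16_t> d;
  return CountTrue(d, a == b);  // number of lanes where a == b
}
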
8369template <class D>
8371 const uint64_t nib = detail::NibblesFromMask(d, mask);
8372 constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
8373 return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
8374}
8375
8376template <class D>
8378 const uint64_t nib = detail::NibblesFromMask(d, mask);
8379 if (nib == 0) return -1;
8380 constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
8381 return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
8382}
8383
8384template <class D>
8386 const uint64_t nib = detail::NibblesFromMask(d, mask);
8387 constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
8388 return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv;
8389}
8390
8391template <class D>
8392HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
8393 const uint64_t nib = detail::NibblesFromMask(d, mask);
8394 if (nib == 0) return -1;
8395 constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
8396 return static_cast<intptr_t>((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) /
8397 kDiv);
8398}
8399
8400 // `bits` points to at least 8 writable bytes.
8401template <class D>
8402HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
8403 const uint64_t mask_bits = detail::BitsFromMask(mask);
8404 const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
8405 CopyBytes<kNumBytes>(&mask_bits, bits);
8406 return kNumBytes;
8407}
8408
8409template <class D>
8411 return detail::NibblesFromMask(d, m) == 0;
8412}
8413
8414// Full
8415template <class D, typename T = TFromD<D>>
8417 return detail::NibblesFromMask(d, m) == ~0ull;
8418}
8419// Partial
8420template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8422 return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1;
8423}
8424
8425// ------------------------------ Compress
8426
8427template <typename T>
8429 enum { value = (sizeof(T) != 1) };
8430};
8431
8432namespace detail {
8433
8434// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
8435template <class D, HWY_IF_V_SIZE_D(D, 16)>
8436HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
8437 return Vec128<uint8_t>(vreinterpretq_u8_u64(
8438 vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
8439}
8440
8441// Load 8 bytes and return half-reg with N <= 8 bytes.
8442template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
8443HWY_INLINE VFromD<D> Load8Bytes(D d, const uint8_t* bytes) {
8444 return Load(d, bytes);
8445}
8446
8447template <typename T, size_t N>
8449 uint64_t mask_bits) {
8450 HWY_DASSERT(mask_bits < 256);
8451 const Simd<T, N, 0> d;
8452 const Repartition<uint8_t, decltype(d)> d8;
8453 const Simd<uint16_t, N, 0> du;
8454
8455 // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
8456 // indices for VTBL (one vector's worth for each of 256 combinations of
8457 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
8458 // store lane indices and convert to byte indices (2*lane + 0..1), with the
8459 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
8460 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
8461 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
8462 // is likely more costly than the higher cache footprint from storing bytes.
8463 alignas(16) static constexpr uint8_t table[256 * 8] = {
8464 // PrintCompress16x8Tables
8465 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8466 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8467 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
8468 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8469 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
8470 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
8471 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
8472 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8473 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
8474 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
8475 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
8476 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
8477 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
8478 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
8479 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
8480 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8481 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
8482 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
8483 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
8484 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
8485 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
8486 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
8487 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
8488 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
8489 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
8490 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
8491 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
8492 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
8493 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
8494 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
8495 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
8496 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8497 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
8498 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
8499 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
8500 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
8501 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
8502 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
8503 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
8504 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
8505 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
8506 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
8507 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
8508 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
8509 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
8510 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
8511 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
8512 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
8513 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
8514 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
8515 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
8516 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
8517 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
8518 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
8519 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
8520 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
8521 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
8522 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
8523 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
8524 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
8525 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
8526 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
8527 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
8528 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
8529 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
8530 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
8531 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
8532 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
8533 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
8534 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
8535 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
8536 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
8537 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
8538 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
8539 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
8540 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
8541 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
8542 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
8543 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
8544 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
8545 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
8546 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
8547 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
8548 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
8549 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
8550 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
8551 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
8552 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
8553 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
8554 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
8555 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
8556 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
8557 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
8558 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
8559 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
8560 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
8561 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
8562 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
8563 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
8564 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
8565 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
8566 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
8567 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
8568 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
8569 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
8570 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
8571 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
8572 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
8573 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
8574 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
8575 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
8576 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
8577 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
8578 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
8579 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
8580 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
8581 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
8582 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
8583 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
8584 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
8585 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
8586 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
8587 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
8588 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
8589 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
8590 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
8591 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
8592 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
8593
8594 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
8595 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
8596 return BitCast(d, pairs + Set(du, 0x0100));
8597}
8598
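// [Editor's note, not part of the original header] The doubling trick above in
// scalar form: the table stores byte index 2*lane; ZipLower(b, b) duplicates
// it into a 16-bit pair, and adding 0x0100 turns {2k, 2k} into {2k, 2k + 1},
// i.e. both byte indices of 16-bit lane k. LaneIdxToBytePair is a hypothetical
// helper.
inline uint16_t LaneIdxToBytePair(uint8_t byte_idx) {
  const uint16_t pair = static_cast<uint16_t>(byte_idx | (byte_idx << 8));
  return static_cast<uint16_t>(pair + 0x0100);
}
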
8599template <typename T, size_t N>
8601 uint64_t mask_bits) {
8602 HWY_DASSERT(mask_bits < 256);
8603 const Simd<T, N, 0> d;
8604 const Repartition<uint8_t, decltype(d)> d8;
8605 const Simd<uint16_t, N, 0> du;
8606
8607 // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
8608 // indices for VTBL (one vector's worth for each of 256 combinations of
8609 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
8610 // store lane indices and convert to byte indices (2*lane + 0..1), with the
8611 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
8612 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
8613 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
8614 // is likely more costly than the higher cache footprint from storing bytes.
8615 alignas(16) static constexpr uint8_t table[256 * 8] = {
8616 // PrintCompressNot16x8Tables
8617 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
8618 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
8619 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
8620 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
8621 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
8622 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
8623 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
8624 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
8625 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
8626 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
8627 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
8628 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
8629 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
8630 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
8631 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
8632 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
8633 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
8634 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
8635 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
8636 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
8637 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
8638 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
8639 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
8640 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
8641 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
8642 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
8643 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
8644 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
8645 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
8646 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
8647 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
8648 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
8649 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
8650 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
8651 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
8652 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
8653 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
8654 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
8655 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
8656 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
8657 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
8658 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
8659 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
8660 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
8661 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
8662 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
8663 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
8664 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
8665 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
8666 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
8667 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
8668 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
8669 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
8670 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
8671 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
8672 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
8673 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
8674 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
8675 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
8676 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
8677 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
8678 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
8679 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
8680 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
8681 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
8682 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
8683 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
8684 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
8685 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
8686 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
8687 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
8688 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
8689 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
8690 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
8691 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
8692 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
8693 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
8694 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
8695 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
8696 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
8697 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
8698 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
8699 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
8700 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
8701 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
8702 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
8703 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
8704 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
8705 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
8706 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
8707 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
8708 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
8709 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
8710 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
8711 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
8712 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
8713 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
8714 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
8715 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
8716 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
8717 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
8718 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
8719 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
8720 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
8721 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
8722 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
8723 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
8724 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
8725 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
8726 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
8727 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
8728 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
8729 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
8730 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
8731 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
8732 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
8733 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
8734 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
8735 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
8736 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
8737 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
8738 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
8739 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
8740 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
8741 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
8742 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
8743 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
8744 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
8745
8746 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
8747 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
8748 return BitCast(d, pairs + Set(du, 0x0100));
8749}
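// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// A scalar model of the index-doubling trick used by the two tables above:
// each table byte holds 2*lane, ZipLower of the loaded bytes with themselves
// duplicates that value into both halves of a 16-bit lane, and adding 0x0100
// turns the pair into the byte indices {2*lane, 2*lane+1} that select the
// whole 16-bit source lane. Standalone code with hypothetical names.
#include <cstdint>
#include <cstdio>

static uint16_t BytePairFromDoubledLane(uint8_t doubled_lane) {
  // ZipLower(b, b): both bytes of the 16-bit pair equal the table entry.
  const uint16_t pair = static_cast<uint16_t>(doubled_lane * 0x0101u);
  // + Set(du, 0x0100): the high byte becomes 2*lane + 1, the low byte stays 2*lane.
  return static_cast<uint16_t>(pair + 0x0100u);
}

int main() {
  // Lane 3 is stored as 6; the resulting byte indices are 6 (low) and 7 (high),
  // i.e. exactly the two bytes of 16-bit lane 3.
  const uint16_t pair = BytePairFromDoubledLane(6);
  std::printf("%u %u\n", pair & 0xFFu, static_cast<unsigned>(pair >> 8));
}
// -----------------------------------------------------------------------------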
8750
8751template <typename T, size_t N>
8752HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
8753                                    uint64_t mask_bits) {
8754 HWY_DASSERT(mask_bits < 16);
8755
8756 // There are only 4 lanes, so we can afford to load the index vector directly.
8757 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
8758 // PrintCompress32x4Tables
8759 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
8760 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
8761 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
8762 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
8763 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
8764 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
8765 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
8766 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
8767 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
8768 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
8769 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
8770 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
8771 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
8772 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
8773 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
8774 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
8775 const Simd<T, N, 0> d;
8776 const Repartition<uint8_t, decltype(d)> d8;
8777 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8778}
8779
8780template <typename T, size_t N>
8781HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
8782                                       uint64_t mask_bits) {
8783 HWY_DASSERT(mask_bits < 16);
8784
8785 // There are only 4 lanes, so we can afford to load the index vector directly.
8786 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
8787 // PrintCompressNot32x4Tables
8788 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
8789 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
8790 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
8791 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
8792 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
8793 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
8794 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
8795 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8796 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
8797 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
8798 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
8799 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
8800 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
8801 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
8802 12, 13, 14, 15};
8803 const Simd<T, N, 0> d;
8804 const Repartition<uint8_t, decltype(d)> d8;
8805 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8806}
8807
8808#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
8809
8810template <typename T, size_t N>
8811HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
8812                                    uint64_t mask_bits) {
8813 HWY_DASSERT(mask_bits < 4);
8814
8815 // There are only 2 lanes, so we can afford to load the index vector directly.
8816 alignas(16) static constexpr uint8_t u8_indices[64] = {
8817 // PrintCompress64x2Tables
8818 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8819 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8820 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
8821 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
8822
8823 const Simd<T, N, 0> d;
8824 const Repartition<uint8_t, decltype(d)> d8;
8825 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8826}
8827
8828template <typename T, size_t N>
8829HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
8830 uint64_t mask_bits) {
8831 HWY_DASSERT(mask_bits < 4);
8832
8833 // There are only 2 lanes, so we can afford to load the index vector directly.
8834 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
8835 // PrintCompressNot64x2Tables
8836 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8837 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
8838 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8839 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
8840
8841 const Simd<T, N, 0> d;
8842 const Repartition<uint8_t, decltype(d)> d8;
8843 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
8844}
8845
8846#endif
8847
8848// Helper function called by both Compress and CompressStore - avoids a
8849// redundant BitsFromMask in the latter.
8850template <typename T, size_t N>
8851HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, uint64_t mask_bits) {
8852 const auto idx =
8853 detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
8854 using D = DFromV<decltype(v)>;
8855 const RebindToSigned<D> di;
8856 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
8857}
8858
8859template <typename T, size_t N>
8860HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, uint64_t mask_bits) {
8861 const auto idx =
8862 detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
8863 using D = DFromV<decltype(v)>;
8864 const RebindToSigned<D> di;
8865 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
8866}
8867
8868} // namespace detail
8869
8870// Single lane: no-op
8871template <typename T>
8872HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
8873 return v;
8874}
8875
8876// Two lanes: conditional swap
8877template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
8878HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
8879 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
8880 const DFromV<decltype(v)> d;
8881 const Vec128<T, N> m = VecFromMask(d, mask);
8882 const Vec128<T, N> maskL = DupEven(m);
8883 const Vec128<T, N> maskH = DupOdd(m);
8884 const Vec128<T, N> swap = AndNot(maskL, maskH);
8885 return IfVecThenElse(swap, Shuffle01(v), v);
8886}
8887
8888// General case, 2 or 4 byte lanes
8889template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
8890HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
8891 return detail::Compress(v, detail::BitsFromMask(mask));
8892}
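// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// Typical use of the Compress op defined above: lanes whose mask bit is true
// are packed towards lane 0; the remaining lanes are unspecified. A minimal
// sketch assuming single-target (static) dispatch, with the usual
// foreach_target/attribute boilerplate omitted; KeepPositive is a
// hypothetical name.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns v with its positive lanes moved to the front.
template <class D>
hn::VFromD<D> KeepPositive(D d, hn::VFromD<D> v) {
  const hn::MFromD<D> is_pos = hn::Lt(hn::Zero(d), v);
  return hn::Compress(v, is_pos);
}
// -----------------------------------------------------------------------------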
8893
8894// Single lane: no-op
8895template <typename T>
8896HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
8897 return v;
8898}
8899
8900// Two lanes: conditional swap
8901template <typename T, HWY_IF_T_SIZE(T, 8)>
8902HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
8903 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
8904 const DFromV<decltype(v)> d;
8905 const Vec128<T> m = VecFromMask(d, mask);
8906 const Vec128<T> maskL = DupEven(m);
8907 const Vec128<T> maskH = DupOdd(m);
8908 const Vec128<T> swap = AndNot(maskH, maskL);
8909 return IfVecThenElse(swap, Shuffle01(v), v);
8910}
8911
8912// General case, 2 or 4 byte lanes
8913template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
8914HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
8915 // For partial vectors, we cannot pull the Not() into the table because
8916 // BitsFromMask clears the upper bits.
8917 if (N < 16 / sizeof(T)) {
8918 return detail::Compress(v, detail::BitsFromMask(Not(mask)));
8919 }
8920 return detail::CompressNot(v, detail::BitsFromMask(mask));
8921}
8922
8923// ------------------------------ CompressBlocksNot
8924HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
8925 Mask128<uint64_t> /* m */) {
8926 return v;
8927}
8928
8929// ------------------------------ CompressBits
8930
8931template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
8932HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
8933                                     const uint8_t* HWY_RESTRICT bits) {
8934 uint64_t mask_bits = 0;
8935 constexpr size_t kNumBytes = (N + 7) / 8;
8936 CopyBytes<kNumBytes>(bits, &mask_bits);
8937 if (N < 8) {
8938 mask_bits &= (1ull << N) - 1;
8939 }
8940
8941 return detail::Compress(v, mask_bits);
8942}
8943
8944// ------------------------------ CompressStore
8945template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
8946HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
8947                             TFromD<D>* HWY_RESTRICT unaligned) {
8948 const uint64_t mask_bits = detail::BitsFromMask(mask);
8949 StoreU(detail::Compress(v, mask_bits), d, unaligned);
8950 return PopCount(mask_bits);
8951}
8952
8953// ------------------------------ CompressBlendedStore
8954template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
8955HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
8956                                    TFromD<D>* HWY_RESTRICT unaligned) {
8957 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
8958 const uint64_t mask_bits = detail::BitsFromMask(m);
8959 const size_t count = PopCount(mask_bits);
8960 const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
8961 const VFromD<decltype(du)> compressed =
8962 detail::Compress(BitCast(du, v), mask_bits);
8963 BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
8964 return count;
8965}
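// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// CompressBlendedStore writes exactly PopCount(mask) lanes and leaves the
// destination beyond them untouched, whereas CompressStore always stores a
// full vector. That makes it convenient for in-place filtering. A minimal
// sketch assuming static dispatch and a size that is a multiple of Lanes(d);
// RemoveNaNs is a hypothetical name.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packs the non-NaN elements of p[0, size) to the front; returns the new count.
size_t RemoveNaNs(float* HWY_RESTRICT p, size_t size) {
  const hn::ScalableTag<float> d;
  size_t written = 0;
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, p + i);
    written += hn::CompressBlendedStore(v, hn::Not(hn::IsNaN(v)), d, p + written);
  }
  return written;
}
// -----------------------------------------------------------------------------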
8966
8967// ------------------------------ CompressBitsStore
8968
8969template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
8970HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
8971 D d, TFromD<D>* HWY_RESTRICT unaligned) {
8972 uint64_t mask_bits = 0;
8973 constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8;
8974 CopyBytes<kNumBytes>(bits, &mask_bits);
8975 if (d.MaxLanes() < 8) {
8976 mask_bits &= (1ull << d.MaxLanes()) - 1;
8977 }
8978
8979 StoreU(detail::Compress(v, mask_bits), d, unaligned);
8980 return PopCount(mask_bits);
8981}
8982
8983// ------------------------------ LoadInterleaved2
8984
8985// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
8986#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
8987#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
8988#else
8989#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
8990#endif
8991
8992namespace detail {
8993
8994#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
8995#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
8996
8997#if HWY_ARCH_ARM_A64
8998#define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
8999#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
9000#else
9001// Exclude 64x2 and f64x1, which are only supported on aarch64
9002#define HWY_IF_LOAD_INT(D) \
9003 HWY_IF_V_SIZE_GT_D(D, 4), \
9004 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9005 nullptr
9006#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
9007 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9008 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
9009 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9010 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9011 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
9012#endif // HWY_ARCH_ARM_A64
9013
9014// Must return raw tuple because Tuple2 lacks a ctor, and we cannot use
9015// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
9016// void.
9017#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9018 decltype(Tuple2<type##_t, size>().raw)
9019// Tuple tag arg allows overloading (cannot just overload on return type)
9020#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9021 const NativeLaneType<type##_t>*from, Tuple2<type##_t, size>
9022HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
9023#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9024#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9025
9026#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9027 decltype(Tuple3<type##_t, size>().raw)
9028#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9029 const NativeLaneType<type##_t>*from, Tuple3<type##_t, size>
9030HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
9031#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9032#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9033
9034#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9035 decltype(Tuple4<type##_t, size>().raw)
9036#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9037 const NativeLaneType<type##_t>*from, Tuple4<type##_t, size>
9038HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
9039#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9040#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9041
9042#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
9043#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
9044#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
9045
9046} // namespace detail
9047
9048template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9049HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9050 VFromD<D>& v0, VFromD<D>& v1) {
9051 auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
9052 detail::Tuple2<T, d.MaxLanes()>());
9053 v0 = VFromD<D>(raw.val[0]);
9054 v1 = VFromD<D>(raw.val[1]);
9055}
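// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// LoadInterleaved2 (vld2) splits {re, im} pairs into one vector per component,
// e.g. to compute squared magnitudes of packed complex floats. A minimal
// sketch assuming static dispatch and a count that is a multiple of Lanes(d);
// SquaredMagnitudes is a hypothetical name.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void SquaredMagnitudes(const float* HWY_RESTRICT complex_pairs, size_t count,
                       float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    hn::VFromD<decltype(d)> re, im;
    hn::LoadInterleaved2(d, complex_pairs + 2 * i, re, im);
    // re*re + im*im per lane.
    hn::StoreU(hn::MulAdd(re, re, hn::Mul(im, im)), d, out + i);
  }
}
// -----------------------------------------------------------------------------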
9056
9057// <= 32 bits: avoid loading more than N bytes by copying to buffer
9058template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9059HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9060 VFromD<D>& v0, VFromD<D>& v1) {
9061 // The smallest vector registers are 64-bits and we want space for two.
9062 alignas(16) T buf[2 * 8 / sizeof(T)] = {};
9063 CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
9064 auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
9065 detail::Tuple2<T, d.MaxLanes()>());
9066 v0 = VFromD<D>(raw.val[0]);
9067 v1 = VFromD<D>(raw.val[1]);
9068}
9069
9070#if HWY_ARCH_ARM_V7
9071// 64x2: split into two 64x1
9072template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9073HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
9074 Vec128<T>& v1) {
9075 const Half<decltype(d)> dh;
9076 VFromD<decltype(dh)> v00, v10, v01, v11;
9077 LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
9078 LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
9079 v0 = Combine(d, v01, v00);
9080 v1 = Combine(d, v11, v10);
9081}
9082#endif // HWY_ARCH_ARM_V7
9083
9084// ------------------------------ LoadInterleaved3
9085
9086template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9087HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
9088 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
9089 auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
9090 detail::Tuple3<T, d.MaxLanes()>());
9091 v0 = VFromD<D>(raw.val[0]);
9092 v1 = VFromD<D>(raw.val[1]);
9093 v2 = VFromD<D>(raw.val[2]);
9094}
9095
9096// <= 32 bits: avoid loading more than N bytes by copying to buffer
9097template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9098HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
9099 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
9100 // The smallest vector registers are 64-bits and we want space for three.
9101 alignas(16) T buf[3 * 8 / sizeof(T)] = {};
9102 CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
9103 auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
9104 detail::Tuple3<T, d.MaxLanes()>());
9105 v0 = VFromD<D>(raw.val[0]);
9106 v1 = VFromD<D>(raw.val[1]);
9107 v2 = VFromD<D>(raw.val[2]);
9108}
9109
9110#if HWY_ARCH_ARM_V7
9111// 64x2: split into two 64x1
9112template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9113HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
9114 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
9115 const Half<decltype(d)> dh;
9116 VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
9117 LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
9118 LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
9119 v0 = Combine(d, v01, v00);
9120 v1 = Combine(d, v11, v10);
9121 v2 = Combine(d, v21, v20);
9122}
9123#endif // HWY_ARCH_ARM_V7
9124
9125// ------------------------------ LoadInterleaved4
9126
9127template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9128HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9129 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
9130 VFromD<D>& v3) {
9131 auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
9132 detail::Tuple4<T, d.MaxLanes()>());
9133 v0 = VFromD<D>(raw.val[0]);
9134 v1 = VFromD<D>(raw.val[1]);
9135 v2 = VFromD<D>(raw.val[2]);
9136 v3 = VFromD<D>(raw.val[3]);
9137}
9138
9139// <= 32 bits: avoid loading more than N bytes by copying to buffer
9140template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9141HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9142 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
9143 VFromD<D>& v3) {
9144 alignas(16) T buf[4 * 8 / sizeof(T)] = {};
9145 CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
9146 auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
9147 detail::Tuple4<T, d.MaxLanes()>());
9148 v0 = VFromD<D>(raw.val[0]);
9149 v1 = VFromD<D>(raw.val[1]);
9150 v2 = VFromD<D>(raw.val[2]);
9151 v3 = VFromD<D>(raw.val[3]);
9152}
9153
9154#if HWY_ARCH_ARM_V7
9155// 64x2: split into two 64x1
9156template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9157HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9158 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
9159 Vec128<T>& v3) {
9160 const Half<decltype(d)> dh;
9161 VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
9162 LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
9163 v30);
9164 LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
9165 v31);
9166 v0 = Combine(d, v01, v00);
9167 v1 = Combine(d, v11, v10);
9168 v2 = Combine(d, v21, v20);
9169 v3 = Combine(d, v31, v30);
9170}
9171#endif // HWY_ARCH_ARM_V7
9172
9173#undef HWY_IF_LOAD_INT
9174
9175// ------------------------------ StoreInterleaved2
9176
9177namespace detail {
9178#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
9179#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
9180#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
9181
9182#if HWY_ARCH_ARM_A64
9183#define HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
9184#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
9185#else
9186// Exclude 64x2 and f64x1, which are only supported on aarch64
9187#define HWY_IF_STORE_INT(D) \
9188 HWY_IF_V_SIZE_GT_D(D, 4), \
9189 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9190 nullptr
9191#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
9192 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9193 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
9194 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9195 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9196 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
9197#endif // HWY_ARCH_ARM_A64
9198
9199#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
9200 Tuple2<type##_t, size> tup, NativeLaneType<type##_t>*to
9201HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
9202#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
9203
9204#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
9205 Tuple3<type##_t, size> tup, NativeLaneType<type##_t>*to
9206HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
9207#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
9208
9209#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
9210 Tuple4<type##_t, size> tup, NativeLaneType<type##_t>*to
9211HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
9212#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
9213
9214#undef HWY_NEON_DEF_FUNCTION_STORE_INT
9215#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
9216#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
9217#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
9218} // namespace detail
9219
9220template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
9221HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
9222                               T* HWY_RESTRICT unaligned) {
9223 detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
9224 detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned));
9225}
9226
9227// <= 32 bits: avoid writing more than N bytes by copying to buffer
9228template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9229HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
9230                               T* HWY_RESTRICT unaligned) {
9231 alignas(16) T buf[2 * 8 / sizeof(T)];
9232 detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
9233 detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf));
9234 CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
9235}
9236
9237#if HWY_ARCH_ARM_V7
9238// 64x2: split into two 64x1
9239template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9240HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
9241 T* HWY_RESTRICT unaligned) {
9242 const Half<decltype(d)> dh;
9243 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh,
9244 detail::NativeLanePointer(unaligned));
9245 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
9246 detail::NativeLanePointer(unaligned + 2));
9247}
9248#endif // HWY_ARCH_ARM_V7
9249
9250// ------------------------------ StoreInterleaved3
9251
9252template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
9253HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
9254                               T* HWY_RESTRICT unaligned) {
9255 detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
9256 detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
9257}
9258
9259// <= 32 bits: avoid writing more than N bytes by copying to buffer
9260template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9261HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
9262                               T* HWY_RESTRICT unaligned) {
9263 alignas(16) T buf[3 * 8 / sizeof(T)];
9264 detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
9265 detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
9266 CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
9267}
9268
9269#if HWY_ARCH_ARM_V7
9270// 64x2: split into two 64x1
9271template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9272HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
9273 T* HWY_RESTRICT unaligned) {
9274 const Half<decltype(d)> dh;
9275 StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
9276 detail::NativeLanePointer(unaligned));
9277 StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
9278 detail::NativeLanePointer(unaligned + 3));
9279}
9280#endif // HWY_ARCH_ARM_V7
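// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// LoadInterleaved3/StoreInterleaved3 (vld3/vst3) make per-channel processing
// of packed pixels straightforward, e.g. swapping the R and B channels of an
// RGB buffer in place. A minimal sketch assuming static dispatch and a pixel
// count that is a multiple of Lanes(d); SwapRB is a hypothetical name.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void SwapRB(uint8_t* HWY_RESTRICT rgb, size_t num_pixels) {
  const hn::ScalableTag<uint8_t> d;
  for (size_t i = 0; i < num_pixels; i += hn::Lanes(d)) {
    hn::VFromD<decltype(d)> r, g, b;
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    hn::StoreInterleaved3(b, g, r, d, rgb + 3 * i);
  }
}
// -----------------------------------------------------------------------------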
9281
9282// ------------------------------ StoreInterleaved4
9283
9284template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
9285HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
9286                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
9287 detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
9288 detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
9289}
9290
9291// <= 32 bits: avoid writing more than N bytes by copying to buffer
9292template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9293HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
9294                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
9295 alignas(16) T buf[4 * 8 / sizeof(T)];
9296 detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
9297 detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
9298 CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
9299}
9300
9301#if HWY_ARCH_ARM_V7
9302// 64x2: split into two 64x1
9303template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9304HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
9305 Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) {
9306 const Half<decltype(d)> dh;
9307 StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
9308 LowerHalf(dh, v3), dh,
9309 detail::NativeLanePointer(unaligned));
9310 StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
9311 UpperHalf(dh, v3), dh,
9312 detail::NativeLanePointer(unaligned + 4));
9313}
9314#endif // HWY_ARCH_ARM_V7
9315
9316#undef HWY_IF_STORE_INT
9317
9318// ------------------------------ Additional mask logical operations
9319template <class T>
9320HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
9321 return mask;
9322}
9323template <class T>
9324HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
9325 const FixedTag<T, 2> d;
9326 const auto vmask = VecFromMask(d, mask);
9327 return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
9328}
9329template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
9330HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
9331 const Simd<T, N, 0> d;
9332 const auto vmask = VecFromMask(d, mask);
9333 const auto neg_vmask =
9334      ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
9335 return MaskFromVec(Or(vmask, neg_vmask));
9336}
9337template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
9338HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
9339 const Full128<T> d;
9340 const Repartition<int64_t, decltype(d)> di64;
9341
9342 auto vmask = BitCast(di64, VecFromMask(d, mask));
9343 vmask = Or(vmask, Neg(vmask));
9344
9345 // Copy the sign bit of the first int64_t lane to the second int64_t lane
9346 const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask));
9347 return MaskFromVec(BitCast(d, Or(vmask, vmask2)));
9348}
9349
9350template <class T, size_t N>
9351HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
9352 return Not(SetAtOrAfterFirst(mask));
9353}
9354
9355template <class T>
9356HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
9357 return mask;
9358}
9359template <class T>
9360HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
9361 const FixedTag<T, 2> d;
9362 const RebindToSigned<decltype(d)> di;
9363
9364 const auto vmask = BitCast(di, VecFromMask(d, mask));
9365 const auto zero = Zero(di);
9366 const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
9367 return MaskFromVec(BitCast(d, And(vmask, vmask2)));
9368}
9369template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
9370HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
9371 const Simd<T, N, 0> d;
9372 const RebindToSigned<decltype(d)> di;
9373
9374 const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
9375 const auto only_first_vmask =
9376 BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
9377 return MaskFromVec(only_first_vmask);
9378}
9379template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
9380HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
9381 const Full128<T> d;
9382 const RebindToSigned<decltype(d)> di;
9383 const Repartition<int64_t, decltype(d)> di64;
9384
9385 const auto zero = Zero(di64);
9386 const auto vmask = BitCast(di64, VecFromMask(d, mask));
9387 const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
9388 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
9389 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
9390}
9391
9392template <class T>
9393HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
9394 const FixedTag<T, 1> d;
9395 const RebindToSigned<decltype(d)> di;
9396 using TI = MakeSigned<T>;
9397
9398 return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
9399}
9400template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
9401HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
9402 const Simd<T, N, 0> d;
9403 return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
9404}
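// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// Scalar bit models of the four mask scans above, with bit i standing for
// lane i (standalone code, hypothetical names). For bits = 0x68 (lanes 3, 5
// and 6 true): AtOrAfterFirst -> 0xf8, BeforeFirst -> 0x07, OnlyFirst -> 0x08,
// AtOrBeforeFirst -> 0x0f. With no lane true, the "before" variants are
// all-true and the other two all-false, matching the vector versions.
#include <cstdint>

static uint64_t AtOrAfterFirst(uint64_t bits) { return bits | (~bits + 1); }
static uint64_t BeforeFirst(uint64_t bits) { return ~AtOrAfterFirst(bits); }
static uint64_t OnlyFirst(uint64_t bits) { return bits & (~bits + 1); }
static uint64_t AtOrBeforeFirst(uint64_t bits) {
  return BeforeFirst(bits) | OnlyFirst(bits);
}
// -----------------------------------------------------------------------------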
9405
9406// ------------------------------ Lt128
9407
9408template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9409HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
9410 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
9411 // Truth table of Eq and Lt for Hi and Lo u64.
9412 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
9413 // =H =L cH cL | out = cH | (=H & cL)
9414 // 0 0 0 0 | 0
9415 // 0 0 0 1 | 0
9416 // 0 0 1 0 | 1
9417 // 0 0 1 1 | 1
9418 // 0 1 0 0 | 0
9419 // 0 1 0 1 | 0
9420 // 0 1 1 0 | 1
9421 // 1 0 0 0 | 0
9422 // 1 0 0 1 | 1
9423 // 1 1 0 0 | 0
9424 const MFromD<D> eqHL = Eq(a, b);
9425 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
9426 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
9427 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
9428 // comparison result leftwards requires only 4. IfThenElse compiles to the
9429 // same code as OrAnd().
9430 const VFromD<D> ltLx = DupEven(ltHL);
9431 const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL);
9432 return MaskFromVec(DupOdd(outHx));
9433}
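// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// Scalar model of the reduction implemented above: an unsigned 128-bit
// less-than is ltH | (eqH & ltL) on the 64-bit halves, exactly the truth
// table in the comment. Standalone code, hypothetical name.
#include <cstdint>

static bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo,
                        uint64_t b_hi, uint64_t b_lo) {
  const bool lt_hi = a_hi < b_hi;
  const bool eq_hi = a_hi == b_hi;
  const bool lt_lo = a_lo < b_lo;
  return lt_hi || (eq_hi && lt_lo);
}
// -----------------------------------------------------------------------------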
9434
9435template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9436HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
9437 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
9438 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
9439}
9440
9441// ------------------------------ Eq128
9442
9443template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9444HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
9445 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
9446 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
9447 return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
9448}
9449
9450template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9451HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
9452 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
9453 return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
9454}
9455
9456// ------------------------------ Ne128
9457
9458template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9459HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
9460 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
9461 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
9462 return MaskFromVec(Or(Reverse2(d, neHL), neHL));
9463}
9464
9465template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
9466HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
9467 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
9468 return MaskFromVec(InterleaveUpper(d, neHL, neHL));
9469}
9470
9471// ------------------------------ Min128, Max128 (Lt128)
9472
9473// Without a native OddEven, it seems infeasible to go faster than Lt128.
9474template <class D>
9475HWY_INLINE VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
9476 return IfThenElse(Lt128(d, a, b), a, b);
9477}
9478
9479template <class D>
9480HWY_INLINE VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
9481 return IfThenElse(Lt128(d, b, a), a, b);
9482}
9483
9484template <class D>
9485HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
9486 return IfThenElse(Lt128Upper(d, a, b), a, b);
9487}
9488
9489template <class D>
9490HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
9491 return IfThenElse(Lt128Upper(d, b, a), a, b);
9492}
9493
9494// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
9495
9496#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
9497#undef HWY_NATIVE_LEADING_ZERO_COUNT
9498#else
9499#define HWY_NATIVE_LEADING_ZERO_COUNT
9500#endif
9501
9502HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1)
9503HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1)
9504
9505template <class V, HWY_IF_UI64_D(DFromV<V>)>
9506HWY_API V LeadingZeroCount(V v) {
9507 const DFromV<decltype(v)> d;
9508 const RebindToUnsigned<decltype(d)> du;
9509 const Repartition<uint32_t, decltype(d)> du32;
9510
9511 const auto v_k32 = BitCast(du32, Set(du, 32));
9512 const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32;
9513 const auto v_u32_lo_lzcnt =
9514 And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu)));
9515 const auto v_u32_hi_lzcnt =
9516 BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt)));
9517
9518 return BitCast(
9519 d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt));
9520}
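// --- Editorial sketch (illustration only, not part of arm_neon-inl.h) -------
// Scalar model of the 64-bit count above: vclz has no 64-bit form, so the u64
// count is clz32(hi) unless the high half is zero, in which case it is
// 32 + clz32(lo). The vector code biases the low half by 32 up front and then
// selects between the halves. Standalone code, hypothetical names.
#include <cstdint>

static uint32_t Clz32(uint32_t x) {  // vclz semantics: Clz32(0) == 32
  uint32_t n = 0;
  while (n < 32 && ((x >> (31 - n)) & 1u) == 0) ++n;
  return n;
}

static uint32_t Clz64(uint64_t x) {
  const uint32_t hi = static_cast<uint32_t>(x >> 32);
  const uint32_t lo = static_cast<uint32_t>(x);
  return (hi == 0) ? 32 + Clz32(lo) : Clz32(hi);
}
// -----------------------------------------------------------------------------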
9521
9522template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
9523HWY_API V HighestSetBitIndex(V v) {
9524 const DFromV<decltype(v)> d;
9525 using T = TFromD<decltype(d)>;
9526 return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
9527}
9528
9529template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, 1)>
9530HWY_API V TrailingZeroCount(V v) {
9531 return LeadingZeroCount(ReverseBits(v));
9532}
9533
9534template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
9535 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
9536HWY_API V TrailingZeroCount(V v) {
9537 const DFromV<decltype(v)> d;
9538 const Repartition<uint8_t, decltype(d)> du8;
9539 return LeadingZeroCount(
9541}
9542
9543namespace detail { // for code folding
9544#if HWY_ARCH_ARM_V7
9545#undef vuzp1_s8
9546#undef vuzp1_u8
9547#undef vuzp1_s16
9548#undef vuzp1_u16
9549#undef vuzp1_s32
9550#undef vuzp1_u32
9551#undef vuzp1_f32
9552#undef vuzp1q_s8
9553#undef vuzp1q_u8
9554#undef vuzp1q_s16
9555#undef vuzp1q_u16
9556#undef vuzp1q_s32
9557#undef vuzp1q_u32
9558#undef vuzp1q_f32
9559#undef vuzp2_s8
9560#undef vuzp2_u8
9561#undef vuzp2_s16
9562#undef vuzp2_u16
9563#undef vuzp2_s32
9564#undef vuzp2_u32
9565#undef vuzp2_f32
9566#undef vuzp2q_s8
9567#undef vuzp2q_u8
9568#undef vuzp2q_s16
9569#undef vuzp2q_u16
9570#undef vuzp2q_s32
9571#undef vuzp2q_u32
9572#undef vuzp2q_f32
9573#undef vzip1_s8
9574#undef vzip1_u8
9575#undef vzip1_s16
9576#undef vzip1_u16
9577#undef vzip1_s32
9578#undef vzip1_u32
9579#undef vzip1_f32
9580#undef vzip1q_s8
9581#undef vzip1q_u8
9582#undef vzip1q_s16
9583#undef vzip1q_u16
9584#undef vzip1q_s32
9585#undef vzip1q_u32
9586#undef vzip1q_f32
9587#undef vzip2_s8
9588#undef vzip2_u8
9589#undef vzip2_s16
9590#undef vzip2_u16
9591#undef vzip2_s32
9592#undef vzip2_u32
9593#undef vzip2_f32
9594#undef vzip2q_s8
9595#undef vzip2q_u8
9596#undef vzip2q_s16
9597#undef vzip2q_u16
9598#undef vzip2q_s32
9599#undef vzip2q_u32
9600#undef vzip2q_f32
9601#endif
9602
9603#undef HWY_NEON_BUILD_ARG_1
9604#undef HWY_NEON_BUILD_ARG_2
9605#undef HWY_NEON_BUILD_ARG_3
9606#undef HWY_NEON_BUILD_PARAM_1
9607#undef HWY_NEON_BUILD_PARAM_2
9608#undef HWY_NEON_BUILD_PARAM_3
9609#undef HWY_NEON_BUILD_RET_1
9610#undef HWY_NEON_BUILD_RET_2
9611#undef HWY_NEON_BUILD_RET_3
9612#undef HWY_NEON_BUILD_TPL_1
9613#undef HWY_NEON_BUILD_TPL_2
9614#undef HWY_NEON_BUILD_TPL_3
9615#undef HWY_NEON_DEF_FUNCTION
9616#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
9617#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
9618#undef HWY_NEON_DEF_FUNCTION_BFLOAT_16
9619#undef HWY_NEON_DEF_FUNCTION_FLOAT_16
9620#undef HWY_NEON_DEF_FUNCTION_FLOAT_16_32
9621#undef HWY_NEON_DEF_FUNCTION_FLOAT_32
9622#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
9623#undef HWY_NEON_DEF_FUNCTION_FULL_UI
9624#undef HWY_NEON_DEF_FUNCTION_FULL_UI_64
9625#undef HWY_NEON_DEF_FUNCTION_FULL_UIF_64
9626#undef HWY_NEON_DEF_FUNCTION_INT_16
9627#undef HWY_NEON_DEF_FUNCTION_INT_32
9628#undef HWY_NEON_DEF_FUNCTION_INT_64
9629#undef HWY_NEON_DEF_FUNCTION_INT_8
9630#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
9631#undef HWY_NEON_DEF_FUNCTION_INTS
9632#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
9633#undef HWY_NEON_DEF_FUNCTION_UI_8_16_32
9634#undef HWY_NEON_DEF_FUNCTION_UIF_64
9635#undef HWY_NEON_DEF_FUNCTION_UIF_8_16_32
9636#undef HWY_NEON_DEF_FUNCTION_UINT_16
9637#undef HWY_NEON_DEF_FUNCTION_UINT_32
9638#undef HWY_NEON_DEF_FUNCTION_UINT_64
9639#undef HWY_NEON_DEF_FUNCTION_UINT_8
9640#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
9641#undef HWY_NEON_DEF_FUNCTION_UINTS
9642#undef HWY_NEON_EVAL
9643#undef HWY_NEON_IF_EMULATED_D
9644} // namespace detail
9645
9646// NOLINTNEXTLINE(google-readability-namespace-comments)
9647} // namespace HWY_NAMESPACE
9648} // namespace hwy
HWY_AFTER_NAMESPACE()
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
Simd< T, 8/sizeof(T), 0 > Full64
Definition ops/shared-inl.h:417
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition ops/shared-inl.h:407
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API V ReverseBits(V v)
Definition generic_ops-inl.h:6464
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
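
As a usage illustration for the mask and compression entries above: a minimal stream-compaction sketch, assuming hwy/highway.h, a count that is a multiple of the vector length, and a vector-aligned input. KeepBelow is a hypothetical helper name, not part of this header.

#include <cstddef>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copies the lanes of `in` that are < limit to `out`; returns how many were
// written. Assumes count % Lanes(d) == 0 and `in` aligned (else use LoadU).
HWY_ATTR size_t KeepBelow(const float* HWY_RESTRICT in, float limit,
                          float* HWY_RESTRICT out, size_t count) {
  const hn::FixedTag<float, 4> d;  // one 128-bit NEON vector = 4 floats
  size_t written = 0;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, in + i);
    const auto keep = v < hn::Set(d, limit);  // per-lane mask
    // Stores only the selected lanes; does not touch memory past them.
    written += hn::CompressBlendedStore(v, keep, d, out + written);
  }
  return written;
}
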
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
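
The entries above cover the per-target ops in hwy::HWY_NAMESPACE. A minimal usage sketch of the basic load/arithmetic/store ops, assuming hwy/highway.h and a count that is a multiple of the vector length; AddArrays is a hypothetical helper, not part of this header.

#include <cstddef>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// out[i] = a[i] + b[i]; assumes count % Lanes(d) == 0 and aligned inputs.
HWY_ATTR void AddArrays(const float* HWY_RESTRICT a,
                        const float* HWY_RESTRICT b,
                        float* HWY_RESTRICT out, size_t count) {
  const hn::FixedTag<float, 4> d;  // 128-bit vector of 4 float lanes
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto sum = hn::Add(hn::Load(d, a + i), hn::Load(d, b + i));
    hn::StoreU(sum, d, out + i);  // unaligned store
  }
}
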
Definition abort.h:8
double float64_t
Definition base.h:406
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2551
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd< float >()
Definition base.h:2320
float float32_t
Definition base.h:405
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2588
HWY_API size_t PopCount(T x)
Definition base.h:2615
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
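
The base.h entries above are scalar helpers in namespace hwy that do not require any vector target. A brief sketch of how they are typically called; Demo64 is a hypothetical name.

#include <cstddef>
#include <cstdint>
#include "hwy/base.h"

uint64_t Demo64(uint64_t x, uint64_t y) {
  const size_t ones = hwy::PopCount(x);                         // set bits
  const size_t floor_log2 = hwy::FloorLog2(y | 1);              // integer log2
  const size_t tz = hwy::Num0BitsBelowLS1Bit_Nonzero64(x | 1);  // trailing zeros
  uint64_t upper;                                               // high 64 bits
  const uint64_t lower = hwy::Mul128(x, y, &upper);             // 64x64 -> 128
  return lower ^ upper ^ ones ^ floor_log2 ^ tz;
}
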
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_I64_D(D)
Definition ops/shared-inl.h:585
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_NOT_SPECIAL_FLOAT_D(D)
Definition ops/shared-inl.h:542
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_UI64_D(D)
Definition ops/shared-inl.h:592
#define HWY_IF_LANES_GT_D(D, lanes)
Definition ops/shared-inl.h:562
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_V_SIZE_V(V, bytes)
Definition ops/shared-inl.h:632
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_NOT_FLOAT_D(D)
Definition ops/shared-inl.h:536
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:555
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_HAVE_FLOAT64
Definition set_macros-inl.h:174
#define HWY_HAVE_FLOAT16
Definition set_macros-inl.h:173
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
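
The HWY_IF_* macros listed above are SFINAE guards that restrict an overload to particular lane types or vector sizes, which is how this header selects between implementations. A minimal sketch of that pattern, reopening the per-target namespace the way the library's own per-target sources do; DemoOp is a hypothetical op name, not part of Highway.

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Participates in overload resolution only for vectors of unsigned lanes.
template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_API V DemoOp(V v) {
  return v;
}

// Participates only for signed lanes, which Neg supports.
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V DemoOp(V v) {
  return Neg(v);
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
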
Definition arm_neon-inl.h:8428
@ value
Definition arm_neon-inl.h:8429
Definition arm_neon-inl.h:5654
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
Definition ops/shared-inl.h:198
constexpr size_t MaxBytes() const
Definition ops/shared-inl.h:250
float32x4_t type
Definition arm_neon-inl.h:690
float32x2_t type
Definition arm_neon-inl.h:694
int16x8_t type
Definition arm_neon-inl.h:663
int16x4_t type
Definition arm_neon-inl.h:667
int32x4_t type
Definition arm_neon-inl.h:672
int32x2_t type
Definition arm_neon-inl.h:676
int64x1_t type
Definition arm_neon-inl.h:685
int64x2_t type
Definition arm_neon-inl.h:681
int8x16_t type
Definition arm_neon-inl.h:654
int8x8_t type
Definition arm_neon-inl.h:658
uint16x8_t type
Definition arm_neon-inl.h:627
uint16x4_t type
Definition arm_neon-inl.h:631
uint32x4_t type
Definition arm_neon-inl.h:636
uint32x2_t type
Definition arm_neon-inl.h:640
uint64x1_t type
Definition arm_neon-inl.h:649
uint64x2_t type
Definition arm_neon-inl.h:645
uint8x16_t type
Definition arm_neon-inl.h:618
uint8x8_t type
Definition arm_neon-inl.h:622
Definition x86_128-inl.h:67
__v128_u type
Definition wasm_128-inl.h:60
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:5213
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:5220
Definition arm_neon-inl.h:5191
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:5201
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition arm_neon-inl.h:5194
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:5243
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:5250
Definition arm_neon-inl.h:5226
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition arm_neon-inl.h:5228
float32x4x2_t raw
Definition arm_neon-inl.h:428
float32x2x2_t raw
Definition arm_neon-inl.h:432
int16x8x2_t raw
Definition arm_neon-inl.h:387
int16x4x2_t raw
Definition arm_neon-inl.h:391
int32x4x2_t raw
Definition arm_neon-inl.h:403
int32x2x2_t raw
Definition arm_neon-inl.h:407
int64x2x2_t raw
Definition arm_neon-inl.h:419
int64x1x2_t raw
Definition arm_neon-inl.h:423
int8x16x2_t raw
Definition arm_neon-inl.h:371
int8x8x2_t raw
Definition arm_neon-inl.h:375
uint16x8x2_t raw
Definition arm_neon-inl.h:379
uint16x4x2_t raw
Definition arm_neon-inl.h:383
uint32x4x2_t raw
Definition arm_neon-inl.h:395
uint32x2x2_t raw
Definition arm_neon-inl.h:399
uint64x2x2_t raw
Definition arm_neon-inl.h:411
uint64x1x2_t raw
Definition arm_neon-inl.h:415
uint8x16x2_t raw
Definition arm_neon-inl.h:363
uint8x8x2_t raw
Definition arm_neon-inl.h:367
Definition arm_neon-inl.h:355
float32x4x3_t raw
Definition arm_neon-inl.h:512
float32x2x3_t raw
Definition arm_neon-inl.h:516
int16x8x3_t raw
Definition arm_neon-inl.h:471
int16x4x3_t raw
Definition arm_neon-inl.h:475
int32x4x3_t raw
Definition arm_neon-inl.h:487
int32x2x3_t raw
Definition arm_neon-inl.h:491
int64x2x3_t raw
Definition arm_neon-inl.h:503
int64x1x3_t raw
Definition arm_neon-inl.h:507
int8x16x3_t raw
Definition arm_neon-inl.h:455
int8x8x3_t raw
Definition arm_neon-inl.h:459
uint16x8x3_t raw
Definition arm_neon-inl.h:463
uint16x4x3_t raw
Definition arm_neon-inl.h:467
uint32x4x3_t raw
Definition arm_neon-inl.h:479
uint32x2x3_t raw
Definition arm_neon-inl.h:483
uint64x2x3_t raw
Definition arm_neon-inl.h:495
uint64x1x3_t raw
Definition arm_neon-inl.h:499
uint8x16x3_t raw
Definition arm_neon-inl.h:447
uint8x8x3_t raw
Definition arm_neon-inl.h:451
Definition arm_neon-inl.h:357
float32x4x4_t raw
Definition arm_neon-inl.h:596
float32x2x4_t raw
Definition arm_neon-inl.h:600
int16x8x4_t raw
Definition arm_neon-inl.h:555
int16x4x4_t raw
Definition arm_neon-inl.h:559
int32x4x4_t raw
Definition arm_neon-inl.h:571
int32x2x4_t raw
Definition arm_neon-inl.h:575
int64x2x4_t raw
Definition arm_neon-inl.h:587
int64x1x4_t raw
Definition arm_neon-inl.h:591
int8x16x4_t raw
Definition arm_neon-inl.h:539
int8x8x4_t raw
Definition arm_neon-inl.h:543
uint16x8x4_t raw
Definition arm_neon-inl.h:547
uint16x4x4_t raw
Definition arm_neon-inl.h:551
uint32x4x4_t raw
Definition arm_neon-inl.h:563
uint32x2x4_t raw
Definition arm_neon-inl.h:567
uint64x2x4_t raw
Definition arm_neon-inl.h:579
uint64x1x4_t raw
Definition arm_neon-inl.h:583
uint8x16x4_t raw
Definition arm_neon-inl.h:531
uint8x8x4_t raw
Definition arm_neon-inl.h:535
Definition arm_neon-inl.h:359
T vals[8/sizeof(T)]
Definition arm_neon-inl.h:975
Definition base.h:694
Definition base.h:1594
Definition base.h:1117
int VFromD
Definition tuple-inl.h:25
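
The uint8x16x3_t and related raw tuple members listed above back LoadInterleaved2/3/4 and StoreInterleaved2/3/4 on NEON (for full uint8_t vectors, LoadInterleaved3 maps onto vld3q_u8). A minimal de-interleaving sketch, assuming hwy/highway.h and a pixel count that is a multiple of the vector length; SplitRGB is a hypothetical helper.

#include <cstddef>
#include <cstdint>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Splits packed RGB bytes into three planes; assumes pixels % Lanes(d) == 0.
HWY_ATTR void SplitRGB(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT r,
                       uint8_t* HWY_RESTRICT g, uint8_t* HWY_RESTRICT b,
                       size_t pixels) {
  using D8 = hn::FixedTag<uint8_t, 16>;  // full 128-bit vector: 16 bytes
  const D8 d;
  for (size_t i = 0; i < pixels; i += hn::Lanes(d)) {
    hn::VFromD<D8> vr, vg, vb;
    hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);  // R/G/B into 3 vectors
    hn::StoreU(vr, d, r + i);
    hn::StoreU(vg, d, g + i);
    hn::StoreU(vb, d, b + i);
  }
}
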