arm_sve-inl.h
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Arm SVE[2] vectors (length not known at compile time).
// External include guard in highway.h - see comment there.
#include <arm_sve.h>

#include "hwy/ops/shared-inl.h"

// Arm C215 declares that SVE vector lengths will always be a power of two.
// We default to relying on this, which makes some operations more efficient.
// You can still opt into fixups by setting this to 0 (unsupported).
#ifndef HWY_SVE_IS_POW2
#define HWY_SVE_IS_POW2 1
#endif

#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_HAVE_2 1
#else
#define HWY_SVE_HAVE_2 0
#endif

// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE && defined(__ARM_FEATURE_SVE_BF16)
#define HWY_SVE_HAVE_BF16_FEATURE 1
#else
#define HWY_SVE_HAVE_BF16_FEATURE 0
#endif

// HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
// is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
#if HWY_SVE_HAVE_BF16_FEATURE ||                                       \
    (HWY_COMPILER_CLANG >= 1200 && defined(__ARM_FEATURE_SVE_BF16)) || \
    HWY_COMPILER_GCC_ACTUAL >= 1000
#define HWY_SVE_HAVE_BF16_VEC 1
#else
#define HWY_SVE_HAVE_BF16_VEC 0
#endif

// HWY_SVE_HAVE_F32_TO_BF16C is defined to 1 if the SVE svcvt_bf16_f32_x and
// svcvtnt_bf16_f32_x intrinsics are available, even if the __bf16 type is
// disabled.
#if HWY_SVE_HAVE_BF16_VEC && defined(__ARM_FEATURE_SVE_BF16)
#define HWY_SVE_HAVE_F32_TO_BF16C 1
#else
#define HWY_SVE_HAVE_F32_TO_BF16C 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// Args: BASE, CHAR, BITS, HALF, NAME, OP

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Float:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

#define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP) \
  X_MACRO(bfloat, bf, 16, 16, NAME, OP)

#if HWY_SVE_HAVE_BF16_FEATURE
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
// We have both f16 and bf16, so nothing is emulated.

// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
// hwy::EnableIf<false>* = nullptr because the former depends on the template
// argument D: !hwy::IsSame<D, D>() is always false, but the dependency makes
// overload resolution fail via SFINAE instead of a hard compile error.
#define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
#define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
#else
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
#define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
#endif  // HWY_SVE_HAVE_BF16_FEATURE

// For all element sizes:
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)         \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// HWY_SVE_FOREACH_F does not include HWY_SVE_FOREACH_BF16 because SVE lacks
// bf16 overloads for some intrinsics (especially less-common arithmetic).
// However, this does include f16 because SVE supports it unconditionally.
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)

// Commonly used type categories for a given element size:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
#define HWY_SVE_TUPLE(BASE, BITS, MUL) sv##BASE##BITS##x##MUL##_t

}  // namespace detail

#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <>                                            \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
  };

HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
#endif
#undef HWY_SPECIALIZE

// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
// instructions, and we anyway only use it when the predicate is ptrue.

// vector = f(vector), e.g. Not
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
  }
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v);                            \
  }

// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }
// Same, but governed by an all-true mask (the ptrue predicate).
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
// User-specified mask. Mask=false value is undefined and must be set by caller
// because SVE instructions take it from one of the two inputs, whereas
// AVX-512, RVV and Highway allow a third argument.
#define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP)               \
  HWY_API HWY_SVE_V(BASE, BITS)                                             \
      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(m, a, b);                              \
  }

#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
           HWY_SVE_V(BASE, BITS) c) {                         \
    return sv##OP##_##CHAR##BITS(a, b, c);                    \
  }

// ------------------------------ Lanes

namespace detail {

// Returns actual lanes of a hardware vector without rounding to a power of
// two.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntb_pat(SV_ALL);
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcnth_pat(SV_ALL);
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntw_pat(SV_ALL);
}
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE size_t AllHardwareLanes() {
  return svcntd_pat(SV_ALL);
}

// All-true mask from a macro

#if HWY_SVE_IS_POW2
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_b##BITS()
#define HWY_SVE_PTRUE(BITS) svptrue_b##BITS()
#else
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
#endif  // HWY_SVE_IS_POW2

}  // namespace detail

#if HWY_HAVE_SCALABLE

// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
  const size_t actual = detail::AllHardwareLanes<T>();
  constexpr size_t kMaxLanes = MaxLanes(d);
  constexpr int kClampedPow2 = HWY_MIN(kPow2, 0);
  // Common case of full vectors: avoid any extra instructions.
  if (detail::IsFull(d)) return actual;
  return HWY_MIN(detail::ScaleByPower(actual, kClampedPow2), kMaxLanes);
}

#endif  // HWY_HAVE_SCALABLE

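// Usage sketch (editor's illustration, not part of the original header):
// because the SVE register size is only known at run time, Lanes() must be
// queried instead of assumed.
//   const ScalableTag<float> d;       // full vector of f32
//   const size_t n = Lanes(d);        // e.g. 4 on a 128-bit implementation
//   const ScalableTag<float, -1> dh;  // kPow2 = -1: half-length vector
//   // Lanes(dh) == n / 2 via detail::ScaleByPower(actual, -1).
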
// ================================================== MASK INIT

// One mask bit per byte; only the one belonging to the lowest byte is valid.

// ------------------------------ FirstN
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                               \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) {     \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit));  \
  }
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_FIRSTN, FirstN, whilelt)
#endif

template <class D, HWY_SVE_IF_EMULATED_D(D)>
svbool_t FirstN(D /* tag */, size_t count) {
  return FirstN(RebindToUnsigned<D>(), count);
}

#undef HWY_SVE_FIRSTN

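// Usage sketch (editor's illustration, not part of the original header):
//   const ScalableTag<int32_t> d;
//   const svbool_t m = FirstN(d, 3);  // exactly the first 3 lanes active,
//                                     // via svwhilelt_b32_u32(0, 3).
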
template <class D>
using MFromD = svbool_t;

namespace detail {

#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {      \
    return HWY_SVE_PTRUE(BITS);                                         \
  }                                                                     \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_ALL_PTRUE(BITS);                                     \
  }

HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
HWY_SVE_FOREACH_BF16(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
#undef HWY_SVE_WRAP_PTRUE

HWY_API svbool_t PFalse() { return svpfalse_b(); }

// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic) can ignore d and use PTrue instead.
template <class D>
svbool_t MakeMask(D d) {
  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
}

}  // namespace detail

#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

template <class D>
HWY_API svbool_t MaskFalse(const D /*d*/) {
  return detail::PFalse();
}

// ================================================== INIT

// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) arg) {             \
    return sv##OP##_##CHAR##BITS(arg);                                        \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
#if HWY_SVE_HAVE_BF16_FEATURE  // for if-elif chain
HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, dup_n)
#elif HWY_SVE_HAVE_BF16_VEC
// Required for Zero and VFromD
template <class D, HWY_IF_BF16_D(D)>
HWY_API svbfloat16_t Set(D d, bfloat16_t arg) {
  return svreinterpret_bf16_u16(
      Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg)));
}
#else  // neither bf16 feature nor vector: emulate with u16
// Required for Zero and VFromD
template <class D, HWY_IF_BF16_D(D)>
HWY_API svuint16_t Set(D d, bfloat16_t arg) {
  const RebindToUnsigned<decltype(d)> du;
  return Set(du, BitCastScalar<uint16_t>(arg));
}
#endif  // HWY_SVE_HAVE_BF16_FEATURE
#undef HWY_SVE_SET

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

using VBF16 = VFromD<ScalableTag<bfloat16_t>>;

// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}

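// Usage sketch (editor's illustration, not part of the original header):
//   const ScalableTag<uint16_t> d;
//   const VFromD<decltype(d)> v0 = Zero(d);    // svuint16_t of all-zero
//   const auto v1 = Set(d, uint16_t{0x7FFF});  // broadcast via svdup_n_u16
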
// ------------------------------ BitCast

namespace detail {

// u8: no change
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) {  \
    return v;                                                             \
  }                                                                       \
  template <size_t N, int kPow2>                                          \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte(                          \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return v;                                                             \
  }

// All other types
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)                        \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) {               \
    return sv##OP##_u8_##CHAR##BITS(v);                                       \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                            \
      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v);                                     \
  }

// U08 is special-cased, hence do not use FOREACH.
HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)

#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CAST, _, reinterpret)
#else  // !(HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC)
template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
HWY_INLINE svuint8_t BitCastToByte(V v) {
  const RebindToUnsigned<DFromV<V>> du;
  return BitCastToByte(BitCast(du, v));
}

template <class D, HWY_SVE_IF_EMULATED_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCastFromByte(du, v);
}
#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC

#undef HWY_SVE_CAST_NOP
#undef HWY_SVE_CAST

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Undefined

#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                            \
  HWY_API HWY_SVE_V(BASE, BITS)                             \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
    return sv##OP##_##CHAR##BITS();                         \
  }

HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_UNDEFINED, Undefined, undef)
#endif

template <class D, HWY_SVE_IF_EMULATED_D(D)>
HWY_API VFromD<D> Undefined(D d) {
  const RebindToUnsigned<D> du;
  return BitCast(d, Undefined(du));
}

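// Usage sketch (editor's illustration, not part of the original header):
// BitCast is free on SVE; it round-trips through svuint8_t reinterprets.
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du;             // u32 tag
//   const svuint32_t bits = BitCast(du, Set(df, 1.0f));  // 0x3F800000 lanes
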
// ------------------------------ Tuple

// tuples = f(d, v..), e.g. Create2
#define HWY_SVE_CREATE(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 2)                                   \
      NAME##2(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,                   \
              HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1) {      \
    return sv##OP##2_##CHAR##BITS(v0, v1);                               \
  }                                                                      \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 3) NAME##3(                          \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v0, \
      HWY_SVE_V(BASE, BITS) v1, HWY_SVE_V(BASE, BITS) v2) {              \
    return sv##OP##3_##CHAR##BITS(v0, v1, v2);                           \
  }                                                                      \
  template <size_t N, int kPow2>                                         \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 4)                                   \
      NAME##4(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,                   \
              HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,        \
              HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3) {      \
    return sv##OP##4_##CHAR##BITS(v0, v1, v2, v3);                       \
  }

HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create)
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CREATE, Create, create)
#endif
#undef HWY_SVE_CREATE

template <class D>
using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D())));
template <class D>
using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D())));
template <class D>
using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));

#define HWY_SVE_GET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t kIndex>                                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple) { \
    return sv##OP##2_##CHAR##BITS(tuple, kIndex);                             \
  }                                                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple) { \
    return sv##OP##3_##CHAR##BITS(tuple, kIndex);                             \
  }                                                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple) { \
    return sv##OP##4_##CHAR##BITS(tuple, kIndex);                             \
  }

HWY_SVE_FOREACH(HWY_SVE_GET, Get, get)
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_GET, Get, get)
#endif
#undef HWY_SVE_GET

#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                          \
  template <size_t kIndex>                                                     \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 2)                                         \
      NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(BASE, BITS) vec) { \
    return sv##OP##2_##CHAR##BITS(tuple, kIndex, vec);                         \
  }                                                                            \
  template <size_t kIndex>                                                     \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 3)                                         \
      NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple, HWY_SVE_V(BASE, BITS) vec) { \
    return sv##OP##3_##CHAR##BITS(tuple, kIndex, vec);                         \
  }                                                                            \
  template <size_t kIndex>                                                     \
  HWY_API HWY_SVE_TUPLE(BASE, BITS, 4)                                         \
      NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple, HWY_SVE_V(BASE, BITS) vec) { \
    return sv##OP##4_##CHAR##BITS(tuple, kIndex, vec);                         \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, set)
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_SET, Set, set)
#endif
#undef HWY_SVE_SET

// ------------------------------ ResizeBitCast

// Same as BitCast on SVE
template <class D, class FromV>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_I8_D(D)>
HWY_API svint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                     TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                     TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                     TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                     TFromD<D> t11, TFromD<D> t12,
                                     TFromD<D> t13, TFromD<D> t14,
                                     TFromD<D> t15) {
  return svdupq_n_s8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12,
                     t13, t14, t15);
}

template <class D, HWY_IF_U8_D(D)>
HWY_API svuint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9,
                                      TFromD<D> t10, TFromD<D> t11,
                                      TFromD<D> t12, TFromD<D> t13,
                                      TFromD<D> t14, TFromD<D> t15) {
  return svdupq_n_u8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12,
                     t13, t14, t15);
}

template <class D, HWY_IF_I16_D(D)>
HWY_API svint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return svdupq_n_s16(t0, t1, t2, t3, t4, t5, t6, t7);
}

template <class D, HWY_IF_U16_D(D)>
HWY_API svuint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                       TFromD<D> t2, TFromD<D> t3,
                                       TFromD<D> t4, TFromD<D> t5,
                                       TFromD<D> t6, TFromD<D> t7) {
  return svdupq_n_u16(t0, t1, t2, t3, t4, t5, t6, t7);
}

template <class D, HWY_IF_F16_D(D)>
HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                        TFromD<D> t2, TFromD<D> t3,
                                        TFromD<D> t4, TFromD<D> t5,
                                        TFromD<D> t6, TFromD<D> t7) {
  return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
#if HWY_SVE_HAVE_BF16_FEATURE
  (void)d;
  return svdupq_n_bf16(t0, t1, t2, t3, t4, t5, t6, t7);
#else
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Dup128VecFromValues(
             du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
             BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
             BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
             BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
#endif
}

template <class D, HWY_IF_I32_D(D)>
HWY_API svint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return svdupq_n_s32(t0, t1, t2, t3);
}

template <class D, HWY_IF_U32_D(D)>
HWY_API svuint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                       TFromD<D> t2, TFromD<D> t3) {
  return svdupq_n_u32(t0, t1, t2, t3);
}

template <class D, HWY_IF_F32_D(D)>
HWY_API svfloat32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                        TFromD<D> t2, TFromD<D> t3) {
  return svdupq_n_f32(t0, t1, t2, t3);
}

template <class D, HWY_IF_I64_D(D)>
HWY_API svint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return svdupq_n_s64(t0, t1);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API svuint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return svdupq_n_u64(t0, t1);
}

template <class D, HWY_IF_F64_D(D)>
HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return svdupq_n_f64(t0, t1);
}

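// Usage sketch (editor's illustration, not part of the original header):
// each 128-bit block of the scalable vector receives the same values.
//   const ScalableTag<int32_t> d;
//   const svint32_t v = Dup128VecFromValues(d, 0, 1, 2, 3);
//   // On a 256-bit machine, lanes are 0,1,2,3,0,1,2,3.
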
// ================================================== LOGICAL

// detail::*N() functions accept a scalar argument to avoid extra Set().

// ------------------------------ Not
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not )  // NOLINT

// ------------------------------ And

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, OrN, orr_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Xor

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ AndNot

namespace detail {
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n)
#undef HWY_SVE_RETV_ARGPVN_SWAP
}  // namespace detail

#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic)
#undef HWY_SVE_RETV_ARGPVV_SWAP

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V AndNot(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
}

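// Usage sketch (editor's illustration, not part of the original header):
// float logical ops are routed through the unsigned domain as shown above.
//   const ScalableTag<float> df;
//   const svfloat32_t s = And(Set(df, -0.0f), Set(df, -2.5f));  // = -0.0f
//   // Only the sign bit (0x80000000) survives the bitwise AND.
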
// ------------------------------ Xor3

#if HWY_SVE_HAVE_2

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVVV, Xor3, eor3)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor3(const V x1, const V x2, const V x3) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
}

#else
template <class V>
HWY_API V Xor3(V x1, V x2, V x3) {
  return Xor(x1, Xor(x2, x3));
}
#endif

// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Need to return original type instead of unsigned.
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {         \
    return BitCast(DFromV<decltype(v)>(),                               \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v));  \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt)
#undef HWY_SVE_POPCNT

// ================================================== SIGN

// ------------------------------ Neg
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)

HWY_API VBF16 Neg(VBF16 v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
}

// ------------------------------ SaturatedNeg
#if HWY_SVE_HAVE_2
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
#endif  // HWY_SVE_HAVE_2

// ------------------------------ Abs
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)

// ------------------------------ SaturatedAbs
#if HWY_SVE_HAVE_2
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif

HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
#endif  // HWY_SVE_HAVE_2

// ================================================== ARITHMETIC

// Per-target flags to prevent generic_ops-inl.h defining Add etc.
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

// ------------------------------ Add

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)

// ------------------------------ Sub

namespace detail {
// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)           \
  HWY_API HWY_SVE_V(BASE, BITS)                                              \
      NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b);                              \
  }

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
#undef HWY_SVE_RETV_ARGPVN_MASK
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)

// ------------------------------ SumsOf8
HWY_API svuint64_t SumsOf8(const svuint8_t v) {
  const ScalableTag<uint32_t> du32;
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);

  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  // Compute pairwise sum of u32 and extend to u64.

#if HWY_SVE_HAVE_2
  return svadalp_u64_x(pg, Zero(du64), sums_of_4);
#else
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
  return Add(hi, lo);
#endif
}

HWY_API svint64_t SumsOf8(const svint8_t v) {
  const ScalableTag<int32_t> di32;
  const ScalableTag<int64_t> di64;
  const svbool_t pg = detail::PTrue(di64);

  const svint32_t sums_of_4 = svdot_n_s32(Zero(di32), v, 1);
#if HWY_SVE_HAVE_2
  return svadalp_s64_x(pg, Zero(di64), sums_of_4);
#else
  const svint64_t hi = svasr_n_s64_x(pg, BitCast(di64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and sign-extended)
  const svint64_t lo = svextw_s64_x(pg, BitCast(di64, sums_of_4));
  return Add(hi, lo);
#endif
}

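// Worked example (editor's illustration, not part of the original header):
// for u8 lanes 1..8 in the first 8 bytes, svdot with an all-ones operand
// yields the u32 sums 1+2+3+4 = 10 and 5+6+7+8 = 26; the pairwise u32 -> u64
// step then produces 36 in the first u64 lane.
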
// ------------------------------ SumsOf2
#if HWY_SVE_HAVE_2
namespace detail {

HWY_INLINE svint16_t SumsOf2(hwy::SignedTag /*type_tag*/,
                             hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
  const ScalableTag<int16_t> di16;
  const svbool_t pg = detail::PTrue(di16);
  return svadalp_s16_x(pg, Zero(di16), v);
}

HWY_INLINE svuint16_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
                              hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
  const ScalableTag<uint16_t> du16;
  const svbool_t pg = detail::PTrue(du16);
  return svadalp_u16_x(pg, Zero(du16), v);
}

HWY_INLINE svint32_t SumsOf2(hwy::SignedTag /*type_tag*/,
                             hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
  const ScalableTag<int32_t> di32;
  const svbool_t pg = detail::PTrue(di32);
  return svadalp_s32_x(pg, Zero(di32), v);
}

HWY_INLINE svuint32_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
                              hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
  const ScalableTag<uint32_t> du32;
  const svbool_t pg = detail::PTrue(du32);
  return svadalp_u32_x(pg, Zero(du32), v);
}

HWY_INLINE svint64_t SumsOf2(hwy::SignedTag /*type_tag*/,
                             hwy::SizeTag<4> /*lane_size_tag*/, svint32_t v) {
  const ScalableTag<int64_t> di64;
  const svbool_t pg = detail::PTrue(di64);
  return svadalp_s64_x(pg, Zero(di64), v);
}

HWY_INLINE svuint64_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
                              hwy::SizeTag<4> /*lane_size_tag*/, svuint32_t v) {
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);
  return svadalp_u64_x(pg, Zero(du64), v);
}

}  // namespace detail
#endif  // HWY_SVE_HAVE_2

// ------------------------------ SumsOf4
namespace detail {

HWY_INLINE svint32_t SumsOf4(hwy::SignedTag /*type_tag*/,
                             hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
  return svdot_n_s32(Zero(ScalableTag<int32_t>()), v, 1);
}

HWY_INLINE svuint32_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
                              hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
  return svdot_n_u32(Zero(ScalableTag<uint32_t>()), v, 1);
}

HWY_INLINE svint64_t SumsOf4(hwy::SignedTag /*type_tag*/,
                             hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
  return svdot_n_s64(Zero(ScalableTag<int64_t>()), v, 1);
}

HWY_INLINE svuint64_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
                              hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
  return svdot_n_u64(Zero(ScalableTag<uint64_t>()), v, 1);
}

}  // namespace detail

// ------------------------------ SaturatedAdd

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)

// ------------------------------ SaturatedSub

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)

// ------------------------------ AbsDiff
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)

// ------------------------------ ShiftLeft[Same]

#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)                  \
  template <int kBits>                                                     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {            \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);       \
  }                                                                        \
  HWY_API HWY_SVE_V(BASE, BITS)                                            \
      NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) {                      \
    return sv##OP##_##CHAR##BITS##_x(                                      \
        HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)

// ------------------------------ ShiftRight[Same]

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

#undef HWY_SVE_SHIFT_N

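// Usage sketch (editor's illustration, not part of the original header):
//   const ScalableTag<uint32_t> d;
//   const auto v = Set(d, 0x80000001u);
//   const auto a = ShiftLeft<1>(v);       // compile-time count: 0x00000002
//   const auto b = ShiftRightSame(v, 4);  // runtime count: 0x08000000
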
// ------------------------------ RotateRight

#if HWY_SVE_HAVE_2

#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <int kBits>                                           \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {  \
    if (kBits == 0) return v;                                    \
    return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
                                 HWY_MAX(kBits, 1));             \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)

#undef HWY_SVE_ROTATE_RIGHT_N

#else  // !HWY_SVE_HAVE_2
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateRight(const V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
#endif

// ------------------------------ Shl/Shr

#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)           \
  HWY_API HWY_SVE_V(BASE, BITS)                                   \
      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du;               \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v,      \
                                     BitCast(du, bits));          \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

#undef HWY_SVE_SHIFT

// ------------------------------ Min/Max

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
}  // namespace detail

// ------------------------------ Mul

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul)

// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)

// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
#if HWY_SVE_HAVE_2
  return svqrdmulh_s16(a, b);
#else
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const svuint16_t lo = BitCast(du, Mul(a, b));
  const svint16_t hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
  // carry that into the result. Instead isolate the top two bits because only
  // they can influence the result.
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
  return Add(Add(hi, hi), BitCast(d, rounding));
#endif
}

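// Worked rounding example (editor's illustration, not part of the original
// header): for a = b = 0x4000 (0.5 in Q15), the full product is 0x10000000,
// so lo = 0x0000 and hi = 0x1000; lo_top2 = 0, rounding = (0 + 1) >> 1 = 0,
// and 2 * hi = 0x2000 = 0.25 in Q15, matching svqrdmulh_s16 on the SVE2 path.
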
// ------------------------------ Div
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif

HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, Div, div)
HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPVV, Div, div)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)

// ------------------------------ ApproximateReciprocal
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)

// ------------------------------ Sqrt
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

// ------------------------------ ApproximateReciprocalSqrt
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)

// ------------------------------ MulAdd

// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif

#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
      NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x,          \
           HWY_SVE_V(BASE, BITS) add) {                                 \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
  }

HWY_SVE_FOREACH(HWY_SVE_FMA, MulAdd, mad)

// ------------------------------ NegMulAdd
HWY_SVE_FOREACH(HWY_SVE_FMA, NegMulAdd, msb)

// ------------------------------ MulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)

// ------------------------------ NegMulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)

#undef HWY_SVE_FMA

// ------------------------------ Round etc.

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)

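// Usage sketch (editor's illustration, not part of the original header):
//   const ScalableTag<float> d;
//   // MulAdd(mul, x, add) computes mul * x + add in a single FMA:
//   const auto r = MulAdd(Set(d, 2.0f), Set(d, 3.0f), Set(d, 1.0f));  // 7.0f
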
// ================================================== MASK

// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
  return mask;
}

// ------------------------------ Mask logical

HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Arm says such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}

HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
  return svnor_b_z(HWY_SVE_PTRUE(8), a, b);  // !a && !b, undefined if a && b.
}

// ------------------------------ CountTrue

#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)           \
  template <size_t N, int kPow2>                                       \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m);                   \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
#undef HWY_SVE_COUNT_TRUE

// For 16-bit Compress: full vector, not limited to SV_POW2.
namespace detail {

#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                             \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m);                           \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
#undef HWY_SVE_COUNT_TRUE_FULL

}  // namespace detail

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, svbool_t m) {
  return !svptest_any(detail::MakeMask(d), m);
}

// ------------------------------ AllTrue
template <class D>
HWY_API bool AllTrue(D d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}

// ------------------------------ FindFirstTrue
template <class D>
HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
  return AllFalse(d, m) ? intptr_t{-1}
                        : static_cast<intptr_t>(
                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}

// ------------------------------ FindKnownFirstTrue
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
  return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
}

// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no);                                 \
  }

HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
HWY_SVE_FOREACH_BF16(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE

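// Usage sketch (editor's illustration, not part of the original header):
//   const ScalableTag<int32_t> d;
//   const svbool_t m = FirstN(d, 2);
//   const auto r = IfThenElse(m, Set(d, 1), Set(d, -1));  // 1,1,-1,-1,...
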
template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
HWY_API V IfThenElse(const svbool_t mask, V yes, V no) {
  const RebindToUnsigned<D> du;
  return BitCast(
      D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
}

// ------------------------------ IfThenElseZero

template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
  return IfThenElse(mask, yes, Zero(D()));
}

template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
HWY_API V IfThenElseZero(const svbool_t mask, V yes) {
  const RebindToUnsigned<D> du;
  return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
}

// ------------------------------ IfThenZeroElse

template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
  return IfThenElse(mask, Zero(D()), no);
}

template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
HWY_API V IfThenZeroElse(const svbool_t mask, V no) {
  const RebindToUnsigned<D> du;
  return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
}

// ------------------------------ Additional mask logical operations
HWY_API svbool_t SetBeforeFirst(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Arm says such bits are ignored.
  return svbrkb_b_z(HWY_SVE_PTRUE(8), m);
}

HWY_API svbool_t SetAtOrBeforeFirst(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Arm says such bits are ignored.
  return svbrka_b_z(HWY_SVE_PTRUE(8), m);
}

HWY_API svbool_t SetOnlyFirst(svbool_t m) { return svbrka_b_z(m, m); }

HWY_API svbool_t SetAtOrAfterFirst(svbool_t m) {
  return Not(SetBeforeFirst(m));
}

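// Example of the brk-based helpers above (editor's illustration, not part of
// the original header): for mask lanes F,T,T,F:
//   SetBeforeFirst     -> T,F,F,F  (svbrkb: lanes strictly before first true)
//   SetOnlyFirst       -> F,T,F,F  (svbrka governed and zeroed by m itself)
//   SetAtOrBeforeFirst -> T,T,F,F
//   SetAtOrAfterFirst  -> F,T,T,T
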
// ------------------------------ PromoteMaskTo

#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

template <class DTo, class DFrom,
          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) * 2)>
HWY_API svbool_t PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
  return svunpklo_b(m);
}

template <class DTo, class DFrom,
          HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>) * 2)>
HWY_API svbool_t PromoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
  using TFrom = TFromD<DFrom>;
  using TWFrom = MakeWide<MakeUnsigned<TFrom>>;
  static_assert(sizeof(TWFrom) > sizeof(TFrom),
                "sizeof(TWFrom) > sizeof(TFrom) must be true");

  const Rebind<TWFrom, decltype(d_from)> dw_from;
  return PromoteMaskTo(d_to, dw_from, PromoteMaskTo(dw_from, d_from, m));
}

// ------------------------------ DemoteMaskTo

#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 1),
          HWY_IF_T_SIZE_D(DFrom, 2)>
HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
  return svuzp1_b8(m, m);
}

template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 2),
          HWY_IF_T_SIZE_D(DFrom, 4)>
HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
  return svuzp1_b16(m, m);
}

template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 4),
          HWY_IF_T_SIZE_D(DFrom, 8)>
HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
  return svuzp1_b32(m, m);
}

template <class DTo, class DFrom,
          HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) / 4)>
HWY_API svbool_t DemoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
  using TFrom = TFromD<DFrom>;
  using TNFrom = MakeNarrow<MakeUnsigned<TFrom>>;
  static_assert(sizeof(TNFrom) < sizeof(TFrom),
                "sizeof(TNFrom) < sizeof(TFrom) must be true");

  const Rebind<TNFrom, decltype(d_from)> dn_from;
  return DemoteMaskTo(d_to, dn_from, DemoteMaskTo(dn_from, d_from, m));
}

// ------------------------------ LowerHalfOfMask
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

template <class D>
HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
  return m;
}

// ------------------------------ MaskedAddOr etc. (IfThenElse)

#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
#if HWY_SVE_HAVE_2
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
#endif
}  // namespace detail

template <class V, class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedMin(m, a, b), no);
}

template <class V, class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedMax(m, a, b), no);
}

template <class V, class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedAdd(m, a, b), no);
}

template <class V, class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedSub(m, a, b), no);
}

template <class V, class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedMul(m, a, b), no);
}

template <class V, class M,
          HWY_IF_T_SIZE_ONE_OF_V(
              V, (hwy::IsSame<TFromV<V>, hwy::float16_t>() ? (1 << 2) : 0) |
                     (1 << 4) | (1 << 8))>
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedDiv(m, a, b), no);
}

// I8/U8/I16/U16 MaskedDivOr is implemented after I8/U8/I16/U16 Div

#if HWY_SVE_HAVE_2
template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedSatAdd(m, a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, detail::MaskedSatSub(m, a, b), no);
}
#else
template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedAdd(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedSub(a, b), no);
}
#endif

// ================================================== COMPARE

// mask = f(vector, vector)
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
  }
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
  }

// ------------------------------ Eq
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
}  // namespace detail

// ------------------------------ Ne
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
}  // namespace detail

// ------------------------------ Lt
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
}  // namespace detail

// ------------------------------ Le
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Le, cmple)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LeN, cmple_n)
}  // namespace detail

// ------------------------------ Gt/Ge (swapped order)
template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, GeN, cmpge_n)
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, GtN, cmpgt_n)
}  // namespace detail

#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N

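// Usage sketch (editor's illustration, not part of the original header):
//   const ScalableTag<int32_t> d;
//   const svbool_t lt = Lt(Set(d, 2), Set(d, 3));  // all-true
//   const svbool_t gt = Gt(Set(d, 2), Set(d, 3));  // all-false (Lt, swapped)
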
// ------------------------------ TestBit
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
}

// ------------------------------ MaskFromVec (Ne)
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  using T = TFromV<V>;
  return detail::NeN(v, ConvertScalarTo<T>(0));
}

// ------------------------------ VecFromMask
template <class D>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const RebindToSigned<D> di;
  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
  // requires an extra instruction plus M0 pipeline.
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
}

// ------------------------------ IsNegative (Lt)
#ifdef HWY_NATIVE_IS_NEGATIVE
#undef HWY_NATIVE_IS_NEGATIVE
#else
#define HWY_NATIVE_IS_NEGATIVE
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API svbool_t IsNegative(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  return detail::LtN(BitCast(di, v), static_cast<TI>(0));
}

// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)

#if HWY_SVE_HAVE_2

#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)          \
  HWY_API HWY_SVE_V(BASE, BITS)                                   \
      NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
           HWY_SVE_V(BASE, BITS) no) {                            \
    return sv##OP##_##CHAR##BITS(yes, no, mask);                  \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
#undef HWY_SVE_IF_VEC

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
}

#else

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_SVE_HAVE_2

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ CopySign (BitwiseIfThenElse)
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs
template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
#if HWY_SVE_HAVE_2  // CopySign is more efficient than OrAnd
  return CopySign(abs, sign);
#else
  const DFromV<V> d;
  return OrAnd(abs, SignBit(d), sign);
#endif
}

// ------------------------------ Floating-point classification (Ne)

template <class V>
HWY_API svbool_t IsNaN(const V v) {
  return Ne(v, v);  // could also use cmpuo
}

// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
// We use a fused Set/comparison for IsFinite.
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

template <class V>
HWY_API svbool_t IsInf(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;

  // 'Shift left' to clear the sign bit
  const VFromD<decltype(du)> vu = BitCast(du, v);
  const VFromD<decltype(du)> v2 = Add(vu, vu);
  // Check for exponent=max and mantissa=0.
  const VFromD<decltype(di)> max2 = Set(di, hwy::MaxExponentTimes2<T>());
  return RebindMask(d, Eq(v2, BitCast(du, max2)));
}

// Returns whether normal/subnormal/zero.
template <class V>
HWY_API svbool_t IsFinite(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
}

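// Worked example of the Add(vu, vu) trick (editor's illustration, not part of
// the original header): for f32, +inf is 0x7F800000, so vu + vu == 0xFF000000
// == MaxExponentTimes2<float>(); IsInf thus matches both +inf and -inf, while
// NaNs (nonzero mantissa) compare not-equal.
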
// ================================================== MEMORY

// ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream

#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      LoadU(HWY_SVE_D(BASE, BITS, N, kPow2) d,                                \
            const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {                   \
    return svld1_##CHAR##BITS(detail::MakeMask(d),                            \
                              detail::NativeLanePointer(p));                  \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      MaskedLoad(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,         \
                 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {              \
    return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p));               \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_API void StoreU(HWY_SVE_V(BASE, BITS) v,                                \
                      HWY_SVE_D(BASE, BITS, N, kPow2) d,                      \
                      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {               \
    svst1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), v); \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_API void Stream(HWY_SVE_V(BASE, BITS) v,                                \
                      HWY_SVE_D(BASE, BITS, N, kPow2) d,                      \
                      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {               \
    svstnt1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p),   \
                         v);                                                  \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m,              \
                            HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,          \
                            HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {         \
    svst1_##CHAR##BITS(m, detail::NativeLanePointer(p), v);                   \
  }

1779
1780template <class D, HWY_SVE_IF_EMULATED_D(D)>
1781HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1782 const RebindToUnsigned<decltype(d)> du;
1783 return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
1784}
1785
1786template <class D, HWY_SVE_IF_EMULATED_D(D)>
1787HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1788 const RebindToUnsigned<decltype(d)> du;
1789 StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
1790}
1791
1792template <class D, HWY_SVE_IF_EMULATED_D(D)>
1794 const TFromD<D>* HWY_RESTRICT p) {
1795 const RebindToUnsigned<decltype(d)> du;
1796 return BitCast(d,
1798}
1799
1800// MaskedLoadOr is generic and does not require emulation.
1801
1802template <class D, HWY_SVE_IF_EMULATED_D(D)>
1803HWY_API void BlendedStore(VFromD<D> v, svbool_t m, D d,
1804                          TFromD<D>* HWY_RESTRICT p) {
1805  const RebindToUnsigned<decltype(d)> du;
1806  BlendedStore(BitCast(du, v), RebindMask(du, m), du,
1807               detail::U16LanePointer(p));
1808}
1809
1810#undef HWY_SVE_MEM
1811
1812#if HWY_TARGET != HWY_SVE2_128
1813namespace detail {
1814#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
1815 template <size_t N, int kPow2> \
1816 HWY_API HWY_SVE_V(BASE, BITS) \
1817 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1818 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1819 /* All-true predicate to load all 128 bits. */ \
1820 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
1821 detail::NativeLanePointer(p)); \
1822 }
1823
1824HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1825HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1826
1827template <class D, HWY_SVE_IF_EMULATED_D(D)>
1828HWY_API VFromD<D> LoadDupFull128(D d, const TFromD<D>* HWY_RESTRICT p) {
1829 const RebindToUnsigned<decltype(d)> du;
1830 return BitCast(d, LoadDupFull128(du, detail::U16LanePointer(p)));
1831}
1832
1833} // namespace detail
1834#endif // HWY_TARGET != HWY_SVE2_128
1835
1836#if HWY_TARGET == HWY_SVE2_128
1837// On the HWY_SVE2_128 target, LoadDup128 is the same as LoadU because
1838// vectors cannot exceed 16 bytes there.
1839template <class D>
1840HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1841  return LoadU(d, p);
1842}
1843#else // HWY_TARGET != HWY_SVE2_128
1844// If D().MaxBytes() <= 16, LoadDup128 is simply a LoadU operation.
1845template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1846HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1847 return LoadU(d, p);
1848}
1849
1850// If D().MaxBytes() > 16, load the vector using ld1rq.
1851template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
1852HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1853 return detail::LoadDupFull128(d, p);
1854}
1855
1856#endif // HWY_TARGET != HWY_SVE2_128
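// Usage sketch (illustrative addition, not from the original file): LoadDup128
// broadcasts one 16-byte block to every block of the vector, e.g. for
// per-block shuffle constants. Assumes `four_consts` points to 16 readable
// bytes.
namespace example_usage {
HWY_MAYBE_UNUSED inline svuint32_t BroadcastBlockOf4(
    const uint32_t* HWY_RESTRICT four_consts) {
  const ScalableTag<uint32_t> d;
  // Lanes repeat with period 4: four_consts[0..3], four_consts[0..3], ...
  return LoadDup128(d, four_consts);
}
}  // namespace example_usage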
1857
1858// ------------------------------ Load/Store
1859
1860// SVE only requires lane alignment, not natural alignment of the entire
1861// vector, so Load/Store are the same as LoadU/StoreU.
1862template <class D>
1863HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1864 return LoadU(d, p);
1865}
1866
1867template <class V, class D>
1868HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1869 StoreU(v, d, p);
1870}
1871
1872// ------------------------------ MaskedLoadOr
1873
1874// SVE MaskedLoad hard-codes zero, so this requires an extra blend.
1875template <class D>
1876HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1877 const TFromD<D>* HWY_RESTRICT p) {
1878 return IfThenElse(m, MaskedLoad(m, d, p), v);
1879}
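// Usage sketch (illustrative addition, not from the original file): handling a
// loop remainder with a default value via MaskedLoadOr.
namespace example_usage {
HWY_MAYBE_UNUSED inline svfloat32_t LoadRemainderOrOne(
    const float* HWY_RESTRICT p, size_t remaining) {
  const ScalableTag<float> d;
  const svbool_t m = FirstN(d, remaining);
  // Lanes [0, remaining) are loaded from p; all other lanes are 1.0f.
  return MaskedLoadOr(Set(d, 1.0f), m, d, p);
}
}  // namespace example_usage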
1880
1881// ------------------------------ ScatterOffset/Index
1882
1883#ifdef HWY_NATIVE_SCATTER
1884#undef HWY_NATIVE_SCATTER
1885#else
1886#define HWY_NATIVE_SCATTER
1887#endif
1888
1889#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
1890 template <size_t N, int kPow2> \
1891 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
1892 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1893 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1894 HWY_SVE_V(int, BITS) offset) { \
1895 sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
1896 v); \
1897 }
1898
1899#define HWY_SVE_MASKED_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1900 template <size_t N, int kPow2> \
1901 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1902 HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \
1903 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1904 HWY_SVE_V(int, BITS) indices) { \
1905 sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices, v); \
1906 }
1907
1908HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
1909HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_SCATTER_INDEX, MaskedScatterIndex,
1910                        st1_scatter)
1911#undef HWY_SVE_SCATTER_OFFSET
1912#undef HWY_SVE_MASKED_SCATTER_INDEX
1913
1914template <class D>
1915HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
1916                          VFromD<RebindToSigned<D>> indices) {
1917  MaskedScatterIndex(v, detail::MakeMask(d), d, base, indices);
1918}
1919
1920// ------------------------------ GatherOffset/Index
1921
1922#ifdef HWY_NATIVE_GATHER
1923#undef HWY_NATIVE_GATHER
1924#else
1925#define HWY_NATIVE_GATHER
1926#endif
1927
1928#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
1929 template <size_t N, int kPow2> \
1930 HWY_API HWY_SVE_V(BASE, BITS) \
1931 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1932 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1933 HWY_SVE_V(int, BITS) offset) { \
1934 return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
1935 offset); \
1936 }
1937#define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1938 template <size_t N, int kPow2> \
1939 HWY_API HWY_SVE_V(BASE, BITS) \
1940 NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1941 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1942 HWY_SVE_V(int, BITS) indices) { \
1943 const RebindToSigned<decltype(d)> di; \
1944 (void)di; /* for HWY_DASSERT */ \
1945 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
1946 return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices); \
1947 }
1948
1949HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
1950HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_GATHER_INDEX, MaskedGatherIndex,
1951                        ld1_gather)
1952#undef HWY_SVE_GATHER_OFFSET
1953#undef HWY_SVE_MASKED_GATHER_INDEX
1954
1955template <class D>
1956HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, svbool_t m, D d,
1957                                      const TFromD<D>* HWY_RESTRICT p,
1958                                      VFromD<RebindToSigned<D>> indices) {
1959  return IfThenElse(m, MaskedGatherIndex(m, d, p, indices), no);
1960}
1961
1962template <class D>
1963HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
1964                              VFromD<RebindToSigned<D>> indices) {
1965  return MaskedGatherIndex(detail::MakeMask(d), d, p, indices);
1966}
1967
1968// ------------------------------ LoadInterleaved2
1969
1970// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
1971#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1972#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1973#else
1974#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1975#endif
1976
1977#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \
1978 template <size_t N, int kPow2> \
1979 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1980 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1981 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
1982 const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = sv##OP##_##CHAR##BITS( \
1983 detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1984 v0 = svget2(tuple, 0); \
1985 v1 = svget2(tuple, 1); \
1986 }
1987HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
1988
1989#undef HWY_SVE_LOAD2
1990
1991// ------------------------------ LoadInterleaved3
1992
1993#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \
1994 template <size_t N, int kPow2> \
1995 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1996 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1997 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1998 HWY_SVE_V(BASE, BITS) & v2) { \
1999 const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = sv##OP##_##CHAR##BITS( \
2000 detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
2001 v0 = svget3(tuple, 0); \
2002 v1 = svget3(tuple, 1); \
2003 v2 = svget3(tuple, 2); \
2004 }
2005HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
2006
2007#undef HWY_SVE_LOAD3
2008
2009// ------------------------------ LoadInterleaved4
2010
2011#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \
2012 template <size_t N, int kPow2> \
2013 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2014 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
2015 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
2016 HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
2017 const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = sv##OP##_##CHAR##BITS( \
2018 detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
2019 v0 = svget4(tuple, 0); \
2020 v1 = svget4(tuple, 1); \
2021 v2 = svget4(tuple, 2); \
2022 v3 = svget4(tuple, 3); \
2023 }
2024HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
2025
2026#undef HWY_SVE_LOAD4
2027
2028// ------------------------------ StoreInterleaved2
2029
2030#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
2031 template <size_t N, int kPow2> \
2032 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2033 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2034 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2035 sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2036 detail::NativeLanePointer(unaligned), \
2037 Create2(d, v0, v1)); \
2038 }
2039HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
2040
2041#undef HWY_SVE_STORE2
2042
2043// ------------------------------ StoreInterleaved3
2044
2045#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
2046 template <size_t N, int kPow2> \
2047 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2048 HWY_SVE_V(BASE, BITS) v2, \
2049 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2050 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2051 sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2052 detail::NativeLanePointer(unaligned), \
2053 Create3(d, v0, v1, v2)); \
2054 }
2055HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
2056
2057#undef HWY_SVE_STORE3
2058
2059// ------------------------------ StoreInterleaved4
2060
2061#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
2062 template <size_t N, int kPow2> \
2063 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2064 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
2065 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2066 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2067 sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2068 detail::NativeLanePointer(unaligned), \
2069 Create4(d, v0, v1, v2, v3)); \
2070 }
2071HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
2072
2073#undef HWY_SVE_STORE4
2074
2075// ================================================== CONVERT
2076
2077// ------------------------------ PromoteTo
2078
2079// Same sign
2080#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
2081 template <size_t N, int kPow2> \
2082 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
2083 HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
2084 return sv##OP##_##CHAR##BITS(v); \
2085 }
2086
2087HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
2088HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
2089HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
2090
2091// 2x
2092template <size_t N, int kPow2>
2093HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
2094 const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
2095 return PromoteTo(dto, PromoteTo(d2, vfrom));
2096}
2097template <size_t N, int kPow2>
2098HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
2099 const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
2100 return PromoteTo(dto, PromoteTo(d2, vfrom));
2101}
2102template <size_t N, int kPow2>
2103HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> dto, svuint16_t vfrom) {
2104 const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
2105 return PromoteTo(dto, PromoteTo(d2, vfrom));
2106}
2107template <size_t N, int kPow2>
2108HWY_API svint64_t PromoteTo(Simd<int64_t, N, kPow2> dto, svint16_t vfrom) {
2109 const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
2110 return PromoteTo(dto, PromoteTo(d2, vfrom));
2111}
2112
2113// 3x
2114template <size_t N, int kPow2>
2115HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> dto, svuint8_t vfrom) {
2116 const RepartitionToNarrow<decltype(dto)> d4;
2117 const RepartitionToNarrow<decltype(d4)> d2;
2118 return PromoteTo(dto, PromoteTo(d4, PromoteTo(d2, vfrom)));
2119}
2120template <size_t N, int kPow2>
2121HWY_API svint64_t PromoteTo(Simd<int64_t, N, kPow2> dto, svint8_t vfrom) {
2122 const RepartitionToNarrow<decltype(dto)> d4;
2123 const RepartitionToNarrow<decltype(d4)> d2;
2124 return PromoteTo(dto, PromoteTo(d4, PromoteTo(d2, vfrom)));
2125}
2126
2127// Sign change
2128template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
2129 HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>))>
2130HWY_API VFromD<D> PromoteTo(D di, V v) {
2131  const RebindToUnsigned<decltype(di)> du;
2132 return BitCast(di, PromoteTo(du, v));
2133}
2134
2135// ------------------------------ PromoteTo F
2136
2137// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
2138#ifdef HWY_NATIVE_F16C
2139#undef HWY_NATIVE_F16C
2140#else
2141#define HWY_NATIVE_F16C
2142#endif
2143
2144// Unlike Highway's ZipLower, this returns the same type.
2145namespace detail {
2146HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
2147} // namespace detail
2148
2149template <size_t N, int kPow2>
2150HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* tag */,
2151                              const svfloat16_t v) {
2152 // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
2153 // first replicate each lane once.
2154 const svfloat16_t vv = detail::ZipLowerSame(v, v);
2155 return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
2156}
2157
2158#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
2159#undef HWY_NATIVE_PROMOTE_F16_TO_F64
2160#else
2161#define HWY_NATIVE_PROMOTE_F16_TO_F64
2162#endif
2163
2164template <size_t N, int kPow2>
2165HWY_API svfloat64_t PromoteTo(Simd<double, N, kPow2> /* tag */,
2166                              const svfloat16_t v) {
2167 // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
2168 // first replicate each lane once.
2169 const svfloat16_t vv = detail::ZipLowerSame(v, v);
2170 return svcvt_f64_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()),
2171 detail::ZipLowerSame(vv, vv));
2172}
2173
2174template <size_t N, int kPow2>
2175HWY_API svfloat64_t PromoteTo(Simd<double, N, kPow2> /* tag */,
2176                              const svfloat32_t v) {
2177 const svfloat32_t vv = detail::ZipLowerSame(v, v);
2178 return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
2179}
2180
2181template <size_t N, int kPow2>
2182HWY_API svfloat64_t PromoteTo(Simd<double, N, kPow2> /* tag */,
2183                              const svint32_t v) {
2184 const svint32_t vv = detail::ZipLowerSame(v, v);
2185 return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
2186}
2187
2188template <size_t N, int kPow2>
2189HWY_API svfloat64_t PromoteTo(Simd<double, N, kPow2> /* tag */,
2190                              const svuint32_t v) {
2191 const svuint32_t vv = detail::ZipLowerSame(v, v);
2192 return svcvt_f64_u32_x(detail::PTrue(Simd<uint32_t, N, kPow2>()), vv);
2193}
2194
2195template <size_t N, int kPow2>
2196HWY_API svint64_t PromoteTo(Simd<int64_t, N, kPow2> /* tag */,
2197                            const svfloat32_t v) {
2198 const svfloat32_t vv = detail::ZipLowerSame(v, v);
2199 return svcvt_s64_f32_x(detail::PTrue(Simd<float, N, kPow2>()), vv);
2200}
2201
2202template <size_t N, int kPow2>
2203HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> /* tag */,
2204                             const svfloat32_t v) {
2205 const svfloat32_t vv = detail::ZipLowerSame(v, v);
2206 return svcvt_u64_f32_x(detail::PTrue(Simd<float, N, kPow2>()), vv);
2207}
2208
2209// ------------------------------ PromoteUpperTo
2210
2211namespace detail {
2212HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
2213HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
2214HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
2215#undef HWY_SVE_PROMOTE_TO
2216} // namespace detail
2217
2218#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
2219#undef HWY_NATIVE_PROMOTE_UPPER_TO
2220#else
2221#define HWY_NATIVE_PROMOTE_UPPER_TO
2222#endif
2223
2224// Unsigned->Unsigned or Signed->Signed
2225template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
2226 hwy::EnableIf<IsInteger<TD>() && IsInteger<TV>() &&
2227 (IsSigned<TD>() == IsSigned<TV>())>* = nullptr>
2228HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2229  if (detail::IsFull(d)) {
2230 return detail::PromoteUpperTo(d, v);
2231 }
2232 const Rebind<TFromV<V>, decltype(d)> dh;
2233 return PromoteTo(d, UpperHalf(dh, v));
2234}
2235
2236// Differing signs or either is float
2237template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
2238 hwy::EnableIf<!IsInteger<TD>() || !IsInteger<TV>() ||
2239 (IsSigned<TD>() != IsSigned<TV>())>* = nullptr>
2240HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2241  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
2242 // because it cannot be deduced from D (could be either bf16 or f16).
2243 const Rebind<TFromV<V>, decltype(d)> dh;
2244 return PromoteTo(d, UpperHalf(dh, v));
2245}
2246
2247// ------------------------------ DemoteTo U
2248
2249namespace detail {
2250
2251// Saturates unsigned vectors to half/quarter-width TN.
2252template <typename TN, class VU>
2253VU SaturateU(VU v) {
2254 return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
2255}
2256
2257// Saturates signed vectors to half/quarter-width TN.
2258template <typename TN, class VI>
2259VI SaturateI(VI v) {
2260 return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
2261}
2262
2263} // namespace detail
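// Worked example (illustrative addition): for TN = int8_t, SaturateI clamps an
// i16 vector to [-128, 127], so lanes {-300, 5, 200} become {-128, 5, 127}.
// Likewise, SaturateU with TN = uint8_t clamps u16 lanes to [0, 255].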
2264
2265template <size_t N, int kPow2>
2266HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
2267#if HWY_SVE_HAVE_2
2268 const svuint8_t vn = BitCast(dn, svqxtunb_s16(v));
2269#else
2270 const DFromV<decltype(v)> di;
2271 const RebindToUnsigned<decltype(di)> du;
2272 using TN = TFromD<decltype(dn)>;
2273 // First clamp negative numbers to zero and cast to unsigned.
2274 const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
2275 // Saturate to unsigned-max and halve the width.
2276 const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
2277#endif
2278 return svuzp1_u8(vn, vn);
2279}
2280
2281template <size_t N, int kPow2>
2282HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
2283#if HWY_SVE_HAVE_2
2284 const svuint16_t vn = BitCast(dn, svqxtunb_s32(v));
2285#else
2286 const DFromV<decltype(v)> di;
2287 const RebindToUnsigned<decltype(di)> du;
2288 using TN = TFromD<decltype(dn)>;
2289 // First clamp negative numbers to zero and cast to unsigned.
2290 const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
2291 // Saturate to unsigned-max and halve the width.
2292 const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
2293#endif
2294 return svuzp1_u16(vn, vn);
2295}
2296
2297template <size_t N, int kPow2>
2298HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
2299 const DFromV<decltype(v)> di;
2300 const RebindToUnsigned<decltype(di)> du;
2301 const RepartitionToNarrow<decltype(du)> d2;
2302#if HWY_SVE_HAVE_2
2303 const svuint16_t cast16 = BitCast(d2, svqxtnb_u16(svqxtunb_s32(v)));
2304#else
2305 using TN = TFromD<decltype(dn)>;
2306 // First clamp negative numbers to zero and cast to unsigned.
2307 const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
2308 // Saturate to unsigned-max and quarter the width.
2309 const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
2310#endif
2311 const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
2312 return svuzp1_u8(x2, x2);
2313}
2314
2315HWY_API svuint8_t U8FromU32(const svuint32_t v) {
2316 const DFromV<svuint32_t> du32;
2317 const RepartitionToNarrow<decltype(du32)> du16;
2318 const RepartitionToNarrow<decltype(du16)> du8;
2319
2320 const svuint16_t cast16 = BitCast(du16, v);
2321 const svuint16_t x2 = svuzp1_u16(cast16, cast16);
2322 const svuint8_t cast8 = BitCast(du8, x2);
2323 return svuzp1_u8(cast8, cast8);
2324}
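// Lane walkthrough (illustrative addition): given u32 lanes {a, b, c, d}, the
// u16 view is {a0, a1, b0, b1, ...} (low half first). The first uzp1 keeps the
// low u16 halves {a0, b0, c0, d0, ...}; the second uzp1 then keeps their low
// bytes, so the lower lanes of the result are {a & 0xFF, b & 0xFF, ...}.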
2325
2326template <size_t N, int kPow2>
2327HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint16_t v) {
2328#if HWY_SVE_HAVE_2
2329 const svuint8_t vn = BitCast(dn, svqxtnb_u16(v));
2330#else
2331 using TN = TFromD<decltype(dn)>;
2332 const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(v));
2333#endif
2334 return svuzp1_u8(vn, vn);
2335}
2336
2337template <size_t N, int kPow2>
2338HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svuint32_t v) {
2339#if HWY_SVE_HAVE_2
2340 const svuint16_t vn = BitCast(dn, svqxtnb_u32(v));
2341#else
2342 using TN = TFromD<decltype(dn)>;
2343 const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(v));
2344#endif
2345 return svuzp1_u16(vn, vn);
2346}
2347
2348template <size_t N, int kPow2>
2349HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint32_t v) {
2350 using TN = TFromD<decltype(dn)>;
2351 return U8FromU32(detail::SaturateU<TN>(v));
2352}
2353
2354// ------------------------------ Truncations
2355
2356template <size_t N, int kPow2>
2357HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
2358                             const svuint64_t v) {
2359 const DFromV<svuint8_t> d;
2360 const svuint8_t v1 = BitCast(d, v);
2361 const svuint8_t v2 = svuzp1_u8(v1, v1);
2362 const svuint8_t v3 = svuzp1_u8(v2, v2);
2363 return svuzp1_u8(v3, v3);
2364}
2365
2366template <size_t N, int kPow2>
2367HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
2368                              const svuint64_t v) {
2369 const DFromV<svuint16_t> d;
2370 const svuint16_t v1 = BitCast(d, v);
2371 const svuint16_t v2 = svuzp1_u16(v1, v1);
2372 return svuzp1_u16(v2, v2);
2373}
2374
2375template <size_t N, int kPow2>
2376HWY_API svuint32_t TruncateTo(Simd<uint32_t, N, kPow2> /* tag */,
2377                              const svuint64_t v) {
2378 const DFromV<svuint32_t> d;
2379 const svuint32_t v1 = BitCast(d, v);
2380 return svuzp1_u32(v1, v1);
2381}
2382
2383template <size_t N, int kPow2>
2384HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
2385                             const svuint32_t v) {
2386 const DFromV<svuint8_t> d;
2387 const svuint8_t v1 = BitCast(d, v);
2388 const svuint8_t v2 = svuzp1_u8(v1, v1);
2389 return svuzp1_u8(v2, v2);
2390}
2391
2392template <size_t N, int kPow2>
2393HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
2394                              const svuint32_t v) {
2395 const DFromV<svuint16_t> d;
2396 const svuint16_t v1 = BitCast(d, v);
2397 return svuzp1_u16(v1, v1);
2398}
2399
2400template <size_t N, int kPow2>
2401HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
2402                             const svuint16_t v) {
2403 const DFromV<svuint8_t> d;
2404 const svuint8_t v1 = BitCast(d, v);
2405 return svuzp1_u8(v1, v1);
2406}
2407
2408// ------------------------------ DemoteTo I
2409
2410template <size_t N, int kPow2>
2411HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
2412#if HWY_SVE_HAVE_2
2413 const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
2414#else
2415 using TN = TFromD<decltype(dn)>;
2416 const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
2417#endif
2418 return svuzp1_s8(vn, vn);
2419}
2420
2421template <size_t N, int kPow2>
2422HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
2423#if HWY_SVE_HAVE_2
2424 const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
2425#else
2426 using TN = TFromD<decltype(dn)>;
2427 const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
2428#endif
2429 return svuzp1_s16(vn, vn);
2430}
2431
2432template <size_t N, int kPow2>
2433HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
2434 const RepartitionToWide<decltype(dn)> d2;
2435#if HWY_SVE_HAVE_2
2436 const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
2437#else
2438 using TN = TFromD<decltype(dn)>;
2439 const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
2440#endif
2441 const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
2442 return BitCast(dn, svuzp1_s8(v2, v2));
2443}
2444
2445// ------------------------------ I64/U64 DemoteTo
2446
2447template <size_t N, int kPow2>
2448HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> dn, const svint64_t v) {
2449 const Rebind<uint64_t, decltype(dn)> du64;
2450 const RebindToUnsigned<decltype(dn)> dn_u;
2451#if HWY_SVE_HAVE_2
2452 const svuint64_t vn = BitCast(du64, svqxtnb_s64(v));
2453#else
2454 using TN = TFromD<decltype(dn)>;
2455 const svuint64_t vn = BitCast(du64, detail::SaturateI<TN>(v));
2456#endif
2457 return BitCast(dn, TruncateTo(dn_u, vn));
2458}
2459
2460template <size_t N, int kPow2>
2461HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint64_t v) {
2462 const Rebind<uint64_t, decltype(dn)> du64;
2463 const RebindToUnsigned<decltype(dn)> dn_u;
2464#if HWY_SVE_HAVE_2
2465 const svuint64_t vn = BitCast(du64, svqxtnb_s32(svqxtnb_s64(v)));
2466#else
2467 using TN = TFromD<decltype(dn)>;
2468 const svuint64_t vn = BitCast(du64, detail::SaturateI<TN>(v));
2469#endif
2470 return BitCast(dn, TruncateTo(dn_u, vn));
2471}
2472
2473template <size_t N, int kPow2>
2474HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint64_t v) {
2475 const Rebind<uint64_t, decltype(dn)> du64;
2476 const RebindToUnsigned<decltype(dn)> dn_u;
2477 using TN = TFromD<decltype(dn)>;
2478 const svuint64_t vn = BitCast(du64, detail::SaturateI<TN>(v));
2479 return BitCast(dn, TruncateTo(dn_u, vn));
2480}
2481
2482template <size_t N, int kPow2>
2483HWY_API svuint32_t DemoteTo(Simd<uint32_t, N, kPow2> dn, const svint64_t v) {
2484 const Rebind<uint64_t, decltype(dn)> du64;
2485#if HWY_SVE_HAVE_2
2486 const svuint64_t vn = BitCast(du64, svqxtunb_s64(v));
2487#else
2488 using TN = TFromD<decltype(dn)>;
2489 // First clamp negative numbers to zero and cast to unsigned.
2490 const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0));
2491 // Saturate to unsigned-max
2492 const svuint64_t vn = detail::SaturateU<TN>(clamped);
2493#endif
2494 return TruncateTo(dn, vn);
2495}
2496
2497template <size_t N, int kPow2>
2498HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint64_t v) {
2499 const Rebind<uint64_t, decltype(dn)> du64;
2500#if HWY_SVE_HAVE_2
2501 const svuint64_t vn = BitCast(du64, svqxtnb_u32(svqxtunb_s64(v)));
2502#else
2503 using TN = TFromD<decltype(dn)>;
2504 // First clamp negative numbers to zero and cast to unsigned.
2505 const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0));
2506 // Saturate to unsigned-max
2507 const svuint64_t vn = detail::SaturateU<TN>(clamped);
2508#endif
2509 return TruncateTo(dn, vn);
2510}
2511
2512template <size_t N, int kPow2>
2513HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint64_t v) {
2514 const Rebind<uint64_t, decltype(dn)> du64;
2515 using TN = TFromD<decltype(dn)>;
2516 // First clamp negative numbers to zero and cast to unsigned.
2517 const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0));
2518 // Saturate to unsigned-max
2519 const svuint64_t vn = detail::SaturateU<TN>(clamped);
2520 return TruncateTo(dn, vn);
2521}
2522
2523template <size_t N, int kPow2>
2524HWY_API svuint32_t DemoteTo(Simd<uint32_t, N, kPow2> dn, const svuint64_t v) {
2525 const Rebind<uint64_t, decltype(dn)> du64;
2526#if HWY_SVE_HAVE_2
2527 const svuint64_t vn = BitCast(du64, svqxtnb_u64(v));
2528#else
2529 using TN = TFromD<decltype(dn)>;
2530 const svuint64_t vn = BitCast(du64, detail::SaturateU<TN>(v));
2531#endif
2532 return TruncateTo(dn, vn);
2533}
2534
2535template <size_t N, int kPow2>
2536HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svuint64_t v) {
2537 const Rebind<uint64_t, decltype(dn)> du64;
2538#if HWY_SVE_HAVE_2
2539 const svuint64_t vn = BitCast(du64, svqxtnb_u32(svqxtnb_u64(v)));
2540#else
2541 using TN = TFromD<decltype(dn)>;
2542 const svuint64_t vn = BitCast(du64, detail::SaturateU<TN>(v));
2543#endif
2544 return TruncateTo(dn, vn);
2545}
2546
2547template <size_t N, int kPow2>
2548HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint64_t v) {
2549 const Rebind<uint64_t, decltype(dn)> du64;
2550 using TN = TFromD<decltype(dn)>;
2551 const svuint64_t vn = BitCast(du64, detail::SaturateU<TN>(v));
2552 return TruncateTo(dn, vn);
2553}
2554
2555// ------------------------------ Unsigned to signed demotions
2556
2557// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
2558// implementations in generic_ops-inl.h on SVE/SVE2, as these targets
2559// provide target-specific implementations of the unsigned to signed
2560// DemoteTo and ReorderDemote2To ops.
2561
2562// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
2563// hwy::EnableIf<false>* = nullptr to avoid compiler errors. Although
2564// !hwy::IsSame<V, V>() is always false, it depends on the V template
2565// argument, so it triggers SFINAE (the overload is silently discarded)
2566// rather than a hard error.
2567#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
2568#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
2569 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
2570
2571template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
2572 HWY_IF_T_SIZE_LE_D(D, sizeof(TFromV<V>) - 1)>
2573HWY_API VFromD<D> DemoteTo(D dn, V v) {
2574  const RebindToUnsigned<D> dn_u;
2575 return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
2576}
2577
2578// ------------------------------ ConcatEven/ConcatOdd
2579
2580// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
2581// full vector length, not rounded down to a power of two as we require).
2582namespace detail {
2583
2584#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
2585 HWY_INLINE HWY_SVE_V(BASE, BITS) \
2586 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
2587 return sv##OP##_##CHAR##BITS(lo, hi); \
2588 }
2589HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
2590HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
2591#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2592HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
2593                                   uzp1)
2594HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
2595                                   uzp2)
2596#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2597#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
2598HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
2599HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
2600#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2601HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
2602                                   ConcatEvenBlocks, uzp1q)
2603HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
2604                                   ConcatOddBlocks, uzp2q)
2605#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2606#endif // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
2607#undef HWY_SVE_CONCAT_EVERY_SECOND
2608
2609// Used to slide up / shift whole register left; mask indicates which range
2610// to take from lo, and the rest is filled from hi starting at its lowest.
2611#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
2612 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
2613 HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
2614 return sv##OP##_##CHAR##BITS(mask, lo, hi); \
2615 }
2616HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
2617#if HWY_SVE_HAVE_BF16_FEATURE
2618HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
2619#else
2620template <class V, HWY_IF_BF16_D(DFromV<V>)>
2621HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
2622 const DFromV<V> d;
2623 const RebindToUnsigned<decltype(d)> du;
2624 return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
2625}
2626#endif // HWY_SVE_HAVE_BF16_FEATURE
2627#undef HWY_SVE_SPLICE
2628
2629} // namespace detail
2630
2631template <class D>
2632HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
2633#if HWY_SVE_IS_POW2
2634 if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo);
2635#endif
2636 const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
2637 const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
2638 return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
2639}
2640
2641template <class D>
2642HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
2643#if HWY_SVE_IS_POW2
2644 if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo);
2645#endif
2646 const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
2647 const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
2648 return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
2649}
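// Lane walkthrough (illustrative addition): with hi = {h0, h1, h2, h3} and
// lo = {l0, l1, l2, l3}, ConcatOdd returns {l1, l3, h1, h3} and ConcatEven
// returns {l0, l2, h0, h2}.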
2650
2651// ------------------------------ DemoteTo F
2652
2653// We already toggled HWY_NATIVE_F16C above.
2654
2655template <size_t N, int kPow2>
2656HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
2657 const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
2658 return detail::ConcatEvenFull(in_even,
2659 in_even); // lower half
2660}
2661
2662#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
2663#undef HWY_NATIVE_DEMOTE_F64_TO_F16
2664#else
2665#define HWY_NATIVE_DEMOTE_F64_TO_F16
2666#endif
2667
2668template <size_t N, int kPow2>
2669HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
2670 const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(d), v);
2671 const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
2672 return detail::ConcatEvenFull(in_even,
2673 in_even); // lower half
2674}
2675
2676#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2677#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2678#else
2679#define HWY_NATIVE_DEMOTE_F32_TO_BF16
2680#endif
2681
2682#if !HWY_SVE_HAVE_F32_TO_BF16C
2683namespace detail {
2684
2685// Round an F32 value to the nearest BF16 value, with the result returned
2686// as the rounded F32 value bitcast to a U32.
2687
2688// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
2689// NaN F32 values from being converted to an infinity.
2690HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v) {
2691 const DFromV<decltype(v)> df32;
2692 const RebindToUnsigned<decltype(df32)> du32;
2693
2694 const auto is_non_nan = Eq(v, v);
2695 const auto bits32 = BitCast(du32, v);
2696
2697 const auto round_incr =
2698 detail::AddN(detail::AndN(ShiftRight<16>(bits32), 1u), 0x7FFFu);
2699 return MaskedAddOr(detail::OrN(bits32, 0x00400000u), is_non_nan, bits32,
2700 round_incr);
2701}
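// Worked example (illustrative addition): round-to-nearest-even in integer
// arithmetic. For bits32 = 0x3F808000 (exactly halfway, bit 16 clear), the
// increment is 0 + 0x7FFF; 0x3F808000 + 0x7FFF = 0x3F80FFFF, so the upper 16
// bits stay 0x3F80 (round down to even). For bits32 = 0x3F818000 (halfway,
// bit 16 set), the increment is 1 + 0x7FFF = 0x8000 and the sum is 0x3F820000,
// so the upper 16 bits become 0x3F82 (round up to even).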
2702
2703} // namespace detail
2704#endif // !HWY_SVE_HAVE_F32_TO_BF16C
2705
2706template <size_t N, int kPow2>
2707HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
2708#if HWY_SVE_HAVE_F32_TO_BF16C
2709 const VBF16 in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), v);
2710 return detail::ConcatEvenFull(in_even, in_even);
2711#else
2712  const svuint16_t in_odd = BitCast(RebindToUnsigned<decltype(dbf16)>(),
2713                                    detail::RoundF32ForDemoteToBF16(v));
2714  return BitCast(dbf16, detail::ConcatOddFull(in_odd, in_odd));  // lower half
2715#endif
2716}
2717
2718template <size_t N, int kPow2>
2719HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
2720 const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
2721 return detail::ConcatEvenFull(in_even,
2722 in_even); // lower half
2723}
2724
2725template <size_t N, int kPow2>
2726HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
2727 const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
2728 return detail::ConcatEvenFull(in_even,
2729 in_even); // lower half
2730}
2731
2732template <size_t N, int kPow2>
2733HWY_API svuint32_t DemoteTo(Simd<uint32_t, N, kPow2> d, const svfloat64_t v) {
2734 const svuint32_t in_even = svcvt_u32_f64_x(detail::PTrue(d), v);
2735 return detail::ConcatEvenFull(in_even,
2736 in_even); // lower half
2737}
2738
2739template <size_t N, int kPow2>
2740HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svint64_t v) {
2741 const svfloat32_t in_even = svcvt_f32_s64_x(detail::PTrue(d), v);
2742 return detail::ConcatEvenFull(in_even,
2743 in_even); // lower half
2744}
2745
2746template <size_t N, int kPow2>
2747HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
2748 const svfloat32_t in_even = svcvt_f32_u64_x(detail::PTrue(d), v);
2749 return detail::ConcatEvenFull(in_even,
2750 in_even); // lower half
2751}
2752
2753// ------------------------------ ConvertTo F
2754
2755#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
2756 /* signed integers */ \
2757 template <size_t N, int kPow2> \
2758 HWY_API HWY_SVE_V(BASE, BITS) \
2759 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
2760 return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2761 } \
2762 /* unsigned integers */ \
2763 template <size_t N, int kPow2> \
2764 HWY_API HWY_SVE_V(BASE, BITS) \
2765 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
2766 return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2767 } \
2768 /* Truncates (rounds toward zero). */ \
2769 template <size_t N, int kPow2> \
2770 HWY_API HWY_SVE_V(int, BITS) \
2771 NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
2772 return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2773 } \
2774 /* Truncates to unsigned (rounds toward zero). */ \
2775 template <size_t N, int kPow2> \
2776 HWY_API HWY_SVE_V(uint, BITS) \
2777 NAME(HWY_SVE_D(uint, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
2778 return sv##OP##_u##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2779 }
2780
2781// API only requires f32 but we provide f64 for use by Iota.
2782HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
2783#undef HWY_SVE_CONVERT
2784
2785// ------------------------------ NearestInt (Round, ConvertTo)
2786template <class VF, class DI = RebindToSigned<DFromV<VF>>>
2787HWY_API VFromD<DI> NearestInt(VF v) {
2788  // No single instruction, round then truncate.
2789 return ConvertTo(DI(), Round(v));
2790}
2791
2792// ------------------------------ Iota (Add, ConvertTo)
2793
2794#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
2795 template <size_t N, int kPow2, typename T2> \
2796 HWY_API HWY_SVE_V(BASE, BITS) \
2797 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, T2 first) { \
2798 return sv##OP##_##CHAR##BITS( \
2799 ConvertScalarTo<HWY_SVE_T(BASE, BITS)>(first), 1); \
2800 }
2801
2802HWY_SVE_FOREACH(HWY_SVE_IOTA, Iota, index)
2803#undef HWY_SVE_IOTA
2804
2805template <class D, typename T2, HWY_IF_FLOAT_D(D)>
2806HWY_API VFromD<D> Iota(const D d, T2 first) {
2807 const RebindToSigned<D> di;
2808 return detail::AddN(ConvertTo(d, Iota(di, 0)),
2809 ConvertScalarTo<TFromD<D>>(first));
2810}
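// Usage sketch (illustrative addition, not from the original file): for float
// tags, Iota adds `first` to the lane index.
namespace example_usage {
HWY_MAYBE_UNUSED inline svfloat32_t RampFromHalf() {
  const ScalableTag<float> d;
  return Iota(d, 0.5f);  // {0.5f, 1.5f, 2.5f, ...}
}
}  // namespace example_usage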
2811
2812// ------------------------------ InterleaveLower
2813
2814template <class D, class V>
2815HWY_API V InterleaveLower(D d, const V a, const V b) {
2816 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2817#if HWY_TARGET == HWY_SVE2_128
2818 (void)d;
2819 return detail::ZipLowerSame(a, b);
2820#else
2821 // Move lower halves of blocks to lower half of vector.
2822 const Repartition<uint64_t, decltype(d)> d64;
2823 const auto a64 = BitCast(d64, a);
2824 const auto b64 = BitCast(d64, b);
2825 const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half
2826 const auto b_blocks = detail::ConcatEvenFull(b64, b64);
2827 return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
2828#endif
2829}
2830
2831template <class V>
2832HWY_API V InterleaveLower(const V a, const V b) {
2833 return InterleaveLower(DFromV<V>(), a, b);
2834}
2835
2836// ------------------------------ InterleaveUpper
2837
2838// Only use zip2 if vectors are a power of two; otherwise, getting the actual
2839// "upper half" requires MaskUpperHalf.
2840namespace detail {
2841// Unlike Highway's ZipUpper, this returns the same type.
2842HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
2843} // namespace detail
2844
2845// Full vector: guaranteed to have at least one block
2846template <class D, class V = VFromD<D>,
2847 hwy::EnableIf<detail::IsFull(D())>* = nullptr>
2848HWY_API V InterleaveUpper(D d, const V a, const V b) {
2849#if HWY_TARGET == HWY_SVE2_128
2850 (void)d;
2851 return detail::ZipUpperSame(a, b);
2852#else
2853 // Move upper halves of blocks to lower half of vector.
2854 const Repartition<uint64_t, decltype(d)> d64;
2855 const auto a64 = BitCast(d64, a);
2856 const auto b64 = BitCast(d64, b);
2857 const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half
2858 const auto b_blocks = detail::ConcatOddFull(b64, b64);
2859 return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
2860#endif
2861}
2862
2863// Capped/fraction: need runtime check
2864template <class D, class V = VFromD<D>,
2865 hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
2866HWY_API V InterleaveUpper(D d, const V a, const V b) {
2867 // Less than one block: treat as capped
2868 if (Lanes(d) * sizeof(TFromD<D>) < 16) {
2869 const Half<decltype(d)> d2;
2870 return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
2871 }
2872 return InterleaveUpper(DFromV<V>(), a, b);
2873}
2874
2875// ------------------------------ InterleaveWholeLower
2876#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
2877#undef HWY_NATIVE_INTERLEAVE_WHOLE
2878#else
2879#define HWY_NATIVE_INTERLEAVE_WHOLE
2880#endif
2881
2882template <class D>
2883HWY_API VFromD<D> InterleaveWholeLower(D /*d*/, VFromD<D> a, VFromD<D> b) {
2884  return detail::ZipLowerSame(a, b);
2885}
2886
2887// ------------------------------ InterleaveWholeUpper
2888
2889template <class D>
2890HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
2891  if (HWY_SVE_IS_POW2 && detail::IsFull(d)) {
2892 return detail::ZipUpperSame(a, b);
2893 }
2894
2895 const Half<decltype(d)> d2;
2896 return InterleaveWholeLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
2897}
2898
2899// ------------------------------ Per4LaneBlockShuffle
2900
2901namespace detail {
2902
2903template <size_t kLaneSize, size_t kVectSize, class V,
2904 HWY_IF_NOT_T_SIZE_V(V, 8)>
2905HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/,
2906                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
2907 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
2908 V v) {
2909 const DFromV<decltype(v)> d;
2910 const RebindToUnsigned<decltype(d)> du;
2911 const RepartitionToWide<decltype(du)> dw;
2912
2913 const auto evens = BitCast(dw, ConcatEvenFull(v, v));
2914 return BitCast(d, ZipLowerSame(evens, evens));
2915}
2916
2917template <size_t kLaneSize, size_t kVectSize, class V,
2918 HWY_IF_NOT_T_SIZE_V(V, 8)>
2919HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/,
2920                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
2921 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
2922 V v) {
2923 const DFromV<decltype(v)> d;
2924 const RebindToUnsigned<decltype(d)> du;
2925 const RepartitionToWide<decltype(du)> dw;
2926
2927 const auto odds = BitCast(dw, ConcatOddFull(v, v));
2928 return BitCast(d, ZipLowerSame(odds, odds));
2929}
2930
2931} // namespace detail
2932
2933// ================================================== COMBINE
2934
2935namespace detail {
2936
2937#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2938template <class D, HWY_IF_T_SIZE_D(D, 1)>
2939svbool_t MaskLowerHalf(D d) {
2940 switch (Lanes(d)) {
2941 case 32:
2942 return svptrue_pat_b8(SV_VL16);
2943 case 16:
2944 return svptrue_pat_b8(SV_VL8);
2945 case 8:
2946 return svptrue_pat_b8(SV_VL4);
2947 case 4:
2948 return svptrue_pat_b8(SV_VL2);
2949 default:
2950 return svptrue_pat_b8(SV_VL1);
2951 }
2952}
2953template <class D, HWY_IF_T_SIZE_D(D, 2)>
2954svbool_t MaskLowerHalf(D d) {
2955 switch (Lanes(d)) {
2956 case 16:
2957 return svptrue_pat_b16(SV_VL8);
2958 case 8:
2959 return svptrue_pat_b16(SV_VL4);
2960 case 4:
2961 return svptrue_pat_b16(SV_VL2);
2962 default:
2963 return svptrue_pat_b16(SV_VL1);
2964 }
2965}
2966template <class D, HWY_IF_T_SIZE_D(D, 4)>
2967svbool_t MaskLowerHalf(D d) {
2968 switch (Lanes(d)) {
2969 case 8:
2970 return svptrue_pat_b32(SV_VL4);
2971 case 4:
2972 return svptrue_pat_b32(SV_VL2);
2973 default:
2974 return svptrue_pat_b32(SV_VL1);
2975 }
2976}
2977template <class D, HWY_IF_T_SIZE_D(D, 8)>
2978svbool_t MaskLowerHalf(D d) {
2979 switch (Lanes(d)) {
2980 case 4:
2981 return svptrue_pat_b64(SV_VL2);
2982 default:
2983 return svptrue_pat_b64(SV_VL1);
2984 }
2985}
2986#endif
2987#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2988template <class D, HWY_IF_T_SIZE_D(D, 1)>
2989svbool_t MaskLowerHalf(D d) {
2990 switch (Lanes(d)) {
2991 case 16:
2992 return svptrue_pat_b8(SV_VL8);
2993 case 8:
2994 return svptrue_pat_b8(SV_VL4);
2995 case 4:
2996 return svptrue_pat_b8(SV_VL2);
2997 case 2:
2998 case 1:
2999 default:
3000 return svptrue_pat_b8(SV_VL1);
3001 }
3002}
3003template <class D, HWY_IF_T_SIZE_D(D, 2)>
3004svbool_t MaskLowerHalf(D d) {
3005 switch (Lanes(d)) {
3006 case 8:
3007 return svptrue_pat_b16(SV_VL4);
3008 case 4:
3009 return svptrue_pat_b16(SV_VL2);
3010 case 2:
3011 case 1:
3012 default:
3013 return svptrue_pat_b16(SV_VL1);
3014 }
3015}
3016template <class D, HWY_IF_T_SIZE_D(D, 4)>
3017svbool_t MaskLowerHalf(D d) {
3018 return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
3019}
3020template <class D, HWY_IF_T_SIZE_D(D, 8)>
3021svbool_t MaskLowerHalf(D /*d*/) {
3022 return svptrue_pat_b64(SV_VL1);
3023}
3024#endif // HWY_TARGET == HWY_SVE2_128
3025#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
3026template <class D>
3027svbool_t MaskLowerHalf(D d) {
3028 return FirstN(d, Lanes(d) / 2);
3029}
3030#endif
3031
3032template <class D>
3033svbool_t MaskUpperHalf(D d) {
3034 // TODO(janwas): WHILEGE on SVE2
3035 if (HWY_SVE_IS_POW2 && IsFull(d)) {
3036 return Not(MaskLowerHalf(d));
3037 }
3038
3039 // For Splice to work as intended, make sure bits above Lanes(d) are zero.
3040  return AndNot(MaskLowerHalf(d), MakeMask(d));
3041}
3042
3043// Right-shift a vector pair by a constexpr number of lanes; can be used to
3044// slide down (=N) or up (=Lanes()-N).
3045#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \
3046 template <size_t kIndex> \
3047 HWY_API HWY_SVE_V(BASE, BITS) \
3048 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
3049 return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
3050 }
3051HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
3052#undef HWY_SVE_EXT
3053
3054} // namespace detail
3055
3056// ------------------------------ ConcatUpperLower
3057template <class D, class V>
3058HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
3059 return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
3060}
3061
3062// ------------------------------ ConcatLowerLower
3063template <class D, class V>
3064HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
3065 if (detail::IsFull(d)) {
3066#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
3067 return detail::ConcatEvenBlocks(hi, lo);
3068#endif
3069#if HWY_TARGET == HWY_SVE2_128
3070 const Repartition<uint64_t, D> du64;
3071 const auto lo64 = BitCast(du64, lo);
3072 return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
3073#endif
3074 }
3075 return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
3076}
3077
3078// ------------------------------ ConcatLowerUpper
3079template <class D, class V>
3080HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
3081#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
3082 if (detail::IsFull(d)) {
3083 return detail::Ext<Lanes(d) / 2>(hi, lo);
3084 }
3085#endif
3086 return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
3087}
3088
3089// ------------------------------ ConcatUpperUpper
3090template <class D, class V>
3091HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
3092 if (detail::IsFull(d)) {
3093#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
3094 return detail::ConcatOddBlocks(hi, lo);
3095#endif
3096#if HWY_TARGET == HWY_SVE2_128
3097 const Repartition<uint64_t, D> du64;
3098 const auto lo64 = BitCast(du64, lo);
3099 return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
3100#endif
3101 }
3102 const svbool_t mask_upper = detail::MaskUpperHalf(d);
3103 const V lo_upper = detail::Splice(lo, lo, mask_upper);
3104 return IfThenElse(mask_upper, hi, lo_upper);
3105}
3106
3107// ------------------------------ Combine
3108template <class D, class V2>
3109HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
3110 return ConcatLowerLower(d, hi, lo);
3111}
3112
3113// ------------------------------ ZeroExtendVector
3114template <class D, class V>
3115HWY_API V ZeroExtendVector(const D d, const V lo) {
3116 return Combine(d, Zero(Half<D>()), lo);
3117}
3118
3119// ------------------------------ Lower/UpperHalf
3120
3121template <class D2, class V>
3122HWY_API V LowerHalf(D2 /* tag */, const V v) {
3123 return v;
3124}
3125
3126template <class V>
3127HWY_API V LowerHalf(const V v) {
3128 return v;
3129}
3130
3131template <class DH, class V>
3132HWY_API V UpperHalf(const DH dh, const V v) {
3133 const Twice<decltype(dh)> d;
3134 // Cast so that we support bfloat16_t.
3135 const RebindToUnsigned<decltype(d)> du;
3136 const VFromD<decltype(du)> vu = BitCast(du, v);
3137#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
3138 return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
3139#else
3140 const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
3141 return BitCast(d, detail::Splice(vu, vu, mask));
3142#endif
3143}
3144
3145// ================================================== REDUCE
3146
3147#ifdef HWY_NATIVE_REDUCE_SCALAR
3148#undef HWY_NATIVE_REDUCE_SCALAR
3149#else
3150#define HWY_NATIVE_REDUCE_SCALAR
3151#endif
3152
3153// These return T, suitable for ReduceSum.
3154namespace detail {
3155#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
3156 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
3157 /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
3158 using T = HWY_SVE_T(BASE, BITS); \
3159 using TU = MakeUnsigned<T>; \
3160 constexpr uint64_t kMask = LimitsMax<TU>(); \
3161 return static_cast<T>(static_cast<TU>( \
3162 static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
3163 }
3164
3165#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
3166 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
3167 return sv##OP##_##CHAR##BITS(pg, v); \
3168 }
3169
3170HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
3171HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
3172
3173HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
3174HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
3175// NaN if all are
3176HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
3177HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
3178
3179#undef HWY_SVE_REDUCE
3180#undef HWY_SVE_REDUCE_ADD
3181} // namespace detail
3182
3183// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM are more
3184// efficient for N=4 I8/U8 reductions on SVE than the default implementations
3185// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
3186// generic_ops-inl.h.
3187#undef HWY_IF_REDUCE_D
3188#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
3189
3190#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
3191#undef HWY_NATIVE_REDUCE_SUM_4_UI8
3192#else
3193#define HWY_NATIVE_REDUCE_SUM_4_UI8
3194#endif
3195
3196#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3197#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3198#else
3199#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
3200#endif
3201
3202template <class D, HWY_IF_REDUCE_D(D)>
3203HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
3204  return detail::SumOfLanesM(detail::MakeMask(d), v);
3205}
3206
3207template <class D, HWY_IF_REDUCE_D(D)>
3208HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
3209  return detail::MinOfLanesM(detail::MakeMask(d), v);
3210}
3211
3212template <class D, HWY_IF_REDUCE_D(D)>
3213HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
3214  return detail::MaxOfLanesM(detail::MakeMask(d), v);
3215}
3216
3217// ------------------------------ SumOfLanes
3218
3219template <class D, HWY_IF_LANES_GT_D(D, 1)>
3220HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
3221  return Set(d, ReduceSum(d, v));
3222}
3223template <class D, HWY_IF_LANES_GT_D(D, 1)>
3224HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
3225  return Set(d, ReduceMin(d, v));
3226}
3227template <class D, HWY_IF_LANES_GT_D(D, 1)>
3228HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
3229  return Set(d, ReduceMax(d, v));
3230}
3231
3232// ================================================== SWIZZLE
3233
3234// ------------------------------ GetLane
3235
3236namespace detail {
3237#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
3238 HWY_INLINE HWY_SVE_T(BASE, BITS) \
3239 NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
3240 return sv##OP##_##CHAR##BITS(mask, v); \
3241 }
3242
3243HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
3244HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb)
3245#undef HWY_SVE_GET_LANE
3246} // namespace detail
3247
3248template <class V>
3249HWY_API TFromV<V> GetLane(V v) {
3250 return detail::GetLaneM(v, detail::PFalse());
3251}
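// Note (illustrative addition): GetLane exploits LASTA semantics: with an
// all-false predicate, svlasta wraps around and extracts lane 0, avoiding the
// cost of materializing a first-lane mask.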
3252
3253// ------------------------------ ExtractLane
3254template <class V>
3255HWY_API TFromV<V> ExtractLane(V v, size_t i) {
3256  return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
3257}
3258
3259// ------------------------------ InsertLane (IfThenElse)
3260template <class V>
3261HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
3262 const DFromV<V> d;
3263 const RebindToSigned<decltype(d)> di;
3264 using TI = TFromD<decltype(di)>;
3265 const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
3266 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
3267}
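// Usage sketch (illustrative addition, not from the original file): overwrite
// a single lane, leaving all others untouched.
namespace example_usage {
HWY_MAYBE_UNUSED inline svint32_t WithLane2Set(svint32_t v) {
  return InsertLane(v, /*i=*/2, int32_t{42});  // lane 2 becomes 42
}
}  // namespace example_usage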
3268
3269// ------------------------------ DupEven
3270
3271namespace detail {
3272HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
3273} // namespace detail
3274
3275template <class V>
3276HWY_API V DupEven(const V v) {
3277 return detail::InterleaveEven(v, v);
3278}
3279
3280// ------------------------------ DupOdd
3281
3282namespace detail {
3283HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
3284} // namespace detail
3285
3286template <class V>
3287HWY_API V DupOdd(const V v) {
3288 return detail::InterleaveOdd(v, v);
3289}
3290
3291// ------------------------------ OddEven
3292
3293#if HWY_SVE_HAVE_2
3294
3295#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
3296 HWY_API HWY_SVE_V(BASE, BITS) \
3297 NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
3298 return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \
3299 }
3300
3301HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
3302#undef HWY_SVE_ODD_EVEN
3303
3304template <class V, HWY_IF_FLOAT_V(V)>
3305HWY_API V OddEven(const V odd, const V even) {
3306 const DFromV<V> d;
3307 const RebindToUnsigned<decltype(d)> du;
3308 return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
3309}
3310
3311#else
3312
3313template <class V>
3314HWY_API V OddEven(const V odd, const V even) {
3315 const auto odd_in_even = detail::Ext<1>(odd, odd);
3316 return detail::InterleaveEven(even, odd_in_even);
3317}
3318
3319#endif // HWY_SVE_HAVE_2
3320
3321// ------------------------------ InterleaveEven
3322template <class D>
3323HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3324  return detail::InterleaveEven(a, b);
3325}
3326
3327// ------------------------------ InterleaveOdd
3328template <class D>
3329HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3330  return detail::InterleaveOdd(a, b);
3331}
3332
3333// ------------------------------ OddEvenBlocks
3334template <class V>
3335HWY_API V OddEvenBlocks(const V odd, const V even) {
3336 const DFromV<V> d;
3337#if HWY_TARGET == HWY_SVE_256
3338 return ConcatUpperLower(d, odd, even);
3339#elif HWY_TARGET == HWY_SVE2_128
3340 (void)odd;
3341 (void)d;
3342 return even;
3343#else
3344 const RebindToUnsigned<decltype(d)> du;
3345 using TU = TFromD<decltype(du)>;
3346 constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
3347 const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
3348 const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
3349 const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
3350 return IfThenElse(is_even, even, odd);
3351#endif
3352}
3353
3354// ------------------------------ TableLookupLanes
3355
3356template <class D, class VI>
3357HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
3358  using TI = TFromV<VI>;
3359 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
3360 const RebindToUnsigned<D> du;
3361 const auto indices = BitCast(du, vec);
3362#if HWY_IS_DEBUG_BUILD
3363 using TU = MakeUnsigned<TI>;
3364 const size_t twice_max_lanes = Lanes(d) * 2;
3365  HWY_DASSERT(AllTrue(
3366      du, Eq(indices,
3367 detail::AndN(indices, static_cast<TU>(twice_max_lanes - 1)))));
3368#else
3369 (void)d;
3370#endif
3371 return indices;
3372}
3373
3374template <class D, typename TI>
3375HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
3376  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
3377 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
3378}
3379
3380#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
3381 HWY_API HWY_SVE_V(BASE, BITS) \
3382 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
3383 return sv##OP##_##CHAR##BITS(v, idx); \
3384 }
3385
3386HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
3387#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3388HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE, TableLookupLanes, tbl)
3389#endif
3390#undef HWY_SVE_TABLE
3391
3392#if HWY_SVE_HAVE_2
3393namespace detail {
3394#define HWY_SVE_TABLE2(BASE, CHAR, BITS, HALF, NAME, OP) \
3395 HWY_API HWY_SVE_V(BASE, BITS) \
3396 NAME(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(uint, BITS) idx) { \
3397 return sv##OP##_##CHAR##BITS(tuple, idx); \
3398 }
3399
3400HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
3401#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3402HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE2, NativeTwoTableLookupLanes,
3403 tbl2)
3404#endif
3405#undef HWY_SVE_TABLE2
3406} // namespace detail
3407#endif // HWY_SVE_HAVE_2
3408
3409template <class D>
3410HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
3411                                       VFromD<RebindToUnsigned<D>> idx) {
3412  // SVE2 has an instruction for this, but it only works for full 2^n vectors.
3413#if HWY_SVE_HAVE_2 && HWY_SVE_IS_POW2
3414 if (detail::IsFull(d)) {
3415 return detail::NativeTwoTableLookupLanes(Create2(d, a, b), idx);
3416 }
3417#endif
3418 const RebindToUnsigned<decltype(d)> du;
3419 using TU = TFromD<decltype(du)>;
3420
3421 const size_t num_of_lanes = Lanes(d);
3422 const auto idx_mod = detail::AndN(idx, static_cast<TU>(num_of_lanes - 1));
3423 const auto sel_a_mask = Eq(idx, idx_mod);
3424
3425 const auto a_lookup_result = TableLookupLanes(a, idx_mod);
3426 const auto b_lookup_result = TableLookupLanes(b, idx_mod);
3427 return IfThenElse(sel_a_mask, a_lookup_result, b_lookup_result);
3428}
3429
3430template <class V>
3431HWY_API V TwoTablesLookupLanes(V a, V b,
3432                               VFromD<RebindToUnsigned<DFromV<V>>> idx) {
3433  const DFromV<decltype(a)> d;
3434 return TwoTablesLookupLanes(d, a, b, idx);
3435}
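// Semantics sketch (illustrative addition): indices in [0, Lanes(d)) select
// from `a`, and indices in [Lanes(d), 2 * Lanes(d)) select from `b`. With four
// lanes, idx = {0, 4, 1, 5} therefore yields {a0, b0, a1, b1}.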
3436
3437// ------------------------------ SwapAdjacentBlocks (TableLookupLanes)
3438
3439namespace detail {
3440
3441template <typename T, size_t N, int kPow2>
3442constexpr size_t LanesPerBlock(Simd<T, N, kPow2> d) {
3443  // We might have a capped vector smaller than a block, so honor that.
3444 return HWY_MIN(16 / sizeof(T), MaxLanes(d));
3445}
3446
3447} // namespace detail
3448
3449template <class V>
3450HWY_API V SwapAdjacentBlocks(const V v) {
3451  const DFromV<V> d;
3452#if HWY_TARGET == HWY_SVE_256
3453 return ConcatLowerUpper(d, v, v);
3454#elif HWY_TARGET == HWY_SVE2_128
3455 (void)d;
3456 return v;
3457#else
3458 const RebindToUnsigned<decltype(d)> du;
3459 constexpr auto kLanesPerBlock =
3460 static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d));
3461 const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
3462 return TableLookupLanes(v, idx);
3463#endif
3464}
3465
3466// ------------------------------ Reverse
3467
3468namespace detail {
3469
3470#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
3471 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
3472 return sv##OP##_##CHAR##BITS(v); \
3473 }
3474
3475HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
3476#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3477HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_REVERSE, ReverseFull, rev)
3478#endif
3479#undef HWY_SVE_REVERSE
3480
3481} // namespace detail
3482
3483template <class D, class V>
3484HWY_API V Reverse(D d, V v) {
3485 using T = TFromD<D>;
3486 const auto reversed = detail::ReverseFull(v);
3487 if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
3488 // Shift right to remove extra (non-pow2 and remainder) lanes.
3489 // TODO(janwas): on SVE2, use WHILEGE.
3490 // Avoids FirstN truncating to the return vector size. Must also avoid Not
3491 // because that is limited to SV_POW2.
3492 const ScalableTag<T> dfull;
3493 const svbool_t all_true = detail::AllPTrue(dfull);
3494 const size_t all_lanes = detail::AllHardwareLanes<T>();
3495 const size_t want_lanes = Lanes(d);
3496 HWY_DASSERT(want_lanes <= all_lanes);
3497 const svbool_t mask =
3498 svnot_b_z(all_true, FirstN(dfull, all_lanes - want_lanes));
3499 return detail::Splice(reversed, reversed, mask);
3500}
3501
3502// ------------------------------ Reverse2
3503
3504// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
3505#ifdef HWY_NATIVE_REVERSE2_8
3506#undef HWY_NATIVE_REVERSE2_8
3507#else
3508#define HWY_NATIVE_REVERSE2_8
3509#endif
3510
3511template <class D, HWY_IF_T_SIZE_D(D, 1)>
3512HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3513 const RebindToUnsigned<decltype(d)> du;
3514 const RepartitionToWide<decltype(du)> dw;
3515 return BitCast(d, svrevb_u16_x(detail::PTrue(d), BitCast(dw, v)));
3516}
3517
3518template <class D, HWY_IF_T_SIZE_D(D, 2)>
3519HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3520 const RebindToUnsigned<decltype(d)> du;
3521 const RepartitionToWide<decltype(du)> dw;
3522 return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v)));
3523}
3524
3525template <class D, HWY_IF_T_SIZE_D(D, 4)>
3526HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3527 const RebindToUnsigned<decltype(d)> du;
3528 const RepartitionToWide<decltype(du)> dw;
3529 return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v)));
3530}
3531
3532template <class D, HWY_IF_T_SIZE_D(D, 8)>
3533HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
3534#if HWY_TARGET == HWY_SVE2_128
3535 if (detail::IsFull(d)) {
3536 return detail::Ext<1>(v, v);
3537 }
3538#endif
3539 (void)d;
3540 const auto odd_in_even = detail::Ext<1>(v, v); // x321
3541 return detail::InterleaveEven(odd_in_even, v); // 2301
3542}
3543
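// Worked example of the generic path above (illustrative note, not from the
// original source): for lanes v = 3210 (lane 0 rightmost), Ext<1>(v, v)
// rotates down by one lane to give x321, and InterleaveEven(x321, v) takes
// the even lanes of each operand, yielding 2301: every adjacent pair of
// lanes is swapped.
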
3544// ------------------------------ Reverse4 (TableLookupLanes)
3545
3546template <class D, HWY_IF_T_SIZE_D(D, 1)>
3547HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
3548 const RebindToUnsigned<decltype(d)> du;
3549 const RepartitionToWideX2<decltype(du)> du32;
3550 return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v)));
3551}
3552
3553template <class D, HWY_IF_T_SIZE_D(D, 2)>
3554HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
3555 const RebindToUnsigned<decltype(d)> du;
3556 const RepartitionToWideX2<decltype(du)> du64;
3557 return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v)));
3558}
3559
3560template <class D, HWY_IF_T_SIZE_D(D, 4)>
3561HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
3562 if (HWY_TARGET == HWY_SVE2_128 && detail::IsFull(d)) {
3563 return detail::ReverseFull(v);
3564 }
3565 // TODO(janwas): is this approach faster than Shuffle0123?
3566 const RebindToUnsigned<decltype(d)> du;
3567 const auto idx = detail::XorN(Iota(du, 0), 3);
3568 return TableLookupLanes(v, idx);
3569}
3570
3571template <class D, HWY_IF_T_SIZE_D(D, 8)>
3572HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
3573 if (HWY_TARGET == HWY_SVE_256 && detail::IsFull(d)) {
3574 return detail::ReverseFull(v);
3575 }
3576 // TODO(janwas): is this approach faster than Shuffle0123?
3577 const RebindToUnsigned<decltype(d)> du;
3578 const auto idx = detail::XorN(Iota(du, 0), 3);
3579 return TableLookupLanes(v, idx);
3580}
3581
3582// ------------------------------ Reverse8 (TableLookupLanes)
3583
3584template <class D, HWY_IF_T_SIZE_D(D, 1)>
3585HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
3586 const Repartition<uint64_t, decltype(d)> du64;
3587 return BitCast(d, svrevb_u64_x(detail::PTrue(d), BitCast(du64, v)));
3588}
3589
3590template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3591HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
3592 const RebindToUnsigned<decltype(d)> du;
3593 const auto idx = detail::XorN(Iota(du, 0), 7);
3594 return TableLookupLanes(v, idx);
3595}
3596
3597// ------------------------------- ReverseBits
3598
3599#ifdef HWY_NATIVE_REVERSE_BITS_UI8
3600#undef HWY_NATIVE_REVERSE_BITS_UI8
3601#else
3602#define HWY_NATIVE_REVERSE_BITS_UI8
3603#endif
3604
3605#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
3606#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
3607#else
3608#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
3609#endif
3610
3611#define HWY_SVE_REVERSE_BITS(BASE, CHAR, BITS, HALF, NAME, OP) \
3612 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
3613 const DFromV<decltype(v)> d; \
3614 return sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v); \
3615 }
3616
3617HWY_SVE_FOREACH_UI(HWY_SVE_REVERSE_BITS, ReverseBits, rbit)
3618#undef HWY_SVE_REVERSE_BITS
3619
3620// ------------------------------ SlideUpLanes
3621
3622template <class D>
3623HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3624 return detail::Splice(v, Zero(d), FirstN(d, amt));
3625}
3626
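// Usage sketch (illustrative, not from the original source; assumes
// namespace hn = hwy::HWY_NAMESPACE): the Splice keeps `amt` lanes of
// Zero(d) below the lanes of v.
//   const hn::ScalableTag<int32_t> d;
//   const auto v = hn::Iota(d, 1);             // 1, 2, 3, ...
//   const auto r = hn::SlideUpLanes(d, v, 2);  // 0, 0, 1, 2, ...
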
3627// ------------------------------ Slide1Up
3628
3629#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
3630#undef HWY_NATIVE_SLIDE1_UP_DOWN
3631#else
3632#define HWY_NATIVE_SLIDE1_UP_DOWN
3633#endif
3634
3635template <class D>
3636HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
3637 return SlideUpLanes(d, v, 1);
3638}
3639
3640// ------------------------------ SlideDownLanes (TableLookupLanes)
3641
3642template <class D>
3643HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3644 const RebindToUnsigned<decltype(d)> du;
3645 using TU = TFromD<decltype(du)>;
3646 const auto idx = Iota(du, static_cast<TU>(amt));
3647 return IfThenElseZero(FirstN(d, Lanes(d) - amt), TableLookupLanes(v, idx));
3648}
3649
3650// ------------------------------ Slide1Down
3651
3652template <class D>
3653HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
3654 return SlideDownLanes(d, v, 1);
3655}
3656
3657// ------------------------------ Block insert/extract/broadcast ops
3658#if HWY_TARGET != HWY_SVE2_128
3659
3660#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
3661#undef HWY_NATIVE_BLK_INSERT_EXTRACT
3662#else
3663#define HWY_NATIVE_BLK_INSERT_EXTRACT
3664#endif
3665
3666template <int kBlockIdx, class V>
3667HWY_API V InsertBlock(V v, V blk_to_insert) {
3668 const DFromV<decltype(v)> d;
3669 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
3670 "Invalid block index");
3671
3672#if HWY_TARGET == HWY_SVE_256
3673 return (kBlockIdx == 0) ? ConcatUpperLower(d, v, blk_to_insert)
3674 : ConcatLowerLower(d, blk_to_insert, v);
3675#else
3676 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
3677
3678 constexpr size_t kBlockOffset =
3679 static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
3680 const auto splice_mask = FirstN(d, kBlockOffset);
3681 const auto sel_lo_mask = FirstN(d, kBlockOffset + kLanesPerBlock);
3682
3683 const auto splice_result = detail::Splice(blk_to_insert, v, splice_mask);
3684 return IfThenElse(sel_lo_mask, splice_result, v);
3685#endif
3686}
3687
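// Worked example of the generic path above (illustrative note, not from the
// original source): with 32-bit lanes, kLanesPerBlock = 4. For kBlockIdx = 1,
// kBlockOffset = 4, so Splice places the first 4 lanes of v below
// blk_to_insert; lanes 4..7 of splice_result are the block's lanes 0..3.
// sel_lo_mask = FirstN(8) then takes lanes 0..7 from splice_result and all
// higher lanes from the unmodified v.
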
3688template <int kBlockIdx, class V>
3689HWY_API V ExtractBlock(V v) {
3690 const DFromV<decltype(v)> d;
3691 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
3692 "Invalid block index");
3693
3694 if (kBlockIdx == 0) return v;
3695
3696#if HWY_TARGET == HWY_SVE_256
3697 return UpperHalf(Half<decltype(d)>(), v);
3698#else
3699 const RebindToUnsigned<decltype(d)> du;
3700 using TU = TFromD<decltype(du)>;
3701 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
3702 constexpr size_t kBlockOffset =
3703 static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
3704 const auto splice_mask =
3705 RebindMask(d, detail::LtN(Iota(du, static_cast<TU>(0u - kBlockOffset)),
3706 static_cast<TU>(kLanesPerBlock)));
3707 return detail::Splice(v, v, splice_mask);
3708#endif
3709}
3710
3711template <int kBlockIdx, class V>
3712HWY_API V BroadcastBlock(V v) {
3713 const DFromV<decltype(v)> d;
3714 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
3715 "Invalid block index");
3716
3717 const RebindToUnsigned<decltype(d)> du; // for bfloat16_t
3718 using VU = VFromD<decltype(du)>;
3719 const VU vu = BitCast(du, v);
3720
3721#if HWY_TARGET == HWY_SVE_256
3722 return BitCast(d, (kBlockIdx == 0) ? ConcatLowerLower(du, vu, vu)
3723 : ConcatUpperUpper(du, vu, vu));
3724#else
3725 using TU = TFromD<decltype(du)>;
3726 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
3727 constexpr size_t kBlockOffset =
3728 static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
3729
3730 const VU idx = detail::AddN(
3731 detail::AndN(Iota(du, TU{0}), static_cast<TU>(kLanesPerBlock - 1)),
3732 static_cast<TU>(kBlockOffset));
3733 return BitCast(d, TableLookupLanes(vu, idx));
3734#endif
3735}
3736
3737#endif // HWY_TARGET != HWY_SVE2_128
3738
3739// ------------------------------ Compress (PromoteTo)
3740
3741template <typename T>
3742struct CompressIsPartition {
3743#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
3744 // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
3745 // requires a larger table).
3746 enum { value = (sizeof(T) == 8) };
3747#else
3748 enum { value = 0 };
3749#endif // HWY_TARGET == HWY_SVE_256
3750};
3751
3752#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
3753 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
3754 return sv##OP##_##CHAR##BITS(mask, v); \
3755 }
3756
3757#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
3758HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
3759HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
3760#else
3761HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
3762#endif
3763#undef HWY_SVE_COMPRESS
3764
3765#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3766template <class V, HWY_IF_T_SIZE_V(V, 8)>
3767HWY_API V Compress(V v, svbool_t mask) {
3768 const DFromV<V> d;
3769 const RebindToUnsigned<decltype(d)> du64;
3770
3771 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
3772 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
3773 // SetTableIndices.
3774 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
3775 const size_t offset = detail::SumOfLanesM(mask, bits);
3776
3777 // See CompressIsPartition.
3778 alignas(16) static constexpr uint64_t table[4 * 16] = {
3779 // PrintCompress64x4Tables
3780 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
3781 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
3782 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
3783 return TableLookupLanes(v, SetTableIndices(d, table + offset));
3784}
3785
3786#endif // HWY_TARGET == HWY_SVE_256
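
// Worked example (illustrative note, not from the original source):
// Iota(du64, 2) yields shift counts 2, 3, 4, 5, so the masked sum equals
// 4 * (1*m0 + 2*m1 + 4*m2 + 8*m3): the 4-bit mask value pre-scaled by the
// 4-lane row size. For mask = {1, 0, 1, 0}, offset = 4 * 5 = 20, which
// selects row {0, 2, 1, 3}: lanes 0 and 2 move to the front, followed by
// the remaining lanes, as CompressIsPartition permits.
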
3787#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
3788template <class V, HWY_IF_T_SIZE_V(V, 8)>
3789HWY_API V Compress(V v, svbool_t mask) {
3790 // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
3791 // swaps upper/lower (the lower half is set to the upper half, and the
3792 // remaining upper half is filled from the lower half of the second v), and
3793 // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
3794 // unchanged and map everything else to 00.
3795 const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
3796 return detail::Splice(v, v, AndNot(maskLL, mask));
3797}
3798
3799#endif // HWY_TARGET == HWY_SVE2_128
3800
3801template <class V, HWY_IF_T_SIZE_V(V, 2)>
3802HWY_API V Compress(V v, svbool_t mask16) {
3803 static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
3804 const DFromV<V> d16;
3805
3806 // Promote vector and mask to 32-bit
3807 const RepartitionToWide<decltype(d16)> dw;
3808 const auto v32L = PromoteTo(dw, v);
3809 const auto v32H = detail::PromoteUpperTo(dw, v);
3810 const svbool_t mask32L = svunpklo_b(mask16);
3811 const svbool_t mask32H = svunpkhi_b(mask16);
3812
3813 const auto compressedL = Compress(v32L, mask32L);
3814 const auto compressedH = Compress(v32H, mask32H);
3815
3816 // Demote to 16-bit (already in range) - separately so we can splice
3817 const V evenL = BitCast(d16, compressedL);
3818 const V evenH = BitCast(d16, compressedH);
3819 const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half
3820 const V v16H = detail::ConcatEvenFull(evenH, evenH);
3821
3822 // We need to combine two vectors of non-constexpr length, so the only option
3823 // is Splice, which requires us to synthesize a mask. NOTE: this function uses
3824 // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
3825 const size_t countL = detail::CountTrueFull(dw, mask32L);
3826 const auto compressed_maskL = FirstN(d16, countL);
3827 return detail::Splice(v16H, v16L, compressed_maskL);
3828}
3829
3830// Must treat float16_t as integers so we can ConcatEven.
3831HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
3832 const DFromV<decltype(v)> df;
3833 const RebindToSigned<decltype(df)> di;
3834 return BitCast(df, Compress(BitCast(di, v), mask16));
3835}
3836
3837// ------------------------------ CompressNot
3838
3839// 2 or 4 bytes
3840template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4))>
3841HWY_API V CompressNot(V v, const svbool_t mask) {
3842 return Compress(v, Not(mask));
3843}
3844
3845template <class V, HWY_IF_T_SIZE_V(V, 8)>
3846HWY_API V CompressNot(V v, svbool_t mask) {
3847#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
3848 // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
3849 // swaps upper/lower (the lower half is set to the upper half, and the
3850 // remaining upper half is filled from the lower half of the second v), and
3851 // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
3852 // 01 to 10, and everything else to 00.
3853 const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
3854 return detail::Splice(v, v, AndNot(mask, maskLL));
3855#endif
3856#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3857 const DFromV<V> d;
3858 const RebindToUnsigned<decltype(d)> du64;
3859
3860 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
3861 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
3862 // SetTableIndices.
3863 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
3864 const size_t offset = detail::SumOfLanesM(mask, bits);
3865
3866 // See CompressIsPartition.
3867 alignas(16) static constexpr uint64_t table[4 * 16] = {
3868 // PrintCompressNot64x4Tables
3869 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
3870 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
3871 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
3872 return TableLookupLanes(v, SetTableIndices(d, table + offset));
3873#endif // HWY_TARGET == HWY_SVE_256
3874
3875 return Compress(v, Not(mask));
3876}
3877
3878// ------------------------------ CompressBlocksNot
3879HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
3880#if HWY_TARGET == HWY_SVE2_128
3881 (void)mask;
3882 return v;
3883#endif
3884#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3885 uint64_t bits = 0; // predicate reg is 32-bit
3886 CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient
3887 // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
3888 const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
3889 // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
3890 alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
3891 0, 1, 2, 3, 0, 1, 2, 3};
3892 const ScalableTag<uint64_t> d;
3893 return TableLookupLanes(v, SetTableIndices(d, table + offset));
3894#endif
3895
3896 return CompressNot(v, mask);
3897}
3898
3899// ------------------------------ CompressStore
3900template <class V, class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3901HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
3902 TFromD<D>* HWY_RESTRICT unaligned) {
3903 StoreU(Compress(v, mask), d, unaligned);
3904 return CountTrue(d, mask);
3905}
3906
3907// ------------------------------ CompressBlendedStore
3908template <class V, class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3909HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
3910 TFromD<D>* HWY_RESTRICT unaligned) {
3911 const size_t count = CountTrue(d, mask);
3912 const svbool_t store_mask = FirstN(d, count);
3913 BlendedStore(Compress(v, mask), store_mask, d, unaligned);
3914 return count;
3915}
3916
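// Usage sketch (illustrative, not from the original source; assumes
// namespace hn = hwy::HWY_NAMESPACE):
//   const hn::ScalableTag<float> d;
//   const auto keep = hn::Lt(v, hn::Zero(d));  // e.g. keep negative lanes
//   const size_t n = hn::CompressBlendedStore(v, keep, d, out);
//   // out[0, n) holds the selected lanes in order; out[n, ...) is untouched.
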
3917// ================================================== MASK (2)
3918
3919// ------------------------------ FindKnownLastTrue
3920template <class D>
3921HWY_API size_t FindKnownLastTrue(D d, svbool_t m) {
3922 const RebindToUnsigned<decltype(d)> du;
3923 return static_cast<size_t>(detail::ExtractLastMatchingLaneM(
3924 Iota(du, 0), And(m, detail::MakeMask(d))));
3925}
3926
3927// ------------------------------ FindLastTrue
3928template <class D>
3929HWY_API intptr_t FindLastTrue(D d, svbool_t m) {
3930 return AllFalse(d, m) ? intptr_t{-1}
3931 : static_cast<intptr_t>(FindKnownLastTrue(d, m));
3932}
3933
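// Usage sketch (illustrative, not from the original source):
//   const hn::ScalableTag<uint8_t> d;
//   const intptr_t last = hn::FindLastTrue(d, hn::FirstN(d, 3));  // 2
//   // All-false masks return -1; FindKnownLastTrue requires >= 1 true lane.
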
3934// ================================================== BLOCKWISE
3935
3936// ------------------------------ CombineShiftRightBytes
3937
3938// Prevent accidentally using these for 128-bit vectors - should not be
3939// necessary.
3940#if HWY_TARGET != HWY_SVE2_128
3941namespace detail {
3942
3943// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
3944// offsets are implicitly relative to the start of their 128-bit block.
3945template <class D, class V>
3946HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
3947 using T = MakeUnsigned<TFromD<D>>;
3948 return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
3949}
3950
3951template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 1)>
3952svbool_t FirstNPerBlock(D d) {
3953 const RebindToUnsigned<decltype(d)> du;
3954 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
3955 const svuint8_t idx_mod =
3956 svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
3957 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
3958 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
3959 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
3960 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
3961 15 % kLanesPerBlock);
3962 return detail::LtN(BitCast(du, idx_mod), kLanes);
3963}
3964template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 2)>
3965svbool_t FirstNPerBlock(D d) {
3966 const RebindToUnsigned<decltype(d)> du;
3967 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
3968 const svuint16_t idx_mod =
3969 svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
3970 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
3971 6 % kLanesPerBlock, 7 % kLanesPerBlock);
3972 return detail::LtN(BitCast(du, idx_mod), kLanes);
3973}
3974template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 4)>
3975svbool_t FirstNPerBlock(D d) {
3976 const RebindToUnsigned<decltype(d)> du;
3977 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
3978 const svuint32_t idx_mod =
3979 svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
3980 3 % kLanesPerBlock);
3981 return detail::LtN(BitCast(du, idx_mod), kLanes);
3982}
3983template <size_t kLanes, class D, HWY_IF_T_SIZE_D(D, 8)>
3984svbool_t FirstNPerBlock(D d) {
3985 const RebindToUnsigned<decltype(d)> du;
3986 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
3987 const svuint64_t idx_mod =
3988 svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
3989 return detail::LtN(BitCast(du, idx_mod), kLanes);
3990}
3991
3992} // namespace detail
3993#endif // HWY_TARGET != HWY_SVE2_128
3994
3995template <size_t kBytes, class D, class V = VFromD<D>>
3996HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
3997 const Repartition<uint8_t, decltype(d)> d8;
3998 const auto hi8 = BitCast(d8, hi);
3999 const auto lo8 = BitCast(d8, lo);
4000#if HWY_TARGET == HWY_SVE2_128
4001 return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
4002#else
4003 const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
4004 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
4005 const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
4006 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
4007#endif
4008}
4009
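// Worked example (illustrative note, not from the original source): within
// each 128-bit block, the result is bytes kBytes..15 of lo followed by
// bytes 0..kBytes-1 of hi, matching x86 palignr. E.g. kBytes = 4 with u8
// lanes yields lo[4..15], hi[0..3] per block.
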
4010// ------------------------------ Shuffle2301
4011template <class V>
4012HWY_API V Shuffle2301(const V v) {
4013 const DFromV<V> d;
4014 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
4015 return Reverse2(d, v);
4016}
4017
4018// ------------------------------ Shuffle2103
4019template <class V>
4020HWY_API V Shuffle2103(const V v) {
4021 const DFromV<V> d;
4022 const Repartition<uint8_t, decltype(d)> d8;
4023 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
4024 const svuint8_t v8 = BitCast(d8, v);
4025 return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
4026}
4027
4028// ------------------------------ Shuffle0321
4029template <class V>
4030HWY_API V Shuffle0321(const V v) {
4031 const DFromV<V> d;
4032 const Repartition<uint8_t, decltype(d)> d8;
4033 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
4034 const svuint8_t v8 = BitCast(d8, v);
4035 return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
4036}
4037
4038// ------------------------------ Shuffle1032
4039template <class V>
4040HWY_API V Shuffle1032(const V v) {
4041 const DFromV<V> d;
4042 const Repartition<uint8_t, decltype(d)> d8;
4043 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
4044 const svuint8_t v8 = BitCast(d8, v);
4045 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
4046}
4047
4048// ------------------------------ Shuffle01
4049template <class V>
4050HWY_API V Shuffle01(const V v) {
4051 const DFromV<V> d;
4052 const Repartition<uint8_t, decltype(d)> d8;
4053 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
4054 const svuint8_t v8 = BitCast(d8, v);
4055 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
4056}
4057
4058// ------------------------------ Shuffle0123
4059template <class V>
4060HWY_API V Shuffle0123(const V v) {
4061 return Shuffle2301(Shuffle1032(v));
4062}
4063
4064// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
4065template <class D, class V = VFromD<D>>
4066HWY_API V ReverseBlocks(D d, V v) {
4067#if HWY_TARGET == HWY_SVE_256
4068 if (detail::IsFull(d)) {
4069 return SwapAdjacentBlocks(v);
4070 } else if (detail::IsFull(Twice<D>())) {
4071 return v;
4072 }
4073#elif HWY_TARGET == HWY_SVE2_128
4074 (void)d;
4075 return v;
4076#endif
4077 const Repartition<uint64_t, D> du64;
4078 return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
4079}
4080
4081// ------------------------------ TableLookupBytes
4082
4083template <class V, class VI>
4084HWY_API VI TableLookupBytes(const V v, const VI idx) {
4085 const DFromV<VI> d;
4086 const Repartition<uint8_t, decltype(d)> du8;
4087#if HWY_TARGET == HWY_SVE2_128
4088 return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
4089#else
4090 const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
4091 const auto idx8 = Add(BitCast(du8, idx), offsets128);
4092 return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
4093#endif
4094}
4095
4096template <class V, class VI>
4097HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
4098 const DFromV<VI> d;
4099 // Mask size must match vector type, so cast everything to this type.
4100 const Repartition<int8_t, decltype(d)> di8;
4101
4102 auto idx8 = BitCast(di8, idx);
4103 const auto msb = detail::LtN(idx8, 0);
4104
4105 const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
4106 return BitCast(d, IfThenZeroElse(msb, lookup));
4107}
4108
4109// ------------------------------ Broadcast
4110
4111#ifdef HWY_NATIVE_BROADCASTLANE
4112#undef HWY_NATIVE_BROADCASTLANE
4113#else
4114#define HWY_NATIVE_BROADCASTLANE
4115#endif
4116
4117namespace detail {
4118#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
4119 template <int kLane> \
4120 HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
4121 return sv##OP##_##CHAR##BITS(v, kLane); \
4122 }
4123
4124HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane)
4125#undef HWY_SVE_BROADCAST
4126} // namespace detail
4127
4128template <int kLane, class V>
4129HWY_API V Broadcast(const V v) {
4130 const DFromV<V> d;
4131 const RebindToUnsigned<decltype(d)> du;
4132 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
4133 static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
4134#if HWY_TARGET == HWY_SVE2_128
4135 return detail::BroadcastLane<kLane>(v);
4136#else
4137 auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
4138 if (kLane != 0) {
4139 idx = detail::AddN(idx, kLane);
4140 }
4141 return TableLookupLanes(v, idx);
4142#endif
4143}
4144
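// Usage sketch (illustrative, not from the original source): Broadcast
// replicates lane kLane within each 128-bit block.
//   const hn::ScalableTag<uint32_t> d;
//   const auto v = hn::Iota(d, 0);       // 0 1 2 3 | 4 5 6 7 | ...
//   const auto r = hn::Broadcast<2>(v);  // 2 2 2 2 | 6 6 6 6 | ...
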
4145template <int kLane, class V>
4146HWY_API V BroadcastLane(const V v) {
4147 static_assert(0 <= kLane && kLane < HWY_MAX_LANES_V(V), "Invalid lane");
4148 return detail::BroadcastLane<kLane>(v);
4149}
4150
4151// ------------------------------ ShiftLeftLanes
4152
4153template <size_t kLanes, class D, class V = VFromD<D>>
4154HWY_API V ShiftLeftLanes(D d, const V v) {
4155 const auto zero = Zero(d);
4156 const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
4157#if HWY_TARGET == HWY_SVE2_128
4158 return shifted;
4159#else
4160 // Match x86 semantics by zeroing lower lanes in 128-bit blocks
4161 return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
4162#endif
4163}
4164
4165template <size_t kLanes, class V>
4166HWY_API V ShiftLeftLanes(const V v) {
4167 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
4168}
4169
4170// ------------------------------ ShiftRightLanes
4171template <size_t kLanes, class D, class V = VFromD<D>>
4172HWY_API V ShiftRightLanes(D d, V v) {
4173 // For capped/fractional vectors, clear upper lanes so we shift in zeros.
4174 if (!detail::IsFull(d)) {
4175 v = IfThenElseZero(detail::MakeMask(d), v);
4176 }
4177
4178#if HWY_TARGET == HWY_SVE2_128
4179 return detail::Ext<kLanes>(Zero(d), v);
4180#else
4181 const auto shifted = detail::Ext<kLanes>(v, v);
4182 // Match x86 semantics by zeroing upper lanes in 128-bit blocks
4183 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
4184 const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
4185 return IfThenElseZero(mask, shifted);
4186#endif
4187}
4188
4189// ------------------------------ ShiftLeftBytes
4190
4191template <int kBytes, class D, class V = VFromD<D>>
4192HWY_API V ShiftLeftBytes(const D d, const V v) {
4193 const Repartition<uint8_t, decltype(d)> d8;
4194 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
4195}
4196
4197template <int kBytes, class V>
4198HWY_API V ShiftLeftBytes(const V v) {
4199 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
4200}
4201
4202// ------------------------------ ShiftRightBytes
4203template <int kBytes, class D, class V = VFromD<D>>
4204HWY_API V ShiftRightBytes(const D d, const V v) {
4205 const Repartition<uint8_t, decltype(d)> d8;
4206 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
4207}
4208
4209// ------------------------------ ZipLower
4210
4211template <class V, class DW = RepartitionToWide<DFromV<V>>>
4212HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4213 const RepartitionToNarrow<DW> dn;
4214 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
4215 return BitCast(dw, InterleaveLower(dn, a, b));
4216}
4217template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4218HWY_API VFromD<DW> ZipLower(const V a, const V b) {
4219 return BitCast(DW(), InterleaveLower(D(), a, b));
4220}
4221
4222// ------------------------------ ZipUpper
4223template <class V, class DW = RepartitionToWide<DFromV<V>>>
4224HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4225 const RepartitionToNarrow<DW> dn;
4226 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
4227 return BitCast(dw, InterleaveUpper(dn, a, b));
4228}
4229
4230// ================================================== Ops with dependencies
4231
4232// ------------------------------ AddSub (Reverse2)
4233
4234// NOTE: svcadd_f*_x(HWY_SVE_PTRUE(BITS), a, b, 90) computes a[i] - b[i + 1] in
4235// the even lanes and a[i] + b[i - 1] in the odd lanes.
4236
4237#define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP) \
4238 HWY_API HWY_SVE_V(BASE, BITS) \
4239 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4240 const DFromV<decltype(b)> d; \
4241 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, Reverse2(d, b), \
4242 90); \
4243 }
4244
4245HWY_SVE_FOREACH_F(HWY_SVE_ADDSUB_F, AddSub, cadd)
4246
4247#undef HWY_SVE_ADDSUB_F
4248
4249// NOTE: svcadd_s*(a, b, 90) and svcadd_u*(a, b, 90) compute a[i] - b[i + 1] in
4250// the even lanes and a[i] + b[i - 1] in the odd lanes.
4251
4252#if HWY_SVE_HAVE_2
4253#define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP) \
4254 HWY_API HWY_SVE_V(BASE, BITS) \
4255 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4256 const DFromV<decltype(b)> d; \
4257 return sv##OP##_##CHAR##BITS(a, Reverse2(d, b), 90); \
4258 }
4259
4260HWY_SVE_FOREACH_UI(HWY_SVE_ADDSUB_UI, AddSub, cadd)
4261
4262#undef HWY_SVE_ADDSUB_UI
4263
4264// Disable the default implementation of AddSub in generic_ops-inl.h on SVE2
4265#undef HWY_IF_ADDSUB_V
4266#define HWY_IF_ADDSUB_V(V) \
4267 HWY_IF_LANES_GT_D(DFromV<V>, 1), \
4268 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4269
4270#else // !HWY_SVE_HAVE_2
4271
4272// Disable the default implementation of AddSub in generic_ops-inl.h for
4273// floating-point vectors on SVE, but enable the default implementation of
4274// AddSub in generic_ops-inl.h for integer vectors on SVE targets that do
4275// not support SVE2.
4276#undef HWY_IF_ADDSUB_V
4277#define HWY_IF_ADDSUB_V(V) \
4278 HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4279
4280#endif // HWY_SVE_HAVE_2
4281
4282// ------------------------------ MulAddSub (AddSub)
4283
4284template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_FLOAT_V(V)>
4285HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4286 using T = TFromV<V>;
4287
4288 const DFromV<V> d;
4289 const T neg_zero = ConvertScalarTo<T>(-0.0f);
4290
4291 return MulAdd(mul, x, AddSub(Set(d, neg_zero), sub_or_add));
4292}
4293
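// Note on the identity above (illustrative, not from the original source):
// AddSub(Set(d, -0.0), s) computes -0.0 - s = -s in even lanes and
// -0.0 + s = +s in odd lanes, so the MulAdd yields mul*x - sub_or_add in
// even lanes and mul*x + sub_or_add in odd lanes, exactly MulAddSub.
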
4294#if HWY_SVE_HAVE_2
4295
4296// Disable the default implementation of MulAddSub in generic_ops-inl.h on SVE2
4297#undef HWY_IF_MULADDSUB_V
4298#define HWY_IF_MULADDSUB_V(V) \
4299 HWY_IF_LANES_GT_D(DFromV<V>, 1), \
4300 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4301
4302template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
4303 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4304HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4305 const DFromV<V> d;
4306 return MulAdd(mul, x, AddSub(Zero(d), sub_or_add));
4307}
4308
4309#else // !HWY_SVE_HAVE_2
4310
4311// Disable the default implementation of MulAddSub in generic_ops-inl.h for
4312// floating-point vectors on SVE, but enable the default implementation of
4313// MulAddSub in generic_ops-inl.h for integer vectors on SVE targets that
4314// do not support SVE2.
4315#undef HWY_IF_MULADDSUB_V
4316#define HWY_IF_MULADDSUB_V(V) \
4317 HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4318
4319#endif // HWY_SVE_HAVE_2
4320
4321// ------------------------------ PromoteTo bfloat16 (ZipLower)
4322template <size_t N, int kPow2>
4323HWY_API svfloat32_t PromoteTo(Simd<float, N, kPow2> df32, VBF16 v) {
4324 const ScalableTag<uint16_t> du16;
4325 return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), BitCast(du16, v)));
4326}
4327
4328// ------------------------------ PromoteEvenTo/PromoteOddTo (ConcatOddFull)
4329
4330namespace detail {
4331
4332// Signed to signed PromoteEvenTo
4333template <class D>
4334HWY_INLINE svint16_t PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4335 hwy::SizeTag<2> /*to_lane_size_tag*/,
4336 hwy::SignedTag /*from_type_tag*/, D d_to,
4337 svint8_t v) {
4338 return svextb_s16_x(detail::PTrue(d_to), BitCast(d_to, v));
4339}
4340
4341template <class D>
4342HWY_INLINE svint32_t PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4343 hwy::SizeTag<4> /*to_lane_size_tag*/,
4344 hwy::SignedTag /*from_type_tag*/, D d_to,
4345 svint16_t v) {
4346 return svexth_s32_x(detail::PTrue(d_to), BitCast(d_to, v));
4347}
4348
4349template <class D>
4350HWY_INLINE svint64_t PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4351 hwy::SizeTag<8> /*to_lane_size_tag*/,
4352 hwy::SignedTag /*from_type_tag*/, D d_to,
4353 svint32_t v) {
4354 return svextw_s64_x(detail::PTrue(d_to), BitCast(d_to, v));
4355}
4356
4357// F16->F32 PromoteEvenTo
4358template <class D>
4359HWY_INLINE svfloat32_t PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4360 hwy::SizeTag<4> /*to_lane_size_tag*/,
4361 hwy::FloatTag /*from_type_tag*/, D d_to,
4362 svfloat16_t v) {
4363 const Repartition<float, decltype(d_to)> d_from;
4364 return svcvt_f32_f16_x(detail::PTrue(d_from), v);
4365}
4366
4367// F32->F64 PromoteEvenTo
4368template <class D>
4369HWY_INLINE svfloat64_t PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4370 hwy::SizeTag<8> /*to_lane_size_tag*/,
4371 hwy::FloatTag /*from_type_tag*/, D d_to,
4372 svfloat32_t v) {
4373 const Repartition<float, decltype(d_to)> d_from;
4374 return svcvt_f64_f32_x(detail::PTrue(d_from), v);
4375}
4376
4377// I32->F64 PromoteEvenTo
4378template <class D>
4379HWY_INLINE svfloat64_t PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4380 hwy::SizeTag<8> /*to_lane_size_tag*/,
4381 hwy::SignedTag /*from_type_tag*/, D d_to,
4382 svint32_t v) {
4383 const Repartition<float, decltype(d_to)> d_from;
4384 return svcvt_f64_s32_x(detail::PTrue(d_from), v);
4385}
4386
4387// U32->F64 PromoteEvenTo
4388template <class D>
4389HWY_INLINE svfloat64_t PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4390 hwy::SizeTag<8> /*to_lane_size_tag*/,
4391 hwy::UnsignedTag /*from_type_tag*/, D d_to,
4392 svuint32_t v) {
4393 const Repartition<float, decltype(d_to)> d_from;
4394 return svcvt_f64_u32_x(detail::PTrue(d_from), v);
4395}
4396
4397// F32->I64 PromoteEvenTo
4398template <class D>
4399HWY_INLINE svint64_t PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4400 hwy::SizeTag<8> /*to_lane_size_tag*/,
4401 hwy::FloatTag /*from_type_tag*/, D d_to,
4402 svfloat32_t v) {
4403 const Repartition<float, decltype(d_to)> d_from;
4404 return svcvt_s64_f32_x(detail::PTrue(d_from), v);
4405}
4406
4407// F32->U64 PromoteEvenTo
4408template <class D>
4409HWY_INLINE svuint64_t PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
4410 hwy::SizeTag<8> /*to_lane_size_tag*/,
4411 hwy::FloatTag /*from_type_tag*/, D d_to,
4412 svfloat32_t v) {
4413 const Repartition<float, decltype(d_to)> d_from;
4414 return svcvt_u64_f32_x(detail::PTrue(d_from), v);
4415}
4416
4417// F16->F32 PromoteOddTo
4418template <class D>
4419HWY_INLINE svfloat32_t PromoteOddTo(hwy::FloatTag to_type_tag,
4420 hwy::SizeTag<4> to_lane_size_tag,
4421 hwy::FloatTag from_type_tag, D d_to,
4422 svfloat16_t v) {
4423 return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4424 DupOdd(v));
4425}
4426
4427// I32/U32/F32->F64 PromoteOddTo
4428template <class FromTypeTag, class D, class V>
4429HWY_INLINE svfloat64_t PromoteOddTo(hwy::FloatTag to_type_tag,
4430 hwy::SizeTag<8> to_lane_size_tag,
4431 FromTypeTag from_type_tag, D d_to, V v) {
4432 return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4433 DupOdd(v));
4434}
4435
4436// F32->I64/U64 PromoteOddTo
4437template <class ToTypeTag, class D, HWY_IF_UI64_D(D)>
4438HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
4439 hwy::SizeTag<8> to_lane_size_tag,
4440 hwy::FloatTag from_type_tag, D d_to,
4441 svfloat32_t v) {
4442 return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4443 DupOdd(v));
4444}
4445
4446} // namespace detail
4447
4448// ------------------------------ ReorderDemote2To (OddEven)
4449
4450template <size_t N, int kPow2>
4451HWY_API VBF16 ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
4452 svfloat32_t b) {
4453#if HWY_SVE_HAVE_F32_TO_BF16C
4454 const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4455 return svcvtnt_bf16_f32_x(b_in_even, detail::PTrue(dbf16), a);
4456#else
4457 (void)dbf16;
4458 const auto a_in_odd =
4459 detail::RoundF32ForDemoteToBF16(a);
4460 const auto b_in_odd =
4461 detail::RoundF32ForDemoteToBF16(b);
4462 return BitCast(dbf16, detail::InterleaveOdd(b_in_odd, a_in_odd));
4463#endif
4464}
4465
4466template <size_t N, int kPow2>
4467HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a,
4468 svint32_t b) {
4469#if HWY_SVE_HAVE_2
4470 (void)d16;
4471 const svint16_t a_in_even = svqxtnb_s32(a);
4472 return svqxtnt_s32(a_in_even, b);
4473#else
4474 const svint16_t a16 = BitCast(d16, detail::SaturateI<int16_t>(a));
4475 const svint16_t b16 = BitCast(d16, detail::SaturateI<int16_t>(b));
4476 return detail::InterleaveEven(a16, b16);
4477#endif
4478}
4479
4480template <size_t N, int kPow2>
4481HWY_API svuint16_t ReorderDemote2To(Simd<uint16_t, N, kPow2> d16, svint32_t a,
4482 svint32_t b) {
4483#if HWY_SVE_HAVE_2
4484 (void)d16;
4485 const svuint16_t a_in_even = svqxtunb_s32(a);
4486 return svqxtunt_s32(a_in_even, b);
4487#else
4488 const Repartition<uint32_t, decltype(d16)> du32;
4489 const svuint32_t clamped_a = BitCast(du32, detail::MaxN(a, 0));
4490 const svuint32_t clamped_b = BitCast(du32, detail::MaxN(b, 0));
4491 const svuint16_t a16 = BitCast(d16, detail::SaturateU<uint16_t>(clamped_a));
4492 const svuint16_t b16 = BitCast(d16, detail::SaturateU<uint16_t>(clamped_b));
4493 return detail::InterleaveEven(a16, b16);
4494#endif
4495}
4496
4497template <size_t N, int kPow2>
4498HWY_API svuint16_t ReorderDemote2To(Simd<uint16_t, N, kPow2> d16, svuint32_t a,
4499 svuint32_t b) {
4500#if HWY_SVE_HAVE_2
4501 (void)d16;
4502 const svuint16_t a_in_even = svqxtnb_u32(a);
4503 return svqxtnt_u32(a_in_even, b);
4504#else
4505 const svuint16_t a16 = BitCast(d16, detail::SaturateU<uint16_t>(a));
4506 const svuint16_t b16 = BitCast(d16, detail::SaturateU<uint16_t>(b));
4507 return detail::InterleaveEven(a16, b16);
4508#endif
4509}
4510
4511template <size_t N, int kPow2>
4512HWY_API svint8_t ReorderDemote2To(Simd<int8_t, N, kPow2> d8, svint16_t a,
4513 svint16_t b) {
4514#if HWY_SVE_HAVE_2
4515 (void)d8;
4516 const svint8_t a_in_even = svqxtnb_s16(a);
4517 return svqxtnt_s16(a_in_even, b);
4518#else
4519 const svint8_t a8 = BitCast(d8, detail::SaturateI<int8_t>(a));
4520 const svint8_t b8 = BitCast(d8, detail::SaturateI<int8_t>(b));
4521 return detail::InterleaveEven(a8, b8);
4522#endif
4523}
4524
4525template <size_t N, int kPow2>
4526HWY_API svuint8_t ReorderDemote2To(Simd<uint8_t, N, kPow2> d8, svint16_t a,
4527 svint16_t b) {
4528#if HWY_SVE_HAVE_2
4529 (void)d8;
4530 const svuint8_t a_in_even = svqxtunb_s16(a);
4531 return svqxtunt_s16(a_in_even, b);
4532#else
4533 const Repartition<uint16_t, decltype(d8)> du16;
4534 const svuint16_t clamped_a = BitCast(du16, detail::MaxN(a, 0));
4535 const svuint16_t clamped_b = BitCast(du16, detail::MaxN(b, 0));
4536 const svuint8_t a8 = BitCast(d8, detail::SaturateU<uint8_t>(clamped_a));
4537 const svuint8_t b8 = BitCast(d8, detail::SaturateU<uint8_t>(clamped_b));
4538 return detail::InterleaveEven(a8, b8);
4539#endif
4540}
4541
4542template <size_t N, int kPow2>
4543HWY_API svuint8_t ReorderDemote2To(Simd<uint8_t, N, kPow2> d8, svuint16_t a,
4544 svuint16_t b) {
4545#if HWY_SVE_HAVE_2
4546 (void)d8;
4547 const svuint8_t a_in_even = svqxtnb_u16(a);
4548 return svqxtnt_u16(a_in_even, b);
4549#else
4550 const svuint8_t a8 = BitCast(d8, detail::SaturateU<uint8_t>(a));
4551 const svuint8_t b8 = BitCast(d8, detail::SaturateU<uint8_t>(b));
4552 return detail::InterleaveEven(a8, b8);
4553#endif
4554}
4555
4556template <size_t N, int kPow2>
4557HWY_API svint32_t ReorderDemote2To(Simd<int32_t, N, kPow2> d32, svint64_t a,
4558 svint64_t b) {
4559#if HWY_SVE_HAVE_2
4560 (void)d32;
4561 const svint32_t a_in_even = svqxtnb_s64(a);
4562 return svqxtnt_s64(a_in_even, b);
4563#else
4564 const svint32_t a32 = BitCast(d32, detail::SaturateI<int32_t>(a));
4565 const svint32_t b32 = BitCast(d32, detail::SaturateI<int32_t>(b));
4566 return detail::InterleaveEven(a32, b32);
4567#endif
4568}
4569
4570template <size_t N, int kPow2>
4571HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svint64_t a,
4572 svint64_t b) {
4573#if HWY_SVE_HAVE_2
4574 (void)d32;
4575 const svuint32_t a_in_even = svqxtunb_s64(a);
4576 return svqxtunt_s64(a_in_even, b);
4577#else
4578 const Repartition<uint64_t, decltype(d32)> du64;
4579 const svuint64_t clamped_a = BitCast(du64, detail::MaxN(a, 0));
4580 const svuint64_t clamped_b = BitCast(du64, detail::MaxN(b, 0));
4581 const svuint32_t a32 = BitCast(d32, detail::SaturateU<uint32_t>(clamped_a));
4582 const svuint32_t b32 = BitCast(d32, detail::SaturateU<uint32_t>(clamped_b));
4583 return detail::InterleaveEven(a32, b32);
4584#endif
4585}
4586
4587template <size_t N, int kPow2>
4588HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svuint64_t a,
4589 svuint64_t b) {
4590#if HWY_SVE_HAVE_2
4591 (void)d32;
4592 const svuint32_t a_in_even = svqxtnb_u64(a);
4593 return svqxtnt_u64(a_in_even, b);
4594#else
4595 const svuint32_t a32 = BitCast(d32, detail::SaturateU<uint32_t>(a));
4596 const svuint32_t b32 = BitCast(d32, detail::SaturateU<uint32_t>(b));
4597 return detail::InterleaveEven(a32, b32);
4598#endif
4599}
4600
4601template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
4602 HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) / 2)>
4603HWY_API VFromD<D> ReorderDemote2To(D dn, V a, V b) {
4604 const auto clamped_a = BitCast(dn, detail::SaturateU<TFromD<D>>(a));
4605 const auto clamped_b = BitCast(dn, detail::SaturateU<TFromD<D>>(b));
4606 return detail::InterleaveEven(clamped_a, clamped_b);
4607}
4608
4609template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
4610 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4611 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)>
4612HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
4613 const Half<decltype(dn)> dnh;
4614 const auto demoted_a = DemoteTo(dnh, a);
4615 const auto demoted_b = DemoteTo(dnh, b);
4616 return Combine(dn, demoted_b, demoted_a);
4617}
4618
4619template <size_t N, int kPow2>
4620HWY_API VBF16 OrderedDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
4621 svfloat32_t b) {
4622#if HWY_SVE_HAVE_F32_TO_BF16C
4623 (void)dbf16;
4624 const VBF16 a_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), a);
4625 const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4626 return ConcatEven(dbf16, b_in_even, a_in_even);
4627#else
4628 const RebindToUnsigned<decltype(dbf16)> du16;
4629 const svuint16_t a_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
4630 const svuint16_t b_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
4631 return BitCast(dbf16, ConcatOdd(du16, b_in_odd, a_in_odd)); // lower half
4632#endif
4633}
4634
4635// ------------------------------ I8/U8/I16/U16 Div
4636
4637template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4638 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4639HWY_API V Div(V a, V b) {
4640 const DFromV<decltype(a)> d;
4641 const Half<decltype(d)> dh;
4642 const RepartitionToWide<decltype(d)> dw;
4643
4644 const auto q_lo =
4645 Div(PromoteTo(dw, LowerHalf(dh, a)), PromoteTo(dw, LowerHalf(dh, b)));
4646 const auto q_hi = Div(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b));
4647
4648 return OrderedDemote2To(d, q_lo, q_hi);
4649}
4650
4651// ------------------------------ I8/U8/I16/U16 MaskedDivOr
4652template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
4653 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4654HWY_API V MaskedDivOr(V no, M m, V a, V b) {
4655 return IfThenElse(m, Div(a, b), no);
4656}
4657
4658// ------------------------------ Mod (Div, NegMulAdd)
4659template <class V>
4660HWY_API V Mod(V a, V b) {
4661 return NegMulAdd(Div(a, b), b, a);
4662}
4663
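// Worked example (illustrative note, not from the original source):
// NegMulAdd(q, b, a) computes a - q * b with q = Div(a, b), i.e. the
// remainder. For integer lanes a = 7, b = 3: q = 2 and Mod = 7 - 6 = 1.
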
4664// ------------------------------ MaskedModOr (Mod)
4665template <class V, class M>
4666HWY_API V MaskedModOr(V no, M m, V a, V b) {
4667 return IfThenElse(m, Mod(a, b), no);
4668}
4669
4670// ------------------------------ BroadcastSignBit (ShiftRight)
4671template <class V>
4672HWY_API V BroadcastSignBit(const V v) {
4673 return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
4674}
4675
4676// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
4677template <class V>
4678HWY_API V IfNegativeThenElse(V v, V yes, V no) {
4679 static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
4680 return IfThenElse(IsNegative(v), yes, no);
4681}
4682
4683// ------------------------------ AverageRound (ShiftRight)
4684
4685#if HWY_SVE_HAVE_2
4686HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
4687HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
4688#else
4689template <class V>
4690V AverageRound(const V a, const V b) {
4691 return ShiftRight<1>(detail::AddN(Add(a, b), 1));
4692}
4693#endif // HWY_SVE_HAVE_2
4694
4695// ------------------------------ LoadMaskBits (TestBit)
4696
4697// `p` points to at least 8 readable bytes, not all of which need be valid.
4698template <class D, HWY_IF_T_SIZE_D(D, 1)>
4699HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
4700 // TODO(janwas): with SVE2.1, load to vector, then PMOV
4701 const RebindToUnsigned<D> du;
4702 const svuint8_t iota = Iota(du, 0);
4703
4704 // Load correct number of bytes (bits/8) with 7 zeros after each.
4705 const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
4706 // Replicate bytes 8x such that each byte contains the bit that governs it.
4707 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
4708
4709 const svuint8_t bit =
4710 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4711 return TestBit(rep8, bit);
4712}
4713
4714template <class D, HWY_IF_T_SIZE_D(D, 2)>
4715HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
4716 const uint8_t* HWY_RESTRICT bits) {
4717 const RebindToUnsigned<D> du;
4718 const Repartition<uint8_t, D> du8;
4719
4720 // There may be up to 128 bits; avoid reading past the end.
4721 const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
4722
4723 // Replicate bytes 16x such that each lane contains the bit that governs it.
4724 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
4725
4726 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
4727 return TestBit(BitCast(du, rep16), bit);
4728}
4729
4730template <class D, HWY_IF_T_SIZE_D(D, 4)>
4731HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
4732 const uint8_t* HWY_RESTRICT bits) {
4733 const RebindToUnsigned<D> du;
4734 const Repartition<uint8_t, D> du8;
4735
4736 // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable,
4737 // so we can skip computing the actual length (Lanes(du)+7)/8.
4738 const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
4739
4740 // Replicate bytes 32x such that each lane contains the bit that governs it.
4741 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
4742
4743 // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
4744 const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
4745
4746 return TestBit(BitCast(du, rep32), bit);
4747}
4748
4749template <class D, HWY_IF_T_SIZE_D(D, 8)>
4750HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
4751 const uint8_t* HWY_RESTRICT bits) {
4752 const RebindToUnsigned<D> du;
4753
4754 // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
4755 // The "at least 8 byte" guarantee in quick_reference ensures this is safe.
4756 uint32_t mask_bits;
4757 CopyBytes<4>(bits, &mask_bits); // copy from bytes
4758 const auto vbits = Set(du, mask_bits);
4759
4760 // 2 ^ {0,1, .., 31}, will not have more lanes than that.
4761 const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
4762
4763 return TestBit(vbits, bit);
4764}
4765
4766// ------------------------------ Dup128MaskFromMaskBits
4767
4768template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
4769HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4770 const RebindToUnsigned<decltype(d)> du;
4771
4772 constexpr size_t kN = MaxLanes(d);
4773 if (kN < 8) mask_bits &= (1u << kN) - 1;
4774
4775 // Replicate the lower 8 bits of mask_bits to each u8 lane
4776 const svuint8_t bytes = BitCast(du, Set(du, static_cast<uint8_t>(mask_bits)));
4777
4778 const svuint8_t bit =
4779 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4780 return TestBit(bytes, bit);
4781}
4782
4783template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
4784HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4785 const RebindToUnsigned<decltype(d)> du;
4786 const Repartition<uint16_t, decltype(du)> du16;
4787
4788 // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
4789 // and then bitcast the replicated mask_bits to a u8 vector
4790 const svuint8_t bytes =
4791 BitCast(du, Set(du16, static_cast<uint16_t>(mask_bits)));
4792 // Replicate bytes 8x such that each byte contains the bit that governs it.
4793 const svuint8_t rep8 = svtbl_u8(bytes, ShiftRight<3>(Iota(du, 0)));
4794
4795 const svuint8_t bit =
4796 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4797 return TestBit(rep8, bit);
4798}
4799
4800template <class D, HWY_IF_T_SIZE_D(D, 2)>
4801HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4802 const RebindToUnsigned<decltype(d)> du;
4803 const Repartition<uint8_t, decltype(d)> du8;
4804
4805 constexpr size_t kN = MaxLanes(d);
4806 if (kN < 8) mask_bits &= (1u << kN) - 1;
4807
4808 // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4809 const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4810
4811 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
4812 return TestBit(BitCast(du, bytes), bit);
4813}
4814
4815template <class D, HWY_IF_T_SIZE_D(D, 4)>
4816HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4817 const RebindToUnsigned<decltype(d)> du;
4818 const Repartition<uint8_t, decltype(d)> du8;
4819
4820 constexpr size_t kN = MaxLanes(d);
4821 if (kN < 4) mask_bits &= (1u << kN) - 1;
4822
4823 // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4824 const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4825
4826 const svuint32_t bit = svdupq_n_u32(1, 2, 4, 8);
4827 return TestBit(BitCast(du, bytes), bit);
4828}
4829
4830template <class D, HWY_IF_T_SIZE_D(D, 8)>
4831HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4832 const RebindToUnsigned<decltype(d)> du;
4833 const Repartition<uint8_t, decltype(d)> du8;
4834
4835 if (MaxLanes(d) < 2) mask_bits &= 1u;
4836
4837 // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4838 const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4839
4840 const svuint64_t bit = svdupq_n_u64(1, 2);
4841 return TestBit(BitCast(du, bytes), bit);
4842}
4843
4844// ------------------------------ StoreMaskBits
4845
4846namespace detail {
4847
4848// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
4849template <class T, HWY_IF_T_SIZE(T, 1)>
4850HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
4851 return svdup_n_u8_z(m, 1);
4852}
4853template <class T, HWY_IF_T_SIZE(T, 2)>
4854HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
4855 const ScalableTag<uint8_t> d8;
4856 const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
4857 return detail::ConcatEvenFull(b16, b16); // lower half
4858}
4859template <class T, HWY_IF_T_SIZE(T, 4)>
4860HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
4861 return U8FromU32(svdup_n_u32_z(m, 1));
4862}
4863template <class T, HWY_IF_T_SIZE(T, 8)>
4864HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
4865 const ScalableTag<uint32_t> d32;
4866 const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
4867 return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
4868}
4869
4870// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
4871HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
4872 const ScalableTag<uint8_t> d8;
4873 const ScalableTag<uint16_t> d16;
4874 const ScalableTag<uint32_t> d32;
4875 const ScalableTag<uint64_t> d64;
4876 // TODO(janwas): could use SVE2 BDEP, but it's optional.
4877 x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
4878 x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
4879 x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
4880 return BitCast(d64, x);
4881}
4882
4883} // namespace detail
4884
4885// `p` points to at least 8 writable bytes.
4886// TODO(janwas): specialize for HWY_SVE_256
4887// TODO(janwas): with SVE2.1, use PMOV to store to vector, then StoreU
4888template <class D>
4889HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
4890 svuint64_t bits_in_u64 =
4891 detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
4892
4893 const size_t num_bits = Lanes(d);
4894 const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below
4895
4896 // Truncate each u64 to 8 bits and store to u8.
4897 svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
4898
4899 // Non-full byte, need to clear the undefined upper bits. Can happen for
4900 // capped/fractional vectors or large T and small hardware vectors.
4901 if (num_bits < 8) {
4902 const int mask = static_cast<int>((1ull << num_bits) - 1);
4903 bits[0] = static_cast<uint8_t>(bits[0] & mask);
4904 }
4905 // Else: we wrote full bytes because num_bits is a power of two >= 8.
4906
4907 return num_bytes;
4908}
4909
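// Usage sketch (illustrative, not from the original source):
//   const hn::ScalableTag<uint16_t> d;
//   uint8_t bits[64] = {0};  // at least 8 writable bytes
//   const size_t num_bytes = hn::StoreMaskBits(d, hn::FirstN(d, 3), bits);
//   // bits[0] == 0b00000111 and num_bytes == (Lanes(d) + 7) / 8.
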
4910// ------------------------------ CompressBits (LoadMaskBits)
4911template <class V, HWY_IF_NOT_T_SIZE_V(V, 1)>
4912HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
4913 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
4914}
4915
4916// ------------------------------ CompressBitsStore (LoadMaskBits)
4917template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
4918HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
4919 D d, TFromD<D>* HWY_RESTRICT unaligned) {
4920 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4921}
4922
4923// ------------------------------ Expand (StoreMaskBits)
4924
4925#ifdef HWY_NATIVE_EXPAND
4926#undef HWY_NATIVE_EXPAND
4927#else
4928#define HWY_NATIVE_EXPAND
4929#endif
4930
4931namespace detail {
4932
4933HWY_INLINE svuint8_t IndicesForExpandFromBits(uint64_t mask_bits) {
4934 const CappedTag<uint8_t, 8> du8;
4935 alignas(16) static constexpr uint8_t table[8 * 256] = {
4936 // PrintExpand8x8Tables
4937 128, 128, 128, 128, 128, 128, 128, 128, //
4938 0, 128, 128, 128, 128, 128, 128, 128, //
4939 128, 0, 128, 128, 128, 128, 128, 128, //
4940 0, 1, 128, 128, 128, 128, 128, 128, //
4941 128, 128, 0, 128, 128, 128, 128, 128, //
4942 0, 128, 1, 128, 128, 128, 128, 128, //
4943 128, 0, 1, 128, 128, 128, 128, 128, //
4944 0, 1, 2, 128, 128, 128, 128, 128, //
4945 128, 128, 128, 0, 128, 128, 128, 128, //
4946 0, 128, 128, 1, 128, 128, 128, 128, //
4947 128, 0, 128, 1, 128, 128, 128, 128, //
4948 0, 1, 128, 2, 128, 128, 128, 128, //
4949 128, 128, 0, 1, 128, 128, 128, 128, //
4950 0, 128, 1, 2, 128, 128, 128, 128, //
4951 128, 0, 1, 2, 128, 128, 128, 128, //
4952 0, 1, 2, 3, 128, 128, 128, 128, //
4953 128, 128, 128, 128, 0, 128, 128, 128, //
4954 0, 128, 128, 128, 1, 128, 128, 128, //
4955 128, 0, 128, 128, 1, 128, 128, 128, //
4956 0, 1, 128, 128, 2, 128, 128, 128, //
4957 128, 128, 0, 128, 1, 128, 128, 128, //
4958 0, 128, 1, 128, 2, 128, 128, 128, //
4959 128, 0, 1, 128, 2, 128, 128, 128, //
4960 0, 1, 2, 128, 3, 128, 128, 128, //
4961 128, 128, 128, 0, 1, 128, 128, 128, //
4962 0, 128, 128, 1, 2, 128, 128, 128, //
4963 128, 0, 128, 1, 2, 128, 128, 128, //
4964 0, 1, 128, 2, 3, 128, 128, 128, //
4965 128, 128, 0, 1, 2, 128, 128, 128, //
4966 0, 128, 1, 2, 3, 128, 128, 128, //
4967 128, 0, 1, 2, 3, 128, 128, 128, //
4968 0, 1, 2, 3, 4, 128, 128, 128, //
4969 128, 128, 128, 128, 128, 0, 128, 128, //
4970 0, 128, 128, 128, 128, 1, 128, 128, //
4971 128, 0, 128, 128, 128, 1, 128, 128, //
4972 0, 1, 128, 128, 128, 2, 128, 128, //
4973 128, 128, 0, 128, 128, 1, 128, 128, //
4974 0, 128, 1, 128, 128, 2, 128, 128, //
4975 128, 0, 1, 128, 128, 2, 128, 128, //
4976 0, 1, 2, 128, 128, 3, 128, 128, //
4977 128, 128, 128, 0, 128, 1, 128, 128, //
4978 0, 128, 128, 1, 128, 2, 128, 128, //
4979 128, 0, 128, 1, 128, 2, 128, 128, //
4980 0, 1, 128, 2, 128, 3, 128, 128, //
4981 128, 128, 0, 1, 128, 2, 128, 128, //
4982 0, 128, 1, 2, 128, 3, 128, 128, //
4983 128, 0, 1, 2, 128, 3, 128, 128, //
4984 0, 1, 2, 3, 128, 4, 128, 128, //
4985 128, 128, 128, 128, 0, 1, 128, 128, //
4986 0, 128, 128, 128, 1, 2, 128, 128, //
4987 128, 0, 128, 128, 1, 2, 128, 128, //
4988 0, 1, 128, 128, 2, 3, 128, 128, //
4989 128, 128, 0, 128, 1, 2, 128, 128, //
4990 0, 128, 1, 128, 2, 3, 128, 128, //
4991 128, 0, 1, 128, 2, 3, 128, 128, //
4992 0, 1, 2, 128, 3, 4, 128, 128, //
4993 128, 128, 128, 0, 1, 2, 128, 128, //
4994 0, 128, 128, 1, 2, 3, 128, 128, //
4995 128, 0, 128, 1, 2, 3, 128, 128, //
4996 0, 1, 128, 2, 3, 4, 128, 128, //
4997 128, 128, 0, 1, 2, 3, 128, 128, //
4998 0, 128, 1, 2, 3, 4, 128, 128, //
4999 128, 0, 1, 2, 3, 4, 128, 128, //
5000 0, 1, 2, 3, 4, 5, 128, 128, //
5001 128, 128, 128, 128, 128, 128, 0, 128, //
5002 0, 128, 128, 128, 128, 128, 1, 128, //
5003 128, 0, 128, 128, 128, 128, 1, 128, //
5004 0, 1, 128, 128, 128, 128, 2, 128, //
5005 128, 128, 0, 128, 128, 128, 1, 128, //
5006 0, 128, 1, 128, 128, 128, 2, 128, //
5007 128, 0, 1, 128, 128, 128, 2, 128, //
5008 0, 1, 2, 128, 128, 128, 3, 128, //
5009 128, 128, 128, 0, 128, 128, 1, 128, //
5010 0, 128, 128, 1, 128, 128, 2, 128, //
5011 128, 0, 128, 1, 128, 128, 2, 128, //
5012 0, 1, 128, 2, 128, 128, 3, 128, //
5013 128, 128, 0, 1, 128, 128, 2, 128, //
5014 0, 128, 1, 2, 128, 128, 3, 128, //
5015 128, 0, 1, 2, 128, 128, 3, 128, //
5016 0, 1, 2, 3, 128, 128, 4, 128, //
5017 128, 128, 128, 128, 0, 128, 1, 128, //
5018 0, 128, 128, 128, 1, 128, 2, 128, //
5019 128, 0, 128, 128, 1, 128, 2, 128, //
5020 0, 1, 128, 128, 2, 128, 3, 128, //
5021 128, 128, 0, 128, 1, 128, 2, 128, //
5022 0, 128, 1, 128, 2, 128, 3, 128, //
5023 128, 0, 1, 128, 2, 128, 3, 128, //
5024 0, 1, 2, 128, 3, 128, 4, 128, //
5025 128, 128, 128, 0, 1, 128, 2, 128, //
5026 0, 128, 128, 1, 2, 128, 3, 128, //
5027 128, 0, 128, 1, 2, 128, 3, 128, //
5028 0, 1, 128, 2, 3, 128, 4, 128, //
5029 128, 128, 0, 1, 2, 128, 3, 128, //
5030 0, 128, 1, 2, 3, 128, 4, 128, //
5031 128, 0, 1, 2, 3, 128, 4, 128, //
5032 0, 1, 2, 3, 4, 128, 5, 128, //
5033 128, 128, 128, 128, 128, 0, 1, 128, //
5034 0, 128, 128, 128, 128, 1, 2, 128, //
5035 128, 0, 128, 128, 128, 1, 2, 128, //
5036 0, 1, 128, 128, 128, 2, 3, 128, //
5037 128, 128, 0, 128, 128, 1, 2, 128, //
5038 0, 128, 1, 128, 128, 2, 3, 128, //
5039 128, 0, 1, 128, 128, 2, 3, 128, //
5040 0, 1, 2, 128, 128, 3, 4, 128, //
5041 128, 128, 128, 0, 128, 1, 2, 128, //
5042 0, 128, 128, 1, 128, 2, 3, 128, //
5043 128, 0, 128, 1, 128, 2, 3, 128, //
5044 0, 1, 128, 2, 128, 3, 4, 128, //
5045 128, 128, 0, 1, 128, 2, 3, 128, //
5046 0, 128, 1, 2, 128, 3, 4, 128, //
5047 128, 0, 1, 2, 128, 3, 4, 128, //
5048 0, 1, 2, 3, 128, 4, 5, 128, //
5049 128, 128, 128, 128, 0, 1, 2, 128, //
5050 0, 128, 128, 128, 1, 2, 3, 128, //
5051 128, 0, 128, 128, 1, 2, 3, 128, //
5052 0, 1, 128, 128, 2, 3, 4, 128, //
5053 128, 128, 0, 128, 1, 2, 3, 128, //
5054 0, 128, 1, 128, 2, 3, 4, 128, //
5055 128, 0, 1, 128, 2, 3, 4, 128, //
5056 0, 1, 2, 128, 3, 4, 5, 128, //
5057 128, 128, 128, 0, 1, 2, 3, 128, //
5058 0, 128, 128, 1, 2, 3, 4, 128, //
5059 128, 0, 128, 1, 2, 3, 4, 128, //
5060 0, 1, 128, 2, 3, 4, 5, 128, //
5061 128, 128, 0, 1, 2, 3, 4, 128, //
5062 0, 128, 1, 2, 3, 4, 5, 128, //
5063 128, 0, 1, 2, 3, 4, 5, 128, //
5064 0, 1, 2, 3, 4, 5, 6, 128, //
5065 128, 128, 128, 128, 128, 128, 128, 0, //
5066 0, 128, 128, 128, 128, 128, 128, 1, //
5067 128, 0, 128, 128, 128, 128, 128, 1, //
5068 0, 1, 128, 128, 128, 128, 128, 2, //
5069 128, 128, 0, 128, 128, 128, 128, 1, //
5070 0, 128, 1, 128, 128, 128, 128, 2, //
5071 128, 0, 1, 128, 128, 128, 128, 2, //
5072 0, 1, 2, 128, 128, 128, 128, 3, //
5073 128, 128, 128, 0, 128, 128, 128, 1, //
5074 0, 128, 128, 1, 128, 128, 128, 2, //
5075 128, 0, 128, 1, 128, 128, 128, 2, //
5076 0, 1, 128, 2, 128, 128, 128, 3, //
5077 128, 128, 0, 1, 128, 128, 128, 2, //
5078 0, 128, 1, 2, 128, 128, 128, 3, //
5079 128, 0, 1, 2, 128, 128, 128, 3, //
5080 0, 1, 2, 3, 128, 128, 128, 4, //
5081 128, 128, 128, 128, 0, 128, 128, 1, //
5082 0, 128, 128, 128, 1, 128, 128, 2, //
5083 128, 0, 128, 128, 1, 128, 128, 2, //
5084 0, 1, 128, 128, 2, 128, 128, 3, //
5085 128, 128, 0, 128, 1, 128, 128, 2, //
5086 0, 128, 1, 128, 2, 128, 128, 3, //
5087 128, 0, 1, 128, 2, 128, 128, 3, //
5088 0, 1, 2, 128, 3, 128, 128, 4, //
5089 128, 128, 128, 0, 1, 128, 128, 2, //
5090 0, 128, 128, 1, 2, 128, 128, 3, //
5091 128, 0, 128, 1, 2, 128, 128, 3, //
5092 0, 1, 128, 2, 3, 128, 128, 4, //
5093 128, 128, 0, 1, 2, 128, 128, 3, //
5094 0, 128, 1, 2, 3, 128, 128, 4, //
5095 128, 0, 1, 2, 3, 128, 128, 4, //
5096 0, 1, 2, 3, 4, 128, 128, 5, //
5097 128, 128, 128, 128, 128, 0, 128, 1, //
5098 0, 128, 128, 128, 128, 1, 128, 2, //
5099 128, 0, 128, 128, 128, 1, 128, 2, //
5100 0, 1, 128, 128, 128, 2, 128, 3, //
5101 128, 128, 0, 128, 128, 1, 128, 2, //
5102 0, 128, 1, 128, 128, 2, 128, 3, //
5103 128, 0, 1, 128, 128, 2, 128, 3, //
5104 0, 1, 2, 128, 128, 3, 128, 4, //
5105 128, 128, 128, 0, 128, 1, 128, 2, //
5106 0, 128, 128, 1, 128, 2, 128, 3, //
5107 128, 0, 128, 1, 128, 2, 128, 3, //
5108 0, 1, 128, 2, 128, 3, 128, 4, //
5109 128, 128, 0, 1, 128, 2, 128, 3, //
5110 0, 128, 1, 2, 128, 3, 128, 4, //
5111 128, 0, 1, 2, 128, 3, 128, 4, //
5112 0, 1, 2, 3, 128, 4, 128, 5, //
5113 128, 128, 128, 128, 0, 1, 128, 2, //
5114 0, 128, 128, 128, 1, 2, 128, 3, //
5115 128, 0, 128, 128, 1, 2, 128, 3, //
5116 0, 1, 128, 128, 2, 3, 128, 4, //
5117 128, 128, 0, 128, 1, 2, 128, 3, //
5118 0, 128, 1, 128, 2, 3, 128, 4, //
5119 128, 0, 1, 128, 2, 3, 128, 4, //
5120 0, 1, 2, 128, 3, 4, 128, 5, //
5121 128, 128, 128, 0, 1, 2, 128, 3, //
5122 0, 128, 128, 1, 2, 3, 128, 4, //
5123 128, 0, 128, 1, 2, 3, 128, 4, //
5124 0, 1, 128, 2, 3, 4, 128, 5, //
5125 128, 128, 0, 1, 2, 3, 128, 4, //
5126 0, 128, 1, 2, 3, 4, 128, 5, //
5127 128, 0, 1, 2, 3, 4, 128, 5, //
5128 0, 1, 2, 3, 4, 5, 128, 6, //
5129 128, 128, 128, 128, 128, 128, 0, 1, //
5130 0, 128, 128, 128, 128, 128, 1, 2, //
5131 128, 0, 128, 128, 128, 128, 1, 2, //
5132 0, 1, 128, 128, 128, 128, 2, 3, //
5133 128, 128, 0, 128, 128, 128, 1, 2, //
5134 0, 128, 1, 128, 128, 128, 2, 3, //
5135 128, 0, 1, 128, 128, 128, 2, 3, //
5136 0, 1, 2, 128, 128, 128, 3, 4, //
5137 128, 128, 128, 0, 128, 128, 1, 2, //
5138 0, 128, 128, 1, 128, 128, 2, 3, //
5139 128, 0, 128, 1, 128, 128, 2, 3, //
5140 0, 1, 128, 2, 128, 128, 3, 4, //
5141 128, 128, 0, 1, 128, 128, 2, 3, //
5142 0, 128, 1, 2, 128, 128, 3, 4, //
5143 128, 0, 1, 2, 128, 128, 3, 4, //
5144 0, 1, 2, 3, 128, 128, 4, 5, //
5145 128, 128, 128, 128, 0, 128, 1, 2, //
5146 0, 128, 128, 128, 1, 128, 2, 3, //
5147 128, 0, 128, 128, 1, 128, 2, 3, //
5148 0, 1, 128, 128, 2, 128, 3, 4, //
5149 128, 128, 0, 128, 1, 128, 2, 3, //
5150 0, 128, 1, 128, 2, 128, 3, 4, //
5151 128, 0, 1, 128, 2, 128, 3, 4, //
5152 0, 1, 2, 128, 3, 128, 4, 5, //
5153 128, 128, 128, 0, 1, 128, 2, 3, //
5154 0, 128, 128, 1, 2, 128, 3, 4, //
5155 128, 0, 128, 1, 2, 128, 3, 4, //
5156 0, 1, 128, 2, 3, 128, 4, 5, //
5157 128, 128, 0, 1, 2, 128, 3, 4, //
5158 0, 128, 1, 2, 3, 128, 4, 5, //
5159 128, 0, 1, 2, 3, 128, 4, 5, //
5160 0, 1, 2, 3, 4, 128, 5, 6, //
5161 128, 128, 128, 128, 128, 0, 1, 2, //
5162 0, 128, 128, 128, 128, 1, 2, 3, //
5163 128, 0, 128, 128, 128, 1, 2, 3, //
5164 0, 1, 128, 128, 128, 2, 3, 4, //
5165 128, 128, 0, 128, 128, 1, 2, 3, //
5166 0, 128, 1, 128, 128, 2, 3, 4, //
5167 128, 0, 1, 128, 128, 2, 3, 4, //
5168 0, 1, 2, 128, 128, 3, 4, 5, //
5169 128, 128, 128, 0, 128, 1, 2, 3, //
5170 0, 128, 128, 1, 128, 2, 3, 4, //
5171 128, 0, 128, 1, 128, 2, 3, 4, //
5172 0, 1, 128, 2, 128, 3, 4, 5, //
5173 128, 128, 0, 1, 128, 2, 3, 4, //
5174 0, 128, 1, 2, 128, 3, 4, 5, //
5175 128, 0, 1, 2, 128, 3, 4, 5, //
5176 0, 1, 2, 3, 128, 4, 5, 6, //
5177 128, 128, 128, 128, 0, 1, 2, 3, //
5178 0, 128, 128, 128, 1, 2, 3, 4, //
5179 128, 0, 128, 128, 1, 2, 3, 4, //
5180 0, 1, 128, 128, 2, 3, 4, 5, //
5181 128, 128, 0, 128, 1, 2, 3, 4, //
5182 0, 128, 1, 128, 2, 3, 4, 5, //
5183 128, 0, 1, 128, 2, 3, 4, 5, //
5184 0, 1, 2, 128, 3, 4, 5, 6, //
5185 128, 128, 128, 0, 1, 2, 3, 4, //
5186 0, 128, 128, 1, 2, 3, 4, 5, //
5187 128, 0, 128, 1, 2, 3, 4, 5, //
5188 0, 1, 128, 2, 3, 4, 5, 6, //
5189 128, 128, 0, 1, 2, 3, 4, 5, //
5190 0, 128, 1, 2, 3, 4, 5, 6, //
5191 128, 0, 1, 2, 3, 4, 5, 6, //
5192 0, 1, 2, 3, 4, 5, 6, 7};
5193 return Load(du8, table + mask_bits * 8);
5194}
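// Illustrative note (added, not part of the original header): row mask_bits of
// the table above stores, for each output byte lane, either the input lane
// index that should land there or 128, which is out of bounds for svtbl and
// therefore produces zero. For example, mask_bits = 0b101 selects the row
// {0, 128, 1, 128, ...}: input lane 0 goes to output lane 0, input lane 1 to
// output lane 2, and all other lanes are zeroed.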
5195
5196template <class D, HWY_IF_T_SIZE_D(D, 1)>
5197HWY_INLINE svuint8_t LaneIndicesFromByteIndices(D, svuint8_t idx) {
5198 return idx;
5199}
5200template <class D, class DU = RebindToUnsigned<D>, HWY_IF_NOT_T_SIZE_D(D, 1)>
5201HWY_INLINE VFromD<DU> LaneIndicesFromByteIndices(D, svuint8_t idx) {
5202 return PromoteTo(DU(), idx);
5203}
5204
5205// General case when we don't know the vector size, 8 elements at a time.
5206template <class V>
5207HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
5208 const DFromV<V> d;
5209 using T = TFromV<V>;
5210 uint8_t mask_bytes[256 / 8];
5211 StoreMaskBits(d, mask, mask_bytes);
5212
5213 // ShiftLeftLanes is expensive, so we're probably better off storing to memory
5214 // and loading the final result.
5215 alignas(16) T out[2 * MaxLanes(d)];
5216
5217 svbool_t next = svpfalse_b();
5218 size_t input_consumed = 0;
5219 const V iota = Iota(d, 0);
5220 for (size_t i = 0; i < Lanes(d); i += 8) {
5221 uint64_t mask_bits = mask_bytes[i / 8];
5222
5223 // We want to skip past the v lanes already consumed. There is no
5224 // instruction for variable-shift-reg, but we can splice.
5225 const V vH = detail::Splice(v, v, next);
5226 input_consumed += PopCount(mask_bits);
5227 next = detail::GeN(iota, ConvertScalarTo<T>(input_consumed));
5228
5229 const auto idx = detail::LaneIndicesFromByteIndices(
5230 d, IndicesForExpandFromBits(mask_bits));
5231 const V expand = TableLookupLanes(vH, idx);
5232 StoreU(expand, d, out + i);
5233 }
5234 return LoadU(d, out);
5235}
5236
5237} // namespace detail
5238
5239template <class V, HWY_IF_T_SIZE_V(V, 1)>
5240HWY_API V Expand(V v, svbool_t mask) {
5241#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
5242 const DFromV<V> d;
5243 uint8_t mask_bytes[256 / 8];
5244 StoreMaskBits(d, mask, mask_bytes);
5245 const uint64_t maskL = mask_bytes[0];
5246 const uint64_t maskH = mask_bytes[1];
5247
5248 // We want to skip past the v bytes already consumed by expandL. There is no
5249 // instruction for shift-reg by variable bytes, but we can splice. Instead of
5250 // GeN, Not(FirstN()) would also work.
5251 using T = TFromV<V>;
5252 const T countL = static_cast<T>(PopCount(maskL));
5253 const V vH = detail::Splice(v, v, detail::GeN(Iota(d, 0), countL));
5254
5255 const svuint8_t idxL = detail::IndicesForExpandFromBits(maskL);
5256 const svuint8_t idxH = detail::IndicesForExpandFromBits(maskH);
5257 return Combine(d, TableLookupLanes(vH, idxH), TableLookupLanes(v, idxL));
5258#else
5259 return detail::ExpandLoop(v, mask);
5260#endif
5261}
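// Usage sketch (added illustration, not part of the original header): Expand
// is the inverse of Compress. Given input lanes v = {a, b, c, d, ...} and
// mask = {0, 1, 1, 0, ...}, Expand(v, mask) = {0, a, b, 0, ...}: the first
// CountTrue(mask) input lanes are placed, in order, into the mask=true
// positions, and mask=false lanes become zero.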
5262
5263template <class V, HWY_IF_T_SIZE_V(V, 2)>
5264HWY_API V Expand(V v, svbool_t mask) {
5265#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE // 16x8
5266 const DFromV<V> d;
5267 const RebindToUnsigned<decltype(d)> du16;
5268 const Rebind<uint8_t, decltype(d)> du8;
5269 // Convert mask into bitfield via horizontal sum (faster than ORV) of 8 bits.
5270 // Pre-multiply by N so we can use it as an offset for Load.
5271 const svuint16_t bits = Shl(Set(du16, 1), Iota(du16, 3));
5272 const size_t offset = detail::SumOfLanesM(mask, bits);
5273
5274 // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
5275 // the nibble trick used below because not all indices fit within one lane.
5276 alignas(16) static constexpr uint8_t table[8 * 256] = {
5277 // PrintExpand16x8LaneTables
5278 255, 255, 255, 255, 255, 255, 255, 255, //
5279 0, 255, 255, 255, 255, 255, 255, 255, //
5280 255, 0, 255, 255, 255, 255, 255, 255, //
5281 0, 1, 255, 255, 255, 255, 255, 255, //
5282 255, 255, 0, 255, 255, 255, 255, 255, //
5283 0, 255, 1, 255, 255, 255, 255, 255, //
5284 255, 0, 1, 255, 255, 255, 255, 255, //
5285 0, 1, 2, 255, 255, 255, 255, 255, //
5286 255, 255, 255, 0, 255, 255, 255, 255, //
5287 0, 255, 255, 1, 255, 255, 255, 255, //
5288 255, 0, 255, 1, 255, 255, 255, 255, //
5289 0, 1, 255, 2, 255, 255, 255, 255, //
5290 255, 255, 0, 1, 255, 255, 255, 255, //
5291 0, 255, 1, 2, 255, 255, 255, 255, //
5292 255, 0, 1, 2, 255, 255, 255, 255, //
5293 0, 1, 2, 3, 255, 255, 255, 255, //
5294 255, 255, 255, 255, 0, 255, 255, 255, //
5295 0, 255, 255, 255, 1, 255, 255, 255, //
5296 255, 0, 255, 255, 1, 255, 255, 255, //
5297 0, 1, 255, 255, 2, 255, 255, 255, //
5298 255, 255, 0, 255, 1, 255, 255, 255, //
5299 0, 255, 1, 255, 2, 255, 255, 255, //
5300 255, 0, 1, 255, 2, 255, 255, 255, //
5301 0, 1, 2, 255, 3, 255, 255, 255, //
5302 255, 255, 255, 0, 1, 255, 255, 255, //
5303 0, 255, 255, 1, 2, 255, 255, 255, //
5304 255, 0, 255, 1, 2, 255, 255, 255, //
5305 0, 1, 255, 2, 3, 255, 255, 255, //
5306 255, 255, 0, 1, 2, 255, 255, 255, //
5307 0, 255, 1, 2, 3, 255, 255, 255, //
5308 255, 0, 1, 2, 3, 255, 255, 255, //
5309 0, 1, 2, 3, 4, 255, 255, 255, //
5310 255, 255, 255, 255, 255, 0, 255, 255, //
5311 0, 255, 255, 255, 255, 1, 255, 255, //
5312 255, 0, 255, 255, 255, 1, 255, 255, //
5313 0, 1, 255, 255, 255, 2, 255, 255, //
5314 255, 255, 0, 255, 255, 1, 255, 255, //
5315 0, 255, 1, 255, 255, 2, 255, 255, //
5316 255, 0, 1, 255, 255, 2, 255, 255, //
5317 0, 1, 2, 255, 255, 3, 255, 255, //
5318 255, 255, 255, 0, 255, 1, 255, 255, //
5319 0, 255, 255, 1, 255, 2, 255, 255, //
5320 255, 0, 255, 1, 255, 2, 255, 255, //
5321 0, 1, 255, 2, 255, 3, 255, 255, //
5322 255, 255, 0, 1, 255, 2, 255, 255, //
5323 0, 255, 1, 2, 255, 3, 255, 255, //
5324 255, 0, 1, 2, 255, 3, 255, 255, //
5325 0, 1, 2, 3, 255, 4, 255, 255, //
5326 255, 255, 255, 255, 0, 1, 255, 255, //
5327 0, 255, 255, 255, 1, 2, 255, 255, //
5328 255, 0, 255, 255, 1, 2, 255, 255, //
5329 0, 1, 255, 255, 2, 3, 255, 255, //
5330 255, 255, 0, 255, 1, 2, 255, 255, //
5331 0, 255, 1, 255, 2, 3, 255, 255, //
5332 255, 0, 1, 255, 2, 3, 255, 255, //
5333 0, 1, 2, 255, 3, 4, 255, 255, //
5334 255, 255, 255, 0, 1, 2, 255, 255, //
5335 0, 255, 255, 1, 2, 3, 255, 255, //
5336 255, 0, 255, 1, 2, 3, 255, 255, //
5337 0, 1, 255, 2, 3, 4, 255, 255, //
5338 255, 255, 0, 1, 2, 3, 255, 255, //
5339 0, 255, 1, 2, 3, 4, 255, 255, //
5340 255, 0, 1, 2, 3, 4, 255, 255, //
5341 0, 1, 2, 3, 4, 5, 255, 255, //
5342 255, 255, 255, 255, 255, 255, 0, 255, //
5343 0, 255, 255, 255, 255, 255, 1, 255, //
5344 255, 0, 255, 255, 255, 255, 1, 255, //
5345 0, 1, 255, 255, 255, 255, 2, 255, //
5346 255, 255, 0, 255, 255, 255, 1, 255, //
5347 0, 255, 1, 255, 255, 255, 2, 255, //
5348 255, 0, 1, 255, 255, 255, 2, 255, //
5349 0, 1, 2, 255, 255, 255, 3, 255, //
5350 255, 255, 255, 0, 255, 255, 1, 255, //
5351 0, 255, 255, 1, 255, 255, 2, 255, //
5352 255, 0, 255, 1, 255, 255, 2, 255, //
5353 0, 1, 255, 2, 255, 255, 3, 255, //
5354 255, 255, 0, 1, 255, 255, 2, 255, //
5355 0, 255, 1, 2, 255, 255, 3, 255, //
5356 255, 0, 1, 2, 255, 255, 3, 255, //
5357 0, 1, 2, 3, 255, 255, 4, 255, //
5358 255, 255, 255, 255, 0, 255, 1, 255, //
5359 0, 255, 255, 255, 1, 255, 2, 255, //
5360 255, 0, 255, 255, 1, 255, 2, 255, //
5361 0, 1, 255, 255, 2, 255, 3, 255, //
5362 255, 255, 0, 255, 1, 255, 2, 255, //
5363 0, 255, 1, 255, 2, 255, 3, 255, //
5364 255, 0, 1, 255, 2, 255, 3, 255, //
5365 0, 1, 2, 255, 3, 255, 4, 255, //
5366 255, 255, 255, 0, 1, 255, 2, 255, //
5367 0, 255, 255, 1, 2, 255, 3, 255, //
5368 255, 0, 255, 1, 2, 255, 3, 255, //
5369 0, 1, 255, 2, 3, 255, 4, 255, //
5370 255, 255, 0, 1, 2, 255, 3, 255, //
5371 0, 255, 1, 2, 3, 255, 4, 255, //
5372 255, 0, 1, 2, 3, 255, 4, 255, //
5373 0, 1, 2, 3, 4, 255, 5, 255, //
5374 255, 255, 255, 255, 255, 0, 1, 255, //
5375 0, 255, 255, 255, 255, 1, 2, 255, //
5376 255, 0, 255, 255, 255, 1, 2, 255, //
5377 0, 1, 255, 255, 255, 2, 3, 255, //
5378 255, 255, 0, 255, 255, 1, 2, 255, //
5379 0, 255, 1, 255, 255, 2, 3, 255, //
5380 255, 0, 1, 255, 255, 2, 3, 255, //
5381 0, 1, 2, 255, 255, 3, 4, 255, //
5382 255, 255, 255, 0, 255, 1, 2, 255, //
5383 0, 255, 255, 1, 255, 2, 3, 255, //
5384 255, 0, 255, 1, 255, 2, 3, 255, //
5385 0, 1, 255, 2, 255, 3, 4, 255, //
5386 255, 255, 0, 1, 255, 2, 3, 255, //
5387 0, 255, 1, 2, 255, 3, 4, 255, //
5388 255, 0, 1, 2, 255, 3, 4, 255, //
5389 0, 1, 2, 3, 255, 4, 5, 255, //
5390 255, 255, 255, 255, 0, 1, 2, 255, //
5391 0, 255, 255, 255, 1, 2, 3, 255, //
5392 255, 0, 255, 255, 1, 2, 3, 255, //
5393 0, 1, 255, 255, 2, 3, 4, 255, //
5394 255, 255, 0, 255, 1, 2, 3, 255, //
5395 0, 255, 1, 255, 2, 3, 4, 255, //
5396 255, 0, 1, 255, 2, 3, 4, 255, //
5397 0, 1, 2, 255, 3, 4, 5, 255, //
5398 255, 255, 255, 0, 1, 2, 3, 255, //
5399 0, 255, 255, 1, 2, 3, 4, 255, //
5400 255, 0, 255, 1, 2, 3, 4, 255, //
5401 0, 1, 255, 2, 3, 4, 5, 255, //
5402 255, 255, 0, 1, 2, 3, 4, 255, //
5403 0, 255, 1, 2, 3, 4, 5, 255, //
5404 255, 0, 1, 2, 3, 4, 5, 255, //
5405 0, 1, 2, 3, 4, 5, 6, 255, //
5406 255, 255, 255, 255, 255, 255, 255, 0, //
5407 0, 255, 255, 255, 255, 255, 255, 1, //
5408 255, 0, 255, 255, 255, 255, 255, 1, //
5409 0, 1, 255, 255, 255, 255, 255, 2, //
5410 255, 255, 0, 255, 255, 255, 255, 1, //
5411 0, 255, 1, 255, 255, 255, 255, 2, //
5412 255, 0, 1, 255, 255, 255, 255, 2, //
5413 0, 1, 2, 255, 255, 255, 255, 3, //
5414 255, 255, 255, 0, 255, 255, 255, 1, //
5415 0, 255, 255, 1, 255, 255, 255, 2, //
5416 255, 0, 255, 1, 255, 255, 255, 2, //
5417 0, 1, 255, 2, 255, 255, 255, 3, //
5418 255, 255, 0, 1, 255, 255, 255, 2, //
5419 0, 255, 1, 2, 255, 255, 255, 3, //
5420 255, 0, 1, 2, 255, 255, 255, 3, //
5421 0, 1, 2, 3, 255, 255, 255, 4, //
5422 255, 255, 255, 255, 0, 255, 255, 1, //
5423 0, 255, 255, 255, 1, 255, 255, 2, //
5424 255, 0, 255, 255, 1, 255, 255, 2, //
5425 0, 1, 255, 255, 2, 255, 255, 3, //
5426 255, 255, 0, 255, 1, 255, 255, 2, //
5427 0, 255, 1, 255, 2, 255, 255, 3, //
5428 255, 0, 1, 255, 2, 255, 255, 3, //
5429 0, 1, 2, 255, 3, 255, 255, 4, //
5430 255, 255, 255, 0, 1, 255, 255, 2, //
5431 0, 255, 255, 1, 2, 255, 255, 3, //
5432 255, 0, 255, 1, 2, 255, 255, 3, //
5433 0, 1, 255, 2, 3, 255, 255, 4, //
5434 255, 255, 0, 1, 2, 255, 255, 3, //
5435 0, 255, 1, 2, 3, 255, 255, 4, //
5436 255, 0, 1, 2, 3, 255, 255, 4, //
5437 0, 1, 2, 3, 4, 255, 255, 5, //
5438 255, 255, 255, 255, 255, 0, 255, 1, //
5439 0, 255, 255, 255, 255, 1, 255, 2, //
5440 255, 0, 255, 255, 255, 1, 255, 2, //
5441 0, 1, 255, 255, 255, 2, 255, 3, //
5442 255, 255, 0, 255, 255, 1, 255, 2, //
5443 0, 255, 1, 255, 255, 2, 255, 3, //
5444 255, 0, 1, 255, 255, 2, 255, 3, //
5445 0, 1, 2, 255, 255, 3, 255, 4, //
5446 255, 255, 255, 0, 255, 1, 255, 2, //
5447 0, 255, 255, 1, 255, 2, 255, 3, //
5448 255, 0, 255, 1, 255, 2, 255, 3, //
5449 0, 1, 255, 2, 255, 3, 255, 4, //
5450 255, 255, 0, 1, 255, 2, 255, 3, //
5451 0, 255, 1, 2, 255, 3, 255, 4, //
5452 255, 0, 1, 2, 255, 3, 255, 4, //
5453 0, 1, 2, 3, 255, 4, 255, 5, //
5454 255, 255, 255, 255, 0, 1, 255, 2, //
5455 0, 255, 255, 255, 1, 2, 255, 3, //
5456 255, 0, 255, 255, 1, 2, 255, 3, //
5457 0, 1, 255, 255, 2, 3, 255, 4, //
5458 255, 255, 0, 255, 1, 2, 255, 3, //
5459 0, 255, 1, 255, 2, 3, 255, 4, //
5460 255, 0, 1, 255, 2, 3, 255, 4, //
5461 0, 1, 2, 255, 3, 4, 255, 5, //
5462 255, 255, 255, 0, 1, 2, 255, 3, //
5463 0, 255, 255, 1, 2, 3, 255, 4, //
5464 255, 0, 255, 1, 2, 3, 255, 4, //
5465 0, 1, 255, 2, 3, 4, 255, 5, //
5466 255, 255, 0, 1, 2, 3, 255, 4, //
5467 0, 255, 1, 2, 3, 4, 255, 5, //
5468 255, 0, 1, 2, 3, 4, 255, 5, //
5469 0, 1, 2, 3, 4, 5, 255, 6, //
5470 255, 255, 255, 255, 255, 255, 0, 1, //
5471 0, 255, 255, 255, 255, 255, 1, 2, //
5472 255, 0, 255, 255, 255, 255, 1, 2, //
5473 0, 1, 255, 255, 255, 255, 2, 3, //
5474 255, 255, 0, 255, 255, 255, 1, 2, //
5475 0, 255, 1, 255, 255, 255, 2, 3, //
5476 255, 0, 1, 255, 255, 255, 2, 3, //
5477 0, 1, 2, 255, 255, 255, 3, 4, //
5478 255, 255, 255, 0, 255, 255, 1, 2, //
5479 0, 255, 255, 1, 255, 255, 2, 3, //
5480 255, 0, 255, 1, 255, 255, 2, 3, //
5481 0, 1, 255, 2, 255, 255, 3, 4, //
5482 255, 255, 0, 1, 255, 255, 2, 3, //
5483 0, 255, 1, 2, 255, 255, 3, 4, //
5484 255, 0, 1, 2, 255, 255, 3, 4, //
5485 0, 1, 2, 3, 255, 255, 4, 5, //
5486 255, 255, 255, 255, 0, 255, 1, 2, //
5487 0, 255, 255, 255, 1, 255, 2, 3, //
5488 255, 0, 255, 255, 1, 255, 2, 3, //
5489 0, 1, 255, 255, 2, 255, 3, 4, //
5490 255, 255, 0, 255, 1, 255, 2, 3, //
5491 0, 255, 1, 255, 2, 255, 3, 4, //
5492 255, 0, 1, 255, 2, 255, 3, 4, //
5493 0, 1, 2, 255, 3, 255, 4, 5, //
5494 255, 255, 255, 0, 1, 255, 2, 3, //
5495 0, 255, 255, 1, 2, 255, 3, 4, //
5496 255, 0, 255, 1, 2, 255, 3, 4, //
5497 0, 1, 255, 2, 3, 255, 4, 5, //
5498 255, 255, 0, 1, 2, 255, 3, 4, //
5499 0, 255, 1, 2, 3, 255, 4, 5, //
5500 255, 0, 1, 2, 3, 255, 4, 5, //
5501 0, 1, 2, 3, 4, 255, 5, 6, //
5502 255, 255, 255, 255, 255, 0, 1, 2, //
5503 0, 255, 255, 255, 255, 1, 2, 3, //
5504 255, 0, 255, 255, 255, 1, 2, 3, //
5505 0, 1, 255, 255, 255, 2, 3, 4, //
5506 255, 255, 0, 255, 255, 1, 2, 3, //
5507 0, 255, 1, 255, 255, 2, 3, 4, //
5508 255, 0, 1, 255, 255, 2, 3, 4, //
5509 0, 1, 2, 255, 255, 3, 4, 5, //
5510 255, 255, 255, 0, 255, 1, 2, 3, //
5511 0, 255, 255, 1, 255, 2, 3, 4, //
5512 255, 0, 255, 1, 255, 2, 3, 4, //
5513 0, 1, 255, 2, 255, 3, 4, 5, //
5514 255, 255, 0, 1, 255, 2, 3, 4, //
5515 0, 255, 1, 2, 255, 3, 4, 5, //
5516 255, 0, 1, 2, 255, 3, 4, 5, //
5517 0, 1, 2, 3, 255, 4, 5, 6, //
5518 255, 255, 255, 255, 0, 1, 2, 3, //
5519 0, 255, 255, 255, 1, 2, 3, 4, //
5520 255, 0, 255, 255, 1, 2, 3, 4, //
5521 0, 1, 255, 255, 2, 3, 4, 5, //
5522 255, 255, 0, 255, 1, 2, 3, 4, //
5523 0, 255, 1, 255, 2, 3, 4, 5, //
5524 255, 0, 1, 255, 2, 3, 4, 5, //
5525 0, 1, 2, 255, 3, 4, 5, 6, //
5526 255, 255, 255, 0, 1, 2, 3, 4, //
5527 0, 255, 255, 1, 2, 3, 4, 5, //
5528 255, 0, 255, 1, 2, 3, 4, 5, //
5529 0, 1, 255, 2, 3, 4, 5, 6, //
5530 255, 255, 0, 1, 2, 3, 4, 5, //
5531 0, 255, 1, 2, 3, 4, 5, 6, //
5532 255, 0, 1, 2, 3, 4, 5, 6, //
5533 0, 1, 2, 3, 4, 5, 6, 7};
5534 const svuint16_t indices = PromoteTo(du16, Load(du8, table + offset));
5535 return TableLookupLanes(v, indices); // already zeros mask=false lanes
5536#else
5537 return detail::ExpandLoop(v, mask);
5538#endif
5539}
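// Worked example (added illustration, not part of the original header): bit i
// of the mask contributes 1 << (3 + i) = 8 << i to the sum, so offset equals
// 8 * mask_bits, the byte offset of row mask_bits in the 8-byte rows above.
// A mask with lanes {0, 2} set gives offset 8 * (1 + 4) = 40, i.e. row 5:
// {0, 255, 1, 255, 255, 255, 255, 255}.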
5540
5541template <class V, HWY_IF_T_SIZE_V(V, 4)>
5542HWY_API V Expand(V v, svbool_t mask) {
5543#if HWY_TARGET == HWY_SVE_256 || HWY_IDE // 32x8
5544 const DFromV<V> d;
5545 const RebindToUnsigned<decltype(d)> du32;
5546 // Convert mask into bitfield via horizontal sum (faster than ORV).
5547 const svuint32_t bits = Shl(Set(du32, 1), Iota(du32, 0));
5548 const size_t code = detail::SumOfLanesM(mask, bits);
5549
5550 alignas(16) constexpr uint32_t packed_array[256] = {
5551 // PrintExpand32x8.
5552 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
5553 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
5554 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
5555 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
5556 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
5557 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
5558 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
5559 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
5560 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
5561 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
5562 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
5563 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
5564 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
5565 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
5566 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
5567 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
5568 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
5569 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
5570 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
5571 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
5572 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
5573 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
5574 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
5575 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
5576 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
5577 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
5578 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
5579 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
5580 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
5581 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
5582 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
5583 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
5584 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
5585 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
5586 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
5587 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
5588 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
5589 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
5590 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
5591 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
5592 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
5593 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
5594 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210};
5595
5596 // For lane i, shift the i-th 4-bit index down and mask with 0xF because
5597 // svtbl zeros outputs if the index is out of bounds.
5598 const svuint32_t packed = Set(du32, packed_array[code]);
5599 const svuint32_t indices = detail::AndN(Shr(packed, svindex_u32(0, 4)), 0xF);
5600 return TableLookupLanes(v, indices); // already zeros mask=false lanes
5601#elif HWY_TARGET == HWY_SVE2_128 // 32x4
5602 const DFromV<V> d;
5603 const RebindToUnsigned<decltype(d)> du32;
5604 // Convert mask into bitfield via horizontal sum (faster than ORV).
5605 const svuint32_t bits = Shl(Set(du32, 1), Iota(du32, 0));
5606 const size_t offset = detail::SumOfLanesM(mask, bits);
5607
5608 alignas(16) constexpr uint32_t packed_array[16] = {
5609 // PrintExpand64x4Nibble - same for 32x4.
5610 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
5611 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
5612 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
5613
5614 // For lane i, shift the i-th 4-bit index down and mask with 0xF because
5615 // svtbl zeros outputs if the index is out of bounds.
5616 const svuint32_t packed = Set(du32, packed_array[offset]);
5617 const svuint32_t indices = detail::AndN(Shr(packed, svindex_u32(0, 4)), 0xF);
5618 return TableLookupLanes(v, indices); // already zeros mask=false lanes
5619#else
5620 return detail::ExpandLoop(v, mask);
5621#endif
5622}
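// Worked example (added illustration, not part of the original header), for
// the 32x4 path: a mask with lanes {0, 3} set yields offset 1 + 8 = 9 and
// packed_array[9] = 0x00001ff0. Nibble i (from the LSB) is the table index
// for lane i, so the indices are {0, 15, 15, 1}: lane 0 reads v[0], lane 3
// reads v[1], and the out-of-bounds index 15 zeroes lanes 1 and 2.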
5623
5624template <class V, HWY_IF_T_SIZE_V(V, 8)>
5625HWY_API V Expand(V v, svbool_t mask) {
5626#if HWY_TARGET == HWY_SVE_256 || HWY_IDE // 64x4
5627 const DFromV<V> d;
5628 const RebindToUnsigned<decltype(d)> du64;
5629
5630 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
5631 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
5632 // SetTableIndices.
5633 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
5634 const size_t offset = detail::SumOfLanesM(mask, bits);
5635
5636 alignas(16) static constexpr uint64_t table[4 * 16] = {
5637 // PrintExpand64x4Tables - small enough to store uncompressed.
5638 255, 255, 255, 255, 0, 255, 255, 255, 255, 0, 255, 255, 0, 1, 255, 255,
5639 255, 255, 0, 255, 0, 255, 1, 255, 255, 0, 1, 255, 0, 1, 2, 255,
5640 255, 255, 255, 0, 0, 255, 255, 1, 255, 0, 255, 1, 0, 1, 255, 2,
5641 255, 255, 0, 1, 0, 255, 1, 2, 255, 0, 1, 2, 0, 1, 2, 3};
5642 // This already zeros mask=false lanes.
5643 return TableLookupLanes(v, SetTableIndices(d, table + offset));
5644#elif HWY_TARGET == HWY_SVE2_128 // 64x2
5645 // Same as Compress, just zero out the mask=false lanes.
5646 return IfThenElseZero(mask, Compress(v, mask));
5647#else
5648 return detail::ExpandLoop(v, mask);
5649#endif
5650}
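// Worked example (added illustration, not part of the original header), for
// the 64x4 path: bit i of the mask contributes 4 << i, so offset = 4 *
// mask_bits selects a 4-entry row. A mask with lanes {0, 2, 3} set gives
// mask_bits = 13 and row {0, 255, 1, 2}: lane 0 reads v[0], lanes 2 and 3
// read v[1] and v[2], and the out-of-bounds index 255 zeroes lane 1.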
5651
5652// ------------------------------ LoadExpand
5653
5654template <class D>
5655HWY_API VFromD<D> LoadExpand(D d, MFromD<D> mask,
5656 const TFromD<D>* HWY_RESTRICT unaligned) {
5657 return Expand(LoadU(d, unaligned), mask);
5658}
5659
5660// ------------------------------ MulEven (InterleaveEven)
5661
5662#if HWY_SVE_HAVE_2
5663namespace detail {
5664#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
5665 HWY_API HWY_SVE_V(BASE, BITS) \
5666 NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
5667 return sv##OP##_##CHAR##BITS(a, b); \
5668 }
5669
5670HWY_SVE_FOREACH_UI16(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
5671HWY_SVE_FOREACH_UI32(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
5672HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
5673HWY_SVE_FOREACH_UI16(HWY_SVE_MUL_EVEN, MulOddNative, mullt)
5674HWY_SVE_FOREACH_UI32(HWY_SVE_MUL_EVEN, MulOddNative, mullt)
5675HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulOddNative, mullt)
5676#undef HWY_SVE_MUL_EVEN
5677} // namespace detail
5678#endif
5679
5680template <class V, class DW = RepartitionToWide<DFromV<V>>,
5681 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
5682HWY_API VFromD<DW> MulEven(const V a, const V b) {
5683#if HWY_SVE_HAVE_2
5684 return BitCast(DW(), detail::MulEvenNative(a, b));
5685#else
5686 const auto lo = Mul(a, b);
5687 const auto hi = MulHigh(a, b);
5688 return BitCast(DW(), detail::InterleaveEven(lo, hi));
5689#endif
5690}
5691
5692template <class V, class DW = RepartitionToWide<DFromV<V>>,
5693 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
5694HWY_API VFromD<DW> MulOdd(const V a, const V b) {
5695#if HWY_SVE_HAVE_2
5696 return BitCast(DW(), detail::MulOddNative(a, b));
5697#else
5698 const auto lo = Mul(a, b);
5699 const auto hi = MulHigh(a, b);
5700 return BitCast(DW(), detail::InterleaveOdd(lo, hi));
5701#endif
5702}
5703
5704HWY_API svint64_t MulEven(const svint64_t a, const svint64_t b) {
5705 const auto lo = Mul(a, b);
5706 const auto hi = MulHigh(a, b);
5707 return detail::InterleaveEven(lo, hi);
5708}
5709
5710HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
5711 const auto lo = Mul(a, b);
5712 const auto hi = MulHigh(a, b);
5713 return detail::InterleaveEven(lo, hi);
5714}
5715
5716HWY_API svint64_t MulOdd(const svint64_t a, const svint64_t b) {
5717 const auto lo = Mul(a, b);
5718 const auto hi = MulHigh(a, b);
5719 return detail::InterleaveOdd(lo, hi);
5720}
5721
5722HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
5723 const auto lo = Mul(a, b);
5724 const auto hi = MulHigh(a, b);
5725 return detail::InterleaveOdd(lo, hi);
5726}
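// Usage sketch (added illustration, not part of the original header): for
// 8/16/32-bit inputs, MulEven/MulOdd return the full double-width products of
// the even/odd lanes. The 64-bit overloads instead return, for each even lane
// i, the 128-bit product of a[i] * b[i] as {low, high} halves in lanes
// {i, i + 1} (and MulOdd likewise for odd lanes).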
5727
5728// ------------------------------ WidenMulPairwiseAdd
5729
5730template <size_t N, int kPow2>
5731HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
5732 VBF16 b) {
5733#if HWY_SVE_HAVE_F32_TO_BF16C
5734 const svfloat32_t even = svbfmlalb_f32(Zero(df32), a, b);
5735 return svbfmlalt_f32(even, a, b);
5736#else
5737 const RebindToUnsigned<decltype(df32)> du32;
5738 // Using shift/and instead of Zip leads to the odd/even order that
5739 // RearrangeToOddPlusEven prefers.
5740 using VU32 = VFromD<decltype(du32)>;
5741 const VU32 odd = Set(du32, 0xFFFF0000u);
5742 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5743 const VU32 ao = And(BitCast(du32, a), odd);
5744 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5745 const VU32 bo = And(BitCast(du32, b), odd);
5746 return MulAdd(BitCast(df32, ae), BitCast(df32, be),
5747 Mul(BitCast(df32, ao), BitCast(df32, bo)));
5748#endif // HWY_SVE_HAVE_F32_TO_BF16C
5749}
5750
5751template <size_t N, int kPow2>
5752HWY_API svint32_t WidenMulPairwiseAdd(Simd<int32_t, N, kPow2> d32, svint16_t a,
5753 svint16_t b) {
5754#if HWY_SVE_HAVE_2
5755 (void)d32;
5756 return svmlalt_s32(svmullb_s32(a, b), a, b);
5757#else
5758 const svbool_t pg = detail::PTrue(d32);
5759 // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5760 // Fortunately SVE has sign-extension for the even lanes.
5761 const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
5762 const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
5763 const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
5764 const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
5765 return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be);
5766#endif
5767}
5768
5769template <size_t N, int kPow2>
5770HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
5771 svuint16_t a, svuint16_t b) {
5772#if HWY_SVE_HAVE_2
5773 (void)d32;
5774 return svmlalt_u32(svmullb_u32(a, b), a, b);
5775#else
5776 const svbool_t pg = detail::PTrue(d32);
5777 // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5778 // Fortunately SVE has zero-extension (svexth_u32) for the even lanes.
5779 const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
5780 const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
5781 const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
5782 const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
5783 return svmla_u32_x(pg, svmul_u32_x(pg, ao, bo), ae, be);
5784#endif
5785}
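// All three overloads compute the same pairwise widening dot product; in
// scalar terms (added illustration, not part of the original header):
//   out[i] = Widen(a[2*i]) * Widen(b[2*i]) + Widen(a[2*i+1]) * Widen(b[2*i+1])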
5786
5787// ------------------------------ SatWidenMulAccumFixedPoint
5788
5789#if HWY_SVE_HAVE_2
5790
5791#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5792#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5793#else
5794#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5795#endif
5796
5797template <class DI32, HWY_IF_I32_D(DI32)>
5798HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
5799 VFromD<Rebind<int16_t, DI32>> a,
5800 VFromD<Rebind<int16_t, DI32>> b,
5801 VFromD<DI32> sum) {
5802 return svqdmlalb_s32(sum, detail::ZipLowerSame(a, a),
5803 detail::ZipLowerSame(b, b));
5804}
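// Semantics sketch (added illustration, not part of the original header):
// ZipLowerSame(a, a) duplicates each i16 lane, so the bottom (even) elements
// seen by svqdmlalb are exactly a[i] and b[i]. Each i32 lane thus becomes
// SatAdd(sum[i], sat(2 * a[i] * b[i])), the usual Q15 fixed-point
// multiply-accumulate.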
5805
5806#endif // HWY_SVE_HAVE_2
5807
5808// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5809
5810template <size_t N, int kPow2>
5811HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
5812 VBF16 a, VBF16 b,
5813 const svfloat32_t sum0,
5814 svfloat32_t& sum1) {
5815#if HWY_SVE_HAVE_BF16_FEATURE
5816 (void)df32;
5817 sum1 = svbfmlalt_f32(sum1, a, b);
5818 return svbfmlalb_f32(sum0, a, b);
5819#else
5820 const RebindToUnsigned<decltype(df32)> du32;
5821 // Using shift/and instead of Zip leads to the odd/even order that
5822 // RearrangeToOddPlusEven prefers.
5823 using VU32 = VFromD<decltype(du32)>;
5824 const VU32 odd = Set(du32, 0xFFFF0000u);
5825 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5826 const VU32 ao = And(BitCast(du32, a), odd);
5827 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5828 const VU32 bo = And(BitCast(du32, b), odd);
5829 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5830 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5831#endif // HWY_SVE_HAVE_BF16_FEATURE
5832}
5833
5834template <size_t N, int kPow2>
5835HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
5836 svint16_t a, svint16_t b,
5837 const svint32_t sum0,
5838 svint32_t& sum1) {
5839#if HWY_SVE_HAVE_2
5840 (void)d32;
5841 sum1 = svmlalt_s32(sum1, a, b);
5842 return svmlalb_s32(sum0, a, b);
5843#else
5844 const svbool_t pg = detail::PTrue(d32);
5845 // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5846 // Fortunately SVE has sign-extension for the even lanes.
5847 const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
5848 const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
5849 const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
5850 const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
5851 sum1 = svmla_s32_x(pg, sum1, ao, bo);
5852 return svmla_s32_x(pg, sum0, ae, be);
5853#endif
5854}
5855
5856template <size_t N, int kPow2>
5857HWY_API svuint32_t ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32,
5858 svuint16_t a, svuint16_t b,
5859 const svuint32_t sum0,
5860 svuint32_t& sum1) {
5861#if HWY_SVE_HAVE_2
5862 (void)d32;
5863 sum1 = svmlalt_u32(sum1, a, b);
5864 return svmlalb_u32(sum0, a, b);
5865#else
5866 const svbool_t pg = detail::PTrue(d32);
5867 // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5868 // Fortunately SVE has zero-extension (svexth_u32) for the even lanes.
5869 const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
5870 const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
5871 const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
5872 const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
5873 sum1 = svmla_u32_x(pg, sum1, ao, bo);
5874 return svmla_u32_x(pg, sum0, ae, be);
5875#endif
5876}
5877
5878// ------------------------------ RearrangeToOddPlusEven
5879template <class VW>
5880HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
5881 // sum0 is the sum of bottom/even lanes and sum1 of top/odd lanes.
5882 return Add(sum0, sum1);
5883}
5884
5885// ------------------------------ SumOfMulQuadAccumulate
5886
5887#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5888#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5889#else
5890#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5891#endif
5892
5893template <class DI32, HWY_IF_I32_D(DI32)>
5894HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, svint8_t a,
5895 svint8_t b, svint32_t sum) {
5896 return svdot_s32(sum, a, b);
5897}
5898
5899#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5900#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5901#else
5902#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5903#endif
5904
5905template <class DU32, HWY_IF_U32_D(DU32)>
5906HWY_API VFromD<DU32> SumOfMulQuadAccumulate(DU32 /*du32*/, svuint8_t a,
5907 svuint8_t b, svuint32_t sum) {
5908 return svdot_u32(sum, a, b);
5909}
5910
5911#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5912#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5913#else
5914#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5915#endif
5916
5917template <class DI32, HWY_IF_I32_D(DI32)>
5918HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, svuint8_t a_u,
5919 svint8_t b_i, svint32_t sum) {
5920 // TODO: use svusdot_u32 on SVE targets that require support for both SVE2
5921 // and SVE I8MM.
5922
5923 const RebindToUnsigned<decltype(di32)> du32;
5924 const Repartition<uint8_t, decltype(di32)> du8;
5925
5926 const auto b_u = BitCast(du8, b_i);
5927 const auto result_sum0 = svdot_u32(BitCast(du32, sum), a_u, b_u);
5928 const auto result_sum1 =
5929 ShiftLeft<8>(svdot_u32(Zero(du32), a_u, ShiftRight<7>(b_u)));
5930
5931 return BitCast(di32, Sub(result_sum0, result_sum1));
5932}
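// Why this works (added explanation, not part of the original header):
// reinterpreting b_i as unsigned adds 256 to every negative lane, so the
// unsigned dot product overcounts by 256 * a_u[k] exactly where b_i[k] < 0.
// ShiftRight<7>(b_u) is 1 in those lanes and 0 elsewhere, so result_sum1 =
// 256 * dot(a_u, sign_bits) equals the overcount, and subtracting it leaves
// the signed dot product.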
5933
5934#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5935#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5936#else
5937#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5938#endif
5939
5940template <class DI64, HWY_IF_I64_D(DI64)>
5941HWY_API VFromD<DI64> SumOfMulQuadAccumulate(DI64 /*di64*/, svint16_t a,
5942 svint16_t b, svint64_t sum) {
5943 return svdot_s64(sum, a, b);
5944}
5945
5946#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5947#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5948#else
5949#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5950#endif
5951
5952template <class DU64, HWY_IF_U64_D(DU64)>
5953HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
5954 svuint16_t b, svuint64_t sum) {
5955 return svdot_u64(sum, a, b);
5956}
5957
5958// ------------------------------ AESRound / CLMul
5959
5960// Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
5961// baseline, in which case we check for AES support at runtime.
5962#if defined(__ARM_FEATURE_SVE2_AES) || \
5963 (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH && HWY_BASELINE_SVE2 == 0)
5964
5965// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
5966#ifdef HWY_NATIVE_AES
5967#undef HWY_NATIVE_AES
5968#else
5969#define HWY_NATIVE_AES
5970#endif
5971
5972HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
5973 // It is not clear whether E and MC fuse like they did on NEON.
5974 return Xor(svaesmc_u8(svaese_u8(state, svdup_n_u8(0))), round_key);
5975}
5976
5977HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) {
5978 return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
5979}
5980
5981HWY_API svuint8_t AESInvMixColumns(svuint8_t state) {
5982 return svaesimc_u8(state);
5983}
5984
5985HWY_API svuint8_t AESRoundInv(svuint8_t state, svuint8_t round_key) {
5986 return Xor(svaesimc_u8(svaesd_u8(state, svdup_n_u8(0))), round_key);
5987}
5988
5989HWY_API svuint8_t AESLastRoundInv(svuint8_t state, svuint8_t round_key) {
5990 return Xor(svaesd_u8(state, svdup_n_u8(0)), round_key);
5991}
5992
5993template <uint8_t kRcon>
5994HWY_API svuint8_t AESKeyGenAssist(svuint8_t v) {
5995 alignas(16) static constexpr uint8_t kRconXorMask[16] = {
5996 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
5997 alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
5998 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
5999 const DFromV<decltype(v)> d;
6000 const Repartition<uint32_t, decltype(d)> du32;
6001 const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
6002 const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask));
6003 return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
6004}
6005
6006HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
6007 return svpmullb_pair(a, b);
6008}
6009
6010HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
6011 return svpmullt_pair(a, b);
6012}
6013
6014#endif // __ARM_FEATURE_SVE2_AES
6015
6016// ------------------------------ Lt128
6017
6018namespace detail {
6019#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
6020 template <size_t N, int kPow2> \
6021 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
6022 return sv##OP##_b##BITS(m, m); \
6023 }
6024
6025HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool
6026HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
6027#undef HWY_SVE_DUP
6028
6029#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
6030template <class D>
6031HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
6032 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6033 const svbool_t eqHx = Eq(a, b); // only odd lanes used
6034 // Convert to vector: more pipelines can execute vector TRN* instructions
6035 // than the predicate version.
6036 const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
6037 // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
6038 // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
6039 const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
6040 // Duplicate upper lane into lower.
6041 return DupOdd(ltHx);
6042}
6043#endif
6044} // namespace detail
6045
6046template <class D>
6047HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
6048#if HWY_TARGET == HWY_SVE_256
6049 return MaskFromVec(detail::Lt128Vec(d, a, b));
6050#else
6051 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6052 const svbool_t eqHx = Eq(a, b); // only odd lanes used
6053 const svbool_t ltHL = Lt(a, b);
6054 // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
6055 const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
6056 // Duplicate upper lane into lower.
6057 return detail::DupOddB(d, ltHx);
6058#endif // HWY_TARGET != HWY_SVE_256
6059}
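// Semantics sketch (added illustration, not part of the original header):
// each 128-bit block holds a low u64 lane L and a high lane H. a < b holds
// iff aH < bH, or aH == bH and aL < bL; the resulting bit is broadcast to
// both lanes of the block so the mask can be fed directly to IfThenElse.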
6060
6061// ------------------------------ Lt128Upper
6062
6063template <class D>
6064HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
6065 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6066 const svbool_t ltHL = Lt(a, b);
6067 return detail::DupOddB(d, ltHL);
6068}
6069
6070// ------------------------------ Eq128, Ne128
6071
6072#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
6073namespace detail {
6074
6075template <class D>
6076HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
6077 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6078 // Convert to vector: more pipelines can execute vector TRN* instructions
6079 // than the predicate version.
6080 const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
6081 // Duplicate upper and lower.
6082 const svuint64_t eqHH = DupOdd(eqHL);
6083 const svuint64_t eqLL = DupEven(eqHL);
6084 return And(eqLL, eqHH);
6085}
6086
6087template <class D>
6088HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) {
6089 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6090 // Convert to vector: more pipelines can execute vector TRN* instructions
6091 // than the predicate version.
6092 const svuint64_t neHL = VecFromMask(d, Ne(a, b));
6093 // Duplicate upper and lower.
6094 const svuint64_t neHH = DupOdd(neHL);
6095 const svuint64_t neLL = DupEven(neHL);
6096 return Or(neLL, neHH);
6097}
6098
6099} // namespace detail
6100#endif
6101
6102template <class D>
6103HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
6104#if HWY_TARGET == HWY_SVE_256
6105 return MaskFromVec(detail::Eq128Vec(d, a, b));
6106#else
6107 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6108 const svbool_t eqHL = Eq(a, b);
6109 const svbool_t eqHH = detail::DupOddB(d, eqHL);
6110 const svbool_t eqLL = detail::DupEvenB(d, eqHL);
6111 return And(eqLL, eqHH);
6112#endif // HWY_TARGET != HWY_SVE_256
6113}
6114
6115template <class D>
6116HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) {
6117#if HWY_TARGET == HWY_SVE_256
6118 return MaskFromVec(detail::Ne128Vec(d, a, b));
6119#else
6120 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6121 const svbool_t neHL = Ne(a, b);
6122 const svbool_t neHH = detail::DupOddB(d, neHL);
6123 const svbool_t neLL = detail::DupEvenB(d, neHL);
6124 return Or(neLL, neHH);
6125#endif // HWY_TARGET != HWY_SVE_256
6126}
6127
6128// ------------------------------ Eq128Upper, Ne128Upper
6129
6130template <class D>
6131HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
6132 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6133 const svbool_t eqHL = Eq(a, b);
6134 return detail::DupOddB(d, eqHL);
6135}
6136
6137template <class D>
6138HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) {
6139 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
6140 const svbool_t neHL = Ne(a, b);
6141 return detail::DupOddB(d, neHL);
6142}
6143
6144// ------------------------------ Min128, Max128 (Lt128)
6145
6146template <class D>
6147HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
6148#if HWY_TARGET == HWY_SVE_256
6149 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
6150#else
6151 return IfThenElse(Lt128(d, a, b), a, b);
6152#endif
6153}
6154
6155template <class D>
6156HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
6157#if HWY_TARGET == HWY_SVE_256
6158 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
6159#else
6160 return IfThenElse(Lt128(d, b, a), a, b);
6161#endif
6162}
6163
6164template <class D>
6165HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
6166 return IfThenElse(Lt128Upper(d, a, b), a, b);
6167}
6168
6169template <class D>
6170HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
6171 return IfThenElse(Lt128Upper(d, b, a), a, b);
6172}
6173
6174// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
6175
6176#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
6177#undef HWY_NATIVE_LEADING_ZERO_COUNT
6178#else
6179#define HWY_NATIVE_LEADING_ZERO_COUNT
6180#endif
6181
6182#define HWY_SVE_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP) \
6183 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
6184 const DFromV<decltype(v)> d; \
6185 return BitCast(d, sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v)); \
6186 }
6187
6188HWY_SVE_FOREACH_UI(HWY_SVE_LEADING_ZERO_COUNT, LeadingZeroCount, clz)
6189#undef HWY_SVE_LEADING_ZERO_COUNT
6190
6191template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
6192HWY_API V TrailingZeroCount(V v) {
6193 return LeadingZeroCount(ReverseBits(v));
6194}
6195
6196template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
6197HWY_API V HighestSetBitIndex(V v) {
6198 const DFromV<decltype(v)> d;
6199 using T = TFromD<decltype(d)>;
6200 return BitCast(d, Sub(Set(d, T{sizeof(T) * 8 - 1}), LeadingZeroCount(v)));
6201}
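// Worked example (added illustration, not part of the original header): for a
// uint32_t lane with value 8, LeadingZeroCount yields 28, so
// HighestSetBitIndex returns 31 - 28 = 3. TrailingZeroCount(8) is also 3:
// ReverseBits turns trailing zeros into leading zeros.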
6202
6203// ================================================== END MACROS
6204#undef HWY_SVE_ALL_PTRUE
6205#undef HWY_SVE_D
6206#undef HWY_SVE_FOREACH
6207#undef HWY_SVE_FOREACH_BF16
6208#undef HWY_SVE_FOREACH_BF16_UNCONDITIONAL
6209#undef HWY_SVE_FOREACH_F
6210#undef HWY_SVE_FOREACH_F16
6211#undef HWY_SVE_FOREACH_F32
6212#undef HWY_SVE_FOREACH_F3264
6213#undef HWY_SVE_FOREACH_F64
6214#undef HWY_SVE_FOREACH_I
6215#undef HWY_SVE_FOREACH_I08
6216#undef HWY_SVE_FOREACH_I16
6217#undef HWY_SVE_FOREACH_I32
6218#undef HWY_SVE_FOREACH_I64
6219#undef HWY_SVE_FOREACH_IF
6220#undef HWY_SVE_FOREACH_U
6221#undef HWY_SVE_FOREACH_U08
6222#undef HWY_SVE_FOREACH_U16
6223#undef HWY_SVE_FOREACH_U32
6224#undef HWY_SVE_FOREACH_U64
6225#undef HWY_SVE_FOREACH_UI
6226#undef HWY_SVE_FOREACH_UI08
6227#undef HWY_SVE_FOREACH_UI16
6228#undef HWY_SVE_FOREACH_UI32
6229#undef HWY_SVE_FOREACH_UI64
6230#undef HWY_SVE_FOREACH_UIF3264
6231#undef HWY_SVE_HAVE_2
6232#undef HWY_SVE_IF_EMULATED_D
6233#undef HWY_SVE_IF_NOT_EMULATED_D
6234#undef HWY_SVE_PTRUE
6235#undef HWY_SVE_RETV_ARGMVV
6236#undef HWY_SVE_RETV_ARGPV
6237#undef HWY_SVE_RETV_ARGPVN
6238#undef HWY_SVE_RETV_ARGPVV
6239#undef HWY_SVE_RETV_ARGV
6240#undef HWY_SVE_RETV_ARGVN
6241#undef HWY_SVE_RETV_ARGVV
6242#undef HWY_SVE_RETV_ARGVVV
6243#undef HWY_SVE_T
6244#undef HWY_SVE_UNDEFINED
6245#undef HWY_SVE_V
6246
6247// NOLINTNEXTLINE(google-readability-namespace-comments)
6248} // namespace HWY_NAMESPACE
6249} // namespace hwy
6250HWY_AFTER_NAMESPACE();
#define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:247
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:160
HWY_AFTER_NAMESPACE()
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1977
#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1071
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:101
#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:6019
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1270
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2611
#define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:106
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2584
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:85
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2794
#define HWY_SVE_REVERSE_BITS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3611
#define HWY_SVE_MASKED_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1899
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2011
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:437
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1208
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:426
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:122
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:182
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:315
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3470
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:883
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1928
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:380
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:253
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3295
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:174
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:4118
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:168
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:215
#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3155
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:93
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:483
#define HWY_SVE_PTRUE(BITS)
Definition arm_sve-inl.h:289
#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1993
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3045
#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2061
#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2045
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1282
#define HWY_SVE_GET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:538
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:147
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:221
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3380
#define HWY_SVE_TABLE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3394
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:233
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1101
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2080
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3752
#define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:4253
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1547
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:811
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:134
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:195
#define HWY_SVE_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:6182
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2755
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1320
#define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1937
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:745
#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1743
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:178
HWY_BEFORE_NAMESPACE()
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:86
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:128
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1543
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1635
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:338
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:5664
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2030
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:164
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1889
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1047
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3165
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:156
#define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:4237
#define HWY_SVE_CREATE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:504
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:239
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:211
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:755
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:3237
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_IF_LANES_GT(kN, lanes)
Definition base.h:618
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_TARGET
Definition detect_targets.h:543
#define HWY_SVE_256
Definition detect_targets.h:90
#define HWY_SVE2_128
Definition detect_targets.h:89
HWY_INLINE V ExpandLoop(V v, svbool_t mask)
Definition arm_sve-inl.h:5207
HWY_INLINE Vec256< T > BroadcastLane(hwy::SizeTag< 0 >, Vec256< T > v)
Definition x86_256-inl.h:4186
HWY_INLINE svuint8_t BoolFromMask(svbool_t m)
Definition arm_sve-inl.h:4850
HWY_INLINE VFromD< RepartitionToWide< DFromV< V > > > SumsOf2(hwy::SignedTag, hwy::SizeTag< 1 >, V v)
Definition arm_neon-inl.h:1959
HWY_INLINE svuint8_t LaneIndicesFromByteIndices(D, svuint8_t idx)
Definition arm_sve-inl.h:5197
HWY_INLINE svuint64_t BitsFromBool(svuint8_t x)
Definition arm_sve-inl.h:4871
svbool_t MaskLowerHalf(D d)
Definition arm_sve-inl.h:2939
HWY_INLINE Mask128< T > Not(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition x86_128-inl.h:1653
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition rvv-inl.h:2966
svbool_t MakeMask(D d)
Definition arm_sve-inl.h:359
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1445
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 > d)
Definition arm_sve-inl.h:3442
VI SaturateI(VI v)
Definition arm_sve-inl.h:2259
HWY_API svbool_t PFalse()
Definition arm_sve-inl.h:352
svbool_t MaskUpperHalf(D d)
Definition arm_sve-inl.h:3033
VFromD< D > Ext(D d, VFromD< Half< D > > v)
Definition rvv-inl.h:764
VU SaturateU(VU v)
Definition arm_sve-inl.h:2253
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1519
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1482
HWY_INLINE If< IsConst< T >(), const uint16_t *, uint16_t * > U16LanePointer(T *p)
Definition ops/shared-inl.h:139
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:1402
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6088
HWY_INLINE VFromD< D > PromoteOddTo(hwy::FloatTag to_type_tag, hwy::SizeTag< 4 > to_lane_size_tag, hwy::FloatTag from_type_tag, D d_to, svfloat16_t v)
Definition arm_sve-inl.h:4419
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6031
HWY_INLINE svuint8_t IndicesForExpandFromBits(uint64_t mask_bits)
Definition arm_sve-inl.h:4933
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag, hwy::SizeTag< 1 >, svint8_t v)
Definition arm_sve-inl.h:982
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition ops/shared-inl.h:146
HWY_INLINE VFromD< D > PromoteEvenTo(hwy::SignedTag, hwy::SizeTag< 2 >, hwy::SignedTag, D d_to, svint8_t v)
Definition arm_sve-inl.h:4334
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition ops/shared-inl.h:325
HWY_INLINE VFromD< D > BitCastFromByte(D, VFromD< D > v)
Definition arm_neon-inl.h:1441
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition rvv-inl.h:2972
HWY_INLINE V Splice(V hi, V lo, svbool_t mask)
Definition arm_sve-inl.h:2621
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:6076
HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v)
Definition arm_sve-inl.h:2690
HWY_INLINE size_t AllHardwareLanes()
Definition arm_sve-inl.h:266
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_API void LoadInterleaved4(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2, VFromD< D > &v3)
Definition arm_neon-inl.h:9128
HWY_API void ScatterOffset(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2624
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API V SaturatedNeg(V v)
Definition generic_ops-inl.h:897
HWY_API V MaskedMaxOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API V MaskedDivOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1512
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API svbool_t MaskFalse(const D)
Definition arm_sve-inl.h:372
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
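IfThenElse selects per lane: `yes` where the mask is set, otherwise `no`. Combined with comparisons such as Lt, it expresses branch-free conditionals. A minimal sketch (the helper name Relu is illustrative):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Branch-free ReLU: negative lanes are replaced by zero.
template <class V>
V Relu(V v) {
  const hn::DFromV<V> d;
  return hn::IfThenElse(hn::Lt(v, hn::Zero(d)), hn::Zero(d), v);
}

IfThenZeroElse(IsNegative(v), v), both also listed on this page, would be an equivalent and slightly shorter spelling.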
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
V Shl(V a, V b)
Definition generic_ops-inl.h:7322
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API Vec128< int64_t > SaturatedAbs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3288
HWY_API Vec128< uint8_t > AESLastRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7447
HWY_API V MaskedModOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:4666
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
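ReorderWidenMulAccumulate multiplies pairs of narrow lanes (e.g. bfloat16), widening into two f32 accumulators whose lane order is unspecified until RearrangeToOddPlusEven combines them. A dot-product sketch, assuming count is a multiple of the bf16 lane count:

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

float DotBF16(const hwy::bfloat16_t* HWY_RESTRICT a,
              const hwy::bfloat16_t* HWY_RESTRICT b, size_t count) {
  const hn::ScalableTag<float> df32;
  const hn::Repartition<hwy::bfloat16_t, decltype(df32)> dbf;
  auto sum0 = hn::Zero(df32), sum1 = hn::Zero(df32);
  for (size_t i = 0; i < count; i += hn::Lanes(dbf)) {
    // Returns the updated sum0; sum1 is updated via the reference parameter.
    sum0 = hn::ReorderWidenMulAccumulate(df32, hn::LoadU(dbf, a + i),
                                         hn::LoadU(dbf, b + i), sum0, sum1);
  }
  return hn::ReduceSum(df32, hn::RearrangeToOddPlusEven(sum0, sum1));
}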
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2902
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API V AddSub(V a, V b)
Definition generic_ops-inl.h:775
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
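MulAdd(mul, x, add) computes mul * x + add, fused on targets that support it; chaining it yields Horner-form polynomial evaluation. A sketch with made-up coefficients:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Evaluates (c2*x + c1)*x + c0 using two fused multiply-adds.
template <class D, class V = hn::VFromD<D>>
V EvalPoly(D d, V x) {
  const V c0 = hn::Set(d, 1.0f);
  const V c1 = hn::Set(d, 0.5f);
  const V c2 = hn::Set(d, 0.25f);
  return hn::MulAdd(hn::MulAdd(c2, x, c1), x, c0);
}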
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API TFromD< D > ReduceMax(D d, VFromD< D > v)
Definition arm_sve-inl.h:3213
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API void ScatterIndex(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2643
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
HWY_API VFromD< DI32 > SatWidenMulAccumFixedPoint(DI32, VFromD< Rebind< int16_t, DI32 > > a, VFromD< Rebind< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition arm_neon-inl.h:6496
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API svbool_t DemoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1420
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
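LoadInterleaved3/StoreInterleaved3 convert between packed array-of-structs data (e.g. RGB pixels) and one vector per channel. A sketch assuming count pixels, with count a multiple of the lane count; the transform itself is a placeholder:

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Halve the green channel of packed RGB; red and blue pass through.
void DimGreen(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT out,
              size_t count) {
  const hn::ScalableTag<uint8_t> d;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    hn::VFromD<decltype(d)> r, g, b;
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    g = hn::ShiftRight<1>(g);
    hn::StoreInterleaved3(r, g, b, d, out + 3 * i);
  }
}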
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API V MulAddSub(V mul, V x, V sub_or_add)
Definition arm_sve-inl.h:4285
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
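CompressStore packs the lanes selected by the mask to the front of the output and returns how many were written, which enables stream filtering. A sketch assuming count is a multiple of the lane count; note that CompressStore may write a full vector past the packed lanes, so the output buffer needs slack (CompressBlendedStore, also listed here, avoids that):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copies only non-negative values; returns the number written.
size_t KeepNonNegative(const float* HWY_RESTRICT in, size_t count,
                       float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  size_t written = 0;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressStore(v, hn::Ge(v, hn::Zero(d)), d, out + written);
  }
  return written;
}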
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< uint8_t > AESRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7418
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
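GatherIndex loads base[indices[i]] per lane, with indices supplied as a vector of signed integers of the same lane width. A permutation-copy sketch, assuming count is a multiple of the lane count and all indices are in bounds:

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// out[i] = table[idx[i]] for count elements.
void GatherCopy(const float* HWY_RESTRICT table,
                const int32_t* HWY_RESTRICT idx, float* HWY_RESTRICT out,
                size_t count) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;  // int32 indices, same lane count
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    hn::StoreU(hn::GatherIndex(d, table, hn::LoadU(di, idx + i)), d, out + i);
  }
}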
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
VFromD< ScalableTag< bfloat16_t > > VBF16
Definition arm_sve-inl.h:410
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
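Lt128 treats each adjacent pair of u64 lanes as one 128-bit value (the odd, upper lane being the high half) and sets both lanes of each pair in the result mask, so whole pairs can then be selected together. A sketch in the spirit of the Min128 helper listed on this page, assuming d is a u64 descriptor:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// For each 128-bit pair, keep the smaller of a and b.
template <class D, class V = hn::VFromD<D>>
V Min128Pairs(D d, V a, V b) {
  return hn::IfThenElse(hn::Lt128(d, a, b), a, b);
}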
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7353
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
V Shr(V a, V b)
Definition generic_ops-inl.h:7326
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
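FirstN builds a mask of the lowest num lanes; with MaskedLoad and BlendedStore it handles the final partial vector of a strip-mined loop without a scalar fallback. A sketch (the doubling transform is a placeholder):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void Double(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
            size_t count) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    const auto v = hn::LoadU(d, in + i);
    hn::StoreU(hn::Add(v, v), d, out + i);
  }
  if (i != count) {  // fewer than N elements remain
    const auto m = hn::FirstN(d, count - i);
    const auto v = hn::MaskedLoad(m, d, in + i);
    hn::BlendedStore(hn::Add(v, v), m, d, out + i);
  }
}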
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
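ReduceSum folds all lanes into a scalar. Keeping per-lane partial sums in a vector and reducing once after the loop avoids a horizontal add per iteration; a dot-product sketch, again assuming count is a multiple of the lane count:

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

float Dot(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
          size_t count) {
  const hn::ScalableTag<float> d;
  auto sum = hn::Zero(d);  // per-lane partial sums
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    sum = hn::MulAdd(hn::LoadU(d, a + i), hn::LoadU(d, b + i), sum);
  }
  return hn::ReduceSum(d, sum);
}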
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API V ReverseBits(V v)
Definition generic_ops-inl.h:6464
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSame()
Definition base.h:499
constexpr size_t CeilLog2(TI x)
Definition base.h:2669
typename detail::Relations< T >::Narrow MakeNarrow
Definition base.h:2088
HWY_API size_t PopCount(T x)
Definition base.h:2615
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_NOT_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:626
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_T_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:557
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_T_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:555
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
HWY_API Vec2< D > Create2(D, VFromD< D > v0, VFromD< D > v1)
Definition tuple-inl.h:52
HWY_API Vec4< D > Create4(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3)
Definition tuple-inl.h:62
HWY_API Vec3< D > Create3(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2)
Definition tuple-inl.h:57