generic_ops-inl.h
1// Copyright 2021 Google LLC
2// Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
3// SPDX-License-Identifier: Apache-2.0
4// SPDX-License-Identifier: BSD-3-Clause
5//
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9//
10// http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18// Target-independent types/functions defined after target-specific ops.
19
20// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
21// the generic implementation here if native ops are already defined.
22
23#include "hwy/base.h"
24
25// Define detail::Shuffle1230 etc, but only when viewing the current header;
26// normally this is included via highway.h, which includes ops/*.h.
27#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
28#include "hwy/detect_targets.h"
29#include "hwy/ops/emu128-inl.h"
30#endif // HWY_IDE
31
32// Relies on the external include guard in highway.h.
33HWY_BEFORE_NAMESPACE();
34namespace hwy {
35namespace HWY_NAMESPACE {
36
37// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
38template <class V>
39using LaneType = decltype(GetLane(V()));
40
41// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
42// type of functions that do not take a vector argument, or as an argument type
43// if the function only has a template argument for D, or for explicit type
44// names instead of auto. This may be a built-in type.
45template <class D>
46using Vec = decltype(Zero(D()));
47
48// Mask type. Useful as the return type of functions that do not take a mask
49// argument, or as an argument type if the function only has a template argument
50// for D, or for explicit type names instead of auto.
51template <class D>
52using Mask = decltype(MaskFromVec(Zero(D())));
53
54// Returns the closest value to v within [lo, hi].
55template <class V>
56HWY_API V Clamp(const V v, const V lo, const V hi) {
57 return Min(Max(lo, v), hi);
58}
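// --- Usage sketch (editorial addition, not part of the original header):
// clamping every lane of an array to [lo, hi]. Assumes inclusion via
// highway.h inside HWY_NAMESPACE and that count is a multiple of Lanes(d).
inline void ClampArray(float* HWY_RESTRICT p, size_t count, float lo, float hi) {
  const ScalableTag<float> d;
  const auto vlo = Set(d, lo);
  const auto vhi = Set(d, hi);
  for (size_t i = 0; i < count; i += Lanes(d)) {
    StoreU(Clamp(LoadU(d, p + i), vlo, vhi), d, p + i);
  }
}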
59
60// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
61// and RVV has its own implementation of -Lanes.
62#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
63
64template <size_t kLanes, class D>
65HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
66 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
67 static_assert(kBytes < 16, "Shift count is per-block");
68 return CombineShiftRightBytes<kBytes>(d, hi, lo);
69}
70
71#endif
72
73// Returns lanes with the most significant bit set and all other bits zero.
74template <class D>
75HWY_API VFromD<D> SignBit(D d) {
76 const RebindToUnsigned<decltype(d)> du;
77 return BitCast(d, Set(du, SignMask<TFromD<D>>()));
78}
79
80// Returns quiet NaN.
81template <class D>
82HWY_API VFromD<D> NaN(D d) {
83 const RebindToSigned<D> di;
84 // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
85 // mantissa MSB (to indicate quiet) would be sufficient.
86 return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
87}
88
89// Returns positive infinity.
90template <class D>
91HWY_API VFromD<D> Inf(D d) {
92 const RebindToUnsigned<D> du;
93 using T = TFromD<D>;
94 using TU = TFromD<decltype(du)>;
95 const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
96 return BitCast(d, Set(du, max_x2 >> 1));
97}
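// --- Usage sketch (editorial addition, not part of the original header):
// SignBit can implement float absolute value by clearing the sign bit; this
// hypothetical helper is equivalent to Abs for float lanes.
template <class V, HWY_IF_FLOAT_V(V)>
inline V AbsViaSignBit(V v) {
  const DFromV<V> d;
  return AndNot(SignBit(d), v);  // ~sign_mask & v clears each lane's sign bit
}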
98
99// ------------------------------ ZeroExtendResizeBitCast
100
101// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
102// target is in emu128-inl.h, and the implementation of
103// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h
104#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
105namespace detail {
106
107#if HWY_HAVE_SCALABLE
108template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
109HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
110 hwy::SizeTag<kFromVectSize> /* from_size_tag */,
111 hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
112 VFromD<DFrom> v) {
113 const Repartition<uint8_t, DTo> d_to_u8;
114 const auto resized = ResizeBitCast(d_to_u8, v);
115 // Zero the upper bytes which were not present/valid in d_from.
116 const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
117 return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
118}
119#else // target that uses fixed-size vectors
120// Truncating or same-size resizing cast: same as ResizeBitCast
121template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
122 HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
123HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
124 hwy::SizeTag<kFromVectSize> /* from_size_tag */,
125 hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
126 VFromD<DFrom> v) {
127 return ResizeBitCast(d_to, v);
128}
129
130// Resizing cast to vector that has twice the number of lanes of the source
131// vector
132template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
133 HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
134HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
135 hwy::SizeTag<kFromVectSize> /* from_size_tag */,
136 hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
137 VFromD<DFrom> v) {
138 const Twice<decltype(d_from)> dt_from;
139 return BitCast(d_to, ZeroExtendVector(dt_from, v));
140}
141
142// Resizing cast to vector that has more than twice the number of lanes of the
143// source vector
144template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
145 HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
146HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
147 hwy::SizeTag<kFromVectSize> /* from_size_tag */,
148 hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
149 VFromD<DFrom> v) {
150 using TFrom = TFromD<DFrom>;
151 constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom);
152 const Repartition<TFrom, decltype(d_to)> d_resize_to;
153 return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
154 ResizeBitCast(d_resize_to, v)));
155}
156#endif // HWY_HAVE_SCALABLE
157
158} // namespace detail
159#endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
160
161template <class DTo, class DFrom>
162HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
163 VFromD<DFrom> v) {
164 return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
165 hwy::SizeTag<d_to.MaxBytes()>(), d_to,
166 d_from, v);
167}
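// --- Usage sketch (editorial addition, not part of the original header):
// loading a narrow (e.g. 64-bit) vector and widening it to a larger tag with
// the newly added upper bytes guaranteed to be zero.
template <class DTo, class DFrom>
inline VFromD<DTo> LoadAndZeroExtend(DTo d_to, DFrom d_from,
                                     const TFromD<DFrom>* HWY_RESTRICT p) {
  return ZeroExtendResizeBitCast(d_to, d_from, LoadU(d_from, p));
}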
168
169// ------------------------------ SafeFillN
170
171template <class D, typename T = TFromD<D>>
172HWY_API void SafeFillN(const size_t num, const T value, D d,
173 T* HWY_RESTRICT to) {
174#if HWY_MEM_OPS_MIGHT_FAULT
175 (void)d;
176 for (size_t i = 0; i < num; ++i) {
177 to[i] = value;
178 }
179#else
180 BlendedStore(Set(d, value), FirstN(d, num), d, to);
181#endif
182}
183
184// ------------------------------ SafeCopyN
185
186template <class D, typename T = TFromD<D>>
187HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
188 T* HWY_RESTRICT to) {
189#if HWY_MEM_OPS_MIGHT_FAULT
190 (void)d;
191 for (size_t i = 0; i < num; ++i) {
192 to[i] = from[i];
193 }
194#else
195 const Mask<D> mask = FirstN(d, num);
196 BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
197#endif
198}
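// --- Usage sketch (editorial addition, not part of the original header):
// using SafeCopyN for the remainder of a strip-mined loop instead of a scalar
// tail; safe even where masked/partial memory ops might fault.
inline void CopyArray(const float* HWY_RESTRICT from, float* HWY_RESTRICT to,
                      size_t count) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    StoreU(LoadU(d, from + i), d, to + i);
  }
  SafeCopyN(count - i, d, from + i, to + i);  // remaining 0..N-1 elements
}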
199
200// ------------------------------ IsNegative
201#if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
202#ifdef HWY_NATIVE_IS_NEGATIVE
203#undef HWY_NATIVE_IS_NEGATIVE
204#else
205#define HWY_NATIVE_IS_NEGATIVE
206#endif
207
208template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
209HWY_API Mask<DFromV<V>> IsNegative(V v) {
210 const DFromV<decltype(v)> d;
211 const RebindToSigned<decltype(d)> di;
212 return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v))));
213}
214
215#endif // HWY_NATIVE_IS_NEGATIVE
216
217// ------------------------------ MaskFalse
218#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
219#ifdef HWY_NATIVE_MASK_FALSE
220#undef HWY_NATIVE_MASK_FALSE
221#else
222#define HWY_NATIVE_MASK_FALSE
223#endif
224
225template <class D>
226HWY_API Mask<D> MaskFalse(D d) {
227 return MaskFromVec(Zero(d));
228}
229
230#endif // HWY_NATIVE_MASK_FALSE
231
232// ------------------------------ IfNegativeThenElseZero
233#if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
234#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
235#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
236#else
237#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
238#endif
239
240template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
241HWY_API V IfNegativeThenElseZero(V v, V yes) {
242 return IfThenElseZero(IsNegative(v), yes);
243}
244
245#endif // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
246
247// ------------------------------ IfNegativeThenZeroElse
248#if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
249#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
250#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
251#else
252#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
253#endif
254
255template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
256HWY_API V IfNegativeThenZeroElse(V v, V no) {
257 return IfThenZeroElse(IsNegative(v), no);
258}
259
260#endif // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
261
262// ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse)
263
264// ZeroIfNegative is generic for all vector lengths
265template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
266HWY_API V ZeroIfNegative(V v) {
267 return IfNegativeThenZeroElse(v, v);
268}
269
270// ------------------------------ BitwiseIfThenElse
271#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
272#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
273#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
274#else
275#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
276#endif
277
278template <class V>
279HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
280 return Or(And(mask, yes), AndNot(mask, no));
281}
282
283#endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE
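// --- Usage sketch (editorial addition, not part of the original header):
// bit-selecting the sign from one vector and the magnitude from another,
// similar in spirit to CopySign.
template <class V, HWY_IF_FLOAT_V(V)>
inline V CopySignViaBits(V magnitude, V sign_source) {
  const DFromV<V> d;
  // Take the sign bit from sign_source and all other bits from magnitude.
  return BitwiseIfThenElse(SignBit(d), sign_source, magnitude);
}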
284
285// ------------------------------ PromoteMaskTo
286
287#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
288#ifdef HWY_NATIVE_PROMOTE_MASK_TO
289#undef HWY_NATIVE_PROMOTE_MASK_TO
290#else
291#define HWY_NATIVE_PROMOTE_MASK_TO
292#endif
293
294template <class DTo, class DFrom>
295HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
296 static_assert(
297 sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
298 "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
299 static_assert(
300 IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
301 "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
302
303 const RebindToSigned<decltype(d_to)> di_to;
304 const RebindToSigned<decltype(d_from)> di_from;
305
306 return MaskFromVec(BitCast(
307 d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
308}
309
310#endif // HWY_NATIVE_PROMOTE_MASK_TO
311
312// ------------------------------ DemoteMaskTo
313
314#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
315#ifdef HWY_NATIVE_DEMOTE_MASK_TO
316#undef HWY_NATIVE_DEMOTE_MASK_TO
317#else
318#define HWY_NATIVE_DEMOTE_MASK_TO
319#endif
320
321template <class DTo, class DFrom>
322HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
323 static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
324 "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
325 static_assert(
326 IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
327 "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
328
329 const RebindToSigned<decltype(d_to)> di_to;
330 const RebindToSigned<decltype(d_from)> di_from;
331
332 return MaskFromVec(
333 BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
334}
335
336#endif // HWY_NATIVE_DEMOTE_MASK_TO
337
338// ------------------------------ CombineMasks
339
340#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
341#ifdef HWY_NATIVE_COMBINE_MASKS
342#undef HWY_NATIVE_COMBINE_MASKS
343#else
344#define HWY_NATIVE_COMBINE_MASKS
345#endif
346
347#if HWY_TARGET != HWY_SCALAR
348template <class D>
349HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
350 const Half<decltype(d)> dh;
351 return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
352}
353#endif
354
355#endif // HWY_NATIVE_COMBINE_MASKS
356
357// ------------------------------ LowerHalfOfMask
358
359#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
360#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
361#undef HWY_NATIVE_LOWER_HALF_OF_MASK
362#else
363#define HWY_NATIVE_LOWER_HALF_OF_MASK
364#endif
365
366template <class D>
367HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
368 const Twice<decltype(d)> dt;
369 return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
370}
371
372#endif // HWY_NATIVE_LOWER_HALF_OF_MASK
373
374// ------------------------------ UpperHalfOfMask
375
376#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
377#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
378#undef HWY_NATIVE_UPPER_HALF_OF_MASK
379#else
380#define HWY_NATIVE_UPPER_HALF_OF_MASK
381#endif
382
383#if HWY_TARGET != HWY_SCALAR
384template <class D>
385HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
386 const Twice<decltype(d)> dt;
387 return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
388}
389#endif
390
391#endif // HWY_NATIVE_UPPER_HALF_OF_MASK
392
393// ------------------------------ OrderedDemote2MasksTo
394
395#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
396 defined(HWY_TARGET_TOGGLE))
397#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
398#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
399#else
400#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
401#endif
402
403#if HWY_TARGET != HWY_SCALAR
404template <class DTo, class DFrom>
405HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
406 Mask<DFrom> b) {
407 static_assert(
408 sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
409 "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
410 static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
411 "Mask<DTo> must be the same type as "
412 "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
413
414 const RebindToSigned<decltype(d_from)> di_from;
415 const RebindToSigned<decltype(d_to)> di_to;
416
417 const auto va = BitCast(di_from, VecFromMask(d_from, a));
418 const auto vb = BitCast(di_from, VecFromMask(d_from, b));
419 return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
420}
421#endif
422
423#endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
424
425// ------------------------------ RotateLeft
426template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
427HWY_API V RotateLeft(V v) {
428 constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
429 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
430
431 constexpr int kRotateRightAmt =
432 (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
433 return RotateRight<kRotateRightAmt>(v);
434}
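// --- Worked example (editorial addition, not part of the original header):
// because the forwarding above uses the complementary count, RotateLeft<8> of
// a u32 lane 0x12345678 equals RotateRight<24> of it, i.e. 0x34567812.
template <class V, HWY_IF_UI32(TFromV<V>)>
inline V RotateLeftOneByte(V v) {
  return RotateLeft<8>(v);  // compiled as RotateRight<24>(v)
}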
435
436// ------------------------------ Rol/Ror
437#if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
438#ifdef HWY_NATIVE_ROL_ROR_8
439#undef HWY_NATIVE_ROL_ROR_8
440#else
441#define HWY_NATIVE_ROL_ROR_8
442#endif
443
444template <class V, HWY_IF_UI8(TFromV<V>)>
445HWY_API V Rol(V a, V b) {
446 const DFromV<decltype(a)> d;
447 const RebindToSigned<decltype(d)> di;
448 const RebindToUnsigned<decltype(d)> du;
449
450 const auto shift_amt_mask = Set(du, uint8_t{7});
451 const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
452 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
453
454 const auto vu = BitCast(du, a);
455 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
456}
457
458template <class V, HWY_IF_UI8(TFromV<V>)>
459HWY_API V Ror(V a, V b) {
460 const DFromV<decltype(a)> d;
461 const RebindToSigned<decltype(d)> di;
462 const RebindToUnsigned<decltype(d)> du;
463
464 const auto shift_amt_mask = Set(du, uint8_t{7});
465 const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
466 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
467
468 const auto vu = BitCast(du, a);
469 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
470}
471
472#endif // HWY_NATIVE_ROL_ROR_8
473
474#if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
475#ifdef HWY_NATIVE_ROL_ROR_16
476#undef HWY_NATIVE_ROL_ROR_16
477#else
478#define HWY_NATIVE_ROL_ROR_16
479#endif
480
481template <class V, HWY_IF_UI16(TFromV<V>)>
482HWY_API V Rol(V a, V b) {
483 const DFromV<decltype(a)> d;
484 const RebindToSigned<decltype(d)> di;
485 const RebindToUnsigned<decltype(d)> du;
486
487 const auto shift_amt_mask = Set(du, uint16_t{15});
488 const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
489 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
490
491 const auto vu = BitCast(du, a);
492 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
493}
494
495template <class V, HWY_IF_UI16(TFromV<V>)>
496HWY_API V Ror(V a, V b) {
497 const DFromV<decltype(a)> d;
498 const RebindToSigned<decltype(d)> di;
499 const RebindToUnsigned<decltype(d)> du;
500
501 const auto shift_amt_mask = Set(du, uint16_t{15});
502 const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
503 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
504
505 const auto vu = BitCast(du, a);
506 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
507}
508
509#endif // HWY_NATIVE_ROL_ROR_16
510
511#if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
512#ifdef HWY_NATIVE_ROL_ROR_32_64
513#undef HWY_NATIVE_ROL_ROR_32_64
514#else
515#define HWY_NATIVE_ROL_ROR_32_64
516#endif
517
518template <class V, HWY_IF_UI32(TFromV<V>)>
519HWY_API V Rol(V a, V b) {
520 const DFromV<decltype(a)> d;
521 const RebindToSigned<decltype(d)> di;
522 const RebindToUnsigned<decltype(d)> du;
523
524 const auto shift_amt_mask = Set(du, uint32_t{31});
525 const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
526 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
527
528 const auto vu = BitCast(du, a);
529 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
530}
531
532template <class V, HWY_IF_UI32(TFromV<V>)>
533HWY_API V Ror(V a, V b) {
534 const DFromV<decltype(a)> d;
535 const RebindToSigned<decltype(d)> di;
536 const RebindToUnsigned<decltype(d)> du;
537
538 const auto shift_amt_mask = Set(du, uint32_t{31});
539 const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
540 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
541
542 const auto vu = BitCast(du, a);
543 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
544}
545
546#if HWY_HAVE_INTEGER64
547template <class V, HWY_IF_UI64(TFromV<V>)>
548HWY_API V Rol(V a, V b) {
549 const DFromV<decltype(a)> d;
550 const RebindToSigned<decltype(d)> di;
551 const RebindToUnsigned<decltype(d)> du;
552
553 const auto shift_amt_mask = Set(du, uint64_t{63});
554 const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
555 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
556
557 const auto vu = BitCast(du, a);
558 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
559}
560
561template <class V, HWY_IF_UI64(TFromV<V>)>
562HWY_API V Ror(V a, V b) {
563 const DFromV<decltype(a)> d;
564 const RebindToSigned<decltype(d)> di;
565 const RebindToUnsigned<decltype(d)> du;
566
567 const auto shift_amt_mask = Set(du, uint64_t{63});
568 const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
569 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
570
571 const auto vu = BitCast(du, a);
572 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
573}
574#endif // HWY_HAVE_INTEGER64
575
576#endif // HWY_NATIVE_ROL_ROR_32_64
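// --- Usage sketch (editorial addition, not part of the original header):
// per-lane variable rotation, e.g. rotating each u32 lane left by its lane
// index (shift amounts are masked to 0..31 as in the generic Rol above).
template <class V, HWY_IF_UI32(TFromV<V>)>
inline V RotateByLaneIndex(V v) {
  const DFromV<V> d;
  return Rol(v, Iota(d, 0));
}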
577
578// ------------------------------ RotateLeftSame/RotateRightSame
579
580#if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
581#ifdef HWY_NATIVE_ROL_ROR_SAME_8
582#undef HWY_NATIVE_ROL_ROR_SAME_8
583#else
584#define HWY_NATIVE_ROL_ROR_SAME_8
585#endif
586
587template <class V, HWY_IF_UI8(TFromV<V>)>
588HWY_API V RotateLeftSame(V v, int bits) {
589 const DFromV<decltype(v)> d;
590 const RebindToUnsigned<decltype(d)> du;
591
592 const int shl_amt = bits & 7;
593 const int shr_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
594
595 const auto vu = BitCast(du, v);
596 return BitCast(d,
597 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
598}
599
600template <class V, HWY_IF_UI8(TFromV<V>)>
601HWY_API V RotateRightSame(V v, int bits) {
602 const DFromV<decltype(v)> d;
603 const RebindToUnsigned<decltype(d)> du;
604
605 const int shr_amt = bits & 7;
606 const int shl_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
607
608 const auto vu = BitCast(du, v);
609 return BitCast(d,
610 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
611}
612
613#endif // HWY_NATIVE_ROL_ROR_SAME_8
614
615#if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
616#ifdef HWY_NATIVE_ROL_ROR_SAME_16
617#undef HWY_NATIVE_ROL_ROR_SAME_16
618#else
619#define HWY_NATIVE_ROL_ROR_SAME_16
620#endif
621
622template <class V, HWY_IF_UI16(TFromV<V>)>
623HWY_API V RotateLeftSame(V v, int bits) {
624 const DFromV<decltype(v)> d;
625 const RebindToUnsigned<decltype(d)> du;
626
627 const int shl_amt = bits & 15;
628 const int shr_amt =
629 static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
630
631 const auto vu = BitCast(du, v);
632 return BitCast(d,
633 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
634}
635
636template <class V, HWY_IF_UI16(TFromV<V>)>
637HWY_API V RotateRightSame(V v, int bits) {
638 const DFromV<decltype(v)> d;
639 const RebindToUnsigned<decltype(d)> du;
640
641 const int shr_amt = bits & 15;
642 const int shl_amt =
643 static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
644
645 const auto vu = BitCast(du, v);
646 return BitCast(d,
647 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
648}
649#endif // HWY_NATIVE_ROL_ROR_SAME_16
650
651#if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
652#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
653#undef HWY_NATIVE_ROL_ROR_SAME_32_64
654#else
655#define HWY_NATIVE_ROL_ROR_SAME_32_64
656#endif
657
658template <class V, HWY_IF_UI32(TFromV<V>)>
659HWY_API V RotateLeftSame(V v, int bits) {
660 const DFromV<decltype(v)> d;
661 const RebindToUnsigned<decltype(d)> du;
662
663 const int shl_amt = bits & 31;
664 const int shr_amt =
665 static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
666
667 const auto vu = BitCast(du, v);
668 return BitCast(d,
669 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
670}
671
672template <class V, HWY_IF_UI32(TFromV<V>)>
673HWY_API V RotateRightSame(V v, int bits) {
674 const DFromV<decltype(v)> d;
675 const RebindToUnsigned<decltype(d)> du;
676
677 const int shr_amt = bits & 31;
678 const int shl_amt =
679 static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
680
681 const auto vu = BitCast(du, v);
682 return BitCast(d,
683 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
684}
685
686#if HWY_HAVE_INTEGER64
687template <class V, HWY_IF_UI64(TFromV<V>)>
688HWY_API V RotateLeftSame(V v, int bits) {
689 const DFromV<decltype(v)> d;
690 const RebindToUnsigned<decltype(d)> du;
691
692 const int shl_amt = bits & 63;
693 const int shr_amt =
694 static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
695
696 const auto vu = BitCast(du, v);
697 return BitCast(d,
698 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
699}
700
701template <class V, HWY_IF_UI64(TFromV<V>)>
702HWY_API V RotateRightSame(V v, int bits) {
703 const DFromV<decltype(v)> d;
704 const RebindToUnsigned<decltype(d)> du;
705
706 const int shr_amt = bits & 63;
707 const int shl_amt =
708 static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
709
710 const auto vu = BitCast(du, v);
711 return BitCast(d,
712 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
713}
714#endif // HWY_HAVE_INTEGER64
715
716#endif // HWY_NATIVE_ROL_ROR_SAME_32_64
717
718// ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
719#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
720#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
721#undef HWY_NATIVE_INTERLEAVE_WHOLE
722#else
723#define HWY_NATIVE_INTERLEAVE_WHOLE
724#endif
725
726#if HWY_TARGET != HWY_SCALAR
727template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
728HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
729 // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
730 // D().MaxBytes() <= 16 is true
731 return InterleaveLower(d, a, b);
732}
733template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
734HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
735 // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
736 // D().MaxBytes() <= 16 is true
737 return InterleaveUpper(d, a, b);
738}
739
740// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
741// is implemented in x86_256-inl.h.
742
743// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
744// implemented in x86_512-inl.h.
745
746// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
747// is implemented in wasm_256-inl.h.
748#endif // HWY_TARGET != HWY_SCALAR
749
750#endif // HWY_NATIVE_INTERLEAVE_WHOLE
751
752#if HWY_TARGET != HWY_SCALAR
753// The InterleaveWholeLower without the optional D parameter is generic for all
754// vector lengths.
755template <class V>
756HWY_API V InterleaveWholeLower(V a, V b) {
757 return InterleaveWholeLower(DFromV<V>(), a, b);
758}
759#endif // HWY_TARGET != HWY_SCALAR
760
761// ------------------------------ InterleaveEven
762
763#if HWY_TARGET != HWY_SCALAR
764// InterleaveEven without the optional D parameter is generic for all vector
765// lengths
766template <class V>
767HWY_API V InterleaveEven(V a, V b) {
768 return InterleaveEven(DFromV<V>(), a, b);
769}
770#endif
771
772// ------------------------------ AddSub
773
774template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
775HWY_API V AddSub(V a, V b) {
776 // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
777 return Sub(a, b);
778}
779
780// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
781// SSSE3/SSE4/AVX2/AVX3
782
783// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
784// AVX2/AVX3
785
786// AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
787
788// AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
789template <class V, HWY_IF_ADDSUB_V(V)>
790HWY_API V AddSub(V a, V b) {
791 using D = DFromV<decltype(a)>;
792 using T = TFromD<D>;
793 using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
794
795 const D d;
796 const Rebind<TNegate, D> d_negate;
797
798 // Negate the even lanes of b
799 const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
800
801 return Add(a, negated_even_b);
802}
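// --- Worked example (editorial addition, not part of the original header):
// AddSub subtracts in even lanes and adds in odd lanes. Assuming a target
// with 128-bit vectors, for f32x4 with a = {1, 2, 3, 4} and b = {10, 20, 30,
// 40} the result is {-9, 22, -27, 44}.
inline Vec<Full128<float>> AddSubExample() {
  const Full128<float> d;
  const auto a = Dup128VecFromValues(d, 1.0f, 2.0f, 3.0f, 4.0f);
  const auto b = Dup128VecFromValues(d, 10.0f, 20.0f, 30.0f, 40.0f);
  return AddSub(a, b);  // even lanes a-b, odd lanes a+b
}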
803
804// ------------------------------ MaskedAddOr etc.
805#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
806#ifdef HWY_NATIVE_MASKED_ARITH
807#undef HWY_NATIVE_MASKED_ARITH
808#else
809#define HWY_NATIVE_MASKED_ARITH
810#endif
811
812template <class V, class M>
813HWY_API V MaskedMinOr(V no, M m, V a, V b) {
814 return IfThenElse(m, Min(a, b), no);
815}
816
817template <class V, class M>
818HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
819 return IfThenElse(m, Max(a, b), no);
820}
821
822template <class V, class M>
823HWY_API V MaskedAddOr(V no, M m, V a, V b) {
824 return IfThenElse(m, Add(a, b), no);
825}
826
827template <class V, class M>
828HWY_API V MaskedSubOr(V no, M m, V a, V b) {
829 return IfThenElse(m, Sub(a, b), no);
830}
831
832template <class V, class M>
833HWY_API V MaskedMulOr(V no, M m, V a, V b) {
834 return IfThenElse(m, Mul(a, b), no);
835}
836
837template <class V, class M>
838HWY_API V MaskedDivOr(V no, M m, V a, V b) {
839 return IfThenElse(m, Div(a, b), no);
840}
841
842template <class V, class M>
843HWY_API V MaskedModOr(V no, M m, V a, V b) {
844 return IfThenElse(m, Mod(a, b), no);
845}
846
847template <class V, class M>
848HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
849 return IfThenElse(m, SaturatedAdd(a, b), no);
850}
851
852template <class V, class M>
853HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
854 return IfThenElse(m, SaturatedSub(a, b), no);
855}
856#endif // HWY_NATIVE_MASKED_ARITH
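// --- Usage sketch (editorial addition, not part of the original header):
// conditional accumulation that only updates lanes selected by the mask and
// keeps the previous sum elsewhere.
template <class V, class M>
inline V AccumulateWhere(M m, V sum, V x) {
  return MaskedAddOr(sum, m, sum, x);  // m ? sum + x : sum
}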
857
858// ------------------------------ IfNegativeThenNegOrUndefIfZero
859
860#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
861 defined(HWY_TARGET_TOGGLE))
862#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
863#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
864#else
865#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
866#endif
867
868template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
869HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
870#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
871 // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
872 const auto zero = Zero(DFromV<V>());
873 return MaskedSubOr(v, Lt(mask, zero), zero, v);
874#else
875 return IfNegativeThenElse(mask, Neg(v), v);
876#endif
877}
878
879#endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
880
881template <class V, HWY_IF_FLOAT_V(V)>
882HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
883 return CopySign(v, Xor(mask, v));
884}
885
886// ------------------------------ SaturatedNeg
887
888#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
889#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
890#undef HWY_NATIVE_SATURATED_NEG_8_16_32
891#else
892#define HWY_NATIVE_SATURATED_NEG_8_16_32
893#endif
894
895template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
896 HWY_IF_SIGNED_V(V)>
897HWY_API V SaturatedNeg(V v) {
898 const DFromV<decltype(v)> d;
899 return SaturatedSub(Zero(d), v);
900}
901
902template <class V, HWY_IF_I32(TFromV<V>)>
903HWY_API V SaturatedNeg(V v) {
904 const DFromV<decltype(v)> d;
905
906#if HWY_TARGET == HWY_RVV || \
907 (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
908 (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
909 // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
910 return SaturatedSub(Zero(d), v);
911#else
912 // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
913 // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
914 // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
915 // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
916 return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
917#endif
918}
919#endif // HWY_NATIVE_SATURATED_NEG_8_16_32
920
921#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
922#ifdef HWY_NATIVE_SATURATED_NEG_64
923#undef HWY_NATIVE_SATURATED_NEG_64
924#else
925#define HWY_NATIVE_SATURATED_NEG_64
926#endif
927
928template <class V, HWY_IF_I64(TFromV<V>)>
929HWY_API V SaturatedNeg(V v) {
930#if HWY_TARGET == HWY_RVV || \
931 (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
932 // RVV/NEON/SVE have native I64 SaturatedSub instructions
933 const DFromV<decltype(v)> d;
934 return SaturatedSub(Zero(d), v);
935#else
936 const auto neg_v = Neg(v);
937 return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
938#endif
939}
940#endif // HWY_NATIVE_SATURATED_NEG_64
941
942// ------------------------------ SaturatedAbs
943
944#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
945#ifdef HWY_NATIVE_SATURATED_ABS
946#undef HWY_NATIVE_SATURATED_ABS
947#else
948#define HWY_NATIVE_SATURATED_ABS
949#endif
950
951template <class V, HWY_IF_SIGNED_V(V)>
952HWY_API V SaturatedAbs(V v) {
953 return Max(v, SaturatedNeg(v));
954}
955
956#endif
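// --- Worked example (editorial addition, not part of the original header):
// unlike a wrapping absolute value, SaturatedAbs maps the most negative value
// to LimitsMax, e.g. for i8 lanes SaturatedAbs(-128) == 127.
inline Vec<ScalableTag<int8_t>> SaturatedAbsOfMin() {
  const ScalableTag<int8_t> d;
  return SaturatedAbs(Set(d, LimitsMin<int8_t>()));  // all lanes are 127
}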
957
958// ------------------------------ Reductions
959
960// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
961// they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
962// Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero or more of the
963// SumOfLanes overloads. For the latter group, we here define the remaining
964// overloads, plus ReduceSum which uses them plus GetLane.
965#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
966#ifdef HWY_NATIVE_REDUCE_SCALAR
967#undef HWY_NATIVE_REDUCE_SCALAR
968#else
969#define HWY_NATIVE_REDUCE_SCALAR
970#endif
971
972namespace detail {
973
974// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
975struct AddFunc {
976 template <class V>
977 V operator()(V a, V b) const {
978 return Add(a, b);
979 }
980};
981
982struct MinFunc {
983 template <class V>
984 V operator()(V a, V b) const {
985 return Min(a, b);
986 }
987};
988
989struct MaxFunc {
990 template <class V>
991 V operator()(V a, V b) const {
992 return Max(a, b);
993 }
994};
995
996// No-op for vectors of at most one block.
997template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
998HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
999 return v;
1000}
1001
1002// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
1003// WASM_EMU256. AVX3 has its own overload.
1004template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
1005HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func f, VFromD<D> v) {
1006 return f(v, SwapAdjacentBlocks(v));
1007}
1008
1009// These return the reduction result broadcasted across all lanes. They assume
1010// the caller has already reduced across blocks.
1011
1012template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
1013HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
1014 return f(v10, Reverse2(d, v10));
1015}
1016
1017template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
1018HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
1019 const VFromD<D> v0123 = Reverse4(d, v3210);
1020 const VFromD<D> v03_12_12_03 = f(v3210, v0123);
1021 const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
1022 return f(v03_12_12_03, v12_03_03_12);
1023}
1024
1025template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
1026HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
1027 // The upper half is reversed from the lower half; omit for brevity.
1028 const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
1029 const VFromD<D> v0347_1625_1625_0347 =
1030 f(v34_25_16_07, Reverse4(d, v34_25_16_07));
1031 return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
1032}
1033
1034template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
1035HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
1036 const RepartitionToWide<decltype(d)> dw;
1037 using VW = VFromD<decltype(dw)>;
1038 const VW vw = BitCast(dw, v);
1039 // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
1040 const VW even = And(vw, Set(dw, 0xFF));
1041 const VW odd = ShiftRight<8>(vw);
1042 const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
1043#if HWY_IS_LITTLE_ENDIAN
1044 return DupEven(BitCast(d, reduced));
1045#else
1046 return DupOdd(BitCast(d, reduced));
1047#endif
1048}
1049
1050template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
1051HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
1052 const RepartitionToWide<decltype(d)> dw;
1053 using VW = VFromD<decltype(dw)>;
1054 const VW vw = BitCast(dw, v);
1055 // Sign-extend
1056 // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
1057 const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
1058 const VW odd = ShiftRight<8>(vw);
1059 const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
1060#if HWY_IS_LITTLE_ENDIAN
1061 return DupEven(BitCast(d, reduced));
1062#else
1063 return DupOdd(BitCast(d, reduced));
1064#endif
1065}
1066
1067} // namespace detail
1068
1069template <class D, HWY_IF_SUM_OF_LANES_D(D)>
1070HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
1071 const detail::AddFunc f;
1072 v = detail::ReduceAcrossBlocks(d, f, v);
1073 return detail::ReduceWithinBlocks(d, f, v);
1074}
1075template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
1076HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
1077 const detail::MinFunc f;
1078 v = detail::ReduceAcrossBlocks(d, f, v);
1079 return detail::ReduceWithinBlocks(d, f, v);
1080}
1081template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
1082HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
1083 const detail::MaxFunc f;
1084 v = detail::ReduceAcrossBlocks(d, f, v);
1085 return detail::ReduceWithinBlocks(d, f, v);
1086}
1087
1088template <class D, HWY_IF_REDUCE_D(D)>
1089HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
1090 return GetLane(SumOfLanes(d, v));
1091}
1092template <class D, HWY_IF_REDUCE_D(D)>
1093HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
1094 return GetLane(MinOfLanes(d, v));
1095}
1096template <class D, HWY_IF_REDUCE_D(D)>
1097HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
1098 return GetLane(MaxOfLanes(d, v));
1099}
1100
1101#endif // HWY_NATIVE_REDUCE_SCALAR
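// --- Usage sketch (editorial addition, not part of the original header):
// horizontal sum of an array via vector accumulation plus ReduceSum, with a
// scalar loop for the remainder.
inline float SumArray(const float* HWY_RESTRICT p, size_t count) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  auto sum = Zero(d);
  size_t i = 0;
  for (; i + N <= count; i += N) sum = Add(sum, LoadU(d, p + i));
  float total = ReduceSum(d, sum);
  for (; i < count; ++i) total += p[i];  // scalar remainder
  return total;
}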
1102
1103// Corner cases for both generic and native implementations:
1104// N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
1105template <class D, HWY_IF_LANES_D(D, 1)>
1106HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
1107 return GetLane(v);
1108}
1109template <class D, HWY_IF_LANES_D(D, 1)>
1110HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
1111 return GetLane(v);
1112}
1113template <class D, HWY_IF_LANES_D(D, 1)>
1114HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
1115 return GetLane(v);
1116}
1117
1118template <class D, HWY_IF_LANES_D(D, 1)>
1119HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
1120 return v;
1121}
1122template <class D, HWY_IF_LANES_D(D, 1)>
1123HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
1124 return v;
1125}
1126template <class D, HWY_IF_LANES_D(D, 1)>
1127HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
1128 return v;
1129}
1130
1131// N=4 for 8-bit is still less than the minimum native size.
1132
1133// ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
1134// ReduceSum operations
1135#if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
1136#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
1137#undef HWY_NATIVE_REDUCE_SUM_4_UI8
1138#else
1139#define HWY_NATIVE_REDUCE_SUM_4_UI8
1140#endif
1141template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
1142HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
1143 const Twice<RepartitionToWide<decltype(d)>> dw;
1144 return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
1145}
1146#endif // HWY_NATIVE_REDUCE_SUM_4_UI8
1147
1148// RVV/SVE have target-specific implementations of the N=4 I8/U8
1149// ReduceMin/ReduceMax operations
1150#if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
1151#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
1152#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
1153#else
1154#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
1155#endif
1156template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
1157HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
1158 const Twice<RepartitionToWide<decltype(d)>> dw;
1159 return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
1160}
1161template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
1162HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
1163 const Twice<RepartitionToWide<decltype(d)>> dw;
1164 return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
1165}
1166#endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
1167
1168// ------------------------------ IsEitherNaN
1169#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
1170#ifdef HWY_NATIVE_IS_EITHER_NAN
1171#undef HWY_NATIVE_IS_EITHER_NAN
1172#else
1173#define HWY_NATIVE_IS_EITHER_NAN
1174#endif
1175
1176template <class V, HWY_IF_FLOAT_V(V)>
1177HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
1178 return Or(IsNaN(a), IsNaN(b));
1179}
1180
1181#endif // HWY_NATIVE_IS_EITHER_NAN
1182
1183// ------------------------------ IsInf, IsFinite
1184
1185// AVX3 has target-specific implementations of these.
1186#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
1187#ifdef HWY_NATIVE_ISINF
1188#undef HWY_NATIVE_ISINF
1189#else
1190#define HWY_NATIVE_ISINF
1191#endif
1192
1193template <class V, class D = DFromV<V>>
1194HWY_API MFromD<D> IsInf(const V v) {
1195 using T = TFromD<D>;
1196 const D d;
1197 const RebindToUnsigned<decltype(d)> du;
1198 const VFromD<decltype(du)> vu = BitCast(du, v);
1199 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1200 return RebindMask(
1201 d,
1202 Eq(Add(vu, vu),
1203 Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
1204}
1205
1206// Returns whether normal/subnormal/zero.
1207template <class V, class D = DFromV<V>>
1208HWY_API MFromD<D> IsFinite(const V v) {
1209 using T = TFromD<D>;
1210 const D d;
1211 const RebindToUnsigned<decltype(d)> du;
1212 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
1213 const VFromD<decltype(du)> vu = BitCast(du, v);
1214// 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
1215// for AVX2 if we instead add vu + vu.
1216#if HWY_COMPILER_MSVC
1217 const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
1218#else
1219 const VFromD<decltype(du)> shl = Add(vu, vu);
1220#endif
1221
1222 // Then shift right so we can compare with the max exponent (cannot compare
1223 // with MaxExponentTimes2 directly because it is negative and non-negative
1224 // floats would be greater).
1225 const VFromD<decltype(di)> exp =
1226 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
1227 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
1228}
1229
1230#endif // HWY_NATIVE_ISINF
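// --- Usage sketch (editorial addition, not part of the original header):
// zeroing all non-finite lanes (NaN as well as +/- infinity).
template <class V, HWY_IF_FLOAT_V(V)>
inline V ZeroNonFinite(V v) {
  return IfThenElseZero(IsFinite(v), v);  // keep a lane only if it is finite
}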
1231
1232// ------------------------------ LoadInterleaved2
1233
1234#if HWY_IDE || \
1235 (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
1236#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1237#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1238#else
1239#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1240#endif
1241
1242template <class D, HWY_IF_LANES_GT_D(D, 1)>
1243HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1244 VFromD<D>& v0, VFromD<D>& v1) {
1245 const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0]
1246 const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
1247 v0 = ConcatEven(d, B, A);
1248 v1 = ConcatOdd(d, B, A);
1249}
1250
1251template <class D, HWY_IF_LANES_D(D, 1)>
1252HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1253 VFromD<D>& v0, VFromD<D>& v1) {
1254 v0 = LoadU(d, unaligned + 0);
1255 v1 = LoadU(d, unaligned + 1);
1256}
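// --- Usage sketch (editorial addition, not part of the original header):
// de-interleaving one vector's worth of complex numbers stored as
// re,im,re,im,... into separate real and imaginary vectors; assumes the
// output buffers hold at least Lanes(d) floats each.
inline void SplitComplex(const float* HWY_RESTRICT interleaved,
                         float* HWY_RESTRICT re, float* HWY_RESTRICT im) {
  const ScalableTag<float> d;
  VFromD<decltype(d)> vre, vim;
  LoadInterleaved2(d, interleaved, vre, vim);  // reads 2 * Lanes(d) floats
  StoreU(vre, d, re);
  StoreU(vim, d, im);
}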
1257
1258// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
1259
1260namespace detail {
1261
1262#if HWY_IDE
1263template <class V>
1264HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
1265 return a;
1266}
1267template <class V>
1268HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
1269 return a;
1270}
1271template <class V>
1272HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
1273 return a;
1274}
1275#endif // HWY_IDE
1276
1277// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
1278template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1279HWY_INLINE void LoadTransposedBlocks3(D d,
1280 const TFromD<D>* HWY_RESTRICT unaligned,
1281 VFromD<D>& A, VFromD<D>& B,
1282 VFromD<D>& C) {
1283 constexpr size_t kN = MaxLanes(d);
1284 A = LoadU(d, unaligned + 0 * kN);
1285 B = LoadU(d, unaligned + 1 * kN);
1286 C = LoadU(d, unaligned + 2 * kN);
1287}
1288
1289} // namespace detail
1290
1291template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
1292HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1293 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1294 const RebindToUnsigned<decltype(d)> du;
1295 using V = VFromD<D>;
1296 using VU = VFromD<decltype(du)>;
1297 // Compact notation so these fit on one line: 12 := v1[2].
1298 V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
1299 V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
1300 V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
1301 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
1302 // Compress all lanes belonging to v0 into consecutive lanes.
1303 constexpr uint8_t Z = 0x80;
1304 const VU idx_v0A =
1305 Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1306 const VU idx_v0B =
1307 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
1308 const VU idx_v0C =
1309 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
1310 const VU idx_v1A =
1311 Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1312 const VU idx_v1B =
1313 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
1314 const VU idx_v1C =
1315 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
1316 const VU idx_v2A =
1317 Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1318 const VU idx_v2B =
1319 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
1320 const VU idx_v2C =
1321 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
1322 const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
1323 const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
1324 const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
1325 const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
1326 const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
1327 const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
1328 const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
1329 const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
1330 const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
1331 v0 = Xor3(v0L, v0M, v0U);
1332 v1 = Xor3(v1L, v1M, v1U);
1333 v2 = Xor3(v2L, v2M, v2U);
1334}
1335
1336// 8-bit lanes x8
1337template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
1338HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1339 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1340 const RebindToUnsigned<decltype(d)> du;
1341 using V = VFromD<D>;
1342 using VU = VFromD<decltype(du)>;
1343 V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
1344 V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
1345 V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
1346 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
1347 // Compress all lanes belonging to v0 into consecutive lanes.
1348 constexpr uint8_t Z = 0x80;
1349 const VU idx_v0A =
1350 Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1351 const VU idx_v0B =
1352 Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1353 const VU idx_v0C =
1354 Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
1355 const VU idx_v1A =
1356 Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1357 const VU idx_v1B =
1358 Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1359 const VU idx_v1C =
1360 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
1361 const VU idx_v2A =
1362 Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1363 const VU idx_v2B =
1364 Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
1365 const VU idx_v2C =
1366 Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
1367 const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
1368 const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
1369 const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
1370 const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
1371 const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
1372 const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
1373 const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
1374 const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
1375 const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
1376 v0 = Xor3(v0L, v0M, v0U);
1377 v1 = Xor3(v1L, v1M, v1U);
1378 v2 = Xor3(v2L, v2M, v2U);
1379}
1380
1381// 16-bit lanes x8
1382template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
1383HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1384 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1385 const RebindToUnsigned<decltype(d)> du;
1386 const Repartition<uint8_t, decltype(du)> du8;
1387 using V = VFromD<D>;
1388 using VU8 = VFromD<decltype(du8)>;
1389 V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
1390 V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
1391 V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
1392 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
1393 // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
1394 // but each element of the array contains a byte index for a byte of a lane.
1395 constexpr uint8_t Z = 0x80;
1396 const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
1397 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1398 const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
1399 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
1400 const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1401 Z, 0x04, 0x05, 0x0A, 0x0B);
1402 const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
1403 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1404 const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
1405 0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
1406 const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1407 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
1408 const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
1409 Z, Z, Z, Z, Z, Z, Z, Z, Z);
1410 const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
1411 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
1412 const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1413 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
1414 const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
1415 const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
1416 const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
1417 const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
1418 const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
1419 const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
1420 const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
1421 const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
1422 const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
1423 v0 = Xor3(v0L, v0M, v0U);
1424 v1 = Xor3(v1L, v1M, v1U);
1425 v2 = Xor3(v2L, v2M, v2U);
1426}
1427
1428template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
1429HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1430 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1431 using V = VFromD<D>;
1432 V A; // v0[1] v2[0] v1[0] v0[0]
1433 V B; // v1[2] v0[2] v2[1] v1[1]
1434 V C; // v2[3] v1[3] v0[3] v2[2]
1435 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
1436
1437 const V vxx_02_03_xx = OddEven(C, B);
1438 v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);
1439
1440 // Shuffle2301 takes the upper/lower halves of the output from one input, so
1441 // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
1442 // OddEven because it may have higher throughput than Shuffle.
1443 const V vxx_xx_10_11 = OddEven(A, B);
1444 const V v12_13_xx_xx = OddEven(B, C);
1445 v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);
1446
1447 const V vxx_20_21_xx = OddEven(B, A);
1448 v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
1449}
1450
1451template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
1452HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1453 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1454 VFromD<D> A; // v1[0] v0[0]
1455 VFromD<D> B; // v0[1] v2[0]
1456 VFromD<D> C; // v2[1] v1[1]
1457 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
1458 v0 = OddEven(B, A);
1459 v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
1460 v2 = OddEven(C, B);
1461}
1462
1463template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
1464HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
1465 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1466 v0 = LoadU(d, unaligned + 0);
1467 v1 = LoadU(d, unaligned + 1);
1468 v2 = LoadU(d, unaligned + 2);
1469}
1470
1471// ------------------------------ LoadInterleaved4
1472
1473namespace detail {
1474
1475// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
1476template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1477HWY_INLINE void LoadTransposedBlocks4(D d,
1478 const TFromD<D>* HWY_RESTRICT unaligned,
1479 VFromD<D>& vA, VFromD<D>& vB,
1480 VFromD<D>& vC, VFromD<D>& vD) {
1481 constexpr size_t kN = MaxLanes(d);
1482 vA = LoadU(d, unaligned + 0 * kN);
1483 vB = LoadU(d, unaligned + 1 * kN);
1484 vC = LoadU(d, unaligned + 2 * kN);
1485 vD = LoadU(d, unaligned + 3 * kN);
1486}
1487
1488} // namespace detail
1489
1490template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
1491HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1492 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1493 VFromD<D>& v3) {
1494 const Repartition<uint64_t, decltype(d)> d64;
1495 using V64 = VFromD<decltype(d64)>;
1496 using V = VFromD<D>;
1497 // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
1498 // Here int[i] means the four interleaved values of the i-th 4-tuple and
1499 // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
1500 V vA; // int[13..10] int[3..0]
1501 V vB; // int[17..14] int[7..4]
1502 V vC; // int[1b..18] int[b..8]
1503 V vD; // int[1f..1c] int[f..c]
1504 detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
1505
1506 // For brevity, the comments only list the lower block (upper = lower + 0x10)
1507 const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0]
1508 const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8]
1509 const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2]
1510 const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a]
1511
1512 const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0]
1513 const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8]
1514 const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1]
1515 const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
1516
1517 const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0]
1518 const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8]
1519 const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0]
1520 const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8]
1521
1522 v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
1523 v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
1524 v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
1525 v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
1526}
1527
1528template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
1529HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1530 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1531 VFromD<D>& v3) {
1532 // In the last step, we interleave by half of the block size, which is usually
1533 // 8 bytes but half that for 8-bit x8 vectors.
1534 using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
1535 const Repartition<TW, decltype(d)> dw;
1536 using VW = VFromD<decltype(dw)>;
1537
1538 // (Comments are for 256-bit vectors.)
1539 // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
1540 VFromD<D> vA; // v3210[9]v3210[8] v3210[1]v3210[0]
1541 VFromD<D> vB; // v3210[b]v3210[a] v3210[3]v3210[2]
1542 VFromD<D> vC; // v3210[d]v3210[c] v3210[5]v3210[4]
1543 VFromD<D> vD; // v3210[f]v3210[e] v3210[7]v3210[6]
1544 detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
1545
1546 const VFromD<D> va820 = InterleaveLower(d, vA, vB); // v3210[a,8] v3210[2,0]
1547 const VFromD<D> vec64 = InterleaveLower(d, vC, vD); // v3210[e,c] v3210[6,4]
1548 const VFromD<D> vb931 = InterleaveUpper(d, vA, vB); // v3210[b,9] v3210[3,1]
1549 const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD); // v3210[f,d] v3210[7,5]
1550
1551 const VW v10_b830 = // v10[b..8] v10[3..0]
1552 BitCast(dw, InterleaveLower(d, va820, vb931));
1553 const VW v10_fc74 = // v10[f..c] v10[7..4]
1554 BitCast(dw, InterleaveLower(d, vec64, vfd75));
1555 const VW v32_b830 = // v32[b..8] v32[3..0]
1556 BitCast(dw, InterleaveUpper(d, va820, vb931));
1557 const VW v32_fc74 = // v32[f..c] v32[7..4]
1558 BitCast(dw, InterleaveUpper(d, vec64, vfd75));
1559
1560 v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
1561 v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
1562 v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
1563 v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
1564}
1565
1566template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
1567HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1568 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1569 VFromD<D>& v3) {
1570 using V = VFromD<D>;
1571 V vA; // v3210[4] v3210[0]
1572 V vB; // v3210[5] v3210[1]
1573 V vC; // v3210[6] v3210[2]
1574 V vD; // v3210[7] v3210[3]
1575 detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
1576 const V v10e = InterleaveLower(d, vA, vC); // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
1577 const V v10o = InterleaveLower(d, vB, vD); // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
1578 const V v32e = InterleaveUpper(d, vA, vC); // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
1579 const V v32o = InterleaveUpper(d, vB, vD); // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
1580
1581 v0 = InterleaveLower(d, v10e, v10o);
1582 v1 = InterleaveUpper(d, v10e, v10o);
1583 v2 = InterleaveLower(d, v32e, v32o);
1584 v3 = InterleaveUpper(d, v32e, v32o);
1585}
1586
1587template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
1588HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
1589 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1590 VFromD<D>& v3) {
1591 VFromD<D> vA, vB, vC, vD;
1592 detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
1593 v0 = InterleaveLower(d, vA, vC);
1594 v1 = InterleaveUpper(d, vA, vC);
1595 v2 = InterleaveLower(d, vB, vD);
1596 v3 = InterleaveUpper(d, vB, vD);
1597}
1598
1599// Any T x1
1600template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
1601HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
1602 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1603 VFromD<D>& v3) {
1604 v0 = LoadU(d, unaligned + 0);
1605 v1 = LoadU(d, unaligned + 1);
1606 v2 = LoadU(d, unaligned + 2);
1607 v3 = LoadU(d, unaligned + 3);
1608}
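// Usage sketch (illustrative addition, not part of the original header):
// de-interleaving a row of packed RGBA pixels into four planar outputs via
// LoadInterleaved4. The function and buffer names are hypothetical, and
// num_pixels is assumed to be a multiple of Lanes(d8).
template <class D8, HWY_IF_T_SIZE_D(D8, 1)>
HWY_API void DeinterleaveRgbaRowExample(
    D8 d8, const TFromD<D8>* HWY_RESTRICT rgba, size_t num_pixels,
    TFromD<D8>* HWY_RESTRICT r_out, TFromD<D8>* HWY_RESTRICT g_out,
    TFromD<D8>* HWY_RESTRICT b_out, TFromD<D8>* HWY_RESTRICT a_out) {
  const size_t N = Lanes(d8);
  for (size_t i = 0; i < num_pixels; i += N) {
    VFromD<D8> r, g, b, a;
    LoadInterleaved4(d8, rgba + 4 * i, r, g, b, a);  // reads 4 * N elements
    StoreU(r, d8, r_out + i);
    StoreU(g, d8, g_out + i);
    StoreU(b, d8, b_out + i);
    StoreU(a, d8, a_out + i);
  }
}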
1609
1610// ------------------------------ StoreInterleaved2
1611
1612namespace detail {
1613
1614// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
1615template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1616HWY_API void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
1617 TFromD<D>* HWY_RESTRICT unaligned) {
1618 constexpr size_t kN = MaxLanes(d);
1619 StoreU(A, d, unaligned + 0 * kN);
1620 StoreU(B, d, unaligned + 1 * kN);
1621}
1622
1623} // namespace detail
1624
1625// >= 128-bit vector
1626template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
1627HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
1628 TFromD<D>* HWY_RESTRICT unaligned) {
1629 const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0]
1630 const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[kN/2] v0[kN/2]
1631 detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
1632}
1633
1634// <= 64 bits
1635template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
1636HWY_API void StoreInterleaved2(V part0, V part1, D d,
1637 TFromD<D>* HWY_RESTRICT unaligned) {
1638 const Twice<decltype(d)> d2;
1639 const auto v0 = ZeroExtendVector(d2, part0);
1640 const auto v1 = ZeroExtendVector(d2, part1);
1641 const auto v10 = InterleaveLower(d2, v0, v1);
1642 StoreU(v10, d2, unaligned);
1643}
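// Usage sketch (illustrative addition, not part of the original header):
// interleaving separate real/imaginary planes into the packed layout
// re[0], im[0], re[1], im[1], ... via StoreInterleaved2. Names are
// hypothetical; num_values is assumed to be a multiple of Lanes(d).
template <class D>
HWY_API void InterleaveComplexExample(D d, const TFromD<D>* HWY_RESTRICT re,
                                      const TFromD<D>* HWY_RESTRICT im,
                                      size_t num_values,
                                      TFromD<D>* HWY_RESTRICT packed) {
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num_values; i += N) {
    StoreInterleaved2(LoadU(d, re + i), LoadU(d, im + i), d, packed + 2 * i);
  }
}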
1644
1645// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
1646// TableLookupBytes)
1647
1648namespace detail {
1649
1650// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
1651template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1652HWY_API void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
1653 D d, TFromD<D>* HWY_RESTRICT unaligned) {
1654 constexpr size_t kN = MaxLanes(d);
1655 StoreU(A, d, unaligned + 0 * kN);
1656 StoreU(B, d, unaligned + 1 * kN);
1657 StoreU(C, d, unaligned + 2 * kN);
1658}
1659
1660} // namespace detail
1661
1662// >= 128-bit vector, 8-bit lanes
1663template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
1664HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1665 TFromD<D>* HWY_RESTRICT unaligned) {
1666 const RebindToUnsigned<decltype(d)> du;
1667 using TU = TFromD<decltype(du)>;
1668 using VU = VFromD<decltype(du)>;
1669 const VU k5 = Set(du, TU{5});
1670 const VU k6 = Set(du, TU{6});
1671
1672 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
1673 // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
1674 // to their place, with 0x80 so lanes to be filled from other vectors are 0
1675 // to enable blending by ORing together.
1676 const VFromD<decltype(du)> shuf_A0 =
1677 Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
1678 0x80, 0x80, 4, 0x80, 0x80, 5);
1679 // Cannot reuse shuf_A0 because it contains 5.
1680 const VFromD<decltype(du)> shuf_A1 =
1681 Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
1682 3, 0x80, 0x80, 4, 0x80, 0x80);
1683 // The interleaved vectors will be named A, B, C; temporaries with suffix
1684 // 0..2 indicate which input vector's lanes they hold.
1685 // cannot reuse shuf_A0 (has 5)
1686 const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1687 const VU vA0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
1688 const VU vA1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
1689 const VU vA2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
1690 const VFromD<D> A = BitCast(d, vA0 | vA1 | vA2);
1691
1692 // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
1693 const VU shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
1694 const VU shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
1695 const VU shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
1696 const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
1697 const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
1698 const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
1699 const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
1700
1701 // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
1702 const VU shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
1703 const VU shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
1704 const VU shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
1705 const VU vC0 = TableLookupBytesOr0(v0, shuf_C0);
1706 const VU vC1 = TableLookupBytesOr0(v1, shuf_C1);
1707 const VU vC2 = TableLookupBytesOr0(v2, shuf_C2);
1708 const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
1709
1710 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
1711}
1712
1713// >= 128-bit vector, 16-bit lanes
1714template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
1715HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1716 TFromD<D>* HWY_RESTRICT unaligned) {
1717 const Repartition<uint8_t, decltype(d)> du8;
1718 using VU8 = VFromD<decltype(du8)>;
1719 const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
1720 const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
1721
1722 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
1723 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
1724 // filled from other vectors are 0 for blending. Note that these are byte
1725 // indices for 16-bit lanes.
1726 const VFromD<decltype(du8)> shuf_A1 =
1727 Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
1728 0x80, 0x80, 0x80, 0x80, 4, 5);
1729 const VFromD<decltype(du8)> shuf_A2 =
1730 Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
1731 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
1732
1733 // The interleaved vectors will be named A, B, C; temporaries with suffix
1734 // 0..2 indicate which input vector's lanes they hold.
1735 const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1736
1737 const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
1738 const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
1739 const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
1740 const VFromD<D> A = BitCast(d, A0 | A1 | A2);
1741
1742 // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
1743 const VU8 shuf_B0 = shuf_A1 + k3; // 5..4..3.
1744 const VU8 shuf_B1 = shuf_A2 + k3; // ..4..3..
1745 const VU8 shuf_B2 = shuf_A0 + k2; // .4..3..2
1746 const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
1747 const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
1748 const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
1749 const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
1750
1751 // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
1752 const VU8 shuf_C0 = shuf_B1 + k3; // ..7..6..
1753 const VU8 shuf_C1 = shuf_B2 + k3; // .7..6..5
1754 const VU8 shuf_C2 = shuf_B0 + k2; // 7..6..5.
1755 const VU8 vC0 = TableLookupBytesOr0(v0, shuf_C0);
1756 const VU8 vC1 = TableLookupBytesOr0(v1, shuf_C1);
1757 const VU8 vC2 = TableLookupBytesOr0(v2, shuf_C2);
1758 const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
1759
1760 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
1761}
1762
1763// >= 128-bit vector, 32-bit lanes
1764template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
1765HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1766 TFromD<D>* HWY_RESTRICT unaligned) {
1767 const RepartitionToWide<decltype(d)> dw;
1768
1769 const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
1770 const VFromD<D> v01_v20 = OddEven(v0, v2);
1771 // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
1772 const VFromD<D> A = BitCast(
1773 d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
1774
1775 const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
1776 const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
1777 const VFromD<D> v21_v11 = OddEven(v2, v1_321);
1778 const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
1779 // B: v1[2],v0[2], v2[1],v1[1]
1780 const VFromD<D> B = BitCast(
1781 d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
1782
1783 // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
1784 const VFromD<D> v23_v13 = OddEven(v2, v1_321);
1785 const VFromD<D> v03_v22 = OddEven(v0, v2);
1786 // C: v2[3],v1[3],v0[3], v2[2]
1787 const VFromD<D> C = BitCast(
1788 d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
1789
1790 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
1791}
1792
1793// >= 128-bit vector, 64-bit lanes
1794template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
1795HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1796 TFromD<D>* HWY_RESTRICT unaligned) {
1797 const VFromD<D> A = InterleaveLower(d, v0, v1);
1798 const VFromD<D> B = OddEven(v0, v2);
1799 const VFromD<D> C = InterleaveUpper(d, v1, v2);
1800 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
1801}
1802
1803// 64-bit vector, 8-bit lanes
1804template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
1805HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1806 VFromD<D> part2, D d,
1807 TFromD<D>* HWY_RESTRICT unaligned) {
1808 // Use full vectors for the shuffles and first result.
1809 constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
1810 const Full128<uint8_t> du;
1811 using VU = VFromD<decltype(du)>;
1812 const Full128<TFromD<D>> d_full;
1813 const VU k5 = Set(du, uint8_t{5});
1814 const VU k6 = Set(du, uint8_t{6});
1815
1816 const VFromD<decltype(d_full)> v0{part0.raw};
1817 const VFromD<decltype(d_full)> v1{part1.raw};
1818 const VFromD<decltype(d_full)> v2{part2.raw};
1819
1820 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
1821 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
1822 // filled from other vectors are 0 for blending.
1823 alignas(16) static constexpr uint8_t tbl_v0[16] = {
1824 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
1825 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
1826 alignas(16) static constexpr uint8_t tbl_v1[16] = {
1827 0x80, 0, 0x80, 0x80, 1, 0x80, //
1828 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
1829 // The interleaved vectors will be named A, B, C; temporaries with suffix
1830 // 0..2 indicate which input vector's lanes they hold.
1831 const VU shuf_A0 = Load(du, tbl_v0);
1832 const VU shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
1833 const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1834 const VU A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
1835 const VU A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
1836 const VU A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
1837 const auto A = BitCast(d_full, A0 | A1 | A2);
1838 StoreU(A, d_full, unaligned + 0 * kFullN);
1839
1840 // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
1841 const VU shuf_B0 = shuf_A2 + k6; // ..7..6..
1842 const VU shuf_B1 = shuf_A0 + k5; // .7..6..5
1843 const VU shuf_B2 = shuf_A1 + k5; // 7..6..5.
1844 const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
1845 const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
1846 const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
1847 const VFromD<D> B{BitCast(d_full, vB0 | vB1 | vB2).raw};
1848 StoreU(B, d, unaligned + 1 * kFullN);
1849}
1850
1851// 64-bit vector, 16-bit lanes
1852template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
1853HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1854 VFromD<D> part2, D dh,
1855 TFromD<D>* HWY_RESTRICT unaligned) {
1856 const Twice<D> d_full;
1857 const Full128<uint8_t> du8;
1858 using VU8 = VFromD<decltype(du8)>;
1859 const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
1860 const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
1861
1862 const VFromD<decltype(d_full)> v0{part0.raw};
1863 const VFromD<decltype(d_full)> v1{part1.raw};
1864 const VFromD<decltype(d_full)> v2{part2.raw};
1865
1866 // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
1867 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
1868 // to their place, with 0x80 so lanes to be filled from other vectors are 0
1869 // to enable blending by ORing together.
1870 alignas(16) static constexpr uint8_t tbl_v1[16] = {
1871 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
1872 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
1873 alignas(16) static constexpr uint8_t tbl_v2[16] = {
1874 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
1875 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1876
1877 // The interleaved vectors will be named A, B; temporaries with suffix
1878 // 0..2 indicate which input vector's lanes they hold.
1879 const VU8 shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
1880 // .2..1..0
1881 const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1882 const VU8 shuf_A2 = Load(du8, tbl_v2); // ..1..0..
1883
1884 const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
1885 const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
1886 const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
1887 const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
1888 StoreU(A, d_full, unaligned);
1889
1890 // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
1891 const VU8 shuf_B0 = shuf_A1 + k3; // ..3.
1892 const VU8 shuf_B1 = shuf_A2 + k3; // .3..
1893 const VU8 shuf_B2 = shuf_A0 + k2; // 3..2
1894 const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
1895 const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
1896 const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
1897 const VFromD<decltype(d_full)> B = BitCast(d_full, vB0 | vB1 | vB2);
1898 StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
1899}
1900
1901// 64-bit vector, 32-bit lanes
1902template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
1903HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1904 TFromD<D>* HWY_RESTRICT unaligned) {
1905 // (same code as 128-bit vector, 64-bit lanes)
1906 const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
1907 const VFromD<D> v01_v20 = OddEven(v0, v2);
1908 const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2);
1909 constexpr size_t kN = MaxLanes(d);
1910 StoreU(v10_v00, d, unaligned + 0 * kN);
1911 StoreU(v01_v20, d, unaligned + 1 * kN);
1912 StoreU(v21_v11, d, unaligned + 2 * kN);
1913}
1914
1915// 64-bit lanes are handled by the N=1 case below.
1916
1917// <= 32-bit vector, 8-bit lanes
1918template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
1919 HWY_IF_LANES_GT_D(D, 1)>
1920HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1921 VFromD<D> part2, D d,
1922 TFromD<D>* HWY_RESTRICT unaligned) {
1923 // Use full vectors for the shuffles and result.
1924 const Full128<uint8_t> du;
1925 using VU = VFromD<decltype(du)>;
1926 const Full128<TFromD<D>> d_full;
1927
1928 const VFromD<decltype(d_full)> v0{part0.raw};
1929 const VFromD<decltype(d_full)> v1{part1.raw};
1930 const VFromD<decltype(d_full)> v2{part2.raw};
1931
1932 // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
1933 // so lanes to be filled from other vectors are 0 to enable blending by ORing
1934 // together.
1935 alignas(16) static constexpr uint8_t tbl_v0[16] = {
1936 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
1937 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1938 // The interleaved vector will be named A; temporaries with suffix
1939 // 0..2 indicate which input vector's lanes they hold.
1940 const VU shuf_A0 = Load(du, tbl_v0);
1941 const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
1942 const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
1943 const VU A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
1944 const VU A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
1945 const VU A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
1946 const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
1947 alignas(16) TFromD<D> buf[MaxLanes(d_full)];
1948 StoreU(A, d_full, buf);
1949 CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
1950}
1951
1952// 32-bit vector, 16-bit lanes
1953template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
1954HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1955 VFromD<D> part2, D d,
1956 TFromD<D>* HWY_RESTRICT unaligned) {
1957 // Use full vectors for the shuffles and result.
1958 const Full128<uint8_t> du8;
1959 using VU8 = VFromD<decltype(du8)>;
1960 const Full128<TFromD<D>> d_full;
1961
1962 const VFromD<decltype(d_full)> v0{part0.raw};
1963 const VFromD<decltype(d_full)> v1{part1.raw};
1964 const VFromD<decltype(d_full)> v2{part2.raw};
1965
1966 // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
1967 // so lanes to be filled from other vectors are 0 to enable blending by ORing
1968 // together.
1969 alignas(16) static constexpr uint8_t tbl_v2[16] = {
1970 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
1971 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1972 // The interleaved vector will be named A; temporaries with suffix
1973 // 0..2 indicate which input vector's lanes they hold.
1974 const VU8 shuf_A2 = Load(du8, tbl_v2); // ..1..0..
1975 const VU8 shuf_A1 =
1976 CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); // ...1..0.
1977 const VU8 shuf_A0 =
1978 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); // ....1..0
1979 const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
1980 const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
1981 const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
1982 const auto A = BitCast(d_full, A0 | A1 | A2);
1983 alignas(16) TFromD<D> buf[MaxLanes(d_full)];
1984 StoreU(A, d_full, buf);
1985 CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
1986}
1987
1988// Single-element vector, any lane size: just store directly
1989template <class D, HWY_IF_LANES_D(D, 1)>
1990HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1991 TFromD<D>* HWY_RESTRICT unaligned) {
1992 StoreU(v0, d, unaligned + 0);
1993 StoreU(v1, d, unaligned + 1);
1994 StoreU(v2, d, unaligned + 2);
1995}
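// Usage sketch (illustrative addition, not part of the original header):
// writing planar R, G, B vectors as packed RGB triples via StoreInterleaved3.
// Names are hypothetical; rgb must have room for 3 * Lanes(d8) elements.
template <class D8, HWY_IF_T_SIZE_D(D8, 1)>
HWY_API void StoreRgbPixelsExample(VFromD<D8> r, VFromD<D8> g, VFromD<D8> b,
                                   D8 d8, TFromD<D8>* HWY_RESTRICT rgb) {
  // rgb[0] = r[0], rgb[1] = g[0], rgb[2] = b[0], rgb[3] = r[1], ...
  StoreInterleaved3(r, g, b, d8, rgb);
}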
1996
1997// ------------------------------ StoreInterleaved4
1998
1999namespace detail {
2000
2001// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
2002template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
2003HWY_API void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
2004 VFromD<D> vD, D d,
2005 TFromD<D>* HWY_RESTRICT unaligned) {
2006 constexpr size_t kN = MaxLanes(d);
2007 StoreU(vA, d, unaligned + 0 * kN);
2008 StoreU(vB, d, unaligned + 1 * kN);
2009 StoreU(vC, d, unaligned + 2 * kN);
2010 StoreU(vD, d, unaligned + 3 * kN);
2011}
2012
2013} // namespace detail
2014
2015// >= 128-bit vector, 8..32-bit lanes
2016template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
2017HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
2018 VFromD<D> v3, D d,
2019 TFromD<D>* HWY_RESTRICT unaligned) {
2020 const RepartitionToWide<decltype(d)> dw;
2021 const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
2022 const auto v32L = ZipLower(dw, v2, v3);
2023 const auto v10U = ZipUpper(dw, v0, v1);
2024 const auto v32U = ZipUpper(dw, v2, v3);
2025 // The interleaved vectors are vA, vB, vC, vD.
2026 const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
2027 const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
2028 const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
2029 const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
2030 detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
2031}
2032
2033// >= 128-bit vector, 64-bit lanes
2034template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
2035HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
2036 VFromD<D> v3, D d,
2037 TFromD<D>* HWY_RESTRICT unaligned) {
2038 // The interleaved vectors are vA, vB, vC, vD.
2039 const VFromD<D> vA = InterleaveLower(d, v0, v1); // v1[0] v0[0]
2040 const VFromD<D> vB = InterleaveLower(d, v2, v3);
2041 const VFromD<D> vC = InterleaveUpper(d, v0, v1);
2042 const VFromD<D> vD = InterleaveUpper(d, v2, v3);
2043 detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
2044}
2045
2046// 64-bit vector, 8..32-bit lanes
2047template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
2048HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
2049 VFromD<D> part2, VFromD<D> part3, D /* tag */,
2050 TFromD<D>* HWY_RESTRICT unaligned) {
2051 // Use full vectors to reduce the number of stores.
2052 const Full128<TFromD<D>> d_full;
2053 const RepartitionToWide<decltype(d_full)> dw;
2054 const VFromD<decltype(d_full)> v0{part0.raw};
2055 const VFromD<decltype(d_full)> v1{part1.raw};
2056 const VFromD<decltype(d_full)> v2{part2.raw};
2057 const VFromD<decltype(d_full)> v3{part3.raw};
2058 const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0]
2059 const auto v32 = ZipLower(dw, v2, v3);
2060 const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
2061 const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
2062 StoreU(A, d_full, unaligned);
2063 StoreU(B, d_full, unaligned + MaxLanes(d_full));
2064}
2065
2066// 64-bit vector, 64-bit lane
2067template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
2068HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
2069 VFromD<D> part2, VFromD<D> part3, D /* tag */,
2070 TFromD<D>* HWY_RESTRICT unaligned) {
2071 // Use full vectors to reduce the number of stores.
2072 const Full128<TFromD<D>> d_full;
2073 const VFromD<decltype(d_full)> v0{part0.raw};
2074 const VFromD<decltype(d_full)> v1{part1.raw};
2075 const VFromD<decltype(d_full)> v2{part2.raw};
2076 const VFromD<decltype(d_full)> v3{part3.raw};
2077 const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0]
2078 const auto B = InterleaveLower(d_full, v2, v3);
2079 StoreU(A, d_full, unaligned);
2080 StoreU(B, d_full, unaligned + MaxLanes(d_full));
2081}
2082
2083// <= 32-bit vectors
2084template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
2085HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
2086 VFromD<D> part2, VFromD<D> part3, D d,
2087 TFromD<D>* HWY_RESTRICT unaligned) {
2088 // Use full vectors to reduce the number of stores.
2089 const Full128<TFromD<D>> d_full;
2090 const RepartitionToWide<decltype(d_full)> dw;
2091 const VFromD<decltype(d_full)> v0{part0.raw};
2092 const VFromD<decltype(d_full)> v1{part1.raw};
2093 const VFromD<decltype(d_full)> v2{part2.raw};
2094 const VFromD<decltype(d_full)> v3{part3.raw};
2095 const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
2096 const auto v32 = ZipLower(dw, v2, v3);
2097 const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
2098 alignas(16) TFromD<D> buf[MaxLanes(d_full)];
2099 StoreU(v3210, d_full, buf);
2100 CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
2101}
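// Usage sketch (illustrative addition, not part of the original header):
// swapping the R and B channels of packed RGBA pixels by pairing
// LoadInterleaved4 with StoreInterleaved4. Names are hypothetical;
// num_pixels is assumed to be a multiple of Lanes(d8).
template <class D8, HWY_IF_T_SIZE_D(D8, 1)>
HWY_API void SwapRAndBExample(D8 d8, const TFromD<D8>* HWY_RESTRICT in,
                              size_t num_pixels,
                              TFromD<D8>* HWY_RESTRICT out) {
  const size_t N = Lanes(d8);
  for (size_t i = 0; i < num_pixels; i += N) {
    VFromD<D8> r, g, b, a;
    LoadInterleaved4(d8, in + 4 * i, r, g, b, a);
    StoreInterleaved4(b, g, r, a, d8, out + 4 * i);  // reordered channels
  }
}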
2102
2103#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
2104
2105// ------------------------------ LoadN
2106
2107#if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
2108
2109#ifdef HWY_NATIVE_LOAD_N
2110#undef HWY_NATIVE_LOAD_N
2111#else
2112#define HWY_NATIVE_LOAD_N
2113#endif
2114
2115#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
2116namespace detail {
2117
2118template <class DTo, class DFrom>
2119HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
2120 VFromD<DFrom> v) {
2121#if HWY_TARGET <= HWY_SSE2
2122 // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
2123 // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
2124 // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true
2125 (void)d_from;
2126 return ResizeBitCast(d_to, v);
2127#else
2128 // On other targets such as PPC/NEON, the contents of any lanes past the first
2129 // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
2130 // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
2131 return ZeroExtendResizeBitCast(d_to, d_from, v);
2132#endif
2133}
2134
2135} // namespace detail
2136
2137template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
2138          HWY_IF_NOT_BF16_D(D)>
2139HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2140 size_t num_lanes) {
2141 return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
2142}
2143
2144template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
2145          HWY_IF_NOT_BF16_D(D)>
2146HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2147 size_t num_lanes) {
2148 return (num_lanes > 0) ? LoadU(d, p) : no;
2149}
2150
2151template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
2152          HWY_IF_NOT_BF16_D(D)>
2153HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2154 size_t num_lanes) {
2155 const FixedTag<TFromD<D>, 1> d1;
2156
2157 if (num_lanes >= 2) return LoadU(d, p);
2158 if (num_lanes == 0) return Zero(d);
2159 return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
2160}
2161
2162template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
2163          HWY_IF_NOT_BF16_D(D)>
2164HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2165 size_t num_lanes) {
2166 const FixedTag<TFromD<D>, 1> d1;
2167
2168 if (num_lanes >= 2) return LoadU(d, p);
2169 if (num_lanes == 0) return no;
2170 return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
2171}
2172
2173template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
2174          HWY_IF_NOT_BF16_D(D)>
2175HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2176 size_t num_lanes) {
2177 const FixedTag<TFromD<D>, 2> d2;
2178 const Half<decltype(d2)> d1;
2179
2180 if (num_lanes >= 4) return LoadU(d, p);
2181 if (num_lanes == 0) return Zero(d);
2182 if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
2183
2184 // Two or three lanes.
2185 const VFromD<D> v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p));
2186 return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
2187}
2188
2189template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
2190          HWY_IF_NOT_BF16_D(D)>
2191HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2192 size_t num_lanes) {
2193 const FixedTag<TFromD<D>, 2> d2;
2194
2195 if (num_lanes >= 4) return LoadU(d, p);
2196 if (num_lanes == 0) return no;
2197 if (num_lanes == 1) return InsertLane(no, 0, p[0]);
2198
2199 // Two or three lanes.
2200 const VFromD<D> v_lo =
2201 ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p)));
2202 return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
2203}
2204
2205template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
2206          HWY_IF_NOT_BF16_D(D)>
2207HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2208 size_t num_lanes) {
2209 const FixedTag<TFromD<D>, 4> d4;
2210 const Half<decltype(d4)> d2;
2211 const Half<decltype(d2)> d1;
2212
2213 if (num_lanes >= 8) return LoadU(d, p);
2214 if (num_lanes == 0) return Zero(d);
2215 if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
2216
2217 const size_t leading_len = num_lanes & 4;
2218 VFromD<decltype(d4)> v_trailing = Zero(d4);
2219
2220 if ((num_lanes & 2) != 0) {
2221 const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
2222 if ((num_lanes & 1) != 0) {
2223 v_trailing = Combine(
2224 d4,
2225 detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
2226 v_trailing_lo2);
2227 } else {
2228 v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
2229 }
2230 } else if ((num_lanes & 1) != 0) {
2231 v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
2232 }
2233
2234 if (leading_len != 0) {
2235 return Combine(d, v_trailing, LoadU(d4, p));
2236 } else {
2237 return detail::LoadNResizeBitCast(d, d4, v_trailing);
2238 }
2239}
2240
2241template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
2242          HWY_IF_NOT_BF16_D(D)>
2243HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2244 size_t num_lanes) {
2245 const FixedTag<TFromD<D>, 4> d4;
2246 const Half<decltype(d4)> d2;
2247 const Half<decltype(d2)> d1;
2248
2249 if (num_lanes >= 8) return LoadU(d, p);
2250 if (num_lanes == 0) return no;
2251 if (num_lanes == 1) return InsertLane(no, 0, p[0]);
2252
2253 const size_t leading_len = num_lanes & 4;
2254 VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);
2255
2256 if ((num_lanes & 2) != 0) {
2257 const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
2258 if ((num_lanes & 1) != 0) {
2259 v_trailing = Combine(
2260 d4,
2261 InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
2262 ResizeBitCast(d2, no)),
2263 v_trailing_lo2);
2264 } else {
2265 v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
2266 ResizeBitCast(d4, v_trailing_lo2));
2267 }
2268 } else if ((num_lanes & 1) != 0) {
2269 v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
2270 }
2271
2272 if (leading_len != 0) {
2273 return Combine(d, v_trailing, LoadU(d4, p));
2274 } else {
2275 return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing));
2276 }
2277}
2278
2279template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
2280          HWY_IF_NOT_BF16_D(D)>
2281HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2282 size_t num_lanes) {
2283 const FixedTag<TFromD<D>, 8> d8;
2284 const Half<decltype(d8)> d4;
2285 const Half<decltype(d4)> d2;
2286 const Half<decltype(d2)> d1;
2287
2288 if (num_lanes >= 16) return LoadU(d, p);
2289 if (num_lanes == 0) return Zero(d);
2290 if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
2291
2292 const size_t leading_len = num_lanes & 12;
2293 VFromD<decltype(d4)> v_trailing = Zero(d4);
2294
2295 if ((num_lanes & 2) != 0) {
2296 const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
2297 if ((num_lanes & 1) != 0) {
2298 v_trailing = Combine(
2299 d4,
2300 detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
2301 v_trailing_lo2);
2302 } else {
2303 v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
2304 }
2305 } else if ((num_lanes & 1) != 0) {
2306 v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
2307 }
2308
2309 if (leading_len != 0) {
2310 if (leading_len >= 8) {
2311 const VFromD<decltype(d8)> v_hi7 =
2312 ((leading_len & 4) != 0)
2313 ? Combine(d8, v_trailing, LoadU(d4, p + 8))
2314 : detail::LoadNResizeBitCast(d8, d4, v_trailing);
2315 return Combine(d, v_hi7, LoadU(d8, p));
2316 } else {
2317 return detail::LoadNResizeBitCast(d, d8,
2318 Combine(d8, v_trailing, LoadU(d4, p)));
2319 }
2320 } else {
2321 return detail::LoadNResizeBitCast(d, d4, v_trailing);
2322 }
2323}
2324
2325template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
2326          HWY_IF_NOT_BF16_D(D)>
2327HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2328 size_t num_lanes) {
2329 const FixedTag<TFromD<D>, 8> d8;
2330 const Half<decltype(d8)> d4;
2331 const Half<decltype(d4)> d2;
2332 const Half<decltype(d2)> d1;
2333
2334 if (num_lanes >= 16) return LoadU(d, p);
2335 if (num_lanes == 0) return no;
2336 if (num_lanes == 1) return InsertLane(no, 0, p[0]);
2337
2338 const size_t leading_len = num_lanes & 12;
2339 VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);
2340
2341 if ((num_lanes & 2) != 0) {
2342 const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
2343 if ((num_lanes & 1) != 0) {
2344 v_trailing = Combine(
2345 d4,
2346 InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
2347 ResizeBitCast(d2, no)),
2348 v_trailing_lo2);
2349 } else {
2350 v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
2351 ResizeBitCast(d4, v_trailing_lo2));
2352 }
2353 } else if ((num_lanes & 1) != 0) {
2354 v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
2355 }
2356
2357 if (leading_len != 0) {
2358 if (leading_len >= 8) {
2359 const VFromD<decltype(d8)> v_hi7 =
2360 ((leading_len & 4) != 0)
2361 ? Combine(d8, v_trailing, LoadU(d4, p + 8))
2362 : ConcatUpperLower(d8, ResizeBitCast(d8, no),
2363 ResizeBitCast(d8, v_trailing));
2364 return Combine(d, v_hi7, LoadU(d8, p));
2365 } else {
2366 return ConcatUpperLower(
2367 d, ResizeBitCast(d, no),
2368 ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p))));
2369 }
2370 } else {
2371 const Repartition<uint32_t, D> du32;
2372 // lowest 4 bytes from v_trailing, next 4 from no.
2373 const VFromD<decltype(du32)> lo8 =
2374 InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no));
2375 return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8));
2376 }
2377}
2378
2379#if HWY_MAX_BYTES >= 32
2380
2381template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
2382HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2383 size_t num_lanes) {
2384 if (num_lanes >= Lanes(d)) return LoadU(d, p);
2385
2386 const Half<decltype(d)> dh;
2387 const size_t half_N = Lanes(dh);
2388 if (num_lanes <= half_N) {
2389 return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
2390 } else {
2391 const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
2392 const VFromD<decltype(dh)> v_hi = LoadN(dh, p + half_N, num_lanes - half_N);
2393 return Combine(d, v_hi, v_lo);
2394 }
2395}
2396
2397template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
2398HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2399 size_t num_lanes) {
2400 if (num_lanes >= Lanes(d)) return LoadU(d, p);
2401
2402 const Half<decltype(d)> dh;
2403 const size_t half_N = Lanes(dh);
2404 const VFromD<decltype(dh)> no_h = LowerHalf(no);
2405 if (num_lanes <= half_N) {
2406 return ConcatUpperLower(d, no,
2407 ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes)));
2408 } else {
2409 const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
2410 const VFromD<decltype(dh)> v_hi =
2411 LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
2412 return Combine(d, v_hi, v_lo);
2413 }
2414}
2415
2416#endif // HWY_MAX_BYTES >= 32
2417
2418template <class D, HWY_IF_BF16_D(D)>
2419HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2420 size_t num_lanes) {
2421 const RebindToUnsigned<D> du;
2422 return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
2423}
2424
2425template <class D, HWY_IF_BF16_D(D)>
2426HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2427 size_t num_lanes) {
2428 const RebindToUnsigned<D> du;
2429 return BitCast(
2430 d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
2431}
2432
2433#else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
2434
2435// For SVE and non-sanitizer AVX-512; RVV has its own specialization.
2436template <class D>
2437HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2438 size_t num_lanes) {
2439#if HWY_MEM_OPS_MIGHT_FAULT
2440 if (num_lanes <= 0) return Zero(d);
2441#endif
2442
2443 return MaskedLoad(FirstN(d, num_lanes), d, p);
2444}
2445
2446template <class D>
2447HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2448 size_t num_lanes) {
2449#if HWY_MEM_OPS_MIGHT_FAULT
2450 if (num_lanes <= 0) return no;
2451#endif
2452
2453 return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
2454}
2455
2456#endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
2457#endif // HWY_NATIVE_LOAD_N
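// Usage sketch (illustrative addition, not part of the original header):
// summing an array whose length need not be a multiple of the vector size.
// LoadN zero-pads the final partial vector, so the tail reuses the same
// accumulator. The function name is hypothetical; assumes SumOfLanes is
// supported for the lane type.
template <class D>
HWY_API TFromD<D> SumWithTailExample(D d, const TFromD<D>* HWY_RESTRICT p,
                                     size_t count) {
  const size_t N = Lanes(d);
  VFromD<D> sum = Zero(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    sum = Add(sum, LoadU(d, p + i));
  }
  // Remaining count - i < N elements; lanes beyond them are zero.
  sum = Add(sum, LoadN(d, p + i, count - i));
  return GetLane(SumOfLanes(d, sum));
}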
2458
2459// ------------------------------ StoreN
2460#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
2461#ifdef HWY_NATIVE_STORE_N
2462#undef HWY_NATIVE_STORE_N
2463#else
2464#define HWY_NATIVE_STORE_N
2465#endif
2466
2467#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
2468namespace detail {
2469
2470template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
2471HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
2472 constexpr size_t kMinShrVectBytes =
2473      HWY_TARGET_IS_NEON ? 8 : 16;
2474 const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
2475 return ResizeBitCast(
2476 dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
2477}
2478
2479template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
2480HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
2481 return UpperHalf(dh, v);
2482}
2483
2484} // namespace detail
2485
2486template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
2487 typename T = TFromD<D>>
2488HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2489 size_t max_lanes_to_store) {
2490 if (max_lanes_to_store > 0) {
2491 StoreU(v, d, p);
2492 }
2493}
2494
2495template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
2496 typename T = TFromD<D>>
2497HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2498 size_t max_lanes_to_store) {
2499 if (max_lanes_to_store > 1) {
2500 StoreU(v, d, p);
2501 } else if (max_lanes_to_store == 1) {
2502 const FixedTag<TFromD<D>, 1> d1;
2503 StoreU(LowerHalf(d1, v), d1, p);
2504 }
2505}
2506
2507template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
2508 typename T = TFromD<D>>
2509HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2510 size_t max_lanes_to_store) {
2511 const FixedTag<TFromD<D>, 2> d2;
2512 const Half<decltype(d2)> d1;
2513
2514 if (max_lanes_to_store > 1) {
2515 if (max_lanes_to_store >= 4) {
2516 StoreU(v, d, p);
2517 } else {
2518 StoreU(ResizeBitCast(d2, v), d2, p);
2519 if (max_lanes_to_store == 3) {
2520 StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2);
2521 }
2522 }
2523 } else if (max_lanes_to_store == 1) {
2524 StoreU(ResizeBitCast(d1, v), d1, p);
2525 }
2526}
2527
2528template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
2529 typename T = TFromD<D>>
2530HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2531 size_t max_lanes_to_store) {
2532 const FixedTag<TFromD<D>, 4> d4;
2533 const Half<decltype(d4)> d2;
2534 const Half<decltype(d2)> d1;
2535
2536 if (max_lanes_to_store <= 1) {
2537 if (max_lanes_to_store == 1) {
2538 StoreU(ResizeBitCast(d1, v), d1, p);
2539 }
2540 } else if (max_lanes_to_store >= 8) {
2541 StoreU(v, d, p);
2542 } else if (max_lanes_to_store >= 4) {
2543 StoreU(LowerHalf(d4, v), d4, p);
2544 StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4,
2545 max_lanes_to_store - 4);
2546 } else {
2547 StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store);
2548 }
2549}
2550
2551template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
2552 typename T = TFromD<D>>
2553HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2554 size_t max_lanes_to_store) {
2555 const FixedTag<TFromD<D>, 8> d8;
2556 const Half<decltype(d8)> d4;
2557 const Half<decltype(d4)> d2;
2558 const Half<decltype(d2)> d1;
2559
2560 if (max_lanes_to_store <= 1) {
2561 if (max_lanes_to_store == 1) {
2562 StoreU(ResizeBitCast(d1, v), d1, p);
2563 }
2564 } else if (max_lanes_to_store >= 16) {
2565 StoreU(v, d, p);
2566 } else if (max_lanes_to_store >= 8) {
2567 StoreU(LowerHalf(d8, v), d8, p);
2568 StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8,
2569 max_lanes_to_store - 8);
2570 } else {
2571 StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store);
2572 }
2573}
2574
2575#if HWY_MAX_BYTES >= 32
2576template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
2577HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2578 size_t max_lanes_to_store) {
2579 const size_t N = Lanes(d);
2580 if (max_lanes_to_store >= N) {
2581 StoreU(v, d, p);
2582 return;
2583 }
2584
2585 const Half<decltype(d)> dh;
2586 const size_t half_N = Lanes(dh);
2587 if (max_lanes_to_store <= half_N) {
2588 StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
2589 } else {
2590 StoreU(LowerHalf(dh, v), dh, p);
2591 StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
2592 }
2593}
2594#endif // HWY_MAX_BYTES >= 32
2595
2596#else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
2597template <class D, typename T = TFromD<D>>
2598HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2599 size_t max_lanes_to_store) {
2600 const size_t N = Lanes(d);
2601 const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
2602#if HWY_MEM_OPS_MIGHT_FAULT
2603 if (clamped_max_lanes_to_store == 0) return;
2604#endif
2605
2606 BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);
2607
2608 detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
2609}
2610#endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
2611
2612#endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
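// Usage sketch (illustrative addition, not part of the original header):
// copying the final partial vector of an array without reading or writing
// past the end, by pairing LoadN with StoreN. Names are hypothetical.
template <class D>
HWY_API void CopyTailExample(D d, const TFromD<D>* HWY_RESTRICT from,
                             TFromD<D>* HWY_RESTRICT to, size_t count) {
  const VFromD<D> v = LoadN(d, from, count);  // lanes >= count are zero
  StoreN(v, d, to, count);                    // writes at most count elements
}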
2613
2614// ------------------------------ Scatter
2615
2616#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
2617#ifdef HWY_NATIVE_SCATTER
2618#undef HWY_NATIVE_SCATTER
2619#else
2620#define HWY_NATIVE_SCATTER
2621#endif
2622
2623template <class D, typename T = TFromD<D>>
2624HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
2625 VFromD<RebindToSigned<D>> offset) {
2626 const RebindToSigned<decltype(d)> di;
2627 using TI = TFromD<decltype(di)>;
2628 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2629
2630 HWY_ALIGN T lanes[MaxLanes(d)];
2631 Store(v, d, lanes);
2632
2633 HWY_ALIGN TI offset_lanes[MaxLanes(d)];
2634 Store(offset, di, offset_lanes);
2635
2636 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2637 for (size_t i = 0; i < MaxLanes(d); ++i) {
2638 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2639 }
2640}
2641
2642template <class D, typename T = TFromD<D>>
2643HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
2644 VFromD<RebindToSigned<D>> index) {
2645 const RebindToSigned<decltype(d)> di;
2646 using TI = TFromD<decltype(di)>;
2647 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2648
2649 HWY_ALIGN T lanes[MaxLanes(d)];
2650 Store(v, d, lanes);
2651
2652 HWY_ALIGN TI index_lanes[MaxLanes(d)];
2653 Store(index, di, index_lanes);
2654
2655 for (size_t i = 0; i < MaxLanes(d); ++i) {
2656 base[index_lanes[i]] = lanes[i];
2657 }
2658}
2659
2660template <class D, typename T = TFromD<D>>
2661HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
2662 T* HWY_RESTRICT base,
2663 VFromD<RebindToSigned<D>> index) {
2664 const RebindToSigned<decltype(d)> di;
2665 using TI = TFromD<decltype(di)>;
2666 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2667
2668 HWY_ALIGN T lanes[MaxLanes(d)];
2669 Store(v, d, lanes);
2670
2671 HWY_ALIGN TI index_lanes[MaxLanes(d)];
2672 Store(index, di, index_lanes);
2673
2674 HWY_ALIGN TI mask_lanes[MaxLanes(di)];
2675 Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
2676
2677 for (size_t i = 0; i < MaxLanes(d); ++i) {
2678 if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
2679 }
2680}
2681
2682#endif // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
2683
2684// ------------------------------ Gather
2685
2686#if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
2687#ifdef HWY_NATIVE_GATHER
2688#undef HWY_NATIVE_GATHER
2689#else
2690#define HWY_NATIVE_GATHER
2691#endif
2692
2693template <class D, typename T = TFromD<D>>
2694HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
2695 VFromD<RebindToSigned<D>> offset) {
2696 const RebindToSigned<D> di;
2697 using TI = TFromD<decltype(di)>;
2698 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2699
2700 HWY_ALIGN TI offset_lanes[MaxLanes(d)];
2701 Store(offset, di, offset_lanes);
2702
2703 HWY_ALIGN T lanes[MaxLanes(d)];
2704 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
2705 for (size_t i = 0; i < MaxLanes(d); ++i) {
2706 HWY_DASSERT(offset_lanes[i] >= 0);
2707 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
2708 }
2709 return Load(d, lanes);
2710}
2711
2712template <class D, typename T = TFromD<D>>
2713HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
2714 VFromD<RebindToSigned<D>> index) {
2715 const RebindToSigned<D> di;
2716 using TI = TFromD<decltype(di)>;
2717 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2718
2719 HWY_ALIGN TI index_lanes[MaxLanes(d)];
2720 Store(index, di, index_lanes);
2721
2722 HWY_ALIGN T lanes[MaxLanes(d)];
2723 for (size_t i = 0; i < MaxLanes(d); ++i) {
2724 HWY_DASSERT(index_lanes[i] >= 0);
2725 lanes[i] = base[index_lanes[i]];
2726 }
2727 return Load(d, lanes);
2728}
2729
2730template <class D, typename T = TFromD<D>>
2731HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
2732 const T* HWY_RESTRICT base,
2733 VFromD<RebindToSigned<D>> index) {
2734 const RebindToSigned<D> di;
2735 using TI = TFromD<decltype(di)>;
2736 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2737
2738 HWY_ALIGN TI index_lanes[MaxLanes(di)];
2739 Store(index, di, index_lanes);
2740
2741 HWY_ALIGN TI mask_lanes[MaxLanes(di)];
2742 Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
2743
2744 HWY_ALIGN T lanes[MaxLanes(d)];
2745 for (size_t i = 0; i < MaxLanes(d); ++i) {
2746 HWY_DASSERT(index_lanes[i] >= 0);
2747 lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
2748 }
2749 return Load(d, lanes);
2750}
2751
2752template <class D, typename T = TFromD<D>>
2753HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
2754 const T* HWY_RESTRICT base,
2755 VFromD<RebindToSigned<D>> index) {
2756 const RebindToSigned<D> di;
2757 using TI = TFromD<decltype(di)>;
2758 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2759
2760 HWY_ALIGN TI index_lanes[MaxLanes(di)];
2761 Store(index, di, index_lanes);
2762
2763 HWY_ALIGN TI mask_lanes[MaxLanes(di)];
2764 Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
2765
2766 HWY_ALIGN T no_lanes[MaxLanes(d)];
2767 Store(no, d, no_lanes);
2768
2769 HWY_ALIGN T lanes[MaxLanes(d)];
2770 for (size_t i = 0; i < MaxLanes(d); ++i) {
2771 HWY_DASSERT(index_lanes[i] >= 0);
2772 lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
2773 }
2774 return Load(d, lanes);
2775}
2776
2777#endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
2778
2779// ------------------------------ ScatterN/GatherN
2780
2781template <class D, typename T = TFromD<D>>
2782HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
2783                           VFromD<RebindToSigned<D>> index,
2784 const size_t max_lanes_to_store) {
2785 MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
2786}
2787
2788template <class D, typename T = TFromD<D>>
2789HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
2790                               VFromD<RebindToSigned<D>> index,
2791 const size_t max_lanes_to_load) {
2792 return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
2793}
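// Usage sketch (illustrative addition, not part of the original header):
// gathering table entries selected by an index vector, e.g. per-bucket
// weights. Names are hypothetical; every index must be within bounds.
template <class D>
HWY_API VFromD<D> LookupWeightsExample(D d,
                                       const TFromD<D>* HWY_RESTRICT table,
                                       VFromD<RebindToSigned<D>> indices) {
  return GatherIndex(d, table, indices);  // lane i = table[indices[i]]
}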
2794
2795// ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
2796
2797#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
2798#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
2799#undef HWY_NATIVE_INTEGER_ABS_DIFF
2800#else
2801#define HWY_NATIVE_INTEGER_ABS_DIFF
2802#endif
2803
2804template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
2805HWY_API V AbsDiff(V a, V b) {
2806 return Sub(Max(a, b), Min(a, b));
2807}
2808
2809#endif // HWY_NATIVE_INTEGER_ABS_DIFF
2810
2811#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
2812#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2813#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2814#else
2815#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2816#endif
2817
2818template <class V, HWY_IF_UI8_D(DFromV<V>),
2819 HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
2820HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
2821 const DFromV<decltype(a)> d;
2822 const RebindToUnsigned<decltype(d)> du;
2823 const RepartitionToWideX3<decltype(d)> dw;
2824
2825 return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
2826}
2827
2828#endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
2829
2830// ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64
2831
2832#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2833#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
2834#undef HWY_NATIVE_I32_SATURATED_ADDSUB
2835#else
2836#define HWY_NATIVE_I32_SATURATED_ADDSUB
2837#endif
2838
2839template <class V, HWY_IF_I32_D(DFromV<V>)>
2840HWY_API V SaturatedAdd(V a, V b) {
2841 const DFromV<decltype(a)> d;
2842 const auto sum = Add(a, b);
2843 const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
2844 const auto overflow_result =
2845 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
2846 return IfNegativeThenElse(overflow_mask, overflow_result, sum);
2847}
2848
2849template <class V, HWY_IF_I32_D(DFromV<V>)>
2850HWY_API V SaturatedSub(V a, V b) {
2851 const DFromV<decltype(a)> d;
2852 const auto diff = Sub(a, b);
2853 const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
2854 const auto overflow_result =
2855 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
2856 return IfNegativeThenElse(overflow_mask, overflow_result, diff);
2857}
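// Scalar sketch (illustrative addition, not part of the original header) of
// the overflow test above: signed addition overflows exactly when a and b
// have the same sign but the wrapped sum has a different sign, and the
// saturated result is LimitsMax for a >= 0 and LimitsMin for a < 0. The
// function name is hypothetical.
static inline int32_t SaturatedAddI32ScalarSketch(int32_t a, int32_t b) {
  const uint32_t ua = static_cast<uint32_t>(a);
  const uint32_t ub = static_cast<uint32_t>(b);
  const uint32_t sum = ua + ub;  // wraps; well-defined for unsigned
  const bool overflow = ((~(ua ^ ub) & (ua ^ sum)) >> 31) != 0;
  const int32_t saturated =
      (a < 0) ? LimitsMin<int32_t>() : LimitsMax<int32_t>();
  return overflow ? saturated : static_cast<int32_t>(sum);
}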
2858
2859#endif // HWY_NATIVE_I32_SATURATED_ADDSUB
2860
2861#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2862#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
2863#undef HWY_NATIVE_I64_SATURATED_ADDSUB
2864#else
2865#define HWY_NATIVE_I64_SATURATED_ADDSUB
2866#endif
2867
2868template <class V, HWY_IF_I64_D(DFromV<V>)>
2869HWY_API V SaturatedAdd(V a, V b) {
2870 const DFromV<decltype(a)> d;
2871 const auto sum = Add(a, b);
2872 const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
2873 const auto overflow_result =
2874 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
2875 return IfNegativeThenElse(overflow_mask, overflow_result, sum);
2876}
2877
2878template <class V, HWY_IF_I64_D(DFromV<V>)>
2879HWY_API V SaturatedSub(V a, V b) {
2880 const DFromV<decltype(a)> d;
2881 const auto diff = Sub(a, b);
2882 const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
2883 const auto overflow_result =
2884 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
2885 return IfNegativeThenElse(overflow_mask, overflow_result, diff);
2886}
2887
2888#endif // HWY_NATIVE_I64_SATURATED_ADDSUB
2889
2890#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2891#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
2892#undef HWY_NATIVE_U32_SATURATED_ADDSUB
2893#else
2894#define HWY_NATIVE_U32_SATURATED_ADDSUB
2895#endif
2896
2897template <class V, HWY_IF_U32_D(DFromV<V>)>
2898HWY_API V SaturatedAdd(V a, V b) {
2899 return Add(a, Min(b, Not(a)));
2900}
2901
2902template <class V, HWY_IF_U32_D(DFromV<V>)>
2903HWY_API V SaturatedSub(V a, V b) {
2904 return Sub(a, Min(a, b));
2905}
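// Scalar sketch (illustrative addition, not part of the original header) of
// the unsigned clamping above: ~a equals the type's maximum minus a, so
// adding Min(b, ~a) can never overflow, and subtracting Min(a, b) can never
// go below zero. Function names are hypothetical.
static inline uint32_t SaturatedAddU32Sketch(uint32_t a, uint32_t b) {
  return a + HWY_MIN(b, ~a);  // == 0xFFFFFFFF whenever a + b would overflow
}
static inline uint32_t SaturatedSubU32Sketch(uint32_t a, uint32_t b) {
  return a - HWY_MIN(a, b);  // == 0 whenever b >= a
}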
2906
2907#endif // HWY_NATIVE_U32_SATURATED_ADDSUB
2908
2909#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
2910#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
2911#undef HWY_NATIVE_U64_SATURATED_ADDSUB
2912#else
2913#define HWY_NATIVE_U64_SATURATED_ADDSUB
2914#endif
2915
2916template <class V, HWY_IF_U64_D(DFromV<V>)>
2917HWY_API V SaturatedAdd(V a, V b) {
2918 return Add(a, Min(b, Not(a)));
2919}
2920
2921template <class V, HWY_IF_U64_D(DFromV<V>)>
2922HWY_API V SaturatedSub(V a, V b) {
2923 return Sub(a, Min(a, b));
2924}
2925
2926#endif // HWY_NATIVE_U64_SATURATED_ADDSUB
2927
2928// ------------------------------ Unsigned to signed demotions
2929
2930template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
2932 class V2 = VFromD<Rebind<TFromV<V>, DN>>,
2933 hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
2934 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
2935HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
2936 const DFromV<decltype(v)> d;
2937 const RebindToSigned<decltype(d)> di;
2938 const RebindToUnsigned<decltype(dn)> dn_u;
2939
2940 // First, do a signed to signed demotion. This will convert any values
2941 // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
2942 // negative value.
2943 const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v));
2944
2945 // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
2946 // using an unsigned Min operation.
2947 const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());
2948
2949 return BitCast(
2950 dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
2951}
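// Usage sketch (illustrative addition, not part of the original header):
// demoting uint16_t lanes to int8_t with saturation via the overload above,
// so 0..127 pass through and larger values become 127. The function name is
// hypothetical.
template <class D16, HWY_IF_U16_D(D16)>
HWY_API VFromD<Rebind<int8_t, D16>> DemoteU16ToI8Example(D16 d16,
                                                         VFromD<D16> v) {
  const Rebind<int8_t, decltype(d16)> di8;
  return DemoteTo(di8, v);
}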
2952
2953#if HWY_TARGET != HWY_SCALAR || HWY_IDE
2954template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
2956 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
2957 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
2958 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
2959HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
2960 const DFromV<decltype(a)> d;
2961 const RebindToSigned<decltype(d)> di;
2962 const RebindToUnsigned<decltype(dn)> dn_u;
2963
2964 // First, do a signed to signed demotion. This will convert any values
2965 // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
2966 // negative value.
2967 const auto i2i_demote_result =
2968 ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b));
2969
2970 // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
2971 // using an unsigned Min operation.
2972 const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());
2973
2974 return BitCast(
2975 dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
2976}
2977#endif
2978
2979// ------------------------------ PromoteLowerTo
2980
2981// There is no codegen advantage for a native version of this. It is provided
2982// only for convenience.
2983template <class D, class V>
2984HWY_API VFromD<D> PromoteLowerTo(D d, V v) {
2985 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
2986 // because it cannot be deduced from D (could be either bf16 or f16).
2987 const Rebind<TFromV<V>, decltype(d)> dh;
2988 return PromoteTo(d, LowerHalf(dh, v));
2989}
2990
2991// ------------------------------ PromoteUpperTo
2992
2993#if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
2994#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
2995#undef HWY_NATIVE_PROMOTE_UPPER_TO
2996#else
2997#define HWY_NATIVE_PROMOTE_UPPER_TO
2998#endif
2999
3000// This requires UpperHalf.
3001#if HWY_TARGET != HWY_SCALAR || HWY_IDE
3002
3003template <class D, class V>
3004HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
3005 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3006 // because it cannot be deduced from D (could be either bf16 or f16).
3007 const Rebind<TFromV<V>, decltype(d)> dh;
3008 return PromoteTo(d, UpperHalf(dh, v));
3009}
3010
3011#endif // HWY_TARGET != HWY_SCALAR
3012#endif // HWY_NATIVE_PROMOTE_UPPER_TO
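#if HWY_TARGET != HWY_SCALAR
// Usage sketch (illustrative addition, not part of the original header):
// widening a vector of uint8_t into two vectors of uint16_t by promoting the
// lower and upper halves separately. Names are hypothetical.
template <class D8, HWY_IF_U8_D(D8)>
HWY_API void PromoteHalvesExample(D8 d8, VFromD<D8> v,
                                  VFromD<RepartitionToWide<D8>>& lo,
                                  VFromD<RepartitionToWide<D8>>& hi) {
  const RepartitionToWide<decltype(d8)> d16;
  lo = PromoteLowerTo(d16, v);
  hi = PromoteUpperTo(d16, v);
}
#endif  // HWY_TARGET != HWY_SCALAR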
3013
3014// ------------------------------ PromoteEvenTo/PromoteOddTo
3015
3016#if HWY_TARGET != HWY_SCALAR
3017namespace detail {
3018
3019// Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
3020// there are target-specific specializations for some of the
3021// detail::PromoteEvenTo and detail::PromoteOddTo cases on
3022// SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
3023
3024// All targets except HWY_SCALAR use the implementations of
3025// detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
3026// least some of the PromoteEvenTo and PromoteOddTo cases.
3027
3028// Signed to signed PromoteEvenTo/PromoteOddTo
3029template <size_t kToLaneSize, class D, class V>
3030HWY_INLINE VFromD<D> PromoteEvenTo(
3031 hwy::SignedTag /*to_type_tag*/,
3032 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3033 hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
3034#if HWY_IS_LITTLE_ENDIAN
3035 // On little-endian targets, need to shift each lane of the bitcasted vector
3036 // left by kToLaneSize * 4 bits to get the bits of the even source lanes into
3037 // the upper kToLaneSize * 4 bits of even_in_hi.
3038 const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
3039#else
3040 // On big-endian targets, the bits of the even source lanes are already in
3041 // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
3042 const auto even_in_hi = BitCast(d_to, v);
3043#endif
3044
3045 // Right-shift even_in_hi by kToLaneSize * 4 bits
3046 return ShiftRight<kToLaneSize * 4>(even_in_hi);
3047}
3048
3049template <size_t kToLaneSize, class D, class V>
3050HWY_INLINE VFromD<D> PromoteOddTo(
3051 hwy::SignedTag /*to_type_tag*/,
3052 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3053 hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
3054#if HWY_IS_LITTLE_ENDIAN
3055 // On little-endian targets, the bits of the odd source lanes are already in
3056 // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
3057 const auto odd_in_hi = BitCast(d_to, v);
3058#else
3059 // On big-endian targets, need to shift each lane of the bitcasted vector left
3060 // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
3061 // upper kToLaneSize * 4 bits of odd_in_hi.
3062 const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
3063#endif
3064
3065 // Right-shift odd_in_hi by kToLaneSize * 4 bits
3066 return ShiftRight<kToLaneSize * 4>(odd_in_hi);
3067}
3068
3069// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
3070template <size_t kToLaneSize, class D, class V>
3071HWY_INLINE VFromD<D> PromoteEvenTo(
3072 hwy::UnsignedTag /*to_type_tag*/,
3073 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3074 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
3075#if HWY_IS_LITTLE_ENDIAN
3076 // On little-endian targets, the bits of the even source lanes are already
3077 // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
3078
3079 // Simply need to zero out the upper bits of each lane of the bitcasted
3080 // vector.
3081 return And(BitCast(d_to, v),
3082 Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
3083#else
3084 // On big-endian targets, need to shift each lane of the bitcasted vector
3085 // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
3086 // the lower kToLaneSize * 4 bits of the result.
3087
3088 // The right shift below will zero out the upper kToLaneSize * 4 bits of the
3089 // result.
3090 return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
3091#endif
3092}
3093
3094template <size_t kToLaneSize, class D, class V>
3095HWY_INLINE VFromD<D> PromoteOddTo(
3096 hwy::UnsignedTag /*to_type_tag*/,
3097 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3098 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
3099#if HWY_IS_LITTLE_ENDIAN
3100 // On little-endian targets, need to shift each lane of the bitcasted vector
3101 // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
3102 // the lower kToLaneSize * 4 bits of the result.
3103
3104 // The right shift below will zero out the upper kToLaneSize * 4 bits of the
3105 // result.
3106 return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
3107#else
3109 // On big-endian targets, the bits of the odd source lanes are already
3109 // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
3110
3111 // Simply need to zero out the upper bits of each lane of the bitcasted
3112 // vector.
3113 return And(BitCast(d_to, v),
3114 Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
3115#endif
3116}
3117
3118// Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
3119// followed by BitCast to signed
3120template <size_t kToLaneSize, class D, class V>
3121HWY_INLINE VFromD<D> PromoteEvenTo(
3122 hwy::SignedTag /*to_type_tag*/,
3123 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3124 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
3125 const RebindToUnsigned<decltype(d_to)> du_to;
3126 return BitCast(d_to,
3127                 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
3128 hwy::UnsignedTag(), du_to, v));
3129}
3130
3131template <size_t kToLaneSize, class D, class V>
3132HWY_INLINE VFromD<D> PromoteOddTo(
3133 hwy::SignedTag /*to_type_tag*/,
3134 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3135 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
3136 const RebindToUnsigned<decltype(d_to)> du_to;
3137 return BitCast(d_to,
3138                 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
3139 hwy::UnsignedTag(), du_to, v));
3140}
3141
3142// BF16->F32 PromoteEvenTo
3143
3144// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
3145// instead of hwy::FloatTag on targets that use scalable vectors.
3146
3147// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
3148// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
3149
3150// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
3151// to be a bfloat16_t vector.
3152template <class FromTypeTag, class DF32, class VBF16,
3153 class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
3154 hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
3155HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3156 hwy::SizeTag<4> /*to_lane_size_tag*/,
3157 FromTypeTag /*from_type_tag*/, DF32 d_to,
3158 VBF16 v) {
3159 const RebindToUnsigned<decltype(d_to)> du_to;
3160#if HWY_IS_LITTLE_ENDIAN
3161 // On little-endian platforms, need to shift left each lane of the bitcasted
3162 // vector by 16 bits.
3163 return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
3164#else
3165 // On big-endian platforms, the even lanes of the source vector are already
3166 // in the upper 16 bits of the lanes of the bitcasted vector.
3167
3168 // Need to simply zero out the lower 16 bits of each lane of the bitcasted
3169 // vector.
3170 return BitCast(d_to,
3171 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
3172#endif
3173}
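
// Illustrative scalar sketch (not part of the library): promoting a bf16 lane
// to f32 only requires placing its 16 bits in the upper half of the f32 bit
// pattern, because bf16 is the upper half of an IEEE binary32:
//
//   uint16_t bf16_bits;  // one source lane
//   uint32_t f32_bits = static_cast<uint32_t>(bf16_bits) << 16;
//   // memcpy f32_bits into a float to obtain the promoted value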
3174
3175// BF16->F32 PromoteOddTo
3176
3177// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
3178// instead of hwy::FloatTag on targets that use scalable vectors.
3179
3180// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
3181// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
3182
3183// The BF16->F32 PromoteOddTo overload is only enabled if VBF16 is considered
3184// to be a bfloat16_t vector.
3185template <class FromTypeTag, class DF32, class VBF16,
3186 class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
3187 hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
3188HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
3189 hwy::SizeTag<4> /*to_lane_size_tag*/,
3190 FromTypeTag /*from_type_tag*/, DF32 d_to,
3191 VBF16 v) {
3192 const RebindToUnsigned<decltype(d_to)> du_to;
3193#if HWY_IS_LITTLE_ENDIAN
3194 // On little-endian platforms, the odd lanes of the source vector are already
3195 // in the upper 16 bits of the lanes of the bitcasted vector.
3196
3197 // Need to simply zero out the lower 16 bits of each lane of the bitcasted
3198 // vector.
3199 return BitCast(d_to,
3200 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
3201#else
3202 // On big-endian platforms, need to shift left each lane of the bitcasted
3203 // vector by 16 bits.
3204 return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
3205#endif
3206}
3207
3208// Default PromoteEvenTo/PromoteOddTo implementations
3209template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
3210 class V, HWY_IF_LANES_D(D, 1)>
3211HWY_INLINE VFromD<D> PromoteEvenTo(
3212 ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3213 FromTypeTag /*from_type_tag*/, D d_to, V v) {
3214 return PromoteLowerTo(d_to, v);
3215}
3216
3217template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
3218 class V, HWY_IF_LANES_GT_D(D, 1)>
3219HWY_INLINE VFromD<D> PromoteEvenTo(
3220 ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3221 FromTypeTag /*from_type_tag*/, D d_to, V v) {
3222 const DFromV<decltype(v)> d;
3223 return PromoteLowerTo(d_to, ConcatEven(d, v, v));
3224}
3225
3226template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
3227 class V>
3228HWY_INLINE VFromD<D> PromoteOddTo(
3229 ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
3230 FromTypeTag /*from_type_tag*/, D d_to, V v) {
3231 const DFromV<decltype(v)> d;
3232 return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
3233}
3234
3235} // namespace detail
3236
3237template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
3238 class V2 = VFromD<Repartition<TFromV<V>, D>>,
3239 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
3240HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
3241 return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
3242 hwy::SizeTag<sizeof(TFromD<D>)>(),
3243 hwy::TypeTag<TFromV<V>>(), d, v);
3244}
3245
3246template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
3247 class V2 = VFromD<Repartition<TFromV<V>, D>>,
3248 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
3249HWY_API VFromD<D> PromoteOddTo(D d, V v) {
3250 return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
3251 hwy::SizeTag<sizeof(TFromD<D>)>(),
3252 hwy::TypeTag<TFromV<V>>(), d, v);
3253}
3254#endif // HWY_TARGET != HWY_SCALAR
3255
3256// ------------------------------ float16_t <-> float
3257
3258#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
3259#ifdef HWY_NATIVE_F16C
3260#undef HWY_NATIVE_F16C
3261#else
3262#define HWY_NATIVE_F16C
3263#endif
3264
3265template <class D, HWY_IF_F32_D(D)>
3266HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
3267 const RebindToSigned<decltype(df32)> di32;
3268 const RebindToUnsigned<decltype(df32)> du32;
3269 const Rebind<uint16_t, decltype(df32)> du16;
3270 using VU32 = VFromD<decltype(du32)>;
3271
3272 const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
3273 const VU32 sign = ShiftRight<15>(bits16);
3274 const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F));
3275 const VU32 mantissa = And(bits16, Set(du32, 0x3FF));
3276 const VU32 subnormal =
3277 BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)),
3278 Set(df32, 1.0f / 16384 / 1024)));
3279
3280 const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15));
3281 const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
3282 const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32);
3283 const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal);
3284 return BitCast(df32, Or(ShiftLeft<31>(sign), bits32));
3285}
3286
3287template <class D, HWY_IF_F16_D(D)>
3288HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
3289 const RebindToSigned<decltype(df16)> di16;
3290 const Rebind<int32_t, decltype(df16)> di32;
3291 const RebindToFloat<decltype(di32)> df32;
3292 const RebindToUnsigned<decltype(df32)> du32;
3293
3294 // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
3295 // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
3296 // mantissa of a F16
3297
3298 // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as
3299 // 2^(-14) is the smallest positive normal F16 value and as we want 13
3300 // mantissa bits (including the implicit 1 bit) to the left of the
3301 // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13
3302
3303 // The biased exponent of round_incr[i] needs to be at least 126 as
3304 // (-14) + 13 + 127 is equal to 126
3305
3306 // We also want the biased exponent of round_incr[i] to be less than or equal
3307 // to 255 (which is equal to MaxExponentField<float>())
3308
3309 // The biased F32 exponent of round_incr[i] is equal to
3310 // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126)
3311
3312 // hi9_bits[i] is equal to the upper 9 bits of v[i]
3313 const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));
3314
3315 const auto k13 = Set(du32, uint32_t{13u});
3316
3317 // Minimum biased F32 exponent of round_incr
3318 const auto k126 = Set(du32, uint32_t{126u});
3319
3320 // round_incr_hi9_bits[i] is equivalent to
3321 // (hi9_bits[i] & 0x100) |
3322 // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
3323
3324#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
3325 const auto k255 = Set(du32, uint32_t{255u});
3326 const auto round_incr_hi9_bits = BitwiseIfThenElse(
3327 k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
3328#else
3329 // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can
3330 // be incremented by 13 and clamped to the [13, 255] range without overflowing
3331 // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8
3332 // exponent bits in an F32
3333
3334 // U8 Max can be used on targets other than SCALAR and EMU128 to clamp
3335 // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign
3336 // bit
3337
3338 const Repartition<uint8_t, decltype(du32)> du32_as_u8;
3339 const auto round_incr_hi9_bits = BitCast(
3340 du32,
3341 Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
3342 BitCast(du32_as_u8, k126)));
3343#endif
3344
3345 // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
3346 // (round_incr_hi9_bits & 0xFF) is equal to
3347 // HWY_MAX(HWY_MIN((hi9_bits & 0xFF) + 13, 255), 126)
3348
3349 const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));
3350
3351 // Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa
3352 // and to move the fractional bits of the resulting non-NaN mantissa down to
3353 // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN
3354 // value
3355 const auto rounded_val = Add(v, round_incr);
3356
3357 // rounded_val_bits is the bits of rounded_val as a U32
3358 const auto rounded_val_bits = BitCast(du32, rounded_val);
3359
3360 // rounded_val[i] is known to have the same biased exponent as round_incr[i]
3361 // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
3362 // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
3363 // is either a power of 2 that is greater than or equal to 2^-1 or infinity.
3364
3365 // If rounded_val[i] is a finite F32 value, then
3366 // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
3367 // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
3368 // in the range [0, 2].
3369
3370 // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800,
3371 // with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
3372 // resulting F16 mantissa, if rounded_val[i] is a finite F32 value.
3373
3374 // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
3375 // rounded_val[i] is a non-NaN value
3376
3377 // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
3378 // the biased exponent of round_incr[i] is at least 126 and as both v[i] and
3379 // round_incr[i] have the same sign bit
3380
3381 // The ULP of a F32 value with a biased exponent of 126 is equal to
3382 // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
3383 // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
3384 // -24)
3385
3386 // The biased exponent (before subtracting by 126) needs to be clamped to the
3387 // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
3388 // biased exponent of a F16.
3389
3390 // The biased exponent of the resulting F16 value is equal to
3391 // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
3392 // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
3393
3394#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
3395 auto f16_exp_bits =
3396 Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
3397 And(rounded_val_bits,
3398 Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
3399 Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)));
3400#else
3401 auto f16_exp_bits = ShiftLeft<10>(BitCast(
3402 du32,
3403 Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
3404 BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
3405 BitCast(du32_as_u8, Set(du32, uint32_t{157})))));
3406#endif
3407
3408 f16_exp_bits =
3409 Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));
3410
3411 const auto f16_unmasked_mant_bits =
3412 BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val))));
3413
3414 const auto f16_exp_mant_bits =
3415 OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
3416 Set(di32, int32_t{0x03FF}));
3417
3418 // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17
3419 // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow
3420 // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo
3421 // operation
3422 const auto f16_bits_as_i32 =
3423 OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
3424 Set(di32, static_cast<int32_t>(0xFFFF8000u)));
3425 return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
3426}
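
// Worked example (illustrative, not part of the library): for v[i] = 1.5f the
// biased F32 exponent is 127, so round_incr[i] has biased exponent
// min(max(127 + 13, 126), 255) = 140, i.e. round_incr[i] = 8192.0f. Then
// rounded_val[i] = 8193.5f, whose mantissa field is 0x600, so
//   (rounded_val_bits[i] & 0x3FF) == 0x200        // the F16 mantissa of 1.5
//   ((rounded_val_bits[i] >> 10) & 0xFF) == 1
// and the resulting F16 biased exponent is min(140 + 1, 157) - 126 == 15,
// which matches the F16 encoding of 1.5 (sign 0, exponent 15, mantissa 0x200).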
3427
3428#endif // HWY_NATIVE_F16C
3429
3430// ------------------------------ F64->F16 DemoteTo
3431#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
3432#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
3433#undef HWY_NATIVE_DEMOTE_F64_TO_F16
3434#else
3435#define HWY_NATIVE_DEMOTE_F64_TO_F16
3436#endif
3437
3438#if HWY_HAVE_FLOAT64
3439template <class D, HWY_IF_F16_D(D)>
3440HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3441 const Rebind<double, D> df64;
3442 const Rebind<uint64_t, D> du64;
3443 const Rebind<float, D> df32;
3444
3445 // The mantissa bits of v[i] are first rounded, using round-to-odd rounding,
3446 // to a F64 value that has the lower 29 bits zeroed out, which ensures that
3447 // the subsequent two-step demotion is correctly rounded to a F16.
3448
3449 const auto vf64_rounded = OrAnd(
3450 And(v,
3451 BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
3452 BitCast(df64, Add(BitCast(du64, v),
3453 Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
3454 BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));
3455
3456 return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
3457}
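
// Illustrative scalar sketch (not part of the library) of the round-to-odd
// step above, operating on the raw F64 bits of v[i]:
//
//   uint64_t bits;  // bit pattern of v[i] (memcpy)
//   const uint64_t kept = bits & 0xFFFFFFFFE0000000u;  // clear the low 29 bits
//   const uint64_t sticky =
//       (bits + 0x000000001FFFFFFFu) & 0x0000000020000000u;  // "odd"/sticky bit
//   const uint64_t rounded = kept | sticky;
//
// The rounded value has at most 24 significand bits, so the subsequent
// F64->F32->F16 demotion rounds to the same F16 value as a direct F64->F16
// rounding would.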
3458#endif // HWY_HAVE_FLOAT64
3459
3460#endif // HWY_NATIVE_DEMOTE_F64_TO_F16
3461
3462// ------------------------------ F16->F64 PromoteTo
3463#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
3464#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
3465#undef HWY_NATIVE_PROMOTE_F16_TO_F64
3466#else
3467#define HWY_NATIVE_PROMOTE_F16_TO_F64
3468#endif
3469
3470#if HWY_HAVE_FLOAT64
3471template <class D, HWY_IF_F64_D(D)>
3472HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
3473 return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
3474}
3475#endif // HWY_HAVE_FLOAT64
3476
3477#endif // HWY_NATIVE_PROMOTE_F16_TO_F64
3478
3479// ------------------------------ F32 to BF16 DemoteTo
3480#if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
3481#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
3482#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
3483#else
3484#define HWY_NATIVE_DEMOTE_F32_TO_BF16
3485#endif
3486
3487namespace detail {
3488
3489// Round a F32 value to the nearest BF16 value, with the result returned as the
3490// rounded F32 value bitcasted to an U32
3491
3492// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
3493// NaN F32 values from being converted to an infinity
3494template <class V, HWY_IF_F32(TFromV<V>)>
3495HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) {
3496 const DFromV<decltype(v)> d;
3497 const RebindToUnsigned<decltype(d)> du32;
3498
3499 const auto is_non_nan = Not(IsNaN(v));
3500 const auto bits32 = BitCast(du32, v);
3501
3502 const auto round_incr =
3503 Add(And(ShiftRight<16>(bits32), Set(du32, uint32_t{1})),
3504 Set(du32, uint32_t{0x7FFFu}));
3505 return MaskedAddOr(Or(bits32, Set(du32, uint32_t{0x00400000u})),
3506 RebindMask(du32, is_non_nan), bits32, round_incr);
3507}
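
// Illustrative scalar sketch (not part of the library) of the rounding above
// for one non-NaN lane:
//
//   uint32_t bits;  // bit pattern of v[i] (memcpy)
//   const uint32_t lsb = (bits >> 16) & 1u;  // low bit of the BF16 result
//   const uint32_t rounded = bits + (0x7FFFu + lsb);  // nearest, ties to even
//   // The BF16 result is the upper 16 bits of `rounded`.
//
// For NaN inputs, the Or with 0x00400000u above sets the quiet bit so that
// truncating the mantissa cannot turn a NaN into an infinity.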
3508
3509} // namespace detail
3510
3511template <class D, HWY_IF_BF16_D(D)>
3512HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
3513 const RebindToUnsigned<decltype(dbf16)> du16;
3514 const Twice<decltype(du16)> dt_u16;
3515
3516 const auto rounded_bits = BitCast(dt_u16, detail::RoundF32ForDemoteToBF16(v));
3517#if HWY_IS_LITTLE_ENDIAN
3518 return BitCast(
3519 dbf16, LowerHalf(du16, ConcatOdd(dt_u16, rounded_bits, rounded_bits)));
3520#else
3521 return BitCast(
3522 dbf16, LowerHalf(du16, ConcatEven(dt_u16, rounded_bits, rounded_bits)));
3523#endif
3524}
3525
3526template <class D, HWY_IF_BF16_D(D)>
3527HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
3528 VFromD<Repartition<float, D>> b) {
3529 const RebindToUnsigned<decltype(dbf16)> du16;
3530
3531 const auto rounded_a_bits32 =
3532 detail::RoundF32ForDemoteToBF16(a);
3533 const auto rounded_b_bits32 =
3534 detail::RoundF32ForDemoteToBF16(b);
3535#if HWY_IS_LITTLE_ENDIAN
3536 return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, rounded_b_bits32),
3537 BitCast(du16, rounded_a_bits32)));
3538#else
3539 return BitCast(dbf16, ConcatEven(du16, BitCast(du16, rounded_b_bits32),
3540 BitCast(du16, rounded_a_bits32)));
3541#endif
3542}
3543
3544template <class D, HWY_IF_BF16_D(D)>
3545HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
3546 VFromD<Repartition<float, D>> b) {
3547 const RebindToUnsigned<decltype(dbf16)> du16;
3548
3549#if HWY_IS_LITTLE_ENDIAN
3550 const auto a_in_odd = detail::RoundF32ForDemoteToBF16(a);
3551 const auto b_in_even = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
3552#else
3553 const auto a_in_odd = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(a));
3554 const auto b_in_even = detail::RoundF32ForDemoteToBF16(b);
3555#endif
3556
3557 return BitCast(dbf16,
3558 OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
3559}
3560
3561#endif // HWY_NATIVE_DEMOTE_F32_TO_BF16
3562
3563// ------------------------------ PromoteInRangeTo
3564#if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
3565 defined(HWY_TARGET_TOGGLE))
3566#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3567#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3568#else
3569#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3570#endif
3571
3572#if HWY_HAVE_INTEGER64
3573template <class D64, HWY_IF_UI64_D(D64)>
3574HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
3575 return PromoteTo(d64, v);
3576}
3577#endif
3578
3579#endif // HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3580
3581// ------------------------------ ConvertInRangeTo
3582#if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
3583#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3584#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3585#else
3586#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3587#endif
3588
3589template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
3590 HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
3591 (1 << 4) |
3592 (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
3594 return ConvertTo(di, v);
3595}
3596
3597#endif // HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3598
3599// ------------------------------ DemoteInRangeTo
3600#if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
3601 defined(HWY_TARGET_TOGGLE))
3602#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3603#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3604#else
3605#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3606#endif
3607
3608#if HWY_HAVE_FLOAT64
3609template <class D32, HWY_IF_UI32_D(D32)>
3610HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
3611 return DemoteTo(d32, v);
3612}
3613#endif
3614
3615#endif // HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3616
3617// ------------------------------ PromoteInRangeLowerTo/PromoteInRangeUpperTo
3618
3619template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3621 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3622 // because it cannot be deduced from D (could be either bf16 or f16).
3623 const Rebind<TFromV<V>, decltype(d)> dh;
3624 return PromoteInRangeTo(d, LowerHalf(dh, v));
3625}
3626
3627#if HWY_TARGET != HWY_SCALAR
3628template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3629HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) {
3630#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3631 ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
3632 !HWY_HAVE_FLOAT64))
3633 // On targets that provide target-specific implementations of F32->UI64
3634 // PromoteInRangeTo, promote the upper half of v using PromoteInRangeTo
3635
3636 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3637 // because it cannot be deduced from D (could be either bf16 or f16).
3638 const Rebind<TFromV<V>, decltype(d)> dh;
3639 return PromoteInRangeTo(d, UpperHalf(dh, v));
3640#else
3641 // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
3642 // around F32->UI64 PromoteTo, promote the upper half of v to TFromD<D> using
3643 // PromoteUpperTo
3644 return PromoteUpperTo(d, v);
3645#endif
3646}
3647#endif // HWY_TARGET != HWY_SCALAR
3648
3649// ------------------------------ PromoteInRangeEvenTo/PromoteInRangeOddTo
3650
3651template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3653#if HWY_TARGET == HWY_SCALAR
3654 return PromoteInRangeTo(d, v);
3655#elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3656 ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
3657 !HWY_HAVE_FLOAT64))
3658 // On targets that provide target-specific implementations of F32->UI64
3659 // PromoteInRangeTo, promote the even lanes of v using PromoteInRangeTo
3660
3661 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3662 // because it cannot be deduced from D (could be either bf16 or f16).
3663 const DFromV<decltype(v)> d_from;
3664 const Rebind<TFromV<V>, decltype(d)> dh;
3665 return PromoteInRangeTo(d, LowerHalf(dh, ConcatEven(d_from, v, v)));
3666#else
3667 // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
3668 // around F32->UI64 PromoteTo, promote the even lanes of v to TFromD<D> using
3669 // PromoteEvenTo
3670 return PromoteEvenTo(d, v);
3671#endif // HWY_TARGET == HWY_SCALAR
3672}
3673
3674#if HWY_TARGET != HWY_SCALAR
3675template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3676HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) {
3677#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3678 ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
3679 !HWY_HAVE_FLOAT64))
3680 // On targets that provide target-specific implementations of F32->UI64
3681 // PromoteInRangeTo, promote the odd lanes of v using PromoteInRangeTo
3682
3683 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3684 // because it cannot be deduced from D (could be either bf16 or f16).
3685 const DFromV<decltype(v)> d_from;
3686 const Rebind<TFromV<V>, decltype(d)> dh;
3687 return PromoteInRangeTo(d, LowerHalf(dh, ConcatOdd(d_from, v, v)));
3688#else
3689 // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
3690 // around F32->UI64 PromoteTo, promote the odd lanes of v to TFromD<D> using
3691 // PromoteOddTo
3692 return PromoteOddTo(d, v);
3693#endif
3694}
3695#endif // HWY_TARGET != HWY_SCALAR
3696
3697// ------------------------------ SumsOf2
3698
3699#if HWY_TARGET != HWY_SCALAR
3700namespace detail {
3701
3702template <class TypeTag, size_t kLaneSize, class V>
3703HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
3704 TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
3705 const DFromV<decltype(v)> d;
3706 const RepartitionToWide<decltype(d)> dw;
3707 return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
3708}
3709
3710} // namespace detail
3711
3712template <class V>
3713HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
3714 return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
3715 hwy::SizeTag<sizeof(TFromV<V>)>(), v);
3716}
3717#endif // HWY_TARGET != HWY_SCALAR
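
// Illustrative scalar semantics (not part of the library): for a u8 input
// vector, SumsOf2 returns a u16 vector in which each lane is the sum of one
// pair of adjacent source lanes, i.e.
//
//   out[i] = static_cast<uint16_t>(in[2 * i]) + static_cast<uint16_t>(in[2 * i + 1]);
//
// SumsOf4 below applies the same pairwise reduction twice, yielding sums of
// four adjacent lanes at four times the lane width.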
3718
3719// ------------------------------ SumsOf4
3720
3721namespace detail {
3722
3723template <class TypeTag, size_t kLaneSize, class V>
3724HWY_INLINE VFromD<RepartitionToWide<RepartitionToWide<DFromV<V>>>> SumsOf4(
3725 TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
3726 using hwy::HWY_NAMESPACE::SumsOf2;
3727 return SumsOf2(SumsOf2(v));
3728}
3729
3730} // namespace detail
3731
3732template <class V>
3733HWY_API VFromD<RepartitionToWide<RepartitionToWide<DFromV<V>>>> SumsOf4(V v) {
3734 return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
3735 hwy::SizeTag<sizeof(TFromV<V>)>(), v);
3736}
3737
3738// ------------------------------ OrderedTruncate2To
3739
3740#if HWY_IDE || \
3741 (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))
3742
3743#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3744#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3745#else
3746#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3747#endif
3748
3749// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
3750#if HWY_TARGET != HWY_SCALAR || HWY_IDE
3751template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
3752 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
3754HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
3755 return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
3756}
3757#endif // HWY_TARGET != HWY_SCALAR
3758#endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3759
3760// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
3761
3762#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
3763#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
3764#undef HWY_NATIVE_LEADING_ZERO_COUNT
3765#else
3766#define HWY_NATIVE_LEADING_ZERO_COUNT
3767#endif
3768
3769namespace detail {
3770
3771template <class D, HWY_IF_U32_D(D)>
3772HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
3773 const RebindToFloat<decltype(d)> df;
3774#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
3775 const RebindToSigned<decltype(d)> di;
3776 const Repartition<int16_t, decltype(d)> di16;
3777
3778 // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed
3779 // by an unsigned right shift of the uint32_t bit representation of the
3780 // floating point values by 23, followed by an int16_t Min
3781 // operation as we are only interested in the biased exponent that would
3782 // result from a uint32_t to float conversion.
3783
3784 // An int32_t to float vector conversion is also much more efficient on
3785 // SSE2/SSSE3/SSE4/AVX2 than a uint32_t to float vector conversion, as a
3786 // uint32_t to float vector conversion on SSE2/SSSE3/SSE4/AVX2 requires
3787 // multiple instructions, whereas an int32_t to float vector conversion can
3788 // be carried out using a single instruction on these targets.
3790
3791 const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v)));
3792 return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)),
3793 BitCast(di16, Set(d, 158))));
3794#else
3795 const auto f32_bits = BitCast(d, ConvertTo(df, v));
3796 return BitCast(d, ShiftRight<23>(f32_bits));
3797#endif
3798}
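
// Illustrative scalar sketch (not part of the library): for v > 0, the biased
// exponent of the float conversion is 127 + HighestSetBitIndex(v), provided
// the conversion does not round up to the next power of two:
//
//   const float f = static_cast<float>(v);
//   uint32_t f_bits;  // bit pattern of f (memcpy)
//   const uint32_t biased_exp = f_bits >> 23;
//
// v == 0 converts to 0.0f, whose biased exponent is 0; the callers below
// account for that case (e.g. LeadingZeroCount clamps its result to the lane
// width).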
3799
3800template <class V, HWY_IF_U32_D(DFromV<V>)>
3801HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) {
3802 // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but
3803 // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647.
3804 const DFromV<decltype(v)> d;
3805 const RebindToFloat<decltype(d)> df;
3806#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
3807 const RebindToSigned<decltype(d)> d_src;
3808#else
3809 const RebindToUnsigned<decltype(d)> d_src;
3810#endif
3811 const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v)));
3812 return ShiftRight<23>(f32_bits);
3813}
3814
3815template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
3817 const Rebind<uint32_t, decltype(d)> du32;
3818 const auto f32_biased_exp_as_u32 =
3820 return TruncateTo(d, f32_biased_exp_as_u32);
3821}
3822
3823#if HWY_TARGET != HWY_SCALAR
3824template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
3826 const Half<decltype(d)> dh;
3827 const Rebind<uint32_t, decltype(dh)> du32;
3828
3829 const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
3830 const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));
3831
3832 const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
3833 const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
3834#if HWY_TARGET <= HWY_SSE2
3835 const RebindToSigned<decltype(du32)> di32;
3836 const RebindToSigned<decltype(d)> di;
3837 return BitCast(d,
3838 OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32),
3839 BitCast(di32, hi_f32_biased_exp_as_u32)));
3840#else
3841 return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32,
3842 hi_f32_biased_exp_as_u32);
3843#endif
3844}
3845#endif // HWY_TARGET != HWY_SCALAR
3846
3847template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
3849 const Rebind<uint32_t, decltype(d)> du32;
3850 const auto f32_biased_exp_as_u32 =
3852 return U8FromU32(f32_biased_exp_as_u32);
3853}
3854
3855#if HWY_TARGET != HWY_SCALAR
3856template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
3859 const Half<decltype(d)> dh;
3860 const Rebind<uint32_t, decltype(dh)> du32;
3861 const Repartition<uint16_t, decltype(du32)> du16;
3862
3863 const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
3864 const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));
3865
3866 const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
3867 const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
3868
3869#if HWY_TARGET <= HWY_SSE2
3870 const RebindToSigned<decltype(du32)> di32;
3871 const RebindToSigned<decltype(du16)> di16;
3872 const auto f32_biased_exp_as_i16 =
3873 OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32),
3874 BitCast(di32, hi_f32_biased_exp_as_u32));
3875 return DemoteTo(d, f32_biased_exp_as_i16);
3876#else
3877 const auto f32_biased_exp_as_u16 = OrderedTruncate2To(
3878 du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32);
3879 return TruncateTo(d, f32_biased_exp_as_u16);
3880#endif
3881}
3882
3883template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)>
3885 const Half<decltype(d)> dh;
3886 const Half<decltype(dh)> dq;
3887 const Rebind<uint32_t, decltype(dq)> du32;
3888 const Repartition<uint16_t, decltype(du32)> du16;
3889
3890 const auto lo_half = LowerHalf(dh, v);
3891 const auto hi_half = UpperHalf(dh, v);
3892
3893 const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half));
3894 const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half));
3895 const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half));
3896 const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half));
3897
3898 const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0);
3899 const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1);
3900 const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2);
3901 const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3);
3902
3903#if HWY_TARGET <= HWY_SSE2
3904 const RebindToSigned<decltype(du32)> di32;
3905 const RebindToSigned<decltype(du16)> di16;
3906
3907 const auto lo_f32_biased_exp_as_i16 =
3908 OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0),
3909 BitCast(di32, f32_biased_exp_as_u32_q1));
3910 const auto hi_f32_biased_exp_as_i16 =
3911 OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2),
3912 BitCast(di32, f32_biased_exp_as_u32_q3));
3913 return OrderedDemote2To(d, lo_f32_biased_exp_as_i16,
3914 hi_f32_biased_exp_as_i16);
3915#else
3916 const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To(
3917 du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1);
3918 const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To(
3919 du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3);
3920 return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16,
3921 hi_f32_biased_exp_as_u16);
3922#endif
3923}
3924#endif // HWY_TARGET != HWY_SCALAR
3925
3926#if HWY_TARGET == HWY_SCALAR
3927template <class D>
3928using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>;
3929#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
3930template <class D>
3931using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>;
3932#else
3933template <class D>
3934using F32ExpLzcntMinMaxRepartition =
3935 Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>;
3936#endif
3937
3938template <class V>
3940
3941template <class V>
3943 const DFromV<decltype(v)> d;
3944 const F32ExpLzcntMinMaxRepartition<decltype(d)> d2;
3945 return BitCast(d2, v);
3946}
3947
3948template <class D, HWY_IF_U64_D(D)>
3950#if HWY_TARGET == HWY_SCALAR
3951 const uint64_t u64_val = GetLane(v);
3952 const float f32_val = static_cast<float>(u64_val);
3953 const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
3954 return Set(d, static_cast<uint64_t>(f32_bits >> 23));
3955#else
3956 const Repartition<uint32_t, decltype(d)> du32;
3957 const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v));
3958 const auto f32_biased_exp_adj =
3959 IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)),
3960 BitCast(du32, Set(d, 0x0000002000000000u)));
3961 const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj);
3962
3963 return ShiftRight<32>(BitCast(
3964 d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp),
3965 F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp)))));
3966#endif
3967}
3968
3969template <class V, HWY_IF_UNSIGNED_V(V)>
3971 const DFromV<decltype(v)> d;
3972 return UIntToF32BiasedExp(d, v);
3973}
3974
3975template <class V, HWY_IF_UNSIGNED_V(V),
3976 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
3978 return v;
3979}
3980
3981template <class V, HWY_IF_UNSIGNED_V(V),
3982 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
3984 // If v[i] >= 16777216 is true, make sure that the bit at
3985 // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact
3986 // conversion to single-precision floating point is rounded down.
3987
3988 // This zeroing-out can be accomplished through the AndNot operation below.
3989 return AndNot(ShiftRight<24>(v), v);
3990}
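
// Worked example (illustrative, not part of the library): for
// v[i] = 0x01FFFFFF (2^25 - 1), a direct conversion to float would round up to
// 2^25 and report a biased exponent of 152 instead of 151. Clearing the bit at
// HighestSetBitIndex(v[i]) - 24 = 0 yields 0x01FFFFFE, which converts exactly
// and preserves the biased exponent 151.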
3991
3992} // namespace detail
3993
3994template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3996 const DFromV<decltype(v)> d;
3997 const RebindToUnsigned<decltype(d)> du;
3998 using TU = TFromD<decltype(du)>;
3999
4000 const auto f32_biased_exp = detail::UIntToF32BiasedExp(
4002 return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127})));
4003}
4004
4005template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4007 const DFromV<decltype(v)> d;
4008 const RebindToUnsigned<decltype(d)> du;
4009 using TU = TFromD<decltype(du)>;
4010
4011 constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
4012 const auto f32_biased_exp = detail::UIntToF32BiasedExp(
4014 const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp);
4015
4016 return BitCast(d,
4018 detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
4019}
4020
4021template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4023 const DFromV<decltype(v)> d;
4024 const RebindToUnsigned<decltype(d)> du;
4025 const RebindToSigned<decltype(d)> di;
4026 using TU = TFromD<decltype(du)>;
4027
4028 const auto vi = BitCast(di, v);
4029 const auto lowest_bit = BitCast(du, And(vi, Neg(vi)));
4030
4031 constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
4032 const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit);
4033 const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127}));
4034
4035 return BitCast(d,
4037 detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
4038}
4039#endif // HWY_NATIVE_LEADING_ZERO_COUNT
4040
4041// ------------------------------ AESRound
4042
4043// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
4044#if HWY_TARGET != HWY_SCALAR || HWY_IDE
4045
4046// Define for white-box testing, even if native instructions are available.
4047namespace detail {
4048
4049// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
4050// Vector Permute Instructions" and the accompanying assembly language
4051// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
4052// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
4053//
4054// A brute-force 256 byte table lookup can also be made constant-time, and
4055// possibly competitive on NEON, but this is more performance-portable
4056// especially for x86 and large vectors.
4057
4058template <class V> // u8
4059HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
4060 V affine_tblU) {
4061 const DFromV<V> du;
4062 const auto mask = Set(du, uint8_t{0xF});
4063
4064 // Change polynomial basis to GF(2^4)
4065 {
4066 const VFromD<decltype(du)> basisL =
4067 Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
4068 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
4069 const VFromD<decltype(du)> basisU =
4070 Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
4071 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
4072 const auto sL = And(state, mask);
4073 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
4074 const auto gf4L = TableLookupBytes(basisL, sL);
4075 const auto gf4U = TableLookupBytes(basisU, sU);
4076 state = Xor(gf4L, gf4U);
4077 }
4078
4079 // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
4080 // cause TableLookupBytesOr0 to return 0.
4081 const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
4082 du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
4083 const VFromD<decltype(du)> tbl = Dup128VecFromValues(
4084 du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
4085 const auto sL = And(state, mask); // L=low nibble, U=upper
4086 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
4087 const auto sX = Xor(sU, sL);
4088 const auto invL = TableLookupBytes(zetaInv, sL);
4089 const auto invU = TableLookupBytes(tbl, sU);
4090 const auto invX = TableLookupBytes(tbl, sX);
4091 const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
4092 const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
4093
4094 const auto affL = TableLookupBytesOr0(affine_tblL, outL);
4095 const auto affU = TableLookupBytesOr0(affine_tblU, outU);
4096 return Xor(affL, affU);
4097}
4098
4099template <class V> // u8
4100HWY_INLINE V SubBytes(V state) {
4101 const DFromV<V> du;
4102 // Linear skew (cannot bake 0x63 bias into the table because out* indices
4103 // may have the infinity flag set).
4104 const VFromD<decltype(du)> affineL =
4105 Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
4106 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
4107 const VFromD<decltype(du)> affineU =
4108 Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
4109 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
4110 return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
4111 Set(du, uint8_t{0x63}));
4112}
4113
4114template <class V> // u8
4115HWY_INLINE V InvSubBytes(V state) {
4116 const DFromV<V> du;
4117 const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
4118 Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
4119 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
4120 const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
4121 Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
4122 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);
4123
4124 // Apply the inverse affine transformation
4125 const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
4126 Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
4127 Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
4128 Set(du, uint8_t{0x05}));
4129
4130 // The GF(2^8) multiplicative inverse is computed as follows:
4131 // - Changing the polynomial basis to GF(2^4)
4132 // - Computing the GF(2^4) multiplicative inverse
4133 // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
4134 // multiplicative inverse through table lookups using the
4135 // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
4136 return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
4137 gF2P4InvToGF2P8InvU);
4138}
4139
4140} // namespace detail
4141
4142#endif // HWY_TARGET != HWY_SCALAR
4143
4144#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
4145#ifdef HWY_NATIVE_AES
4146#undef HWY_NATIVE_AES
4147#else
4148#define HWY_NATIVE_AES
4149#endif
4150
4151// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
4152#if HWY_TARGET != HWY_SCALAR
4153
4154namespace detail {
4155
4156template <class V> // u8
4157HWY_INLINE V ShiftRows(const V state) {
4158 const DFromV<V> du;
4159 // transposed: state is column major
4160 const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
4161 du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
4162 return TableLookupBytes(state, shift_row);
4163}
4164
4165template <class V> // u8
4166HWY_INLINE V InvShiftRows(const V state) {
4167 const DFromV<V> du;
4168 // transposed: state is column major
4169 const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
4170 du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
4171 return TableLookupBytes(state, shift_row);
4172}
4173
4174template <class V> // u8
4175HWY_INLINE V GF2P8Mod11BMulBy2(V v) {
4176 const DFromV<V> du;
4177 const RebindToSigned<decltype(du)> di; // can only do signed comparisons
4178 const auto msb = Lt(BitCast(di, v), Zero(di));
4179 const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
4180 return Xor(Add(v, v), overflow); // = v*2 in GF(2^8).
4181}
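
// Illustrative scalar sketch (not part of the library): the per-byte operation
// above is the classic AES "xtime", i.e. multiplication by 2 in GF(2^8)
// modulo the polynomial 0x11B:
//
//   uint8_t xtime(uint8_t x) {  // hypothetical helper
//     return static_cast<uint8_t>((x << 1) ^ ((x >> 7) ? 0x1B : 0x00));
//   }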
4182
4183template <class V> // u8
4184HWY_INLINE V MixColumns(const V state) {
4185 const DFromV<V> du;
4186 // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
4187 // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
4188 // 1 2 3 1 // d are on diagonal, no permutation needed.
4189 // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
4190 // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
4191 const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
4192 du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
4193 const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
4194 du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
4195 const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8).
4196 const auto s2301 = TableLookupBytes(state, v2301);
4197 const auto d_s2301 = Xor(d, s2301);
4198 const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
4199 const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
4200 return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
4201}
4202
4203template <class V> // u8
4204HWY_INLINE V InvMixColumns(const V state) {
4205 const DFromV<V> du;
4206 // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
4207 // 14 11 13 9
4208 // 9 14 11 13
4209 // 13 9 14 11
4210 // 11 13 9 14
4211 const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
4212 du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
4213 const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
4214 du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
4215
4216 const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
4217 const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */
4218 const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */
4219 const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */
4220 const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */
4221 const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */
4222 const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */
4223
4224 const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
4225 const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
4226 const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
4227 return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
4228}
4229
4230} // namespace detail
4231
4232template <class V> // u8
4233HWY_API V AESRound(V state, const V round_key) {
4234 // Intel docs swap the first two steps, but it does not matter because
4235 // ShiftRows is a permutation and SubBytes is independent of lane index.
4236 state = detail::SubBytes(state);
4237 state = detail::ShiftRows(state);
4238 state = detail::MixColumns(state);
4239 state = Xor(state, round_key); // AddRoundKey
4240 return state;
4241}
4242
4243template <class V> // u8
4244HWY_API V AESLastRound(V state, const V round_key) {
4245 // Like AESRound, but without MixColumns.
4246 state = detail::SubBytes(state);
4247 state = detail::ShiftRows(state);
4248 state = Xor(state, round_key); // AddRoundKey
4249 return state;
4250}
4251
4252template <class V>
4253HWY_API V AESInvMixColumns(V state) {
4254 return detail::InvMixColumns(state);
4255}
4256
4257template <class V> // u8
4258HWY_API V AESRoundInv(V state, const V round_key) {
4259 state = detail::InvSubBytes(state);
4260 state = detail::InvShiftRows(state);
4261 state = detail::InvMixColumns(state);
4262 state = Xor(state, round_key); // AddRoundKey
4263 return state;
4264}
4265
4266template <class V> // u8
4267HWY_API V AESLastRoundInv(V state, const V round_key) {
4268 // Like AESRoundInv, but without InvMixColumns.
4269 state = detail::InvSubBytes(state);
4270 state = detail::InvShiftRows(state);
4271 state = Xor(state, round_key); // AddRoundKey
4272 return state;
4273}
4274
4275template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
4276HWY_API V AESKeyGenAssist(V v) {
4277 const DFromV<decltype(v)> d;
4278 const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
4279 0, 0, kRcon, 0, 0, 0);
4280 const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
4281 13, 14, 15, 13, 14, 15, 12);
4282 const auto sub_word_result = detail::SubBytes(v);
4283 const auto rot_word_result =
4284 TableLookupBytes(sub_word_result, rotWordShuffle);
4285 return Xor(rot_word_result, rconXorMask);
4286}
4287
4288// Constant-time implementation inspired by
4289// https://www.bearssl.org/constanttime.html, but about half the cost because we
4290// use 64x64 multiplies and 128-bit XORs.
4291template <class V>
4292HWY_API V CLMulLower(V a, V b) {
4293 const DFromV<V> d;
4294 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
4295 const auto k1 = Set(d, 0x1111111111111111ULL);
4296 const auto k2 = Set(d, 0x2222222222222222ULL);
4297 const auto k4 = Set(d, 0x4444444444444444ULL);
4298 const auto k8 = Set(d, 0x8888888888888888ULL);
4299 const auto a0 = And(a, k1);
4300 const auto a1 = And(a, k2);
4301 const auto a2 = And(a, k4);
4302 const auto a3 = And(a, k8);
4303 const auto b0 = And(b, k1);
4304 const auto b1 = And(b, k2);
4305 const auto b2 = And(b, k4);
4306 const auto b3 = And(b, k8);
4307
4308 auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
4309 auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
4310 auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
4311 auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
4312 m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
4313 m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
4314 m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
4315 m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
4316 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
4317}
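
// Illustrative scalar sketch (not part of the library): per 128-bit block, the
// bit-sliced code above produces the full 128-bit carryless (XOR) product of
// the lower 64-bit lanes of a and b, i.e. the same result as the hypothetical
// helper below:
//
//   void CLMul64(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi) {
//     lo = hi = 0;
//     for (int i = 0; i < 64; ++i) {
//       if ((b >> i) & 1u) {
//         lo ^= a << i;
//         if (i != 0) hi ^= a >> (64 - i);
//       }
//     }
//   }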
4318
4319template <class V>
4320HWY_API V CLMulUpper(V a, V b) {
4321 const DFromV<V> d;
4322 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
4323 const auto k1 = Set(d, 0x1111111111111111ULL);
4324 const auto k2 = Set(d, 0x2222222222222222ULL);
4325 const auto k4 = Set(d, 0x4444444444444444ULL);
4326 const auto k8 = Set(d, 0x8888888888888888ULL);
4327 const auto a0 = And(a, k1);
4328 const auto a1 = And(a, k2);
4329 const auto a2 = And(a, k4);
4330 const auto a3 = And(a, k8);
4331 const auto b0 = And(b, k1);
4332 const auto b1 = And(b, k2);
4333 const auto b2 = And(b, k4);
4334 const auto b3 = And(b, k8);
4335
4336 auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
4337 auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
4338 auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
4339 auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
4340 m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
4341 m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
4342 m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
4343 m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
4344 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
4345}
4346
4347#endif // HWY_NATIVE_AES
4348#endif // HWY_TARGET != HWY_SCALAR
4349
4350// ------------------------------ PopulationCount
4351
4352#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
4353#ifdef HWY_NATIVE_POPCNT
4354#undef HWY_NATIVE_POPCNT
4355#else
4356#define HWY_NATIVE_POPCNT
4357#endif
4358
4359// This overload requires vectors to be at least 16 bytes, which is the case
4360// for LMUL >= 2.
4361#undef HWY_IF_POPCNT
4362#if HWY_TARGET == HWY_RVV
4363#define HWY_IF_POPCNT(D) \
4364 hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
4365#else
4366// Other targets only have these two overloads which are mutually exclusive, so
4367// no further conditions are required.
4368#define HWY_IF_POPCNT(D) void* = nullptr
4369#endif // HWY_TARGET == HWY_RVV
4370
4371template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
4372 HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
4374 const D d;
4375 const V lookup =
4376 Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
4377 const auto lo = And(v, Set(d, uint8_t{0xF}));
4378 const auto hi = ShiftRight<4>(v);
4379 return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
4380}
4381
4382// RVV has a specialization that avoids the Set().
4383#if HWY_TARGET != HWY_RVV
4384// Slower fallback for capped vectors.
4385template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
4386 HWY_IF_V_SIZE_LE_D(D, 8)>
4387HWY_API V PopulationCount(V v) {
4388 const D d;
4389 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
4390 const V k33 = Set(d, uint8_t{0x33});
4391 v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
4392 v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
4393 return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
4394}
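
// Illustrative scalar sketch (not part of the library): the same bit-slicing
// steps applied to a single byte, where each step widens the per-field counts:
//
//   uint8_t Popcnt8(uint8_t v) {  // hypothetical helper
//     v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));           // 2-bit sums
//     v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));   // 4-bit sums
//     return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);         // 8-bit sum
//   }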
4395#endif // HWY_TARGET != HWY_RVV
4396
4397template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
4399 const D d;
4400 const Repartition<uint8_t, decltype(d)> d8;
4401 const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
4402 return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
4403}
4404
4405template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
4406HWY_API V PopulationCount(V v) {
4407 const D d;
4408 Repartition<uint16_t, decltype(d)> d16;
4409 auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
4410 return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
4411}
4412
4413#if HWY_HAVE_INTEGER64
4414template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
4415HWY_API V PopulationCount(V v) {
4416 const D d;
4417 Repartition<uint32_t, decltype(d)> d32;
4418 auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
4419 return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
4420}
4421#endif
4422
4423#endif // HWY_NATIVE_POPCNT
4424
4425// ------------------------------ 8-bit multiplication
4426
4427#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
4428#ifdef HWY_NATIVE_MUL_8
4429#undef HWY_NATIVE_MUL_8
4430#else
4431#define HWY_NATIVE_MUL_8
4432#endif
4433
4434// 8 bit and fits in wider reg: promote
4435template <class V, HWY_IF_T_SIZE_V(V, 1),
4437HWY_API V operator*(const V a, const V b) {
4438 const DFromV<decltype(a)> d;
4439 const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
4440 const RebindToUnsigned<decltype(d)> du; // TruncateTo result
4441 const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
4442 const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
4443 // TruncateTo is cheaper than ConcatEven.
4444 return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
4445}
4446
4447// 8 bit full reg: promote halves
4448template <class V, HWY_IF_T_SIZE_V(V, 1),
4450HWY_API V operator*(const V a, const V b) {
4451 const DFromV<decltype(a)> d;
4452 const Half<decltype(d)> dh;
4453 const Twice<RepartitionToWide<decltype(dh)>> dw;
4454 const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
4455 const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
4456 const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
4457 const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
4458 const VFromD<decltype(dw)> m0 = a0 * b0;
4459 const VFromD<decltype(dw)> m1 = a1 * b1;
4460 return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
4461}
4462
4463#endif // HWY_NATIVE_MUL_8
4464
4465// ------------------------------ 64-bit multiplication
4466
4467#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
4468#ifdef HWY_NATIVE_MUL_64
4469#undef HWY_NATIVE_MUL_64
4470#else
4471#define HWY_NATIVE_MUL_64
4472#endif
4473
4474// Single-lane i64 or u64
4475template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
4477HWY_API V operator*(V x, V y) {
4478 const DFromV<V> d;
4479 using T = TFromD<decltype(d)>;
4480 using TU = MakeUnsigned<T>;
4481 const TU xu = static_cast<TU>(GetLane(x));
4482 const TU yu = static_cast<TU>(GetLane(y));
4483 return Set(d, static_cast<T>(xu * yu));
4484}
4485
4486template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
4487 HWY_IF_V_SIZE_GT_D(D64, 8)>
4488HWY_API V operator*(V x, V y) {
4489 RepartitionToNarrow<D64> d32;
4490 auto x32 = BitCast(d32, x);
4491 auto y32 = BitCast(d32, y);
4492 auto lolo = BitCast(d32, MulEven(x32, y32));
4493 auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
4494 auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
4495 auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
4496 return BitCast(D64{}, lolo + hi);
4497}
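
// Illustrative scalar sketch (not part of the library): the MulEven/shift
// decomposition above is the schoolbook low-half product built from 32-bit
// halves:
//
//   uint64_t MulLow64(uint64_t x, uint64_t y) {  // hypothetical helper
//     const uint64_t xl = x & 0xFFFFFFFFu, xh = x >> 32;
//     const uint64_t yl = y & 0xFFFFFFFFu, yh = y >> 32;
//     return xl * yl + ((xl * yh + xh * yl) << 32);  // mod 2^64
//   }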
4498template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
4499 HWY_IF_V_SIZE_GT_D(DI64, 8)>
4500HWY_API V operator*(V x, V y) {
4501 RebindToUnsigned<DI64> du64;
4502 return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
4503}
4504
4505#endif // HWY_NATIVE_MUL_64
4506
4507// ------------------------------ MulAdd / NegMulAdd
4508
4509#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
4510#ifdef HWY_NATIVE_INT_FMA
4511#undef HWY_NATIVE_INT_FMA
4512#else
4513#define HWY_NATIVE_INT_FMA
4514#endif
4515
4516#ifdef HWY_NATIVE_INT_FMSUB
4517#undef HWY_NATIVE_INT_FMSUB
4518#else
4519#define HWY_NATIVE_INT_FMSUB
4520#endif
4521
4522template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4523HWY_API V MulAdd(V mul, V x, V add) {
4524 return Add(Mul(mul, x), add);
4525}
4526
4527template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4528HWY_API V NegMulAdd(V mul, V x, V add) {
4529 return Sub(add, Mul(mul, x));
4530}
4531
4532template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4533HWY_API V MulSub(V mul, V x, V sub) {
4534 return Sub(Mul(mul, x), sub);
4535}
4536#endif // HWY_NATIVE_INT_FMA
4537
4538// ------------------------------ Integer MulSub / NegMulSub
4539#if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
4540#ifdef HWY_NATIVE_INT_FMSUB
4541#undef HWY_NATIVE_INT_FMSUB
4542#else
4543#define HWY_NATIVE_INT_FMSUB
4544#endif
4545
4546template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4547HWY_API V MulSub(V mul, V x, V sub) {
4548 const DFromV<decltype(mul)> d;
4549 const RebindToSigned<decltype(d)> di;
4550 return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
4551}
4552
4553#endif // HWY_NATIVE_INT_FMSUB
4554
4555template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4556HWY_API V NegMulSub(V mul, V x, V sub) {
4557 const DFromV<decltype(mul)> d;
4558 const RebindToSigned<decltype(d)> di;
4559
4560 return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
4561}
4562
4563// ------------------------------ MulAddSub
4564
4565// MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
4566// MulSub(mul, x, sub_or_add)
4567template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
4568HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4569 return MulSub(mul, x, sub_or_add);
4570}
4571
4572// MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
4573// SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
4574// x86_512-inl.h
4575
4576// MulAddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
4577
4578// MulAddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
4579template <class V, HWY_IF_MULADDSUB_V(V)>
4580HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4581 using D = DFromV<V>;
4582 using T = TFromD<D>;
4583 using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
4584
4585 const D d;
4586 const Rebind<TNegate, D> d_negate;
4587
4588 const auto add =
4589 OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
4590 return MulAdd(mul, x, add);
4591}
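
// Illustrative scalar semantics (not part of the library): for vectors with
// more than one lane, the OddEven negation above means
//
//   result[i] = mul[i] * x[i] + ((i & 1) ? sub_or_add[i] : -sub_or_add[i]);
//
// i.e. even-indexed lanes subtract and odd-indexed lanes add.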
4592
4593// ------------------------------ Integer division
4594#if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
4595#ifdef HWY_NATIVE_INT_DIV
4596#undef HWY_NATIVE_INT_DIV
4597#else
4598#define HWY_NATIVE_INT_DIV
4599#endif
4600
4601namespace detail {
4602
4603// DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo are okay to use in
4604// the implementation of detail::IntDiv in generic_ops-inl.h as the current
4605// implementations of DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo
4606// will convert values that are outside of the range of TFromD<DI> by either
4607// saturation, truncation, or converting values that are outside of the
4608// destination range to LimitsMin<TFromD<DI>>() (which is equal to
4609// static_cast<TFromD<DI>>(LimitsMax<TFromD<DI>>() + 1))
4610
4611template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4613 return ConvertInRangeTo(di, vf);
4614}
4615
4616template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4618 return ConvertTo(df, vi);
4619}
4620
4621#if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4622template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
4624 return PromoteInRangeTo(df, vi);
4625}
4626
4627// If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
4628// IntDivConvIntToFloat(df, vi) returns an approximation of
4629// static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
4630template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
4631HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
4632 const Twice<decltype(df32)> dt_f32;
4633
4634 auto vf32 =
4635 ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
4636
4637#if HWY_IS_LITTLE_ENDIAN
4638 const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4639 auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4640#else
4641 const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4642 auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4643#endif
4644
4645 const RebindToSigned<decltype(df32)> di32;
4646
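  // lo_f32 was converted from the lower 32 bits interpreted as a signed int32;
  // if its sign bit is set, the true (unsigned) lower half is lo + 2^32, so
  // compensate by adding 1 to hi_f32 before reconstructing hi*2^32 + lo below.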
4647 hi_f32 =
4648 Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
4649 Set(df32, 1.0f)));
4650 return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
4651}
4652
4653template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
4654HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
4655 const Twice<decltype(df32)> dt_f32;
4656
4657 auto vf32 =
4658 ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));
4659
4660#if HWY_IS_LITTLE_ENDIAN
4661 const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4662 const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4663#else
4664 const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4665 const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4666#endif
4667
4668 return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
4669}
4670#endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4671
4672template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4673 HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
4674HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4675 const DFromV<decltype(a)> d;
4676 const RebindToFloat<decltype(d)> df;
4677
4678 // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
4679 // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
4680 // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.
4681
4682 // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
4683 // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
4684 // mantissa bits (including the implied one bit), where flt_q is equal to
4685 // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
4686 // even in the case where the magnitude of an inexact floating point division
4687 // result is rounded up.
4688
4689 // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true
4690 // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
4691 // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
4692 // the case where the magnitude of an inexact floating point division result
4693 // is rounded up.
4694
4695 // It is okay to do conversions from MakeFloat<TFromV<V>> to TFromV<V> using
4696 // ConvertInRangeTo if sizeof(TFromV<V>) > kOrigLaneSize as the result of the
4697 // floating point division is always greater than LimitsMin<TFromV<V>>() and
4698 // less than LimitsMax<TFromV<V>>() if sizeof(TFromV<V>) > kOrigLaneSize and
4699 // b[i] != 0.
4700
4701#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4702 !HWY_HAVE_FLOAT64
4703 // On Armv7, carry out the division by multiplying by ApproximateReciprocal;
4704 // this avoids unnecessary overhead because Armv7 F32 Div refines the
4705 // approximate reciprocal using 4 Newton-Raphson iterations.
4706
4707 const RebindToSigned<decltype(d)> di;
4708 const RebindToUnsigned<decltype(d)> du;
4709
4710 const auto flt_b = ConvertTo(df, b);
4711 auto flt_recip_b = ApproximateReciprocal(flt_b);
4712 if (kOrigLaneSize > 1) {
4713 flt_recip_b =
4714 Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4715 }
4716
4717 auto q0 = ConvertInRangeTo(d, Mul(ConvertTo(df, a), flt_recip_b));
4718 const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4719
4720 auto r1 = r0;
4721
4722 // Need to negate r1[i] if a[i] < 0 is true
4723 if (IsSigned<TFromV<V>>()) {
4724 r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
4725 }
4726
4727 // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]
4728
4729 auto abs_b = BitCast(du, b);
4730 if (IsSigned<TFromV<V>>()) {
4731 abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
4732 }
4733
4734 // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
4735 // Otherwise, set q1[i] to 0.
4736
4737 // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned
4738 // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
4739 // will be true if r1[i] < 0 is true.
4740 auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));
4741
4742 // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0
4743
4744 // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
4745 auto q1_negate_mask = r0;
4746 if (IsSigned<TFromV<V>>()) {
4747 q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
4748 }
4749 q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);
4750
4751 // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
4752 // (((r0[i] ^ b[i]) < 0) ? 1 : -1) : 0
4753
4754 // Need to subtract q1[i] from q0[i] to get the final result
4755 return Sub(q0, BitCast(d, q1));
4756#else
4757 // On targets other than Armv7 NEON, use F16 or F32 division as most targets
4758 // other than Armv7 NEON have native F32 divide instructions
4759 return ConvertInRangeTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
4760#endif
4761}
4762
4763template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4764 HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
4765 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
4766HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4767 // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
4768 // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
4769 // than kOrigLaneSize*8 + 1 bits
4770
4771 using T = TFromV<V>;
4772
4773#if HWY_HAVE_FLOAT64
4774 using TF = MakeFloat<T>;
4775#else
4776 using TF = float;
4777#endif
4778
4779 const DFromV<decltype(a)> d;
4780 const RebindToSigned<decltype(d)> di;
4781 const RebindToUnsigned<decltype(d)> du;
4782 const Rebind<TF, decltype(d)> df;
4783
4784 if (!IsSigned<T>()) {
4785 // If T is unsigned, set a[i] to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1 if
4786 // b[i] > LimitsMax<MakeSigned<T>>() is true
4787
4788 const auto one = Set(di, MakeSigned<T>{1});
4789 a = BitCast(
4790 d, IfNegativeThenElse(BitCast(di, b),
4791 IfThenElseZero(RebindMask(di, Ge(a, b)), one),
4792 BitCast(di, a)));
4793 b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
4794 }
4795
4796 // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true
4797
4798 const auto flt_b = IntDivConvIntToFloat(df, b);
4799
4800#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4801 !HWY_HAVE_FLOAT64
4802 auto flt_recip_b = ApproximateReciprocal(flt_b);
4803 flt_recip_b =
4804 Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4805#else
4806 const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
4807#endif
4808
4809 // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
4810 // IntDivConvFloatToInt returns incorrect results in any lanes where b[i] == 0
4811 // as the result of IntDivUsingFloatDiv(a, b) is implementation-defined in any
4812 // lanes where b[i] == 0.
4813
4814 // If ScalarAbs(b[i]) == 1 is true, then it is possible for
4815 // a[i] * flt_recip_b[i] to be rounded up to a value that is outside of the
4816 // range of T. If a[i] * flt_recip_b[i] is outside of the range of T,
4817 // IntDivConvFloatToInt will convert any values that are out of the range of T
4818 // by either saturation, truncation, or wrapping around to LimitsMin<T>().
4819
4820 // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
4821 // IntDivConvFloatToInt wraps around if ScalarAbs(b[i]) == 1 as r0 will have
4822 // the correct sign if ScalarAbs(b[i]) == 1, even in the cases where the
4823 // conversion of a[i] * flt_recip_b[i] to T using IntDivConvFloatToInt is
4824 // truncated or wraps around.
4825
4826 // If ScalarAbs(b[i]) >= 2 is true, a[i] * flt_recip_b[i] will be within the
4827 // range of T, even in the cases where the conversion of a[i] to TF is
4828 // rounded up or the result of multiplying a[i] by flt_recip_b[i] is rounded
4829 // up.
4830
4831 // ScalarAbs(r0[i]) will also always be less than (LimitsMax<T>() / 2) if
4832 // b[i] != 0, even in the cases where the conversion of a[i] * flt_recip_b[i]
4833 // to T using IntDivConvFloatToInt is truncated or is wrapped around.
4834
4835 auto q0 =
4836 IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
4837 const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4838
4839 // If b[i] != 0 is true, r0[i] * flt_recip_b[i] is always within the range of
4840 // T, even in the cases where the conversion of r0[i] to TF is rounded up or
4841 // the multiplication of r0[i] by flt_recip_b[i] is rounded up.
4842
4843 auto q1 =
4844 IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
4845 const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
4846
4847 auto r3 = r1;
4848
4849#if !HWY_HAVE_FLOAT64
4850 // Need two additional reciprocal multiplication steps for I64/U64 vectors if
4851 // HWY_HAVE_FLOAT64 is 0
4852 if (sizeof(T) == 8) {
4853 const auto q2 = IntDivConvFloatToInt(
4854 di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
4855 const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);
4856
4857 const auto q3 = IntDivConvFloatToInt(
4858 di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
4859 r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);
4860
4861 q0 = Add(q0, BitCast(d, q2));
4862 q1 = Add(q1, q3);
4863 }
4864#endif // !HWY_HAVE_FLOAT64
4865
4866 auto r4 = r3;
4867
4868 // Need to negate r4[i] if a[i] < 0 is true
4869 if (IsSigned<TFromV<V>>()) {
4870 r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
4871 }
4872
4873 // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
4874
4875 auto abs_b = BitCast(du, b);
4876 if (IsSigned<TFromV<V>>()) {
4877 abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
4878 }
4879
4880 // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
4881 // Otherwise, set q4[i] to 0.
4882
4883 // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
4884 // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
4885 // will be true if r4[i] < 0 is true.
4886 auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
4887
4888 // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
4889
4890 // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
4891 auto q4_negate_mask = r3;
4892 if (IsSigned<TFromV<V>>()) {
4893 q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
4894 }
4895 q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
4896
4897 // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
4898 // (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0
4899
4900 // The final result is equal to q0[i] + q1[i] - q4[i]
4901 return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
4902}
4903
4904template <size_t kOrigLaneSize, class V,
4905 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
4906 HWY_IF_V_SIZE_LE_V(
4907 V, HWY_MAX_BYTES /
4908 ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
4909HWY_INLINE V IntDiv(V a, V b) {
4910 using T = TFromV<V>;
4911
4912 // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
4913 using TW = MakeWide<
4914 If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;
4915
4916 const DFromV<decltype(a)> d;
4917 const Rebind<TW, decltype(d)> dw;
4918
4919#if HWY_TARGET <= HWY_SSE2
4920 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
4921 // unnecessary overhead
4922 const RebindToSigned<decltype(dw)> dw_i;
4923
4924 // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
4925 // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
4926 const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
4927 decltype(d)>
4928 d_demote_to;
4929#else
4930 // On other targets, promote to TW and demote to T
4931 const decltype(dw) dw_i;
4932 const decltype(d) d_demote_to;
4933#endif
4934
4935 return BitCast(
4936 d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
4937 PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
4938}
4939
4940template <size_t kOrigLaneSize, class V,
4941 HWY_IF_T_SIZE_ONE_OF_V(V,
4942 (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
4943 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4944HWY_INLINE V IntDiv(V a, V b) {
4945 const DFromV<decltype(a)> d;
4946 const RepartitionToWide<decltype(d)> dw;
4947
4948#if HWY_TARGET <= HWY_SSE2
4949 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
4950 // unnecessary overhead
4951 const RebindToSigned<decltype(dw)> dw_i;
4952
4953 // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
4954 // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
4955 const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
4956 decltype(d)>
4957 d_demote_to;
4958#else
4959 // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
4960 const decltype(dw) dw_i;
4961 const decltype(d) d_demote_to;
4962#endif
4963
4964 return BitCast(d, OrderedDemote2To(
4965 d_demote_to,
4966 IntDivUsingFloatDiv<kOrigLaneSize>(
4967 PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
4968 IntDivUsingFloatDiv<kOrigLaneSize>(
4969 PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
4970}
4971
4972#if !HWY_HAVE_FLOAT16
4973template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4974 HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
4975HWY_INLINE V IntDiv(V a, V b) {
4976 const DFromV<decltype(a)> d;
4977 const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
4978
4979#if HWY_TARGET <= HWY_SSE2
4980 // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
4981 // overhead
4982 const RebindToSigned<decltype(dw)> dw_i;
4983#else
4984 // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
4985 const decltype(dw) dw_i;
4986#endif
4987
4988 return DemoteTo(d,
4989 BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
4990}
4991template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4992 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4993HWY_INLINE V IntDiv(V a, V b) {
4994 const DFromV<decltype(a)> d;
4995 const RepartitionToWide<decltype(d)> dw;
4996
4997#if HWY_TARGET <= HWY_SSE2
4998 // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
4999 // overhead
5000 const RebindToSigned<decltype(dw)> dw_i;
5001#else
5002 // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
5003 const decltype(dw) dw_i;
5004#endif
5005
5006 return OrderedDemote2To(
5007 d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
5008 BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
5009}
5010#endif // !HWY_HAVE_FLOAT16
5011
5012template <size_t kOrigLaneSize, class V,
5013 HWY_IF_T_SIZE_ONE_OF_V(V,
5014 (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
5015HWY_INLINE V IntDiv(V a, V b) {
5016 return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
5017}
5018
5019#if HWY_HAVE_FLOAT64
5020template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
5021 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
5022HWY_INLINE V IntDiv(V a, V b) {
5023 const DFromV<decltype(a)> d;
5024 const Rebind<double, decltype(d)> df64;
5025
5026 // It is okay to demote the F64 Div result to int32_t or uint32_t using
5027 // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
5028 // will always be within the range of TFromV<V> if b[i] != 0 and
5029 // sizeof(TFromV<V>) <= 4.
5030
5031 return DemoteInRangeTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
5032}
5033template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
5034 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
5035HWY_INLINE V IntDiv(V a, V b) {
5036 const DFromV<decltype(a)> d;
5037 const Half<decltype(d)> dh;
5038 const Repartition<double, decltype(d)> df64;
5039
5040 // It is okay to demote the F64 Div result to int32_t or uint32_t using
5041 // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
5042 // will always be within the range of TFromV<V> if b[i] != 0 and
5043 // sizeof(TFromV<V>) <= 4.
5044
5045 return Combine(d,
5046 DemoteInRangeTo(
5047 dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))),
5048 DemoteInRangeTo(dh, Div(PromoteLowerTo(df64, a),
5049 PromoteLowerTo(df64, b))));
5050}
5051#endif // HWY_HAVE_FLOAT64
5052
5053template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5054 HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
5055 HWY_TARGET == HWY_WASM ||
5056 HWY_TARGET == HWY_WASM_EMU256)
5057 ? 0
5058 : (1 << 1)) |
5059 (1 << 2) | (1 << 4) | (1 << 8))>
5060HWY_INLINE V IntMod(V a, V b) {
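  // a % b == a - (a / b) * b, so the remainder follows from the quotient via a
  // single NegMulAdd.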
5061 return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
5062}
5063
5064#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
5065 HWY_TARGET == HWY_WASM_EMU256
5066template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
5067 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
5068HWY_INLINE V IntMod(V a, V b) {
5069 const DFromV<decltype(a)> d;
5070 const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
5071 return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
5072}
5073
5074template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
5075 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
5076HWY_INLINE V IntMod(V a, V b) {
5077 const DFromV<decltype(a)> d;
5078 const RepartitionToWide<decltype(d)> dw;
5079 return OrderedDemote2To(
5080 d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
5081 IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
5082}
5083#endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
5084 // HWY_WASM_EMU256
5085
5086} // namespace detail
5087
5088#if HWY_TARGET == HWY_SCALAR
5089
5090template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5091HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
5092 return detail::IntDiv<sizeof(T)>(a, b);
5093}
5094template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5095HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
5096 return detail::IntMod<sizeof(T)>(a, b);
5097}
5098
5099#else // HWY_TARGET != HWY_SCALAR
5100
5101template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5102HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
5103 return detail::IntDiv<sizeof(T)>(a, b);
5104}
5105
5106template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5107HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
5108 return detail::IntMod<sizeof(T)>(a, b);
5109}
5110
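// Illustrative usage sketch (not additional API): with vectors of a
// non-float lane type, the overloads above allow writing
//   const auto q = a / b;  // lane-wise quotient a[i] / b[i]
//   const auto r = a % b;  // lane-wise remainder a[i] % b[i]
// Lanes where b[i] == 0 produce implementation-defined values.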
5111#if HWY_CAP_GE256
5112template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5113HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
5114 return detail::IntDiv<sizeof(T)>(a, b);
5115}
5116template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5117HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
5118 return detail::IntMod<sizeof(T)>(a, b);
5119}
5120#endif
5121
5122#if HWY_CAP_GE512
5123template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5124HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
5125 return detail::IntDiv<sizeof(T)>(a, b);
5126}
5127template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5128HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
5129 return detail::IntMod<sizeof(T)>(a, b);
5130}
5131#endif
5132
5133#endif // HWY_TARGET == HWY_SCALAR
5134
5135#endif // HWY_NATIVE_INT_DIV
5136
5137// ------------------------------ SatWidenMulPairwiseAdd
5138
5139#if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
5140 defined(HWY_TARGET_TOGGLE))
5141
5142#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
5143#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
5144#else
5145#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
5146#endif
5147
5148template <class DI16, class VU8, class VI8,
5149 class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16),
5150 HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>),
5151 HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)),
5152 HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))>
5153HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
5154 const RebindToUnsigned<decltype(di16)> du16;
5155
5156 const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
5157 const auto b0 = PromoteEvenTo(di16, b);
5158
5159 const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
5160 const auto b1 = PromoteOddTo(di16, b);
5161
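  // Each u8*i8 product fits in an int16_t (at most 255*127 = 32385 and at
  // least 255*(-128) = -32640), so only the pairwise sum can overflow, and
  // SaturatedAdd clamps it.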
5162 return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
5163}
5164
5165#endif
5166
5167// ------------------------------ SatWidenMulPairwiseAccumulate
5168
5169#if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
5170 defined(HWY_TARGET_TOGGLE))
5171
5172#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5173#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5174#else
5175#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5176#endif
5177
5178template <class DI32, HWY_IF_I32_D(DI32)>
5179HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
5180 DI32 di32, VFromD<Repartition<int16_t, DI32>> a,
5181 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
5182 // WidenMulPairwiseAdd(di32, a, b) is okay here as
5183 // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
5184 // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
5185 // a[0], b[0], a[1], and b[1] are all equal to -32768.
5186
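  // Overflow of the pairwise product can only produce LimitsMin (true value
  // +2^31). In those lanes, mul_overflow below is all-ones (-1):
  // Add(product, -1) wraps to LimitsMax, and a negative sum is bumped by 1, so
  // the SaturatedAdd still yields the correctly saturated sum + 2^31.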
5187 const auto product = WidenMulPairwiseAdd(di32, a, b);
5188
5189 const auto mul_overflow =
5190 VecFromMask(di32, Eq(product, Set(di32, LimitsMin<int32_t>())));
5191
5192 return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
5193 Add(product, mul_overflow));
5194}
5195
5196#endif // HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
5197
5198// ------------------------------ SatWidenMulAccumFixedPoint
5199
5200#if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
5201 defined(HWY_TARGET_TOGGLE))
5202
5203#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5204#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5205#else
5206#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5207#endif
5208
5209template <class DI32, HWY_IF_I32_D(DI32)>
5210HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
5211 VFromD<Rebind<int16_t, DI32>> a,
5212 VFromD<Rebind<int16_t, DI32>> b,
5213 VFromD<DI32> sum) {
5214 const Repartition<int16_t, DI32> dt_i16;
5215
5216 const auto vt_a = ResizeBitCast(dt_i16, a);
5217 const auto vt_b = ResizeBitCast(dt_i16, b);
5218
5219 const auto dup_a = InterleaveWholeLower(dt_i16, vt_a, vt_a);
5220 const auto dup_b = InterleaveWholeLower(dt_i16, vt_b, vt_b);
5221
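  // Duplicating every lane turns the pairwise sum into a[i]*b[i] + a[i]*b[i],
  // i.e. the doubling product 2*a[i]*b[i] used by Q15 fixed-point multiplies,
  // which is then saturation-accumulated into sum.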
5222 return SatWidenMulPairwiseAccumulate(di32, dup_a, dup_b, sum);
5223}
5224
5225#endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5226
5227// ------------------------------ SumOfMulQuadAccumulate
5228
5229#if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
5230 defined(HWY_TARGET_TOGGLE))
5231
5232#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5233#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5234#else
5235#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
5236#endif
5237
5238template <class DI32, HWY_IF_I32_D(DI32)>
5239HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
5240 DI32 di32, VFromD<Repartition<int8_t, DI32>> a,
5241 VFromD<Repartition<int8_t, DI32>> b,
5242 VFromD<DI32> sum) {
5243 const Repartition<int16_t, decltype(di32)> di16;
5244
5245 const auto a0 = PromoteEvenTo(di16, a);
5246 const auto b0 = PromoteEvenTo(di16, b);
5247
5248 const auto a1 = PromoteOddTo(di16, a);
5249 const auto b1 = PromoteOddTo(di16, b);
5250
5251 return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
5252 WidenMulPairwiseAdd(di32, a1, b1)));
5253}
5254
5255#endif
5256
5257#if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
5258 defined(HWY_TARGET_TOGGLE))
5259
5260#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5261#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5262#else
5263#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
5264#endif
5265
5266template <class DU32, HWY_IF_U32_D(DU32)>
5267HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
5268 DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
5269 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
5270 const Repartition<uint16_t, decltype(du32)> du16;
5271 const RebindToSigned<decltype(du16)> di16;
5272 const RebindToSigned<decltype(du32)> di32;
5273
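  // Split each 16-bit group into its low byte (masked) and high byte
  // (shifted); both are in [0, 255], so the widening pairwise products below
  // are exact and their i32 sums cannot overflow.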
5274 const auto lo8_mask = Set(di16, int16_t{0x00FF});
5275 const auto a0 = And(BitCast(di16, a), lo8_mask);
5276 const auto b0 = And(BitCast(di16, b), lo8_mask);
5277
5278 const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
5279 const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b)));
5280
5281 return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)),
5282 BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1))));
5283}
5284
5285#endif
5286
5287#if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
5288 defined(HWY_TARGET_TOGGLE))
5289
5290#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5291#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5292#else
5293#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
5294#endif
5295
5296template <class DI32, HWY_IF_I32_D(DI32)>
5297HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
5298 DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
5299 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
5300 const Repartition<int16_t, decltype(di32)> di16;
5301 const RebindToUnsigned<decltype(di16)> du16;
5302
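  // a0/a1 hold the zero-extended low/high bytes of each 16-bit group of a_u;
  // b0/b1 hold the corresponding bytes of b_i, sign-extended (ShiftLeft<8>
  // followed by arithmetic ShiftRight<8> sign-extends the low byte).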
5303 const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF}));
5304 const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i)));
5305
5306 const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u)));
5307 const auto b1 = ShiftRight<8>(BitCast(di16, b_i));
5308
5309 // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in
5310 // SumOfMulQuadAccumulate as it is possible for
5311 // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0],
5312 // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same
5313 // sign.
5314
5315 return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
5316 WidenMulPairwiseAdd(di32, a1, b1)));
5317}
5318
5319#endif
5320
5321#if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
5322 defined(HWY_TARGET_TOGGLE))
5323
5324#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5325#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5326#else
5327#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5328#endif
5329
5330#if HWY_HAVE_INTEGER64
5331template <class DI64, HWY_IF_I64_D(DI64)>
5332HWY_API VFromD<DI64> SumOfMulQuadAccumulate(
5333 DI64 di64, VFromD<Repartition<int16_t, DI64>> a,
5334 VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) {
5335 const Repartition<int32_t, decltype(di64)> di32;
5336
5337 // WidenMulPairwiseAdd(di32, a, b) is okay here as
5338 // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
5339 // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
5340 // a[0], b[0], a[1], and b[1] are all equal to -32768.
5341
5342 const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b);
5343 const auto i32_pairwise_sum_overflow =
5344 VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin<int32_t>())));
5345
5346 // The upper 32 bits of p0 and p1 need to be zeroed out in the case of
5347 // overflow.
5348 const auto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF}));
5349 const auto p0_zero_out_mask =
5350 ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow));
5351 const auto p1_zero_out_mask =
5352 And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);
5353
5354 const auto p0 =
5355 AndNot(p0_zero_out_mask,
5356 ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum))));
5357 const auto p1 =
5358 AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum)));
5359
5360 return Add(sum, Add(p0, p1));
5361}
5362#endif // HWY_HAVE_INTEGER64
5363#endif // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
5364
5365#if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
5366 defined(HWY_TARGET_TOGGLE))
5367
5368#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5369#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5370#else
5371#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5372#endif
5373
5374#if HWY_HAVE_INTEGER64
5375template <class DU64, HWY_IF_U64_D(DU64)>
5376HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
5377 DU64 du64, VFromD<Repartition<uint16_t, DU64>> a,
5378 VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) {
5379 const auto u32_even_prod = MulEven(a, b);
5380 const auto u32_odd_prod = MulOdd(a, b);
5381
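  // MulEven/MulOdd give the full 32-bit products of the even/odd u16 lane
  // pairs; promoting their even/odd halves to u64 and summing accumulates all
  // four products per 64-bit result lane.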
5382 const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
5383 PromoteEvenTo(du64, u32_odd_prod));
5384 const auto p1 =
5385 Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));
5386
5387 return Add(sum, Add(p0, p1));
5388}
5389#endif // HWY_HAVE_INTEGER64
5390#endif // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
5391
5392// ------------------------------ F64 ApproximateReciprocal
5393
5394#if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
5395#ifdef HWY_NATIVE_F64_APPROX_RECIP
5396#undef HWY_NATIVE_F64_APPROX_RECIP
5397#else
5398#define HWY_NATIVE_F64_APPROX_RECIP
5399#endif
5400
5401#if HWY_HAVE_FLOAT64
5402template <class V, HWY_IF_F64_D(DFromV<V>)>
5403HWY_API V ApproximateReciprocal(V v) {
5404 const DFromV<decltype(v)> d;
5405 return Div(Set(d, 1.0), v);
5406}
5407#endif // HWY_HAVE_FLOAT64
5408
5409#endif // HWY_NATIVE_F64_APPROX_RECIP
5410
5411// ------------------------------ F64 ApproximateReciprocalSqrt
5412
5413#if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
5414#ifdef HWY_NATIVE_F64_APPROX_RSQRT
5415#undef HWY_NATIVE_F64_APPROX_RSQRT
5416#else
5417#define HWY_NATIVE_F64_APPROX_RSQRT
5418#endif
5419
5420#if HWY_HAVE_FLOAT64
5421template <class V, HWY_IF_F64_D(DFromV<V>)>
5422HWY_API V ApproximateReciprocalSqrt(V v) {
5423 const DFromV<decltype(v)> d;
5424 const RebindToUnsigned<decltype(d)> du;
5425 const auto half = Mul(v, Set(d, 0.5));
5426 // Initial guess based on log2(f)
5427 const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}),
5428 ShiftRight<1>(BitCast(du, v))));
5429 // One Newton-Raphson iteration
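  // guess * (1.5 - 0.5 * v * guess^2), the standard rsqrt refinement step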
5430 return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5)));
5431}
5432#endif // HWY_HAVE_FLOAT64
5433
5434#endif // HWY_NATIVE_F64_APPROX_RSQRT
5435
5436// ------------------------------ Compress*
5437
5438#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
5439#ifdef HWY_NATIVE_COMPRESS8
5440#undef HWY_NATIVE_COMPRESS8
5441#else
5442#define HWY_NATIVE_COMPRESS8
5443#endif
5444
5445template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)>
5446HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
5447 T* unaligned) {
5448 HWY_ALIGN T lanes[MaxLanes(d)];
5449 Store(v, d, lanes);
5450
5451 const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8;
5452 T* HWY_RESTRICT pos = unaligned;
5453
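  // For each of the 256 possible bit masks, 8 lane indices: the indices of the
  // selected lanes come first, followed by the unselected lanes in ascending
  // order; only the first PopCount(bits8) bytes of each permuted block end up
  // in the compressed output, since the write position advances by that amount.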
5454 HWY_ALIGN constexpr T table[2048] = {
5455 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5456 1, 0, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5457 2, 0, 1, 3, 4, 5, 6, 7, 0, 2, 1, 3, 4, 5, 6, 7, //
5458 1, 2, 0, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5459 3, 0, 1, 2, 4, 5, 6, 7, 0, 3, 1, 2, 4, 5, 6, 7, //
5460 1, 3, 0, 2, 4, 5, 6, 7, 0, 1, 3, 2, 4, 5, 6, 7, //
5461 2, 3, 0, 1, 4, 5, 6, 7, 0, 2, 3, 1, 4, 5, 6, 7, //
5462 1, 2, 3, 0, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5463 4, 0, 1, 2, 3, 5, 6, 7, 0, 4, 1, 2, 3, 5, 6, 7, //
5464 1, 4, 0, 2, 3, 5, 6, 7, 0, 1, 4, 2, 3, 5, 6, 7, //
5465 2, 4, 0, 1, 3, 5, 6, 7, 0, 2, 4, 1, 3, 5, 6, 7, //
5466 1, 2, 4, 0, 3, 5, 6, 7, 0, 1, 2, 4, 3, 5, 6, 7, //
5467 3, 4, 0, 1, 2, 5, 6, 7, 0, 3, 4, 1, 2, 5, 6, 7, //
5468 1, 3, 4, 0, 2, 5, 6, 7, 0, 1, 3, 4, 2, 5, 6, 7, //
5469 2, 3, 4, 0, 1, 5, 6, 7, 0, 2, 3, 4, 1, 5, 6, 7, //
5470 1, 2, 3, 4, 0, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5471 5, 0, 1, 2, 3, 4, 6, 7, 0, 5, 1, 2, 3, 4, 6, 7, //
5472 1, 5, 0, 2, 3, 4, 6, 7, 0, 1, 5, 2, 3, 4, 6, 7, //
5473 2, 5, 0, 1, 3, 4, 6, 7, 0, 2, 5, 1, 3, 4, 6, 7, //
5474 1, 2, 5, 0, 3, 4, 6, 7, 0, 1, 2, 5, 3, 4, 6, 7, //
5475 3, 5, 0, 1, 2, 4, 6, 7, 0, 3, 5, 1, 2, 4, 6, 7, //
5476 1, 3, 5, 0, 2, 4, 6, 7, 0, 1, 3, 5, 2, 4, 6, 7, //
5477 2, 3, 5, 0, 1, 4, 6, 7, 0, 2, 3, 5, 1, 4, 6, 7, //
5478 1, 2, 3, 5, 0, 4, 6, 7, 0, 1, 2, 3, 5, 4, 6, 7, //
5479 4, 5, 0, 1, 2, 3, 6, 7, 0, 4, 5, 1, 2, 3, 6, 7, //
5480 1, 4, 5, 0, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7, //
5481 2, 4, 5, 0, 1, 3, 6, 7, 0, 2, 4, 5, 1, 3, 6, 7, //
5482 1, 2, 4, 5, 0, 3, 6, 7, 0, 1, 2, 4, 5, 3, 6, 7, //
5483 3, 4, 5, 0, 1, 2, 6, 7, 0, 3, 4, 5, 1, 2, 6, 7, //
5484 1, 3, 4, 5, 0, 2, 6, 7, 0, 1, 3, 4, 5, 2, 6, 7, //
5485 2, 3, 4, 5, 0, 1, 6, 7, 0, 2, 3, 4, 5, 1, 6, 7, //
5486 1, 2, 3, 4, 5, 0, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5487 6, 0, 1, 2, 3, 4, 5, 7, 0, 6, 1, 2, 3, 4, 5, 7, //
5488 1, 6, 0, 2, 3, 4, 5, 7, 0, 1, 6, 2, 3, 4, 5, 7, //
5489 2, 6, 0, 1, 3, 4, 5, 7, 0, 2, 6, 1, 3, 4, 5, 7, //
5490 1, 2, 6, 0, 3, 4, 5, 7, 0, 1, 2, 6, 3, 4, 5, 7, //
5491 3, 6, 0, 1, 2, 4, 5, 7, 0, 3, 6, 1, 2, 4, 5, 7, //
5492 1, 3, 6, 0, 2, 4, 5, 7, 0, 1, 3, 6, 2, 4, 5, 7, //
5493 2, 3, 6, 0, 1, 4, 5, 7, 0, 2, 3, 6, 1, 4, 5, 7, //
5494 1, 2, 3, 6, 0, 4, 5, 7, 0, 1, 2, 3, 6, 4, 5, 7, //
5495 4, 6, 0, 1, 2, 3, 5, 7, 0, 4, 6, 1, 2, 3, 5, 7, //
5496 1, 4, 6, 0, 2, 3, 5, 7, 0, 1, 4, 6, 2, 3, 5, 7, //
5497 2, 4, 6, 0, 1, 3, 5, 7, 0, 2, 4, 6, 1, 3, 5, 7, //
5498 1, 2, 4, 6, 0, 3, 5, 7, 0, 1, 2, 4, 6, 3, 5, 7, //
5499 3, 4, 6, 0, 1, 2, 5, 7, 0, 3, 4, 6, 1, 2, 5, 7, //
5500 1, 3, 4, 6, 0, 2, 5, 7, 0, 1, 3, 4, 6, 2, 5, 7, //
5501 2, 3, 4, 6, 0, 1, 5, 7, 0, 2, 3, 4, 6, 1, 5, 7, //
5502 1, 2, 3, 4, 6, 0, 5, 7, 0, 1, 2, 3, 4, 6, 5, 7, //
5503 5, 6, 0, 1, 2, 3, 4, 7, 0, 5, 6, 1, 2, 3, 4, 7, //
5504 1, 5, 6, 0, 2, 3, 4, 7, 0, 1, 5, 6, 2, 3, 4, 7, //
5505 2, 5, 6, 0, 1, 3, 4, 7, 0, 2, 5, 6, 1, 3, 4, 7, //
5506 1, 2, 5, 6, 0, 3, 4, 7, 0, 1, 2, 5, 6, 3, 4, 7, //
5507 3, 5, 6, 0, 1, 2, 4, 7, 0, 3, 5, 6, 1, 2, 4, 7, //
5508 1, 3, 5, 6, 0, 2, 4, 7, 0, 1, 3, 5, 6, 2, 4, 7, //
5509 2, 3, 5, 6, 0, 1, 4, 7, 0, 2, 3, 5, 6, 1, 4, 7, //
5510 1, 2, 3, 5, 6, 0, 4, 7, 0, 1, 2, 3, 5, 6, 4, 7, //
5511 4, 5, 6, 0, 1, 2, 3, 7, 0, 4, 5, 6, 1, 2, 3, 7, //
5512 1, 4, 5, 6, 0, 2, 3, 7, 0, 1, 4, 5, 6, 2, 3, 7, //
5513 2, 4, 5, 6, 0, 1, 3, 7, 0, 2, 4, 5, 6, 1, 3, 7, //
5514 1, 2, 4, 5, 6, 0, 3, 7, 0, 1, 2, 4, 5, 6, 3, 7, //
5515 3, 4, 5, 6, 0, 1, 2, 7, 0, 3, 4, 5, 6, 1, 2, 7, //
5516 1, 3, 4, 5, 6, 0, 2, 7, 0, 1, 3, 4, 5, 6, 2, 7, //
5517 2, 3, 4, 5, 6, 0, 1, 7, 0, 2, 3, 4, 5, 6, 1, 7, //
5518 1, 2, 3, 4, 5, 6, 0, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
5519 7, 0, 1, 2, 3, 4, 5, 6, 0, 7, 1, 2, 3, 4, 5, 6, //
5520 1, 7, 0, 2, 3, 4, 5, 6, 0, 1, 7, 2, 3, 4, 5, 6, //
5521 2, 7, 0, 1, 3, 4, 5, 6, 0, 2, 7, 1, 3, 4, 5, 6, //
5522 1, 2, 7, 0, 3, 4, 5, 6, 0, 1, 2, 7, 3, 4, 5, 6, //
5523 3, 7, 0, 1, 2, 4, 5, 6, 0, 3, 7, 1, 2, 4, 5, 6, //
5524 1, 3, 7, 0, 2, 4, 5, 6, 0, 1, 3, 7, 2, 4, 5, 6, //
5525 2, 3, 7, 0, 1, 4, 5, 6, 0, 2, 3, 7, 1, 4, 5, 6, //
5526 1, 2, 3, 7, 0, 4, 5, 6, 0, 1, 2, 3, 7, 4, 5, 6, //
5527 4, 7, 0, 1, 2, 3, 5, 6, 0, 4, 7, 1, 2, 3, 5, 6, //
5528 1, 4, 7, 0, 2, 3, 5, 6, 0, 1, 4, 7, 2, 3, 5, 6, //
5529 2, 4, 7, 0, 1, 3, 5, 6, 0, 2, 4, 7, 1, 3, 5, 6, //
5530 1, 2, 4, 7, 0, 3, 5, 6, 0, 1, 2, 4, 7, 3, 5, 6, //
5531 3, 4, 7, 0, 1, 2, 5, 6, 0, 3, 4, 7, 1, 2, 5, 6, //
5532 1, 3, 4, 7, 0, 2, 5, 6, 0, 1, 3, 4, 7, 2, 5, 6, //
5533 2, 3, 4, 7, 0, 1, 5, 6, 0, 2, 3, 4, 7, 1, 5, 6, //
5534 1, 2, 3, 4, 7, 0, 5, 6, 0, 1, 2, 3, 4, 7, 5, 6, //
5535 5, 7, 0, 1, 2, 3, 4, 6, 0, 5, 7, 1, 2, 3, 4, 6, //
5536 1, 5, 7, 0, 2, 3, 4, 6, 0, 1, 5, 7, 2, 3, 4, 6, //
5537 2, 5, 7, 0, 1, 3, 4, 6, 0, 2, 5, 7, 1, 3, 4, 6, //
5538 1, 2, 5, 7, 0, 3, 4, 6, 0, 1, 2, 5, 7, 3, 4, 6, //
5539 3, 5, 7, 0, 1, 2, 4, 6, 0, 3, 5, 7, 1, 2, 4, 6, //
5540 1, 3, 5, 7, 0, 2, 4, 6, 0, 1, 3, 5, 7, 2, 4, 6, //
5541 2, 3, 5, 7, 0, 1, 4, 6, 0, 2, 3, 5, 7, 1, 4, 6, //
5542 1, 2, 3, 5, 7, 0, 4, 6, 0, 1, 2, 3, 5, 7, 4, 6, //
5543 4, 5, 7, 0, 1, 2, 3, 6, 0, 4, 5, 7, 1, 2, 3, 6, //
5544 1, 4, 5, 7, 0, 2, 3, 6, 0, 1, 4, 5, 7, 2, 3, 6, //
5545 2, 4, 5, 7, 0, 1, 3, 6, 0, 2, 4, 5, 7, 1, 3, 6, //
5546 1, 2, 4, 5, 7, 0, 3, 6, 0, 1, 2, 4, 5, 7, 3, 6, //
5547 3, 4, 5, 7, 0, 1, 2, 6, 0, 3, 4, 5, 7, 1, 2, 6, //
5548 1, 3, 4, 5, 7, 0, 2, 6, 0, 1, 3, 4, 5, 7, 2, 6, //
5549 2, 3, 4, 5, 7, 0, 1, 6, 0, 2, 3, 4, 5, 7, 1, 6, //
5550 1, 2, 3, 4, 5, 7, 0, 6, 0, 1, 2, 3, 4, 5, 7, 6, //
5551 6, 7, 0, 1, 2, 3, 4, 5, 0, 6, 7, 1, 2, 3, 4, 5, //
5552 1, 6, 7, 0, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, //
5553 2, 6, 7, 0, 1, 3, 4, 5, 0, 2, 6, 7, 1, 3, 4, 5, //
5554 1, 2, 6, 7, 0, 3, 4, 5, 0, 1, 2, 6, 7, 3, 4, 5, //
5555 3, 6, 7, 0, 1, 2, 4, 5, 0, 3, 6, 7, 1, 2, 4, 5, //
5556 1, 3, 6, 7, 0, 2, 4, 5, 0, 1, 3, 6, 7, 2, 4, 5, //
5557 2, 3, 6, 7, 0, 1, 4, 5, 0, 2, 3, 6, 7, 1, 4, 5, //
5558 1, 2, 3, 6, 7, 0, 4, 5, 0, 1, 2, 3, 6, 7, 4, 5, //
5559 4, 6, 7, 0, 1, 2, 3, 5, 0, 4, 6, 7, 1, 2, 3, 5, //
5560 1, 4, 6, 7, 0, 2, 3, 5, 0, 1, 4, 6, 7, 2, 3, 5, //
5561 2, 4, 6, 7, 0, 1, 3, 5, 0, 2, 4, 6, 7, 1, 3, 5, //
5562 1, 2, 4, 6, 7, 0, 3, 5, 0, 1, 2, 4, 6, 7, 3, 5, //
5563 3, 4, 6, 7, 0, 1, 2, 5, 0, 3, 4, 6, 7, 1, 2, 5, //
5564 1, 3, 4, 6, 7, 0, 2, 5, 0, 1, 3, 4, 6, 7, 2, 5, //
5565 2, 3, 4, 6, 7, 0, 1, 5, 0, 2, 3, 4, 6, 7, 1, 5, //
5566 1, 2, 3, 4, 6, 7, 0, 5, 0, 1, 2, 3, 4, 6, 7, 5, //
5567 5, 6, 7, 0, 1, 2, 3, 4, 0, 5, 6, 7, 1, 2, 3, 4, //
5568 1, 5, 6, 7, 0, 2, 3, 4, 0, 1, 5, 6, 7, 2, 3, 4, //
5569 2, 5, 6, 7, 0, 1, 3, 4, 0, 2, 5, 6, 7, 1, 3, 4, //
5570 1, 2, 5, 6, 7, 0, 3, 4, 0, 1, 2, 5, 6, 7, 3, 4, //
5571 3, 5, 6, 7, 0, 1, 2, 4, 0, 3, 5, 6, 7, 1, 2, 4, //
5572 1, 3, 5, 6, 7, 0, 2, 4, 0, 1, 3, 5, 6, 7, 2, 4, //
5573 2, 3, 5, 6, 7, 0, 1, 4, 0, 2, 3, 5, 6, 7, 1, 4, //
5574 1, 2, 3, 5, 6, 7, 0, 4, 0, 1, 2, 3, 5, 6, 7, 4, //
5575 4, 5, 6, 7, 0, 1, 2, 3, 0, 4, 5, 6, 7, 1, 2, 3, //
5576 1, 4, 5, 6, 7, 0, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3, //
5577 2, 4, 5, 6, 7, 0, 1, 3, 0, 2, 4, 5, 6, 7, 1, 3, //
5578 1, 2, 4, 5, 6, 7, 0, 3, 0, 1, 2, 4, 5, 6, 7, 3, //
5579 3, 4, 5, 6, 7, 0, 1, 2, 0, 3, 4, 5, 6, 7, 1, 2, //
5580 1, 3, 4, 5, 6, 7, 0, 2, 0, 1, 3, 4, 5, 6, 7, 2, //
5581 2, 3, 4, 5, 6, 7, 0, 1, 0, 2, 3, 4, 5, 6, 7, 1, //
5582 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
5583
5584 for (size_t i = 0; i < Lanes(d); i += 8) {
5585 // Each byte worth of bits is the index of one of 256 8-byte ranges, and its
5586 // population count determines how far to advance the write position.
5587 const size_t bits8 = bits[i / 8];
5588 const auto indices = Load(d8, table + bits8 * 8);
5589 const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
5590 StoreU(compressed, d8, pos);
5591 pos += PopCount(bits8);
5592 }
5593 return static_cast<size_t>(pos - unaligned);
5594}
5595
5596template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
5597HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
5598 uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
5599 (void)StoreMaskBits(d, mask, bits);
5600 return CompressBitsStore(v, bits, d, unaligned);
5601}
5602
5603template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
5604HWY_API size_t CompressBlendedStore(V v, M mask, D d,
5605 T* HWY_RESTRICT unaligned) {
5606 HWY_ALIGN T buf[MaxLanes(d)];
5607 const size_t bytes = CompressStore(v, mask, d, buf);
5608 BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned);
5609 return bytes;
5610}
5611
5612// For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE.
5613template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
5614HWY_API V Compress(V v, const M mask) {
5615 const DFromV<V> d;
5616 HWY_ALIGN T lanes[MaxLanes(d)];
5617 (void)CompressStore(v, mask, d, lanes);
5618 return Load(d, lanes);
5619}
5620
5621template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
5622HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
5623 const DFromV<V> d;
5624 HWY_ALIGN T lanes[MaxLanes(d)];
5625 (void)CompressBitsStore(v, bits, d, lanes);
5626 return Load(d, lanes);
5627}
5628
5629template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
5630HWY_API V CompressNot(V v, M mask) {
5631 return Compress(v, Not(mask));
5632}
5633
5634#endif // HWY_NATIVE_COMPRESS8
5635
5636// ------------------------------ Expand
5637
5638// Note that this generic implementation assumes <= 128 bit fixed vectors;
5639// the SVE and RVV targets provide their own native implementations.
5640#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
5641#ifdef HWY_NATIVE_EXPAND
5642#undef HWY_NATIVE_EXPAND
5643#else
5644#define HWY_NATIVE_EXPAND
5645#endif
5646
5647namespace detail {
5648
5649#if HWY_IDE
5650template <class M>
5651HWY_INLINE uint64_t BitsFromMask(M /* mask */) {
5652 return 0;
5653}
5654#endif // HWY_IDE
5655
5656template <size_t N>
5657HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
5658 static_assert(N <= 8, "Should only be called for half-vectors");
5659 const Simd<uint8_t, N, 0> du8;
5660 HWY_DASSERT(mask_bits < 0x100);
5661 alignas(16) static constexpr uint8_t table[2048] = {
5662 // PrintExpand8x8Tables
5663 128, 128, 128, 128, 128, 128, 128, 128, //
5664 0, 128, 128, 128, 128, 128, 128, 128, //
5665 128, 0, 128, 128, 128, 128, 128, 128, //
5666 0, 1, 128, 128, 128, 128, 128, 128, //
5667 128, 128, 0, 128, 128, 128, 128, 128, //
5668 0, 128, 1, 128, 128, 128, 128, 128, //
5669 128, 0, 1, 128, 128, 128, 128, 128, //
5670 0, 1, 2, 128, 128, 128, 128, 128, //
5671 128, 128, 128, 0, 128, 128, 128, 128, //
5672 0, 128, 128, 1, 128, 128, 128, 128, //
5673 128, 0, 128, 1, 128, 128, 128, 128, //
5674 0, 1, 128, 2, 128, 128, 128, 128, //
5675 128, 128, 0, 1, 128, 128, 128, 128, //
5676 0, 128, 1, 2, 128, 128, 128, 128, //
5677 128, 0, 1, 2, 128, 128, 128, 128, //
5678 0, 1, 2, 3, 128, 128, 128, 128, //
5679 128, 128, 128, 128, 0, 128, 128, 128, //
5680 0, 128, 128, 128, 1, 128, 128, 128, //
5681 128, 0, 128, 128, 1, 128, 128, 128, //
5682 0, 1, 128, 128, 2, 128, 128, 128, //
5683 128, 128, 0, 128, 1, 128, 128, 128, //
5684 0, 128, 1, 128, 2, 128, 128, 128, //
5685 128, 0, 1, 128, 2, 128, 128, 128, //
5686 0, 1, 2, 128, 3, 128, 128, 128, //
5687 128, 128, 128, 0, 1, 128, 128, 128, //
5688 0, 128, 128, 1, 2, 128, 128, 128, //
5689 128, 0, 128, 1, 2, 128, 128, 128, //
5690 0, 1, 128, 2, 3, 128, 128, 128, //
5691 128, 128, 0, 1, 2, 128, 128, 128, //
5692 0, 128, 1, 2, 3, 128, 128, 128, //
5693 128, 0, 1, 2, 3, 128, 128, 128, //
5694 0, 1, 2, 3, 4, 128, 128, 128, //
5695 128, 128, 128, 128, 128, 0, 128, 128, //
5696 0, 128, 128, 128, 128, 1, 128, 128, //
5697 128, 0, 128, 128, 128, 1, 128, 128, //
5698 0, 1, 128, 128, 128, 2, 128, 128, //
5699 128, 128, 0, 128, 128, 1, 128, 128, //
5700 0, 128, 1, 128, 128, 2, 128, 128, //
5701 128, 0, 1, 128, 128, 2, 128, 128, //
5702 0, 1, 2, 128, 128, 3, 128, 128, //
5703 128, 128, 128, 0, 128, 1, 128, 128, //
5704 0, 128, 128, 1, 128, 2, 128, 128, //
5705 128, 0, 128, 1, 128, 2, 128, 128, //
5706 0, 1, 128, 2, 128, 3, 128, 128, //
5707 128, 128, 0, 1, 128, 2, 128, 128, //
5708 0, 128, 1, 2, 128, 3, 128, 128, //
5709 128, 0, 1, 2, 128, 3, 128, 128, //
5710 0, 1, 2, 3, 128, 4, 128, 128, //
5711 128, 128, 128, 128, 0, 1, 128, 128, //
5712 0, 128, 128, 128, 1, 2, 128, 128, //
5713 128, 0, 128, 128, 1, 2, 128, 128, //
5714 0, 1, 128, 128, 2, 3, 128, 128, //
5715 128, 128, 0, 128, 1, 2, 128, 128, //
5716 0, 128, 1, 128, 2, 3, 128, 128, //
5717 128, 0, 1, 128, 2, 3, 128, 128, //
5718 0, 1, 2, 128, 3, 4, 128, 128, //
5719 128, 128, 128, 0, 1, 2, 128, 128, //
5720 0, 128, 128, 1, 2, 3, 128, 128, //
5721 128, 0, 128, 1, 2, 3, 128, 128, //
5722 0, 1, 128, 2, 3, 4, 128, 128, //
5723 128, 128, 0, 1, 2, 3, 128, 128, //
5724 0, 128, 1, 2, 3, 4, 128, 128, //
5725 128, 0, 1, 2, 3, 4, 128, 128, //
5726 0, 1, 2, 3, 4, 5, 128, 128, //
5727 128, 128, 128, 128, 128, 128, 0, 128, //
5728 0, 128, 128, 128, 128, 128, 1, 128, //
5729 128, 0, 128, 128, 128, 128, 1, 128, //
5730 0, 1, 128, 128, 128, 128, 2, 128, //
5731 128, 128, 0, 128, 128, 128, 1, 128, //
5732 0, 128, 1, 128, 128, 128, 2, 128, //
5733 128, 0, 1, 128, 128, 128, 2, 128, //
5734 0, 1, 2, 128, 128, 128, 3, 128, //
5735 128, 128, 128, 0, 128, 128, 1, 128, //
5736 0, 128, 128, 1, 128, 128, 2, 128, //
5737 128, 0, 128, 1, 128, 128, 2, 128, //
5738 0, 1, 128, 2, 128, 128, 3, 128, //
5739 128, 128, 0, 1, 128, 128, 2, 128, //
5740 0, 128, 1, 2, 128, 128, 3, 128, //
5741 128, 0, 1, 2, 128, 128, 3, 128, //
5742 0, 1, 2, 3, 128, 128, 4, 128, //
5743 128, 128, 128, 128, 0, 128, 1, 128, //
5744 0, 128, 128, 128, 1, 128, 2, 128, //
5745 128, 0, 128, 128, 1, 128, 2, 128, //
5746 0, 1, 128, 128, 2, 128, 3, 128, //
5747 128, 128, 0, 128, 1, 128, 2, 128, //
5748 0, 128, 1, 128, 2, 128, 3, 128, //
5749 128, 0, 1, 128, 2, 128, 3, 128, //
5750 0, 1, 2, 128, 3, 128, 4, 128, //
5751 128, 128, 128, 0, 1, 128, 2, 128, //
5752 0, 128, 128, 1, 2, 128, 3, 128, //
5753 128, 0, 128, 1, 2, 128, 3, 128, //
5754 0, 1, 128, 2, 3, 128, 4, 128, //
5755 128, 128, 0, 1, 2, 128, 3, 128, //
5756 0, 128, 1, 2, 3, 128, 4, 128, //
5757 128, 0, 1, 2, 3, 128, 4, 128, //
5758 0, 1, 2, 3, 4, 128, 5, 128, //
5759 128, 128, 128, 128, 128, 0, 1, 128, //
5760 0, 128, 128, 128, 128, 1, 2, 128, //
5761 128, 0, 128, 128, 128, 1, 2, 128, //
5762 0, 1, 128, 128, 128, 2, 3, 128, //
5763 128, 128, 0, 128, 128, 1, 2, 128, //
5764 0, 128, 1, 128, 128, 2, 3, 128, //
5765 128, 0, 1, 128, 128, 2, 3, 128, //
5766 0, 1, 2, 128, 128, 3, 4, 128, //
5767 128, 128, 128, 0, 128, 1, 2, 128, //
5768 0, 128, 128, 1, 128, 2, 3, 128, //
5769 128, 0, 128, 1, 128, 2, 3, 128, //
5770 0, 1, 128, 2, 128, 3, 4, 128, //
5771 128, 128, 0, 1, 128, 2, 3, 128, //
5772 0, 128, 1, 2, 128, 3, 4, 128, //
5773 128, 0, 1, 2, 128, 3, 4, 128, //
5774 0, 1, 2, 3, 128, 4, 5, 128, //
5775 128, 128, 128, 128, 0, 1, 2, 128, //
5776 0, 128, 128, 128, 1, 2, 3, 128, //
5777 128, 0, 128, 128, 1, 2, 3, 128, //
5778 0, 1, 128, 128, 2, 3, 4, 128, //
5779 128, 128, 0, 128, 1, 2, 3, 128, //
5780 0, 128, 1, 128, 2, 3, 4, 128, //
5781 128, 0, 1, 128, 2, 3, 4, 128, //
5782 0, 1, 2, 128, 3, 4, 5, 128, //
5783 128, 128, 128, 0, 1, 2, 3, 128, //
5784 0, 128, 128, 1, 2, 3, 4, 128, //
5785 128, 0, 128, 1, 2, 3, 4, 128, //
5786 0, 1, 128, 2, 3, 4, 5, 128, //
5787 128, 128, 0, 1, 2, 3, 4, 128, //
5788 0, 128, 1, 2, 3, 4, 5, 128, //
5789 128, 0, 1, 2, 3, 4, 5, 128, //
5790 0, 1, 2, 3, 4, 5, 6, 128, //
5791 128, 128, 128, 128, 128, 128, 128, 0, //
5792 0, 128, 128, 128, 128, 128, 128, 1, //
5793 128, 0, 128, 128, 128, 128, 128, 1, //
5794 0, 1, 128, 128, 128, 128, 128, 2, //
5795 128, 128, 0, 128, 128, 128, 128, 1, //
5796 0, 128, 1, 128, 128, 128, 128, 2, //
5797 128, 0, 1, 128, 128, 128, 128, 2, //
5798 0, 1, 2, 128, 128, 128, 128, 3, //
5799 128, 128, 128, 0, 128, 128, 128, 1, //
5800 0, 128, 128, 1, 128, 128, 128, 2, //
5801 128, 0, 128, 1, 128, 128, 128, 2, //
5802 0, 1, 128, 2, 128, 128, 128, 3, //
5803 128, 128, 0, 1, 128, 128, 128, 2, //
5804 0, 128, 1, 2, 128, 128, 128, 3, //
5805 128, 0, 1, 2, 128, 128, 128, 3, //
5806 0, 1, 2, 3, 128, 128, 128, 4, //
5807 128, 128, 128, 128, 0, 128, 128, 1, //
5808 0, 128, 128, 128, 1, 128, 128, 2, //
5809 128, 0, 128, 128, 1, 128, 128, 2, //
5810 0, 1, 128, 128, 2, 128, 128, 3, //
5811 128, 128, 0, 128, 1, 128, 128, 2, //
5812 0, 128, 1, 128, 2, 128, 128, 3, //
5813 128, 0, 1, 128, 2, 128, 128, 3, //
5814 0, 1, 2, 128, 3, 128, 128, 4, //
5815 128, 128, 128, 0, 1, 128, 128, 2, //
5816 0, 128, 128, 1, 2, 128, 128, 3, //
5817 128, 0, 128, 1, 2, 128, 128, 3, //
5818 0, 1, 128, 2, 3, 128, 128, 4, //
5819 128, 128, 0, 1, 2, 128, 128, 3, //
5820 0, 128, 1, 2, 3, 128, 128, 4, //
5821 128, 0, 1, 2, 3, 128, 128, 4, //
5822 0, 1, 2, 3, 4, 128, 128, 5, //
5823 128, 128, 128, 128, 128, 0, 128, 1, //
5824 0, 128, 128, 128, 128, 1, 128, 2, //
5825 128, 0, 128, 128, 128, 1, 128, 2, //
5826 0, 1, 128, 128, 128, 2, 128, 3, //
5827 128, 128, 0, 128, 128, 1, 128, 2, //
5828 0, 128, 1, 128, 128, 2, 128, 3, //
5829 128, 0, 1, 128, 128, 2, 128, 3, //
5830 0, 1, 2, 128, 128, 3, 128, 4, //
5831 128, 128, 128, 0, 128, 1, 128, 2, //
5832 0, 128, 128, 1, 128, 2, 128, 3, //
5833 128, 0, 128, 1, 128, 2, 128, 3, //
5834 0, 1, 128, 2, 128, 3, 128, 4, //
5835 128, 128, 0, 1, 128, 2, 128, 3, //
5836 0, 128, 1, 2, 128, 3, 128, 4, //
5837 128, 0, 1, 2, 128, 3, 128, 4, //
5838 0, 1, 2, 3, 128, 4, 128, 5, //
5839 128, 128, 128, 128, 0, 1, 128, 2, //
5840 0, 128, 128, 128, 1, 2, 128, 3, //
5841 128, 0, 128, 128, 1, 2, 128, 3, //
5842 0, 1, 128, 128, 2, 3, 128, 4, //
5843 128, 128, 0, 128, 1, 2, 128, 3, //
5844 0, 128, 1, 128, 2, 3, 128, 4, //
5845 128, 0, 1, 128, 2, 3, 128, 4, //
5846 0, 1, 2, 128, 3, 4, 128, 5, //
5847 128, 128, 128, 0, 1, 2, 128, 3, //
5848 0, 128, 128, 1, 2, 3, 128, 4, //
5849 128, 0, 128, 1, 2, 3, 128, 4, //
5850 0, 1, 128, 2, 3, 4, 128, 5, //
5851 128, 128, 0, 1, 2, 3, 128, 4, //
5852 0, 128, 1, 2, 3, 4, 128, 5, //
5853 128, 0, 1, 2, 3, 4, 128, 5, //
5854 0, 1, 2, 3, 4, 5, 128, 6, //
5855 128, 128, 128, 128, 128, 128, 0, 1, //
5856 0, 128, 128, 128, 128, 128, 1, 2, //
5857 128, 0, 128, 128, 128, 128, 1, 2, //
5858 0, 1, 128, 128, 128, 128, 2, 3, //
5859 128, 128, 0, 128, 128, 128, 1, 2, //
5860 0, 128, 1, 128, 128, 128, 2, 3, //
5861 128, 0, 1, 128, 128, 128, 2, 3, //
5862 0, 1, 2, 128, 128, 128, 3, 4, //
5863 128, 128, 128, 0, 128, 128, 1, 2, //
5864 0, 128, 128, 1, 128, 128, 2, 3, //
5865 128, 0, 128, 1, 128, 128, 2, 3, //
5866 0, 1, 128, 2, 128, 128, 3, 4, //
5867 128, 128, 0, 1, 128, 128, 2, 3, //
5868 0, 128, 1, 2, 128, 128, 3, 4, //
5869 128, 0, 1, 2, 128, 128, 3, 4, //
5870 0, 1, 2, 3, 128, 128, 4, 5, //
5871 128, 128, 128, 128, 0, 128, 1, 2, //
5872 0, 128, 128, 128, 1, 128, 2, 3, //
5873 128, 0, 128, 128, 1, 128, 2, 3, //
5874 0, 1, 128, 128, 2, 128, 3, 4, //
5875 128, 128, 0, 128, 1, 128, 2, 3, //
5876 0, 128, 1, 128, 2, 128, 3, 4, //
5877 128, 0, 1, 128, 2, 128, 3, 4, //
5878 0, 1, 2, 128, 3, 128, 4, 5, //
5879 128, 128, 128, 0, 1, 128, 2, 3, //
5880 0, 128, 128, 1, 2, 128, 3, 4, //
5881 128, 0, 128, 1, 2, 128, 3, 4, //
5882 0, 1, 128, 2, 3, 128, 4, 5, //
5883 128, 128, 0, 1, 2, 128, 3, 4, //
5884 0, 128, 1, 2, 3, 128, 4, 5, //
5885 128, 0, 1, 2, 3, 128, 4, 5, //
5886 0, 1, 2, 3, 4, 128, 5, 6, //
5887 128, 128, 128, 128, 128, 0, 1, 2, //
5888 0, 128, 128, 128, 128, 1, 2, 3, //
5889 128, 0, 128, 128, 128, 1, 2, 3, //
5890 0, 1, 128, 128, 128, 2, 3, 4, //
5891 128, 128, 0, 128, 128, 1, 2, 3, //
5892 0, 128, 1, 128, 128, 2, 3, 4, //
5893 128, 0, 1, 128, 128, 2, 3, 4, //
5894 0, 1, 2, 128, 128, 3, 4, 5, //
5895 128, 128, 128, 0, 128, 1, 2, 3, //
5896 0, 128, 128, 1, 128, 2, 3, 4, //
5897 128, 0, 128, 1, 128, 2, 3, 4, //
5898 0, 1, 128, 2, 128, 3, 4, 5, //
5899 128, 128, 0, 1, 128, 2, 3, 4, //
5900 0, 128, 1, 2, 128, 3, 4, 5, //
5901 128, 0, 1, 2, 128, 3, 4, 5, //
5902 0, 1, 2, 3, 128, 4, 5, 6, //
5903 128, 128, 128, 128, 0, 1, 2, 3, //
5904 0, 128, 128, 128, 1, 2, 3, 4, //
5905 128, 0, 128, 128, 1, 2, 3, 4, //
5906 0, 1, 128, 128, 2, 3, 4, 5, //
5907 128, 128, 0, 128, 1, 2, 3, 4, //
5908 0, 128, 1, 128, 2, 3, 4, 5, //
5909 128, 0, 1, 128, 2, 3, 4, 5, //
5910 0, 1, 2, 128, 3, 4, 5, 6, //
5911 128, 128, 128, 0, 1, 2, 3, 4, //
5912 0, 128, 128, 1, 2, 3, 4, 5, //
5913 128, 0, 128, 1, 2, 3, 4, 5, //
5914 0, 1, 128, 2, 3, 4, 5, 6, //
5915 128, 128, 0, 1, 2, 3, 4, 5, //
5916 0, 128, 1, 2, 3, 4, 5, 6, //
5917 128, 0, 1, 2, 3, 4, 5, 6, //
5918 0, 1, 2, 3, 4, 5, 6, 7};
5919 return LoadU(du8, table + mask_bits * 8);
5920}
5921
5922} // namespace detail
5923
5924// Half vector of bytes: one table lookup
5925template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
5926HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
5927 const DFromV<decltype(v)> d;
5928
5929 const uint64_t mask_bits = detail::BitsFromMask(mask);
5930 const Vec128<uint8_t, N> indices =
5931 detail::IndicesForExpandFromBits<N>(mask_bits);
5932 return BitCast(d, TableLookupBytesOr0(v, indices));
5933}
5934
5935// Full vector of bytes: two table lookups
5936template <typename T, HWY_IF_T_SIZE(T, 1)>
5937HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
5938 const Full128<T> d;
5939 const RebindToUnsigned<decltype(d)> du;
5940 const Half<decltype(du)> duh;
5941 const Vec128<uint8_t> vu = BitCast(du, v);
5942
5943 const uint64_t mask_bits = detail::BitsFromMask(mask);
5944 const uint64_t maskL = mask_bits & 0xFF;
5945 const uint64_t maskH = mask_bits >> 8;
5946
5947 // We want to skip past the v bytes already consumed by idxL. There is no
5948 // instruction for shift-reg by variable bytes. Storing v itself would work
5949 // but would involve a store-load forwarding stall. We instead shuffle using
5950 // loaded indices. multishift_epi64_epi8 would also help, but if we have that,
5951 // we probably also have native 8-bit Expand.
5952 alignas(16) static constexpr uint8_t iota[32] = {
5953 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
5954 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128,
5955 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
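  // shift gathers vu starting at byte PopCount(maskL), so vH below holds the
  // source bytes not yet consumed by the lower-half expansion; out-of-range
  // indices (128) produce zero bytes.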
5956 const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL));
5957 const VFromD<decltype(duh)> vL = LowerHalf(duh, vu);
5958 const VFromD<decltype(duh)> vH =
5959 LowerHalf(duh, TableLookupBytesOr0(vu, shift));
5960
5961 const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL);
5962 const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH);
5963
5964 const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL);
5965 const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH);
5966 return BitCast(d, Combine(du, expandH, expandL));
5967}
5968
5969template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
5970HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
5971 const DFromV<decltype(v)> d;
5972 const RebindToUnsigned<decltype(d)> du;
5973
5974 const Rebind<uint8_t, decltype(d)> du8;
5975 const uint64_t mask_bits = detail::BitsFromMask(mask);
5976
5977 // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
5978 // the nibble trick used below because not all indices fit within one lane.
5979 alignas(16) static constexpr uint8_t table[2048] = {
5980 // PrintExpand16x8ByteTables
5981 128, 128, 128, 128, 128, 128, 128, 128, //
5982 0, 128, 128, 128, 128, 128, 128, 128, //
5983 128, 0, 128, 128, 128, 128, 128, 128, //
5984 0, 2, 128, 128, 128, 128, 128, 128, //
5985 128, 128, 0, 128, 128, 128, 128, 128, //
5986 0, 128, 2, 128, 128, 128, 128, 128, //
5987 128, 0, 2, 128, 128, 128, 128, 128, //
5988 0, 2, 4, 128, 128, 128, 128, 128, //
5989 128, 128, 128, 0, 128, 128, 128, 128, //
5990 0, 128, 128, 2, 128, 128, 128, 128, //
5991 128, 0, 128, 2, 128, 128, 128, 128, //
5992 0, 2, 128, 4, 128, 128, 128, 128, //
5993 128, 128, 0, 2, 128, 128, 128, 128, //
5994 0, 128, 2, 4, 128, 128, 128, 128, //
5995 128, 0, 2, 4, 128, 128, 128, 128, //
5996 0, 2, 4, 6, 128, 128, 128, 128, //
5997 128, 128, 128, 128, 0, 128, 128, 128, //
5998 0, 128, 128, 128, 2, 128, 128, 128, //
5999 128, 0, 128, 128, 2, 128, 128, 128, //
6000 0, 2, 128, 128, 4, 128, 128, 128, //
6001 128, 128, 0, 128, 2, 128, 128, 128, //
6002 0, 128, 2, 128, 4, 128, 128, 128, //
6003 128, 0, 2, 128, 4, 128, 128, 128, //
6004 0, 2, 4, 128, 6, 128, 128, 128, //
6005 128, 128, 128, 0, 2, 128, 128, 128, //
6006 0, 128, 128, 2, 4, 128, 128, 128, //
6007 128, 0, 128, 2, 4, 128, 128, 128, //
6008 0, 2, 128, 4, 6, 128, 128, 128, //
6009 128, 128, 0, 2, 4, 128, 128, 128, //
6010 0, 128, 2, 4, 6, 128, 128, 128, //
6011 128, 0, 2, 4, 6, 128, 128, 128, //
6012 0, 2, 4, 6, 8, 128, 128, 128, //
6013 128, 128, 128, 128, 128, 0, 128, 128, //
6014 0, 128, 128, 128, 128, 2, 128, 128, //
6015 128, 0, 128, 128, 128, 2, 128, 128, //
6016 0, 2, 128, 128, 128, 4, 128, 128, //
6017 128, 128, 0, 128, 128, 2, 128, 128, //
6018 0, 128, 2, 128, 128, 4, 128, 128, //
6019 128, 0, 2, 128, 128, 4, 128, 128, //
6020 0, 2, 4, 128, 128, 6, 128, 128, //
6021 128, 128, 128, 0, 128, 2, 128, 128, //
6022 0, 128, 128, 2, 128, 4, 128, 128, //
6023 128, 0, 128, 2, 128, 4, 128, 128, //
6024 0, 2, 128, 4, 128, 6, 128, 128, //
6025 128, 128, 0, 2, 128, 4, 128, 128, //
6026 0, 128, 2, 4, 128, 6, 128, 128, //
6027 128, 0, 2, 4, 128, 6, 128, 128, //
6028 0, 2, 4, 6, 128, 8, 128, 128, //
6029 128, 128, 128, 128, 0, 2, 128, 128, //
6030 0, 128, 128, 128, 2, 4, 128, 128, //
6031 128, 0, 128, 128, 2, 4, 128, 128, //
6032 0, 2, 128, 128, 4, 6, 128, 128, //
6033 128, 128, 0, 128, 2, 4, 128, 128, //
6034 0, 128, 2, 128, 4, 6, 128, 128, //
6035 128, 0, 2, 128, 4, 6, 128, 128, //
6036 0, 2, 4, 128, 6, 8, 128, 128, //
6037 128, 128, 128, 0, 2, 4, 128, 128, //
6038 0, 128, 128, 2, 4, 6, 128, 128, //
6039 128, 0, 128, 2, 4, 6, 128, 128, //
6040 0, 2, 128, 4, 6, 8, 128, 128, //
6041 128, 128, 0, 2, 4, 6, 128, 128, //
6042 0, 128, 2, 4, 6, 8, 128, 128, //
6043 128, 0, 2, 4, 6, 8, 128, 128, //
6044 0, 2, 4, 6, 8, 10, 128, 128, //
6045 128, 128, 128, 128, 128, 128, 0, 128, //
6046 0, 128, 128, 128, 128, 128, 2, 128, //
6047 128, 0, 128, 128, 128, 128, 2, 128, //
6048 0, 2, 128, 128, 128, 128, 4, 128, //
6049 128, 128, 0, 128, 128, 128, 2, 128, //
6050 0, 128, 2, 128, 128, 128, 4, 128, //
6051 128, 0, 2, 128, 128, 128, 4, 128, //
6052 0, 2, 4, 128, 128, 128, 6, 128, //
6053 128, 128, 128, 0, 128, 128, 2, 128, //
6054 0, 128, 128, 2, 128, 128, 4, 128, //
6055 128, 0, 128, 2, 128, 128, 4, 128, //
6056 0, 2, 128, 4, 128, 128, 6, 128, //
6057 128, 128, 0, 2, 128, 128, 4, 128, //
6058 0, 128, 2, 4, 128, 128, 6, 128, //
6059 128, 0, 2, 4, 128, 128, 6, 128, //
6060 0, 2, 4, 6, 128, 128, 8, 128, //
6061 128, 128, 128, 128, 0, 128, 2, 128, //
6062 0, 128, 128, 128, 2, 128, 4, 128, //
6063 128, 0, 128, 128, 2, 128, 4, 128, //
6064 0, 2, 128, 128, 4, 128, 6, 128, //
6065 128, 128, 0, 128, 2, 128, 4, 128, //
6066 0, 128, 2, 128, 4, 128, 6, 128, //
6067 128, 0, 2, 128, 4, 128, 6, 128, //
6068 0, 2, 4, 128, 6, 128, 8, 128, //
6069 128, 128, 128, 0, 2, 128, 4, 128, //
6070 0, 128, 128, 2, 4, 128, 6, 128, //
6071 128, 0, 128, 2, 4, 128, 6, 128, //
6072 0, 2, 128, 4, 6, 128, 8, 128, //
6073 128, 128, 0, 2, 4, 128, 6, 128, //
6074 0, 128, 2, 4, 6, 128, 8, 128, //
6075 128, 0, 2, 4, 6, 128, 8, 128, //
6076 0, 2, 4, 6, 8, 128, 10, 128, //
6077 128, 128, 128, 128, 128, 0, 2, 128, //
6078 0, 128, 128, 128, 128, 2, 4, 128, //
6079 128, 0, 128, 128, 128, 2, 4, 128, //
6080 0, 2, 128, 128, 128, 4, 6, 128, //
6081 128, 128, 0, 128, 128, 2, 4, 128, //
6082 0, 128, 2, 128, 128, 4, 6, 128, //
6083 128, 0, 2, 128, 128, 4, 6, 128, //
6084 0, 2, 4, 128, 128, 6, 8, 128, //
6085 128, 128, 128, 0, 128, 2, 4, 128, //
6086 0, 128, 128, 2, 128, 4, 6, 128, //
6087 128, 0, 128, 2, 128, 4, 6, 128, //
6088 0, 2, 128, 4, 128, 6, 8, 128, //
6089 128, 128, 0, 2, 128, 4, 6, 128, //
6090 0, 128, 2, 4, 128, 6, 8, 128, //
6091 128, 0, 2, 4, 128, 6, 8, 128, //
6092 0, 2, 4, 6, 128, 8, 10, 128, //
6093 128, 128, 128, 128, 0, 2, 4, 128, //
6094 0, 128, 128, 128, 2, 4, 6, 128, //
6095 128, 0, 128, 128, 2, 4, 6, 128, //
6096 0, 2, 128, 128, 4, 6, 8, 128, //
6097 128, 128, 0, 128, 2, 4, 6, 128, //
6098 0, 128, 2, 128, 4, 6, 8, 128, //
6099 128, 0, 2, 128, 4, 6, 8, 128, //
6100 0, 2, 4, 128, 6, 8, 10, 128, //
6101 128, 128, 128, 0, 2, 4, 6, 128, //
6102 0, 128, 128, 2, 4, 6, 8, 128, //
6103 128, 0, 128, 2, 4, 6, 8, 128, //
6104 0, 2, 128, 4, 6, 8, 10, 128, //
6105 128, 128, 0, 2, 4, 6, 8, 128, //
6106 0, 128, 2, 4, 6, 8, 10, 128, //
6107 128, 0, 2, 4, 6, 8, 10, 128, //
6108 0, 2, 4, 6, 8, 10, 12, 128, //
6109 128, 128, 128, 128, 128, 128, 128, 0, //
6110 0, 128, 128, 128, 128, 128, 128, 2, //
6111 128, 0, 128, 128, 128, 128, 128, 2, //
6112 0, 2, 128, 128, 128, 128, 128, 4, //
6113 128, 128, 0, 128, 128, 128, 128, 2, //
6114 0, 128, 2, 128, 128, 128, 128, 4, //
6115 128, 0, 2, 128, 128, 128, 128, 4, //
6116 0, 2, 4, 128, 128, 128, 128, 6, //
6117 128, 128, 128, 0, 128, 128, 128, 2, //
6118 0, 128, 128, 2, 128, 128, 128, 4, //
6119 128, 0, 128, 2, 128, 128, 128, 4, //
6120 0, 2, 128, 4, 128, 128, 128, 6, //
6121 128, 128, 0, 2, 128, 128, 128, 4, //
6122 0, 128, 2, 4, 128, 128, 128, 6, //
6123 128, 0, 2, 4, 128, 128, 128, 6, //
6124 0, 2, 4, 6, 128, 128, 128, 8, //
6125 128, 128, 128, 128, 0, 128, 128, 2, //
6126 0, 128, 128, 128, 2, 128, 128, 4, //
6127 128, 0, 128, 128, 2, 128, 128, 4, //
6128 0, 2, 128, 128, 4, 128, 128, 6, //
6129 128, 128, 0, 128, 2, 128, 128, 4, //
6130 0, 128, 2, 128, 4, 128, 128, 6, //
6131 128, 0, 2, 128, 4, 128, 128, 6, //
6132 0, 2, 4, 128, 6, 128, 128, 8, //
6133 128, 128, 128, 0, 2, 128, 128, 4, //
6134 0, 128, 128, 2, 4, 128, 128, 6, //
6135 128, 0, 128, 2, 4, 128, 128, 6, //
6136 0, 2, 128, 4, 6, 128, 128, 8, //
6137 128, 128, 0, 2, 4, 128, 128, 6, //
6138 0, 128, 2, 4, 6, 128, 128, 8, //
6139 128, 0, 2, 4, 6, 128, 128, 8, //
6140 0, 2, 4, 6, 8, 128, 128, 10, //
6141 128, 128, 128, 128, 128, 0, 128, 2, //
6142 0, 128, 128, 128, 128, 2, 128, 4, //
6143 128, 0, 128, 128, 128, 2, 128, 4, //
6144 0, 2, 128, 128, 128, 4, 128, 6, //
6145 128, 128, 0, 128, 128, 2, 128, 4, //
6146 0, 128, 2, 128, 128, 4, 128, 6, //
6147 128, 0, 2, 128, 128, 4, 128, 6, //
6148 0, 2, 4, 128, 128, 6, 128, 8, //
6149 128, 128, 128, 0, 128, 2, 128, 4, //
6150 0, 128, 128, 2, 128, 4, 128, 6, //
6151 128, 0, 128, 2, 128, 4, 128, 6, //
6152 0, 2, 128, 4, 128, 6, 128, 8, //
6153 128, 128, 0, 2, 128, 4, 128, 6, //
6154 0, 128, 2, 4, 128, 6, 128, 8, //
6155 128, 0, 2, 4, 128, 6, 128, 8, //
6156 0, 2, 4, 6, 128, 8, 128, 10, //
6157 128, 128, 128, 128, 0, 2, 128, 4, //
6158 0, 128, 128, 128, 2, 4, 128, 6, //
6159 128, 0, 128, 128, 2, 4, 128, 6, //
6160 0, 2, 128, 128, 4, 6, 128, 8, //
6161 128, 128, 0, 128, 2, 4, 128, 6, //
6162 0, 128, 2, 128, 4, 6, 128, 8, //
6163 128, 0, 2, 128, 4, 6, 128, 8, //
6164 0, 2, 4, 128, 6, 8, 128, 10, //
6165 128, 128, 128, 0, 2, 4, 128, 6, //
6166 0, 128, 128, 2, 4, 6, 128, 8, //
6167 128, 0, 128, 2, 4, 6, 128, 8, //
6168 0, 2, 128, 4, 6, 8, 128, 10, //
6169 128, 128, 0, 2, 4, 6, 128, 8, //
6170 0, 128, 2, 4, 6, 8, 128, 10, //
6171 128, 0, 2, 4, 6, 8, 128, 10, //
6172 0, 2, 4, 6, 8, 10, 128, 12, //
6173 128, 128, 128, 128, 128, 128, 0, 2, //
6174 0, 128, 128, 128, 128, 128, 2, 4, //
6175 128, 0, 128, 128, 128, 128, 2, 4, //
6176 0, 2, 128, 128, 128, 128, 4, 6, //
6177 128, 128, 0, 128, 128, 128, 2, 4, //
6178 0, 128, 2, 128, 128, 128, 4, 6, //
6179 128, 0, 2, 128, 128, 128, 4, 6, //
6180 0, 2, 4, 128, 128, 128, 6, 8, //
6181 128, 128, 128, 0, 128, 128, 2, 4, //
6182 0, 128, 128, 2, 128, 128, 4, 6, //
6183 128, 0, 128, 2, 128, 128, 4, 6, //
6184 0, 2, 128, 4, 128, 128, 6, 8, //
6185 128, 128, 0, 2, 128, 128, 4, 6, //
6186 0, 128, 2, 4, 128, 128, 6, 8, //
6187 128, 0, 2, 4, 128, 128, 6, 8, //
6188 0, 2, 4, 6, 128, 128, 8, 10, //
6189 128, 128, 128, 128, 0, 128, 2, 4, //
6190 0, 128, 128, 128, 2, 128, 4, 6, //
6191 128, 0, 128, 128, 2, 128, 4, 6, //
6192 0, 2, 128, 128, 4, 128, 6, 8, //
6193 128, 128, 0, 128, 2, 128, 4, 6, //
6194 0, 128, 2, 128, 4, 128, 6, 8, //
6195 128, 0, 2, 128, 4, 128, 6, 8, //
6196 0, 2, 4, 128, 6, 128, 8, 10, //
6197 128, 128, 128, 0, 2, 128, 4, 6, //
6198 0, 128, 128, 2, 4, 128, 6, 8, //
6199 128, 0, 128, 2, 4, 128, 6, 8, //
6200 0, 2, 128, 4, 6, 128, 8, 10, //
6201 128, 128, 0, 2, 4, 128, 6, 8, //
6202 0, 128, 2, 4, 6, 128, 8, 10, //
6203 128, 0, 2, 4, 6, 128, 8, 10, //
6204 0, 2, 4, 6, 8, 128, 10, 12, //
6205 128, 128, 128, 128, 128, 0, 2, 4, //
6206 0, 128, 128, 128, 128, 2, 4, 6, //
6207 128, 0, 128, 128, 128, 2, 4, 6, //
6208 0, 2, 128, 128, 128, 4, 6, 8, //
6209 128, 128, 0, 128, 128, 2, 4, 6, //
6210 0, 128, 2, 128, 128, 4, 6, 8, //
6211 128, 0, 2, 128, 128, 4, 6, 8, //
6212 0, 2, 4, 128, 128, 6, 8, 10, //
6213 128, 128, 128, 0, 128, 2, 4, 6, //
6214 0, 128, 128, 2, 128, 4, 6, 8, //
6215 128, 0, 128, 2, 128, 4, 6, 8, //
6216 0, 2, 128, 4, 128, 6, 8, 10, //
6217 128, 128, 0, 2, 128, 4, 6, 8, //
6218 0, 128, 2, 4, 128, 6, 8, 10, //
6219 128, 0, 2, 4, 128, 6, 8, 10, //
6220 0, 2, 4, 6, 128, 8, 10, 12, //
6221 128, 128, 128, 128, 0, 2, 4, 6, //
6222 0, 128, 128, 128, 2, 4, 6, 8, //
6223 128, 0, 128, 128, 2, 4, 6, 8, //
6224 0, 2, 128, 128, 4, 6, 8, 10, //
6225 128, 128, 0, 128, 2, 4, 6, 8, //
6226 0, 128, 2, 128, 4, 6, 8, 10, //
6227 128, 0, 2, 128, 4, 6, 8, 10, //
6228 0, 2, 4, 128, 6, 8, 10, 12, //
6229 128, 128, 128, 0, 2, 4, 6, 8, //
6230 0, 128, 128, 2, 4, 6, 8, 10, //
6231 128, 0, 128, 2, 4, 6, 8, 10, //
6232 0, 2, 128, 4, 6, 8, 10, 12, //
6233 128, 128, 0, 2, 4, 6, 8, 10, //
6234 0, 128, 2, 4, 6, 8, 10, 12, //
6235 128, 0, 2, 4, 6, 8, 10, 12, //
6236 0, 2, 4, 6, 8, 10, 12, 14};
6237 // Extend to double length because InterleaveLower will only use the (valid)
6238 // lower half, and we want N u16.
6239 const Twice<decltype(du8)> du8x2;
6240 const Vec128<uint8_t, 2 * N> indices8 =
6241 ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8));
6242 const Vec128<uint16_t, N> indices16 =
6243 BitCast(du, InterleaveLower(du8x2, indices8, indices8));
6244 // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
6245 // indices, add 0 to even and 1 to odd byte lanes.
6246 const Vec128<uint16_t, N> byte_indices = Add(
6247 indices16,
6248 Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
6249 return BitCast(d, TableLookupBytesOr0(v, byte_indices));
6250}
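// Example (illustrative): Expand is the inverse of Compress. With u16 lanes,
// v = {10, 11, 12, 13, ...} and mask = {1, 0, 1, 1, 0, ...} (lane 0 listed
// first), the i-th consecutive lane of v is scattered to the i-th active
// lane of the mask and inactive lanes are zeroed:
//   Expand(v, mask) = {10, 0, 11, 12, 0, ...}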
6251
6252template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
6253HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
6254 const DFromV<decltype(v)> d;
6255 const RebindToUnsigned<decltype(d)> du;
6256
6257 const uint64_t mask_bits = detail::BitsFromMask(mask);
6258
6259 alignas(16) static constexpr uint32_t packed_array[16] = {
6260 // PrintExpand64x4Nibble - same for 32x4.
6261 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
6262 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
6263 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
6264
6265 // For lane i, shift the i-th 4-bit index down to bits [0, 2).
6266 const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]);
6267 alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
6268 Vec128<uint32_t, N> indices = packed >> Load(du, shifts);
6269 // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec
6270 // checks bounds, so clear the upper bits.
6271 indices = And(indices, Set(du, N - 1));
6272 const Vec128<uint32_t, N> expand =
6273 TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices));
6274 // TableLookupLanes cannot also zero masked-off lanes, so do that now.
6275 return IfThenElseZero(mask, BitCast(d, expand));
6276}
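// Example (illustrative): for mask_bits = 5 (lanes 0 and 2 active),
// packed_array[5] = 0x0000f1f0. Nibble i is the source lane for output lane
// i, so lane 0 receives v[0] and lane 2 receives v[1]; the 0xF nibbles belong
// to inactive lanes, which the And clamps and IfThenElseZero finally zeroes.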
6277
6278template <typename T, HWY_IF_T_SIZE(T, 8)>
6279HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
6280 // Same as Compress, just zero out the mask=false lanes.
6281 return IfThenElseZero(mask, Compress(v, mask));
6282}
6283
6284// For single-element vectors, this is at least as fast as native.
6285template <typename T>
6286 HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
6287 return IfThenElseZero(mask, v);
6288 }
6289
6290// ------------------------------ LoadExpand
6291template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6292 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
6293 const TFromD<D>* HWY_RESTRICT unaligned) {
6294 return Expand(LoadU(d, unaligned), mask);
6295}
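// Usage note (illustrative): LoadExpand(mask, d, p) behaves like
// Expand(LoadU(d, p), mask); targets with native expanding loads can
// override this toggle with a fused implementation.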
6296
6297#endif // HWY_NATIVE_EXPAND
6298
6299// ------------------------------ TwoTablesLookupLanes
6300
6301template <class D>
6302 using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));
6303
6304// RVV/SVE have their own implementations of
6305// TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
6306#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
6307 HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
6308 HWY_TARGET != HWY_SVE2_128
6309template <class D>
6310 HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
6311 IndicesFromD<D> idx) {
6312 return TwoTablesLookupLanes(a, b, idx);
6313}
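// Example (illustrative): indices are in [0, 2 * Lanes(d)); index i < N
// selects a[i] and index i >= N selects b[i - N]. For four u32 lanes with
// idx = {1, 5, 2, 7}, the result is {a[1], b[1], a[2], b[3]}.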
6314#endif
6315
6316// ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit)
6317
6318#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
6319#ifdef HWY_NATIVE_REVERSE2_8
6320#undef HWY_NATIVE_REVERSE2_8
6321#else
6322#define HWY_NATIVE_REVERSE2_8
6323#endif
6324
6325#undef HWY_PREFER_ROTATE
6326// Platforms on which RotateRight is likely faster than TableLookupBytes.
6327// RVV and SVE have their own implementations of this anyway.
6328#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
6329 HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
6330#define HWY_PREFER_ROTATE 1
6331#else
6332#define HWY_PREFER_ROTATE 0
6333#endif
6334
6335template <class D, HWY_IF_T_SIZE_D(D, 1)>
6336 HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
6337 // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions.
6338#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
6339 const Repartition<uint16_t, decltype(d)> du16;
6340 return BitCast(d, RotateRight<8>(BitCast(du16, v)));
6341#else
6342 const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
6343 11, 10, 13, 12, 15, 14);
6344 return TableLookupBytes(v, shuffle);
6345#endif
6346}
6347
6348template <class D, HWY_IF_T_SIZE_D(D, 1)>
6349 HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
6350#if HWY_PREFER_ROTATE
6351 const Repartition<uint16_t, decltype(d)> du16;
6352 return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
6353#else
6354 const Repartition<uint8_t, decltype(d)> du8;
6355 const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
6356 du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
6357 return TableLookupBytes(v, BitCast(d, shuffle));
6358#endif
6359}
6360
6361template <class D, HWY_IF_T_SIZE_D(D, 1)>
6362 HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
6363#if HWY_PREFER_ROTATE
6364 const Repartition<uint32_t, D> du32;
6365 return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
6366#else
6367 const Repartition<uint8_t, decltype(d)> du8;
6368 const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
6369 du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
6370 return TableLookupBytes(v, BitCast(d, shuffle));
6371#endif
6372}
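// Example (illustrative): for u8 lanes {0, 1, 2, 3, 4, 5, 6, 7, ...},
//   Reverse2 -> {1, 0, 3, 2, 5, 4, 7, 6, ...}
//   Reverse4 -> {3, 2, 1, 0, 7, 6, 5, 4, ...}
//   Reverse8 -> {7, 6, 5, 4, 3, 2, 1, 0, ...}
// The HWY_PREFER_ROTATE paths obtain the same permutations from wider-lane
// rotates plus the narrower Reverse instead of a byte shuffle.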
6373
6374#endif // HWY_NATIVE_REVERSE2_8
6375
6376// ------------------------------ ReverseLaneBytes
6377
6378#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
6379#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
6380#undef HWY_NATIVE_REVERSE_LANE_BYTES
6381#else
6382#define HWY_NATIVE_REVERSE_LANE_BYTES
6383#endif
6384
6385template <class V, HWY_IF_T_SIZE_V(V, 2)>
6386 HWY_API V ReverseLaneBytes(V v) {
6387 const DFromV<V> d;
6388 const Repartition<uint8_t, decltype(d)> du8;
6389 return BitCast(d, Reverse2(du8, BitCast(du8, v)));
6390}
6391
6392template <class V, HWY_IF_T_SIZE_V(V, 4)>
6393 HWY_API V ReverseLaneBytes(V v) {
6394 const DFromV<V> d;
6395 const Repartition<uint8_t, decltype(d)> du8;
6396 return BitCast(d, Reverse4(du8, BitCast(du8, v)));
6397}
6398
6399template <class V, HWY_IF_T_SIZE_V(V, 8)>
6400 HWY_API V ReverseLaneBytes(V v) {
6401 const DFromV<V> d;
6402 const Repartition<uint8_t, decltype(d)> du8;
6403 return BitCast(d, Reverse8(du8, BitCast(du8, v)));
6404}
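// Example (illustrative): ReverseLaneBytes swaps the byte order within each
// lane, e.g. a u32 lane holding 0x01020304 becomes 0x04030201; this is the
// usual endianness swap, built from the byte-level Reverse2/4/8 above.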
6405
6406#endif // HWY_NATIVE_REVERSE_LANE_BYTES
6407
6408// ------------------------------ ReverseBits
6409
6410// On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore
6411// require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit
6412// shifts because those would add extra masking already taken care of by
6413// UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to
6414// implement ReverseBits, so this code is not used there.
6415#undef HWY_REVERSE_BITS_MIN_BYTES
6416#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
6417 HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
6418#define HWY_REVERSE_BITS_MIN_BYTES 2
6419#else
6420#define HWY_REVERSE_BITS_MIN_BYTES 1
6421#endif
6422
6423#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
6424#ifdef HWY_NATIVE_REVERSE_BITS_UI8
6425#undef HWY_NATIVE_REVERSE_BITS_UI8
6426#else
6427#define HWY_NATIVE_REVERSE_BITS_UI8
6428#endif
6429
6430namespace detail {
6431
6432template <int kShiftAmt, int kShrResultMask, class V,
6435 const DFromV<decltype(v)> d;
6436 const RebindToUnsigned<decltype(d)> du;
6437#if HWY_REVERSE_BITS_MIN_BYTES == 2
6438 const Repartition<uint16_t, decltype(d)> d_shift;
6439#else
6440 const RebindToUnsigned<decltype(d)> d_shift;
6441#endif
6442
6443 const auto v_to_shift = BitCast(d_shift, v);
6444 const auto shl_result = BitCast(d, ShiftLeft<kShiftAmt>(v_to_shift));
6445 const auto shr_result = BitCast(d, ShiftRight<kShiftAmt>(v_to_shift));
6446 const auto shr_result_mask =
6447 BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask)));
6448 return Or(And(shr_result, shr_result_mask),
6449 AndNot(shr_result_mask, shl_result));
6450}
6451
6452#if HWY_REVERSE_BITS_MIN_BYTES == 2
6453template <int kShiftAmt, int kShrResultMask, class V,
6456 return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(Vec128<uint8_t>{v.raw})
6457 .raw};
6458}
6459#endif
6460
6461} // namespace detail
6462
6463template <class V, HWY_IF_T_SIZE_V(V, 1)>
6464 HWY_API V ReverseBits(V v) {
6465 auto result = detail::UI8ReverseBitsStep<1, 0x55>(v);
6466 result = detail::UI8ReverseBitsStep<2, 0x33>(result);
6467 result = detail::UI8ReverseBitsStep<4, 0x0F>(result);
6468 return result;
6469}
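// Worked example (illustrative) for a single u8 lane v = 0b10110010:
//   after step 1 (swap adjacent bits, mask 0x55): 0b01110001
//   after step 2 (swap 2-bit pairs,   mask 0x33): 0b11010100
//   after step 3 (swap nibbles,       mask 0x0F): 0b01001101
// i.e. the bits of v in reverse order.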
6470
6471#endif // HWY_NATIVE_REVERSE_BITS_UI8
6472
6473#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
6474#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
6475#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
6476#else
6477#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
6478#endif
6479
6480template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
6481 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
6482 HWY_API V ReverseBits(V v) {
6483 const DFromV<decltype(v)> d;
6484 const Repartition<uint8_t, decltype(d)> du8;
6485 return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))));
6486}
6487#endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64
6488
6489// ------------------------------ Per4LaneBlockShuffle
6490
6491#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
6492#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6493#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6494#else
6495#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6496#endif
6497
6498#if HWY_TARGET != HWY_SCALAR
6499namespace detail {
6500
6501template <class D>
6502HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
6503 const uint32_t x2,
6504 const uint32_t x1,
6505 const uint32_t x0) {
6506#if HWY_TARGET == HWY_RVV
6507 constexpr int kPow2 = d.Pow2();
6508 constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
6509 const ScalableTag<uint32_t, kLoadPow2> d_load;
6510#else
6511 constexpr size_t kMaxBytes = d.MaxBytes();
6512#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
6513 constexpr size_t kMinLanesToLoad = 2;
6514#else
6515 constexpr size_t kMinLanesToLoad = 4;
6516#endif
6517 constexpr size_t kNumToLoad =
6518 HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
6519 const CappedTag<uint32_t, kNumToLoad> d_load;
6520#endif
6521 return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
6522}
6523
6524} // namespace detail
6525#endif
6526
6527#endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32
6528
6529#if HWY_TARGET != HWY_SCALAR
6530namespace detail {
6531
6532template <class V>
6533HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) {
6534 return DupEven(v);
6535}
6536
6537template <class V>
6538HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) {
6539 const DFromV<decltype(v)> d;
6540 return Reverse2(d, v);
6541}
6542
6543template <class V>
6544HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) {
6545 return v;
6546}
6547
6548template <class V>
6549HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) {
6550 return DupOdd(v);
6551}
6552
6553HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3,
6554 const uint32_t idx2,
6555 const uint32_t idx1,
6556 const uint32_t idx0) {
6557#if HWY_IS_LITTLE_ENDIAN
6558 return static_cast<uint32_t>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) |
6559 idx0);
6560#else
6561 return static_cast<uint32_t>(idx3 | (idx2 << 8) | (idx1 << 16) |
6562 (idx0 << 24));
6563#endif
6564}
6565
6566template <class D>
6567HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
6568 const uint32_t idx2,
6569 const uint32_t idx1,
6570 const uint32_t idx0) {
6571#if HWY_TARGET == HWY_RVV
6572 const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
6573#else
6574 const Repartition<uint32_t, D> du32;
6575#endif
6576
6577 return ResizeBitCast(
6578 d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
6579}
6580
6581#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
6582 HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128
6583#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
6584#else
6585#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
6586
6587template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
6588HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) {
6589 const DFromV<decltype(v)> d;
6590 const Repartition<uint8_t, decltype(d)> du8;
6591 return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx)));
6592}
6593
6594template <class D, HWY_IF_T_SIZE_D(D, 1)>
6595HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
6596 const uint32_t idx2,
6597 const uint32_t idx1,
6598 const uint32_t idx0) {
6599 const Repartition<uint32_t, decltype(d)> du32;
6600 const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0);
6601 const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
6602 du32, static_cast<uint32_t>(idx3210 + 0x0C0C0C0C),
6603 static_cast<uint32_t>(idx3210 + 0x08080808),
6604 static_cast<uint32_t>(idx3210 + 0x04040404),
6605 static_cast<uint32_t>(idx3210));
6606 return ResizeBitCast(d, v_byte_idx);
6607}
6608
6609template <class D, HWY_IF_T_SIZE_D(D, 2)>
6610HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
6611 const uint32_t idx2,
6612 const uint32_t idx1,
6613 const uint32_t idx0) {
6614 const Repartition<uint32_t, decltype(d)> du32;
6615#if HWY_IS_LITTLE_ENDIAN
6616 const uint32_t idx10 = static_cast<uint32_t>((idx1 << 16) | idx0);
6617 const uint32_t idx32 = static_cast<uint32_t>((idx3 << 16) | idx2);
6618 constexpr uint32_t kLaneByteOffsets{0x01000100};
6619#else
6620 const uint32_t idx10 = static_cast<uint32_t>(idx1 | (idx0 << 16));
6621 const uint32_t idx32 = static_cast<uint32_t>(idx3 | (idx2 << 16));
6622 constexpr uint32_t kLaneByteOffsets{0x00010001};
6623#endif
6624 constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u};
6625
6626 const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
6627 du32, static_cast<uint32_t>(idx32 * 0x0202u + kHiLaneByteOffsets),
6628 static_cast<uint32_t>(idx10 * 0x0202u + kHiLaneByteOffsets),
6629 static_cast<uint32_t>(idx32 * 0x0202u + kLaneByteOffsets),
6630 static_cast<uint32_t>(idx10 * 0x0202u + kLaneByteOffsets));
6631 return ResizeBitCast(d, v_byte_idx);
6632}
6633
6634template <class D, HWY_IF_T_SIZE_D(D, 4)>
6635HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
6636 const uint32_t idx2,
6637 const uint32_t idx1,
6638 const uint32_t idx0) {
6639 const Repartition<uint32_t, decltype(d)> du32;
6640#if HWY_IS_LITTLE_ENDIAN
6641 constexpr uint32_t kLaneByteOffsets{0x03020100};
6642#else
6643 constexpr uint32_t kLaneByteOffsets{0x00010203};
6644#endif
6645
6646 const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
6647 du32, static_cast<uint32_t>(idx3 * 0x04040404u + kLaneByteOffsets),
6648 static_cast<uint32_t>(idx2 * 0x04040404u + kLaneByteOffsets),
6649 static_cast<uint32_t>(idx1 * 0x04040404u + kLaneByteOffsets),
6650 static_cast<uint32_t>(idx0 * 0x04040404u + kLaneByteOffsets));
6651 return ResizeBitCast(d, v_byte_idx);
6652}
6653#endif
6654
6655template <class D, HWY_IF_T_SIZE_D(D, 1)>
6656HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
6657 const uint32_t idx2,
6658 const uint32_t idx1,
6659 const uint32_t idx0) {
6660 return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0);
6661}
6662
6663#if HWY_TARGET == HWY_RVV
6664template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6665HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
6666 const uint32_t idx2,
6667 const uint32_t idx1,
6668 const uint32_t idx0) {
6669 const Rebind<uint8_t, decltype(d)> du8;
6670 return PromoteTo(d,
6671 TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0));
6672}
6673#else
6674template <class D, HWY_IF_T_SIZE_D(D, 2)>
6675HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
6676 const uint32_t idx2,
6677 const uint32_t idx1,
6678 const uint32_t idx0) {
6679 const uint16_t u16_idx0 = static_cast<uint16_t>(idx0);
6680 const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
6681 const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
6682 const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
6683#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
6684 constexpr size_t kMinLanesToLoad = 4;
6685#else
6686 constexpr size_t kMinLanesToLoad = 8;
6687#endif
6688 constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
6689 const CappedTag<uint16_t, kNumToLoad> d_load;
6690 return ResizeBitCast(
6691 d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
6692 u16_idx0, u16_idx1, u16_idx2, u16_idx3));
6693}
6694
6695template <class D, HWY_IF_T_SIZE_D(D, 4)>
6696HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
6697 const uint32_t idx2,
6698 const uint32_t idx1,
6699 const uint32_t idx0) {
6700 return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0);
6701}
6702
6703template <class D, HWY_IF_T_SIZE_D(D, 8)>
6704HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
6705 const uint32_t idx2,
6706 const uint32_t idx1,
6707 const uint32_t idx0) {
6708 const RebindToUnsigned<decltype(d)> du;
6709 const Rebind<uint32_t, decltype(d)> du32;
6710 return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2,
6711 idx1, idx0)));
6712}
6713#endif
6714
6715template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
6716HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
6717 const uint32_t idx2,
6718 const uint32_t idx1,
6719 const uint32_t idx0) {
6720 const RebindToUnsigned<decltype(d)> du;
6721 using TU = TFromD<decltype(du)>;
6722 auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0);
6723
6724 constexpr size_t kN = HWY_MAX_LANES_D(D);
6725 if (kN < 4) {
6726 idx_in_blk = And(idx_in_blk, Set(du, static_cast<TU>(kN - 1)));
6727 }
6728
6729#if HWY_TARGET == HWY_RVV
6730 const auto blk_offsets = AndS(Iota0(du), static_cast<TU>(~TU{3}));
6731#else
6732 const auto blk_offsets =
6733 And(Iota(du, TU{0}), Set(du, static_cast<TU>(~TU{3})));
6734#endif
6735 return IndicesFromVec(d, Add(idx_in_blk, blk_offsets));
6736}
6737
6738template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
6739HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) {
6740 return TableLookupLanes(v, idx);
6741}
6742
6743#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE
6744
6745template <class V>
6746HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) {
6747 const DFromV<decltype(v)> d;
6748 const uint32_t idx3 = static_cast<uint32_t>((idx3210 >> 6) & 3);
6749 const uint32_t idx2 = static_cast<uint32_t>((idx3210 >> 4) & 3);
6750 const uint32_t idx1 = static_cast<uint32_t>((idx3210 >> 2) & 3);
6751 const uint32_t idx0 = static_cast<uint32_t>(idx3210 & 3);
6752 const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0);
6753 return Per4LaneBlkShufDoTblLookup(v, idx);
6754}
6755
6756// The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag
6757// and vect_size_tag parameters are only called for vectors that have at
6758// least 4 lanes (or scalable vectors that might possibly have 4 or more lanes)
6759template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
6760 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
6761 hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
6762 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
6763 V v) {
6764 return TblLookupPer4LaneBlkShuf(v, kIdx3210);
6765}
6766
6767#if HWY_HAVE_FLOAT64
6768template <class V>
6769HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
6770 hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) {
6771 const DFromV<decltype(v)> d;
6772 const RepartitionToWide<decltype(d)> dw;
6773 return BitCast(dw, v);
6774}
6775#endif
6776
6777template <size_t kLaneSize, class V>
6778 HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>>
6779 Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */,
6780 hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
6781 const DFromV<decltype(v)> d;
6782 const RebindToUnsigned<decltype(d)> du;
6783 const RepartitionToWide<decltype(du)> dw;
6784 return BitCast(dw, v);
6785}
6786
6787template <size_t kLaneSize, class V>
6788HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
6789 hwy::NonFloatTag /* type_tag */,
6790 hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
6791 const DFromV<decltype(v)> d;
6792 const RepartitionToWide<decltype(d)> dw;
6793 return BitCast(dw, v);
6794}
6795
6796template <class V>
6797HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) {
6798 const DFromV<decltype(v)> d;
6799 return Reverse4(d, v);
6800}
6801
6802template <class V,
6803 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
6804 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
6805HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) {
6806 const DFromV<decltype(v)> d;
6807 const auto vw = Per4LaneBlockShufCastToWide(
6808 hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
6809 return BitCast(d, DupEven(vw));
6810}
6811
6812template <class V,
6813 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
6814 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
6815HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
6816 const DFromV<decltype(v)> d;
6817 const auto vw = Per4LaneBlockShufCastToWide(
6818 hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
6819 const DFromV<decltype(vw)> dw;
6820 return BitCast(d, Reverse2(dw, vw));
6821}
6822
6823#if HWY_MAX_BYTES >= 32
6824template <class V, HWY_IF_T_SIZE_V(V, 8)>
6825HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
6826 return SwapAdjacentBlocks(v);
6827}
6828#endif
6829
6830template <class V, HWY_IF_LANES_D(DFromV<V>, 4),
6831 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
6832HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
6833 const DFromV<decltype(v)> d;
6834 return InterleaveLower(d, v, v);
6835}
6836
6837template <class V, HWY_IF_T_SIZE_V(V, 4)>
6838HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
6839 const DFromV<decltype(v)> d;
6840 return InterleaveLower(d, v, v);
6841}
6842
6843template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
6844HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) {
6845 const DFromV<decltype(v)> d;
6846 return ConcatEven(d, v, v);
6847}
6848
6849template <class V>
6850HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) {
6851 return DupEven(v);
6852}
6853
6854template <class V>
6855HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) {
6856 const DFromV<decltype(v)> d;
6857 return Reverse2(d, v);
6858}
6859
6860template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
6861HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) {
6862 const DFromV<decltype(v)> d;
6863 return ConcatOdd(d, v, v);
6864}
6865
6866template <class V>
6867HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) {
6868 return v;
6869}
6870
6871template <class V,
6872 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
6873 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
6874HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) {
6875 const DFromV<decltype(v)> d;
6876 const auto vw = Per4LaneBlockShufCastToWide(
6877 hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
6878 return BitCast(d, DupOdd(vw));
6879}
6880
6881template <class V>
6882HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) {
6883 return DupOdd(v);
6884}
6885
6886template <class V, HWY_IF_T_SIZE_V(V, 4)>
6887HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) {
6888 const DFromV<decltype(v)> d;
6889 return InterleaveUpper(d, v, v);
6890}
6891
6892template <size_t kIdx3210, class V>
6893 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) {
6894 const DFromV<decltype(v)> d;
6895 return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag<sizeof(TFromV<V>)>(),
6896 hwy::SizeTag<d.MaxBytes()>(), v);
6897}
6898
6899} // namespace detail
6900#endif // HWY_TARGET != HWY_SCALAR
6901
6902template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
6903 HWY_IF_LANES_D(DFromV<V>, 1)>
6904 HWY_API V Per4LaneBlockShuffle(V v) {
6905 static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
6906 static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
6907 static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
6908 static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
6909
6910 return v;
6911}
6912
6913#if HWY_TARGET != HWY_SCALAR
6914template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
6915 HWY_IF_LANES_D(DFromV<V>, 2)>
6916 HWY_API V Per4LaneBlockShuffle(V v) {
6917 static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
6918 static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
6919 static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
6920 static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
6921
6922 constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1);
6923 constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0);
6924 constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1);
6925
6926 constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0;
6927 static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true");
6928 return detail::Per2LaneBlockShuffle(hwy::SizeTag<kIdx10>(), v);
6929}
6930
6931template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
6932 HWY_IF_LANES_GT_D(DFromV<V>, 2)>
6933 HWY_API V Per4LaneBlockShuffle(V v) {
6934 static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
6935 static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
6936 static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
6937 static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
6938
6939 constexpr size_t kIdx3210 =
6940 (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0;
6941 return detail::Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210>(), v);
6942}
6943#endif
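// Example (illustrative): the template indices choose, within every block of
// four lanes, which source lane feeds each output lane (kIdx0 -> lane 0, ...,
// kIdx3 -> lane 3). For u32 lanes {0, 1, 2, 3}:
//   Per4LaneBlockShuffle<3, 2, 1, 0>(v) == {0, 1, 2, 3}  (0xE4, identity)
//   Per4LaneBlockShuffle<0, 1, 2, 3>(v) == {3, 2, 1, 0}  (0x1B, Reverse4)
//   Per4LaneBlockShuffle<1, 0, 3, 2>(v) == {2, 3, 0, 1}  (0x4E, swap halves)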
6944
6945// ------------------------------ Blocks
6946
6947template <class D>
6948HWY_API size_t Blocks(D d) {
6949 return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD<D>) + 15) / 16);
6950}
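// Example (illustrative): for a 512-bit vector of float (16 lanes * 4 bytes),
// Blocks(d) = (64 + 15) / 16 = 4; any tag of at most 16 bytes yields 1.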
6951
6952// ------------------------------ Block insert/extract/broadcast ops
6953#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
6954#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
6955#undef HWY_NATIVE_BLK_INSERT_EXTRACT
6956#else
6957#define HWY_NATIVE_BLK_INSERT_EXTRACT
6958#endif
6959
6960template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6961HWY_API V InsertBlock(V /*v*/, V blk_to_insert) {
6962 static_assert(kBlockIdx == 0, "Invalid block index");
6963 return blk_to_insert;
6964}
6965
6966template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6967 HWY_API V ExtractBlock(V v) {
6968 static_assert(kBlockIdx == 0, "Invalid block index");
6969 return v;
6970}
6971
6972template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6973 HWY_API V BroadcastBlock(V v) {
6974 static_assert(kBlockIdx == 0, "Invalid block index");
6975 return v;
6976}
6977
6978#endif // HWY_NATIVE_BLK_INSERT_EXTRACT
6979
6980// ------------------------------ BroadcastLane
6981#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
6982#ifdef HWY_NATIVE_BROADCASTLANE
6983#undef HWY_NATIVE_BROADCASTLANE
6984#else
6985#define HWY_NATIVE_BROADCASTLANE
6986#endif
6987
6988template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
6989 HWY_API V BroadcastLane(const V v) {
6990 return Broadcast<kLane>(v);
6991}
6992
6993#endif // HWY_NATIVE_BROADCASTLANE
6994
6995// ------------------------------ Slide1Up and Slide1Down
6996#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
6997#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
6998#undef HWY_NATIVE_SLIDE1_UP_DOWN
6999#else
7000#define HWY_NATIVE_SLIDE1_UP_DOWN
7001#endif
7002
7003template <class D, HWY_IF_LANES_D(D, 1)>
7004 HWY_API VFromD<D> Slide1Up(D d, VFromD<D>) {
7005 return Zero(d);
7006}
7007template <class D, HWY_IF_LANES_D(D, 1)>
7008 HWY_API VFromD<D> Slide1Down(D d, VFromD<D>) {
7009 return Zero(d);
7010}
7011
7012#if HWY_TARGET != HWY_SCALAR
7013template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
7014 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
7015 return ShiftLeftLanes<1>(d, v);
7016}
7017template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
7018 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
7019 return ShiftRightLanes<1>(d, v);
7020}
7021#endif // HWY_TARGET != HWY_SCALAR
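// Example (illustrative): for lanes {1, 2, 3, 4},
//   Slide1Up(d, v)   == {0, 1, 2, 3}  (zero enters lane 0)
//   Slide1Down(d, v) == {2, 3, 4, 0}  (zero enters the last lane)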
7022
7023#endif // HWY_NATIVE_SLIDE1_UP_DOWN
7024
7025// ------------------------------ SlideUpBlocks
7026
7027template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7028 HWY_API VFromD<D> SlideUpBlocks(D, VFromD<D> v) {
7029 static_assert(kBlocks == 0, "kBlocks == 0 must be true");
7030 return v;
7031}
7032
7033#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
7034template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
7035 HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
7036 static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
7037 "kBlocks must be between 0 and d.MaxBlocks() - 1");
7038 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
7039 return SlideUpLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
7040}
7041#endif
7042
7043// ------------------------------ SlideDownBlocks
7044
7045template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7046 HWY_API VFromD<D> SlideDownBlocks(D, VFromD<D> v) {
7047 static_assert(kBlocks == 0, "kBlocks == 0 must be true");
7048 return v;
7049}
7050
7051#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
7052template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
7053 HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
7054 static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
7055 "kBlocks must be between 0 and d.MaxBlocks() - 1");
7056 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
7057 return SlideDownLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
7058}
7059#endif
7060
7061// ------------------------------ Slide mask up/down
7062#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
7063
7064#ifdef HWY_NATIVE_SLIDE_MASK
7065#undef HWY_NATIVE_SLIDE_MASK
7066#else
7067#define HWY_NATIVE_SLIDE_MASK
7068#endif
7069
7070template <class D>
7071 HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
7072 return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
7073 }
7074
7075template <class D>
7076 HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
7077 return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
7078 }
7079
7080template <class D>
7081 HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
7082 return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
7083}
7084
7085template <class D>
7086 HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
7087 return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
7088}
7089
7090#endif // HWY_NATIVE_SLIDE_MASK
7091
7092// ------------------------------ SumsOfAdjQuadAbsDiff
7093
7094#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
7095 defined(HWY_TARGET_TOGGLE))
7096#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7097#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7098#else
7099#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7100#endif
7101
7102#if HWY_TARGET != HWY_SCALAR
7103template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
7104HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
7105 static_assert(0 <= kAOffset && kAOffset <= 1,
7106 "kAOffset must be between 0 and 1");
7107 static_assert(0 <= kBOffset && kBOffset <= 3,
7108 "kBOffset must be between 0 and 3");
7109 using D8 = DFromV<V8>;
7110 const D8 d8;
7111 const RebindToUnsigned<decltype(d8)> du8;
7112 const RepartitionToWide<decltype(d8)> d16;
7113 const RepartitionToWide<decltype(du8)> du16;
7114
7115 // Ensure that a is resized to a vector that has at least
7116 // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
7117 // CombineShiftRightBytes operations below.
7118#if HWY_TARGET == HWY_RVV
7119 // On RVV targets, d8_interleave.Pow2() must be >= 0 so that
7120 // Lanes(d8_interleave) >= 16 holds.
7121
7122 // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
7123 // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
7124 constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
7125 const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
7126#elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
7127 HWY_TARGET == HWY_SVE2_128
7128 // On SVE targets, Lanes(d8_interleave) >= 16 and
7129 // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
7130 // tag for a full u8/i8 vector on SVE.
7131 const D8 d8_interleave;
7132#else
7133 // On targets that use non-scalable vector types, Lanes(d8_interleave) is
7134 // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
7135 constexpr size_t kInterleaveLanes =
7136 HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
7137 const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
7138#endif
7139
7140 // The ResizeBitCast operation below will resize a to a vector that has
7141 // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
7142 // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
7143 // below.
7144 const auto a_to_interleave = ResizeBitCast(d8_interleave, a);
7145
7146 const auto a_interleaved_lo =
7147 InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
7148 const auto a_interleaved_hi =
7149 InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);
7150
7151 /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
7152 a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
7153 a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
7154 a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
7155 */
7156 /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
7157 a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
7158 a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
7159 a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
7160 } */
7161
7162 // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
7163 // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations
7164 // and as a01 and a23 need to be the same vector type as b01 and b23 for the
7165 // AbsDiff operations below.
7166 const V8 a01 =
7167 ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
7168 d8_interleave, a_interleaved_hi, a_interleaved_lo));
7169 const V8 a23 =
7170 ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
7171 d8_interleave, a_interleaved_hi, a_interleaved_lo));
7172
7173 /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
7174 b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
7175 b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
7176 b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
7177 */
7178 /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
7179 b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
7180 b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
7181 b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
7182 */
7183 const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
7184 const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));
7185
7186 const VFromD<decltype(du16)> absdiff_sum_01 =
7187 SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
7188 const VFromD<decltype(du16)> absdiff_sum_23 =
7189 SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
7190 return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
7191}
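// In other words (illustrative): each 16-bit output lane i receives
//   sum_{j=0..3} |a[kAOffset*4 + i + j] - b[kBOffset*4 + j]|,
// mirroring the x86 MPSADBW-style quad absolute-difference sums.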
7192#endif // HWY_TARGET != HWY_SCALAR
7193
7194#endif // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
7195
7196// ------------------------------ SumsOfShuffledQuadAbsDiff
7197
7198#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
7199 defined(HWY_TARGET_TOGGLE))
7200#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7201#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7202#else
7203#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7204#endif
7205
7206#if HWY_TARGET != HWY_SCALAR
7207template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
7208 HWY_IF_UI8_D(DFromV<V8>)>
7209HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
7210 V8 b) {
7211 static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
7212 static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
7213 static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
7214 static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
7215
7216#if HWY_TARGET == HWY_RVV
7217 // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
7218 // both vA and vB can be bitcasted to a u32 vector.
7219 const detail::AdjustSimdTagToMinVecPow2<
7220 RepartitionToWideX2<DFromV<decltype(a)>>>
7221 d32;
7222 const RepartitionToNarrow<decltype(d32)> d16;
7223 const RepartitionToNarrow<decltype(d16)> d8;
7224
7225 const auto vA = ResizeBitCast(d8, a);
7226 const auto vB = ResizeBitCast(d8, b);
7227#else
7228 const DFromV<decltype(a)> d8;
7229 const RepartitionToWide<decltype(d8)> d16;
7230 const RepartitionToWide<decltype(d16)> d32;
7231
7232 const auto vA = a;
7233 const auto vB = b;
7234#endif
7235
7236 const RebindToUnsigned<decltype(d8)> du8;
7237
7238 const auto a_shuf =
7239 Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
7240 /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
7241 a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
7242 a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
7243 a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
7244 /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
7245 a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
7246 a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
7247 a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
7248#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
7249 // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
7250 // ShiftLeftBytes/ShiftRightBytes to avoid needlessly zeroing lanes that
7251 // cross into an adjacent 16-byte block: any lanes that Slide1Up/Slide1Down
7252 // move across a block boundary are subsequently replaced by the OddEven
7253 // operation anyway.
7254 const auto a_0123_2345 = BitCast(
7255 d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
7256 const auto a_1234_3456 =
7257 BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
7258 BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
7259#else
7260 const auto a_0123_2345 =
7261 BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
7262 const auto a_1234_3456 = BitCast(
7263 d8,
7264 OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
7265#endif
7266
7267 auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
7268 auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));
7269
7270#if HWY_IS_LITTLE_ENDIAN
7271 odd_sums = ShiftLeft<16>(odd_sums);
7272#else
7273 even_sums = ShiftLeft<16>(even_sums);
7274#endif
7275
7276 const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));
7277
7278#if HWY_TARGET == HWY_RVV
7279 return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
7280#else
7281 return sums;
7282#endif
7283}
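// In other words (illustrative): a is first permuted per 32-bit lane by the
// kIdx indices, then each 16-bit output lane accumulates four absolute byte
// differences against four bytes of b, in the spirit of AVX-512 VDBPSADBW.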
7284#endif // HWY_TARGET != HWY_SCALAR
7285
7286#endif // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
7287
7288// ================================================== Operator wrapper
7289
7290// SVE* and RVV currently cannot define operators and have already defined
7291// (only) the corresponding functions such as Add.
7292#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
7293#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
7294#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
7295#else
7296#define HWY_NATIVE_OPERATOR_REPLACEMENTS
7297#endif
7298
7299template <class V>
7300HWY_API V Add(V a, V b) {
7301 return a + b;
7302}
7303template <class V>
7304HWY_API V Sub(V a, V b) {
7305 return a - b;
7306}
7307
7308template <class V>
7309HWY_API V Mul(V a, V b) {
7310 return a * b;
7311}
7312template <class V>
7313HWY_API V Div(V a, V b) {
7314 return a / b;
7315}
7316template <class V>
7317HWY_API V Mod(V a, V b) {
7318 return a % b;
7319}
7320
7321template <class V>
7322V Shl(V a, V b) {
7323 return a << b;
7324}
7325template <class V>
7326V Shr(V a, V b) {
7327 return a >> b;
7328}
7329
7330template <class V>
7331HWY_API auto Eq(V a, V b) -> decltype(a == b) {
7332 return a == b;
7333}
7334template <class V>
7335HWY_API auto Ne(V a, V b) -> decltype(a == b) {
7336 return a != b;
7337}
7338template <class V>
7339HWY_API auto Lt(V a, V b) -> decltype(a == b) {
7340 return a < b;
7341}
7342
7343template <class V>
7344HWY_API auto Gt(V a, V b) -> decltype(a == b) {
7345 return a > b;
7346}
7347template <class V>
7348HWY_API auto Ge(V a, V b) -> decltype(a == b) {
7349 return a >= b;
7350}
7351
7352template <class V>
7353HWY_API auto Le(V a, V b) -> decltype(a == b) {
7354 return a <= b;
7355}
7356
7357#endif // HWY_NATIVE_OPERATOR_REPLACEMENTS
7358
7359// NOLINTNEXTLINE(google-readability-namespace-comments)
7360} // namespace HWY_NAMESPACE
7361} // namespace hwy
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API VFromD< RepartitionToWideX2< DFromV< V > > > SumsOf4(V v)
Definition generic_ops-inl.h:3733
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v)
Definition generic_ops-inl.h:869
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API VFromD< D > PromoteInRangeTo(D d64, VFromD< Rebind< float, D > > v)
Definition arm_neon-inl.h:4497
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API Vec< DI16 > SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b)
Definition generic_ops-inl.h:5153
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Mask< D > SlideMask1Down(D d, Mask< D > m)
Definition generic_ops-inl.h:7076
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec< RepartitionToWideX3< DFromV< V > > > SumsOf8AbsDiff(V a, V b)
Definition generic_ops-inl.h:2820
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API V ReverseLaneBytes(V v)
Definition generic_ops-inl.h:6386
HWY_API VFromD< D > PromoteLowerTo(D d, V v)
Definition generic_ops-inl.h:2984
HWY_API V MulAddSub(V mul, V x, V sub_or_add)
Definition arm_sve-inl.h:4285
HWY_API V RotateRightSame(V v, int bits)
Definition generic_ops-inl.h:601
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< DTo > PromoteEvenTo(DTo d_to, Vec1< TFrom > v)
Definition scalar-inl.h:1478
HWY_API V Per4LaneBlockShuffle(V v)
Definition generic_ops-inl.h:6904
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_API Vec1< MakeWide< T > > SumsOf2(const Vec1< T > v)
Definition scalar-inl.h:549
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API void ScatterIndexN(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index, const size_t max_lanes_to_store)
Definition generic_ops-inl.h:2782
HWY_API Vec< D > Inf(D d)
Definition generic_ops-inl.h:91
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API V IfNegativeThenElseZero(V v, V yes)
Definition generic_ops-inl.h:241
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API V Ror(V a, V b)
Definition generic_ops-inl.h:459
HWY_API VFromD< DN > OrderedTruncate2To(DN dn, V a, V b)
Definition emu128-inl.h:1978
HWY_API Vec128< uint8_t > AESRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7418
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API V RotateLeft(V v)
Definition generic_ops-inl.h:427
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API Mask< D > SlideMaskDownLanes(D d, Mask< D > m, size_t amt)
Definition generic_ops-inl.h:7086
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API VFromD< D32 > DemoteInRangeTo(D32 d32, VFromD< Rebind< double, D32 > > v)
Definition emu128-inl.h:1845
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
VFromD< ScalableTag< bfloat16_t > > VBF16
Definition arm_sve-inl.h:410
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
decltype(MaskFromVec(Zero(D()))) Mask
Definition generic_ops-inl.h:52
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_API MFromD< DFromV< V > > IsEitherNaN(V a, V b)
Definition generic_ops-inl.h:1177
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API V Clamp(const V v, const V lo, const V hi)
Definition generic_ops-inl.h:56
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7353
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< uint8_t > AESInvMixColumns(Vec128< uint8_t > state)
Definition arm_neon-inl.h:7433
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API size_t Blocks(D d)
Definition generic_ops-inl.h:6948
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API void SafeFillN(const size_t num, const T value, D d, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:172
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
V Shr(V a, V b)
Definition generic_ops-inl.h:7326
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API void SafeCopyN(const size_t num, D d, const T *HWY_RESTRICT from, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:187
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_API Mask< D > SlideMaskUpLanes(D d, Mask< D > m, size_t amt)
Definition generic_ops-inl.h:7081
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
decltype(GetLane(V())) LaneType
Definition generic_ops-inl.h:39
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API MFromD< D > UpperHalfOfMask(D, MFromD< Twice< D > > m)
Definition x86_128-inl.h:1051
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API V ReverseBits(V v)
Definition generic_ops-inl.h:6464
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_API MFromD< D > CombineMasks(D, MFromD< Half< D > > hi, MFromD< Half< D > > lo)
Definition x86_128-inl.h:959
HWY_API Vec1< T > operator%(Vec1< T > a, Vec1< T > b)
Definition generic_ops-inl.h:5095
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
HWY_API VFromD< D > SlideUpBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7028
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V RotateLeftSame(V v, int bits)
Definition generic_ops-inl.h:588
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition rvv-inl.h:3761
decltype(IndicesFromVec(D(), Zero(RebindToUnsigned< D >()))) IndicesFromD
Definition generic_ops-inl.h:6302
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API V IfNegativeThenZeroElse(V v, V no)
Definition generic_ops-inl.h:256
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
Definition abort.h:8
FuncOutput(*)(const void *, FuncInput) Func
Definition nanobenchmark.h:87
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSame()
Definition base.h:499
HWY_API constexpr bool IsSigned()
Definition base.h:2134
typename detail::Relations< T >::Float MakeFloat
Definition base.h:2082
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:2114
constexpr MakeUnsigned< T > SignMask()
Definition base.h:2287
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed+R::is_float+R::is_bf16)<< 8)>
Definition base.h:2105
HWY_API size_t PopCount(T x)
Definition base.h:2615
HWY_API constexpr T LimitsMax()
Definition base.h:2174
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue()
Definition base.h:2212
#define HWY_IF_U8_D(D)
Definition ops/shared-inl.h:577
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_U16_D(D)
Definition ops/shared-inl.h:578
#define HWY_IF_I16_D(D)
Definition ops/shared-inl.h:583
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_V_SIZE_GT_V(V, bytes)
Definition ops/shared-inl.h:636
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_NOT_BF16_D(D)
Definition ops/shared-inl.h:595
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_LANES_LE_D(D, lanes)
Definition ops/shared-inl.h:561
#define HWY_IF_LANES_GT_D(D, lanes)
Definition ops/shared-inl.h:562
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_NOT_FLOAT_V(V)
Definition ops/shared-inl.h:618
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_V_SIZE_V(V, bytes)
Definition ops/shared-inl.h:632
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_V_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:609
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_IF_I8_D(D)
Definition ops/shared-inl.h:582
#define HWY_IF_UI8_D(D)
Definition ops/shared-inl.h:589
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_MAX_BYTES
Definition set_macros-inl.h:168
#define HWY_ALIGN
Definition set_macros-inl.h:167
#define HWY_HAVE_INTEGER64
Definition set_macros-inl.h:172
#define HWY_HAVE_FLOAT64
Definition set_macros-inl.h:174
#define HWY_HAVE_FLOAT16
Definition set_macros-inl.h:173
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition ops/shared-inl.h:198
Definition scalar-inl.h:36
Definition generic_ops-inl.h:975
V operator()(V a, V b) const
Definition generic_ops-inl.h:977
Definition generic_ops-inl.h:989
V operator()(V a, V b) const
Definition generic_ops-inl.h:991
Definition generic_ops-inl.h:982
V operator()(V a, V b) const
Definition generic_ops-inl.h:984
Definition base.h:694
int VFromD
Definition tuple-inl.h:25