Grok 12.0.1
emu128-inl.h
Go to the documentation of this file.
1// Copyright 2022 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Single-element vectors and operations.
17// External include guard in highway.h - see comment there.
18
19#include "hwy/base.h"
20#ifndef HWY_NO_LIBCXX
21#include <math.h> // sqrtf
22#endif
23
24#include "hwy/ops/shared-inl.h"
25
27namespace hwy {
28namespace HWY_NAMESPACE {
29
30template <typename T>
31using Full128 = Simd<T, 16 / sizeof(T), 0>;
32
33// (Wrapper class required for overloading comparison operators.)
34template <typename T, size_t N = 16 / sizeof(T)>
35struct Vec128 {
36 using PrivateT = T; // only for DFromV
37 static constexpr size_t kPrivateN = N; // only for DFromV
38
39 HWY_INLINE Vec128() = default;
40 Vec128(const Vec128&) = default;
41 Vec128& operator=(const Vec128&) = default;
42
44 return *this = (*this * other);
45 }
47 return *this = (*this / other);
48 }
50 return *this = (*this + other);
51 }
53 return *this = (*this - other);
54 }
56 return *this = (*this % other);
57 }
59 return *this = (*this & other);
60 }
62 return *this = (*this | other);
63 }
65 return *this = (*this ^ other);
66 }
67
68 // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
69 // relies on this for LoadInterleaved*. CAVEAT: this method of padding
70 // prevents using range for, especially in SumOfLanes, where it would be
71 // incorrect. Moving padding to another field would require handling the case
72 // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
73 T raw[16 / sizeof(T)] = {};
74};
75
76// 0 or FF..FF, same size as Vec128.
77template <typename T, size_t N = 16 / sizeof(T)>
78struct Mask128 {
80 static HWY_INLINE Raw FromBool(bool b) {
81 return b ? static_cast<Raw>(~Raw{0}) : 0;
82 }
83
84 // Must match the size of Vec128.
85 Raw bits[16 / sizeof(T)] = {};
86};
87
88template <class V>
89using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
90
91template <class V>
92using TFromV = typename V::PrivateT;
93
94// ------------------------------ Zero
95
96// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
97template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
99 Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> v; // zero-initialized
100 return v;
101}
102
103template <class D>
104using VFromD = decltype(Zero(D()));
105
106// ------------------------------ Tuple (VFromD)
107#include "hwy/ops/tuple-inl.h"
108
109// ------------------------------ BitCast
110
111template <class D, class VFrom>
112HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
113 VFromD<D> to;
114 CopySameSize(&v.raw, &to.raw);
115 return to;
116}
117
118// ------------------------------ ResizeBitCast
119
120template <class D, class VFrom>
122 using DFrom = DFromV<VFrom>;
123 using TFrom = TFromD<DFrom>;
124 using TTo = TFromD<D>;
125
126 constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom);
127 constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D);
128 constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
129
130 VFromD<D> to = Zero(d);
131 CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
132 return to;
133}
134
135namespace detail {
136
137// ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if
138// VFromD<DTo> is a larger vector than FromV
139template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
140HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
141 ToSizeTag /* to_size_tag */,
142 DTo d_to, DFrom /* d_from */,
143 VFromD<DFrom> v) {
144 return ResizeBitCast(d_to, v);
145}
146
147} // namespace detail
148
149// ------------------------------ Set
150template <class D, typename T2>
151HWY_API VFromD<D> Set(D d, const T2 t) {
152 VFromD<D> v;
153 for (size_t i = 0; i < MaxLanes(d); ++i) {
154 v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
155 }
156 return v;
157}
158
159// ------------------------------ Undefined
160template <class D>
162 return Zero(d);
163}
164
165// ------------------------------ Dup128VecFromValues
166
167template <class D, HWY_IF_T_SIZE_D(D, 1)>
169 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
170 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
171 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
172 TFromD<D> t11, TFromD<D> t12,
173 TFromD<D> t13, TFromD<D> t14,
174 TFromD<D> t15) {
175 VFromD<D> result;
176 result.raw[0] = t0;
177 result.raw[1] = t1;
178 result.raw[2] = t2;
179 result.raw[3] = t3;
180 result.raw[4] = t4;
181 result.raw[5] = t5;
182 result.raw[6] = t6;
183 result.raw[7] = t7;
184 result.raw[8] = t8;
185 result.raw[9] = t9;
186 result.raw[10] = t10;
187 result.raw[11] = t11;
188 result.raw[12] = t12;
189 result.raw[13] = t13;
190 result.raw[14] = t14;
191 result.raw[15] = t15;
192 return result;
193}
194
195template <class D, HWY_IF_T_SIZE_D(D, 2)>
196HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
197 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
198 TFromD<D> t5, TFromD<D> t6,
199 TFromD<D> t7) {
200 VFromD<D> result;
201 result.raw[0] = t0;
202 result.raw[1] = t1;
203 result.raw[2] = t2;
204 result.raw[3] = t3;
205 result.raw[4] = t4;
206 result.raw[5] = t5;
207 result.raw[6] = t6;
208 result.raw[7] = t7;
209 return result;
210}
211
212template <class D, HWY_IF_T_SIZE_D(D, 4)>
214 TFromD<D> t2, TFromD<D> t3) {
215 VFromD<D> result;
216 result.raw[0] = t0;
217 result.raw[1] = t1;
218 result.raw[2] = t2;
219 result.raw[3] = t3;
220 return result;
221}
222
223template <class D, HWY_IF_T_SIZE_D(D, 8)>
225 VFromD<D> result;
226 result.raw[0] = t0;
227 result.raw[1] = t1;
228 return result;
229}
230
231// ------------------------------ Iota
232
// Returns a vector whose lane i holds first + i. The addition uses
// AddWithWraparound to avoid undefined behavior on signed-integer overflow.
template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, T2 first) {
  VFromD<D> v;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return v;
}
241
242// ================================================== LOGICAL
243
244// ------------------------------ Not
245template <typename T, size_t N>
247 const DFromV<decltype(v)> d;
248 const RebindToUnsigned<decltype(d)> du;
249 using TU = TFromD<decltype(du)>;
250 VFromD<decltype(du)> vu = BitCast(du, v);
251 for (size_t i = 0; i < N; ++i) {
252 vu.raw[i] = static_cast<TU>(~vu.raw[i]);
253 }
254 return BitCast(d, vu);
255}
256
257// ------------------------------ And
258template <typename T, size_t N>
260 const DFromV<decltype(a)> d;
261 const RebindToUnsigned<decltype(d)> du;
262 auto au = BitCast(du, a);
263 auto bu = BitCast(du, b);
264 for (size_t i = 0; i < N; ++i) {
265 au.raw[i] &= bu.raw[i];
266 }
267 return BitCast(d, au);
268}
// Convenience operator forwarding to And().
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  return And(a, b);
}
273
274// ------------------------------ AndNot
275template <typename T, size_t N>
279
280// ------------------------------ Or
281template <typename T, size_t N>
283 const DFromV<decltype(a)> d;
284 const RebindToUnsigned<decltype(d)> du;
285 auto au = BitCast(du, a);
286 auto bu = BitCast(du, b);
287 for (size_t i = 0; i < N; ++i) {
288 au.raw[i] |= bu.raw[i];
289 }
290 return BitCast(d, au);
291}
// Convenience operator forwarding to Or().
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  return Or(a, b);
}
296
297// ------------------------------ Xor
298template <typename T, size_t N>
300 const DFromV<decltype(a)> d;
301 const RebindToUnsigned<decltype(d)> du;
302 auto au = BitCast(du, a);
303 auto bu = BitCast(du, b);
304 for (size_t i = 0; i < N; ++i) {
305 au.raw[i] ^= bu.raw[i];
306 }
307 return BitCast(d, au);
308}
// Convenience operator forwarding to Xor().
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  return Xor(a, b);
}
313
314// ------------------------------ Xor3
315template <typename T, size_t N>
316HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
317 return Xor(x1, Xor(x2, x3));
318}
319
320// ------------------------------ Or3
321template <typename T, size_t N>
322HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
323 return Or(o1, Or(o2, o3));
324}
325
326// ------------------------------ OrAnd
327template <typename T, size_t N>
328HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
329 return Or(o, And(a1, a2));
330}
331
332// ------------------------------ IfVecThenElse
333template <typename T, size_t N>
334HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
335 Vec128<T, N> no) {
336 return Or(And(mask, yes), AndNot(mask, no));
337}
338
339// ------------------------------ CopySign
// Returns a value with the magnitude of `magn` and the sign of `sign`.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  // Take the sign bit from `sign`, all remaining bits from `magn`.
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}
346
347// ------------------------------ CopySignToAbs
// Like CopySign, but `abs` is assumed to have a clear sign bit, so ORing in
// the (masked) sign bit of `sign` suffices — no need to clear it first.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}
354
355// ------------------------------ BroadcastSignBit
356template <typename T, size_t N>
358 for (size_t i = 0; i < N; ++i) {
359 v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
360 }
361 return v;
362}
363
364// ------------------------------ Mask
365
366// v must be 0 or FF..FF.
367template <typename T, size_t N>
368HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
369 Mask128<T, N> mask;
370 CopySameSize(&v.raw, &mask.bits);
371 return mask;
372}
373
374template <class D>
375using MFromD = decltype(MaskFromVec(VFromD<D>()));
376
377template <class DTo, class MFrom>
378HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
379 MFromD<DTo> to;
380 CopySameSize(&mask.bits, &to.bits);
381 return to;
382}
383
384template <class D>
385VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
386 VFromD<D> v;
387 CopySameSize(&mask.bits, &v.raw);
388 return v;
389}
390
391template <class D>
392HWY_API MFromD<D> FirstN(D d, size_t n) {
393 MFromD<D> m;
394 for (size_t i = 0; i < MaxLanes(d); ++i) {
395 m.bits[i] = MFromD<D>::FromBool(i < n);
396 }
397 return m;
398}
399
400// Returns mask ? yes : no.
401template <typename T, size_t N>
403 Vec128<T, N> no) {
404 const DFromV<decltype(yes)> d;
405 return IfVecThenElse(VecFromMask(d, mask), yes, no);
406}
407
408template <typename T, size_t N>
410 const DFromV<decltype(yes)> d;
411 return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
412}
413
414template <typename T, size_t N>
416 const DFromV<decltype(no)> d;
417 return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
418}
419
// Returns yes[i] where v[i] has its sign bit set, else no[i]. Reinterpreting
// as signed integers makes the sign-bit test (`< 0`) work for float lanes too.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto vi = BitCast(di, v);

  // Overwrite v in place; v is a by-value copy, and vi already snapshots the
  // signed reinterpretation, so the order of writes is safe.
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}
432
433// ------------------------------ Mask logical
434
435template <typename T, size_t N>
436HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
437 const Simd<T, N, 0> d;
438 return MaskFromVec(Not(VecFromMask(d, m)));
439}
440
441template <typename T, size_t N>
442HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
443 const Simd<T, N, 0> d;
444 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
445}
446
447template <typename T, size_t N>
448HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
449 const Simd<T, N, 0> d;
450 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
451}
452
453template <typename T, size_t N>
454HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
455 const Simd<T, N, 0> d;
456 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
457}
458
459template <typename T, size_t N>
460HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
461 const Simd<T, N, 0> d;
462 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
463}
464
// Returns true for lanes where neither a nor b is set: ~a & ~b
// (AndNot(x, y) computes ~x & y).
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
470
471// ================================================== SHIFTS
472
473// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
474
475template <int kBits, typename T, size_t N>
477 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
478 using TU = hwy::MakeUnsigned<T>;
479 for (size_t i = 0; i < N; ++i) {
480 const TU raw_u = static_cast<TU>(v.raw[i]);
481 const auto shifted = raw_u << kBits; // separate line to avoid MSVC warning
482 v.raw[i] = static_cast<T>(shifted);
483 }
484 return v;
485}
486
487template <int kBits, typename T, size_t N>
489 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
490 // Signed right shift is now guaranteed to be arithmetic (rounding toward
491 // negative infinity, i.e. shifting in the sign bit).
492 for (size_t i = 0; i < N; ++i) {
493 v.raw[i] = ScalarShr(v.raw[i], kBits);
494 }
495
496 return v;
497}
498
499// ------------------------------ RotateRight (ShiftRight)
// Rotates each lane right by kBits: (v >> kBits) | (v << (bits - kBits)).
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // Shift right in the unsigned domain so it is a logical shift. The HWY_MIN
  // clamps the left-shift count: for kBits == 0 it would be kSizeInBits
  // (invalid); we returned early in that case, but the template argument must
  // still be valid for the instantiation to compile.
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
512
513// ------------------------------ ShiftLeftSame
514
// Shifts all lanes left by the same runtime bit count. Shifting in the
// unsigned domain avoids undefined behavior on signed overflow.
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}
523
// Shifts all lanes right by the same runtime bit count. ScalarShr provides an
// arithmetic shift (sign-extending) for signed T.
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], bits);
  }

  return v;
}
532
533// ------------------------------ Shl
534
535template <typename T, size_t N>
537 for (size_t i = 0; i < N; ++i) {
538 const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
539 << bits.raw[i];
540 v.raw[i] = static_cast<T>(shifted);
541 }
542 return v;
543}
544
545template <typename T, size_t N>
547 for (size_t i = 0; i < N; ++i) {
548 v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
549 }
550
551 return v;
552}
553
554// ================================================== ARITHMETIC
555
556// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
557namespace detail {
558
559template <typename T, size_t N>
561 Vec128<T, N> b) {
562 for (size_t i = 0; i < N; ++i) {
563 const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
564 const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
565 a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
566 }
567 return a;
568}
569template <typename T, size_t N>
571 Vec128<T, N> b) {
572 for (size_t i = 0; i < N; ++i) {
573 const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
574 const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
575 a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
576 }
577 return a;
578}
579
580template <typename T, size_t N>
582 Vec128<T, N> b) {
583 for (size_t i = 0; i < N; ++i) {
584 a.raw[i] += b.raw[i];
585 }
586 return a;
587}
588
589template <typename T, size_t N>
591 Vec128<T, N> b) {
592 for (size_t i = 0; i < N; ++i) {
593 a.raw[i] -= b.raw[i];
594 }
595 return a;
596}
597
598} // namespace detail
599
600template <typename T, size_t N>
602 return detail::Sub(hwy::IsFloatTag<T>(), a, b);
603}
604template <typename T, size_t N>
606 return detail::Add(hwy::IsFloatTag<T>(), a, b);
607}
608
609// ------------------------------ SumsOf8
610
// Sums each group of 8 consecutive u8 lanes into one u64 lane. Relies on
// `sums` starting at zero (Vec128's raw array has a {} member initializer).
template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}
619
// Signed variant: sums each group of 8 consecutive i8 lanes into one i64 lane.
template <size_t N>
HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
  Vec128<int64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}
628
629// ------------------------------ SaturatedAdd
630template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
633 using TW = MakeSigned<MakeWide<T>>;
634 for (size_t i = 0; i < N; ++i) {
635 a.raw[i] = static_cast<T>(HWY_MIN(
636 HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) + b.raw[i]),
637 hwy::HighestValue<T>()));
638 }
639 return a;
640}
641
642// ------------------------------ SaturatedSub
643template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
646 using TW = MakeSigned<MakeWide<T>>;
647 for (size_t i = 0; i < N; ++i) {
648 a.raw[i] = static_cast<T>(HWY_MIN(
649 HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) - b.raw[i]),
650 hwy::HighestValue<T>()));
651 }
652 return a;
653}
654
655// ------------------------------ AverageRound
656template <typename T, size_t N>
658 static_assert(!IsSigned<T>(), "Only for unsigned");
659 for (size_t i = 0; i < N; ++i) {
660 a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
661 }
662 return a;
663}
664
665// ------------------------------ Abs
666
667template <typename T, size_t N>
669 for (size_t i = 0; i < N; ++i) {
670 a.raw[i] = ScalarAbs(a.raw[i]);
671 }
672 return a;
673}
674
675// ------------------------------ Min/Max
676
677// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
678namespace detail {
679
680template <typename T, size_t N>
682 Vec128<T, N> b) {
683 for (size_t i = 0; i < N; ++i) {
684 a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
685 }
686 return a;
687}
688template <typename T, size_t N>
690 Vec128<T, N> b) {
691 for (size_t i = 0; i < N; ++i) {
692 a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
693 }
694 return a;
695}
696
697template <typename T, size_t N>
699 Vec128<T, N> b) {
700 for (size_t i = 0; i < N; ++i) {
701 if (ScalarIsNaN(a.raw[i])) {
702 a.raw[i] = b.raw[i];
703 } else if (ScalarIsNaN(b.raw[i])) {
704 // no change
705 } else {
706 a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
707 }
708 }
709 return a;
710}
711template <typename T, size_t N>
713 Vec128<T, N> b) {
714 for (size_t i = 0; i < N; ++i) {
715 if (ScalarIsNaN(a.raw[i])) {
716 a.raw[i] = b.raw[i];
717 } else if (ScalarIsNaN(b.raw[i])) {
718 // no change
719 } else {
720 a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
721 }
722 }
723 return a;
724}
725
726} // namespace detail
727
728template <typename T, size_t N>
730 return detail::Min(hwy::IsFloatTag<T>(), a, b);
731}
732
733template <typename T, size_t N>
735 return detail::Max(hwy::IsFloatTag<T>(), a, b);
736}
737
738// ------------------------------ Neg
739
740// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
741namespace detail {
742
743template <typename T, size_t N>
745 const DFromV<decltype(v)> d;
746 return Zero(d) - v;
747}
748
749template <typename T, size_t N>
751 const DFromV<decltype(v)> d;
752 return Xor(v, SignBit(d));
753}
754
755template <typename T, size_t N>
757 const DFromV<decltype(v)> d;
758 return Xor(v, SignBit(d));
759}
760
761} // namespace detail
762
763template <typename T, size_t N>
765 return detail::Neg(hwy::IsFloatTag<T>(), v);
766}
767
768// ------------------------------ Mul/Div
769
770// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
771namespace detail {
772
773template <typename T, size_t N>
775 Vec128<T, N> b) {
776 for (size_t i = 0; i < N; ++i) {
777 a.raw[i] *= b.raw[i];
778 }
779 return a;
780}
781
782template <typename T, size_t N>
784 for (size_t i = 0; i < N; ++i) {
785 a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
786 static_cast<uint64_t>(b.raw[i]));
787 }
788 return a;
789}
790
791template <typename T, size_t N>
793 Vec128<T, N> b) {
794 for (size_t i = 0; i < N; ++i) {
795 a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
796 static_cast<uint64_t>(b.raw[i]));
797 }
798 return a;
799}
800
801} // namespace detail
802
803// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
804#ifdef HWY_NATIVE_MUL_8
805#undef HWY_NATIVE_MUL_8
806#else
807#define HWY_NATIVE_MUL_8
808#endif
809#ifdef HWY_NATIVE_MUL_64
810#undef HWY_NATIVE_MUL_64
811#else
812#define HWY_NATIVE_MUL_64
813#endif
814
815template <typename T, size_t N>
817 return detail::Mul(hwy::TypeTag<T>(), a, b);
818}
819
// Lane-wise floating-point division. NOTE(review): a zero divisor yields 0
// rather than the IEEE result (inf/NaN) — presumably to sidestep
// sanitizer/platform issues; confirm callers do not depend on IEEE behavior.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
  }
  return a;
}
827
828// Returns the upper sizeof(T)*8 bits of a * b in each lane.
829template <class T, size_t N,
830 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
833 using TW = MakeWide<T>;
834 for (size_t i = 0; i < N; ++i) {
835 a.raw[i] = static_cast<T>(
836 (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
837 (sizeof(T) * 8));
838 }
839 return a;
840}
841
842template <class T, HWY_IF_UI64(T)>
844 T hi;
845 Mul128(GetLane(a), GetLane(b), &hi);
846 return Set(Full64<T>(), hi);
847}
848
// Upper 64 bits of each 64x64-bit product, for full vectors (two lanes).
// Mul128 writes the upper half of the product to its out-parameter.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  T hi_0;
  T hi_1;

  Mul128(GetLane(a), GetLane(b), &hi_0);
  Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);

  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
}
859
860template <size_t N>
863 for (size_t i = 0; i < N; ++i) {
864 a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i] + 16384) >> 15);
865 }
866 return a;
867}
868
869// Multiplies even lanes (0, 2, ..) and returns the double-wide result.
870template <class T, size_t N,
871 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
874 Vec128<T, N> b) {
875 using TW = MakeWide<T>;
876 Vec128<TW, (N + 1) / 2> mul;
877 for (size_t i = 0; i < N; i += 2) {
878 const TW a_wide = a.raw[i];
879 mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i]);
880 }
881 return mul;
882}
883
884// Multiplies odd lanes (1, 3, ..) and returns the double-wide result.
885template <class T, size_t N,
886 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
889 Vec128<T, N> b) {
890 using TW = MakeWide<T>;
891 Vec128<TW, (N + 1) / 2> mul;
892 for (size_t i = 0; i < N; i += 2) {
893 const TW a_wide = a.raw[i + 1];
894 mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i + 1]);
895 }
896 return mul;
897}
898
899template <size_t N>
901 for (size_t i = 0; i < N; ++i) {
902 // Zero inputs are allowed, but callers are responsible for replacing the
903 // return value with something else (typically using IfThenElse). This check
904 // avoids a ubsan error. The result is arbitrary.
905 v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
906 }
907 return v;
908}
909
910// generic_ops takes care of integer T.
911template <typename T, size_t N, HWY_IF_FLOAT(T)>
913 return Abs(a - b);
914}
915
916// ------------------------------ Floating-point multiply-add variants
917
918template <typename T, size_t N, HWY_IF_FLOAT(T)>
919HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
920 Vec128<T, N> add) {
921 return mul * x + add;
922}
923
924template <typename T, size_t N, HWY_IF_FLOAT(T)>
925HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
926 Vec128<T, N> add) {
927 return add - mul * x;
928}
929
930template <typename T, size_t N, HWY_IF_FLOAT(T)>
931HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
932 Vec128<T, N> sub) {
933 return mul * x - sub;
934}
935
936template <typename T, size_t N, HWY_IF_FLOAT(T)>
937HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
938 Vec128<T, N> sub) {
939 return Neg(mul) * x - sub;
940}
941
942// ------------------------------ Floating-point square root
943
944template <size_t N>
946 for (size_t i = 0; i < N; ++i) {
947 const float half = v.raw[i] * 0.5f;
948 // Initial guess based on log2(f)
949 v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
950 0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
951 // One Newton-Raphson iteration
952 v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
953 }
954 return v;
955}
956
957namespace detail {
958
// Scalar float sqrt: sqrtf when libc is available, else GCC's builtin
// (NOTE(review): __builtin_sqrt computes in double then converts — a possible
// double-rounding vs __builtin_sqrtf; confirm this is intended), else a
// coarse bit-manipulation approximation.
static HWY_INLINE float ScalarSqrt(float v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint32_t bits = BitCastScalar<uint32_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1 << 29) + (bits >> 1) - (1 << 22);
  return BitCastScalar<float>(bits);
#endif  // HWY_COMPILER_GCC_ACTUAL
#else
  return sqrtf(v);
#endif  // HWY_NO_LIBCXX
}
// Scalar double sqrt: sqrt when libc is available, else GCC's builtin, else a
// coarse bit-manipulation approximation.
static HWY_INLINE double ScalarSqrt(double v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint64_t bits = BitCastScalar<uint64_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
  return BitCastScalar<double>(bits);
#endif  // HWY_COMPILER_GCC_ACTUAL
#else
  return sqrt(v);
#endif  // HWY_NO_LIBCXX
}
987
988} // namespace detail
989
990template <typename T, size_t N>
992 for (size_t i = 0; i < N; ++i) {
993 v.raw[i] = detail::ScalarSqrt(v.raw[i]);
994 }
995 return v;
996}
997
998// ------------------------------ Floating-point rounding
999
1000template <typename T, size_t N>
1002 using TI = MakeSigned<T>;
1003 const T k0 = ConvertScalarTo<T>(0);
1004 const Vec128<T, N> a = Abs(v);
1005 for (size_t i = 0; i < N; ++i) {
1006 if (!(a.raw[i] < MantissaEnd<T>())) { // Huge or NaN
1007 continue;
1008 }
1009 const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
1010 const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
1011 if (rounded == 0) {
1012 v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
1013 continue;
1014 }
1015 const T rounded_f = ConvertScalarTo<T>(rounded);
1016 // Round to even
1017 if ((rounded & 1) &&
1018 ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
1019 v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
1020 continue;
1021 }
1022 v.raw[i] = rounded_f;
1023 }
1024 return v;
1025}
1026
1027// Round-to-nearest even.
// Round-to-nearest-even conversion of float lanes to int32, saturating to
// LimitsMin/LimitsMax for out-of-range inputs and NaN.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
  using T = float;
  using TI = int32_t;
  const T k0 = ConvertScalarTo<T>(0);

  const Vec128<float, N> abs = Abs(v);
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = ScalarSignBit(v.raw[i]);

    // Negated comparison so that NaN (all comparisons false) takes this path.
    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      // Check if too large to cast or NaN
      if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
        continue;
      }
      // >= MantissaEnd means already an integer; the cast is exact.
      ret.raw[i] = static_cast<TI>(v.raw[i]);
      continue;
    }
    // Round half away from zero first, then correct ties back to even below.
    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = ConvertScalarTo<T>(rounded);
    // Round to even
    if ((rounded & 1) &&
        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
      // Exactly halfway and rounded to odd: step back toward zero (even).
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}
1065
1066template <typename T, size_t N>
1068 using TI = MakeSigned<T>;
1069 const Vec128<T, N> abs = Abs(v);
1070 for (size_t i = 0; i < N; ++i) {
1071 if (!(abs.raw[i] <= MantissaEnd<T>())) { // Huge or NaN
1072 continue;
1073 }
1074 const TI truncated = static_cast<TI>(v.raw[i]);
1075 if (truncated == 0) {
1076 v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
1077 continue;
1078 }
1079 v.raw[i] = static_cast<T>(truncated);
1080 }
1081 return v;
1082}
1083
1084// Toward +infinity, aka ceiling
1085template <typename Float, size_t N>
1087 constexpr int kMantissaBits = MantissaBits<Float>();
1088 using Bits = MakeUnsigned<Float>;
1089 const Bits kExponentMask = MaxExponentField<Float>();
1090 const Bits kMantissaMask = MantissaMask<Float>();
1091 const Bits kBias = kExponentMask / 2;
1092
1093 for (size_t i = 0; i < N; ++i) {
1094 const bool positive = v.raw[i] > Float(0.0);
1095
1096 Bits bits = BitCastScalar<Bits>(v.raw[i]);
1097
1098 const int exponent =
1099 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1100 // Already an integer.
1101 if (exponent >= kMantissaBits) continue;
1102 // |v| <= 1 => 0 or 1.
1103 if (exponent < 0) {
1104 v.raw[i] = positive ? Float{1} : Float{-0.0};
1105 continue;
1106 }
1107
1108 const Bits mantissa_mask = kMantissaMask >> exponent;
1109 // Already an integer
1110 if ((bits & mantissa_mask) == 0) continue;
1111
1112 // Clear fractional bits and round up
1113 if (positive) bits += (kMantissaMask + 1) >> exponent;
1114 bits &= ~mantissa_mask;
1115
1116 v.raw[i] = BitCastScalar<Float>(bits);
1117 }
1118 return v;
1119}
1120
1121// Toward -infinity, aka floor
1122template <typename Float, size_t N>
1124 constexpr int kMantissaBits = MantissaBits<Float>();
1125 using Bits = MakeUnsigned<Float>;
1126 const Bits kExponentMask = MaxExponentField<Float>();
1127 const Bits kMantissaMask = MantissaMask<Float>();
1128 const Bits kBias = kExponentMask / 2;
1129
1130 for (size_t i = 0; i < N; ++i) {
1131 const bool negative = v.raw[i] < Float(0.0);
1132
1133 Bits bits = BitCastScalar<Bits>(v.raw[i]);
1134
1135 const int exponent =
1136 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1137 // Already an integer.
1138 if (exponent >= kMantissaBits) continue;
1139 // |v| <= 1 => -1 or 0.
1140 if (exponent < 0) {
1141 v.raw[i] = negative ? Float(-1.0) : Float(0.0);
1142 continue;
1143 }
1144
1145 const Bits mantissa_mask = kMantissaMask >> exponent;
1146 // Already an integer
1147 if ((bits & mantissa_mask) == 0) continue;
1148
1149 // Clear fractional bits and round down
1150 if (negative) bits += (kMantissaMask + 1) >> exponent;
1151 bits &= ~mantissa_mask;
1152
1153 v.raw[i] = BitCastScalar<Float>(bits);
1154 }
1155 return v;
1156}
1157
1158// ------------------------------ Floating-point classification
1159
1160template <typename T, size_t N>
1161HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
1162 Mask128<T, N> ret;
1163 for (size_t i = 0; i < N; ++i) {
1164 // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
1165 ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
1166 }
1167 return ret;
1168}
1169
1170// ================================================== COMPARE
1171
1172template <typename T, size_t N>
1175 for (size_t i = 0; i < N; ++i) {
1176 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
1177 }
1178 return m;
1179}
1180
1181template <typename T, size_t N>
1184 for (size_t i = 0; i < N; ++i) {
1185 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
1186 }
1187 return m;
1188}
1189
1190template <typename T, size_t N>
1192 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1193 return (v & bit) == bit;
1194}
1195
1196template <typename T, size_t N>
1199 for (size_t i = 0; i < N; ++i) {
1200 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
1201 }
1202 return m;
1203}
1204template <typename T, size_t N>
1205HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
1206 Mask128<T, N> m;
1207 for (size_t i = 0; i < N; ++i) {
1208 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
1209 }
1210 return m;
1211}
1212
1213template <typename T, size_t N>
1216 for (size_t i = 0; i < N; ++i) {
1217 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
1218 }
1219 return m;
1220}
1221template <typename T, size_t N>
1222HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
1223 Mask128<T, N> m;
1224 for (size_t i = 0; i < N; ++i) {
1225 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
1226 }
1227 return m;
1228}
1229
1230// ------------------------------ Lt128
1231
1232// Only makes sense for full vectors of u64.
1233template <class D>
1235 const bool lt =
1236 (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
1238 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
1239 return ret;
1240}
1241
1242template <class D>
1244 Vec128<uint64_t> b) {
1245 const bool lt = a.raw[1] < b.raw[1];
1247 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
1248 return ret;
1249}
1250
1251// ------------------------------ Eq128
1252
1253// Only makes sense for full vectors of u64.
1254template <class D>
1256 const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
1258 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
1259 return ret;
1260}
1261
1262template <class D>
1264 Vec128<uint64_t> b) {
1265 const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
1267 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
1268 return ret;
1269}
1270
1271template <class D>
1273 Vec128<uint64_t> b) {
1274 const bool eq = a.raw[1] == b.raw[1];
1276 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
1277 return ret;
1278}
1279
1280template <class D>
1282 Vec128<uint64_t> b) {
1283 const bool ne = a.raw[1] != b.raw[1];
1285 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
1286 return ret;
1287}
1288
1289// ------------------------------ Min128, Max128 (Lt128)
1290
1291template <class D>
1293 return IfThenElse(Lt128(d, a, b), a, b);
1294}
1295
1296template <class D>
1298 return IfThenElse(Lt128(d, b, a), a, b);
1299}
1300
1301template <class D>
1303 return IfThenElse(Lt128Upper(d, a, b), a, b);
1304}
1305
1306template <class D>
1308 return IfThenElse(Lt128Upper(d, b, a), a, b);
1309}
1310
1311// ================================================== MEMORY
1312
1313// ------------------------------ Load
1314
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Copies d.MaxBytes() from `aligned` into a new vector. The scalar
  // emulation has no real alignment requirement; the name matches the
  // Highway API contract.
  VFromD<D> v;
  CopyBytes<d.MaxBytes()>(aligned, v.raw);  // copy from array
  return v;
}
1321
template <class D>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT p) {
  // Load all lanes, then zero the lanes whose mask bit is clear.
  return IfThenElseZero(m, LoadU(d, p));
}
1327
template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
  // Load all lanes; lanes whose mask bit is clear take their value from `v`.
  return IfThenElse(m, LoadU(d, p), v);
}
1333
1334template <class D>
1336 return Load(d, p);
1337}
1338
1339// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // EMU128 vectors span at most one 128-bit block, so there is nothing to
  // duplicate; a plain Load suffices.
  return Load(d, aligned);
}
1344
1345#ifdef HWY_NATIVE_LOAD_N
1346#undef HWY_NATIVE_LOAD_N
1347#else
1348#define HWY_NATIVE_LOAD_N
1349#endif
1350
1351template <class D>
1353 size_t max_lanes_to_load) {
1354 VFromD<D> v = Zero(d);
1355 const size_t N = Lanes(d);
1356 const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
1357 CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
1358 return v;
1359}
1360
1361template <class D>
1363 size_t max_lanes_to_load) {
1364 VFromD<D> v = no;
1365 const size_t N = Lanes(d);
1366 const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
1367 CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
1368 return v;
1369}
1370
1371// ------------------------------ Store
1372
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  // Byte-copies all lanes to memory; no alignment requirement in emulation.
  CopyBytes<d.MaxBytes()>(v.raw, aligned);  // copy to array
}
1377
1378template <class D>
1380 Store(v, d, p);
1381}
1382
1383template <class D>
1384HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1385 TFromD<D>* HWY_RESTRICT p) {
1386 for (size_t i = 0; i < MaxLanes(d); ++i) {
1387 if (m.bits[i]) p[i] = v.raw[i];
1388 }
1389}
1390
1391#ifdef HWY_NATIVE_STORE_N
1392#undef HWY_NATIVE_STORE_N
1393#else
1394#define HWY_NATIVE_STORE_N
1395#endif
1396
1397template <class D>
1399 size_t max_lanes_to_store) {
1400 const size_t N = Lanes(d);
1401 const size_t num_of_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
1402 CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
1403}
1404
1405// ------------------------------ LoadInterleaved2/3/4
1406
1407// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
1408// We implement those here because scalar code is likely faster than emulation
1409// via shuffles.
1410#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1411#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1412#else
1413#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1414#endif
1415
1416template <class D, typename T = TFromD<D>>
1417HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
1418 VFromD<D>& v0, VFromD<D>& v1) {
1419 alignas(16) T buf0[MaxLanes(d)];
1420 alignas(16) T buf1[MaxLanes(d)];
1421 for (size_t i = 0; i < MaxLanes(d); ++i) {
1422 buf0[i] = *unaligned++;
1423 buf1[i] = *unaligned++;
1424 }
1425 v0 = Load(d, buf0);
1426 v1 = Load(d, buf1);
1427}
1428
1429template <class D, typename T = TFromD<D>>
1430HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
1431 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1432 alignas(16) T buf0[MaxLanes(d)];
1433 alignas(16) T buf1[MaxLanes(d)];
1434 alignas(16) T buf2[MaxLanes(d)];
1435 for (size_t i = 0; i < MaxLanes(d); ++i) {
1436 buf0[i] = *unaligned++;
1437 buf1[i] = *unaligned++;
1438 buf2[i] = *unaligned++;
1439 }
1440 v0 = Load(d, buf0);
1441 v1 = Load(d, buf1);
1442 v2 = Load(d, buf2);
1443}
1444
1445template <class D, typename T = TFromD<D>>
1446HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
1447 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1448 VFromD<D>& v3) {
1449 alignas(16) T buf0[MaxLanes(d)];
1450 alignas(16) T buf1[MaxLanes(d)];
1451 alignas(16) T buf2[MaxLanes(d)];
1452 alignas(16) T buf3[MaxLanes(d)];
1453 for (size_t i = 0; i < MaxLanes(d); ++i) {
1454 buf0[i] = *unaligned++;
1455 buf1[i] = *unaligned++;
1456 buf2[i] = *unaligned++;
1457 buf3[i] = *unaligned++;
1458 }
1459 v0 = Load(d, buf0);
1460 v1 = Load(d, buf1);
1461 v2 = Load(d, buf2);
1462 v3 = Load(d, buf3);
1463}
1464
1465// ------------------------------ StoreInterleaved2/3/4
1466
1467template <class D>
1469 TFromD<D>* HWY_RESTRICT unaligned) {
1470 for (size_t i = 0; i < MaxLanes(d); ++i) {
1471 *unaligned++ = v0.raw[i];
1472 *unaligned++ = v1.raw[i];
1473 }
1474}
1475
1476template <class D>
1478 TFromD<D>* HWY_RESTRICT unaligned) {
1479 for (size_t i = 0; i < MaxLanes(d); ++i) {
1480 *unaligned++ = v0.raw[i];
1481 *unaligned++ = v1.raw[i];
1482 *unaligned++ = v2.raw[i];
1483 }
1484}
1485
1486template <class D>
1488 VFromD<D> v3, D d,
1489 TFromD<D>* HWY_RESTRICT unaligned) {
1490 for (size_t i = 0; i < MaxLanes(d); ++i) {
1491 *unaligned++ = v0.raw[i];
1492 *unaligned++ = v1.raw[i];
1493 *unaligned++ = v2.raw[i];
1494 *unaligned++ = v3.raw[i];
1495 }
1496}
1497
1498// ------------------------------ Stream
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  // No non-temporal stores in scalar emulation; equivalent to a normal Store.
  Store(v, d, aligned);
}
1503
1504// ------------------------------ Scatter in generic_ops-inl.h
1505// ------------------------------ Gather in generic_ops-inl.h
1506
1507// ================================================== CONVERT
1508
1509// ConvertTo and DemoteTo with floating-point input and integer output truncate
1510// (rounding toward zero).
1511
1512namespace detail {
1513
1514template <class ToT, class FromT>
1516 // Prevent ubsan errors when converting float to narrower integer
1517
1518 using FromTU = MakeUnsigned<FromT>;
1519 using ToTU = MakeUnsigned<ToT>;
1520
1521 constexpr unsigned kMaxExpField =
1522 static_cast<unsigned>(MaxExponentField<FromT>());
1523 constexpr unsigned kExpBias = kMaxExpField >> 1;
1524 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1525 kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1526 kMaxExpField));
1527
1528 // If ToT is signed, compare only the exponent bits of val against
1529 // kMinOutOfRangeExpField.
1530 //
1531 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1532 // val against kMinOutOfRangeExpField as a negative value is outside of the
1533 // range of an unsigned integer type.
1534 const FromT val_to_compare =
1535 static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1536
1537 // val is within the range of ToT if
1538 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1539 // than kMinOutOfRangeExpField
1540 //
1541 // Otherwise, val is either outside of the range of ToT or equal to
1542 // LimitsMin<ToT>() if
1543 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1544 // than or equal to kMinOutOfRangeExpField.
1545
1546 return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1547 MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1548 ? static_cast<ToT>(val)
1549 : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
1550 static_cast<ToTU>(ScalarSignBit(val)));
1551}
1552
template <class ToT, class ToTypeTag, class FromT>
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
  // Generic fallback: for promotions (ToT wider than FromT) a direct scalar
  // conversion is safe; float->int overloads below handle range checks.
  return ConvertScalarTo<ToT>(val);
}
1557
1558template <class ToT>
1560 float val) {
1561 return CastValueForF2IConv<ToT>(val);
1562}
1563
1564template <class ToT>
1566 float val) {
1567 return CastValueForF2IConv<ToT>(val);
1568}
1569// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
1570// returns static_cast<ToT>(val)
1571//
1572// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
1573// implementation-defined result if val is not within the range of ToT.
1574template <class ToT, class FromT>
1576 // Prevent ubsan errors when converting float to narrower integer
1577
1578 using FromTU = MakeUnsigned<FromT>;
1579
1580 constexpr unsigned kMaxExpField =
1581 static_cast<unsigned>(MaxExponentField<FromT>());
1582 constexpr unsigned kExpBias = kMaxExpField >> 1;
1583 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1584 kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1585 kMaxExpField));
1586
1587 // If ToT is signed, compare only the exponent bits of val against
1588 // kMinOutOfRangeExpField.
1589 //
1590 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1591 // val against kMinOutOfRangeExpField as a negative value is outside of the
1592 // range of an unsigned integer type.
1593 const FromT val_to_compare =
1594 static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1595
1596 // val is within the range of ToT if
1597 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1598 // than kMinOutOfRangeExpField
1599 //
1600 // Otherwise, val is either outside of the range of ToT or equal to
1601 // LimitsMin<ToT>() if
1602 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1603 // than or equal to kMinOutOfRangeExpField.
1604
1605 return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1606 MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1607 ? static_cast<ToT>(val)
1608 : static_cast<ToT>(LimitsMin<ToT>());
1609}
1610
1611} // namespace detail
1612
1613template <class DTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)>
1615 static_assert(sizeof(TFromD<DTo>) > sizeof(TFrom), "Not promoting");
1616 VFromD<DTo> ret;
1617 for (size_t i = 0; i < MaxLanes(d); ++i) {
1618 // For bits Y > X, floatX->floatY and intX->intY are always representable.
1619 ret.raw[i] = detail::CastValueForPromoteTo<TFromD<DTo>>(
1620 hwy::TypeTag<TFromD<DTo>>(), from.raw[i]);
1621 }
1622 return ret;
1623}
1624
1625#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1626#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1627#else
1628#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1629#endif
1630
1631template <class D64, HWY_IF_UI64_D(D64)>
1633 VFromD<D64> ret;
1634 for (size_t i = 0; i < MaxLanes(d64); ++i) {
1635 ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
1636 }
1637 return ret;
1638}
1639
1640// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1641// so we overload for TFrom=double and ToT={float,int32_t}.
1642template <class D, HWY_IF_F32_D(D)>
1644 VFromD<D> ret;
1645 for (size_t i = 0; i < MaxLanes(d); ++i) {
1646 // Prevent ubsan errors when converting float to narrower integer/float
1647 if (ScalarIsInf(from.raw[i]) ||
1648 ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
1649 ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
1651 continue;
1652 }
1653 ret.raw[i] = static_cast<float>(from.raw[i]);
1654 }
1655 return ret;
1656}
template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // double -> i32/u32: CastValueForF2IConv prevents ubsan errors (UB) for
    // out-of-range inputs by saturating to the limits of the target type.
    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
  }
  return ret;
}
1666
1667template <class DTo, typename TFrom, size_t N, HWY_IF_SIGNED(TFrom),
1668 HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
1670 using TTo = TFromD<DTo>;
1671 static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1672
1673 VFromD<DTo> ret;
1674 for (size_t i = 0; i < N; ++i) {
1675 // Int to int: choose closest value in ToT to `from` (avoids UB)
1676 from.raw[i] =
1677 HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw[i]), LimitsMax<TTo>());
1678 ret.raw[i] = static_cast<TTo>(from.raw[i]);
1679 }
1680 return ret;
1681}
1682
1683// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
1684// implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
1685// target-specific implementations of the unsigned to signed DemoteTo and
1686// ReorderDemote2To ops
1687
1688// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
1689// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
1690// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
1691// SFINAE to occur instead of a hard error due to a dependency on the V template
1692// argument
1693#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
1694#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
1695 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
1696
1697template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
1699HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1700 using TTo = TFromD<DTo>;
1701 static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1702
1703 const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
1704
1705 VFromD<DTo> ret;
1706 for (size_t i = 0; i < N; ++i) {
1707 // Int to int: choose closest value in ToT to `from` (avoids UB)
1708 ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
1709 }
1710 return ret;
1711}
1712
1713template <class DTo, typename TFrom, size_t N, HWY_IF_UI64(TFrom),
1714 HWY_IF_F32_D(DTo)>
1715HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1716 using TTo = TFromD<DTo>;
1717 static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1718
1719 VFromD<DTo> ret;
1720 for (size_t i = 0; i < N; ++i) {
1721 // int64_t/uint64_t to float: okay to cast to float as an int64_t/uint64_t
1722 // value is always within the range of a float
1723 ret.raw[i] = static_cast<TTo>(from.raw[i]);
1724 }
1725 return ret;
1726}
1727
template <class DBF16, HWY_IF_BF16_D(DBF16), class VF32>
HWY_API VFromD<DBF16> ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) {
  // Packs two f32 vectors into bf16 lanes. bf16 is the upper 16 bits of an
  // f32, so this is a truncating (round-toward-zero) demote.
  const Repartition<uint32_t, decltype(dbf16)> du32;
  // Move b's upper 16 bits into the lower half of each u32 lane.
  const VFromD<decltype(du32)> b_in_lower = ShiftRight<16>(BitCast(du32, b));
  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
  const VFromD<decltype(du32)> a_mask = Set(du32, 0xFFFF0000);
  // Each u32 lane = (a's top 16 bits) | (b's top 16 bits shifted down).
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}
1736
1737template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
1738 HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1739 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1741 const RepartitionToWide<decltype(dn)> dw;
1742 const size_t NW = Lanes(dw);
1743 using TN = TFromD<DN>;
1744 const TN min = LimitsMin<TN>();
1745 const TN max = LimitsMax<TN>();
1746 VFromD<DN> ret;
1747 for (size_t i = 0; i < NW; ++i) {
1748 ret.raw[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
1749 }
1750 for (size_t i = 0; i < NW; ++i) {
1751 ret.raw[NW + i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
1752 }
1753 return ret;
1754}
1755
1756template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
1757 HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1758 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1759HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1760 const RepartitionToWide<decltype(dn)> dw;
1761 const size_t NW = Lanes(dw);
1762 using TN = TFromD<DN>;
1763 using TN_U = MakeUnsigned<TN>;
1764 const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
1765 VFromD<DN> ret;
1766 for (size_t i = 0; i < NW; ++i) {
1767 ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
1768 }
1769 for (size_t i = 0; i < NW; ++i) {
1770 ret.raw[NW + i] = static_cast<TN>(HWY_MIN(b.raw[i], max));
1771 }
1772 return ret;
1773}
1774
1775template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
1776 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
1777 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1778 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1780 return ReorderDemote2To(dn, a, b);
1781}
1782
1783template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
1784 HWY_IF_F32_D(DFromV<V>),
1785 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1787 const size_t NW = Lanes(dn) / 2;
1788 using TN = TFromD<DN>;
1789 VFromD<DN> ret;
1790 for (size_t i = 0; i < NW; ++i) {
1791 ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
1792 }
1793 for (size_t i = 0; i < NW; ++i) {
1794 ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
1795 }
1796 return ret;
1797}
1798
1799namespace detail {
1800
1801HWY_INLINE void StoreU16ToF16(const uint16_t val,
1803 CopySameSize(&val, to);
1804}
1805
1807 uint16_t bits16;
1808 CopySameSize(from, &bits16);
1809 return bits16;
1810}
1811
1812} // namespace detail
1813
1814template <class D, HWY_IF_F32_D(D), size_t N>
1816 VFromD<D> ret;
1817 for (size_t i = 0; i < N; ++i) {
1818 ret.raw[i] = F32FromBF16(v.raw[i]);
1819 }
1820 return ret;
1821}
1822
1823#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
1824#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
1825#else
1826#define HWY_NATIVE_DEMOTE_F32_TO_BF16
1827#endif
1828
1829template <class D, HWY_IF_BF16_D(D), size_t N>
1831 VFromD<D> ret;
1832 for (size_t i = 0; i < N; ++i) {
1833 ret.raw[i] = BF16FromF32(v.raw[i]);
1834 }
1835 return ret;
1836}
1837
1838#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1839#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1840#else
1841#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1842#endif
1843
1844template <class D32, HWY_IF_UI32_D(D32)>
1846 VFromD<D32> ret;
1847 for (size_t i = 0; i < MaxLanes(d32); ++i) {
1848 ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
1849 }
1850 return ret;
1851}
1852
1853// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1854namespace detail {
1855
1856template <typename TFrom, typename DTo>
1858 Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1859 using ToT = TFromD<DTo>;
1860 static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
1861 VFromD<DTo> ret;
1862 constexpr size_t N = HWY_MAX_LANES_D(DTo);
1863
1864 for (size_t i = 0; i < N; ++i) {
1865 // float## -> int##: return closest representable value
1866 ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
1867 }
1868 return ret;
1869}
1870
1871template <typename TFrom, typename DTo>
1873 Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1874 using ToT = TFromD<DTo>;
1875 static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
1876 VFromD<DTo> ret;
1877 constexpr size_t N = HWY_MAX_LANES_D(DTo);
1878 for (size_t i = 0; i < N; ++i) {
1879 // int## -> float##: no check needed
1880 ret.raw[i] = static_cast<ToT>(from.raw[i]);
1881 }
1882 return ret;
1883}
1884
1885} // namespace detail
1886
1887template <class DTo, typename TFrom>
1889 return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
1890}
1891
1892#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1893#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1894#else
1895#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1896#endif
1897
1898template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
1899 HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
1901 VFromD<DI> ret;
1902 for (size_t i = 0; i < MaxLanes(di); i++) {
1903 ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
1904 }
1905 return ret;
1906}
1907
1908template <size_t N>
1912
1913// ------------------------------ Truncations
1914
1915template <class D, HWY_IF_U8_D(D), size_t N>
1917 VFromD<D> ret;
1918 for (size_t i = 0; i < N; ++i) {
1919 ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
1920 }
1921 return ret;
1922}
1923
1924template <class D, HWY_IF_U16_D(D), size_t N>
1925HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
1926 VFromD<D> ret;
1927 for (size_t i = 0; i < N; ++i) {
1928 ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
1929 }
1930 return ret;
1931}
1932
1933template <class D, HWY_IF_U32_D(D), size_t N>
1934HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
1935 VFromD<D> ret;
1936 for (size_t i = 0; i < N; ++i) {
1937 ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
1938 }
1939 return ret;
1940}
1941
1942template <class D, HWY_IF_U8_D(D), size_t N>
1944 VFromD<D> ret;
1945 for (size_t i = 0; i < N; ++i) {
1946 ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
1947 }
1948 return ret;
1949}
1950
1951template <class D, HWY_IF_U16_D(D), size_t N>
1952HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
1953 VFromD<D> ret;
1954 for (size_t i = 0; i < N; ++i) {
1955 ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
1956 }
1957 return ret;
1958}
1959
1960template <class D, HWY_IF_U8_D(D), size_t N>
1962 VFromD<D> ret;
1963 for (size_t i = 0; i < N; ++i) {
1964 ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
1965 }
1966 return ret;
1967}
1968
1969#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1970#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1971#else
1972#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1973#endif
1974
1975template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1976 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1977 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1979 const RepartitionToWide<decltype(dn)> dw;
1980 const size_t NW = Lanes(dw);
1981 using TW = TFromD<decltype(dw)>;
1982 using TN = TFromD<decltype(dn)>;
1983 VFromD<DN> ret;
1984 constexpr TW max_val{LimitsMax<TN>()};
1985
1986 for (size_t i = 0; i < NW; ++i) {
1987 ret.raw[i] = static_cast<TN>(a.raw[i] & max_val);
1988 }
1989 for (size_t i = 0; i < NW; ++i) {
1990 ret.raw[NW + i] = static_cast<TN>(b.raw[i] & max_val);
1991 }
1992 return ret;
1993}
1994
1995// ================================================== COMBINE
1996
1997template <typename T, size_t N>
1999 Vec128<T, N / 2> ret;
2000 CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
2001 return ret;
2002}
2003
2004template <class D>
2006 return LowerHalf(v);
2007}
2008
2009template <class D>
2011 VFromD<D> ret;
2012 CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
2013 return ret;
2014}
2015
template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;  // zero-initialized
  // Copy `v` into the lower half; the upper half remains zero.
  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
  return ret;
}
2023
template <class D, class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  // Concatenates two half vectors: lo_half fills lanes [0, N/2),
  // hi_half fills lanes [N/2, N).
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}
2032
2033template <class D>
2035 const Half<decltype(d)> dh;
2036 VFromD<D> ret;
2037 CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
2038 CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
2039 return ret;
2040}
2041
2042template <class D>
2044 const Half<decltype(d)> dh;
2045 VFromD<D> ret;
2046 CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
2047 CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
2048 return ret;
2049}
2050
2051template <class D>
2053 const Half<decltype(d)> dh;
2054 VFromD<D> ret;
2055 CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
2056 CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
2057 return ret;
2058}
2059
2060template <class D>
2062 const Half<decltype(d)> dh;
2063 VFromD<D> ret;
2064 CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
2065 CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
2066 return ret;
2067}
2068
2069template <class D>
2071 const Half<decltype(d)> dh;
2072 VFromD<D> ret;
2073 for (size_t i = 0; i < MaxLanes(dh); ++i) {
2074 ret.raw[i] = lo.raw[2 * i];
2075 }
2076 for (size_t i = 0; i < MaxLanes(dh); ++i) {
2077 ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
2078 }
2079 return ret;
2080}
2081
2082// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
2083// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
2084#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
2085#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
2086#else
2087#define HWY_EMU128_CONCAT_INLINE HWY_API
2088#endif
2089
2090template <class D>
2092 const Half<decltype(d)> dh;
2093 VFromD<D> ret;
2094 for (size_t i = 0; i < MaxLanes(dh); ++i) {
2095 ret.raw[i] = lo.raw[2 * i + 1];
2096 }
2097 for (size_t i = 0; i < MaxLanes(dh); ++i) {
2098 ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
2099 }
2100 return ret;
2101}
2102
2103// ------------------------------ CombineShiftRightBytes
2104template <int kBytes, class D>
2106 VFromD<D> ret;
2107 const uint8_t* HWY_RESTRICT lo8 =
2108 reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
2109 uint8_t* HWY_RESTRICT ret8 =
2110 reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2111 CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
2112 CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
2113 return ret;
2114}
2115
2116// ------------------------------ ShiftLeftBytes
2117
2118template <int kBytes, class D>
2120 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2121 VFromD<D> ret;
2122 uint8_t* HWY_RESTRICT ret8 =
2123 reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2124 ZeroBytes<kBytes>(ret8);
2125 CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
2126 return ret;
2127}
2128
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  // Tag-less convenience overload: derive the descriptor from the vector.
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}
2133
2134// ------------------------------ ShiftLeftLanes
2135
2136template <int kLanes, class D, typename T = TFromD<D>>
2138 const Repartition<uint8_t, decltype(d)> d8;
2139 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2140}
2141
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  // Tag-less convenience overload: derive the descriptor from the vector.
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}
2146
2147// ------------------------------ ShiftRightBytes
2148template <int kBytes, class D>
2150 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2151 VFromD<D> ret;
2152 const uint8_t* HWY_RESTRICT v8 =
2153 reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2154 uint8_t* HWY_RESTRICT ret8 =
2155 reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2156 CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
2157 ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
2158 return ret;
2159}
2160
2161// ------------------------------ ShiftRightLanes
2162template <int kLanes, class D>
2164 const Repartition<uint8_t, decltype(d)> d8;
2165 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2166 return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
2167}
2168
2169// ================================================== SWIZZLE
2170
2171template <typename T, size_t N>
2173 return v.raw[0];
2174}
2175
2176template <typename T, size_t N>
2178 v.raw[i] = t;
2179 return v;
2180}
2181
2182template <typename T, size_t N>
2184 return v.raw[i];
2185}
2186
2187template <typename T, size_t N>
2189 for (size_t i = 0; i < N; i += 2) {
2190 v.raw[i + 1] = v.raw[i];
2191 }
2192 return v;
2193}
2194
2195template <typename T, size_t N>
2197 for (size_t i = 0; i < N; i += 2) {
2198 v.raw[i] = v.raw[i + 1];
2199 }
2200 return v;
2201}
2202
2203template <typename T, size_t N>
2204HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
2205 for (size_t i = 0; i < N; i += 2) {
2206 odd.raw[i] = even.raw[i];
2207 }
2208 return odd;
2209}
2210
2211template <class D>
2213 constexpr size_t N = HWY_MAX_LANES_D(D);
2214 for (size_t i = 1; i < N; i += 2) {
2215 a.raw[i] = b.raw[i - 1];
2216 }
2217 return a;
2218}
2219
2220template <class D>
2222 constexpr size_t N = HWY_MAX_LANES_D(D);
2223 for (size_t i = 1; i < N; i += 2) {
2224 b.raw[i - 1] = a.raw[i];
2225 }
2226 return b;
2227}
2228
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  // At most one 128-bit block, and block 0 is even, so return `even`.
  return even;
}
2233
2234// ------------------------------ SwapAdjacentBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  // Single 128-bit block: nothing to swap.
  return v;
}
2239
2240// ------------------------------ TableLookupLanes
2241
2242// Returned by SetTableIndices for use by TableLookupLanes.
2243template <typename T, size_t N>
2244struct Indices128 {
2246};
2247
2248template <class D, typename TI, size_t N>
2250 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size must match");
2251 Indices128<TFromD<D>, N> ret;
2252 CopyBytes<d.MaxBytes()>(vec.raw, ret.raw);
2253 return ret;
2254}
2255
2256template <class D, typename TI>
2258 D d, const TI* idx) {
2259 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
2260}
2261
2262template <typename T, size_t N>
2263HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2264 Vec128<T, N> ret;
2265 for (size_t i = 0; i < N; ++i) {
2266 ret.raw[i] = v.raw[idx.raw[i]];
2267 }
2268 return ret;
2269}
2270
2271template <typename T, size_t N>
2273 Indices128<T, N> idx) {
2274 using TI = MakeSigned<T>;
2275 Vec128<T, N> ret;
2276 constexpr TI kVecLaneIdxMask = static_cast<TI>(N - 1);
2277 for (size_t i = 0; i < N; ++i) {
2278 const auto src_idx = idx.raw[i];
2279 const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask;
2280 ret.raw[i] = (src_idx < static_cast<TI>(N)) ? a.raw[masked_src_lane_idx]
2281 : b.raw[masked_src_lane_idx];
2282 }
2283 return ret;
2284}
2285
2286// ------------------------------ ReverseBlocks
2287template <class D>
2289 return v; // Single block: no change
2290}
2291
2292// ------------------------------ Reverse
2293
2294template <class D>
2296 VFromD<D> ret;
2297 for (size_t i = 0; i < MaxLanes(d); ++i) {
2298 ret.raw[i] = v.raw[MaxLanes(d) - 1 - i];
2299 }
2300 return ret;
2301}
2302
2303// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
2304#ifdef HWY_NATIVE_REVERSE2_8
2305#undef HWY_NATIVE_REVERSE2_8
2306#else
2307#define HWY_NATIVE_REVERSE2_8
2308#endif
2309
2310template <class D>
2312 VFromD<D> ret;
2313 for (size_t i = 0; i < MaxLanes(d); i += 2) {
2314 ret.raw[i + 0] = v.raw[i + 1];
2315 ret.raw[i + 1] = v.raw[i + 0];
2316 }
2317 return ret;
2318}
2319
2320template <class D>
2322 VFromD<D> ret;
2323 for (size_t i = 0; i < MaxLanes(d); i += 4) {
2324 ret.raw[i + 0] = v.raw[i + 3];
2325 ret.raw[i + 1] = v.raw[i + 2];
2326 ret.raw[i + 2] = v.raw[i + 1];
2327 ret.raw[i + 3] = v.raw[i + 0];
2328 }
2329 return ret;
2330}
2331
2332template <class D>
2334 VFromD<D> ret;
2335 for (size_t i = 0; i < MaxLanes(d); i += 8) {
2336 ret.raw[i + 0] = v.raw[i + 7];
2337 ret.raw[i + 1] = v.raw[i + 6];
2338 ret.raw[i + 2] = v.raw[i + 5];
2339 ret.raw[i + 3] = v.raw[i + 4];
2340 ret.raw[i + 4] = v.raw[i + 3];
2341 ret.raw[i + 5] = v.raw[i + 2];
2342 ret.raw[i + 6] = v.raw[i + 1];
2343 ret.raw[i + 7] = v.raw[i + 0];
2344 }
2345 return ret;
2346}
2347
2348// ------------------------------ SlideUpLanes
2349
2350template <class D>
2351HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
2352 VFromD<D> ret = Zero(d);
2353 constexpr size_t N = HWY_MAX_LANES_D(D);
2354 const size_t clamped_amt = HWY_MIN(amt, N);
2355 CopyBytes(v.raw, ret.raw + clamped_amt,
2356 (N - clamped_amt) * sizeof(TFromD<D>));
2357 return ret;
2358}
2359
2360// ------------------------------ SlideDownLanes
2361
2362template <class D>
2363HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
2364 VFromD<D> ret = Zero(d);
2365 constexpr size_t N = HWY_MAX_LANES_D(D);
2366 const size_t clamped_amt = HWY_MIN(amt, N);
2367 CopyBytes(v.raw + clamped_amt, ret.raw,
2368 (N - clamped_amt) * sizeof(TFromD<D>));
2369 return ret;
2370}
2371
2372// ================================================== BLOCKWISE
2373
2374// ------------------------------ Shuffle*
2375
2376// Swap 32-bit halves in 64-bit halves.
2377template <typename T, size_t N>
2379 static_assert(sizeof(T) == 4, "Only for 32-bit");
2380 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2381 return Reverse2(DFromV<decltype(v)>(), v);
2382}
2383
2384// Swap 64-bit halves
2385template <typename T>
2386HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
2387 static_assert(sizeof(T) == 4, "Only for 32-bit");
2388 Vec128<T> ret;
2389 ret.raw[3] = v.raw[1];
2390 ret.raw[2] = v.raw[0];
2391 ret.raw[1] = v.raw[3];
2392 ret.raw[0] = v.raw[2];
2393 return ret;
2394}
2395template <typename T>
2396HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
2397 static_assert(sizeof(T) == 8, "Only for 64-bit");
2398 return Reverse2(DFromV<decltype(v)>(), v);
2399}
2400
2401// Rotate right 32 bits
2402template <typename T>
2403HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
2404 Vec128<T> ret;
2405 ret.raw[3] = v.raw[0];
2406 ret.raw[2] = v.raw[3];
2407 ret.raw[1] = v.raw[2];
2408 ret.raw[0] = v.raw[1];
2409 return ret;
2410}
2411
2412// Rotate left 32 bits
2413template <typename T>
2414HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
2415 Vec128<T> ret;
2416 ret.raw[3] = v.raw[2];
2417 ret.raw[2] = v.raw[1];
2418 ret.raw[1] = v.raw[0];
2419 ret.raw[0] = v.raw[3];
2420 return ret;
2421}
2422
2423template <typename T>
2424HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
2425 return Reverse4(DFromV<decltype(v)>(), v);
2426}
2427
2428// ------------------------------ Broadcast
2429template <int kLane, typename T, size_t N>
2431 for (size_t i = 0; i < N; ++i) {
2432 v.raw[i] = v.raw[kLane];
2433 }
2434 return v;
2435}
2436
2437// ------------------------------ TableLookupBytes, TableLookupBytesOr0
2438
2439template <typename T, size_t N, typename TI, size_t NI>
2442 const uint8_t* HWY_RESTRICT v_bytes =
2443 reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2444 const uint8_t* HWY_RESTRICT idx_bytes =
2445 reinterpret_cast<const uint8_t*>(indices.raw);
2446 Vec128<TI, NI> ret;
2447 uint8_t* HWY_RESTRICT ret_bytes =
2448 reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2449 for (size_t i = 0; i < NI * sizeof(TI); ++i) {
2450 const size_t idx = idx_bytes[i];
2451 // Avoid out of bounds reads.
2452 ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
2453 }
2454 return ret;
2455}
2456
2457template <typename T, size_t N, typename TI, size_t NI>
2460 // Same as TableLookupBytes, which already returns 0 if out of bounds.
2461 return TableLookupBytes(v, indices);
2462}
2463
2464// ------------------------------ InterleaveLower/InterleaveUpper
2465
2466template <typename T, size_t N>
2468 Vec128<T, N> ret;
2469 for (size_t i = 0; i < N / 2; ++i) {
2470 ret.raw[2 * i + 0] = a.raw[i];
2471 ret.raw[2 * i + 1] = b.raw[i];
2472 }
2473 return ret;
2474}
2475
2476// Additional overload for the optional tag.
2477template <class D>
2479 return InterleaveLower(a, b);
2480}
2481
2482template <class D>
2484 const Half<decltype(d)> dh;
2485 VFromD<D> ret;
2486 for (size_t i = 0; i < MaxLanes(dh); ++i) {
2487 ret.raw[2 * i + 0] = a.raw[MaxLanes(dh) + i];
2488 ret.raw[2 * i + 1] = b.raw[MaxLanes(dh) + i];
2489 }
2490 return ret;
2491}
2492
2493// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2494
2495// Same as Interleave*, except that the return lanes are double-width integers;
2496// this is necessary because the single-lane scalar cannot return two values.
2497template <class V, class DW = RepartitionToWide<DFromV<V>>>
2498HWY_API VFromD<DW> ZipLower(V a, V b) {
2499 return BitCast(DW(), InterleaveLower(a, b));
2500}
2501template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2502HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2503 return BitCast(dw, InterleaveLower(D(), a, b));
2504}
2505
2506template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2507HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2508 return BitCast(dw, InterleaveUpper(D(), a, b));
2509}
2510
2511// ================================================== MASK
2512
2513template <class D>
2514HWY_API bool AllFalse(D d, MFromD<D> mask) {
2515 typename MFromD<D>::Raw or_sum = 0;
2516 for (size_t i = 0; i < MaxLanes(d); ++i) {
2517 or_sum |= mask.bits[i];
2518 }
2519 return or_sum == 0;
2520}
2521
2522template <class D>
2524 constexpr uint64_t kAll = LimitsMax<typename MFromD<D>::Raw>();
2525 uint64_t and_sum = kAll;
2526 for (size_t i = 0; i < MaxLanes(d); ++i) {
2527 and_sum &= mask.bits[i];
2528 }
2529 return and_sum == kAll;
2530}
2531
2532// `p` points to at least 8 readable bytes, not all of which need be valid.
2533template <class D>
2535 MFromD<D> m;
2536 for (size_t i = 0; i < MaxLanes(d); ++i) {
2537 const size_t bit = size_t{1} << (i & 7);
2538 const size_t idx_byte = i >> 3;
2539 m.bits[i] = MFromD<D>::FromBool((bits[idx_byte] & bit) != 0);
2540 }
2541 return m;
2542}
2543
2544template <class D>
2545HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
2546 MFromD<D> m;
2547 for (size_t i = 0; i < MaxLanes(d); ++i) {
2548 m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
2549 }
2550 return m;
2551}
2552
2553// `p` points to at least 8 writable bytes.
2554template <class D>
2555HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
2556 bits[0] = 0;
2557 if (MaxLanes(d) > 8) bits[1] = 0; // MaxLanes(d) <= 16, so max two bytes
2558 for (size_t i = 0; i < MaxLanes(d); ++i) {
2559 const size_t bit = size_t{1} << (i & 7);
2560 const size_t idx_byte = i >> 3;
2561 if (mask.bits[i]) {
2562 bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
2563 }
2564 }
2565 return MaxLanes(d) > 8 ? 2 : 1;
2566}
2567
2568template <class D>
2569HWY_API size_t CountTrue(D d, MFromD<D> mask) {
2570 size_t count = 0;
2571 for (size_t i = 0; i < MaxLanes(d); ++i) {
2572 count += mask.bits[i] != 0;
2573 }
2574 return count;
2575}
2576
2577template <class D>
2578HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
2579 for (size_t i = 0; i < MaxLanes(d); ++i) {
2580 if (mask.bits[i] != 0) return i;
2581 }
2582 HWY_DASSERT(false);
2583 return 0;
2584}
2585
2586template <class D>
2587HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
2588 for (size_t i = 0; i < MaxLanes(d); ++i) {
2589 if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
2590 }
2591 return intptr_t{-1};
2592}
2593
2594template <class D>
2595HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
2596 for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
2597 if (mask.bits[i] != 0) return static_cast<size_t>(i);
2598 }
2599 HWY_DASSERT(false);
2600 return 0;
2601}
2602
2603template <class D>
2604HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
2605 for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
2606 if (mask.bits[i] != 0) return i;
2607 }
2608 return intptr_t{-1};
2609}
2610
// ------------------------------ Compress

// Compress here keeps the unselected lanes after the selected ones (a stable
// partition) for all lane sizes except 1 byte.
template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};
2617
2618template <typename T, size_t N>
2620 size_t count = 0;
2621 Vec128<T, N> ret;
2622 for (size_t i = 0; i < N; ++i) {
2623 if (mask.bits[i]) {
2624 ret.raw[count++] = v.raw[i];
2625 }
2626 }
2627 for (size_t i = 0; i < N; ++i) {
2628 if (!mask.bits[i]) {
2629 ret.raw[count++] = v.raw[i];
2630 }
2631 }
2632 HWY_DASSERT(count == N);
2633 return ret;
2634}
2635
2636// ------------------------------ Expand
2637
2638// Could also just allow generic_ops-inl.h to implement these, but use our
2639// simple implementation below to ensure the test is correct.
2640#ifdef HWY_NATIVE_EXPAND
2641#undef HWY_NATIVE_EXPAND
2642#else
2643#define HWY_NATIVE_EXPAND
2644#endif
2645
2646template <typename T, size_t N>
2648 size_t in_pos = 0;
2649 Vec128<T, N> ret;
2650 for (size_t i = 0; i < N; ++i) {
2651 if (mask.bits[i]) {
2652 ret.raw[i] = v.raw[in_pos++];
2653 } else {
2654 ret.raw[i] = ConvertScalarTo<T>(0);
2655 }
2656 }
2657 return ret;
2658}
2659
2660// ------------------------------ LoadExpand
2661
2662template <class D>
2663HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
2664 const TFromD<D>* HWY_RESTRICT unaligned) {
2665 size_t in_pos = 0;
2666 VFromD<D> ret;
2667 for (size_t i = 0; i < Lanes(d); ++i) {
2668 if (mask.bits[i]) {
2669 ret.raw[i] = unaligned[in_pos++];
2670 } else {
2671 ret.raw[i] = TFromD<D>(); // zero, also works for float16_t
2672 }
2673 }
2674 return ret;
2675}
2676
2677// ------------------------------ CompressNot
2678template <typename T, size_t N>
2680 size_t count = 0;
2681 Vec128<T, N> ret;
2682 for (size_t i = 0; i < N; ++i) {
2683 if (!mask.bits[i]) {
2684 ret.raw[count++] = v.raw[i];
2685 }
2686 }
2687 for (size_t i = 0; i < N; ++i) {
2688 if (mask.bits[i]) {
2689 ret.raw[count++] = v.raw[i];
2690 }
2691 }
2692 HWY_DASSERT(count == N);
2693 return ret;
2694}
2695
2696// ------------------------------ CompressBlocksNot
2697HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
2698 Mask128<uint64_t> /* m */) {
2699 return v;
2700}
2701
2702// ------------------------------ CompressBits
2703template <typename T, size_t N>
2705 const uint8_t* HWY_RESTRICT bits) {
2706 return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
2707}
2708
2709// ------------------------------ CompressStore
2710
2711// generic_ops-inl defines the 8-bit versions.
2712template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2713HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
2714 TFromD<D>* HWY_RESTRICT unaligned) {
2715 size_t count = 0;
2716 for (size_t i = 0; i < MaxLanes(d); ++i) {
2717 if (mask.bits[i]) {
2718 unaligned[count++] = v.raw[i];
2719 }
2720 }
2721 return count;
2722}
2723
2724// ------------------------------ CompressBlendedStore
2725template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2726HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> mask, D d,
2727 TFromD<D>* HWY_RESTRICT unaligned) {
2728 return CompressStore(v, mask, d, unaligned);
2729}
2730
2731// ------------------------------ CompressBitsStore
2732template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2733HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2734 D d, TFromD<D>* HWY_RESTRICT unaligned) {
2735 const MFromD<D> mask = LoadMaskBits(d, bits);
2736 StoreU(Compress(v, mask), d, unaligned);
2737 return CountTrue(d, mask);
2738}
2739
2740// ------------------------------ Additional mask logical operations
2741template <class T>
2742HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
2743 return mask;
2744}
2745
2746template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
2748 using TU = hwy::MakeUnsigned<T>;
2749
2750 Mask128<T, N> result;
2751 TU result_lane_mask{0};
2752 for (size_t i = 0; i < N; i++) {
2753 result_lane_mask = static_cast<TU>(result_lane_mask | mask.bits[i]);
2754 result.bits[i] = result_lane_mask;
2755 }
2756 return result;
2757}
2758
2759template <class T, size_t N>
2760HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
2761 return Not(SetAtOrAfterFirst(mask));
2762}
2763
2764template <class T, size_t N>
2766 using TU = hwy::MakeUnsigned<T>;
2767 using TI = hwy::MakeSigned<T>;
2768
2769 Mask128<T, N> result;
2770 TU result_lane_mask = static_cast<TU>(~TU{0});
2771 for (size_t i = 0; i < N; i++) {
2772 const auto curr_lane_mask_bits = mask.bits[i];
2773 result.bits[i] = static_cast<TU>(curr_lane_mask_bits & result_lane_mask);
2774 result_lane_mask =
2775 static_cast<TU>(result_lane_mask &
2776 static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
2777 }
2778 return result;
2779}
2780
2781template <class T, size_t N>
2783 using TU = hwy::MakeUnsigned<T>;
2784 using TI = hwy::MakeSigned<T>;
2785
2786 Mask128<T, N> result;
2787 TU result_lane_mask = static_cast<TU>(~TU{0});
2788 for (size_t i = 0; i < N; i++) {
2789 result.bits[i] = result_lane_mask;
2790 result_lane_mask =
2791 static_cast<TU>(result_lane_mask &
2792 static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
2793 }
2794 return result;
2795}
2796
2797// ------------------------------ WidenMulPairwiseAdd
2798
2799template <class D, HWY_IF_F32_D(D), class VBF16>
2801 const Rebind<uint32_t, decltype(df32)> du32;
2802 using VU32 = VFromD<decltype(du32)>;
2803 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2804 // Avoid ZipLower/Upper so this also works on big-endian systems.
2805 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2806 const VU32 ao = And(BitCast(du32, a), odd);
2807 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2808 const VU32 bo = And(BitCast(du32, b), odd);
2809 return Mul(BitCast(df32, ae), BitCast(df32, be)) +
2810 Mul(BitCast(df32, ao), BitCast(df32, bo));
2811}
2812
2813template <class D, HWY_IF_I32_D(D), class VI16>
2814HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
2815 using VI32 = VFromD<decltype(d32)>;
2816 // Manual sign extension requires two shifts for even lanes.
2817 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2818 const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2819 const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2820 const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2821 return Add(Mul(ae, be), Mul(ao, bo));
2822}
2823
2824template <class D, HWY_IF_U32_D(D), class VU16>
2825HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
2826 const auto lo16_mask = Set(du32, 0x0000FFFFu);
2827
2828 const auto a0 = And(BitCast(du32, a), lo16_mask);
2829 const auto b0 = And(BitCast(du32, b), lo16_mask);
2830
2831 const auto a1 = ShiftRight<16>(BitCast(du32, a));
2832 const auto b1 = ShiftRight<16>(BitCast(du32, b));
2833
2834 return Add(Mul(a0, b0), Mul(a1, b1));
2835}
2836
2837// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2838
2839template <class D, HWY_IF_F32_D(D), size_t N, class VBF16>
2841 const Vec128<float, N> sum0,
2842 Vec128<float, N>& sum1) {
2843 const Rebind<uint32_t, decltype(df32)> du32;
2844 using VU32 = VFromD<decltype(du32)>;
2845 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2846 // Avoid ZipLower/Upper so this also works on big-endian systems.
2847 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2848 const VU32 ao = And(BitCast(du32, a), odd);
2849 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2850 const VU32 bo = And(BitCast(du32, b), odd);
2851 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
2852 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
2853}
2854
2855template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
2857 const Vec128<int32_t, N> sum0,
2858 Vec128<int32_t, N>& sum1) {
2859 using VI32 = VFromD<decltype(d32)>;
2860 // Manual sign extension requires two shifts for even lanes.
2861 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2862 const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2863 const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2864 const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2865 sum1 = Add(Mul(ao, bo), sum1);
2866 return Add(Mul(ae, be), sum0);
2867}
2868
2869template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
2871 const Vec128<uint32_t, N> sum0,
2872 Vec128<uint32_t, N>& sum1) {
2873 using VU32 = VFromD<decltype(du32)>;
2874 const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
2875 const VU32 ae = And(BitCast(du32, a), lo16_mask);
2876 const VU32 be = And(BitCast(du32, b), lo16_mask);
2877 const VU32 ao = ShiftRight<16>(BitCast(du32, a));
2878 const VU32 bo = ShiftRight<16>(BitCast(du32, b));
2879 sum1 = Add(Mul(ao, bo), sum1);
2880 return Add(Mul(ae, be), sum0);
2881}
2882
2883// ------------------------------ RearrangeToOddPlusEven
2884template <class VW>
2885HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
2886 return Add(sum0, sum1);
2887}
2888
2889// ================================================== REDUCTIONS
2890
2891#ifdef HWY_NATIVE_REDUCE_SCALAR
2892#undef HWY_NATIVE_REDUCE_SCALAR
2893#else
2894#define HWY_NATIVE_REDUCE_SCALAR
2895#endif
2896
2897template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2899 T sum = T{0};
2900 for (size_t i = 0; i < MaxLanes(d); ++i) {
2901 sum += v.raw[i];
2902 }
2903 return sum;
2904}
2905template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2907 T min = HighestValue<T>();
2908 for (size_t i = 0; i < MaxLanes(d); ++i) {
2909 min = HWY_MIN(min, v.raw[i]);
2910 }
2911 return min;
2912}
2913template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2915 T max = LowestValue<T>();
2916 for (size_t i = 0; i < MaxLanes(d); ++i) {
2917 max = HWY_MAX(max, v.raw[i]);
2918 }
2919 return max;
2920}
2921
2922// ------------------------------ SumOfLanes
2923
2924template <class D, HWY_IF_LANES_GT_D(D, 1)>
2926 return Set(d, ReduceSum(d, v));
2927}
2928template <class D, HWY_IF_LANES_GT_D(D, 1)>
2930 return Set(d, ReduceMin(d, v));
2931}
2932template <class D, HWY_IF_LANES_GT_D(D, 1)>
2934 return Set(d, ReduceMax(d, v));
2935}
2936
2937// ================================================== OPS WITH DEPENDENCIES
2938
2939// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2940
2941template <class T, HWY_IF_UI64(T)>
2943 alignas(16) T mul[2];
2944 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2945 return Load(Full128<T>(), mul);
2946}
2947
2948template <class T, HWY_IF_UI64(T)>
2950 alignas(16) T mul[2];
2951 const Half<Full128<T>> d2;
2952 mul[0] =
2953 Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2954 return Load(Full128<T>(), mul);
2955}
2956
2957// NOLINTNEXTLINE(google-readability-namespace-comments)
2958} // namespace HWY_NAMESPACE
2959} // namespace hwy
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
#define HWY_IF_SIGNED(T)
Definition base.h:622
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
Definition base.h:645
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
Definition base.h:635
#define HWY_IF_UNSIGNED(T)
Definition base.h:620
#define HWY_IF_UI64(T)
Definition base.h:687
Definition arm_neon-inl.h:865
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:867
static HWY_INLINE Raw FromBool(bool b)
Definition emu128-inl.h:80
Raw bits[16/sizeof(T)]
Definition emu128-inl.h:85
Definition arm_neon-inl.h:813
HWY_INLINE Vec128()=default
HWY_INLINE Vec128 & operator%=(const Vec128 other)
Definition emu128-inl.h:55
T PrivateT
Definition arm_neon-inl.h:816
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition emu128-inl.h:46
Raw raw
Definition arm_neon-inl.h:851
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition emu128-inl.h:52
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition emu128-inl.h:64
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition emu128-inl.h:61
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition emu128-inl.h:43
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition emu128-inl.h:58
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition emu128-inl.h:49
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_EMU128_CONCAT_INLINE
Definition emu128-inl.h:2087
HWY_API Vec128< T, N > Neg(hwy::NonFloatTag, Vec128< T, N > v)
Definition emu128-inl.h:744
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag, FromT val)
Definition emu128-inl.h:1554
HWY_API VFromD< DTo > ConvertTo(hwy::FloatTag, DTo, Vec128< TFrom, HWY_MAX_LANES_D(DTo)> from)
Definition emu128-inl.h:1857
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:689
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:560
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:681
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:570
HWY_INLINE VFromD< DTo > ZeroExtendResizeBitCast(FromSizeTag, ToSizeTag, DTo d_to, DFrom, VFromD< DFrom > v)
Definition emu128-inl.h:140
HWY_INLINE ToT CastValueForF2IConv(FromT val)
Definition emu128-inl.h:1515
HWY_INLINE Vec128< T, N > Mul(hwy::FloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:774
static HWY_INLINE float ScalarSqrt(float v)
Definition emu128-inl.h:959
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1556
HWY_INLINE void StoreU16ToF16(const uint16_t val, hwy::float16_t *HWY_RESTRICT to)
Definition emu128-inl.h:1801
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t *HWY_RESTRICT from)
Definition emu128-inl.h:1806
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val)
Definition emu128-inl.h:1575
HWY_API void LoadInterleaved4(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2, VFromD< D > &v3)
Definition arm_neon-inl.h:9128
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API VFromD< D > LoadNOr(VFromD< D > no, D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1362
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API void StoreN(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_store)
Definition emu128-inl.h:1398
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API TFromD< D > ReduceMax(D d, VFromD< D > v)
Definition arm_sve-inl.h:3213
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< DI > ConvertInRangeTo(DI di, VFromD< RebindToFloat< DI > > v)
Definition emu128-inl.h:1900
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API VFromD< D > PromoteInRangeTo(D d64, VFromD< Rebind< float, D > > v)
Definition arm_neon-inl.h:4497
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API VFromD< DN > OrderedTruncate2To(DN dn, V a, V b)
Definition emu128-inl.h:1978
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API void LoadInterleaved2(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1)
Definition arm_neon-inl.h:9049
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API VFromD< D32 > DemoteInRangeTo(D32 d32, VFromD< Rebind< double, D32 > > v)
Definition emu128-inl.h:1845
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
VFromD< ScalableTag< bfloat16_t > > VBF16
Definition arm_sve-inl.h:410
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
Definition abort.h:8
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue< float >()
Definition base.h:2203
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment)
Definition base.h:2676
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf)
Definition base.h:1778
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:346
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val)
Definition base.h:2873
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val)
Definition base.h:2822
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val)
Definition base.h:2829
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue< float >()
Definition base.h:2224
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f)
Definition base.h:1817
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef< T > ScalarAbs(T val)
Definition base.h:2815
HWY_API constexpr RemoveCvRef< T > ScalarShr(T val, int shift_amt)
Definition base.h:2528
HWY_API constexpr T LimitsMax()
Definition base.h:2174
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_SPECIAL_FLOAT_D(D)
Definition ops/shared-inl.h:540
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
@ value
Definition arm_neon-inl.h:8429
Definition arm_neon-inl.h:5654
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
Definition ops/shared-inl.h:198
Definition base.h:694
Definition base.h:1117
int VFromD
Definition tuple-inl.h:25