scalar-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Single-element vectors and operations.
17// External include guard in highway.h - see comment there.
18
19#include <stdint.h>
20#ifndef HWY_NO_LIBCXX
21#include <math.h> // sqrtf
22#endif
23
24#include "hwy/ops/shared-inl.h"
25
26HWY_BEFORE_NAMESPACE();
27namespace hwy {
28namespace HWY_NAMESPACE {
29
30// Single instruction, single data.
31template <typename T>
32using Sisd = Simd<T, 1, 0>;
33
34// (Wrapper class required for overloading comparison operators.)
35template <typename T>
36struct Vec1 {
37 using PrivateT = T; // only for DFromV
38 static constexpr size_t kPrivateN = 1; // only for DFromV
39
40 HWY_INLINE Vec1() = default;
41 Vec1(const Vec1&) = default;
42 Vec1& operator=(const Vec1&) = default;
43 HWY_INLINE explicit Vec1(const T t) : raw(t) {}
44
45 HWY_INLINE Vec1& operator*=(const Vec1 other) {
46 return *this = (*this * other);
47 }
48 HWY_INLINE Vec1& operator/=(const Vec1 other) {
49 return *this = (*this / other);
50 }
51 HWY_INLINE Vec1& operator+=(const Vec1 other) {
52 return *this = (*this + other);
53 }
54 HWY_INLINE Vec1& operator-=(const Vec1 other) {
55 return *this = (*this - other);
56 }
57 HWY_INLINE Vec1& operator%=(const Vec1 other) {
58 return *this = (*this % other);
59 }
60 HWY_INLINE Vec1& operator&=(const Vec1 other) {
61 return *this = (*this & other);
62 }
63 HWY_INLINE Vec1& operator|=(const Vec1 other) {
64 return *this = (*this | other);
65 }
66 HWY_INLINE Vec1& operator^=(const Vec1 other) {
67 return *this = (*this ^ other);
68 }
69
70 T raw;
71};
72
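// Usage sketch (illustrative): on this scalar target a "vector" holds exactly
// one lane, so the descriptor-based API degenerates to plain scalar code:
//   const Sisd<float> d;                // descriptor for a single f32 lane
//   const Vec1<float> v = Set(d, 1.5f);
//   const float x = GetLane(v);         // 1.5f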
73// 0 or FF..FF, same size as Vec1.
74template <typename T>
75class Mask1 {
76 using Raw = hwy::MakeUnsigned<T>;
77
78 public:
79 static HWY_INLINE Mask1<T> FromBool(bool b) {
80 Mask1<T> mask;
81 mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
82 return mask;
83 }
84
85 Raw bits;
86};
87
88template <class V>
89using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
90
91template <class V>
92using TFromV = typename V::PrivateT;
93
94// ------------------------------ BitCast
95
96template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
97HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) {
98 static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined");
99 TTo to;
100 CopyBytes<sizeof(TTo)>(&v.raw, &to); // not same size - ok to shrink
101 return Vec1<TTo>(to);
102}
103
104// ------------------------------ Zero
105
106template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
107HWY_API Vec1<T> Zero(D /* tag */) {
108 return Vec1<T>(ConvertScalarTo<T>(0));
109}
110
111template <class D>
112using VFromD = decltype(Zero(D()));
113
114// ------------------------------ Tuple (VFromD)
115#include "hwy/ops/tuple-inl.h"
116
117// ------------------------------ Set
118template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
119HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
120 return Vec1<T>(static_cast<T>(t));
121}
122
123// ------------------------------ Undefined
124template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
125HWY_API Vec1<T> Undefined(D d) {
126 return Zero(d);
127}
128
129// ------------------------------ Iota
130template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
131HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) {
132 return Vec1<T>(static_cast<T>(first));
133}
134
135// ------------------------------ ResizeBitCast
136
137template <class D, typename FromV>
138HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) {
139 using TFrom = TFromV<FromV>;
140 using TTo = TFromD<D>;
141 constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
142 TTo to{};
143 CopyBytes<kCopyLen>(&v.raw, &to);
144 return VFromD<D>(to);
145}
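// Example (illustrative, assuming a little-endian target): resizing u64 -> u32
// copies the low four bytes, while u32 -> u64 copies four bytes into a
// zero-initialized `to`, which yields the zero-extending behavior noted below.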
146
147namespace detail {
148
149// ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if
150// sizeof(TFromD<DTo>) is greater than sizeof(TFromV<FromV>)
151template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
152HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
153 ToSizeTag /* to_size_tag */,
154 DTo d_to, DFrom /*d_from*/,
155 VFromD<DFrom> v) {
156 return ResizeBitCast(d_to, v);
157}
158
159} // namespace detail
160
161// ------------------------------ Dup128VecFromValues
162
163template <class D, HWY_IF_T_SIZE_D(D, 1)>
164HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
165 TFromD<D> /*t2*/, TFromD<D> /*t3*/,
166 TFromD<D> /*t4*/, TFromD<D> /*t5*/,
167 TFromD<D> /*t6*/, TFromD<D> /*t7*/,
168 TFromD<D> /*t8*/, TFromD<D> /*t9*/,
169 TFromD<D> /*t10*/, TFromD<D> /*t11*/,
170 TFromD<D> /*t12*/, TFromD<D> /*t13*/,
171 TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
172 return VFromD<D>(t0);
173}
174
175template <class D, HWY_IF_T_SIZE_D(D, 2)>
176HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
177 TFromD<D> /*t2*/, TFromD<D> /*t3*/,
178 TFromD<D> /*t4*/, TFromD<D> /*t5*/,
179 TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
180 return VFromD<D>(t0);
181}
182
183template <class D, HWY_IF_T_SIZE_D(D, 4)>
184HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
185 TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
186 return VFromD<D>(t0);
187}
188
189template <class D, HWY_IF_T_SIZE_D(D, 8)>
190HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/) {
191 return VFromD<D>(t0);
192}
193
194// ================================================== LOGICAL
195
196// ------------------------------ Not
197
198template <typename T>
199HWY_API Vec1<T> Not(const Vec1<T> v) {
200 using TU = MakeUnsigned<T>;
201 const Sisd<TU> du;
202 return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
203}
204
205// ------------------------------ And
206
207template <typename T>
208HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
209 using TU = MakeUnsigned<T>;
210 const Sisd<TU> du;
211 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
212}
213template <typename T>
214HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
215 return And(a, b);
216}
217
218// ------------------------------ AndNot
219
220template <typename T>
221HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
222 using TU = MakeUnsigned<T>;
223 const Sisd<TU> du;
224 return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
225 BitCast(du, b).raw)));
226}
227
228// ------------------------------ Or
229
230template <typename T>
231HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
232 using TU = MakeUnsigned<T>;
233 const Sisd<TU> du;
234 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
235}
236template <typename T>
237HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
238 return Or(a, b);
239}
240
241// ------------------------------ Xor
242
243template <typename T>
244HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
245 using TU = MakeUnsigned<T>;
246 const Sisd<TU> du;
247 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
248}
249template <typename T>
250HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
251 return Xor(a, b);
252}
253
254// ------------------------------ Xor3
255
256template <typename T>
257HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) {
258 return Xor(x1, Xor(x2, x3));
259}
260
261// ------------------------------ Or3
262
263template <typename T>
264HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
265 return Or(o1, Or(o2, o3));
266}
267
268// ------------------------------ OrAnd
269
270template <typename T>
271HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
272 return Or(o, And(a1, a2));
273}
274
275// ------------------------------ Mask
276
277template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
278HWY_API Mask1<TTo> RebindMask(DTo /* tag */, Mask1<TFrom> m) {
279 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
280 return Mask1<TTo>{m.bits};
281}
282
283// v must be 0 or FF..FF.
284template <typename T>
285HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
286 Mask1<T> mask;
287 CopySameSize(&v, &mask);
288 return mask;
289}
290
291template <class D>
292using MFromD = decltype(MaskFromVec(VFromD<D>()));
293
294template <typename T>
295Vec1<T> VecFromMask(const Mask1<T> mask) {
296 Vec1<T> v;
297 CopySameSize(&mask, &v);
298 return v;
299}
300
301template <class D, typename T = TFromD<D>>
302Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
303 Vec1<T> v;
304 CopySameSize(&mask, &v);
305 return v;
306}
307
308template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
309HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) {
310 return Mask1<T>::FromBool(n != 0);
311}
312
313// ------------------------------ IfVecThenElse
314template <typename T>
315HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
316 return IfThenElse(MaskFromVec(mask), yes, no);
317}
318
319// ------------------------------ CopySign
320template <typename T>
321HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
322 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
323 const DFromV<decltype(magn)> d;
324 return BitwiseIfThenElse(SignBit(d), sign, magn);
325}
326
327// ------------------------------ CopySignToAbs
328template <typename T>
329HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
330 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
331 const Sisd<T> d;
332 return OrAnd(abs, SignBit(d), sign);
333}
334
335// ------------------------------ BroadcastSignBit
336template <typename T>
337HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
338 return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1));
339}
340
341// ------------------------------ PopulationCount
342
343#ifdef HWY_NATIVE_POPCNT
344#undef HWY_NATIVE_POPCNT
345#else
346#define HWY_NATIVE_POPCNT
347#endif
348
349template <typename T>
350HWY_API Vec1<T> PopulationCount(Vec1<T> v) {
351 return Vec1<T>(static_cast<T>(PopCount(v.raw)));
352}
353
354// ------------------------------ IfThenElse
355
356// Returns mask ? yes : no.
357template <typename T>
358HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
359 const Vec1<T> no) {
360 return mask.bits ? yes : no;
361}
362
363template <typename T>
364HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
365 return mask.bits ? yes : Vec1<T>(ConvertScalarTo<T>(0));
366}
367
368template <typename T>
369HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
370 return mask.bits ? Vec1<T>(ConvertScalarTo<T>(0)) : no;
371}
372
373template <typename T>
374HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
375 const DFromV<decltype(v)> d;
376 const RebindToSigned<decltype(d)> di;
377 const auto vi = BitCast(di, v);
378
379 return vi.raw < 0 ? yes : no;
380}
381
382// ------------------------------ Mask logical
383
384template <typename T>
385HWY_API Mask1<T> Not(const Mask1<T> m) {
386 return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
387}
388
389template <typename T>
390HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
391 const Sisd<T> d;
392 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
393}
394
395template <typename T>
396HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
397 const Sisd<T> d;
398 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
399}
400
401template <typename T>
402HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
403 const Sisd<T> d;
404 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
405}
406
407template <typename T>
408HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
409 const Sisd<T> d;
410 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
411}
412
413template <typename T>
414HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
415 const Sisd<T> d;
416 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
417}
418
419template <class T>
420HWY_API Mask1<T> SetAtOrAfterFirst(Mask1<T> mask) {
421 return mask;
422}
423
424template <class T>
425HWY_API Mask1<T> SetBeforeFirst(Mask1<T> mask) {
426 return Not(mask);
427}
428
429template <class T>
430HWY_API Mask1<T> SetOnlyFirst(Mask1<T> mask) {
431 return mask;
432}
433
434template <class T>
435HWY_API Mask1<T> SetAtOrBeforeFirst(Mask1<T> /*mask*/) {
436 return Mask1<T>::FromBool(true);
437}
438
439// ------------------------------ LowerHalfOfMask
440
441#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
442#undef HWY_NATIVE_LOWER_HALF_OF_MASK
443#else
444#define HWY_NATIVE_LOWER_HALF_OF_MASK
445#endif
446
447template <class D>
448HWY_API MFromD<D> LowerHalfOfMask(D /*d*/, MFromD<D> m) {
449 return m;
450}
451
452// ================================================== SHIFTS
453
454// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
455
456template <int kBits, typename T>
457HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
458 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
459 return Vec1<T>(
460 static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
461}
462
463template <int kBits, typename T>
464HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
465 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
466 return Vec1<T>(ScalarShr(v.raw, kBits));
467}
468
469// ------------------------------ RotateRight (ShiftRight)
470template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
471HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
472 const DFromV<decltype(v)> d;
473 const RebindToUnsigned<decltype(d)> du;
474
475 constexpr size_t kSizeInBits = sizeof(T) * 8;
476 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
477 if (kBits == 0) return v;
478
479 return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
480 ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
481}
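// Example (illustrative): RotateRight<8>(Vec1<uint32_t>(0x12345678u)) yields
// 0x78123456u: the low byte wraps around to the most-significant position.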
482
483// ------------------------------ ShiftLeftSame (BroadcastSignBit)
484
485template <typename T>
486HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
487 return Vec1<T>(
488 static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
489}
490
491template <typename T>
492HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
493 return Vec1<T>(ScalarShr(v.raw, bits));
494}
495
496// ------------------------------ Shl
497
498// Single-lane => same as ShiftLeftSame except for the argument type.
499template <typename T>
500HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
501 return ShiftLeftSame(v, static_cast<int>(bits.raw));
502}
503
504template <typename T>
505HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
506 return ShiftRightSame(v, static_cast<int>(bits.raw));
507}
508
509// ================================================== ARITHMETIC
510
511template <typename T>
512HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
513 const uint64_t a64 = static_cast<uint64_t>(a.raw);
514 const uint64_t b64 = static_cast<uint64_t>(b.raw);
515 return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
516}
517HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
518 return Vec1<float>(a.raw + b.raw);
519}
520HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
521 return Vec1<double>(a.raw + b.raw);
522}
523
524template <typename T>
525HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
526 const uint64_t a64 = static_cast<uint64_t>(a.raw);
527 const uint64_t b64 = static_cast<uint64_t>(b.raw);
528 return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
529}
530HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
531 return Vec1<float>(a.raw - b.raw);
532}
533HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
534 return Vec1<double>(a.raw - b.raw);
535}
536
537// ------------------------------ SumsOf8
538
539HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
540 return Vec1<uint64_t>(v.raw);
541}
542HWY_API Vec1<int64_t> SumsOf8(const Vec1<int8_t> v) {
543 return Vec1<int64_t>(v.raw);
544}
545
546// ------------------------------ SumsOf2
547
548template <class T>
549HWY_API Vec1<MakeWide<T>> SumsOf2(const Vec1<T> v) {
550 const DFromV<decltype(v)> d;
551 const Rebind<MakeWide<T>, decltype(d)> dw;
552 return PromoteTo(dw, v);
553}
554
555// ------------------------------ SaturatedAdd
556
557// Returns a + b clamped to the destination range.
558
559// Unsigned
560HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
561 const Vec1<uint8_t> b) {
562 return Vec1<uint8_t>(
563 static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
564}
565HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
566 const Vec1<uint16_t> b) {
567 return Vec1<uint16_t>(static_cast<uint16_t>(
568 HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) + b.raw), 65535)));
569}
570
571// Signed
572HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) {
573 return Vec1<int8_t>(
574 static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
575}
576HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
577 const Vec1<int16_t> b) {
578 return Vec1<int16_t>(static_cast<int16_t>(
579 HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) + b.raw), 32767)));
580}
581
582// ------------------------------ Saturating subtraction
583
584// Returns a - b clamped to the destination range.
585
586// Unsigned
587HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
588 const Vec1<uint8_t> b) {
589 return Vec1<uint8_t>(
590 static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
591}
592HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
593 const Vec1<uint16_t> b) {
594 return Vec1<uint16_t>(static_cast<uint16_t>(
595 HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) - b.raw), 65535)));
596}
597
598// Signed
599HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) {
600 return Vec1<int8_t>(
601 static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
602}
603HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
604 const Vec1<int16_t> b) {
605 return Vec1<int16_t>(static_cast<int16_t>(
606 HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) - b.raw), 32767)));
607}
608
609// ------------------------------ Average
610
611// Returns (a + b + 1) / 2
612
613HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
614 const Vec1<uint8_t> b) {
615 return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
616}
617HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
618 const Vec1<uint16_t> b) {
619 return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
620}
621
622// ------------------------------ Absolute value
623
624template <typename T>
625HWY_API Vec1<T> Abs(const Vec1<T> a) {
626 return Vec1<T>(ScalarAbs(a.raw));
627}
628
629// ------------------------------ Min/Max
630
631// <cmath> may be unavailable, so implement our own.
632
633template <typename T, HWY_IF_NOT_FLOAT(T)>
634HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
635 return Vec1<T>(HWY_MIN(a.raw, b.raw));
636}
637
638template <typename T, HWY_IF_FLOAT(T)>
639HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
640 if (ScalarIsNaN(a.raw)) return b;
641 if (ScalarIsNaN(b.raw)) return a;
642 return Vec1<T>(HWY_MIN(a.raw, b.raw));
643}
644
645template <typename T, HWY_IF_NOT_FLOAT(T)>
646HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
647 return Vec1<T>(HWY_MAX(a.raw, b.raw));
648}
649
650template <typename T, HWY_IF_FLOAT(T)>
651HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
652 if (ScalarIsNaN(a.raw)) return b;
653 if (ScalarIsNaN(b.raw)) return a;
654 return Vec1<T>(HWY_MAX(a.raw, b.raw));
655}
656
657// ------------------------------ Floating-point negate
658
659template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
660HWY_API Vec1<T> Neg(const Vec1<T> v) {
661 return Xor(v, SignBit(Sisd<T>()));
662}
663
664template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
665HWY_API Vec1<T> Neg(const Vec1<T> v) {
666 return Zero(Sisd<T>()) - v;
667}
668
669// ------------------------------ mul/div
670
671// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
672#ifdef HWY_NATIVE_MUL_8
673#undef HWY_NATIVE_MUL_8
674#else
675#define HWY_NATIVE_MUL_8
676#endif
677#ifdef HWY_NATIVE_MUL_64
678#undef HWY_NATIVE_MUL_64
679#else
680#define HWY_NATIVE_MUL_64
681#endif
682
683template <typename T, HWY_IF_FLOAT(T)>
684HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
685 return Vec1<T>(static_cast<T>(double{a.raw} * b.raw));
686}
687
688template <typename T, HWY_IF_NOT_FLOAT(T)>
689HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
690 return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) *
691 static_cast<uint64_t>(b.raw)));
692}
693
694template <typename T, HWY_IF_FLOAT(T)>
695HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
696 return Vec1<T>(a.raw / b.raw);
697}
698
699// Returns the upper sizeof(T)*8 bits of a * b in each lane.
700template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
701 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
702HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
703 using TW = MakeWide<T>;
704 return Vec1<T>(static_cast<T>(
705 (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8)));
706}
707template <class T, HWY_IF_UI64(T)>
708HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
709 T hi;
710 Mul128(a.raw, b.raw, &hi);
711 return Vec1<T>(hi);
712}
713
714HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
715 return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw + 16384) >> 15));
716}
717
718// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
719template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
720 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
721HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) {
722 using TW = MakeWide<T>;
723 const TW a_wide = a.raw;
724 return Vec1<TW>(static_cast<TW>(a_wide * b.raw));
725}
726
727// Approximate reciprocal
728HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
729 // Zero inputs are allowed, but callers are responsible for replacing the
730 // return value with something else (typically using IfThenElse). This check
731 // avoids a ubsan error. The return value is arbitrary.
732 if (v.raw == 0.0f) return Vec1<float>(0.0f);
733 return Vec1<float>(1.0f / v.raw);
734}
735
736// generic_ops takes care of integer T.
737template <typename T, HWY_IF_FLOAT(T)>
738HWY_API Vec1<T> AbsDiff(const Vec1<T> a, const Vec1<T> b) {
739 return Abs(a - b);
740}
741
742// ------------------------------ Floating-point multiply-add variants
743
744template <typename T, HWY_IF_FLOAT(T)>
745HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
746 return mul * x + add;
747}
748
749template <typename T, HWY_IF_FLOAT(T)>
750HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
751 const Vec1<T> add) {
752 return add - mul * x;
753}
754
755template <typename T, HWY_IF_FLOAT(T)>
756HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
757 return mul * x - sub;
758}
759
760template <typename T, HWY_IF_FLOAT(T)>
761HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
762 const Vec1<T> sub) {
763 return Neg(mul) * x - sub;
764}
765
766// ------------------------------ Floating-point square root
767
768// Approximate reciprocal square root
769HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
770 float f = v.raw;
771 const float half = f * 0.5f;
772 uint32_t bits;
773 CopySameSize(&f, &bits);
774 // Initial guess based on log2(f)
775 bits = 0x5F3759DF - (bits >> 1);
776 CopySameSize(&bits, &f);
777 // One Newton-Raphson iteration
778 return Vec1<float>(f * (1.5f - (half * f * f)));
779}
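// Note (illustrative): this is the classic "fast inverse square root". The
// magic constant turns the float's bit pattern into an initial guess
// y0 ~= 1/sqrt(f); the returned expression is one Newton-Raphson step
// y1 = y0 * (1.5f - 0.5f * f * y0 * y0), roughly doubling the accurate bits.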
780
781// Square root
782HWY_API Vec1<float> Sqrt(Vec1<float> v) {
783#if defined(HWY_NO_LIBCXX)
784#if HWY_COMPILER_GCC_ACTUAL
785 return Vec1<float>(__builtin_sqrt(v.raw));
786#else
787 uint32_t bits;
788 CopyBytes<sizeof(bits)>(&v, &bits);
789 // Coarse approximation, letting the exponent LSB leak into the mantissa
790 bits = (1 << 29) + (bits >> 1) - (1 << 22);
791 CopyBytes<sizeof(bits)>(&bits, &v);
792 return v;
793#endif // !HWY_COMPILER_GCC_ACTUAL
794#else
795 return Vec1<float>(sqrtf(v.raw));
796#endif // !HWY_NO_LIBCXX
797}
798HWY_API Vec1<double> Sqrt(Vec1<double> v) {
799#if defined(HWY_NO_LIBCXX)
800#if HWY_COMPILER_GCC_ACTUAL
801 return Vec1<double>(__builtin_sqrt(v.raw));
802#else
803 uint64_t bits;
804 CopyBytes<sizeof(bits)>(&v, &bits);
805 // Coarse approximation, letting the exponent LSB leak into the mantissa
806 bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
807 CopyBytes<sizeof(bits)>(&bits, &v);
808 return v;
809#endif // !HWY_COMPILER_GCC_ACTUAL
810#else
811 return Vec1<double>(sqrt(v.raw));
812#endif // HWY_NO_LIBCXX
813}
814
815// ------------------------------ Floating-point rounding
816
817template <typename T>
818HWY_API Vec1<T> Round(const Vec1<T> v) {
819 using TI = MakeSigned<T>;
820 if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
821 return v;
822 }
823 const T k0 = ConvertScalarTo<T>(0);
824 const T bias = ConvertScalarTo<T>(v.raw < k0 ? -0.5 : 0.5);
825 const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
826 if (rounded == 0) return CopySignToAbs(Vec1<T>(k0), v);
827 TI offset = 0;
828 // Round to even
829 if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
830 ConvertScalarTo<T>(0.5)) {
831 offset = v.raw < k0 ? -1 : 1;
832 }
833 return Vec1<T>(ConvertScalarTo<T>(rounded - offset));
834}
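// Example (illustrative): ties round to even, so Round(Vec1<float>(2.5f))
// returns 2.0f whereas Round(Vec1<float>(3.5f)) returns 4.0f.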
835
836// Round-to-nearest even.
837HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
838 using T = float;
839 using TI = int32_t;
840
841 const T abs = Abs(v).raw;
842 const bool is_sign = ScalarSignBit(v.raw);
843
844 if (!(abs < MantissaEnd<T>())) { // Huge or NaN
845 // Check if too large to cast or NaN
846 if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
847 return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
848 }
849 return Vec1<int32_t>(ConvertScalarTo<TI>(v.raw));
850 }
851 const T bias =
852 ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
853 const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
854 if (rounded == 0) return Vec1<int32_t>(0);
855 TI offset = 0;
856 // Round to even
857 if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
858 ConvertScalarTo<T>(0.5)) {
859 offset = is_sign ? -1 : 1;
860 }
861 return Vec1<TI>(rounded - offset);
862}
863
864template <typename T>
865HWY_API Vec1<T> Trunc(const Vec1<T> v) {
866 using TI = MakeSigned<T>;
867 if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
868 return v;
869 }
870 const TI truncated = ConvertScalarTo<TI>(v.raw);
871 if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
872 return Vec1<T>(ConvertScalarTo<T>(truncated));
873}
874
875template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
876 class V>
877V Ceiling(const V v) {
878 const Bits kExponentMask = (1ull << kExponentBits) - 1;
879 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
880 const Bits kBias = kExponentMask / 2;
881
882 Float f = v.raw;
883 const bool positive = f > Float(0.0);
884
885 Bits bits;
886 CopySameSize(&v, &bits);
887
888 const int exponent =
889 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
890 // Already an integer.
891 if (exponent >= kMantissaBits) return v;
892 // |v| <= 1 => 0 or 1.
893 if (exponent < 0) return positive ? V(1) : V(-0.0);
894
895 const Bits mantissa_mask = kMantissaMask >> exponent;
896 // Already an integer
897 if ((bits & mantissa_mask) == 0) return v;
898
899 // Clear fractional bits and round up
900 if (positive) bits += (kMantissaMask + 1) >> exponent;
901 bits &= ~mantissa_mask;
902
903 CopySameSize(&bits, &f);
904 return V(f);
905}
906
907template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
908 class V>
909V Floor(const V v) {
910 const Bits kExponentMask = (1ull << kExponentBits) - 1;
911 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
912 const Bits kBias = kExponentMask / 2;
913
914 Float f = v.raw;
915 const bool negative = f < Float(0.0);
916
917 Bits bits;
918 CopySameSize(&v, &bits);
919
920 const int exponent =
921 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
922 // Already an integer.
923 if (exponent >= kMantissaBits) return v;
924 // |v| <= 1 => -1 or 0.
925 if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
926
927 const Bits mantissa_mask = kMantissaMask >> exponent;
928 // Already an integer
929 if ((bits & mantissa_mask) == 0) return v;
930
931 // Clear fractional bits and round down
932 if (negative) bits += (kMantissaMask + 1) >> exponent;
933 bits &= ~mantissa_mask;
934
935 CopySameSize(&bits, &f);
936 return V(f);
937}
938
939// Toward +infinity, aka ceiling
940HWY_API Vec1<float> Ceil(const Vec1<float> v) {
941 return Ceiling<float, uint32_t, 23, 8>(v);
942}
943HWY_API Vec1<double> Ceil(const Vec1<double> v) {
944 return Ceiling<double, uint64_t, 52, 11>(v);
945}
946
947// Toward -infinity, aka floor
948HWY_API Vec1<float> Floor(const Vec1<float> v) {
949 return Floor<float, uint32_t, 23, 8>(v);
950}
951HWY_API Vec1<double> Floor(const Vec1<double> v) {
952 return Floor<double, uint64_t, 52, 11>(v);
953}
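// Example (illustrative): Ceil(Vec1<float>(1.25f)) yields 2.0f and
// Floor(Vec1<float>(-1.25f)) yields -2.0f; exact integers pass through
// unchanged via the "already an integer" early-outs above.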
954
955// ================================================== COMPARE
956
957template <typename T>
958HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
959 return Mask1<T>::FromBool(a.raw == b.raw);
960}
961
962template <typename T>
963HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) {
964 return Mask1<T>::FromBool(a.raw != b.raw);
965}
966
967template <typename T>
968HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
969 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
970 return (v & bit) == bit;
971}
972
973template <typename T>
974HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
975 return Mask1<T>::FromBool(a.raw < b.raw);
976}
977template <typename T>
978HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
979 return Mask1<T>::FromBool(a.raw > b.raw);
980}
981
982template <typename T>
983HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
984 return Mask1<T>::FromBool(a.raw <= b.raw);
985}
986template <typename T>
987HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
988 return Mask1<T>::FromBool(a.raw >= b.raw);
989}
990
991// ------------------------------ Floating-point classification (==)
992
993template <typename T>
994HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
995 // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
996 return Mask1<T>::FromBool(ScalarIsNaN(v.raw));
997}
998
999// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
1000#ifdef HWY_NATIVE_ISINF
1001#undef HWY_NATIVE_ISINF
1002#else
1003#define HWY_NATIVE_ISINF
1004#endif
1005
1006HWY_API Mask1<float> IsInf(const Vec1<float> v) {
1007 const Sisd<float> d;
1008 const RebindToUnsigned<decltype(d)> du;
1009 const Vec1<uint32_t> vu = BitCast(du, v);
1010 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1011 return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
1012}
1013HWY_API Mask1<double> IsInf(const Vec1<double> v) {
1014 const Sisd<double> d;
1015 const RebindToUnsigned<decltype(d)> du;
1016 const Vec1<uint64_t> vu = BitCast(du, v);
1017 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1018 return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
1019}
1020
1021HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
1022 const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
1023 // Shift left to clear the sign bit, check whether exponent != max value.
1024 return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
1025}
1026HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
1027 const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
1028 // Shift left to clear the sign bit, check whether exponent != max value.
1029 return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
1030}
1031
1032// ================================================== MEMORY
1033
1034// ------------------------------ Load
1035
1036template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1037HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
1038 T t;
1039 CopySameSize(aligned, &t);
1040 return Vec1<T>(t);
1041}
1042
1043template <class D, typename T = TFromD<D>>
1044HWY_API Vec1<T> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT aligned) {
1045 return IfThenElseZero(m, Load(d, aligned));
1046}
1047
1048template <class D, typename T = TFromD<D>>
1049HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, MFromD<D> m, D d,
1050 const T* HWY_RESTRICT aligned) {
1051 return IfThenElse(m, Load(d, aligned), v);
1052}
1053
1054template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1055HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) {
1056 return Load(d, p);
1057}
1058
1059// In some use cases, "load single lane" is sufficient; otherwise avoid this.
1060template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1061HWY_API Vec1<T> LoadDup128(D d, const T* HWY_RESTRICT aligned) {
1062 return Load(d, aligned);
1063}
1064
1065#ifdef HWY_NATIVE_LOAD_N
1066#undef HWY_NATIVE_LOAD_N
1067#else
1068#define HWY_NATIVE_LOAD_N
1069#endif
1070
1071template <class D, typename T = TFromD<D>>
1072HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
1073 size_t max_lanes_to_load) {
1074 return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d);
1075}
1076
1077template <class D, typename T = TFromD<D>>
1078HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
1079 size_t max_lanes_to_load) {
1080 return (max_lanes_to_load > 0) ? Load(d, p) : no;
1081}
1082
1083// ------------------------------ Store
1084
1085template <class D, typename T = TFromD<D>>
1086HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1087 CopySameSize(&v.raw, aligned);
1088}
1089
1090template <class D, typename T = TFromD<D>>
1091HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
1092 return Store(v, d, p);
1093}
1094
1095template <class D, typename T = TFromD<D>>
1096HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, D d, T* HWY_RESTRICT p) {
1097 if (!m.bits) return;
1098 StoreU(v, d, p);
1099}
1100
1101#ifdef HWY_NATIVE_STORE_N
1102#undef HWY_NATIVE_STORE_N
1103#else
1104#define HWY_NATIVE_STORE_N
1105#endif
1106
1107template <class D, typename T = TFromD<D>>
1108HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1109 size_t max_lanes_to_store) {
1110 if (max_lanes_to_store > 0) {
1111 Store(v, d, p);
1112 }
1113}
1114
1115// ------------------------------ LoadInterleaved2/3/4
1116
1117// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
1118#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1119#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1120#else
1121#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1122#endif
1123
1124template <class D, typename T = TFromD<D>>
1125HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
1126 Vec1<T>& v1) {
1127 v0 = LoadU(d, unaligned + 0);
1128 v1 = LoadU(d, unaligned + 1);
1129}
1130
1131template <class D, typename T = TFromD<D>>
1132HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
1133 Vec1<T>& v1, Vec1<T>& v2) {
1134 v0 = LoadU(d, unaligned + 0);
1135 v1 = LoadU(d, unaligned + 1);
1136 v2 = LoadU(d, unaligned + 2);
1137}
1138
1139template <class D, typename T = TFromD<D>>
1140HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
1141 Vec1<T>& v1, Vec1<T>& v2, Vec1<T>& v3) {
1142 v0 = LoadU(d, unaligned + 0);
1143 v1 = LoadU(d, unaligned + 1);
1144 v2 = LoadU(d, unaligned + 2);
1145 v3 = LoadU(d, unaligned + 3);
1146}
1147
1148// ------------------------------ StoreInterleaved2/3/4
1149
1150template <class D, typename T = TFromD<D>>
1151HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, D d,
1152 T* HWY_RESTRICT unaligned) {
1153 StoreU(v0, d, unaligned + 0);
1154 StoreU(v1, d, unaligned + 1);
1155}
1156
1157template <class D, typename T = TFromD<D>>
1158HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
1159 const Vec1<T> v2, D d,
1160 T* HWY_RESTRICT unaligned) {
1161 StoreU(v0, d, unaligned + 0);
1162 StoreU(v1, d, unaligned + 1);
1163 StoreU(v2, d, unaligned + 2);
1164}
1165
1166template <class D, typename T = TFromD<D>>
1167HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
1168 const Vec1<T> v2, const Vec1<T> v3, D d,
1169 T* HWY_RESTRICT unaligned) {
1170 StoreU(v0, d, unaligned + 0);
1171 StoreU(v1, d, unaligned + 1);
1172 StoreU(v2, d, unaligned + 2);
1173 StoreU(v3, d, unaligned + 3);
1174}
1175
1176// ------------------------------ Stream
1177
1178template <class D, typename T = TFromD<D>>
1179HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
1180 return Store(v, d, aligned);
1181}
1182
1183// ------------------------------ Scatter
1184
1185#ifdef HWY_NATIVE_SCATTER
1186#undef HWY_NATIVE_SCATTER
1187#else
1188#define HWY_NATIVE_SCATTER
1189#endif
1190
1191template <class D, typename T = TFromD<D>, typename TI>
1192HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) {
1193 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1194 const intptr_t addr =
1195 reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
1196 Store(v, d, reinterpret_cast<T*>(addr));
1197}
1198
1199template <class D, typename T = TFromD<D>, typename TI>
1200HWY_API void ScatterIndex(Vec1<T> v, D d, T* HWY_RESTRICT base,
1201 Vec1<TI> index) {
1202 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1203 Store(v, d, base + index.raw);
1204}
1205
1206template <class D, typename T = TFromD<D>, typename TI>
1207HWY_API void MaskedScatterIndex(Vec1<T> v, MFromD<D> m, D d,
1208 T* HWY_RESTRICT base, Vec1<TI> index) {
1209 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1210 if (m.bits) Store(v, d, base + index.raw);
1211}
1212
1213// ------------------------------ Gather
1214
1215#ifdef HWY_NATIVE_GATHER
1216#undef HWY_NATIVE_GATHER
1217#else
1218#define HWY_NATIVE_GATHER
1219#endif
1220
1221template <class D, typename T = TFromD<D>>
1222HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<MakeSigned<T>> offset) {
1223 HWY_DASSERT(offset.raw >= 0);
1224 const intptr_t addr =
1225 reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
1226 return Load(d, reinterpret_cast<const T*>(addr));
1227}
1228
1229template <class D, typename T = TFromD<D>>
1230HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base,
1231 Vec1<MakeSigned<T>> index) {
1232 HWY_DASSERT(index.raw >= 0);
1233 return Load(d, base + index.raw);
1234}
1235
1236template <class D, typename T = TFromD<D>>
1237HWY_API Vec1<T> MaskedGatherIndex(MFromD<D> m, D d, const T* HWY_RESTRICT base,
1238 Vec1<MakeSigned<T>> index) {
1239 HWY_DASSERT(index.raw >= 0);
1240 return MaskedLoad(m, d, base + index.raw);
1241}
1242
1243template <class D, typename T = TFromD<D>>
1244HWY_API Vec1<T> MaskedGatherIndexOr(Vec1<T> no, MFromD<D> m, D d,
1245 const T* HWY_RESTRICT base,
1246 Vec1<MakeSigned<T>> index) {
1247 HWY_DASSERT(index.raw >= 0);
1248 return MaskedLoadOr(no, m, d, base + index.raw);
1249}
1250
1251// ================================================== CONVERT
1252
1253// ConvertTo and DemoteTo with floating-point input and integer output truncate
1254// (rounding toward zero).
1255
1256namespace detail {
1257
1258template <class ToT, class FromT>
1259HWY_INLINE ToT CastValueForF2IConv(FromT val) {
1260 // Prevent ubsan errors when converting float to narrower integer
1261
1262 using FromTU = MakeUnsigned<FromT>;
1263 using ToTU = MakeUnsigned<ToT>;
1264
1265 constexpr unsigned kMaxExpField =
1266 static_cast<unsigned>(MaxExponentField<FromT>());
1267 constexpr unsigned kExpBias = kMaxExpField >> 1;
1268 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1269 kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1270 kMaxExpField));
1271
1272 // If ToT is signed, compare only the exponent bits of val against
1273 // kMinOutOfRangeExpField.
1274 //
1275 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1276 // val against kMinOutOfRangeExpField as a negative value is outside of the
1277 // range of an unsigned integer type.
1278 const FromT val_to_compare =
1279 static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1280
1281 // val is within the range of ToT if
1282 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1283 // than kMinOutOfRangeExpField
1284 //
1285 // Otherwise, val is either outside of the range of ToT or equal to
1286 // LimitsMin<ToT>() if
1287 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1288 // than or equal to kMinOutOfRangeExpField.
1289
1290 return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1291 MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1292 ? static_cast<ToT>(val)
1293 : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
1294 static_cast<ToTU>(ScalarSignBit(val)));
1295}
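// Worked example (illustrative): for ToT=int32_t and FromT=float,
// kMinOutOfRangeExpField corresponds to magnitudes of 2^31 and above. An input
// such as 3e9f has a biased exponent field at or above that threshold, so the
// function returns LimitsMax<int32_t>() (LimitsMin for negative inputs) rather
// than performing a static_cast whose behavior would be undefined.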
1296
1297template <class ToT, class ToTypeTag, class FromT>
1298HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
1299 return ConvertScalarTo<ToT>(val);
1300}
1301
1302template <class ToT>
1303HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /* to_type_tag */,
1304 float val) {
1305 return CastValueForF2IConv<ToT>(val);
1306}
1307
1308template <class ToT>
1309HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /* to_type_tag */,
1310 float val) {
1311 return CastValueForF2IConv<ToT>(val);
1312}
1313
1314// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
1315// returns static_cast<ToT>(val)
1316//
1317// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
1318// implementation-defined result if val is not within the range of ToT.
1319template <class ToT, class FromT>
1320HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
1321 // Prevent ubsan errors when converting float to narrower integer
1322
1323 using FromTU = MakeUnsigned<FromT>;
1324
1325 constexpr unsigned kMaxExpField =
1326 static_cast<unsigned>(MaxExponentField<FromT>());
1327 constexpr unsigned kExpBias = kMaxExpField >> 1;
1328 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1329 kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1330 kMaxExpField));
1331
1332 // If ToT is signed, compare only the exponent bits of val against
1333 // kMinOutOfRangeExpField.
1334 //
1335 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1336 // val against kMinOutOfRangeExpField as a negative value is outside of the
1337 // range of an unsigned integer type.
1338 const FromT val_to_compare =
1339 static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1340
1341 // val is within the range of ToT if
1342 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1343 // than kMinOutOfRangeExpField
1344 //
1345 // Otherwise, val is either outside of the range of ToT or equal to
1346 // LimitsMin<ToT>() if
1347 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1348 // than or equal to kMinOutOfRangeExpField.
1349
1350 return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1351 MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1352 ? static_cast<ToT>(val)
1353 : static_cast<ToT>(LimitsMin<ToT>());
1354}
1355
1356} // namespace detail
1357
1358#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
1359#undef HWY_NATIVE_PROMOTE_F16_TO_F64
1360#else
1361#define HWY_NATIVE_PROMOTE_F16_TO_F64
1362#endif
1363
1364template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
1365HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
1366 static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
1367 // For bits Y > X, floatX->floatY and intX->intY are always representable.
1368 return Vec1<TTo>(
1369 detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
1370}
1371
1372#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1373#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1374#else
1375#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1376#endif
1377
1378template <class DTo, HWY_IF_UI64_D(DTo)>
1379HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* d_to */, Vec1<float> from) {
1380 using TTo = TFromD<DTo>;
1381 return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));
1382}
1383
1384// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1385// so we overload for TFrom=double and TTo={float,int32_t}.
1386template <class D, HWY_IF_F32_D(D)>
1387HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) {
1388 // Prevent ubsan errors when converting float to narrower integer/float
1389 if (IsInf(from).bits ||
1390 Abs(from).raw > static_cast<double>(HighestValue<float>())) {
1391 return Vec1<float>(ScalarSignBit(from.raw) ? LowestValue<float>()
1392 : HighestValue<float>());
1393 }
1394 return Vec1<float>(static_cast<float>(from.raw));
1395}
1396template <class D, HWY_IF_UI32_D(D)>
1397HWY_API Vec1<TFromD<D>> DemoteTo(D /* tag */, Vec1<double> from) {
1398 // Prevent ubsan errors when converting double to a narrower integer
1399 return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(from.raw));
1400}
1401
1402template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1403 HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
1404HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1405 static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
1406 static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1407
1408 // Int to int: choose closest value in TTo to `from` (avoids UB)
1409 from.raw = HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw), LimitsMax<TTo>());
1410 return Vec1<TTo>(static_cast<TTo>(from.raw));
1411}
1412
1413// Disable the default unsigned to signed DemoteTo implementation in
1414// generic_ops-inl.h on SCALAR as the SCALAR target has a target-specific
1415// implementation of the unsigned to signed DemoteTo op and as ReorderDemote2To
1416// is not supported on the SCALAR target
1417
1418// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
1419// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
1420// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
1421// SFINAE to occur instead of a hard error due to a dependency on the V template
1422// argument
1423#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
1424#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
1425 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
1426
1427template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1428 HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
1429HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1430 static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
1431 static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1432
1433 const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
1434
1435 // Int to int: choose closest value in TTo to `from` (avoids UB)
1436 return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
1437}
1438
1439template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1440 HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)>
1441HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1442 // int64_t/uint64_t to float: simply cast to TTo
1443 return Vec1<TTo>(static_cast<TTo>(from.raw));
1444}
1445
1446#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1447#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1448#else
1449#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1450#endif
1451
1452template <class D32, HWY_IF_UI32_D(D32)>
1453HWY_API VFromD<D32> DemoteInRangeTo(D32 /* d32 */,
1454 VFromD<Rebind<double, D32>> v) {
1455 using TTo = TFromD<D32>;
1456 return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
1457}
1458
1459// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
1460// use this scalar version to verify the vector implementation.
1461#ifdef HWY_NATIVE_F16C
1462#undef HWY_NATIVE_F16C
1463#else
1464#define HWY_NATIVE_F16C
1465#endif
1466
1467template <class D, HWY_IF_F32_D(D)>
1468HWY_API Vec1<float> PromoteTo(D /* tag */, const Vec1<float16_t> v) {
1469 return Vec1<float>(F32FromF16(v.raw));
1470}
1471
1472template <class D, HWY_IF_F32_D(D)>
1473HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) {
1474 return Set(d, F32FromBF16(v.raw));
1475}
1476
1477template <class DTo, typename TFrom>
1478HWY_API VFromD<DTo> PromoteEvenTo(DTo d_to, Vec1<TFrom> v) {
1479 return PromoteTo(d_to, v);
1480}
1481
1482template <class D, HWY_IF_F16_D(D)>
1483HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
1484 return Vec1<float16_t>(F16FromF32(v.raw));
1485}
1486
1487#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
1488#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
1489#else
1490#define HWY_NATIVE_DEMOTE_F32_TO_BF16
1491#endif
1492
1493template <class D, HWY_IF_BF16_D(D)>
1494HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
1495 return Set(d, BF16FromF32(v.raw));
1496}
1497
1498template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1499 HWY_IF_FLOAT(TFrom)>
1500HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
1501 static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
1502 // float## -> int##: return closest representable value.
1503 return Vec1<TTo>(detail::CastValueForF2IConv<TTo>(from.raw));
1504}
1505
1506template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1507 HWY_IF_NOT_FLOAT(TFrom)>
1508HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
1509 static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
1510 // int## -> float##: no check needed
1511 return Vec1<TTo>(static_cast<TTo>(from.raw));
1512}
1513
1514#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1515#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1516#else
1517#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1518#endif
1519
1520template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
1521 HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
1522HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
1523 using TTo = TFromD<DI>;
1524 return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
1525}
1526
1527HWY_API Vec1<uint32_t> U32FromU8(const Vec1<uint8_t> v) {
1528 return PromoteTo(Sisd<uint32_t>(), v);
1529}
1530
1531// ------------------------------ TruncateTo
1532
1533template <class D, HWY_IF_U8_D(D)>
1534HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
1535 return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
1536}
1537
1538template <class D, HWY_IF_U16_D(D)>
1539HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
1540 return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
1541}
1542
1543template <class D, HWY_IF_U32_D(D)>
1544HWY_API Vec1<uint32_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
1545 return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
1546}
1547
1548template <class D, HWY_IF_U8_D(D)>
1549HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
1550 return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
1551}
1552
1553template <class D, HWY_IF_U16_D(D)>
1554HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
1555 return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
1556}
1557
1558template <class D, HWY_IF_U8_D(D)>
1559HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint16_t> v) {
1560 return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
1561}
1562
1563// ================================================== COMBINE
1564// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1565
1566template <typename T>
1567HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
1568 return v;
1569}
1570
1571template <class D, typename T = TFromD<D>>
1572HWY_API Vec1<T> LowerHalf(D /* tag */, Vec1<T> v) {
1573 return v;
1574}
1575
1576// ================================================== SWIZZLE
1577
1578template <typename T>
1579HWY_API T GetLane(const Vec1<T> v) {
1580 return v.raw;
1581}
1582
1583template <typename T>
1584HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
1585 HWY_DASSERT(i == 0);
1586 (void)i;
1587 return v.raw;
1588}
1589
1590template <typename T>
1591HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
1592 HWY_DASSERT(i == 0);
1593 (void)i;
1594 v.raw = t;
1595 return v;
1596}
1597
1598template <typename T>
1599HWY_API Vec1<T> DupEven(Vec1<T> v) {
1600 return v;
1601}
1602// DupOdd is unsupported.
1603
1604template <typename T>
1605HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
1606 return even;
1607}
1608
1609template <typename T>
1610HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
1611 return even;
1612}
1613
1614// ------------------------------ SwapAdjacentBlocks
1615
1616template <typename T>
1617HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
1618 return v;
1619}
1620
1621// ------------------------------ TableLookupLanes
1622
1623// Returned by SetTableIndices for use by TableLookupLanes.
1624template <typename T>
1625struct Indices1 {
1626 MakeSigned<T> raw;
1627};
1628
1629template <class D, typename T = TFromD<D>, typename TI>
1630HWY_API Indices1<T> IndicesFromVec(D /* tag */, Vec1<TI> vec) {
1631 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1632 HWY_DASSERT(vec.raw <= 1);
1633 return Indices1<T>{static_cast<MakeSigned<T>>(vec.raw)};
1634}
1635
1636template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI>
1637HWY_API Indices1<T> SetTableIndices(D d, const TI* idx) {
1638 return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
1639}
1640
1641template <typename T>
1642HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
1643 return v;
1644}
1645
1646template <typename T>
1647HWY_API Vec1<T> TwoTablesLookupLanes(const Vec1<T> a, const Vec1<T> b,
1648 const Indices1<T> idx) {
1649 return (idx.raw == 0) ? a : b;
1650}
1651
1652// ------------------------------ ReverseBlocks
1653
1654// Single block: no change
1655template <class D, typename T = TFromD<D>>
1656HWY_API Vec1<T> ReverseBlocks(D /* tag */, const Vec1<T> v) {
1657 return v;
1658}
1659
1660// ------------------------------ Reverse
1661
1662template <class D, typename T = TFromD<D>>
1663HWY_API Vec1<T> Reverse(D /* tag */, const Vec1<T> v) {
1664 return v;
1665}
1666
1667// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
1668#ifdef HWY_NATIVE_REVERSE2_8
1669#undef HWY_NATIVE_REVERSE2_8
1670#else
1671#define HWY_NATIVE_REVERSE2_8
1672#endif
1673
1674// Must not be called:
1675template <class D, typename T = TFromD<D>>
1676HWY_API Vec1<T> Reverse2(D /* tag */, const Vec1<T> v) {
1677 return v;
1678}
1679
1680template <class D, typename T = TFromD<D>>
1681HWY_API Vec1<T> Reverse4(D /* tag */, const Vec1<T> v) {
1682 return v;
1683}
1684
1685template <class D, typename T = TFromD<D>>
1686HWY_API Vec1<T> Reverse8(D /* tag */, const Vec1<T> v) {
1687 return v;
1688}
1689
1690// ------------------------------ ReverseLaneBytes
1691
1692#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
1693#undef HWY_NATIVE_REVERSE_LANE_BYTES
1694#else
1695#define HWY_NATIVE_REVERSE_LANE_BYTES
1696#endif
1697
1698HWY_API Vec1<uint16_t> ReverseLaneBytes(Vec1<uint16_t> v) {
1699 const uint32_t val{v.raw};
1700 return Vec1<uint16_t>(
1701 static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu)));
1702}
1703
1704HWY_API Vec1<uint32_t> ReverseLaneBytes(Vec1<uint32_t> v) {
1705 const uint32_t val = v.raw;
1706 return Vec1<uint32_t>(static_cast<uint32_t>(
1707 ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) |
1708 ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu)));
1709}
1710
1711HWY_API Vec1<uint64_t> ReverseLaneBytes(Vec1<uint64_t> v) {
1712 const uint64_t val = v.raw;
1713 return Vec1<uint64_t>(static_cast<uint64_t>(
1714 ((val << 56) & 0xFF00000000000000u) |
1715 ((val << 40) & 0x00FF000000000000u) |
1716 ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) |
1717 ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) |
1718 ((val >> 40) & 0x000000000000FF00u) |
1719 ((val >> 56) & 0x00000000000000FFu)));
1720}
1721
1722template <class V, HWY_IF_SIGNED_V(V),
1723 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
1724HWY_API V ReverseLaneBytes(V v) {
1725 const DFromV<decltype(v)> d;
1726 const RebindToUnsigned<decltype(d)> du;
1727 return BitCast(d, ReverseLaneBytes(BitCast(du, v)));
1728}
1729
1730// ------------------------------ ReverseBits
1731#ifdef HWY_NATIVE_REVERSE_BITS_UI8
1732#undef HWY_NATIVE_REVERSE_BITS_UI8
1733#else
1734#define HWY_NATIVE_REVERSE_BITS_UI8
1735#endif
1736
1737#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
1738#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
1739#else
1740#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
1741#endif
1742
1743namespace detail {
1744
1745template <class T>
1746HWY_INLINE T ReverseBitsOfEachByte(T val) {
1747 using TU = MakeUnsigned<T>;
1748 constexpr TU kMaxUnsignedVal{LimitsMax<TU>()};
1749 constexpr TU kShrMask1 =
1750 static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal);
1751 constexpr TU kShrMask2 =
1752 static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal);
1753 constexpr TU kShrMask3 =
1754 static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal);
1755
1756 constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1);
1757 constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2);
1758 constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3);
1759
1760 TU result = static_cast<TU>(val);
1761 result = static_cast<TU>(((result << 1) & kShlMask1) |
1762 ((result >> 1) & kShrMask1));
1763 result = static_cast<TU>(((result << 2) & kShlMask2) |
1764 ((result >> 2) & kShrMask2));
1765 result = static_cast<TU>(((result << 4) & kShlMask3) |
1766 ((result >> 4) & kShrMask3));
1767 return static_cast<T>(result);
1768}
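// Example (illustrative): ReverseBitsOfEachByte(uint8_t{0b00000001}) yields
// 0b10000000; the three stages swap adjacent bits, then bit pairs, then
// nibbles within every byte.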
1769
1770} // namespace detail
1771
1772template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)>
1773HWY_API V ReverseBits(V v) {
1774 return V(detail::ReverseBitsOfEachByte(v.raw));
1775}
1776
1777template <class V, HWY_IF_UNSIGNED_V(V),
1778 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
1779HWY_API V ReverseBits(V v) {
1780 return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw)));
1781}
1782
1783template <class V, HWY_IF_SIGNED_V(V)>
1784HWY_API V ReverseBits(V v) {
1785 const DFromV<decltype(v)> d;
1786 const RebindToUnsigned<decltype(d)> du;
1787 return BitCast(d, ReverseBits(BitCast(du, v)));
1788}
1789
1790// ------------------------------ SlideUpLanes
1791
1792template <typename D>
1793HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
1794 return v;
1795}
1796
1797// ------------------------------ SlideDownLanes
1798
1799template <typename D>
1800HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
1801 return v;
1802}
1803
1804// ================================================== BLOCKWISE
1805// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1806
1807// ------------------------------ Broadcast/splat any lane
1808
1809template <int kLane, typename T>
1810HWY_API Vec1<T> Broadcast(const Vec1<T> v) {
1811 static_assert(kLane == 0, "Scalar only has one lane");
1812 return v;
1813}
1814
1815// ------------------------------ TableLookupBytes, TableLookupBytesOr0
1816
1817template <typename T, typename TI>
1818HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
1819 uint8_t in_bytes[sizeof(T)];
1820 uint8_t idx_bytes[sizeof(T)];
1821 uint8_t out_bytes[sizeof(T)];
1822 CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
1823 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1824 for (size_t i = 0; i < sizeof(T); ++i) {
1825 out_bytes[i] = in_bytes[idx_bytes[i]];
1826 }
1827 TI out;
1828 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1829 return Vec1<TI>{out};
1830}
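// Example (illustrative): for T=uint32_t, index bytes {3, 2, 1, 0} reverse the
// byte order of `in`, making this equivalent to ReverseLaneBytes.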
1831
1832template <typename T, typename TI>
1833HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
1834 uint8_t in_bytes[sizeof(T)];
1835 uint8_t idx_bytes[sizeof(T)];
1836 uint8_t out_bytes[sizeof(T)];
1837 CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
1838 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1839 for (size_t i = 0; i < sizeof(T); ++i) {
1840 out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1841 }
1842 TI out;
1843 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1844 return Vec1<TI>{out};
1845}
1846
1847// ------------------------------ ZipLower
1848
1849HWY_API Vec1<uint16_t> ZipLower(Vec1<uint8_t> a, Vec1<uint8_t> b) {
1850 return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw));
1851}
1852HWY_API Vec1<uint32_t> ZipLower(Vec1<uint16_t> a, Vec1<uint16_t> b) {
1853 return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw);
1854}
1855HWY_API Vec1<uint64_t> ZipLower(Vec1<uint32_t> a, Vec1<uint32_t> b) {
1856 return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw);
1857}
1858HWY_API Vec1<int16_t> ZipLower(Vec1<int8_t> a, Vec1<int8_t> b) {
1859 return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw));
1860}
1861HWY_API Vec1<int32_t> ZipLower(Vec1<int16_t> a, Vec1<int16_t> b) {
1862 return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw);
1863}
1864HWY_API Vec1<int64_t> ZipLower(Vec1<int32_t> a, Vec1<int32_t> b) {
1865 return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw);
1866}
1867
1868template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>>
1869HWY_API Vec1<TW> ZipLower(DW /* tag */, Vec1<TN> a, Vec1<TN> b) {
1870 return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw));
1871}
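// Example (illustrative): ZipLower(Vec1<uint8_t>(0x12), Vec1<uint8_t>(0x34))
// yields Vec1<uint16_t>(0x3412): b supplies the upper half, a the lower.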
1872
1873// ================================================== MASK
1874
1875template <class D, typename T = TFromD<D>>
1876HWY_API bool AllFalse(D /* tag */, const Mask1<T> mask) {
1877 return mask.bits == 0;
1878}
1879
1880template <class D, typename T = TFromD<D>>
1881HWY_API bool AllTrue(D /* tag */, const Mask1<T> mask) {
1882 return mask.bits != 0;
1883}
1884
1885// `p` points to at least 8 readable bytes, not all of which need be valid.
1886template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1887HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) {
1888 return Mask1<T>::FromBool((bits[0] & 1) != 0);
1889}
1890
1891template <class D, HWY_IF_LANES_D(D, 1)>
1892HWY_API MFromD<D> Dup128MaskFromMaskBits(D /*d*/, unsigned mask_bits) {
1893 return MFromD<D>::FromBool((mask_bits & 1) != 0);
1894}
1895
1896// `p` points to at least 8 writable bytes.
1897template <class D, typename T = TFromD<D>>
1898HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) {
1899 *bits = AllTrue(d, mask);
1900 return 1;
1901}
1902
1903template <class D, typename T = TFromD<D>>
1904HWY_API size_t CountTrue(D /* tag */, const Mask1<T> mask) {
1905 return mask.bits == 0 ? 0 : 1;
1906}
1907
1908template <class D, typename T = TFromD<D>>
1909HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1<T> mask) {
1910 return mask.bits == 0 ? -1 : 0;
1911}
1912
1913template <class D, typename T = TFromD<D>>
1914HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1<T> /* m */) {
1915 return 0; // There is only one lane and we know it is true.
1916}
1917
1918template <class D, typename T = TFromD<D>>
1919HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1<T> mask) {
1920 return mask.bits == 0 ? -1 : 0;
1921}
1922
1923template <class D, typename T = TFromD<D>>
1924HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1<T> /* m */) {
1925 return 0; // There is only one lane and we know it is true.
1926}
1927
1928// ------------------------------ Compress, CompressBits
1929
1930template <typename T>
1931struct CompressIsPartition {
1932 enum { value = 1 };
1933};
1934
1935template <typename T>
1936HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
1937 // A single lane is already partitioned by definition.
1938 return v;
1939}
1940
1941template <typename T>
1942HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
1943 // A single lane is already partitioned by definition.
1944 return v;
1945}
1946
1947// ------------------------------ CompressStore
1948template <class D, typename T = TFromD<D>>
1949HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, D d,
1950 T* HWY_RESTRICT unaligned) {
1951 StoreU(Compress(v, mask), d, unaligned);
1952 return CountTrue(d, mask);
1953}
1954
1955// ------------------------------ CompressBlendedStore
1956template <class D, typename T = TFromD<D>>
1957HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, D d,
1958 T* HWY_RESTRICT unaligned) {
1959 if (!mask.bits) return 0;
1960 StoreU(v, d, unaligned);
1961 return 1;
1962}
1963
1964// ------------------------------ CompressBits
1965template <typename T>
1966HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
1967 return v;
1968}
1969
1970// ------------------------------ CompressBitsStore
1971template <class D, typename T = TFromD<D>>
1972HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1973 D d, T* HWY_RESTRICT unaligned) {
1974 const Mask1<T> mask = LoadMaskBits(d, bits);
1975 StoreU(Compress(v, mask), d, unaligned);
1976 return CountTrue(d, mask);
1977}
1978
1979// ------------------------------ Expand
1980
1981// generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here.
1982#ifdef HWY_NATIVE_EXPAND
1983#undef HWY_NATIVE_EXPAND
1984#else
1985#define HWY_NATIVE_EXPAND
1986#endif
1987
1988template <typename T>
1989HWY_API Vec1<T> Expand(Vec1<T> v, const Mask1<T> mask) {
1990 return IfThenElseZero(mask, v);
1991}
1992
1993// ------------------------------ LoadExpand
1994template <class D>
1995HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
1996 const TFromD<D>* HWY_RESTRICT unaligned) {
1997 return MaskedLoad(mask, d, unaligned);
1998}
1999
2000// ------------------------------ WidenMulPairwiseAdd
2001
2002template <class D32, HWY_IF_F32_D(D32)>
2003HWY_API Vec1<float> WidenMulPairwiseAdd(D32 /* tag */, Vec1<bfloat16_t> a,
2004 Vec1<bfloat16_t> b) {
2005 return Vec1<float>(F32FromBF16(a.raw) * F32FromBF16(b.raw));
2006}
2007
2008template <class D32, HWY_IF_I32_D(D32)>
2009HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
2010 Vec1<int16_t> b) {
2011 return Vec1<int32_t>(a.raw * b.raw);
2012}
2013
2014// ------------------------------ SatWidenMulAccumFixedPoint
2015#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
2016#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
2017#else
2018#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
2019#endif
2020
2021template <class DI32, HWY_IF_I32_D(DI32)>
2022HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
2023 VFromD<Rebind<int16_t, DI32>> a,
2024 VFromD<Rebind<int16_t, DI32>> b,
2025 VFromD<DI32> sum) {
2026 // Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw)
2027 // followed by an addition of the product is okay as
2028 // (a.raw * b.raw * 2) is between -2147418112 and 2147483648 and as
2029 // a.raw * b.raw * 2 can only overflow an int32_t if both a.raw and b.raw are
2030 // equal to -32768.
2031
2032 const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
2033 static_cast<int32_t>(b.raw));
2034 const VFromD<DI32> product2 = Add(product, product);
2035
2036 const auto mul_overflow =
2037 VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>())));
2038
2039 return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
2040 Add(product2, mul_overflow));
2041}
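// Worked example (illustrative; hn as above): per the comment above, the only
// doubling product that overflows is (-32768) * (-32768) * 2 == 2^31; the
// fixup substitutes LimitsMax<int32_t>() before the saturated accumulation.
//
//   const hn::ScalableTag<int32_t> di32;
//   const hn::Rebind<int16_t, decltype(di32)> di16;
//   const auto r = hn::SatWidenMulAccumFixedPoint(
//       di32, hn::Set(di16, -32768), hn::Set(di16, -32768), hn::Set(di32, 0));
//   // hn::GetLane(r) == 2147483647 (saturated), not the wrapped -2^31.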
2042
2043// ------------------------------ SatWidenMulPairwiseAdd
2044
2045#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
2046#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
2047#else
2048#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
2049#endif
2050
2051template <class DI16, HWY_IF_I16_D(DI16)>
2052HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,
2053 Vec1<int8_t> b) {
2054 // Saturation of a.raw * b.raw is not needed on the HWY_SCALAR target as the
2055 // input vectors only have 1 lane on the HWY_SCALAR target and as
2056 // a.raw * b.raw is between -32640 and 32385, which is already within the
2057 // range of an int16_t.
2058
2059 // On other targets, a saturated addition of a[0]*b[0] + a[1]*b[1] is needed
2060 // as it is possible for the addition of a[0]*b[0] + a[1]*b[1] to overflow if
2061 // a[0], a[1], b[0], and b[1] are all non-zero and b[0] and b[1] both have the
2062 // same sign.
2063
2064 return Vec1<int16_t>(static_cast<int16_t>(a.raw) *
2065 static_cast<int16_t>(b.raw));
2066}
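// Worked example (illustrative; hn as above): the extreme products of a
// uint8_t and an int8_t are 255 * (-128) == -32640 and 255 * 127 == 32385,
// both within int16_t, so the single-lane case needs no saturation.
//
//   const hn::ScalableTag<int16_t> di16;
//   const hn::Rebind<uint8_t, decltype(di16)> du8;
//   const hn::Rebind<int8_t, decltype(di16)> di8;
//   const auto r = hn::SatWidenMulPairwiseAdd(di16, hn::Set(du8, 255),
//                                             hn::Set(di8, -128));
//   // hn::GetLane(r) == -32640.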
2067
2068// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2069
2070template <class D32, HWY_IF_F32_D(D32)>
2071HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
2072 Vec1<bfloat16_t> b,
2073 const Vec1<float> sum0,
2074 Vec1<float>& /* sum1 */) {
2075 return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
2076 Vec1<float>(F32FromBF16(b.raw)), sum0);
2077}
2078
2079template <class D32, HWY_IF_I32_D(D32)>
2080HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<int16_t> a,
2081 Vec1<int16_t> b,
2082 const Vec1<int32_t> sum0,
2083 Vec1<int32_t>& /* sum1 */) {
2084 return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
2085}
2086
2087template <class DU32, HWY_IF_U32_D(DU32)>
2088HWY_API Vec1<uint32_t> ReorderWidenMulAccumulate(DU32 /* tag */,
2089 Vec1<uint16_t> a,
2090 Vec1<uint16_t> b,
2091 const Vec1<uint32_t> sum0,
2092 Vec1<uint32_t>& /* sum1 */) {
2093 return Vec1<uint32_t>(static_cast<uint32_t>(a.raw) * b.raw + sum0.raw);
2094}
2095
2096// ------------------------------ RearrangeToOddPlusEven
2097template <typename TW>
2098HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) {
2099 return sum0; // invariant already holds
2100}
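// Usage sketch (illustrative; hn as above): wider targets may split the
// accumulation across sum0 and sum1 in an implementation-defined lane order,
// which RearrangeToOddPlusEven undoes. Here sum1 is never written and sum0
// already holds the final value.
//
//   const hn::ScalableTag<int32_t> d32;
//   const hn::Rebind<int16_t, decltype(d32)> d16;
//   auto sum0 = hn::Set(d32, 10);
//   auto sum1 = hn::Zero(d32);
//   sum0 = hn::ReorderWidenMulAccumulate(d32, hn::Set(d16, 3),
//                                        hn::Set(d16, 3), sum0, sum1);
//   // hn::GetLane(hn::RearrangeToOddPlusEven(sum0, sum1)) == 10 + 3 * 3.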
2101
2102// ================================================== REDUCTIONS
2103
2104// Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.
2105
2106// NOLINTNEXTLINE(google-readability-namespace-comments)
2107} // namespace HWY_NAMESPACE
2108} // namespace hwy