Grok 12.0.1
wasm_256-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 256-bit WASM vectors and operations. Experimental.
17// External include guard in highway.h - see comment there.
18
19// For half-width vectors. Already includes base.h and shared-inl.h.
21
23namespace hwy {
24namespace HWY_NAMESPACE {
25
26template <typename T>
27class Vec256 {
28 public:
29 using PrivateT = T; // only for DFromV
30 static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
31
32 // Compound assignment. Only usable if there is a corresponding non-member
33 // binary operator overload. For example, only f32 and f64 support division.
35 return *this = (*this * other);
36 }
38 return *this = (*this / other);
39 }
41 return *this = (*this + other);
42 }
44 return *this = (*this - other);
45 }
47 return *this = (*this % other);
48 }
50 return *this = (*this & other);
51 }
53 return *this = (*this | other);
54 }
56 return *this = (*this ^ other);
57 }
58
61};
62
63template <typename T>
68
69// ------------------------------ Zero
70
71// Avoid VFromD here because it is defined in terms of Zero.
72template <class D, HWY_IF_V_SIZE_D(D, 32)>
74 const Half<decltype(d)> dh;
76 ret.v0 = ret.v1 = Zero(dh);
77 return ret;
78}
79
80// ------------------------------ BitCast
81template <class D, typename TFrom>
83 const Half<decltype(d)> dh;
84 VFromD<D> ret;
85 ret.v0 = BitCast(dh, v.v0);
86 ret.v1 = BitCast(dh, v.v1);
87 return ret;
88}
89
90// ------------------------------ ResizeBitCast
91
92// 32-byte vector to 32-byte vector: Same as BitCast
93template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32),
94 HWY_IF_V_SIZE_D(D, 32)>
95HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
96 return BitCast(d, v);
97}
98
99// <= 16-byte vector to 32-byte vector
100template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
101 HWY_IF_V_SIZE_D(D, 32)>
102HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
103 const Half<decltype(d)> dh;
104 VFromD<D> ret;
105 ret.v0 = ResizeBitCast(dh, v);
106 ret.v1 = Zero(dh);
107 return ret;
108}
109
110// 32-byte vector to <= 16-byte vector
111template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32),
112 HWY_IF_V_SIZE_LE_D(D, 16)>
113HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
114 return ResizeBitCast(d, v.v0);
115}
116
117// ------------------------------ Set
118template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
119HWY_API VFromD<D> Set(D d, const T2 t) {
120 const Half<decltype(d)> dh;
121 VFromD<D> ret;
122 ret.v0 = ret.v1 = Set(dh, static_cast<TFromD<D>>(t));
123 return ret;
124}
125
126// Undefined, Iota defined in wasm_128.
127
128// ------------------------------ Dup128VecFromValues
129template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
130HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
131 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
132 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
133 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
134 TFromD<D> t11, TFromD<D> t12,
135 TFromD<D> t13, TFromD<D> t14,
136 TFromD<D> t15) {
137 const Half<decltype(d)> dh;
138 VFromD<D> ret;
139 ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t8,
140 t9, t10, t11, t12, t13, t14, t15);
141 return ret;
142}
143
144template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
145HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
146 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
147 TFromD<D> t5, TFromD<D> t6,
148 TFromD<D> t7) {
149 const Half<decltype(d)> dh;
150 VFromD<D> ret;
151 ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7);
152 return ret;
153}
154
155template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
156HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
157 TFromD<D> t2, TFromD<D> t3) {
158 const Half<decltype(d)> dh;
159 VFromD<D> ret;
160 ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3);
161 return ret;
162}
163
164template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
165HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
166 const Half<decltype(d)> dh;
167 VFromD<D> ret;
168 ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1);
169 return ret;
170}
171
172// ================================================== ARITHMETIC
173
174template <typename T>
176 a.v0 += b.v0;
177 a.v1 += b.v1;
178 return a;
179}
180
181template <typename T>
183 a.v0 -= b.v0;
184 a.v1 -= b.v1;
185 return a;
186}
187
188// ------------------------------ SumsOf8
191 ret.v0 = SumsOf8(v.v0);
192 ret.v1 = SumsOf8(v.v1);
193 return ret;
194}
195
197 Vec256<int64_t> ret;
198 ret.v0 = SumsOf8(v.v0);
199 ret.v1 = SumsOf8(v.v1);
200 return ret;
201}
202
203template <typename T>
205 a.v0 = SaturatedAdd(a.v0, b.v0);
206 a.v1 = SaturatedAdd(a.v1, b.v1);
207 return a;
208}
209
210template <typename T>
212 a.v0 = SaturatedSub(a.v0, b.v0);
213 a.v1 = SaturatedSub(a.v1, b.v1);
214 return a;
215}
216
217template <typename T>
219 a.v0 = AverageRound(a.v0, b.v0);
220 a.v1 = AverageRound(a.v1, b.v1);
221 return a;
222}
223
224template <typename T>
226 v.v0 = Abs(v.v0);
227 v.v1 = Abs(v.v1);
228 return v;
229}
230
231// ------------------------------ Shift lanes by constant #bits
232
233template <int kBits, typename T>
235 v.v0 = ShiftLeft<kBits>(v.v0);
236 v.v1 = ShiftLeft<kBits>(v.v1);
237 return v;
238}
239
240template <int kBits, typename T>
242 v.v0 = ShiftRight<kBits>(v.v0);
243 v.v1 = ShiftRight<kBits>(v.v1);
244 return v;
245}
246
247// ------------------------------ RotateRight (ShiftRight, Or)
248template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
250 const DFromV<decltype(v)> d;
251 const RebindToUnsigned<decltype(d)> du;
252
253 constexpr size_t kSizeInBits = sizeof(T) * 8;
254 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
255 if (kBits == 0) return v;
256
257 return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
258 ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
259}
260
261// ------------------------------ Shift lanes by same variable #bits
262
263template <typename T>
265 v.v0 = ShiftLeftSame(v.v0, bits);
266 v.v1 = ShiftLeftSame(v.v1, bits);
267 return v;
268}
269
270template <typename T>
272 v.v0 = ShiftRightSame(v.v0, bits);
273 v.v1 = ShiftRightSame(v.v1, bits);
274 return v;
275}
276
277// ------------------------------ Min, Max
278template <typename T>
280 a.v0 = Min(a.v0, b.v0);
281 a.v1 = Min(a.v1, b.v1);
282 return a;
283}
284
285template <typename T>
287 a.v0 = Max(a.v0, b.v0);
288 a.v1 = Max(a.v1, b.v1);
289 return a;
290}
291// ------------------------------ Integer multiplication
292
293template <typename T>
295 a.v0 *= b.v0;
296 a.v1 *= b.v1;
297 return a;
298}
299
300template <typename T>
302 a.v0 = MulHigh(a.v0, b.v0);
303 a.v1 = MulHigh(a.v1, b.v1);
304 return a;
305}
306
307template <typename T>
309 a.v0 = MulFixedPoint15(a.v0, b.v0);
310 a.v1 = MulFixedPoint15(a.v1, b.v1);
311 return a;
312}
313
314// Cannot use MakeWide because that returns uint128_t for uint64_t, but we want
315// uint64_t.
316template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
320 ret.v0 = MulEven(a.v0, b.v0);
321 ret.v1 = MulEven(a.v1, b.v1);
322 return ret;
323}
324template <class T, HWY_IF_UI64(T)>
326 Vec256<T> ret;
327 ret.v0 = MulEven(a.v0, b.v0);
328 ret.v1 = MulEven(a.v1, b.v1);
329 return ret;
330}
331
332template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
336 ret.v0 = MulOdd(a.v0, b.v0);
337 ret.v1 = MulOdd(a.v1, b.v1);
338 return ret;
339}
340template <class T, HWY_IF_UI64(T)>
342 Vec256<T> ret;
343 ret.v0 = MulOdd(a.v0, b.v0);
344 ret.v1 = MulOdd(a.v1, b.v1);
345 return ret;
346}
347
348// ------------------------------ Negate
349template <typename T>
351 v.v0 = Neg(v.v0);
352 v.v1 = Neg(v.v1);
353 return v;
354}
355
356// ------------------------------ AbsDiff
357// generic_ops takes care of integer T.
358template <typename T, HWY_IF_FLOAT(T)>
360 return Abs(a - b);
361}
362
363// ------------------------------ Floating-point division
364// generic_ops takes care of integer T.
365template <typename T, HWY_IF_FLOAT(T)>
367 a.v0 /= b.v0;
368 a.v1 /= b.v1;
369 return a;
370}
371
372// Approximate reciprocal
374 const Vec256<float> one = Set(Full256<float>(), 1.0f);
375 return one / v;
376}
377
378// ------------------------------ Floating-point multiply-add variants
379
381 Vec256<float> add) {
382 mul.v0 = MulAdd(mul.v0, x.v0, add.v0);
383 mul.v1 = MulAdd(mul.v1, x.v1, add.v1);
384 return mul;
385}
386
388 Vec256<float> add) {
389 mul.v0 = NegMulAdd(mul.v0, x.v0, add.v0);
390 mul.v1 = NegMulAdd(mul.v1, x.v1, add.v1);
391 return mul;
392}
393
395 Vec256<float> sub) {
396 mul.v0 = MulSub(mul.v0, x.v0, sub.v0);
397 mul.v1 = MulSub(mul.v1, x.v1, sub.v1);
398 return mul;
399}
400
402 Vec256<float> sub) {
403 mul.v0 = NegMulSub(mul.v0, x.v0, sub.v0);
404 mul.v1 = NegMulSub(mul.v1, x.v1, sub.v1);
405 return mul;
406}
407
408// ------------------------------ Floating-point square root
409
410template <typename T>
412 v.v0 = Sqrt(v.v0);
413 v.v1 = Sqrt(v.v1);
414 return v;
415}
416
417// Approximate reciprocal square root
419 // TODO(eustas): find cheaper a way to calculate this.
420 const Vec256<float> one = Set(Full256<float>(), 1.0f);
421 return one / Sqrt(v);
422}
423
424// ------------------------------ Floating-point rounding
425
426// Toward nearest integer, ties to even
428 v.v0 = Round(v.v0);
429 v.v1 = Round(v.v1);
430 return v;
431}
432
433// Toward zero, aka truncate
435 v.v0 = Trunc(v.v0);
436 v.v1 = Trunc(v.v1);
437 return v;
438}
439
440// Toward +infinity, aka ceiling
442 v.v0 = Ceil(v.v0);
443 v.v1 = Ceil(v.v1);
444 return v;
445}
446
447// Toward -infinity, aka floor
449 v.v0 = Floor(v.v0);
450 v.v1 = Floor(v.v1);
451 return v;
452}
453
454// ------------------------------ Floating-point classification
455
456template <typename T>
458 return v != v;
459}
460
461template <typename T, HWY_IF_FLOAT(T)>
463 const DFromV<decltype(v)> d;
464 const RebindToUnsigned<decltype(d)> du;
465 const VFromD<decltype(du)> vu = BitCast(du, v);
466 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
467 return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
468}
469
470// Returns whether normal/subnormal/zero.
471template <typename T, HWY_IF_FLOAT(T)>
473 const DFromV<decltype(v)> d;
474 const RebindToUnsigned<decltype(d)> du;
475 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
476 const VFromD<decltype(du)> vu = BitCast(du, v);
477 // 'Shift left' to clear the sign bit, then right so we can compare with the
478 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
479 // negative and non-negative floats would be greater).
480 const VFromD<decltype(di)> exp =
481 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
482 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
483}
484
485// ================================================== COMPARE
486
487// Comparisons fill a lane with 1-bits if the condition is true, else 0.
488
489template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
491 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
492 return MFromD<DTo>{Mask128<TTo>{m.m0.raw}, Mask128<TTo>{m.m1.raw}};
493}
494
495template <typename T>
497 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
498 return (v & bit) == bit;
499}
500
501template <typename T>
504 m.m0 = operator==(a.v0, b.v0);
505 m.m1 = operator==(a.v1, b.v1);
506 return m;
507}
508
509template <typename T>
512 m.m0 = operator!=(a.v0, b.v0);
513 m.m1 = operator!=(a.v1, b.v1);
514 return m;
515}
516
517template <typename T>
520 m.m0 = operator<(a.v0, b.v0);
521 m.m1 = operator<(a.v1, b.v1);
522 return m;
523}
524
525template <typename T>
528 m.m0 = operator>(a.v0, b.v0);
529 m.m1 = operator>(a.v1, b.v1);
530 return m;
531}
532
533template <typename T>
536 m.m0 = operator<=(a.v0, b.v0);
537 m.m1 = operator<=(a.v1, b.v1);
538 return m;
539}
540
541template <typename T>
544 m.m0 = operator>=(a.v0, b.v0);
545 m.m1 = operator>=(a.v1, b.v1);
546 return m;
547}
548
549// ------------------------------ FirstN (Iota, Lt)
550
551template <class D, HWY_IF_V_SIZE_D(D, 32)>
552HWY_API MFromD<D> FirstN(const D d, size_t num) {
553 const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
554 using TI = TFromD<decltype(di)>;
555 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num)));
556}
557
558// ================================================== LOGICAL
559
560template <typename T>
562 v.v0 = Not(v.v0);
563 v.v1 = Not(v.v1);
564 return v;
565}
566
567template <typename T>
569 a.v0 = And(a.v0, b.v0);
570 a.v1 = And(a.v1, b.v1);
571 return a;
572}
573
574template <typename T>
576 not_mask.v0 = AndNot(not_mask.v0, mask.v0);
577 not_mask.v1 = AndNot(not_mask.v1, mask.v1);
578 return not_mask;
579}
580
581template <typename T>
583 a.v0 = Or(a.v0, b.v0);
584 a.v1 = Or(a.v1, b.v1);
585 return a;
586}
587
588template <typename T>
590 a.v0 = Xor(a.v0, b.v0);
591 a.v1 = Xor(a.v1, b.v1);
592 return a;
593}
594
595template <typename T>
597 return Xor(x1, Xor(x2, x3));
598}
599
600template <typename T>
602 return Or(o1, Or(o2, o3));
603}
604
605template <typename T>
607 return Or(o, And(a1, a2));
608}
609
610template <typename T>
612 return IfThenElse(MaskFromVec(mask), yes, no);
613}
614
615// ------------------------------ Operator overloads (internal-only if float)
616
617template <typename T>
619 return And(a, b);
620}
621
622template <typename T>
624 return Or(a, b);
625}
626
627template <typename T>
629 return Xor(a, b);
630}
631
632// ------------------------------ CopySign
633template <typename T>
635 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
636 const DFromV<decltype(magn)> d;
637 return BitwiseIfThenElse(SignBit(d), sign, magn);
638}
639
640// ------------------------------ CopySignToAbs
641template <typename T>
643 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
644 const DFromV<decltype(sign)> d;
645 return OrAnd(abs, SignBit(d), sign);
646}
647
648// ------------------------------ Mask
649
650// Mask and Vec are the same (true = FF..FF).
651template <typename T>
654 m.m0 = MaskFromVec(v.v0);
655 m.m1 = MaskFromVec(v.v1);
656 return m;
657}
658
659template <class D, typename T = TFromD<D>>
661 const Half<decltype(d)> dh;
662 Vec256<T> v;
663 v.v0 = VecFromMask(dh, m.m0);
664 v.v1 = VecFromMask(dh, m.m1);
665 return v;
666}
667
668// mask ? yes : no
669template <typename T>
671 yes.v0 = IfThenElse(mask.m0, yes.v0, no.v0);
672 yes.v1 = IfThenElse(mask.m1, yes.v1, no.v1);
673 return yes;
674}
675
676// mask ? yes : 0
677template <typename T>
679 return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
680}
681
682// mask ? 0 : no
683template <typename T>
685 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
686}
687
688template <typename T>
690 v.v0 = IfNegativeThenElse(v.v0, yes.v0, no.v0);
691 v.v1 = IfNegativeThenElse(v.v1, yes.v1, no.v1);
692 return v;
693}
694
695// ------------------------------ Mask logical
696
697template <typename T>
701
702template <typename T>
704 const Full256<T> d;
705 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
706}
707
708template <typename T>
710 const Full256<T> d;
711 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
712}
713
714template <typename T>
716 const Full256<T> d;
717 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
718}
719
720template <typename T>
722 const Full256<T> d;
723 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
724}
725
726template <typename T>
731
732// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
733template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
735 v.v0 = operator<<(v.v0, bits.v0);
736 v.v1 = operator<<(v.v1, bits.v1);
737 return v;
738}
739
740// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
741template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
743 v.v0 = operator>>(v.v0, bits.v0);
744 v.v1 = operator>>(v.v1, bits.v1);
745 return v;
746}
747
748// ------------------------------ BroadcastSignBit (compare, VecFromMask)
749
750template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
752 return ShiftRight<sizeof(T) * 8 - 1>(v);
753}
755 const DFromV<decltype(v)> d;
756 return VecFromMask(d, v < Zero(d));
757}
758
759// ================================================== MEMORY
760
761// ------------------------------ Load
762
763template <class D, HWY_IF_V_SIZE_D(D, 32)>
764HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
765 const Half<decltype(d)> dh;
766 VFromD<D> ret;
767 ret.v0 = Load(dh, aligned);
768 ret.v1 = Load(dh, aligned + Lanes(dh));
769 return ret;
770}
771
772template <class D, typename T = TFromD<D>>
774 return IfThenElseZero(m, Load(d, aligned));
775}
776
777template <class D, typename T = TFromD<D>>
779 const T* HWY_RESTRICT aligned) {
780 return IfThenElse(m, Load(d, aligned), v);
781}
782
783// LoadU == Load.
784template <class D, HWY_IF_V_SIZE_D(D, 32)>
785HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
786 return Load(d, p);
787}
788
789template <class D, HWY_IF_V_SIZE_D(D, 32)>
790HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
791 const Half<decltype(d)> dh;
792 VFromD<D> ret;
793 ret.v0 = ret.v1 = Load(dh, p);
794 return ret;
795}
796
797// ------------------------------ Store
798
799template <class D, typename T = TFromD<D>>
800HWY_API void Store(Vec256<T> v, D d, T* HWY_RESTRICT aligned) {
801 const Half<decltype(d)> dh;
802 Store(v.v0, dh, aligned);
803 Store(v.v1, dh, aligned + Lanes(dh));
804}
805
806// StoreU == Store.
807template <class D, typename T = TFromD<D>>
809 Store(v, d, p);
810}
811
812template <class D, typename T = TFromD<D>>
816
817// ------------------------------ Stream
818template <class D, typename T = TFromD<D>>
819HWY_API void Stream(Vec256<T> v, D d, T* HWY_RESTRICT aligned) {
820 // Same as aligned stores.
821 Store(v, d, aligned);
822}
823
824// ------------------------------ Scatter, Gather defined in wasm_128
825
826// ================================================== SWIZZLE
827
828// ------------------------------ ExtractLane
829template <typename T>
830HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
831 alignas(32) T lanes[32 / sizeof(T)];
832 Store(v, DFromV<decltype(v)>(), lanes);
833 return lanes[i];
834}
835
836// ------------------------------ InsertLane
837template <typename T>
838HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
839 DFromV<decltype(v)> d;
840 alignas(32) T lanes[32 / sizeof(T)];
841 Store(v, d, lanes);
842 lanes[i] = t;
843 return Load(d, lanes);
844}
845
846// ------------------------------ ExtractBlock
847template <int kBlockIdx, class T>
849 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
850 return (kBlockIdx == 0) ? v.v0 : v.v1;
851}
852
853// ------------------------------ InsertBlock
854template <int kBlockIdx, class T>
856 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
857 Vec256<T> result;
858 if (kBlockIdx == 0) {
859 result.v0 = blk_to_insert;
860 result.v1 = v.v1;
861 } else {
862 result.v0 = v.v0;
863 result.v1 = blk_to_insert;
864 }
865 return result;
866}
867
868// ------------------------------ BroadcastBlock
869template <int kBlockIdx, class T>
871 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
872 Vec256<T> result;
873 result.v0 = result.v1 = (kBlockIdx == 0 ? v.v0 : v.v1);
874 return result;
875}
876
877// ------------------------------ LowerHalf
878
879template <class D, typename T = TFromD<D>>
881 return v.v0;
882}
883
884template <typename T>
886 return v.v0;
887}
888
889// ------------------------------ GetLane (LowerHalf)
890template <typename T>
892 return GetLane(LowerHalf(v));
893}
894
895// ------------------------------ ShiftLeftBytes
896
897template <int kBytes, class D, typename T = TFromD<D>>
899 const Half<decltype(d)> dh;
900 v.v0 = ShiftLeftBytes<kBytes>(dh, v.v0);
901 v.v1 = ShiftLeftBytes<kBytes>(dh, v.v1);
902 return v;
903}
904
905template <int kBytes, typename T>
907 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
908}
909
910// ------------------------------ ShiftLeftLanes
911
912template <int kLanes, class D, typename T = TFromD<D>>
914 const Repartition<uint8_t, decltype(d)> d8;
915 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
916}
917
918template <int kLanes, typename T>
920 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
921}
922
923// ------------------------------ ShiftRightBytes
924template <int kBytes, class D, typename T = TFromD<D>>
926 const Half<decltype(d)> dh;
927 v.v0 = ShiftRightBytes<kBytes>(dh, v.v0);
928 v.v1 = ShiftRightBytes<kBytes>(dh, v.v1);
929 return v;
930}
931
932// ------------------------------ ShiftRightLanes
933template <int kLanes, class D, typename T = TFromD<D>>
935 const Repartition<uint8_t, decltype(d)> d8;
936 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
937}
938
939// ------------------------------ UpperHalf (ShiftRightBytes)
940template <class D, typename T = TFromD<D>>
941HWY_API Vec128<T> UpperHalf(D /* tag */, const Vec256<T> v) {
942 return v.v1;
943}
944
945// ------------------------------ CombineShiftRightBytes
946
947template <int kBytes, class D, typename T = TFromD<D>>
949 const Half<decltype(d)> dh;
950 hi.v0 = CombineShiftRightBytes<kBytes>(dh, hi.v0, lo.v0);
951 hi.v1 = CombineShiftRightBytes<kBytes>(dh, hi.v1, lo.v1);
952 return hi;
953}
954
955// ------------------------------ Broadcast/splat any lane
956
957template <int kLane, typename T>
959 Vec256<T> ret;
960 ret.v0 = Broadcast<kLane>(v.v0);
961 ret.v1 = Broadcast<kLane>(v.v1);
962 return ret;
963}
964
965template <int kLane, typename T>
967 constexpr int kLanesPerBlock = static_cast<int>(16 / sizeof(T));
968 static_assert(0 <= kLane && kLane < kLanesPerBlock * 2, "Invalid lane");
969 constexpr int kLaneInBlkIdx = kLane & (kLanesPerBlock - 1);
970 Vec256<T> ret;
971 ret.v0 = ret.v1 =
972 Broadcast<kLaneInBlkIdx>(kLane >= kLanesPerBlock ? v.v1 : v.v0);
973 return ret;
974}
975
976// ------------------------------ TableLookupBytes
977
978// Both full
979template <typename T, typename TI>
981 from.v0 = TableLookupBytes(bytes.v0, from.v0);
982 from.v1 = TableLookupBytes(bytes.v1, from.v1);
983 return from;
984}
985
986// Partial index vector
987template <typename T, typename TI, size_t NI>
989 const Vec128<TI, NI> from) {
990 // First expand to full 128, then 256.
991 const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
992 const auto tbl_full = TableLookupBytes(bytes, from_256);
993 // Shrink to 128, then partial.
994 return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
995}
996
997// Partial table vector
998template <typename T, size_t N, typename TI>
1000 // First expand to full 128, then 256.
1001 const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
1002 return TableLookupBytes(bytes_256, from);
1003}
1004
1005// Partial both are handled by wasm_128.
1006
1007template <class V, class VI>
1008HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
1009 // wasm out-of-bounds policy already zeros, so TableLookupBytes is fine.
1010 return TableLookupBytes(bytes, from);
1011}
1012
1013// ------------------------------ Hard-coded shuffles
1014
1015template <typename T>
1017 v.v0 = Shuffle01(v.v0);
1018 v.v1 = Shuffle01(v.v1);
1019 return v;
1020}
1021
1022template <typename T>
1024 v.v0 = Shuffle2301(v.v0);
1025 v.v1 = Shuffle2301(v.v1);
1026 return v;
1027}
1028
1029template <typename T>
1031 v.v0 = Shuffle1032(v.v0);
1032 v.v1 = Shuffle1032(v.v1);
1033 return v;
1034}
1035
1036template <typename T>
1038 v.v0 = Shuffle0321(v.v0);
1039 v.v1 = Shuffle0321(v.v1);
1040 return v;
1041}
1042
1043template <typename T>
1045 v.v0 = Shuffle2103(v.v0);
1046 v.v1 = Shuffle2103(v.v1);
1047 return v;
1048}
1049
1050template <typename T>
1052 v.v0 = Shuffle0123(v.v0);
1053 v.v1 = Shuffle0123(v.v1);
1054 return v;
1055}
1056
1057// Used by generic_ops-inl.h
1058namespace detail {
1059
1060template <typename T, HWY_IF_T_SIZE(T, 4)>
1062 a.v0 = ShuffleTwo2301(a.v0, b.v0);
1063 a.v1 = ShuffleTwo2301(a.v1, b.v1);
1064 return a;
1065}
1066template <typename T, HWY_IF_T_SIZE(T, 4)>
1068 a.v0 = ShuffleTwo1230(a.v0, b.v0);
1069 a.v1 = ShuffleTwo1230(a.v1, b.v1);
1070 return a;
1071}
1072template <typename T, HWY_IF_T_SIZE(T, 4)>
1074 a.v0 = ShuffleTwo3012(a.v0, b.v0);
1075 a.v1 = ShuffleTwo3012(a.v1, b.v1);
1076 return a;
1077}
1078
1079} // namespace detail
1080
1081// ------------------------------ TableLookupLanes
1082
1083// Returned by SetTableIndices for use by TableLookupLanes.
1084template <typename T>
1086 __v128_u i0;
1087 __v128_u i1;
1088};
1089
1090template <class D, typename T = TFromD<D>, typename TI>
1092 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
1093 Indices256<T> ret;
1094 ret.i0 = vec.v0.raw;
1095 ret.i1 = vec.v1.raw;
1096 return ret;
1097}
1098
1099template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
1101 const Rebind<TI, decltype(d)> di;
1102 return IndicesFromVec(d, LoadU(di, idx));
1103}
1104
1105template <typename T>
1107 const DFromV<decltype(v)> d;
1108 const Half<decltype(d)> dh;
1109 const auto idx_i0 = IndicesFromVec(dh, Vec128<T>{idx.i0});
1110 const auto idx_i1 = IndicesFromVec(dh, Vec128<T>{idx.i1});
1111
1112 Vec256<T> result;
1113 result.v0 = TwoTablesLookupLanes(v.v0, v.v1, idx_i0);
1114 result.v1 = TwoTablesLookupLanes(v.v0, v.v1, idx_i1);
1115 return result;
1116}
1117
1118template <typename T>
1120 // The out of bounds behavior will already zero lanes.
1121 return TableLookupLanesOr0(v, idx);
1122}
1123
1124template <typename T>
1126 Indices256<T> idx) {
1127 const DFromV<decltype(a)> d;
1128 const Half<decltype(d)> dh;
1129 const RebindToUnsigned<decltype(d)> du;
1130 using TU = MakeUnsigned<T>;
1131 constexpr size_t kLanesPerVect = 32 / sizeof(TU);
1132
1133 Vec256<TU> vi;
1134 vi.v0 = Vec128<TU>{idx.i0};
1135 vi.v1 = Vec128<TU>{idx.i1};
1136 const auto vmod = vi & Set(du, TU{kLanesPerVect - 1});
1137 const auto is_lo = RebindMask(d, vi == vmod);
1138
1139 const auto idx_i0 = IndicesFromVec(dh, vmod.v0);
1140 const auto idx_i1 = IndicesFromVec(dh, vmod.v1);
1141
1142 Vec256<T> result_lo;
1143 Vec256<T> result_hi;
1144 result_lo.v0 = TwoTablesLookupLanes(a.v0, a.v1, idx_i0);
1145 result_lo.v1 = TwoTablesLookupLanes(a.v0, a.v1, idx_i1);
1146 result_hi.v0 = TwoTablesLookupLanes(b.v0, b.v1, idx_i0);
1147 result_hi.v1 = TwoTablesLookupLanes(b.v0, b.v1, idx_i1);
1148 return IfThenElse(is_lo, result_lo, result_hi);
1149}
1150
1151// ------------------------------ Reverse
1152template <class D, typename T = TFromD<D>>
1154 const Half<decltype(d)> dh;
1155 Vec256<T> ret;
1156 ret.v1 = Reverse(dh, v.v0); // note reversed v1 member order
1157 ret.v0 = Reverse(dh, v.v1);
1158 return ret;
1159}
1160
1161// ------------------------------ Reverse2
1162template <class D, typename T = TFromD<D>>
1164 const Half<decltype(d)> dh;
1165 v.v0 = Reverse2(dh, v.v0);
1166 v.v1 = Reverse2(dh, v.v1);
1167 return v;
1168}
1169
1170// ------------------------------ Reverse4
1171
1172// Each block has only 2 lanes, so swap blocks and their lanes.
1173template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
1175 const Half<decltype(d)> dh;
1176 Vec256<T> ret;
1177 ret.v0 = Reverse2(dh, v.v1); // swapped
1178 ret.v1 = Reverse2(dh, v.v0);
1179 return ret;
1180}
1181
1182template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 8)>
1183HWY_API Vec256<T> Reverse4(D d, Vec256<T> v) {
1184 const Half<decltype(d)> dh;
1185 v.v0 = Reverse4(dh, v.v0);
1186 v.v1 = Reverse4(dh, v.v1);
1187 return v;
1188}
1189
1190// ------------------------------ Reverse8
1191
1192template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
1193HWY_API Vec256<T> Reverse8(D /* tag */, Vec256<T> /* v */) {
1194 HWY_ASSERT(0); // don't have 8 u64 lanes
1195}
1196
1197// Each block has only 4 lanes, so swap blocks and their lanes.
1198template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
1199HWY_API Vec256<T> Reverse8(D d, const Vec256<T> v) {
1200 const Half<decltype(d)> dh;
1201 Vec256<T> ret;
1202 ret.v0 = Reverse4(dh, v.v1); // swapped
1203 ret.v1 = Reverse4(dh, v.v0);
1204 return ret;
1205}
1206
1207template <class D, typename T = TFromD<D>,
1208 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1209HWY_API Vec256<T> Reverse8(D d, Vec256<T> v) {
1210 const Half<decltype(d)> dh;
1211 v.v0 = Reverse8(dh, v.v0);
1212 v.v1 = Reverse8(dh, v.v1);
1213 return v;
1214}
1215
1216// ------------------------------ InterleaveLower
1217
1218template <typename T>
1220 a.v0 = InterleaveLower(a.v0, b.v0);
1221 a.v1 = InterleaveLower(a.v1, b.v1);
1222 return a;
1223}
1224
1225// wasm_128 already defines a template with D, V, V args.
1226
1227// ------------------------------ InterleaveUpper (UpperHalf)
1228
1229template <class D, typename T = TFromD<D>>
1231 const Half<decltype(d)> dh;
1232 a.v0 = InterleaveUpper(dh, a.v0, b.v0);
1233 a.v1 = InterleaveUpper(dh, a.v1, b.v1);
1234 return a;
1235}
1236
1237// ------------------------------ InterleaveWholeLower
1238template <class D, HWY_IF_V_SIZE_D(D, 32)>
1240 const Half<decltype(d)> dh;
1241 VFromD<D> ret;
1242 ret.v0 = InterleaveLower(a.v0, b.v0);
1243 ret.v1 = InterleaveUpper(dh, a.v0, b.v0);
1244 return ret;
1245}
1246
1247// ------------------------------ InterleaveWholeUpper
1248template <class D, HWY_IF_V_SIZE_D(D, 32)>
1250 const Half<decltype(d)> dh;
1251 VFromD<D> ret;
1252 ret.v0 = InterleaveLower(a.v1, b.v1);
1253 ret.v1 = InterleaveUpper(dh, a.v1, b.v1);
1254 return ret;
1255}
1256
1257// ------------------------------ ZipLower/ZipUpper defined in wasm_128
1258
1259// ================================================== COMBINE
1260
1261// ------------------------------ Combine (InterleaveLower)
1262template <class D, typename T = TFromD<D>>
1264 Vec256<T> ret;
1265 ret.v1 = hi;
1266 ret.v0 = lo;
1267 return ret;
1268}
1269
1270// ------------------------------ ZeroExtendVector (Combine)
1271template <class D, typename T = TFromD<D>>
1273 const Half<decltype(d)> dh;
1274 return Combine(d, Zero(dh), lo);
1275}
1276
1277// ------------------------------ ZeroExtendResizeBitCast
1278
1279namespace detail {
1280
1281template <size_t kFromVectSize, class DTo, class DFrom,
1282 HWY_IF_LANES_LE(kFromVectSize, 8)>
1284 hwy::SizeTag<kFromVectSize> /* from_size_tag */,
1285 hwy::SizeTag<32> /* to_size_tag */, DTo d_to, DFrom d_from,
1286 VFromD<DFrom> v) {
1287 const Half<decltype(d_to)> dh_to;
1288 return ZeroExtendVector(d_to, ZeroExtendResizeBitCast(dh_to, d_from, v));
1289}
1290
1291} // namespace detail
1292
// NOTE(review): signature lines for the functions below are missing from this
// extract (doc-dump artifact). Code is untouched; comments only. All of these
// operate per-128-bit-half on the v0 (lower) / v1 (upper) members of Vec256.
1293// ------------------------------ ConcatLowerLower
// Result = {hi.lower, lo.lower}.
1294template <class D, typename T = TFromD<D>>
1296 Vec256<T> ret;
1297 ret.v1 = hi.v0;
1298 ret.v0 = lo.v0;
1299 return ret;
1300}
1301
1302// ------------------------------ ConcatUpperUpper
// Result = {hi.upper, lo.upper}.
1303template <class D, typename T = TFromD<D>>
1305 Vec256<T> ret;
1306 ret.v1 = hi.v1;
1307 ret.v0 = lo.v1;
1308 return ret;
1309}
1310
1311// ------------------------------ ConcatLowerUpper
// Result = {hi.lower, lo.upper}.
1312template <class D, typename T = TFromD<D>>
1314 Vec256<T> ret;
1315 ret.v1 = hi.v0;
1316 ret.v0 = lo.v1;
1317 return ret;
1318}
1319
1320// ------------------------------ ConcatUpperLower
// Result = {hi.upper, lo.lower}.
1321template <class D, typename T = TFromD<D>>
1323 Vec256<T> ret;
1324 ret.v1 = hi.v1;
1325 ret.v0 = lo.v0;
1326 return ret;
1327}
1328
1329// ------------------------------ ConcatOdd
// Odd lanes of lo form the lower half, odd lanes of hi the upper half.
1330template <class D, typename T = TFromD<D>>
1332 const Half<decltype(d)> dh;
1333 Vec256<T> ret;
1334 ret.v0 = ConcatOdd(dh, lo.v1, lo.v0);
1335 ret.v1 = ConcatOdd(dh, hi.v1, hi.v0);
1336 return ret;
1337}
1338
1339// ------------------------------ ConcatEven
// Even lanes of lo form the lower half, even lanes of hi the upper half.
1340template <class D, typename T = TFromD<D>>
1342 const Half<decltype(d)> dh;
1343 Vec256<T> ret;
1344 ret.v0 = ConcatEven(dh, lo.v1, lo.v0);
1345 ret.v1 = ConcatEven(dh, hi.v1, hi.v0);
1346 return ret;
1347}
1348
1349// ------------------------------ DupEven
// Delegates per half; each even lane is duplicated into the following odd.
1350template <typename T>
1352 v.v0 = DupEven(v.v0);
1353 v.v1 = DupEven(v.v1);
1354 return v;
1355}
1356
1357// ------------------------------ DupOdd
1358template <typename T>
1360 v.v0 = DupOdd(v.v0);
1361 v.v1 = DupOdd(v.v1);
1362 return v;
1363}
1364
1365// ------------------------------ OddEven
// Per-half blend: odd lanes from a, even lanes from b.
1366template <typename T>
1368 a.v0 = OddEven(a.v0, b.v0);
1369 a.v1 = OddEven(a.v1, b.v1);
1370 return a;
1371}
1372
1373// ------------------------------ InterleaveEven
1374template <class D, HWY_IF_V_SIZE_D(D, 32)>
1376 const Half<decltype(d)> dh;
1377 a.v0 = InterleaveEven(dh, a.v0, b.v0);
1378 a.v1 = InterleaveEven(dh, a.v1, b.v1);
1379 return a;
1380}
1381
1382// ------------------------------ InterleaveOdd
1383template <class D, HWY_IF_V_SIZE_D(D, 32)>
1385 const Half<decltype(d)> dh;
1386 a.v0 = InterleaveOdd(dh, a.v0, b.v0);
1387 a.v1 = InterleaveOdd(dh, a.v1, b.v1);
1388 return a;
1389}
1390
1391// ------------------------------ OddEvenBlocks
// Keeps odd's upper block, replaces its lower block with even's lower block.
1392template <typename T>
1394 odd.v0 = even.v0;
1395 return odd;
1396}
1397
1398// ------------------------------ SwapAdjacentBlocks
1399template <typename T>
1401 Vec256<T> ret;
1402 ret.v0 = v.v1; // swapped order
1403 ret.v1 = v.v0;
1404 return ret;
1405}
1406
1407// ------------------------------ ReverseBlocks
1408template <class D, typename T = TFromD<D>>
1410 return SwapAdjacentBlocks(v); // 2 blocks, so Swap = Reverse
1411}
1412
1413// ------------------------------ Per4LaneBlockShuffle
1414namespace detail {
1415
// Per4LaneBlockShuffle overloads: permute each group of 4 lanes according to
// the compile-time index byte kIdx3210 (2 bits per destination lane).
// NOTE(review): signature lines are missing from this extract (doc-dump
// artifact); code left byte-identical, comments only.
// 1-byte lanes: one i8x16 shuffle per 128-bit half, same pattern repeated
// for each of the four 4-lane groups within the half.
1416template <size_t kIdx3210, class V>
1418 hwy::SizeTag<1> /*lane_size_tag*/,
1419 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
1420 const DFromV<decltype(v)> d;
1421 const Half<decltype(d)> dh;
1422 using VH = VFromD<decltype(dh)>;
1423
1424 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
1425 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
1426 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
1427 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
1428
1429 V ret;
1430 ret.v0 = VH{wasm_i8x16_shuffle(
1431 v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4,
1432 kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
1433 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
1434 ret.v1 = VH{wasm_i8x16_shuffle(
1435 v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4,
1436 kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
1437 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
1438 return ret;
1439}
1440
// 2-byte lanes: i16x8 shuffle, pattern repeated for both 4-lane groups.
1441template <size_t kIdx3210, class V>
1443 hwy::SizeTag<2> /*lane_size_tag*/,
1444 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
1445 const DFromV<decltype(v)> d;
1446 const Half<decltype(d)> dh;
1447 using VH = VFromD<decltype(dh)>;
1448
1449 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
1450 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
1451 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
1452 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
1453
1454 V ret;
1455 ret.v0 = VH{wasm_i16x8_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3,
1456 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
1457 ret.v1 = VH{wasm_i16x8_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3,
1458 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
1459 return ret;
1460}
1461
// 4-byte lanes: each 128-bit half is exactly one 4-lane group.
1462template <size_t kIdx3210, class V>
1464 hwy::SizeTag<4> /*lane_size_tag*/,
1465 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
1466 const DFromV<decltype(v)> d;
1467 const Half<decltype(d)> dh;
1468 using VH = VFromD<decltype(dh)>;
1469
1470 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
1471 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
1472 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
1473 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
1474
1475 V ret;
1476 ret.v0 =
1477 VH{wasm_i32x4_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
1478 ret.v1 =
1479 VH{wasm_i32x4_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
1480 return ret;
1481}
1482
// 8-byte lanes: the 4-lane group spans BOTH halves, so the i64x2 shuffles
// intentionally pull from v.v0 and v.v1 together (indices 0-3 address the
// concatenation).
1483template <size_t kIdx3210, class V>
1485 hwy::SizeTag<8> /*lane_size_tag*/,
1486 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
1487 const DFromV<decltype(v)> d;
1488 const Half<decltype(d)> dh;
1489 using VH = VFromD<decltype(dh)>;
1490
1491 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
1492 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
1493 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
1494 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
1495
1496 V ret;
1497 ret.v0 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx0, kIdx1)};
1498 ret.v1 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx2, kIdx3)};
1499 return ret;
1500}
1501
1502} // namespace detail
1503
1504// ------------------------------ SlideUpBlocks
// Shifts whole 128-bit blocks towards the upper end, filling with zero.
// Signature lines lost in extraction; code untouched.
1505template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
1507 static_assert(0 <= kBlocks && kBlocks <= 1,
1508 "kBlocks must be between 0 and 1");
1509 return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
1510}
1511
1512// ------------------------------ SlideDownBlocks
// Shifts whole 128-bit blocks towards the lower end, filling with zero.
1513template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
1515 static_assert(0 <= kBlocks && kBlocks <= 1,
1516 "kBlocks must be between 0 and 1");
1517 const Half<decltype(d)> dh;
1518 return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
1519}
1520
1521// ------------------------------ SlideUpLanes
1522
// Shifts lanes of v towards the upper end by the runtime count `amt`,
// shifting in zeros. Left byte-identical: the fast-path switch relies on
// exact compile-time constant propagation via __builtin_constant_p.
1523template <class D, HWY_IF_V_SIZE_D(D, 32)>
1524HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
1525 const Half<decltype(d)> dh;
1526 const RebindToUnsigned<decltype(d)> du;
1527 const RebindToUnsigned<decltype(dh)> dh_u;
1528 const auto vu = BitCast(du, v);
1529 VFromD<D> ret;
1530
// Fast path: if `amt` is a compile-time constant below one block, use
// immediate byte shifts (case label = byte count = amt * lane size).
1531#if !HWY_IS_DEBUG_BUILD
1532 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
1533 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) {
1534 switch (amt * sizeof(TFromD<D>)) {
1535 case 0:
1536 return v;
1537 case 1:
1538 ret.v0 = BitCast(dh, ShiftLeftBytes<1>(dh_u, vu.v0));
1539 ret.v1 = BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0));
1540 return ret;
1541 case 2:
1542 ret.v0 = BitCast(dh, ShiftLeftBytes<2>(dh_u, vu.v0));
1543 ret.v1 = BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0));
1544 return ret;
1545 case 3:
1546 ret.v0 = BitCast(dh, ShiftLeftBytes<3>(dh_u, vu.v0));
1547 ret.v1 = BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0));
1548 return ret;
1549 case 4:
1550 ret.v0 = BitCast(dh, ShiftLeftBytes<4>(dh_u, vu.v0));
1551 ret.v1 = BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0));
1552 return ret;
1553 case 5:
1554 ret.v0 = BitCast(dh, ShiftLeftBytes<5>(dh_u, vu.v0));
1555 ret.v1 = BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0));
1556 return ret;
1557 case 6:
1558 ret.v0 = BitCast(dh, ShiftLeftBytes<6>(dh_u, vu.v0));
1559 ret.v1 = BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0));
1560 return ret;
1561 case 7:
1562 ret.v0 = BitCast(dh, ShiftLeftBytes<7>(dh_u, vu.v0));
1563 ret.v1 = BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0));
1564 return ret;
1565 case 8:
1566 ret.v0 = BitCast(dh, ShiftLeftBytes<8>(dh_u, vu.v0));
1567 ret.v1 = BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0));
1568 return ret;
1569 case 9:
1570 ret.v0 = BitCast(dh, ShiftLeftBytes<9>(dh_u, vu.v0));
1571 ret.v1 = BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0));
1572 return ret;
1573 case 10:
1574 ret.v0 = BitCast(dh, ShiftLeftBytes<10>(dh_u, vu.v0));
1575 ret.v1 = BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0));
1576 return ret;
1577 case 11:
1578 ret.v0 = BitCast(dh, ShiftLeftBytes<11>(dh_u, vu.v0));
1579 ret.v1 = BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0));
1580 return ret;
1581 case 12:
1582 ret.v0 = BitCast(dh, ShiftLeftBytes<12>(dh_u, vu.v0));
1583 ret.v1 = BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0));
1584 return ret;
1585 case 13:
1586 ret.v0 = BitCast(dh, ShiftLeftBytes<13>(dh_u, vu.v0));
1587 ret.v1 = BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0));
1588 return ret;
1589 case 14:
1590 ret.v0 = BitCast(dh, ShiftLeftBytes<14>(dh_u, vu.v0));
1591 ret.v1 = BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0));
1592 return ret;
1593 case 15:
1594 ret.v0 = BitCast(dh, ShiftLeftBytes<15>(dh_u, vu.v0));
1595 ret.v1 = BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0));
1596 return ret;
1597 }
1598 }
1599
// If the shift is known to be at least one block: lower half becomes zero
// and the upper half is the lower half slid by the remainder.
1600 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
1601 ret.v0 = Zero(dh);
1602 ret.v1 = SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock);
1603 return ret;
1604 }
1605#endif
1606
// Generic path: build per-byte source indices and gather via table lookups.
1607 const Repartition<uint8_t, decltype(d)> du8;
1608 const RebindToSigned<decltype(du8)> di8;
1609 const Half<decltype(di8)> dh_i8;
1610
// Indices wrap below zero; TableLookupBytesOr0 zeroes negative indices.
1611 const auto lo_byte_idx = BitCast(
1612 di8,
1613 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromD<D>))));
1614
1615 const auto hi_byte_idx =
1616 UpperHalf(dh_i8, lo_byte_idx) - Set(dh_i8, int8_t{16});
1617 const auto hi_sel_mask =
1618 UpperHalf(dh_i8, lo_byte_idx) > Set(dh_i8, int8_t{15});
1619
1620 ret = BitCast(d,
1621 TableLookupBytesOr0(ConcatLowerLower(du, vu, vu), lo_byte_idx));
1622 ret.v1 =
1623 BitCast(dh, IfThenElse(hi_sel_mask,
1624 TableLookupBytes(UpperHalf(dh_u, vu), hi_byte_idx),
1625 BitCast(dh_i8, ret.v1)));
1626 return ret;
1627}
1628
1629// ------------------------------ Slide1Up
// Shifts all lanes up by one, shifting in zero. Signature line lost in
// extraction; code untouched. The upper half receives the lower half's top
// lane via a cross-half byte shift.
1630template <typename D, HWY_IF_V_SIZE_D(D, 32)>
1632 VFromD<D> ret;
1633 const Half<decltype(d)> dh;
1634 constexpr int kShrByteAmt = static_cast<int>(16 - sizeof(TFromD<D>));
1635 ret.v0 = ShiftLeftLanes<1>(dh, v.v0);
1636 ret.v1 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0);
1637 return ret;
1638}
1639
1640// ------------------------------ SlideDownLanes
1641
// Shifts lanes of v towards the lower end by the runtime count `amt`,
// shifting in zeros. Left byte-identical (one source line in the generic
// path was lost in extraction, see NOTE below).
1642template <class D, HWY_IF_V_SIZE_D(D, 32)>
1643HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
1644 const Half<decltype(d)> dh;
1645 const RebindToUnsigned<decltype(d)> du;
1646 const RebindToUnsigned<decltype(dh)> dh_u;
1647 VFromD<D> ret;
1648
1649 const auto vu = BitCast(du, v);
1650
// Fast path for compile-time-constant sub-block shifts.
1651#if !HWY_IS_DEBUG_BUILD
1652 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
1653 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) {
1654 switch (amt * sizeof(TFromD<D>)) {
1655 case 0:
1656 return v;
1657 case 1:
1658 ret.v0 = BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0));
1659 ret.v1 = BitCast(dh, ShiftRightBytes<1>(dh_u, vu.v1));
1660 return ret;
1661 case 2:
1662 ret.v0 = BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0));
1663 ret.v1 = BitCast(dh, ShiftRightBytes<2>(dh_u, vu.v1));
1664 return ret;
1665 case 3:
1666 ret.v0 = BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0));
1667 ret.v1 = BitCast(dh, ShiftRightBytes<3>(dh_u, vu.v1));
1668 return ret;
1669 case 4:
1670 ret.v0 = BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0));
1671 ret.v1 = BitCast(dh, ShiftRightBytes<4>(dh_u, vu.v1));
1672 return ret;
1673 case 5:
1674 ret.v0 = BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0));
1675 ret.v1 = BitCast(dh, ShiftRightBytes<5>(dh_u, vu.v1));
1676 return ret;
1677 case 6:
1678 ret.v0 = BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0));
1679 ret.v1 = BitCast(dh, ShiftRightBytes<6>(dh_u, vu.v1));
1680 return ret;
1681 case 7:
1682 ret.v0 = BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0));
1683 ret.v1 = BitCast(dh, ShiftRightBytes<7>(dh_u, vu.v1));
1684 return ret;
1685 case 8:
1686 ret.v0 = BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0));
1687 ret.v1 = BitCast(dh, ShiftRightBytes<8>(dh_u, vu.v1));
1688 return ret;
1689 case 9:
1690 ret.v0 = BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0));
1691 ret.v1 = BitCast(dh, ShiftRightBytes<9>(dh_u, vu.v1));
1692 return ret;
1693 case 10:
1694 ret.v0 = BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0));
1695 ret.v1 = BitCast(dh, ShiftRightBytes<10>(dh_u, vu.v1));
1696 return ret;
1697 case 11:
1698 ret.v0 = BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0));
1699 ret.v1 = BitCast(dh, ShiftRightBytes<11>(dh_u, vu.v1));
1700 return ret;
1701 case 12:
1702 ret.v0 = BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0));
1703 ret.v1 = BitCast(dh, ShiftRightBytes<12>(dh_u, vu.v1));
1704 return ret;
1705 case 13:
1706 ret.v0 = BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0));
1707 ret.v1 = BitCast(dh, ShiftRightBytes<13>(dh_u, vu.v1));
1708 return ret;
1709 case 14:
1710 ret.v0 = BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0));
1711 ret.v1 = BitCast(dh, ShiftRightBytes<14>(dh_u, vu.v1));
1712 return ret;
1713 case 15:
1714 ret.v0 = BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0));
1715 ret.v1 = BitCast(dh, ShiftRightBytes<15>(dh_u, vu.v1));
1716 return ret;
1717 }
1718 }
1719
// Known whole-block shift: lower half = slid upper half, upper half = zero.
1720 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
1721 ret.v0 = SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock);
1722 ret.v1 = Zero(dh);
1723 return ret;
1724 }
1725#endif
1726
// Generic path: gather bytes via index vectors; out-of-range indices zero.
1727 const Repartition<uint8_t, decltype(d)> du8;
1728 const Half<decltype(du8)> dh_u8;
1729
1730 const auto lo_byte_idx =
1731 Iota(du8, static_cast<uint8_t>(amt * sizeof(TFromD<D>)));
1732 const auto u8_16 = Set(du8, uint8_t{16});
1733 const auto hi_byte_idx = lo_byte_idx - u8_16;
1734
1735 const auto lo_sel_mask =
1736 LowerHalf(dh_u8, lo_byte_idx) < LowerHalf(dh_u8, u8_16);
// NOTE(review): the line between 1737 and 1739 (the table-lookup argument of
// IfThenElseZero) was lost in extraction — restore from upstream before use.
1737 ret = BitCast(d, IfThenElseZero(hi_byte_idx < u8_16,
1739 hi_byte_idx)));
1740 ret.v0 =
1741 BitCast(dh, IfThenElse(lo_sel_mask,
1742 TableLookupBytes(LowerHalf(dh_u, vu),
1743 LowerHalf(dh_u8, lo_byte_idx)),
1744 BitCast(dh_u8, LowerHalf(dh, ret))));
1745 return ret;
1746}
1747
1748// ------------------------------ Slide1Down
// Shifts all lanes down by one, shifting in zero. Signature line lost in
// extraction; code untouched.
1749template <typename D, HWY_IF_V_SIZE_D(D, 32)>
1751 VFromD<D> ret;
1752 const Half<decltype(d)> dh;
1753 constexpr int kShrByteAmt = static_cast<int>(sizeof(TFromD<D>));
1754 ret.v0 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0);
1755 ret.v1 = ShiftRightBytes<kShrByteAmt>(dh, v.v1);
1756 return ret;
1757}
1758
1759// ================================================== CONVERT
1760
1761// ------------------------------ PromoteTo
1762
// 2x promotion (lane size doubles): promote each 128-bit half separately.
// NOTE(review): signature lines missing in this extract; code untouched.
1763template <class D, HWY_IF_V_SIZE_D(D, 32), typename TN,
1764 HWY_IF_T_SIZE_D(D, sizeof(TN) * 2)>
1766 const Half<decltype(d)> dh;
1767 VFromD<D> ret;
1768 // PromoteLowerTo is defined later in generic_ops-inl.h.
1769 ret.v0 = PromoteTo(dh, LowerHalf(v));
1770 ret.v1 = PromoteUpperTo(dh, v);
1771 return ret;
1772}
1773
1774// 4x promotion: 8-bit to 32-bit or 16-bit to 64-bit
// Promote to twice the width first, then to four times, half by half.
1775template <class DW, HWY_IF_V_SIZE_D(DW, 32),
1776 HWY_IF_T_SIZE_ONE_OF_D(DW, (1 << 4) | (1 << 8)),
1777 HWY_IF_NOT_FLOAT_D(DW), typename TN,
1778 HWY_IF_T_SIZE_D(DW, sizeof(TN) * 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TN)>
1780 const Half<decltype(d)> dh;
1781 // 16-bit lanes for UI8->UI32, 32-bit lanes for UI16->UI64
1782 const Rebind<MakeWide<TN>, decltype(d)> d2;
1783 const auto v_2x = PromoteTo(d2, v);
1784 Vec256<TFromD<DW>> ret;
1785 // PromoteLowerTo is defined later in generic_ops-inl.h.
1786 ret.v0 = PromoteTo(dh, LowerHalf(v_2x));
1787 ret.v1 = PromoteUpperTo(dh, v_2x);
1788 return ret;
1789}
1790
1791// 8x promotion: 8-bit to 64-bit
// Promote to 32-bit first, then to 64-bit per half.
1792template <class DW, HWY_IF_V_SIZE_D(DW, 32), HWY_IF_T_SIZE_D(DW, 8),
1793 HWY_IF_NOT_FLOAT_D(DW), typename TN, HWY_IF_T_SIZE(TN, 1)>
1795 const Half<decltype(d)> dh;
1796 const Repartition<MakeWide<MakeWide<TN>>, decltype(dh)> d4; // 32-bit lanes
1797 const auto v32 = PromoteTo(d4, v);
1798 Vec256<TFromD<DW>> ret;
1799 // PromoteLowerTo is defined later in generic_ops-inl.h.
1800 ret.v0 = PromoteTo(dh, LowerHalf(v32));
1801 ret.v1 = PromoteUpperTo(dh, v32);
1802 return ret;
1803}
1804
1805// ------------------------------ PromoteUpperTo
1806
1807// Not native, but still define this here because wasm_128 toggles
1808// HWY_NATIVE_PROMOTE_UPPER_TO.
// Promotes the upper half of v to the wider lane type of d.
1809template <class D, class T>
1811 // Lanes(d) may differ from Lanes(DFromV<decltype(v)>()). Use the lane type
1812 // from v because it cannot be deduced from D (could be either bf16 or f16).
1813 const Rebind<T, decltype(d)> dh;
1814 return PromoteTo(d, UpperHalf(dh, v));
1815}
1816
1817// ------------------------------ DemoteTo
1818
// DemoteTo overloads: narrow 256-bit input to a 128-bit (or smaller) result
// using the wasm saturating narrow intrinsics on both halves at once.
// NOTE(review): signature lines missing in this extract; code untouched.
1819template <class D, HWY_IF_U16_D(D)>
1821 return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.v0.raw, v.v1.raw)};
1822}
1823
1824template <class D, HWY_IF_I16_D(D)>
1826 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw)};
1827}
1828
// i32x8 -> u8x8: narrow twice; the second narrow duplicates its input.
1829template <class D, HWY_IF_U8_D(D)>
1831 const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw);
1832 return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
1833}
1834
1835template <class D, HWY_IF_U8_D(D)>
1837 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.v0.raw, v.v1.raw)};
1838}
1839
1840template <class D, HWY_IF_I8_D(D)>
1842 const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw);
1843 return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
1844}
1845
1846template <class D, HWY_IF_I8_D(D)>
1848 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.v0.raw, v.v1.raw)};
1849}
1850
// f64x4 -> i32x4: saturating truncation per half, then recombine.
1851template <class D, HWY_IF_I32_D(D)>
1853 const Vec64<int32_t> lo{wasm_i32x4_trunc_sat_f64x2_zero(v.v0.raw)};
1854 const Vec64<int32_t> hi{wasm_i32x4_trunc_sat_f64x2_zero(v.v1.raw)};
1855 return Combine(di, hi, lo);
1856}
1857
1858template <class D, HWY_IF_U32_D(D)>
1860 const Vec64<uint32_t> lo{wasm_u32x4_trunc_sat_f64x2_zero(v.v0.raw)};
1861 const Vec64<uint32_t> hi{wasm_u32x4_trunc_sat_f64x2_zero(v.v1.raw)};
1862 return Combine(di, hi, lo);
1863}
1864
// Demote to f32: per-half demotion, recombined into 128 bits.
1865template <class D, HWY_IF_F32_D(D)>
1867 const Vec64<float> lo = DemoteTo(Full64<float>(), v.v0);
1868 const Vec64<float> hi = DemoteTo(Full64<float>(), v.v1);
1869 return Combine(df, hi, lo);
1870}
1871
1872template <class D, HWY_IF_F32_D(D)>
1874 const Vec64<float> lo = DemoteTo(Full64<float>(), v.v0);
1875 const Vec64<float> hi = DemoteTo(Full64<float>(), v.v1);
1876 return Combine(df, hi, lo);
1877}
1878
1879template <class D, HWY_IF_F16_D(D)>
1881 const Half<decltype(d16)> d16h;
1882 const Vec64<float16_t> lo = DemoteTo(d16h, v.v0);
1883 const Vec64<float16_t> hi = DemoteTo(d16h, v.v1);
1884 return Combine(d16, hi, lo);
1885}
1886
1887// For already range-limited input [0, 255].
// U8FromU32: reinterprets as signed then uses the signed demotion path.
1889 const Full64<uint8_t> du8;
1890 const Full256<int32_t> di32; // no unsigned DemoteTo
1891 return DemoteTo(du8, BitCast(di32, v));
1892}
1893
1894// ------------------------------ Truncations
1895
// TruncateTo overloads: keep only the low bytes of each wide lane via byte
// shuffles across both 128-bit halves (indices >= 16 address v.v1).
// NOTE(review): signature lines missing in this extract; code untouched.
// u64x4 -> u8x4 (upper 12 result bytes are repeats, ignored by Vec32).
1896template <class D, HWY_IF_U8_D(D)>
1898 return Vec32<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, 0,
1899 8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
1900 24)};
1901}
1902
// u64x4 -> u16x4.
1903template <class D, HWY_IF_U16_D(D)>
1905 return Vec64<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, 16,
1906 17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
1907 25)};
1908}
1909
// u64x4 -> u32x4.
1910template <class D, HWY_IF_U32_D(D)>
1912 return Vec128<uint32_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, 8,
1913 9, 10, 11, 16, 17, 18, 19, 24, 25,
1914 26, 27)};
1915}
1916
// u32x8 -> u8x8.
1917template <class D, HWY_IF_U8_D(D)>
1919 return Vec64<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, 16,
1920 20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
1921 28)};
1922}
1923
// u32x8 -> u16x8.
1924template <class D, HWY_IF_U16_D(D)>
1926 return Vec128<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, 8,
1927 9, 12, 13, 16, 17, 20, 21, 24, 25,
1928 28, 29)};
1929}
1930
// u16x16 -> u8x16.
1931template <class D, HWY_IF_U8_D(D)>
1933 return Vec128<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, 8,
1934 10, 12, 14, 16, 18, 20, 22, 24, 26,
1935 28, 30)};
1936}
1937
1938// ------------------------------ ReorderDemote2To
// Demotes two full vectors into one: a fills the lower half, b the upper.
// NOTE(review): one constraint line per overload was lost in extraction
// (likely the signedness constraint distinguishing the two); code untouched.
1939template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32),
1941 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
1942 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
1943HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1944 const Half<decltype(dn)> dnh;
1945 VFromD<DN> demoted;
1946 demoted.v0 = DemoteTo(dnh, a);
1947 demoted.v1 = DemoteTo(dnh, b);
1948 return demoted;
1949}
1950
1951template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32), HWY_IF_UNSIGNED_D(DN),
1953 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
1954 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
1955HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1956 const Half<decltype(dn)> dnh;
1957 VFromD<DN> demoted;
1958 demoted.v0 = DemoteTo(dnh, a);
1959 demoted.v1 = DemoteTo(dnh, b);
1960 return demoted;
1961}
1962
1963// ------------------------------ Convert i32 <=> f32 (Round)
1964
// Same-width lane conversion (e.g. i32 <-> f32), applied to each half.
// NOTE(review): signature line missing in this extract; code untouched.
1965template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
1967 const Half<decltype(d)> dh;
1968 Vec256<TTo> ret;
1969 ret.v0 = ConvertTo(dh, v.v0);
1970 ret.v1 = ConvertTo(dh, v.v1);
1971 return ret;
1972}
1973
1977
1978// ================================================== MISC
1979
1980// ------------------------------ LoadMaskBits (TestBit)
1981
1982// `p` points to at least 8 readable bytes, not all of which need be valid.
// LoadMaskBits for 4/8-byte lanes: both halves' bits fit in bits[0], so the
// upper half's bits are extracted by shifting down. Signature line missing
// in this extract; code untouched.
1983template <class D, HWY_IF_V_SIZE_D(D, 32),
1984 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
1986 const Half<decltype(d)> dh;
1987 MFromD<D> ret;
1988 ret.m0 = LoadMaskBits(dh, bits);
1989 // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
1990 // Both halves fit in one byte's worth of mask bits.
1991 constexpr size_t kBitsPerHalf = 16 / sizeof(TFromD<D>);
// Remaining 7 array elements are zero-initialized; only byte 0 is consumed.
1992 const uint8_t bits_upper[8] = {static_cast<uint8_t>(bits[0] >> kBitsPerHalf)};
1993 ret.m1 = LoadMaskBits(dh, bits_upper);
1994 return ret;
1995}
1996
1997template <class D, HWY_IF_V_SIZE_D(D, 32),
1998 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
1999HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2000 const Half<decltype(d)> dh;
2001 MFromD<D> ret;
2002 ret.m0 = LoadMaskBits(dh, bits);
2003 constexpr size_t kLanesPerHalf = 16 / sizeof(TFromD<D>);
2004 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
2005 static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
2006 ret.m1 = LoadMaskBits(dh, bits + kBytesPerHalf);
2007 return ret;
2008}
2009
2010template <class D, HWY_IF_V_SIZE_D(D, 32)>
2011HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
2012 const Half<decltype(d)> dh;
2013 MFromD<D> ret;
2014 ret.m0 = ret.m1 = Dup128MaskFromMaskBits(dh, mask_bits);
2015 return ret;
2016}
2017
2018// ------------------------------ Mask
2019
2020// `p` points to at least 8 writable bytes.
2021template <class D, typename T = TFromD<D>,
2022 HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
2023HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
2024 const Half<decltype(d)> dh;
2025 StoreMaskBits(dh, mask.m0, bits);
2026 const uint8_t lo = bits[0];
2027 StoreMaskBits(dh, mask.m1, bits);
2028 // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
2029 // Both halves fit in one byte's worth of mask bits.
2030 constexpr size_t kBitsPerHalf = 16 / sizeof(T);
2031 bits[0] = static_cast<uint8_t>(lo | (bits[0] << kBitsPerHalf));
2032 return (kBitsPerHalf * 2 + 7) / 8;
2033}
2034
2035template <class D, typename T = TFromD<D>,
2036 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2037HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
2038 const Half<decltype(d)> dh;
2039 constexpr size_t kLanesPerHalf = 16 / sizeof(T);
2040 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
2041 static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
2042 StoreMaskBits(dh, mask.m0, bits);
2043 StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf);
2044 return kBytesPerHalf * 2;
2045}
2046
2047template <class D, typename T = TFromD<D>>
2048HWY_API size_t CountTrue(D d, const Mask256<T> m) {
2049 const Half<decltype(d)> dh;
2050 return CountTrue(dh, m.m0) + CountTrue(dh, m.m1);
2051}
2052
// True iff no lane of either half is set. Signature line missing in this
// extract; code untouched.
2053template <class D, typename T = TFromD<D>>
2055 const Half<decltype(d)> dh;
2056 return AllFalse(dh, m.m0) && AllFalse(dh, m.m1);
2057}
2058
2059template <class D, typename T = TFromD<D>>
2060HWY_API bool AllTrue(D d, const Mask256<T> m) {
2061 const Half<decltype(d)> dh;
2062 return AllTrue(dh, m.m0) && AllTrue(dh, m.m1);
2063}
2064
// Index of the first set lane; precondition (caller-guaranteed): at least
// one lane is set. Signature line missing in this extract; code untouched.
2065template <class D, typename T = TFromD<D>>
2067 const Half<decltype(d)> dh;
2068 const intptr_t lo = FindFirstTrue(dh, mask.m0); // not known
2069 constexpr size_t kLanesPerHalf = 16 / sizeof(T);
2070 return lo >= 0 ? static_cast<size_t>(lo)
2071 : kLanesPerHalf + FindKnownFirstTrue(dh, mask.m1);
2072}
2073
2074template <class D, typename T = TFromD<D>>
2075HWY_API intptr_t FindFirstTrue(D d, const Mask256<T> mask) {
2076 const Half<decltype(d)> dh;
2077 const intptr_t lo = FindFirstTrue(dh, mask.m0);
2078 constexpr int kLanesPerHalf = 16 / sizeof(T);
2079 if (lo >= 0) return lo;
2080
2081 const intptr_t hi = FindFirstTrue(dh, mask.m1);
2082 return hi + (hi >= 0 ? kLanesPerHalf : 0);
2083}
2084
2085template <class D, typename T = TFromD<D>>
2086HWY_API size_t FindKnownLastTrue(D d, const Mask256<T> mask) {
2087 const Half<decltype(d)> dh;
2088 const intptr_t hi = FindLastTrue(dh, mask.m1); // not known
2089 constexpr size_t kLanesPerHalf = 16 / sizeof(T);
2090 return hi >= 0 ? kLanesPerHalf + static_cast<size_t>(hi)
2091 : FindKnownLastTrue(dh, mask.m0);
2092}
2093
2094template <class D, typename T = TFromD<D>>
2095HWY_API intptr_t FindLastTrue(D d, const Mask256<T> mask) {
2096 const Half<decltype(d)> dh;
2097 constexpr int kLanesPerHalf = 16 / sizeof(T);
2098 const intptr_t hi = FindLastTrue(dh, mask.m1);
2099 return hi >= 0 ? kLanesPerHalf + hi : FindLastTrue(dh, mask.m0);
2100}
2101
2102// ------------------------------ CompressStore
// Stores selected lanes contiguously: lower half first, then the upper half
// starting right after the lower half's selected lanes. Returns lane count.
// NOTE(review): signature lines missing in this extract; code untouched.
2103template <class D, typename T = TFromD<D>>
2105 T* HWY_RESTRICT unaligned) {
2106 const Half<decltype(d)> dh;
2107 const size_t count = CompressStore(v.v0, mask.m0, dh, unaligned);
2108 const size_t count2 = CompressStore(v.v1, mask.m1, dh, unaligned + count);
2109 return count + count2;
2110}
2111
2112// ------------------------------ CompressBlendedStore
// As above, but only the selected destination lanes are written (blended).
2113template <class D, typename T = TFromD<D>>
2115 T* HWY_RESTRICT unaligned) {
2116 const Half<decltype(d)> dh;
2117 const size_t count = CompressBlendedStore(v.v0, m.m0, dh, unaligned);
2118 const size_t count2 = CompressBlendedStore(v.v1, m.m1, dh, unaligned + count);
2119 return count + count2;
2120}
2121
2122// ------------------------------ CompressBitsStore
2123
2124template <class D, typename T = TFromD<D>>
2125HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
2126 D d, T* HWY_RESTRICT unaligned) {
2127 const Mask256<T> m = LoadMaskBits(d, bits);
2128 return CompressStore(v, m, d, unaligned);
2129}
2130
2131// ------------------------------ Compress
// Gathers selected lanes to the front via a stack buffer + CompressStore.
// Unselected trailing lanes are zero (buffer is zero-initialized).
// NOTE(review): signature lines missing in this extract; code untouched.
2132template <typename T>
2134 const DFromV<decltype(v)> d;
2135 alignas(32) T lanes[32 / sizeof(T)] = {};
2136 (void)CompressStore(v, mask, d, lanes);
2137 return Load(d, lanes);
2138}
2139
2140// ------------------------------ CompressNot
// Compress with the complemented mask.
2141template <typename T>
2143 return Compress(v, Not(mask));
2144}
2145
2146// ------------------------------ CompressBlocksNot
// Whole-128-bit-block variant operating on u64 lanes.
2147HWY_API Vec256 signature lost in extraction; code below untouched.
2148 Mask256<uint64_t> mask) {
2149 const Full128<uint64_t> dh;
2150 // Because the non-selected (mask=1) blocks are undefined, we can return the
2151 // input unless mask = 01, in which case we must bring down the upper block.
2152 return AllTrue(dh, AndNot(mask.m1, mask.m0)) ? SwapAdjacentBlocks(v) : v;
2153}
2154
2155// ------------------------------ CompressBits
// Decode packed bits into a mask, then Compress.
2156template <typename T>
2158 const Mask256<T> m = LoadMaskBits(DFromV<decltype(v)>(), bits);
2159 return Compress(v, m);
2160}
2161
2162// ------------------------------ Expand
// Inverse of Compress: distributes the first CountTrue lanes of v to the set
// mask positions. The upper half's inputs start after the lanes consumed by
// the lower half, hence the stack spill + offset reload.
// NOTE(review): signature line missing in this extract; code untouched.
2163template <typename T>
2165 Vec256<T> ret;
2166 const Full256<T> d;
2167 const Half<decltype(d)> dh;
2168 alignas(32) T lanes[32 / sizeof(T)] = {};
2169 Store(v, d, lanes);
2170 ret.v0 = Expand(v.v0, mask.m0);
2171 ret.v1 = Expand(LoadU(dh, lanes + CountTrue(dh, mask.m0)), mask.m1);
2172 return ret;
2173}
2174
2175// ------------------------------ LoadExpand
2176template <class D, HWY_IF_V_SIZE_D(D, 32)>
2177HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
2178 const TFromD<D>* HWY_RESTRICT unaligned) {
2179 return Expand(LoadU(d, unaligned), mask);
2180}
2181
2182// ------------------------------ LoadInterleaved3/4
2183
2184// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
2185
2186namespace detail {
2187
2188// Input:
2189// 1 0 (<- first block of unaligned)
2190// 3 2
2191// 5 4
2192// Output:
2193// 3 0
2194// 4 1
2195// 5 2
2196template <class D, typename T = TFromD<D>>
2197HWY_API void LoadTransposedBlocks3(D d, const T* HWY_RESTRICT unaligned,
2198 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
2199 const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d));
2200 const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d));
2201 const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d));
2202
2203 A = ConcatUpperLower(d, v32, v10);
2204 B = ConcatLowerUpper(d, v54, v10);
2205 C = ConcatUpperLower(d, v54, v32);
2206}
2207
2208// Input (128-bit blocks):
2209// 1 0 (first block of unaligned)
2210// 3 2
2211// 5 4
2212// 7 6
2213// Output:
2214// 4 0 (LSB of A)
2215// 5 1
2216// 6 2
2217// 7 3
2218template <class D, typename T = TFromD<D>>
2219HWY_API void LoadTransposedBlocks4(D d, const T* HWY_RESTRICT unaligned,
2220 Vec256<T>& vA, Vec256<T>& vB, Vec256<T>& vC,
2221 Vec256<T>& vD) {
2222 const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d));
2223 const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d));
2224 const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d));
2225 const Vec256<T> v76 = LoadU(d, unaligned + 3 * MaxLanes(d));
2226
2227 vA = ConcatLowerLower(d, v54, v10);
2228 vB = ConcatUpperUpper(d, v54, v10);
2229 vC = ConcatLowerLower(d, v76, v32);
2230 vD = ConcatUpperUpper(d, v76, v32);
2231}
2232
2233} // namespace detail
2234
2235// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
2236
2237// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
2238
2239namespace detail {
2240
2241// Input (128-bit blocks):
2242// 2 0 (LSB of i)
2243// 3 1
2244// Output:
2245// 1 0
2246// 3 2
// Regroups blocks {2,0} (i) and {3,1} (j) into {1,0} and {3,2} and stores
// them sequentially. Signature line missing in this extract; code untouched.
2247template <class D, typename T = TFromD<D>>
2249 T* HWY_RESTRICT unaligned) {
2250 const Vec256<T> out0 = ConcatLowerLower(d, j, i);
2251 const Vec256<T> out1 = ConcatUpperUpper(d, j, i);
2252 StoreU(out0, d, unaligned + 0 * MaxLanes(d));
2253 StoreU(out1, d, unaligned + 1 * MaxLanes(d));
2254}
2255
2256// Input (128-bit blocks):
2257// 3 0 (LSB of i)
2258// 4 1
2259// 5 2
2260// Output:
2261// 1 0
2262// 3 2
2263// 5 4
// Regroups three block-interleaved inputs into sequential {1,0},{3,2},{5,4}
// order and stores them. Signature line missing in this extract; code
// untouched.
2264template <class D, typename T = TFromD<D>>
2266 T* HWY_RESTRICT unaligned) {
2267 const Vec256<T> out0 = ConcatLowerLower(d, j, i);
2268 const Vec256<T> out1 = ConcatUpperLower(d, i, k);
2269 const Vec256<T> out2 = ConcatUpperUpper(d, k, j);
2270 StoreU(out0, d, unaligned + 0 * MaxLanes(d));
2271 StoreU(out1, d, unaligned + 1 * MaxLanes(d));
2272 StoreU(out2, d, unaligned + 2 * MaxLanes(d));
2273}
2274
2275// Input (128-bit blocks):
2276// 4 0 (LSB of i)
2277// 5 1
2278// 6 2
2279// 7 3
2280// Output:
2281// 1 0
2282// 3 2
2283// 5 4
2284// 7 6
// Regroups four block-interleaved inputs into sequential block order and
// stores them. Signature lines missing in this extract; code untouched.
2285template <class D, typename T = TFromD<D>>
2287 Vec256<T> l, D d,
2288 T* HWY_RESTRICT unaligned) {
2289 // Write lower halves, then upper.
2290 const Vec256<T> out0 = ConcatLowerLower(d, j, i);
2291 const Vec256<T> out1 = ConcatLowerLower(d, l, k);
2292 StoreU(out0, d, unaligned + 0 * MaxLanes(d));
2293 StoreU(out1, d, unaligned + 1 * MaxLanes(d));
2294 const Vec256<T> out2 = ConcatUpperUpper(d, j, i);
2295 const Vec256<T> out3 = ConcatUpperUpper(d, l, k);
2296 StoreU(out2, d, unaligned + 2 * MaxLanes(d));
2297 StoreU(out3, d, unaligned + 3 * MaxLanes(d));
2298}
2299
2300} // namespace detail
2301
2302// ------------------------------ Additional mask logical operations
2303
2304template <class T>
2306 const Full256<T> d;
2307 const Half<decltype(d)> dh;
2308 const Repartition<int64_t, decltype(dh)> dh_i64;
2309
2310 Mask256<T> result;
2311 result.m0 = SetAtOrAfterFirst(mask.m0);
2312 result.m1 = SetAtOrAfterFirst(mask.m1);
2313
2314 // Copy the sign bit of the lower 128-bit half to the upper 128-bit half
2315 const auto vmask_lo = BitCast(dh_i64, VecFromMask(dh, result.m0));
2316 result.m1 =
2318 dh_i64, vmask_lo, vmask_lo)))));
2319
2320 return result;
2321}
2322
2323template <class T>
2327
2328template <class T>
2330 const Full256<T> d;
2331 const RebindToSigned<decltype(d)> di;
2332 const Repartition<int64_t, decltype(d)> di64;
2333 const Half<decltype(di64)> dh_i64;
2334
2335 const auto zero = Zero(di64);
2336 const auto vmask = BitCast(di64, VecFromMask(d, mask));
2337
2338 const auto vmask_eq_0 = VecFromMask(di64, vmask == zero);
2339 auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0);
2340 auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0);
2341
2342 vmask2_lo = And(vmask2_lo, InterleaveLower(vmask2_lo, vmask2_lo));
2343 vmask2_hi = And(ConcatLowerUpper(dh_i64, vmask2_hi, vmask2_lo),
2344 InterleaveUpper(dh_i64, vmask2_lo, vmask2_lo));
2345 vmask2_lo = InterleaveLower(Set(dh_i64, int64_t{-1}), vmask2_lo);
2346
2347 const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo);
2348 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
2349 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
2350}
2351
2352template <class T>
2354 const Full256<T> d;
2355 constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;
2356
2357 const auto vmask = VecFromMask(d, mask);
2358 const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
2359 return SetBeforeFirst(
2360 MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
2361 d, vmask, vmask_lo)));
2362}
2363
2364// ------------------------------ WidenMulPairwiseAdd
2365template <class D32, typename T16, typename T32 = TFromD<D32>>
2367 const Half<decltype(d32)> d32h;
2368 Vec256<T32> result;
2369 result.v0 = WidenMulPairwiseAdd(d32h, a.v0, b.v0);
2370 result.v1 = WidenMulPairwiseAdd(d32h, a.v1, b.v1);
2371 return result;
2372}
2373
2374// ------------------------------ ReorderWidenMulAccumulate
2375template <class D32, typename T16, typename T32 = TFromD<D32>>
2377 Vec256<T16> b, Vec256<T32> sum0,
2378 Vec256<T32>& sum1) {
2379 const Half<decltype(d32)> d32h;
2380 sum0.v0 = ReorderWidenMulAccumulate(d32h, a.v0, b.v0, sum0.v0, sum1.v0);
2381 sum0.v1 = ReorderWidenMulAccumulate(d32h, a.v1, b.v1, sum0.v1, sum1.v1);
2382 return sum0;
2383}
2384
2385// ------------------------------ RearrangeToOddPlusEven
2386template <typename TW>
2388 sum0.v0 = RearrangeToOddPlusEven(sum0.v0, sum1.v0);
2389 sum0.v1 = RearrangeToOddPlusEven(sum0.v1, sum1.v1);
2390 return sum0;
2391}
2392
2393// ------------------------------ Reductions in generic_ops
2394
2395// ------------------------------ Lt128
2396
2397template <class D, typename T = TFromD<D>>
2399 const Half<decltype(d)> dh;
2400 Mask256<T> ret;
2401 ret.m0 = Lt128(dh, a.v0, b.v0);
2402 ret.m1 = Lt128(dh, a.v1, b.v1);
2403 return ret;
2404}
2405
2406template <class D, typename T = TFromD<D>>
2408 const Half<decltype(d)> dh;
2409 Mask256<T> ret;
2410 ret.m0 = Lt128Upper(dh, a.v0, b.v0);
2411 ret.m1 = Lt128Upper(dh, a.v1, b.v1);
2412 return ret;
2413}
2414
2415template <class D, typename T = TFromD<D>>
2417 const Half<decltype(d)> dh;
2418 Mask256<T> ret;
2419 ret.m0 = Eq128(dh, a.v0, b.v0);
2420 ret.m1 = Eq128(dh, a.v1, b.v1);
2421 return ret;
2422}
2423
2424template <class D, typename T = TFromD<D>>
2426 const Half<decltype(d)> dh;
2427 Mask256<T> ret;
2428 ret.m0 = Eq128Upper(dh, a.v0, b.v0);
2429 ret.m1 = Eq128Upper(dh, a.v1, b.v1);
2430 return ret;
2431}
2432
2433template <class D, typename T = TFromD<D>>
2435 const Half<decltype(d)> dh;
2436 Mask256<T> ret;
2437 ret.m0 = Ne128(dh, a.v0, b.v0);
2438 ret.m1 = Ne128(dh, a.v1, b.v1);
2439 return ret;
2440}
2441
2442template <class D, typename T = TFromD<D>>
2444 const Half<decltype(d)> dh;
2445 Mask256<T> ret;
2446 ret.m0 = Ne128Upper(dh, a.v0, b.v0);
2447 ret.m1 = Ne128Upper(dh, a.v1, b.v1);
2448 return ret;
2449}
2450
2451template <class D, typename T = TFromD<D>>
2453 const Half<decltype(d)> dh;
2454 Vec256<T> ret;
2455 ret.v0 = Min128(dh, a.v0, b.v0);
2456 ret.v1 = Min128(dh, a.v1, b.v1);
2457 return ret;
2458}
2459
2460template <class D, typename T = TFromD<D>>
2462 const Half<decltype(d)> dh;
2463 Vec256<T> ret;
2464 ret.v0 = Max128(dh, a.v0, b.v0);
2465 ret.v1 = Max128(dh, a.v1, b.v1);
2466 return ret;
2467}
2468
2469template <class D, typename T = TFromD<D>>
2471 const Half<decltype(d)> dh;
2472 Vec256<T> ret;
2473 ret.v0 = Min128Upper(dh, a.v0, b.v0);
2474 ret.v1 = Min128Upper(dh, a.v1, b.v1);
2475 return ret;
2476}
2477
2478template <class D, typename T = TFromD<D>>
2480 const Half<decltype(d)> dh;
2481 Vec256<T> ret;
2482 ret.v0 = Max128Upper(dh, a.v0, b.v0);
2483 ret.v1 = Max128Upper(dh, a.v1, b.v1);
2484 return ret;
2485}
2486
2487// NOLINTNEXTLINE(google-readability-namespace-comments)
2488} // namespace HWY_NAMESPACE
2489} // namespace hwy
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_IF_T_SIZE(T, bytes)
Definition base.h:639
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
Definition base.h:645
#define HWY_ASSERT(condition)
Definition base.h:237
#define HWY_IF_LANES_LE(kN, lanes)
Definition base.h:617
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
Definition base.h:635
Definition arm_neon-inl.h:865
Raw raw
Definition arm_neon-inl.h:878
Definition arm_neon-inl.h:813
Raw raw
Definition arm_neon-inl.h:851
Definition wasm_256-inl.h:27
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition wasm_256-inl.h:55
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition wasm_256-inl.h:49
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition wasm_256-inl.h:43
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition wasm_256-inl.h:40
HWY_INLINE Vec256 & operator%=(const Vec256 other)
Definition wasm_256-inl.h:46
Vec128< T > v1
Definition wasm_256-inl.h:60
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition wasm_256-inl.h:52
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition wasm_256-inl.h:37
static constexpr size_t kPrivateN
Definition wasm_256-inl.h:30
Vec128< T > v0
Definition wasm_256-inl.h:59
T PrivateT
Definition wasm_256-inl.h:29
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition wasm_256-inl.h:34
HWY_API Vec32< T > ShuffleTwo1230(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:927
HWY_INLINE void LoadTransposedBlocks4(D d, const TFromD< D > *HWY_RESTRICT unaligned, VFromD< D > &vA, VFromD< D > &vB, VFromD< D > &vC, VFromD< D > &vD)
Definition generic_ops-inl.h:1477
HWY_INLINE void StoreTransposedBlocks3(VFromD< D > A, VFromD< D > B, VFromD< D > C, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1652
HWY_INLINE void StoreTransposedBlocks2(VFromD< D > A, VFromD< D > B, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1616
HWY_INLINE void StoreTransposedBlocks4(VFromD< D > vA, VFromD< D > vB, VFromD< D > vC, VFromD< D > vD, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:2003
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD< D > *HWY_RESTRICT unaligned, VFromD< D > &A, VFromD< D > &B, VFromD< D > &C)
Definition generic_ops-inl.h:1279
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API Vec256< T > TableLookupLanesOr0(Vec256< T > v, Indices256< T > idx)
Definition wasm_256-inl.h:1119
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API VFromD< DTo > ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD< DFrom > v)
Definition generic_ops-inl.h:162
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API VFromD< D > SlideDownBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7046
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
HWY_API VFromD< D > SlideUpBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7028
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
Definition abort.h:8
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_V_SIZE_V(V, bytes)
Definition ops/shared-inl.h:632
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_D(D)
Definition ops/shared-inl.h:536
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition wasm_256-inl.h:1085
__v128_u i0
Definition wasm_256-inl.h:1086
__v128_u i1
Definition wasm_256-inl.h:1087
Definition wasm_256-inl.h:64
Mask128< T > m1
Definition wasm_256-inl.h:66
Mask128< T > m0
Definition wasm_256-inl.h:65
Definition ops/shared-inl.h:198
Definition base.h:694
int VFromD
Definition tuple-inl.h:25
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()