wasm_128-inl.h
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.

#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_i64x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET == HWY_WASM_EMU256
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif

namespace detail {

template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};
template <>
struct Raw128<double> {
  using type = __f64x2;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  typename detail::Raw128<T>::type raw;
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f64x2_splat(0.0)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
  return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
  return static_cast<__v128_u>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __f64x2 operator()(__v128_u v) { return static_cast<__f64x2>(v); }
};

template <class D>
HWY_INLINE VFromD<D> BitCastFromByte(D d, Vec128<uint8_t, d.MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail

template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ ResizeBitCast

template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8_to;
  return BitCast(d, VFromD<decltype(du8_to)>{detail::BitCastToInteger(v.raw)});
}

// ------------------------------ Set

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_f32x4_splat(t)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_f64x2_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// For all vector sizes.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// For all vector sizes.
template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    lanes[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return Load(d, lanes);
}

// ------------------------------ Dup128VecFromValues
template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
                                   t11, t12, t13, t14, t15)};
}

template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
                                   t11, t12, t13, t14, t15)};
}

template <class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
}

template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
}

template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)};
}

template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)};
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)};
}

template <class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{wasm_i64x2_make(t0, t1)};
}

template <class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{wasm_u64x2_make(t0, t1)};
}

template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{wasm_f64x2_make(t0, t1)};
}

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_sub(a.raw, b.raw)};
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.
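// For example, with uint8_t lanes, adding 200 and 100 yields 255 (the type
// maximum) instead of wrapping around to 44.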

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2
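// For example, AverageRound of 1 and 2 on uint8_t lanes is (1 + 2 + 1) / 2 =
// 2, i.e. halfway values round up.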

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}

// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
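// For example, for int8_t lanes, Abs(-128) remains -128 because +128 is not
// representable: LimitsMax() + 1 wraps around.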
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_abs(v.raw)};
}

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}

// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}

// 8-bit
template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");

  if (kBits == 0) return v;
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ Shift lanes by same variable #bits

// After https://reviews.llvm.org/D108415 shift argument became unsigned.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                          const int bits) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
                                          const int bits) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
                                           const int bits) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
                                          const int bits) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
                                           const int bits) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                         const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                          const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
                                         const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
                                          const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
                                         const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
                                          const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
}

// 8-bit
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{
      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}

template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
                                          const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ignore Wsign-conversion
HWY_DIAGNOSTICS(pop)

// ------------------------------ Minimum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<double, N>{wasm_f64x2_pmin(b.raw, a.raw)};
}

// ------------------------------ Maximum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<double, N>{wasm_f64x2_pmax(b.raw, a.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper sizeof(T)*8 bits of a * b in each lane.
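// Implementation note: the extmul intrinsics below produce all double-width
// products, and the shuffle keeps each product's upper half (the odd-indexed
// narrow lanes, as WASM lane order is little-endian).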
template <size_t N>
HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
                                   const Vec128<uint8_t, N> b) {
  const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
  const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
                                               17, 19, 21, 23, 25, 27, 29,
                                               31)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
                                  const Vec128<int8_t, N> b) {
  const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
  const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
                                              17, 19, 21, 23, 25, 27, 29, 31)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw);
  const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw);
  const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
                                    const Vec128<uint32_t, N> b) {
  const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
  const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
                                   const Vec128<int32_t, N> b) {
  const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
  const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)};
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
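// In the signed overload below, ShiftLeft followed by arithmetic ShiftRight
// sign-extends each even (lower) half within its double-width lane; the
// unsigned overload instead masks off the odd halves.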
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  constexpr int kSrcBits = sizeof(T) * 8;

  const auto ae =
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, a)));
  const auto be =
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, b)));
  return ae * be;
}
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  const auto kEvenMask = Set(dw, LimitsMax<T>());

  const auto ae = And(ResizeBitCast(dw, a), kEvenMask);
  const auto be = And(ResizeBitCast(dw, b), kEvenMask);
  return ae * be;
}
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  const auto ae = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw;
  const auto be = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw;
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}

// Multiplies odd lanes (1, 3 ..) and returns the double-width result.
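// Unlike MulEven, a single ShiftRight suffices here: it moves each odd
// (upper) half down with the sign/zero extension appropriate for T.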
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  constexpr int kSrcBits = sizeof(T) * 8;

  const auto ao = ShiftRight<kSrcBits>(BitCast(dw, a));
  const auto bo = ShiftRight<kSrcBits>(BitCast(dw, b));
  return ao * bo;
}
template <class T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;

  const auto ao = ShiftRight<32>(BitCast(dw, a));
  const auto bo = ShiftRight<32>(BitCast(dw, b));
  return Vec128<MakeWide<T>, (N + 1) / 2>{wasm_i64x2_mul(ao.raw, bo.raw)};
}

// ------------------------------ Negate

template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator*(Vec128<double, N> a, Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_mul(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_div(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ApproximateReciprocal(const Vec128<T, N> v) {
  return Set(DFromV<decltype(v)>(), T{1.0}) / v;
}

// Integer overload defined in generic_ops-inl.h.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return mul * x + add;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return add - mul * x;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return mul * x - sub;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_sqrt(v.raw)};
}

// Approximate reciprocal square root
template <typename T, size_t N>
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(const Vec128<T, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  return Set(DFromV<decltype(v)>(), T{1.0}) / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_nearest(v.raw)};
}

// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_ceil(v.raw)};
}

// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_floor(v.raw)};
}

// ------------------------------ Floating-point classification
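// NaN is the only value that compares unequal to itself, which is all IsNaN
// below needs.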
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename TFrom, size_t NFrom, class DTo>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
                                      const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
                                       const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
                                       const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint32_t, decltype(d)> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, this is the answer.
  const auto m_gt = a32 > b32;

  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));

  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}

template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(const Vec128<double, N> a,
                                     const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_gt(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float >=
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ge(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<int8_t, N> operator>=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ge(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Not(b > a);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>=(b, a);
}

// ------------------------------ FirstN (Iota, Lt)

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num)));
}

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  return Vec128<T, N>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
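// Note the swapped arguments in the intrinsic call below:
// wasm_v128_andnot(a, b) computes a & ~b.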
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Xor3

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}

// ------------------------------ BroadcastSignBit (compare)

template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}

// ------------------------------ Mask

template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{v.raw};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const DFromM<decltype(m)> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

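// For example, an 8-bit shift count of 5 (binary 101) has bits 2 and 0 set,
// which selects the kBits=4 and kBits=1 steps below: a total shift of 5.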
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<5>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2),
          HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

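// 64-bit lanes: WASM SIMD has no per-lane variable shift, so shift each lane
// in scalar code via aligned stack arrays.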
template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = MakeUnsigned<T>;
  alignas(16) TU lanes[2] = {};
  alignas(16) TU bits_lanes[2] = {};
  Store(BitCast(du, v), du, lanes);
  Store(BitCast(du, bits), du, bits_lanes);
  lanes[0] <<= (bits_lanes[0] & 63);
  lanes[1] <<= (bits_lanes[1] & 63);
  return BitCast(d, Load(du, lanes));
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<5>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2),
          HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

1850template <typename T, size_t N, HWY_IF_UI32(T)>
1851HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
1852 const DFromV<decltype(v)> d;
1853 Mask128<T, N> mask;
1854 // Need a signed type for BroadcastSignBit.
1855 auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1856 // Move the highest valid bit of the shift count into the sign bit.
1857 test = ShiftLeft<27>(test);
1858
1859 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1860 test = ShiftLeft<1>(test); // next bit (descending order)
1861 v = IfThenElse(mask, ShiftRight<16>(v), v);
1862
1863 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1864 test = ShiftLeft<1>(test); // next bit (descending order)
1865 v = IfThenElse(mask, ShiftRight<8>(v), v);
1866
1867 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1868 test = ShiftLeft<1>(test); // next bit (descending order)
1869 v = IfThenElse(mask, ShiftRight<4>(v), v);
1870
1871 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1872 test = ShiftLeft<1>(test); // next bit (descending order)
1873 v = IfThenElse(mask, ShiftRight<2>(v), v);
1874
1875 mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1876 return IfThenElse(mask, ShiftRight<1>(v), v);
1877}
1878
1879template <typename T, size_t N, HWY_IF_UI64(T)>
1880HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
1881 const DFromV<decltype(v)> d;
1882 alignas(16) T lanes[2] = {};
1883 alignas(16) T bits_lanes[2] = {};
1884 Store(v, d, lanes);
1885 Store(bits, d, bits_lanes);
1886 lanes[0] >>= (bits_lanes[0] & 63);
1887 lanes[1] >>= (bits_lanes[1] & 63);
1888 return Load(d, lanes);
1889}
1890
1891// ================================================== MEMORY
1892
1893// ------------------------------ Load
1894
1895template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1896HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
1897 return Vec128<T>{wasm_v128_load(aligned)};
1898}
1899
1900// Partial
1901template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
1902HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1903 VFromD<D> v;
1904 CopyBytes<d.MaxBytes()>(p, &v);
1905 return v;
1906}
1907
1908// LoadU == Load.
1909template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1910HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1911 return Load(d, p);
1912}
1913
1914// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1915template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1916HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1917 return Load(d, p);
1918}
1919
1920template <class D, typename T = TFromD<D>>
1921HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT aligned) {
1922 return IfThenElseZero(m, Load(d, aligned));
1923}
1924
1925template <class D, typename T = TFromD<D>>
1926HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1927 const T* HWY_RESTRICT aligned) {
1928 return IfThenElse(m, Load(d, aligned), v);
1929}
1930
1931// ------------------------------ Store
1932
1933namespace detail {
1934
1935 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
1936 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1937 return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
1938}
1939 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
1940 HWY_IF_NOT_SPECIAL_FLOAT(T)>
1941 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1942 const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane);
1943 return static_cast<T>(lane);
1944}
1945 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
1946 HWY_IF_SPECIAL_FLOAT(T)>
1947 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1948 const DFromV<decltype(v)> d;
1949 const RebindToUnsigned<decltype(d)> du;
1950
1951 const uint16_t bits = ExtractLane<kLane>(BitCast(du, v));
1952 return BitCastScalar<T>(bits);
1953}
1954template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
1955HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1956 return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
1957}
1958template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
1959HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1960 return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
1961}
1962
1963 template <size_t kLane, size_t N>
1964 HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
1965 return wasm_f32x4_extract_lane(v.raw, kLane);
1966}
1967 template <size_t kLane, size_t N>
1968 HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
1969 return wasm_f64x2_extract_lane(v.raw, kLane);
1970}
1971
1972} // namespace detail
1973
1974template <class D, HWY_IF_V_SIZE_D(D, 16)>
1975HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
1976 wasm_v128_store(aligned, v.raw);
1977}
1978
1979// Partial
1980 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
1981 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1982 CopyBytes<d.MaxBytes()>(&v, p);
1983}
1984
1985template <class D, HWY_IF_LANES_D(D, 1)>
1986HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
1987 *p = detail::ExtractLane<0>(v);
1988}
1989
1990// StoreU == Store.
1991template <class D>
1992HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1993 Store(v, d, p);
1994}
1995
1996template <class D>
1997HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1998 TFromD<D>* HWY_RESTRICT p) {
1999 StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
2000}
2001
2002// ------------------------------ Non-temporal stores
2003
2004// Same as aligned stores on non-x86.
2005
2006template <class D>
2007HWY_API void Stream(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
2008 wasm_v128_store(aligned, v.raw);
2009}
2010
2011// ------------------------------ Scatter in generic_ops-inl.h
2012// ------------------------------ Gather in generic_ops-inl.h
2013
2014// ================================================== SWIZZLE
2015
2016// ------------------------------ ExtractLane
2017
2018 // One overload per vector length, in case the *_extract_lane intrinsics
2019 // raise compile errors when their argument is out of bounds (even if that
2020 // code would never be reached at runtime).
2021template <typename T>
2022HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
2023 HWY_DASSERT(i == 0);
2024 (void)i;
2025 return detail::ExtractLane<0>(v);
2026}
2027
2028template <typename T>
2029HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
2030#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2031 if (__builtin_constant_p(i)) {
2032 switch (i) {
2033 case 0:
2034 return detail::ExtractLane<0>(v);
2035 case 1:
2036 return detail::ExtractLane<1>(v);
2037 }
2038 }
2039#endif
2040 alignas(16) T lanes[2];
2041 Store(v, DFromV<decltype(v)>(), lanes);
2042 return lanes[i];
2043}
2044
2045template <typename T>
2046HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
2047#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2048 if (__builtin_constant_p(i)) {
2049 switch (i) {
2050 case 0:
2051 return detail::ExtractLane<0>(v);
2052 case 1:
2053 return detail::ExtractLane<1>(v);
2054 case 2:
2055 return detail::ExtractLane<2>(v);
2056 case 3:
2057 return detail::ExtractLane<3>(v);
2058 }
2059 }
2060#endif
2061 alignas(16) T lanes[4];
2062 Store(v, DFromV<decltype(v)>(), lanes);
2063 return lanes[i];
2064}
2065
2066template <typename T>
2067HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
2068#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2069 if (__builtin_constant_p(i)) {
2070 switch (i) {
2071 case 0:
2072 return detail::ExtractLane<0>(v);
2073 case 1:
2074 return detail::ExtractLane<1>(v);
2075 case 2:
2076 return detail::ExtractLane<2>(v);
2077 case 3:
2078 return detail::ExtractLane<3>(v);
2079 case 4:
2080 return detail::ExtractLane<4>(v);
2081 case 5:
2082 return detail::ExtractLane<5>(v);
2083 case 6:
2084 return detail::ExtractLane<6>(v);
2085 case 7:
2086 return detail::ExtractLane<7>(v);
2087 }
2088 }
2089#endif
2090 alignas(16) T lanes[8];
2091 Store(v, DFromV<decltype(v)>(), lanes);
2092 return lanes[i];
2093}
2094
2095template <typename T>
2096HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
2097#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2098 if (__builtin_constant_p(i)) {
2099 switch (i) {
2100 case 0:
2101 return detail::ExtractLane<0>(v);
2102 case 1:
2103 return detail::ExtractLane<1>(v);
2104 case 2:
2105 return detail::ExtractLane<2>(v);
2106 case 3:
2107 return detail::ExtractLane<3>(v);
2108 case 4:
2109 return detail::ExtractLane<4>(v);
2110 case 5:
2111 return detail::ExtractLane<5>(v);
2112 case 6:
2113 return detail::ExtractLane<6>(v);
2114 case 7:
2115 return detail::ExtractLane<7>(v);
2116 case 8:
2117 return detail::ExtractLane<8>(v);
2118 case 9:
2119 return detail::ExtractLane<9>(v);
2120 case 10:
2121 return detail::ExtractLane<10>(v);
2122 case 11:
2123 return detail::ExtractLane<11>(v);
2124 case 12:
2125 return detail::ExtractLane<12>(v);
2126 case 13:
2127 return detail::ExtractLane<13>(v);
2128 case 14:
2129 return detail::ExtractLane<14>(v);
2130 case 15:
2131 return detail::ExtractLane<15>(v);
2132 }
2133 }
2134#endif
2135 alignas(16) T lanes[16];
2136 Store(v, DFromV<decltype(v)>(), lanes);
2137 return lanes[i];
2138}
2139
2140// ------------------------------ GetLane
2141template <typename T, size_t N>
2142HWY_API T GetLane(const Vec128<T, N> v) {
2143 return detail::ExtractLane<0>(v);
2144}
2145
2146// ------------------------------ InsertLane
2147
2148namespace detail {
2149
2150 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2151 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
2152 static_assert(kLane < N, "Lane index out of bounds");
2153 return Vec128<T, N>{
2154 wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
2155}
2156
2157 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
2158 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
2159 static_assert(kLane < N, "Lane index out of bounds");
2160 return Vec128<T, N>{
2161 wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar<int16_t>(t))};
2162}
2163
2164template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
2165HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
2166 static_assert(kLane < N, "Lane index out of bounds");
2167 return Vec128<T, N>{
2168 wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
2169}
2170
2171template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
2172HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
2173 static_assert(kLane < N, "Lane index out of bounds");
2174 return Vec128<T, N>{
2175 wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
2176}
2177
2178 template <size_t kLane, size_t N>
2179 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
2180 static_assert(kLane < N, "Lane index out of bounds");
2181 return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
2182}
2183
2184 template <size_t kLane, size_t N>
2185 HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) {
2186 static_assert(kLane < 2, "Lane index out of bounds");
2187 return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
2188}
2189
2190} // namespace detail
2191
2192// Requires one overload per vector length because InsertLane<3> may be a
2193// compile error if it calls wasm_f64x2_replace_lane.
2194
2195template <typename T>
2196HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
2197 HWY_DASSERT(i == 0);
2198 (void)i;
2199 return Set(DFromV<decltype(v)>(), t);
2200}
2201
2202template <typename T>
2203HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
2204#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2205 if (__builtin_constant_p(i)) {
2206 switch (i) {
2207 case 0:
2208 return detail::InsertLane<0>(v, t);
2209 case 1:
2210 return detail::InsertLane<1>(v, t);
2211 }
2212 }
2213#endif
2214 const DFromV<decltype(v)> d;
2215 alignas(16) T lanes[2];
2216 Store(v, d, lanes);
2217 lanes[i] = t;
2218 return Load(d, lanes);
2219}
2220
2221template <typename T>
2222HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
2223#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2224 if (__builtin_constant_p(i)) {
2225 switch (i) {
2226 case 0:
2227 return detail::InsertLane<0>(v, t);
2228 case 1:
2229 return detail::InsertLane<1>(v, t);
2230 case 2:
2231 return detail::InsertLane<2>(v, t);
2232 case 3:
2233 return detail::InsertLane<3>(v, t);
2234 }
2235 }
2236#endif
2237 const DFromV<decltype(v)> d;
2238 alignas(16) T lanes[4];
2239 Store(v, d, lanes);
2240 lanes[i] = t;
2241 return Load(d, lanes);
2242}
2243
2244template <typename T>
2245HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
2246#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2247 if (__builtin_constant_p(i)) {
2248 switch (i) {
2249 case 0:
2250 return detail::InsertLane<0>(v, t);
2251 case 1:
2252 return detail::InsertLane<1>(v, t);
2253 case 2:
2254 return detail::InsertLane<2>(v, t);
2255 case 3:
2256 return detail::InsertLane<3>(v, t);
2257 case 4:
2258 return detail::InsertLane<4>(v, t);
2259 case 5:
2260 return detail::InsertLane<5>(v, t);
2261 case 6:
2262 return detail::InsertLane<6>(v, t);
2263 case 7:
2264 return detail::InsertLane<7>(v, t);
2265 }
2266 }
2267#endif
2268 const DFromV<decltype(v)> d;
2269 alignas(16) T lanes[8];
2270 Store(v, d, lanes);
2271 lanes[i] = t;
2272 return Load(d, lanes);
2273}
2274
2275template <typename T>
2276HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
2277#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2278 if (__builtin_constant_p(i)) {
2279 switch (i) {
2280 case 0:
2281 return detail::InsertLane<0>(v, t);
2282 case 1:
2283 return detail::InsertLane<1>(v, t);
2284 case 2:
2285 return detail::InsertLane<2>(v, t);
2286 case 3:
2287 return detail::InsertLane<3>(v, t);
2288 case 4:
2289 return detail::InsertLane<4>(v, t);
2290 case 5:
2291 return detail::InsertLane<5>(v, t);
2292 case 6:
2293 return detail::InsertLane<6>(v, t);
2294 case 7:
2295 return detail::InsertLane<7>(v, t);
2296 case 8:
2297 return detail::InsertLane<8>(v, t);
2298 case 9:
2299 return detail::InsertLane<9>(v, t);
2300 case 10:
2301 return detail::InsertLane<10>(v, t);
2302 case 11:
2303 return detail::InsertLane<11>(v, t);
2304 case 12:
2305 return detail::InsertLane<12>(v, t);
2306 case 13:
2307 return detail::InsertLane<13>(v, t);
2308 case 14:
2309 return detail::InsertLane<14>(v, t);
2310 case 15:
2311 return detail::InsertLane<15>(v, t);
2312 }
2313 }
2314#endif
2315 const DFromV<decltype(v)> d;
2316 alignas(16) T lanes[16];
2317 Store(v, d, lanes);
2318 lanes[i] = t;
2319 return Load(d, lanes);
2320}
2321
2322// ------------------------------ LowerHalf
2323
2324template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
2325HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
2326 return VFromD<D>{v.raw};
2327}
2328template <typename T, size_t N>
2329HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
2330 return Vec128<T, N / 2>{v.raw};
2331}
2332
2333// ------------------------------ ShiftLeftBytes
2334
2335// 0x01..0F, kBytes = 1 => 0x02..0F00
2336template <int kBytes, class D>
2337HWY_API VFromD<D> ShiftLeftBytes(D /* tag */, VFromD<D> v) {
2338 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2339 const __i8x16 zero = wasm_i8x16_splat(0);
2340 switch (kBytes) {
2341 case 0:
2342 return v;
2343
2344 case 1:
2345 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
2346 7, 8, 9, 10, 11, 12, 13, 14)};
2347
2348 case 2:
2349 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
2350 6, 7, 8, 9, 10, 11, 12, 13)};
2351
2352 case 3:
2353 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
2354 4, 5, 6, 7, 8, 9, 10, 11, 12)};
2355
2356 case 4:
2357 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
2358 3, 4, 5, 6, 7, 8, 9, 10, 11)};
2359
2360 case 5:
2361 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
2362 2, 3, 4, 5, 6, 7, 8, 9, 10)};
2363
2364 case 6:
2365 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2366 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
2367
2368 case 7:
2369 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2370 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
2371
2372 case 8:
2373 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2374 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
2375
2376 case 9:
2377 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2378 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
2379
2380 case 10:
2381 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2382 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
2383
2384 case 11:
2385 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2386 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
2387
2388 case 12:
2389 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2390 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
2391
2392 case 13:
2393 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2394 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
2395
2396 case 14:
2397 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2398 16, 16, 16, 16, 16, 16, 16, 16, 0,
2399 1)};
2400
2401 case 15:
2402 return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
2403 16, 16, 16, 16, 16, 16, 16, 16, 16,
2404 0)};
2405 }
2406 return VFromD<D>{zero};
2407}
2408
2409template <int kBytes, typename T, size_t N>
2410HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2411 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
2412}
2413
2414// ------------------------------ ShiftLeftLanes
2415
2416 template <int kLanes, class D>
2417 HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) {
2418 const Repartition<uint8_t, decltype(d)> d8;
2419 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2420 return BitCast(d, ShiftLeftBytes<kBytes>(BitCast(d8, v)));
2421}
2422
2423template <int kLanes, typename T, size_t N>
2424HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
2425 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2426}
2427
2428// ------------------------------ ShiftRightBytes
2429namespace detail {
2430
2431// Helper function allows zeroing invalid lanes in caller.
2432 template <int kBytes, typename T, size_t N>
2433 HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
2434 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2435 const __i8x16 zero = wasm_i8x16_splat(0);
2436
2437 switch (kBytes) {
2438 case 0:
2439 return v.raw;
2440
2441 case 1:
2442 return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2443 12, 13, 14, 15, 16);
2444
2445 case 2:
2446 return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2447 13, 14, 15, 16, 16);
2448
2449 case 3:
2450 return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2451 13, 14, 15, 16, 16, 16);
2452
2453 case 4:
2454 return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2455 14, 15, 16, 16, 16, 16);
2456
2457 case 5:
2458 return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2459 15, 16, 16, 16, 16, 16);
2460
2461 case 6:
2462 return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2463 16, 16, 16, 16, 16, 16);
2464
2465 case 7:
2466 return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2467 16, 16, 16, 16, 16, 16, 16);
2468
2469 case 8:
2470 return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2471 16, 16, 16, 16, 16, 16, 16);
2472
2473 case 9:
2474 return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2475 16, 16, 16, 16, 16, 16, 16);
2476
2477 case 10:
2478 return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2479 16, 16, 16, 16, 16, 16, 16);
2480
2481 case 11:
2482 return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2483 16, 16, 16, 16, 16, 16, 16);
2484
2485 case 12:
2486 return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2487 16, 16, 16, 16, 16, 16, 16);
2488
2489 case 13:
2490 return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2491 16, 16, 16, 16, 16, 16, 16);
2492
2493 case 14:
2494 return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2495 16, 16, 16, 16, 16, 16, 16);
2496
2497 case 15:
2498 return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2499 16, 16, 16, 16, 16, 16, 16);
2500 case 16:
2501 return zero;
2502 }
2503}
2504
2505} // namespace detail
2506
2507// 0x01..0F, kBytes = 1 => 0x0001..0E
2508 template <int kBytes, class D>
2509 HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
2510 // For partial vectors, clear upper lanes so we shift in zeros.
2511 if (d.MaxBytes() != 16) {
2512 const Full128<TFromD<D>> dfull;
2513 const VFromD<decltype(dfull)> vfull{v.raw};
2514 v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
2515 }
2516 return VFromD<D>{detail::ShrBytes<kBytes>(v)};
2517}
2518
2519// ------------------------------ ShiftRightLanes
2520 template <int kLanes, class D>
2521 HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
2522 const Repartition<uint8_t, decltype(d)> d8;
2523 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2524 return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
2525}
2526
2527// ------------------------------ UpperHalf (ShiftRightBytes)
2528
2529template <class D, typename T = TFromD<D>>
2530HWY_API Vec64<T> UpperHalf(D /* tag */, const Vec128<T> v) {
2531 return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
2532}
2533
2534// Partial
2535template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2536HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
2537 return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
2538}
2539
2540// ------------------------------ CombineShiftRightBytes
2541
2542template <int kBytes, class D, typename T = TFromD<D>>
2543HWY_API Vec128<T> CombineShiftRightBytes(D /* tag */, Vec128<T> hi,
2544 Vec128<T> lo) {
2545 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2546 switch (kBytes) {
2547 case 0:
2548 return lo;
2549
2550 case 1:
2551 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
2552 8, 9, 10, 11, 12, 13, 14, 15, 16)};
2553
2554 case 2:
2555 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
2556 9, 10, 11, 12, 13, 14, 15, 16, 17)};
2557
2558 case 3:
2559 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
2560 10, 11, 12, 13, 14, 15, 16, 17, 18)};
2561
2562 case 4:
2563 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
2564 11, 12, 13, 14, 15, 16, 17, 18, 19)};
2565
2566 case 5:
2567 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
2568 12, 13, 14, 15, 16, 17, 18, 19, 20)};
2569
2570 case 6:
2571 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
2572 12, 13, 14, 15, 16, 17, 18, 19, 20,
2573 21)};
2574
2575 case 7:
2576 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
2577 13, 14, 15, 16, 17, 18, 19, 20, 21,
2578 22)};
2579
2580 case 8:
2581 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
2582 14, 15, 16, 17, 18, 19, 20, 21, 22,
2583 23)};
2584
2585 case 9:
2586 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
2587 15, 16, 17, 18, 19, 20, 21, 22, 23,
2588 24)};
2589
2590 case 10:
2591 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
2592 15, 16, 17, 18, 19, 20, 21, 22, 23,
2593 24, 25)};
2594
2595 case 11:
2596 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
2597 16, 17, 18, 19, 20, 21, 22, 23, 24,
2598 25, 26)};
2599
2600 case 12:
2601 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
2602 17, 18, 19, 20, 21, 22, 23, 24, 25,
2603 26, 27)};
2604
2605 case 13:
2606 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
2607 18, 19, 20, 21, 22, 23, 24, 25, 26,
2608 27, 28)};
2609
2610 case 14:
2611 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
2612 19, 20, 21, 22, 23, 24, 25, 26, 27,
2613 28, 29)};
2614
2615 case 15:
2616 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
2617 20, 21, 22, 23, 24, 25, 26, 27, 28,
2618 29, 30)};
2619 }
2620 return hi;
2621}
2622
2623 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2624 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
2625 constexpr size_t kSize = d.MaxBytes();
2626 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2627 const Repartition<uint8_t, decltype(d)> d8;
2628 using V8 = Vec128<uint8_t>;
2629 const DFromV<V8> dfull8;
2630 const Repartition<TFromD<D>, decltype(dfull8)> dfull;
2631 const V8 hi8{BitCast(d8, hi).raw};
2632 // Move into most-significant bytes
2633 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
2634 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
2635 return VFromD<D>{BitCast(dfull, r).raw};
2636}
2637
2638// ------------------------------ Broadcast/splat any lane
2639
2640 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2641 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2642 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2643 return Vec128<T, N>{wasm_i8x16_shuffle(
2644 v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane,
2645 kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
2646}
2647
2648template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
2649HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2650 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2651 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
2652 kLane, kLane, kLane, kLane, kLane)};
2653}
2654
2655template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
2656HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2657 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2658 return Vec128<T, N>{
2659 wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
2660}
2661
2662template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
2663HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2664 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2665 return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
2666}
2667
2668// ------------------------------ TableLookupBytes
2669
2670// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
2671// lane indices in [0, 16).
2672template <typename T, size_t N, typename TI, size_t NI>
2673HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
2674 const Vec128<TI, NI> from) {
2675 return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2676}
2677
2678template <typename T, size_t N, typename TI, size_t NI>
2679HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
2680 const Vec128<TI, NI> from) {
2681 const DFromV<decltype(from)> d;
2682 // Mask size must match vector type, so cast everything to this type.
2683 Repartition<int8_t, decltype(d)> di8;
2684 Repartition<int8_t, DFromV<decltype(bytes)>> d_bytes8;
2685 const auto msb = BitCast(di8, from) < Zero(di8);
2686 const auto lookup =
2687 TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
2688 return BitCast(d, IfThenZeroElse(msb, lookup));
2689}
2690
2691// ------------------------------ Hard-coded shuffles
2692
2693// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
2694// Shuffle0321 rotates one lane to the right (the previous least-significant
2695// lane is now most-significant). These could also be implemented via
2696// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
2697
2698// Swap 32-bit halves in 64-bit halves.
2699template <typename T, size_t N>
2700HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
2701 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2702 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2703 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
2704}
2705
2706// These are used by generic_ops-inl to implement LoadInterleaved3.
2707namespace detail {
2708
2709 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2710 HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a,
2711 const Vec128<T, N> b) {
2712 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2713 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
2714 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2715 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2716}
2717 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
2718 HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a,
2719 const Vec128<T, N> b) {
2720 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2721 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
2722 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2723}
2724template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
2725HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a,
2726 const Vec128<T, N> b) {
2727 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2728 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2729}
2730
2731 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2732 HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a,
2733 const Vec128<T, N> b) {
2734 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2735 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
2736 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2737 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2738}
2739 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
2740 HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a,
2741 const Vec128<T, N> b) {
2742 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2743 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
2744 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2745}
2746template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
2747HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a,
2748 const Vec128<T, N> b) {
2749 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2750 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2751}
2752
2753 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2754 HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a,
2755 const Vec128<T, N> b) {
2756 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2757 return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
2758 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2759 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2760}
2761 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
2762 HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a,
2763 const Vec128<T, N> b) {
2764 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2765 return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
2766 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2767}
2768template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
2769HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a,
2770 const Vec128<T, N> b) {
2771 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2772 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2773}
2774
2775} // namespace detail
2776
2777// Swap 64-bit halves
2778template <typename T>
2779HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
2780 static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
2781 return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2782}
2783template <typename T>
2784HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
2785 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2786 return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2787}
2788
2789// Rotate right 32 bits
2790template <typename T>
2791HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
2792 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2793 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
2794}
2795
2796// Rotate left 32 bits
2797template <typename T>
2798HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
2799 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2800 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
2801}
2802
2803// Reverse
2804template <typename T>
2805HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
2806 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2807 return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
2808}
2809
2810// ------------------------------ TableLookupLanes
2811
2812// Returned by SetTableIndices for use by TableLookupLanes.
2813template <typename T, size_t N = 16 / sizeof(T)>
2814struct Indices128 {
2815 __v128_u raw;
2816};
2817
2818namespace detail {
2819
2820template <class D, HWY_IF_T_SIZE_D(D, 1)>
2821HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2822 D d) {
2823 const Repartition<uint8_t, decltype(d)> d8;
2824 return Iota(d8, 0);
2825}
2826
2827template <class D, HWY_IF_T_SIZE_D(D, 2)>
2828HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2829 D d) {
2830 const Repartition<uint8_t, decltype(d)> d8;
2831 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
2832 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
2833 return Load(d8, kBroadcastLaneBytes);
2834}
2835
2836 template <class D, HWY_IF_T_SIZE_D(D, 4)>
2837 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2838 D d) {
2839 const Repartition<uint8_t, decltype(d)> d8;
2840 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
2841 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2842 return Load(d8, kBroadcastLaneBytes);
2843}
2844
2845 template <class D, HWY_IF_T_SIZE_D(D, 8)>
2846 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2847 D d) {
2848 const Repartition<uint8_t, decltype(d)> d8;
2849 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
2850 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2851 return Load(d8, kBroadcastLaneBytes);
2852}
2853
2854 template <class D, HWY_IF_T_SIZE_D(D, 1)>
2855 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2856 const Repartition<uint8_t, decltype(d)> d8;
2857 return Zero(d8);
2858}
2859
2860 template <class D, HWY_IF_T_SIZE_D(D, 2)>
2861 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2862 const Repartition<uint8_t, decltype(d)> d8;
2863 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
2864 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
2865 return Load(d8, kByteOffsets);
2866}
2867
2868 template <class D, HWY_IF_T_SIZE_D(D, 4)>
2869 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2870 const Repartition<uint8_t, decltype(d)> d8;
2871 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
2872 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
2873 return Load(d8, kByteOffsets);
2874}
2875
2876 template <class D, HWY_IF_T_SIZE_D(D, 8)>
2877 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2878 const Repartition<uint8_t, decltype(d)> d8;
2879 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
2880 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
2881 return Load(d8, kByteOffsets);
2882}
2883
2884} // namespace detail
2885
2886template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16),
2887 HWY_IF_T_SIZE_D(D, 1)>
2888 HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
2889 D d, Vec128<TI, MaxLanes(D())> vec) {
2890 using T = TFromD<D>;
2891 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2892#if HWY_IS_DEBUG_BUILD
2893 const RebindToUnsigned<decltype(d)> du;
2894 using TU = TFromD<decltype(du)>;
2895 HWY_DASSERT(AllTrue(
2896 du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
2897#endif
2898
2899 (void)d;
2900 return Indices128<TFromD<D>, MaxLanes(D())>{vec.raw};
2901}
2902
2903template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16),
2904 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
2905HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
2906 D d, Vec128<TI, MaxLanes(D())> vec) {
2907 using T = TFromD<D>;
2908 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2909#if HWY_IS_DEBUG_BUILD
2910 const RebindToUnsigned<decltype(d)> du;
2911 using TU = TFromD<decltype(du)>;
2912 HWY_DASSERT(AllTrue(
2913 du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
2914#endif
2915
2916 const Repartition<uint8_t, decltype(d)> d8;
2917 using V8 = VFromD<decltype(d8)>;
2918
2919 // Broadcast each lane index to all bytes of T and shift to bytes
2920 const V8 lane_indices = TableLookupBytes(
2921 BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
2922 constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
2923 const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
2924 const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
2925 return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw};
2926}
2927
2928 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
2929 HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(
2930 D d, const TI* idx) {
2931 const Rebind<TI, decltype(d)> di;
2932 return IndicesFromVec(d, LoadU(di, idx));
2933}
2934
2935template <typename T, size_t N>
2936HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2937 using TI = MakeSigned<T>;
2938 const DFromV<decltype(v)> d;
2939 const Rebind<TI, decltype(d)> di;
2940 return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
2941}
2942
2943template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2944HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
2945 Indices128<T, N> idx) {
2946 const DFromV<decltype(a)> d;
2947 const Twice<decltype(d)> dt;
2948// TableLookupLanes currently requires table and index vectors to be the same
2949// size, though a half-length index vector would be sufficient here.
2950#if HWY_IS_MSAN
2951 const Vec128<T, N> idx_vec{idx.raw};
2952 const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
2953#else
2954 // We only keep LowerHalf of the result, which is valid in idx.
2955 const Indices128<T, N * 2> idx2{idx.raw};
2956#endif
2957 return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
2958}
2959
2960template <typename T>
2961HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
2962 Indices128<T> idx) {
2963 const DFromV<decltype(a)> d;
2964 const Repartition<uint8_t, decltype(d)> du8;
2965
2966 const VFromD<decltype(du8)> byte_idx{idx.raw};
2967 const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F});
2968 // If ANDing did not change the index, it is for the lower half.
2969 const auto is_lo = (byte_idx == byte_idx_mod);
2970
2971 return BitCast(d, IfThenElse(is_lo, TableLookupBytes(a, byte_idx_mod),
2972 TableLookupBytes(b, byte_idx_mod)));
2973}
2974
2975// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
2976
2977// Single lane: no change
2978template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
2979HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
2980 return v;
2981}
2982
2983// 32-bit x2: shuffle
2984template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
2985HWY_API Vec64<T> Reverse(D /* tag */, const Vec64<T> v) {
2986 return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
2987}
2988
2989// 64-bit x2: shuffle
2990template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
2991HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) {
2992 return Shuffle01(v);
2993}
2994
2995// 32-bit x2: shuffle
2996template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
2997HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) {
2998 return Shuffle0123(v);
2999}
3000
3001// 16-bit
3002template <class D, HWY_IF_T_SIZE_D(D, 2)>
3003HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
3004 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3005 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
3006}
3007
3008template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
3009HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
3010 static constexpr int kN = 16 + Lanes(d);
3011 return VFromD<D>{wasm_i8x16_shuffle(
3012 v.raw, v.raw,
3013 // kN is adjusted to ensure we have valid indices for all lengths.
3014 kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9,
3015 kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)};
3016}
3017
3018// ------------------------------ Reverse2
3019
3020template <class D, HWY_IF_T_SIZE_D(D, 2)>
3021HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3022 const RepartitionToWide<RebindToUnsigned<decltype(d)>> dw;
3023 return BitCast(d, RotateRight<16>(BitCast(dw, v)));
3024}
3025
3026template <class D, HWY_IF_T_SIZE_D(D, 4)>
3027HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) {
3028 return Shuffle2301(v);
3029}
3030
3031template <class D, HWY_IF_T_SIZE_D(D, 8)>
3032HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) {
3033 return Shuffle01(v);
3034}
3035
3036// ------------------------------ Reverse4
3037
3038template <class D, HWY_IF_T_SIZE_D(D, 2)>
3039HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
3040 return VFromD<D>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)};
3041}
3042
3043template <class D, HWY_IF_T_SIZE_D(D, 4)>
3044HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
3045 return Shuffle0123(v);
3046}
3047
3048template <class D, HWY_IF_T_SIZE_D(D, 8)>
3049HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D>) {
3050 HWY_ASSERT(0); // don't have 4 u64 lanes
3051}
3052
3053// ------------------------------ Reverse8
3054
3055template <class D, HWY_IF_T_SIZE_D(D, 2)>
3056HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
3057 return Reverse(d, v);
3058}
3059
3060template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
3061HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D>) {
3062 HWY_ASSERT(0); // don't have 8 lanes for > 16-bit lanes
3063}
3064
3065// ------------------------------ InterleaveLower
3066
3067 template <size_t N>
3068 HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
3069 Vec128<uint8_t, N> b) {
3070 return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
3071 a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
3072}
3073 template <size_t N>
3074 HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
3075 Vec128<uint16_t, N> b) {
3076 return Vec128<uint16_t, N>{
3077 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
3078}
3079 template <size_t N>
3080 HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
3081 Vec128<uint32_t, N> b) {
3082 return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
3083}
3084 template <size_t N>
3085 HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
3086 Vec128<uint64_t, N> b) {
3087 return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
3088 }
3089
3090 template <size_t N>
3091 HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
3092 Vec128<int8_t, N> b) {
3093 return Vec128<int8_t, N>{wasm_i8x16_shuffle(
3094 a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
3095}
3096 template <size_t N>
3097 HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
3098 Vec128<int16_t, N> b) {
3099 return Vec128<int16_t, N>{
3100 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
3101}
3102 template <size_t N>
3103 HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
3104 Vec128<int32_t, N> b) {
3105 return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
3106}
3107 template <size_t N>
3108 HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
3109 Vec128<int64_t, N> b) {
3110 return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
3111}
3112
3113 template <size_t N>
3114 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
3115 Vec128<float, N> b) {
3116 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
3117}
3118
3119 template <size_t N>
3120 HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
3121 Vec128<double, N> b) {
3122 return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
3123}
3124
3125 template <class T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_SPECIAL_FLOAT(T)>
3126 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3127 const DFromV<decltype(a)> d;
3128 const RebindToUnsigned<decltype(d)> du;
3129 return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
3130}
3131
3132// Additional overload for the optional tag (all vector lengths).
3133 template <class D>
3134 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
3135 return InterleaveLower(a, b);
3136}
3137
3138// ------------------------------ InterleaveUpper (UpperHalf)
3139
3140// All functions inside detail lack the required D parameter.
3141namespace detail {
3142
3143 template <size_t N>
3144 HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
3145 Vec128<uint8_t, N> b) {
3146 return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
3147 26, 11, 27, 12, 28, 13, 29, 14,
3148 30, 15, 31)};
3149}
3150 template <size_t N>
3151 HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
3152 Vec128<uint16_t, N> b) {
3153 return Vec128<uint16_t, N>{
3154 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
3155}
3156 template <size_t N>
3157 HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
3158 Vec128<uint32_t, N> b) {
3159 return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
3160}
3161 template <size_t N>
3162 HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
3163 Vec128<uint64_t, N> b) {
3164 return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
3165 }
3166
3167 template <size_t N>
3168 HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
3169 Vec128<int8_t, N> b) {
3170 return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
3171 26, 11, 27, 12, 28, 13, 29, 14,
3172 30, 15, 31)};
3173}
3174 template <size_t N>
3175 HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
3176 Vec128<int16_t, N> b) {
3177 return Vec128<int16_t, N>{
3178 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
3179}
3180 template <size_t N>
3181 HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
3182 Vec128<int32_t, N> b) {
3183 return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
3184}
3185 template <size_t N>
3186 HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
3187 Vec128<int64_t, N> b) {
3188 return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
3189}
3190
3191 template <size_t N>
3192 HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
3193 Vec128<float, N> b) {
3194 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
3195}
3196
3197 template <size_t N>
3198 HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a,
3199 Vec128<double, N> b) {
3200 return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
3201}
3202
3203} // namespace detail
3204
3205// Full
3206template <class D, typename T = TFromD<D>>
3207HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) {
3208 return detail::InterleaveUpper(a, b);
3209}
3210
3211// Partial
3212 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
3213 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
3214 const Half<decltype(d)> d2;
3215 return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
3216 VFromD<D>{UpperHalf(d2, b).raw});
3217}
3218
3219// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3220
3221// Same as Interleave*, except that the return lanes are double-width integers;
3222// this is necessary because the single-lane scalar cannot return two values.
3223template <class V, class DW = RepartitionToWide<DFromV<V>>>
3224HWY_API VFromD<DW> ZipLower(V a, V b) {
3225 return BitCast(DW(), InterleaveLower(a, b));
3226}
3227template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
3228HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
3229 return BitCast(dw, InterleaveLower(D(), a, b));
3230}
3231
3232template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
3233HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
3234 return BitCast(dw, InterleaveUpper(D(), a, b));
3235}
3236
3237// ------------------------------ Per4LaneBlockShuffle
3238namespace detail {
3239
3240template <size_t kIdx3210, size_t kVectSize, class V,
3241 HWY_IF_LANES_LE(kVectSize, 16)>
3242 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
3243 hwy::SizeTag<1> /*lane_size_tag*/,
3244 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
3245 V v) {
3246 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
3247 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
3248 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
3249 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
3250 return V{wasm_i8x16_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3,
3251 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4,
3252 kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
3253 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
3254}
3255
3256template <size_t kIdx3210, size_t kVectSize, class V,
3257 HWY_IF_LANES_LE(kVectSize, 16)>
3258 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
3259 hwy::SizeTag<2> /*lane_size_tag*/,
3260 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
3261 V v) {
3262 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
3263 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
3264 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
3265 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
3266 return V{wasm_i16x8_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3,
3267 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
3268}
3269
3270template <size_t kIdx3210, size_t kVectSize, class V,
3271 HWY_IF_LANES_LE(kVectSize, 16)>
3272 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
3273 hwy::SizeTag<4> /*lane_size_tag*/,
3274 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
3275 V v) {
3276 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
3277 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
3278 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
3279 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
3280 return V{wasm_i32x4_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
3281}
3282
3283} // namespace detail
3284
3285// ------------------------------ SlideUpLanes
3286
3287namespace detail {
3288
3289template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
3290HWY_INLINE V SlideUpLanes(V v, size_t amt) {
3291 const DFromV<decltype(v)> d;
3292 const Full64<uint64_t> du64;
3293 const auto vu64 = ResizeBitCast(du64, v);
3294 return ResizeBitCast(
3295 d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
3296}
3297
3298template <class V, HWY_IF_V_SIZE_V(V, 16)>
3299HWY_INLINE V SlideUpLanes(V v, size_t amt) {
3300 const DFromV<decltype(v)> d;
3301 const Repartition<uint8_t, decltype(d)> du8;
3302 const auto idx =
3303 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
3304 return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
3305}
3306
3307} // namespace detail
3308
3309template <class D, HWY_IF_LANES_D(D, 1)>
3310HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
3311 return v;
3312}
3313
3314template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
3315HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3316#if !HWY_IS_DEBUG_BUILD
3317 if (__builtin_constant_p(amt)) {
3318 switch (amt) {
3319 case 0:
3320 return v;
3321 case 1:
3322 return ShiftLeftLanes<1>(d, v);
3323 }
3324 }
3325#else
3326 (void)d;
3327#endif
3328
3329 return detail::SlideUpLanes(v, amt);
3330}
3331
3332template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
3333HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3334#if !HWY_IS_DEBUG_BUILD
3335 if (__builtin_constant_p(amt)) {
3336 switch (amt) {
3337 case 0:
3338 return v;
3339 case 1:
3340 return ShiftLeftLanes<1>(d, v);
3341 case 2:
3342 return ShiftLeftLanes<2>(d, v);
3343 case 3:
3344 return ShiftLeftLanes<3>(d, v);
3345 }
3346 }
3347#else
3348 (void)d;
3349#endif
3350
3351 return detail::SlideUpLanes(v, amt);
3352}
3353
3354template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
3355HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3356#if !HWY_IS_DEBUG_BUILD
3357 if (__builtin_constant_p(amt)) {
3358 switch (amt) {
3359 case 0:
3360 return v;
3361 case 1:
3362 return ShiftLeftLanes<1>(d, v);
3363 case 2:
3364 return ShiftLeftLanes<2>(d, v);
3365 case 3:
3366 return ShiftLeftLanes<3>(d, v);
3367 case 4:
3368 return ShiftLeftLanes<4>(d, v);
3369 case 5:
3370 return ShiftLeftLanes<5>(d, v);
3371 case 6:
3372 return ShiftLeftLanes<6>(d, v);
3373 case 7:
3374 return ShiftLeftLanes<7>(d, v);
3375 }
3376 }
3377#else
3378 (void)d;
3379#endif
3380
3381 return detail::SlideUpLanes(v, amt);
3382}
3383
3384template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
3385HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3386#if !HWY_IS_DEBUG_BUILD
3387 if (__builtin_constant_p(amt)) {
3388 switch (amt) {
3389 case 0:
3390 return v;
3391 case 1:
3392 return ShiftLeftLanes<1>(d, v);
3393 case 2:
3394 return ShiftLeftLanes<2>(d, v);
3395 case 3:
3396 return ShiftLeftLanes<3>(d, v);
3397 case 4:
3398 return ShiftLeftLanes<4>(d, v);
3399 case 5:
3400 return ShiftLeftLanes<5>(d, v);
3401 case 6:
3402 return ShiftLeftLanes<6>(d, v);
3403 case 7:
3404 return ShiftLeftLanes<7>(d, v);
3405 case 8:
3406 return ShiftLeftLanes<8>(d, v);
3407 case 9:
3408 return ShiftLeftLanes<9>(d, v);
3409 case 10:
3410 return ShiftLeftLanes<10>(d, v);
3411 case 11:
3412 return ShiftLeftLanes<11>(d, v);
3413 case 12:
3414 return ShiftLeftLanes<12>(d, v);
3415 case 13:
3416 return ShiftLeftLanes<13>(d, v);
3417 case 14:
3418 return ShiftLeftLanes<14>(d, v);
3419 case 15:
3420 return ShiftLeftLanes<15>(d, v);
3421 }
3422 }
3423#else
3424 (void)d;
3425#endif
3426
3427 return detail::SlideUpLanes(v, amt);
3428}
3429
3430// ------------------------------ SlideDownLanes
3431
3432namespace detail {
3433
3434template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
3435HWY_INLINE V SlideDownLanes(V v, size_t amt) {
3436 const DFromV<decltype(v)> d;
3437 const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv;
3438 return BitCast(d,
3439 ShiftRightSame(BitCast(dv, v),
3440 static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
3441}
3442
3443template <class V, HWY_IF_V_SIZE_V(V, 16)>
3444HWY_INLINE V SlideDownLanes(V v, size_t amt) {
3445 const DFromV<decltype(v)> d;
3446 const Repartition<int8_t, decltype(d)> di8;
3447 auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
3448 idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
3449 return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
3450}
3451
3452} // namespace detail
3453
3454template <class D, HWY_IF_LANES_D(D, 1)>
3455HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
3456 return v;
3457}
3458
3459template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
3460HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3461#if !HWY_IS_DEBUG_BUILD
3462 if (__builtin_constant_p(amt)) {
3463 switch (amt) {
3464 case 0:
3465 return v;
3466 case 1:
3467 return ShiftRightLanes<1>(d, v);
3468 }
3469 }
3470#else
3471 (void)d;
3472#endif
3473
3474 return detail::SlideDownLanes(v, amt);
3475}
3476
3477template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
3478HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3479#if !HWY_IS_DEBUG_BUILD
3480 if (__builtin_constant_p(amt)) {
3481 switch (amt) {
3482 case 0:
3483 return v;
3484 case 1:
3485 return ShiftRightLanes<1>(d, v);
3486 case 2:
3487 return ShiftRightLanes<2>(d, v);
3488 case 3:
3489 return ShiftRightLanes<3>(d, v);
3490 }
3491 }
3492#else
3493 (void)d;
3494#endif
3495
3496 return detail::SlideDownLanes(v, amt);
3497}
3498
3499template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
3500HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3501#if !HWY_IS_DEBUG_BUILD
3502 if (__builtin_constant_p(amt)) {
3503 switch (amt) {
3504 case 0:
3505 return v;
3506 case 1:
3507 return ShiftRightLanes<1>(d, v);
3508 case 2:
3509 return ShiftRightLanes<2>(d, v);
3510 case 3:
3511 return ShiftRightLanes<3>(d, v);
3512 case 4:
3513 return ShiftRightLanes<4>(d, v);
3514 case 5:
3515 return ShiftRightLanes<5>(d, v);
3516 case 6:
3517 return ShiftRightLanes<6>(d, v);
3518 case 7:
3519 return ShiftRightLanes<7>(d, v);
3520 }
3521 }
3522#else
3523 (void)d;
3524#endif
3525
3526 return detail::SlideDownLanes(v, amt);
3527}
3528
3529template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
3530HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3531#if !HWY_IS_DEBUG_BUILD
3532 if (__builtin_constant_p(amt)) {
3533 switch (amt) {
3534 case 0:
3535 return v;
3536 case 1:
3537 return ShiftRightLanes<1>(d, v);
3538 case 2:
3539 return ShiftRightLanes<2>(d, v);
3540 case 3:
3541 return ShiftRightLanes<3>(d, v);
3542 case 4:
3543 return ShiftRightLanes<4>(d, v);
3544 case 5:
3545 return ShiftRightLanes<5>(d, v);
3546 case 6:
3547 return ShiftRightLanes<6>(d, v);
3548 case 7:
3549 return ShiftRightLanes<7>(d, v);
3550 case 8:
3551 return ShiftRightLanes<8>(d, v);
3552 case 9:
3553 return ShiftRightLanes<9>(d, v);
3554 case 10:
3555 return ShiftRightLanes<10>(d, v);
3556 case 11:
3557 return ShiftRightLanes<11>(d, v);
3558 case 12:
3559 return ShiftRightLanes<12>(d, v);
3560 case 13:
3561 return ShiftRightLanes<13>(d, v);
3562 case 14:
3563 return ShiftRightLanes<14>(d, v);
3564 case 15:
3565 return ShiftRightLanes<15>(d, v);
3566 }
3567 }
3568#else
3569 (void)d;
3570#endif
3571
3572 return detail::SlideDownLanes(v, amt);
3573}
3574
3575// ================================================== COMBINE
3576
3577// ------------------------------ Combine (InterleaveLower)
3578
3579// N = N/2 + N/2 (upper half undefined)
3580template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
3581HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
3582 const Half<decltype(d)> dh;
3583 const RebindToUnsigned<decltype(dh)> duh;
3584 // Treat half-width input as one lane, and expand to two lanes.
3585 using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
3586 const VU lo{BitCast(duh, lo_half).raw};
3587 const VU hi{BitCast(duh, hi_half).raw};
3588 return BitCast(d, InterleaveLower(lo, hi));
3589}
3590
3591// ------------------------------ ZeroExtendVector (IfThenElseZero)
3592 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3593 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
3594 const Half<D> dh;
3595 return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
3596}
3597
3598// ------------------------------ ConcatLowerLower
3599template <class D, typename T = TFromD<D>>
3600HWY_API Vec128<T> ConcatLowerLower(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3601 return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
3602}
3603
3604// ------------------------------ ConcatUpperUpper
3605template <class D, typename T = TFromD<D>>
3606HWY_API Vec128<T> ConcatUpperUpper(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3607 return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
3608}
3609
3610// ------------------------------ ConcatLowerUpper
3611template <class D, typename T = TFromD<D>>
3612HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
3613 return CombineShiftRightBytes<8>(d, hi, lo);
3614}
3615
3616// ------------------------------ ConcatUpperLower
3617template <class D, typename T = TFromD<D>>
3618HWY_API Vec128<T> ConcatUpperLower(D d, Vec128<T> hi, Vec128<T> lo) {
3619 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
3620}
3621
3622// ------------------------------ Concat partial (Combine, LowerHalf)
3623
3624 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
3625 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
3626 const Half<decltype(d)> d2;
3627 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
3628}
3629
3630 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
3631 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
3632 const Half<decltype(d)> d2;
3633 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
3634}
3635
3636 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
3637 HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi,
3638 const VFromD<D> lo) {
3639 const Half<decltype(d)> d2;
3640 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
3641}
3642
3643 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
3644 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
3645 const Half<decltype(d)> d2;
3646 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
3647}
3648
3649// ------------------------------ ConcatOdd
3650
3651// 8-bit full
3652template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3653HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3654 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
3655 17, 19, 21, 23, 25, 27, 29, 31)};
3656}
3657
3658// 8-bit x8
3659template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3660HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
3661 // Don't care about upper half.
3662 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
3663 23, 1, 3, 5, 7, 17, 19, 21, 23)};
3664}
3665
3666// 8-bit x4
3667template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3668HWY_API Vec32<T> ConcatOdd(D /* tag */, Vec32<T> hi, Vec32<T> lo) {
3669 // Don't care about upper 3/4.
3670 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
3671 19, 1, 3, 17, 19, 1, 3, 17, 19)};
3672}
3673
3674// 16-bit full
3675template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
3676HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3677 return Vec128<T>{
3678 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
3679}
3680
3681// 16-bit x4
3682template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
3683HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
3684 // Don't care about upper half.
3685 return Vec128<T, 4>{
3686 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
3687}
3688
3689// 32-bit full
3690template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
3691HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3692 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
3693}
3694
3695// Any T x2
3696template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
3697HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
3698 return InterleaveUpper(d, lo, hi);
3699}
3700
3701// ------------------------------ ConcatEven (InterleaveLower)
3702
3703// 8-bit full
3704template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3705HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3706 return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
3707 16, 18, 20, 22, 24, 26, 28, 30)};
3708}
3709
3710// 8-bit x8
3711template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3712HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
3713 // Don't care about upper half.
3714 return Vec64<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22,
3715 0, 2, 4, 6, 16, 18, 20, 22)};
3716}
3717
3718// 8-bit x4
3719template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3720HWY_API Vec32<T> ConcatEven(D /* tag */, Vec32<T> hi, Vec32<T> lo) {
3721 // Don't care about upper 3/4.
3722 return Vec32<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18,
3723 0, 2, 16, 18, 0, 2, 16, 18)};
3724}
3725
3726// 16-bit full
3727template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
3728HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3729 return Vec128<T>{
3730 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
3731}
3732
3733// 16-bit x4
3734template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
3735HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
3736 // Don't care about upper half.
3737 return Vec64<T>{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
3738}
3739
3740// 32-bit full
3741template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
3742HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
3743 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
3744}
3745
3746// Any T x2
3747template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
3748HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
3749 return InterleaveLower(d, lo, hi);
3750}
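// Illustration (added note, not part of the original file): for u32 lanes
// with hi = {h0,h1,h2,h3} and lo = {l0,l1,l2,l3}:
//   ConcatOdd(d, hi, lo)  == {l1, l3, h1, h3}
//   ConcatEven(d, hi, lo) == {l0, l2, h0, h2}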
3751
3752// ------------------------------ DupEven (InterleaveLower)
3753
3754template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
3755HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3756 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6,
3757 8, 8, 10, 10, 12, 12, 14, 14)};
3758}
3759
3760template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3761HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3762 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6)};
3763}
3764
3765template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
3766HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3767 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
3768}
3769
3770template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
3771HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
3772 return InterleaveLower(DFromV<decltype(v)>(), v, v);
3773}
3774
3775// ------------------------------ DupOdd (InterleaveUpper)
3776
3777template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
3778HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3779 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7,
3780 9, 9, 11, 11, 13, 13, 15, 15)};
3781}
3782
3783template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3784HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3785 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7)};
3786}
3787
3788template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
3789HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3790 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
3791}
3792
3793template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
3794HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
3795 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
3796}
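// Illustration (added note, not part of the original file): for u32 lanes,
// DupEven({a, b, c, d}) == {a, a, c, c} and DupOdd({a, b, c, d}) == {b, b, d, d}.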
3797
3798// ------------------------------ OddEven
3799
3800namespace detail {
3801
3802template <typename T, size_t N>
3803HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
3804 const Vec128<T, N> b) {
3805 const DFromV<decltype(a)> d;
3806 const Repartition<uint8_t, decltype(d)> d8;
3807 alignas(16) static constexpr uint8_t mask[16] = {
3808 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3809 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3810}
3811template <typename T, size_t N>
3812HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
3813 const Vec128<T, N> b) {
3814 return Vec128<T, N>{
3815 wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3816}
3817template <typename T, size_t N>
3818HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
3819 const Vec128<T, N> b) {
3820 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3821}
3822template <typename T, size_t N>
3823HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
3824 const Vec128<T, N> b) {
3825 return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
3826}
3827
3828} // namespace detail
3829
3830template <typename T, size_t N>
3831HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
3832 return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3833}
3834template <size_t N>
3835HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3836 const Vec128<float, N> b) {
3837 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3838}
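// Illustration (added note, not part of the original file): odd lanes come
// from a and even lanes from b, e.g. for u32:
//   OddEven({a0,a1,a2,a3}, {b0,b1,b2,b3}) == {b0, a1, b2, a3}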
3839
3840// ------------------------------ InterleaveEven
3841template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3842HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3843 return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
3844 8, 24, 10, 26, 12, 28, 14, 30)};
3845}
3846
3847template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3848HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3849 return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
3850}
3851
3852template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
3853HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3854 return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
3855}
3856
3857template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
3858HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3859 return InterleaveLower(a, b);
3860}
3861
3862// ------------------------------ InterleaveOdd
3863template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3864HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3865 return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
3866 9, 25, 11, 27, 13, 29, 15, 31)};
3867}
3868
3869template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3870HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3871 return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
3872}
3873
3874template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
3875HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3876 return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
3877}
3878
3879template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
3880HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3881 return InterleaveUpper(d, a, b);
3882}
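// Illustration (added note, not part of the original file): for u32 lanes,
//   InterleaveEven(d, a, b) == {a0, b0, a2, b2}
//   InterleaveOdd(d, a, b)  == {a1, b1, a3, b3}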
3883
3884// ------------------------------ OddEvenBlocks
3885template <typename T, size_t N>
3886HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3887 return even;
3888}
3889
3890// ------------------------------ SwapAdjacentBlocks
3891
3892template <typename T, size_t N>
3893HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3894 return v;
3895}
3896
3897// ------------------------------ ReverseBlocks
3898
3899// Single block: no change
3900template <class D>
3901HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
3902 return v;
3903}
3904
3905// ================================================== CONVERT
3906
3907// ------------------------------ Promotions (part w/ narrow lanes -> full)
3908
3909// Unsigned: zero-extend.
3910template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
3911HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
3912 return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)};
3913}
3914template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
3915HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
3916 return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)};
3917}
3918template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
3919HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
3920 return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)};
3921}
3922
3923template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
3924HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
3925 return VFromD<D>{
3926 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3927}
3928
3929template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
3930HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
3931 return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)};
3932}
3933template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
3934HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
3935 return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)};
3936}
3937template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
3938HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
3939 return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)};
3940}
3941
3942template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
3943HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
3944 return VFromD<D>{
3945 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
3946}
3947
3948// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to
3949// TFromD<D>
3950template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D),
3951 HWY_IF_UNSIGNED_V(V),
3952 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
3953HWY_API VFromD<D> PromoteTo(D d, V v) {
3954 const Rebind<uint32_t, decltype(d)> du32;
3955 return PromoteTo(d, PromoteTo(du32, v));
3956}
3957
3958// Signed: replicate sign bit.
3959template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
3960HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
3961 return VFromD<D>{wasm_i16x8_extend_low_i8x16(v.raw)};
3962}
3963template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
3964HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
3965 return VFromD<D>{wasm_i32x4_extend_low_i16x8(v.raw)};
3966}
3967template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
3968HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
3969 return VFromD<D>{wasm_i64x2_extend_low_i32x4(v.raw)};
3970}
3971
3972template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
3973HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
3974 return VFromD<D>{
3975 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
3976}
3977
3978// I8/I16 to I64: First, promote to I32, and then promote to I64
3979template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D),
3980 HWY_IF_SIGNED_V(V),
3981 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
3982HWY_API VFromD<D> PromoteTo(D d, V v) {
3983 const Rebind<int32_t, decltype(d)> di32;
3984 return PromoteTo(d, PromoteTo(di32, v));
3985}
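// Example (illustrative sketch, not part of the original file): promotion of
// signed types replicates the sign bit, so negative values are preserved.
//   const Full64<int8_t> d8;                  // 8 lanes
//   const Rebind<int16_t, decltype(d8)> d16;  // 8 x i16 (128 bits)
//   const auto w = PromoteTo(d16, Set(d8, int8_t{-3}));  // all lanes == -3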
3986
3987template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3988HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
3989 const Rebind<uint16_t, decltype(df32)> du16;
3990 const RebindToSigned<decltype(df32)> di32;
3991 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3992}
3993
3994template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3995HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
3996 return VFromD<D>{wasm_f64x2_convert_low_i32x4(v.raw)};
3997}
3998
3999template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
4000HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4001 return VFromD<D>{wasm_f64x2_convert_low_u32x4(v.raw)};
4002}
4003
4004template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
4005HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
4006 return VFromD<D>{wasm_f64x2_promote_low_f32x4(v.raw)};
4007}
4008
4009template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
4010HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
4011 const Rebind<int32_t, decltype(di64)> di32;
4012 const RebindToFloat<decltype(di32)> df32;
4013 const RebindToUnsigned<decltype(di32)> du32;
4014 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
4015
4016 const auto exponent_adj = BitCast(
4017 du32,
4018 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
4019 BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
4020 BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
4021 const auto adj_v =
4022 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
4023
4024 const auto f32_to_i32_result = ConvertTo(di32, adj_v);
4025 const auto lo64_or_mask = PromoteTo(
4026 di64,
4027 BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
4028 Set(di32, LimitsMax<int32_t>())))));
4029
4030 return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
4031 << PromoteTo(di64, exponent_adj),
4032 lo64_or_mask);
4033}
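// Worked example (added note, not part of the original file): for v = 2^40,
// biased_exp = 167, so exponent_adj = min(167 - 157, 32) = 10 and adj_v =
// 2^30, which ConvertTo handles exactly; the 64-bit shift by exponent_adj
// restores 2^40. Inputs with biased_exp <= 157 (|v| < 2^31) pass through
// unscaled, and lo64_or_mask fills the low bits when the i32 conversion
// saturated, yielding LimitsMax<int64_t>.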
4034
4035template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
4036HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
4037 const Rebind<uint32_t, decltype(du64)> du32;
4038 const RebindToFloat<decltype(du32)> df32;
4039 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
4040
4041 const auto exponent_adj = BitCast(
4042 du32,
4043 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
4044 BitCast(du32_as_du8, Set(du32, uint32_t{158}))),
4045 BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
4046
4047 const auto adj_v =
4048 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
4049 const auto f32_to_u32_result = ConvertTo(du32, adj_v);
4050 const auto lo32_or_mask = PromoteTo(
4051 du64,
4052 VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>())));
4053
4054 return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj),
4055 lo32_or_mask);
4056}
4057
4058// ------------------------------ PromoteUpperTo
4059
4060// Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo.
4061#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
4062#undef HWY_NATIVE_PROMOTE_UPPER_TO
4063#else
4064#define HWY_NATIVE_PROMOTE_UPPER_TO
4065#endif
4066
4067// Unsigned: zero-extend.
4068template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
4069HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4070 VFromD<Repartition<uint8_t, D>> v) {
4071 return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)};
4072}
4073template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
4074HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4075 VFromD<Repartition<uint16_t, D>> v) {
4076 return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)};
4077}
4078template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
4079HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4080 VFromD<Repartition<uint32_t, D>> v) {
4081 return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)};
4082}
4083
4084template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
4085HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4086 VFromD<Repartition<uint8_t, D>> v) {
4087 return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)};
4088}
4089template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4090HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4091 VFromD<Repartition<uint16_t, D>> v) {
4092 return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)};
4093}
4094template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
4095HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4096 VFromD<Repartition<uint32_t, D>> v) {
4097 return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)};
4098}
4099
4100// Signed: replicate sign bit.
4101template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
4102HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4103 VFromD<Repartition<int8_t, D>> v) {
4104 return VFromD<D>{wasm_i16x8_extend_high_i8x16(v.raw)};
4105}
4106template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4107HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4108 VFromD<Repartition<int16_t, D>> v) {
4109 return VFromD<D>{wasm_i32x4_extend_high_i16x8(v.raw)};
4110}
4111template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
4112HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
4113 VFromD<Repartition<int32_t, D>> v) {
4114 return VFromD<D>{wasm_i64x2_extend_high_i32x4(v.raw)};
4115}
4116
4117template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
4118HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<float16_t, D>> v) {
4119 const Rebind<float16_t, decltype(df32)> dh;
4120 return PromoteTo(df32, UpperHalf(dh, v));
4121}
4122
4123template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
4124HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
4125 const Repartition<uint16_t, decltype(df32)> du16;
4126 const RebindToSigned<decltype(df32)> di32;
4127 return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
4128}
4129
4130template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
4131HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<int32_t, D>> v) {
4132 // There is no wasm_f64x2_convert_high_i32x4.
4133 return PromoteTo(dd, UpperHalf(Rebind<int32_t, D>(), v));
4134}
4135
4136template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
4137HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<uint32_t, D>> v) {
4138 // There is no wasm_f64x2_convert_high_u32x4.
4139 return PromoteTo(dd, UpperHalf(Rebind<uint32_t, D>(), v));
4140}
4141
4142template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
4144 // There is no wasm_f64x2_promote_high_f32x4.
4145 return PromoteTo(dd, UpperHalf(Rebind<float, D>(), v));
4146}
4147
4148template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
4149HWY_API VFromD<D> PromoteUpperTo(D d64, VFromD<Repartition<float, D>> v) {
4150 return PromoteTo(d64, UpperHalf(Rebind<float, D>(), v));
4151}
4152
4153// Generic version for <=64 bit input/output (_high is only for full vectors).
4154template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
4155HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
4156 const Rebind<TFromV<V>, decltype(d)> dh;
4157 return PromoteTo(d, UpperHalf(dh, v));
4158}
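// Example (illustrative sketch, not part of the original file):
//   const Full128<uint8_t> d8;
//   const Repartition<uint16_t, decltype(d8)> d16;
//   const auto v = Iota(d8, 0);              // {0, 1, ..., 15}
//   const auto hi = PromoteUpperTo(d16, v);  // {8, 9, ..., 15} as u16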
4159
4160// ------------------------------ Demotions (full -> part w/ narrow lanes)
4161
4162template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
4163HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4164 return VFromD<D>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
4165}
4166
4167template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
4168HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4169 return VFromD<D>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
4170}
4171
4172template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
4173HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4174 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
4175 return VFromD<D>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
4176}
4177
4178template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
4179HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
4180 return VFromD<D>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
4181}
4182
4183template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
4184HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4185 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
4186 return VFromD<D>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
4187}
4188
4189template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
4190HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
4191 return VFromD<D>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
4192}
4193
4194template <class D, HWY_IF_UNSIGNED_D(D),
4195 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
4196HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint32_t, D>> v) {
4197 const DFromV<decltype(v)> du32;
4198 const RebindToSigned<decltype(du32)> di32;
4199 return DemoteTo(dn, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF))));
4200}
4201
4202template <class D, HWY_IF_U8_D(D)>
4203HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
4204 const DFromV<decltype(v)> du16;
4205 const RebindToSigned<decltype(du16)> di16;
4206 return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF))));
4207}
4208
4209template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4210HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
4211 return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
4212}
4213
4214template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
4215HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
4216 return VFromD<D>{wasm_u32x4_trunc_sat_f64x2_zero(v.raw)};
4217}
4218
4219template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
4220HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
4221 return VFromD<D>{wasm_f32x4_demote_f64x2_zero(v.raw)};
4222}
4223
4224template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
4225HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
4226 const Rebind<double, decltype(df32)> df64;
4227 const RebindToUnsigned<decltype(df64)> du64;
4228 const RebindToSigned<decltype(df32)> di32;
4229 const RebindToUnsigned<decltype(df32)> du32;
4230
4231 const auto k2p64_63 = Set(df64, 27670116110564327424.0);
4232 const auto f64_hi52 =
4233 Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
4234 const auto f64_lo12 =
4235 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
4236 Set(du32, uint32_t{0x00000FFF}))));
4237
4238 const auto f64_sum = f64_hi52 + f64_lo12;
4239 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
4240
4241 const auto f64_sum_is_inexact =
4242 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
4243 const auto f64_bits_decrement =
4244 And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
4245 f64_sum_is_inexact);
4246
4247 const auto adj_f64_val = BitCast(
4248 df64,
4249 Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
4250
4251 return DemoteTo(df32, adj_f64_val);
4252}
4253template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
4254HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
4255 const Rebind<double, decltype(df32)> df64;
4256 const RebindToUnsigned<decltype(df64)> du64;
4257 const RebindToSigned<decltype(df32)> di32;
4258 const RebindToUnsigned<decltype(df32)> du32;
4259
4260 const auto k2p64 = Set(df64, 18446744073709551616.0);
4261 const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
4262 const auto f64_lo12 =
4263 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
4264 Set(du32, uint32_t{0x00000FFF}))));
4265
4266 const auto f64_sum = f64_hi52 + f64_lo12;
4267 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
4268 const auto f64_sum_is_inexact =
4269 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
4270
4271 const auto adj_f64_val = BitCast(
4272 df64,
4273 Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
4274 f64_sum_is_inexact));
4275
4276 return DemoteTo(df32, adj_f64_val);
4277}
4278
4279// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
4280// above 2*N.
4281template <class D, HWY_IF_I16_D(D)>
4282HWY_API Vec32<int16_t> ReorderDemote2To(D dn, Vec32<int32_t> a,
4283 Vec32<int32_t> b) {
4284 const DFromV<decltype(a)> d;
4285 const Twice<decltype(d)> dt;
4286 return DemoteTo(dn, Combine(dt, b, a));
4287}
4288template <class D, HWY_IF_I16_D(D)>
4289HWY_API Vec64<int16_t> ReorderDemote2To(D dn, Vec64<int32_t> a,
4290 Vec64<int32_t> b) {
4291 const Twice<decltype(dn)> dn_full;
4292 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4293
4294 const Vec128<int16_t> v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
4295 const auto vu32_full = BitCast(du32_full, v_full);
4296 return LowerHalf(
4297 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4298}
4299template <class D, HWY_IF_I16_D(D)>
4300HWY_API Vec128<int16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
4301 Vec128<int32_t> b) {
4302 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
4303}
4304
4305template <class D, HWY_IF_U16_D(D)>
4306HWY_API Vec32<uint16_t> ReorderDemote2To(D dn, Vec32<int32_t> a,
4307 Vec32<int32_t> b) {
4308 const DFromV<decltype(a)> d;
4309 const Twice<decltype(d)> dt;
4310 return DemoteTo(dn, Combine(dt, b, a));
4311}
4312template <class D, HWY_IF_U16_D(D)>
4313HWY_API Vec64<uint16_t> ReorderDemote2To(D dn, Vec64<int32_t> a,
4314 Vec64<int32_t> b) {
4315 const Twice<decltype(dn)> dn_full;
4316 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4317
4318 const Vec128<int16_t> v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)};
4319 const auto vu32_full = BitCast(du32_full, v_full);
4320 return LowerHalf(
4321 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4322}
4323template <class D, HWY_IF_U16_D(D)>
4324HWY_API Vec128<uint16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
4325 Vec128<int32_t> b) {
4326 return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(a.raw, b.raw)};
4327}
4328
4329template <class D, HWY_IF_U16_D(D)>
4330HWY_API Vec128<uint16_t> ReorderDemote2To(D dn, Vec128<uint32_t> a,
4331 Vec128<uint32_t> b) {
4332 const DFromV<decltype(a)> du32;
4333 const RebindToSigned<decltype(du32)> di32;
4334 const auto max_i32 = Set(du32, 0x7FFFFFFFu);
4335
4336 const auto clamped_a = BitCast(di32, Min(a, max_i32));
4337 const auto clamped_b = BitCast(di32, Min(b, max_i32));
4338 return ReorderDemote2To(dn, clamped_a, clamped_b);
4339}
4340template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
4341HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a,
4342 VFromD<Repartition<uint32_t, D>> b) {
4343 const DFromV<decltype(a)> d;
4344 const Twice<decltype(d)> dt;
4345 return DemoteTo(dn, Combine(dt, b, a));
4346}
4347
4348// Specializations for partial vectors because i8x16_narrow_i16x8 sets lanes
4349// above 2*N.
4350template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
4351HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
4352 VFromD<Repartition<int16_t, D>> b) {
4353 const DFromV<decltype(a)> d;
4354 const Twice<decltype(d)> dt;
4355 return DemoteTo(dn, Combine(dt, b, a));
4356}
4357template <class D, HWY_IF_I8_D(D)>
4358HWY_API Vec64<int8_t> ReorderDemote2To(D dn, Vec64<int16_t> a,
4359 Vec64<int16_t> b) {
4360 const Twice<decltype(dn)> dn_full;
4361 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4362
4363 const Vec128<int8_t> v_full{wasm_i8x16_narrow_i16x8(a.raw, b.raw)};
4364 const auto vu32_full = BitCast(du32_full, v_full);
4365 return LowerHalf(
4366 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4367}
4368template <class D, HWY_IF_I8_D(D)>
4369HWY_API Vec128<int8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
4370 Vec128<int16_t> b) {
4371 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(a.raw, b.raw)};
4372}
4373
4374template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
4375HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
4376 VFromD<Repartition<int16_t, D>> b) {
4377 const DFromV<decltype(a)> d;
4378 const Twice<decltype(d)> dt;
4379 return DemoteTo(dn, Combine(dt, b, a));
4380}
4381template <class D, HWY_IF_U8_D(D)>
4382HWY_API Vec64<uint8_t> ReorderDemote2To(D dn, Vec64<int16_t> a,
4383 Vec64<int16_t> b) {
4384 const Twice<decltype(dn)> dn_full;
4385 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4386
4387 const Vec128<uint8_t> v_full{wasm_u8x16_narrow_i16x8(a.raw, b.raw)};
4388 const auto vu32_full = BitCast(du32_full, v_full);
4389 return LowerHalf(
4390 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4391}
4392template <class D, HWY_IF_U8_D(D)>
4393HWY_API Vec128<uint8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
4394 Vec128<int16_t> b) {
4395 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(a.raw, b.raw)};
4396}
4397
4398template <class D, HWY_IF_U8_D(D)>
4399HWY_API Vec128<uint8_t> ReorderDemote2To(D dn, Vec128<uint16_t> a,
4400 Vec128<uint16_t> b) {
4401 const DFromV<decltype(a)> du16;
4402 const RebindToSigned<decltype(du16)> di16;
4403 const auto max_i16 = Set(du16, 0x7FFFu);
4404
4405 const auto clamped_a = BitCast(di16, Min(a, max_i16));
4406 const auto clamped_b = BitCast(di16, Min(b, max_i16));
4407 return ReorderDemote2To(dn, clamped_a, clamped_b);
4408}
4409template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
4410HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a,
4411 VFromD<Repartition<uint16_t, D>> b) {
4412 const DFromV<decltype(a)> d;
4413 const Twice<decltype(d)> dt;
4414 return DemoteTo(dn, Combine(dt, b, a));
4415}
4416
4417// For already range-limited input [0, 255].
4418template <size_t N>
4419HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
4420 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
4421 return Vec128<uint8_t, N>{
4422 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
4423}
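// Example (illustrative sketch, not part of the original file): demotion
// saturates to the narrower range.
//   const Full128<int32_t> d32;
//   const Rebind<uint8_t, decltype(d32)> d8;
//   const auto r = DemoteTo(d8, Set(d32, 300));  // all lanes == 255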
4424
4425// ------------------------------ Truncations
4426
4427template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
4428HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
4429 // BitCast requires the same size; DTo might be u8x1 and v u16x1.
4430 const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
4431 return VFromD<DTo>{BitCast(dto, v).raw};
4432}
4433
4434template <class D, HWY_IF_U8_D(D)>
4435HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
4436 const Full128<uint8_t> d;
4437 const auto v1 = BitCast(d, v);
4438 const auto v2 = ConcatEven(d, v1, v1);
4439 const auto v4 = ConcatEven(d, v2, v2);
4440 return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
4441}
4442
4443template <class D, HWY_IF_U16_D(D)>
4444HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
4445 const Full128<uint16_t> d;
4446 const auto v1 = BitCast(d, v);
4447 const auto v2 = ConcatEven(d, v1, v1);
4448 return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
4449}
4450
4451template <class D, HWY_IF_U32_D(D)>
4452HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
4453 const Full128<uint32_t> d;
4454 const auto v1 = BitCast(d, v);
4455 return LowerHalf(ConcatEven(d, v1, v1));
4456}
4457
4458template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
4459HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4460 const Repartition<uint8_t, DFromV<decltype(v)>> d;
4461 const auto v1 = Vec128<uint8_t>{v.raw};
4462 const auto v2 = ConcatEven(d, v1, v1);
4463 const auto v3 = ConcatEven(d, v2, v2);
4464 return VFromD<D>{v3.raw};
4465}
4466
4467template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
4468HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4469 const Repartition<uint16_t, DFromV<decltype(v)>> d;
4470 const auto v1 = Vec128<uint16_t>{v.raw};
4471 const auto v2 = ConcatEven(d, v1, v1);
4472 return VFromD<D>{v2.raw};
4473}
4474
4475template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
4476HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
4477 const Repartition<uint8_t, DFromV<decltype(v)>> d;
4478 const auto v1 = Vec128<uint8_t>{v.raw};
4479 const auto v2 = ConcatEven(d, v1, v1);
4480 return VFromD<D>{v2.raw};
4481}
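// Example (illustrative sketch, not part of the original file): TruncateTo
// keeps the low bits, unlike the saturating DemoteTo.
//   const Full128<uint32_t> d32;
//   const Rebind<uint8_t, decltype(d32)> d8;
//   TruncateTo(d8, Set(d32, 0x123u));  // all lanes == 0x23
//   DemoteTo(d8, Set(d32, 0x123u));    // all lanes == 0xFF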
4482
4483// ------------------------------ Demotions to/from i64
4484
4485namespace detail {
4486template <class D, HWY_IF_UNSIGNED_D(D)>
4487HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
4488 D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
4489 return v;
4490}
4491
4492template <class D, HWY_IF_SIGNED_D(D)>
4493HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
4494 D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
4495 const DFromV<decltype(v)> du64;
4496 return And(v,
4497 Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>())));
4498}
4499
4500template <class D>
4501HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate(
4502 D dn, VFromD<Rebind<uint64_t, D>> v) {
4503 const Rebind<uint64_t, D> du64;
4504 const RebindToSigned<decltype(du64)> di64;
4505 constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) -
4506 static_cast<int>(hwy::IsSigned<TFromD<D>>());
4507
4508 const auto too_big = BitCast(
4509 du64, VecFromMask(
4510 di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
4511 return DemoteFromU64MaskOutResult(dn, Or(v, too_big));
4512}
4513
4514template <class D, class V>
4515HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) {
4516 return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
4517}
4518
4519} // namespace detail
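// Worked example (added note, not part of the original file): demoting u64
// 0x100000005 to u32: ShiftRight<32> is nonzero, so `too_big` is all-ones
// and Or(v, too_big) == ~0; TruncateTo then returns 0xFFFFFFFF, the
// saturated maximum. For signed targets, DemoteFromU64MaskOutResult instead
// masks the result down to HighestValue, e.g. 0x7FFFFFFF for i32.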
4520
4521template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4522 HWY_IF_SIGNED_D(D)>
4523HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
4524 const DFromV<decltype(v)> di64;
4525 const RebindToUnsigned<decltype(di64)> du64;
4526 const RebindToUnsigned<decltype(dn)> dn_u;
4527
4528 // Negative values are saturated by first saturating their bitwise inverse
4529 // and then inverting the saturation result
4530 const auto invert_mask = BitCast(du64, BroadcastSignBit(v));
4531 const auto saturated_vals = Xor(
4532 invert_mask,
4533 detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v))));
4534 return BitCast(dn, TruncateTo(dn_u, saturated_vals));
4535}
4536
4537template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4538 HWY_IF_UNSIGNED_D(D)>
4539HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
4540 const DFromV<decltype(v)> di64;
4541 const RebindToUnsigned<decltype(di64)> du64;
4542
4543 const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v));
4544 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
4545}
4546
4547template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4548 HWY_IF_UNSIGNED_D(D)>
4549HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
4550 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v));
4551}
4552
4553template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4),
4554 HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)>
4555HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
4556 VFromD<Repartition<int64_t, D>> b) {
4557 const DFromV<decltype(a)> d;
4558 const Twice<decltype(d)> dt;
4559 return DemoteTo(dn, Combine(dt, b, a));
4560}
4561
4562template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
4563HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
4564 VFromD<Repartition<uint64_t, D>> b) {
4565 const DFromV<decltype(a)> d;
4566 const Twice<decltype(d)> dt;
4567 return DemoteTo(dn, Combine(dt, b, a));
4568}
4569
4570template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4571HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
4572 Vec128<int64_t> b) {
4573 const DFromV<decltype(a)> di64;
4574 const RebindToUnsigned<decltype(di64)> du64;
4575 const Half<decltype(dn)> dnh;
4576
4577 // Negative values are saturated by first saturating their bitwise inverse
4578 // and then inverting the saturation result
4579 const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a));
4580 const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b));
4581 const auto saturated_a = Xor(
4582 invert_mask_a,
4583 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
4584 const auto saturated_b = Xor(
4585 invert_mask_b,
4586 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));
4587
4588 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
4589}
4590
4591template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
4592HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
4593 Vec128<int64_t> b) {
4594 const DFromV<decltype(a)> di64;
4595 const RebindToUnsigned<decltype(di64)> du64;
4596 const Half<decltype(dn)> dnh;
4597
4598 const auto saturated_a = detail::DemoteFromU64Saturate(
4599 dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a)));
4600 const auto saturated_b = detail::DemoteFromU64Saturate(
4601 dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b)));
4602
4603 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
4604}
4605
4606template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
4607HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a,
4608 Vec128<uint64_t> b) {
4609 const Half<decltype(dn)> dnh;
4610
4611 const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
4612 const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);
4613
4614 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
4615}
4616
4617template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
4618 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4619 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
4620 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
4621HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4622 return ReorderDemote2To(d, a, b);
4623}
4624
4625// ------------------------------ ConvertTo
4626
4627template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
4628HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
4629 return VFromD<D>{wasm_f32x4_convert_i32x4(v.raw)};
4630}
4631template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
4632HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
4633 return VFromD<D>{wasm_f32x4_convert_u32x4(v.raw)};
4634}
4635
4636template <class D, HWY_IF_F64_D(D)>
4637HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) {
4638 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4639 const Repartition<uint32_t, decltype(dd)> d32;
4640 const Repartition<uint64_t, decltype(dd)> d64;
4641
4642 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4643 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4644 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4645
4646 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4647 const auto k52 = Set(d32, 0x43300000);
4648 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4649
4650 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4651 return (v_upper - k84_63_52) + v_lower; // order matters!
4652}
4653
4654namespace detail {
4655template <class VW>
4656HWY_INLINE VW U64ToF64VecFast(VW w) {
4657 const DFromV<decltype(w)> d64;
4658 const RebindToFloat<decltype(d64)> dd;
4659 const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52
4660 return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl;
4661}
4662} // namespace detail
4663
4664template <class D, HWY_IF_F64_D(D)>
4665HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
4666 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4667 const RebindToUnsigned<decltype(dd)> d64;
4668 using VU = VFromD<decltype(d64)>;
4669
4670 const VU msk_lo = Set(d64, 0xFFFFFFFF);
4671 const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
4672
4673 // Extract the 32 lowest/highest significant bits of v
4674 const VU v_lo = And(v, msk_lo);
4675 const VU v_hi = ShiftRight<32>(v);
4676
4677 const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo);
4678 return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl);
4679}
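// Worked note (added, not part of the original file): v == v_hi * 2^32 +
// v_lo, and each 32-bit half is below 2^52, so U64ToF64VecFast converts it
// exactly: OR-ing the bits under the 2^52 exponent and subtracting 2^52
// leaves the value, e.g. bits 0x4330000000000005 == 2^52 + 5.0, and
// subtracting 2^52 yields 5.0.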
4680
4681// Truncates (rounds toward zero).
4682template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
4683HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
4684 return VFromD<D>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
4685}
4686template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
4687HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
4688 return VFromD<D>{wasm_u32x4_trunc_sat_f32x4(v.raw)};
4689}
4690
4691template <class DI, HWY_IF_I64_D(DI)>
4692HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
4693 using VI = VFromD<decltype(di)>;
4694 using MI = MFromD<decltype(di)>;
4695 const RebindToUnsigned<decltype(di)> du;
4696 using VU = VFromD<decltype(du)>;
4697 const Repartition<uint16_t, decltype(di)> du16;
4698 const VI k1075 = Set(di, 1075); // biased exponent of 2^52
4699
4700 // Exponent indicates whether the number can be represented as int64_t.
4701 const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF);
4702 const MI in_range = BitCast(di, biased_exp) < Set(di, 1086);
4703
4704 // If we were to cap the exponent at 51 and add 2^52, the number would be in
4705 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4706 // round-to-0 (truncate).
4707 // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
4708 // shift_int since biased_exp[i] is a non-negative integer that is less than
4709 // or equal to 2047.
4710 // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
4711 // zero as the upper 48 bits of both k1075 and biased_exp are zero.
4712
4713 const VU shift_mnt = BitCast(
4714 du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
4715 const VU shift_int = BitCast(
4716 du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
4717 const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1);
4718 // Include implicit 1-bit
4719 VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
4720 // WASM clamps shift count; zero if greater.
4721 const MI tiny = BitCast(di, shift_mnt) > Set(di, 63);
4722 int53 = IfThenZeroElse(RebindMask(du, tiny), int53);
4723
4724 // For inputs larger than 2^53 - 1, insert zeros at the bottom.
4725 // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be
4726 // shifted out of the left shift result below as shift_int[i] <= 10 is true
4727 // for any inputs that are less than 2^63.
4728 const VU shifted = int53 << shift_int;
4729
4730 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4731 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4732 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4733 const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit);
4734
4735 // If the input was negative, negate the integer (two's complement).
4736 return (magnitude ^ sign_mask) - sign_mask;
4737}
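// Worked example (added note, not part of the original file): v = 3.0 has
// biased_exp = 1024, so shift_mnt = sat(1075 - 1024) = 51 and shift_int = 0.
// mantissa | (1ULL << 52) == 3 * 2^51, and (3 * 2^51) >> 51 == 3.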
4738
4739template <class DU, HWY_IF_U64_D(DU)>
4740HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
4741 const RebindToSigned<decltype(du)> di;
4742 using MI = MFromD<decltype(di)>;
4743 using VU = VFromD<decltype(du)>;
4744 const Repartition<uint16_t, decltype(di)> du16;
4745 const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */
4746
4747 const auto non_neg_v = ZeroIfNegative(v);
4748
4749 // Exponent indicates whether the number can be represented as int64_t.
4750 const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v));
4751 const VU out_of_range =
4752 BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));
4753
4754 // If we were to cap the exponent at 51 and add 2^52, the number would be in
4755 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4756 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4757 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4758 // manually shift the mantissa into place (we already have many of the
4759 // inputs anyway).
4760
4761 // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
4762 // shift_int since biased_exp[i] is a non-negative integer that is less than
4763 // or equal to 2047.
4764
4765 // 16-bit saturated unsigned subtraction is also more efficient than a
4766 // 64-bit subtraction followed by a 64-bit signed Max operation on
4767 // WASM.
4768
4769 // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
4770 // zero as the upper 48 bits of both k1075 and biased_exp are zero.
4771
4772 const VU shift_mnt = BitCast(
4773 du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
4774 const VU shift_int = BitCast(
4775 du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
4776 const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1);
4777 // Include implicit 1-bit.
4778 VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
4779 // WASM clamps shift count; zero if greater.
4780 const MI tiny = BitCast(di, shift_mnt) > Set(di, 63);
4781 int53 = IfThenZeroElse(RebindMask(du, tiny), int53);
4782
4783 // For inputs larger than 2^53 - 1, insert zeros at the bottom.
4784
4785 // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
4786 // shifted out of the left shift result below as shift_int[i] <= 11 is true
4787 // for any inputs that are less than 2^64.
4788
4789 const VU shifted = int53 << shift_int;
4790 return (shifted | out_of_range);
4791}
4792
4793// ------------------------------ NearestInt (Round)
4794template <size_t N>
4795HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
4796 return ConvertTo(RebindToSigned<DFromV<decltype(v)>>(), Round(v));
4797}
4798
4799// ================================================== MISC
4800
4801// ------------------------------ SumsOf8 (ShiftRight, Add)
4802template <size_t N>
4803HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
4804 const DFromV<decltype(v)> du8;
4805 const RepartitionToWide<decltype(du8)> du16;
4806 const RepartitionToWide<decltype(du16)> du32;
4807 const RepartitionToWide<decltype(du32)> du64;
4808 using VU16 = VFromD<decltype(du16)>;
4809
4810 const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
4811 const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
4812 const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
4813
4814 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
4815 BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
4816 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
4817 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
4818 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
4819 BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
4820 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
4821 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
4822 return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
4823}
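// Example (illustrative sketch, not part of the original file): each u64 lane
// receives the sum of its 8 bytes.
//   const Full128<uint8_t> d8;
//   const auto sums = SumsOf8(Iota(d8, 1));  // bytes 1..16
//   // sums == {36, 100}: 1+2+...+8 and 9+10+...+16.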
4824
4825template <size_t N>
4826HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) {
4827 const DFromV<decltype(v)> di8;
4828 const RepartitionToWide<decltype(di8)> di16;
4829 const RepartitionToWide<decltype(di16)> di32;
4830 const RepartitionToWide<decltype(di32)> di64;
4831 const RebindToUnsigned<decltype(di32)> du32;
4832 const RebindToUnsigned<decltype(di64)> du64;
4833 using VI16 = VFromD<decltype(di16)>;
4834
4835 const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
4836 const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
4837 const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
4838
4839 const VI16 sDC_zz_98_zz_54_zz_10_zz =
4840 BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
4841 const VI16 sFC_xx_B8_xx_74_xx_30_xx =
4842 Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
4843 const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
4844 BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
4845 const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
4846 Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
4847 return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
4848}
4849
4850// ------------------------------ LoadMaskBits (TestBit)
4851
4852namespace detail {
4853
4854template <class D, HWY_IF_T_SIZE_D(D, 1)>
4855HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
4856 const RebindToUnsigned<decltype(d)> du;
4857 // Easier than Set(), which would require a >8-bit type, which would not
4858 // compile for T=uint8_t, N=1.
4859 const VFromD<D> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
4860
4861 // Replicate bytes 8x such that each byte contains the bit that governs it.
4862 alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4863 1, 1, 1, 1, 1, 1, 1, 1};
4864 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
4865
4866 alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4867 1, 2, 4, 8, 16, 32, 64, 128};
4868 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4869}
4870
4871template <class D, HWY_IF_T_SIZE_D(D, 2)>
4872HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
4873 const RebindToUnsigned<decltype(d)> du;
4874 alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4875 return RebindMask(
4876 d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
4877}
4878
4879template <class D, HWY_IF_T_SIZE_D(D, 4)>
4880HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
4881 const RebindToUnsigned<decltype(d)> du;
4882 alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4883 return RebindMask(
4884 d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
4885}
4886
4887template <class D, HWY_IF_T_SIZE_D(D, 8)>
4888HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
4889 const RebindToUnsigned<decltype(d)> du;
4890 alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
4891 return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
4892}
4893
4894} // namespace detail
4895
4896// `p` points to at least 8 readable bytes, not all of which need be valid.
4897template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
4898HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
4899 uint64_t mask_bits = 0;
4900 CopyBytes<(MaxLanes(d) + 7) / 8>(bits, &mask_bits);
4901 return detail::LoadMaskBits(d, mask_bits);
4902}
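// Example (illustrative sketch, not part of the original file): bit i of
// bits[0] controls lane i.
//   const Full128<uint32_t> d;
//   const uint8_t bits[8] = {0b0101};  // lanes 0 and 2
//   const auto m = LoadMaskBits(d, bits);
//   // CountTrue(d, m) == 2, FindFirstTrue(d, m) == 0.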
4903
4904// ------------------------------ Dup128MaskFromMaskBits
4905
4906template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
4907HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4908 constexpr size_t kN = MaxLanes(d);
4909 if (kN < 8) mask_bits &= (1u << kN) - 1;
4910 return detail::LoadMaskBits(d, mask_bits);
4911}
4912
4913// ------------------------------ Mask
4914
4915namespace detail {
4916
4917// Full
4918template <typename T>
4919HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4920 const Mask128<T> mask) {
4921 alignas(16) uint64_t lanes[2];
4922 wasm_v128_store(lanes, mask.raw);
4923
4924 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
4925 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
4926 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
4927 return (hi + lo);
4928}
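// Note on kMagic (added, not part of the original file): every mask byte is
// 0x00 or 0xFF, and multiplying by kMagic folds the eight bytes into the top
// byte of the product, one bit per byte. E.g. lanes[0] = 0xFF (only byte 0
// set) gives 0xFF * kMagic == 0x0102040810204080, whose top byte 0x01 is
// bit 0.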
4929
4930// 64-bit
4931template <typename T>
4932HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4933 const Mask128<T, 8> mask) {
4934 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
4935 return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
4936 kMagic) >>
4937 56;
4938}
4939
4940// 32-bit or less: need masking
4941template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
4942HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4943 const Mask128<T, N> mask) {
4944 uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
4945 // Clear potentially undefined bytes.
4946 bytes &= (1ULL << (N * 8)) - 1;
4947 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
4948 return (bytes * kMagic) >> 56;
4949}
4950
4951template <typename T, size_t N>
4952HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4953 const Mask128<T, N> mask) {
4954 // Remove useless lower half of each u16 while preserving the sign bit.
4955 const __i16x8 zero = wasm_i16x8_splat(0);
4956 const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
4957 return BitsFromMask(hwy::SizeTag<1>(), mask8);
4958}
4959
4960template <typename T, size_t N>
4961HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4962 const Mask128<T, N> mask) {
4963 const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
4964 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
4965 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
4966 alignas(16) uint32_t lanes[4];
4967 wasm_v128_store(lanes, sliced_mask);
4968 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
4969}
4970
4971template <typename T, size_t N>
4972HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
4973 const Mask128<T, N> mask) {
4974 const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
4975 const __i64x2 slice = wasm_i64x2_make(1, 2);
4976 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
4977 alignas(16) uint64_t lanes[2];
4978 wasm_v128_store(lanes, sliced_mask);
4979 return lanes[0] | lanes[1];
4980}
4981
4982// Returns the lowest N bits of the BitsFromMask result.
4983template <typename T, size_t N>
4984constexpr uint64_t OnlyActive(uint64_t bits) {
4985 return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
4986}
4987
4988// Returns 0xFF for bytes with index >= N, otherwise 0.
4989template <size_t N>
4990constexpr __i8x16 BytesAbove() {
4991 return
4992 (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
4993 : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
4994 : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
4995 : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
4996 : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
4997 : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
4998 : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
4999 : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
5000 : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
5001 : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
5002 -1, -1, -1, -1, -1)
5003 : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
5004 -1, -1, -1, -1)
5005 : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
5006 -1, -1, -1, -1)
5007 : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
5008 -1, -1, -1)
5009 : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
5010 -1, -1, -1)
5011 : (N == 11)
5012 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
5013 : (N == 13)
5014 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
5015 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
5016}
5017
5018template <typename T, size_t N>
5019HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5020 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5021}
5022
5023template <typename T>
5024HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
5025 return PopCount(BitsFromMask(tag, m));
5026}
5027
5028template <typename T>
5029HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
5030 return PopCount(BitsFromMask(tag, m));
5031}
5032
5033template <typename T>
5034HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
5035 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
5036 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
5037 alignas(16) uint64_t lanes[2];
5038 wasm_v128_store(lanes, shifted_bits);
5039 return PopCount(lanes[0] | lanes[1]);
5040}
5041
5042template <typename T>
5043HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
5044 alignas(16) int64_t lanes[2];
5045 wasm_v128_store(lanes, m.raw);
5046 return static_cast<size_t>(-(lanes[0] + lanes[1]));
5047}
5048
5049} // namespace detail
5050
5051// `p` points to at least 8 writable bytes.
5052template <class D>
5053HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) {
5054 const uint64_t mask_bits = detail::BitsFromMask(mask);
5055 const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
5056 CopyBytes<kNumBytes>(&mask_bits, bits);
5057 return kNumBytes;
5058}
5059
5060template <class D, HWY_IF_V_SIZE_D(D, 16)>
5061HWY_API size_t CountTrue(D /* tag */, const MFromD<D> m) {
5062 return detail::CountTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), m);
5063}
5064
5065// Partial
5066template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
5067HWY_API size_t CountTrue(D d, const MFromD<D> m) {
5068 // Ensure all undefined bytes are 0.
5069 const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()};
5070 const Full128<T> dfull;
5071 return CountTrue(dfull, Mask128<T>{AndNot(mask, m).raw});
5072}
5073
5074// Full vector
5075template <class D, HWY_IF_V_SIZE_D(D, 16)>
5076HWY_API bool AllFalse(D d, const MFromD<D> m) {
5077 const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
5078 return !wasm_v128_any_true(v8.raw);
5079}
5080
5081// Full vector
5082namespace detail {
5083template <typename T>
5084HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
5085 return wasm_i8x16_all_true(m.raw);
5086}
5087template <typename T>
5088HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
5089 return wasm_i16x8_all_true(m.raw);
5090}
5091template <typename T>
5092HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
5093 return wasm_i32x4_all_true(m.raw);
5094}
5095template <typename T>
5096HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
5097 return wasm_i64x2_all_true(m.raw);
5098}
5099
5100} // namespace detail
5101
5102template <class D, typename T = TFromD<D>>
5103HWY_API bool AllTrue(D /* tag */, const Mask128<T> m) {
5104 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
5105}
5106
5107// Partial vectors
5108
5109template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
5110HWY_API bool AllFalse(D d, const MFromD<D> m) {
5111 // Ensure all undefined bytes are 0.
5112 const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()};
5113 return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
5114}
5115
5116template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
5117HWY_API bool AllTrue(D d, const MFromD<D> m) {
5118 // Ensure all undefined bytes are FF.
5119 const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()};
5120 return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
5121}
5122
5123template <class D>
5124HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD<D> mask) {
5125 const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5126 return Num0BitsBelowLS1Bit_Nonzero32(bits);
5127}
5128
5129template <class D>
5130HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD<D> mask) {
5131 const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5132 return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1;
5133}
5134
5135template <class D>
5136HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD<D> mask) {
5137 const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5138 return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits);
5139}
5140
5141template <class D>
5142HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD<D> mask) {
5143 const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5144 return bits
5145 ? (31 - static_cast<intptr_t>(Num0BitsAboveMS1Bit_Nonzero32(bits)))
5146 : -1;
5147}
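A hedged sketch of the Find* semantics (illustrative, not part of this header): FindFirstTrue/FindLastTrue return lane indices as intptr_t, or -1 when no lane is set; the Known variants require at least one set lane.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void FindTrueExample() {
      const hn::Full128<uint32_t> d;             // 4 lanes
      const auto v = hn::Iota(d, 0);             // 0,1,2,3
      const auto m = hn::Gt(v, hn::Set(d, 1u));  // lanes 2 and 3 true
      HWY_ASSERT(hn::FindFirstTrue(d, m) == 2);
      HWY_ASSERT(hn::FindLastTrue(d, m) == 3);
      const auto none = hn::Eq(v, hn::Set(d, 9u));  // no lane matches
      HWY_ASSERT(hn::FindFirstTrue(d, none) == -1);
    }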
5148
5149// ------------------------------ Compress
5150
5151namespace detail {
5152
5153template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
5154HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
5155 HWY_DASSERT(mask_bits < 256);
5156 const Simd<T, N, 0> d;
5157 const Rebind<uint8_t, decltype(d)> d8;
5158 const Simd<uint16_t, N, 0> du;
5159
5160 // We need byte indices for TableLookupBytes (one vector's worth for each of
5161 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
5162 // can instead store lane indices and convert to byte indices (2*lane + 0..1),
5163 // with the doubling baked into the table. Unpacking nibbles is likely more
5164 // costly than the higher cache footprint from storing bytes.
5165 alignas(16) static constexpr uint8_t table[256 * 8] = {
5166 // PrintCompress16x8Tables
5167 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5168 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5169 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
5170 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5171 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
5172 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
5173 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
5174 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5175 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
5176 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
5177 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
5178 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
5179 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
5180 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
5181 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
5182 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5183 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
5184 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
5185 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
5186 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
5187 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
5188 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
5189 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
5190 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
5191 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
5192 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
5193 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
5194 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
5195 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
5196 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
5197 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
5198 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5199 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
5200 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
5201 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
5202 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
5203 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
5204 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
5205 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
5206 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
5207 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
5208 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
5209 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
5210 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
5211 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
5212 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
5213 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
5214 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
5215 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
5216 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
5217 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
5218 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
5219 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
5220 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
5221 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
5222 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
5223 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
5224 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
5225 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
5226 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
5227 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
5228 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
5229 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
5230 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5231 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
5232 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
5233 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
5234 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
5235 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
5236 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
5237 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
5238 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
5239 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
5240 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
5241 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
5242 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
5243 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
5244 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
5245 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
5246 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
5247 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
5248 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
5249 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
5250 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
5251 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
5252 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
5253 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
5254 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
5255 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
5256 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
5257 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
5258 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
5259 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
5260 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
5261 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
5262 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
5263 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
5264 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
5265 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
5266 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
5267 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
5268 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
5269 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
5270 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
5271 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
5272 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
5273 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
5274 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
5275 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
5276 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
5277 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
5278 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
5279 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
5280 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
5281 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
5282 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
5283 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
5284 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
5285 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
5286 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
5287 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
5288 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
5289 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
5290 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
5291 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
5292 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
5293 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
5294 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5295
5296 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
5297 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5298 return BitCast(d, pairs + Set(du, 0x0100));
5299}
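To see why adding 0x0100 above recovers byte pairs: ZipLower duplicates each table byte into both halves of a u16 lane, and adding 0x0100 increments only the upper byte. A scalar model of this expansion (the helper name is hypothetical, for illustration only):

    #include <stdint.h>

    // The table stores 2*lane; duplicating the byte (ZipLower) and adding
    // 0x0100 yields the little-endian byte pair {2*lane, 2*lane+1} that
    // TableLookupBytes consumes.
    inline uint16_t ExpandLaneToBytePair(uint8_t lane) {
      const uint8_t byte_idx = static_cast<uint8_t>(2 * lane);         // table entry
      const uint16_t dup = static_cast<uint16_t>(byte_idx * 0x0101u);  // ZipLower
      return static_cast<uint16_t>(dup + 0x0100u);  // e.g. lane 3 -> 0x0706
    }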
5300
5301template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
5302HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
5303 HWY_DASSERT(mask_bits < 256);
5304 const Simd<T, N, 0> d;
5305 const Rebind<uint8_t, decltype(d)> d8;
5306 const Simd<uint16_t, N, 0> du;
5307
5308 // We need byte indices for TableLookupBytes (one vector's worth for each of
5309 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
5310 // can instead store lane indices and convert to byte indices (2*lane + 0..1),
5311 // with the doubling baked into the table. Unpacking nibbles is likely more
5312 // costly than the higher cache footprint from storing bytes.
5313 alignas(16) static constexpr uint8_t table[256 * 8] = {
5314 // PrintCompressNot16x8Tables
5315 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
5316 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
5317 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
5318 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
5319 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
5320 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
5321 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
5322 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
5323 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
5324 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
5325 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
5326 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
5327 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
5328 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
5329 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
5330 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
5331 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
5332 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
5333 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
5334 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
5335 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
5336 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
5337 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
5338 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
5339 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
5340 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
5341 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
5342 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
5343 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
5344 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
5345 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
5346 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
5347 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
5348 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
5349 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
5350 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
5351 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
5352 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
5353 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
5354 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
5355 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
5356 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
5357 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
5358 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
5359 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
5360 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
5361 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
5362 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
5363 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
5364 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
5365 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
5366 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
5367 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
5368 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
5369 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
5370 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
5371 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
5372 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
5373 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
5374 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
5375 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
5376 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
5377 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
5378 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
5379 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
5380 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
5381 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
5382 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
5383 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
5384 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
5385 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
5386 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
5387 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
5388 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
5389 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
5390 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
5391 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
5392 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
5393 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
5394 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
5395 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
5396 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
5397 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
5398 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
5399 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
5400 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
5401 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
5402 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
5403 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
5404 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
5405 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
5406 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
5407 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
5408 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
5409 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
5410 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
5411 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
5412 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
5413 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
5414 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
5415 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
5416 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
5417 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
5418 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
5419 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
5420 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
5421 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
5422 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
5423 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
5424 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
5425 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
5426 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
5427 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
5428 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
5429 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
5430 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
5431 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
5432 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
5433 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
5434 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
5435 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
5436 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
5437 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
5438 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
5439 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
5440 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
5441 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
5442 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
5443
5444 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
5445 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5446 return BitCast(d, pairs + Set(du, 0x0100));
5447}
5448
5449template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
5450HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
5451 HWY_DASSERT(mask_bits < 16);
5452
5453 // There are only 4 lanes, so we can afford to load the index vector directly.
5454 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
5455 // PrintCompress32x4Tables
5456 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5457 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5458 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
5459 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5460 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
5461 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
5462 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
5463 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5464 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
5465 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
5466 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
5467 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
5468 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
5469 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
5470 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5471 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5472 const Simd<T, N, 0> d;
5473 const Repartition<uint8_t, decltype(d)> d8;
5474 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5475}
5476
5477template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
5478HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
5479 HWY_DASSERT(mask_bits < 16);
5480
5481 // There are only 4 lanes, so we can afford to load the index vector directly.
5482 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
5483 // PrintCompressNot32x4Tables
5484 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5485 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5486 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5487 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5488 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5489 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5490 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5491 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5492 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5493 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5494 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5495 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5496 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5497 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5498 12, 13, 14, 15};
5499 const Simd<T, N, 0> d;
5500 const Repartition<uint8_t, decltype(d)> d8;
5501 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5502}
5503
5504template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
5505HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
5506 HWY_DASSERT(mask_bits < 4);
5507
5508 // There are only 2 lanes, so we can afford to load the index vector directly.
5509 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
5510 // PrintCompress64x2Tables
5511 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5512 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5513 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5514 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5515
5516 const Simd<T, N, 0> d;
5517 const Repartition<uint8_t, decltype(d)> d8;
5518 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5519}
5520
5521template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
5522HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
5523 HWY_DASSERT(mask_bits < 4);
5524
5525 // There are only 2 lanes, so we can afford to load the index vector directly.
5526 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
5527 // PrintCompressNot64x2Tables
5528 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5529 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5530 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5531 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5532
5533 const Simd<T, N, 0> d;
5534 const Repartition<uint8_t, decltype(d)> d8;
5535 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5536}
5537
5538// Helper functions called by both Compress and CompressStore - avoids a
5539// redundant BitsFromMask in the latter.
5540
5541template <typename T, size_t N>
5542HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
5543 const auto idx = detail::IdxFromBits<T, N>(mask_bits);
5544 const DFromV<decltype(v)> d;
5545 const RebindToSigned<decltype(d)> di;
5546 return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5547}
5548
5549template <typename T, size_t N>
5550HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
5551 const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
5552 const DFromV<decltype(v)> d;
5553 const RebindToSigned<decltype(d)> di;
5554 return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
5555}
5556
5557} // namespace detail
5558
5559template <typename T>
5560struct CompressIsPartition {
5561#if HWY_TARGET == HWY_WASM_EMU256
5562 enum { value = 0 };
5563#else
5564 enum { value = (sizeof(T) != 1) };
5565#endif
5566};
5567
5568// Single lane: no-op
5569template <typename T>
5570HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
5571 return v;
5572}
5573
5574// Two lanes: conditional swap
5575template <typename T, HWY_IF_T_SIZE(T, 8)>
5576HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
5577 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
5578 const Full128<T> d;
5579 const Vec128<T> m = VecFromMask(d, mask);
5580 const Vec128<T> maskL = DupEven(m);
5581 const Vec128<T> maskH = DupOdd(m);
5582 const Vec128<T> swap = AndNot(maskL, maskH);
5583 return IfVecThenElse(swap, Shuffle01(v), v);
5584}
5585
5586// General case, 2 or 4 byte lanes
5587template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))>
5588HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5589 return detail::Compress(v, detail::BitsFromMask(mask));
5590}
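A brief usage sketch (illustrative only; assumes "hwy/highway.h"): Compress moves the lanes selected by the mask to the front, in their original order.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void CompressExample() {
      const hn::Full128<int32_t> d;
      const auto v = hn::Dup128VecFromValues(d, 10, 20, 30, 40);
      const auto m = hn::Ne(v, hn::Set(d, 20));  // keep 10, 30, 40
      const auto c = hn::Compress(v, m);
      // The first CountTrue(d, m) lanes are 10, 30, 40; because
      // CompressIsPartition holds here, the remaining lanes are the removed ones.
      HWY_ASSERT(hn::GetLane(c) == 10);
    }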
5591
5592// Single lane: no-op
5593template <typename T>
5594HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
5595 return v;
5596}
5597
5598// Two lanes: conditional swap
5599template <typename T, HWY_IF_T_SIZE(T, 8)>
5600HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
5601 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
5602 const Full128<T> d;
5603 const Vec128<T> m = VecFromMask(d, mask);
5604 const Vec128<T> maskL = DupEven(m);
5605 const Vec128<T> maskH = DupOdd(m);
5606 const Vec128<T> swap = AndNot(maskH, maskL);
5607 return IfVecThenElse(swap, Shuffle01(v), v);
5608}
5609
5610// General case, 2 or 4 byte lanes
5611template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5612HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
5613 // For partial vectors, we cannot pull the Not() into the table because
5614 // BitsFromMask clears the upper bits.
5615 if (N < 16 / sizeof(T)) {
5616 return detail::Compress(v, detail::BitsFromMask(Not(mask)));
5617 }
5618 return detail::CompressNot(v, detail::BitsFromMask(mask));
5619}
5620
5621// ------------------------------ CompressBlocksNot
5622HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
5623 Mask128<uint64_t> /* m */) {
5624 return v;
5625}
5626
5627// ------------------------------ CompressBits
5628template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
5629HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5630 const uint8_t* HWY_RESTRICT bits) {
5631 uint64_t mask_bits = 0;
5632 constexpr size_t kNumBytes = (N + 7) / 8;
5633 CopyBytes<kNumBytes>(bits, &mask_bits);
5634 if (N < 8) {
5635 mask_bits &= (1ull << N) - 1;
5636 }
5637
5638 return detail::Compress(v, mask_bits);
5639}
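As a sketch of the expected bit layout (illustrative only): CompressBits reads one bit per lane from the byte array, LSB-first, matching the format written by StoreMaskBits.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void CompressBitsExample() {
      const hn::Full128<int32_t> d;
      const auto v = hn::Dup128VecFromValues(d, 1, 2, 3, 4);
      const uint8_t bits[1] = {0x05};  // binary 0101: select lanes 0 and 2
      const auto c = hn::CompressBits(v, bits);
      HWY_ASSERT(hn::GetLane(c) == 1);  // lane 1 then holds 3
    }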
5640
5641// ------------------------------ CompressStore
5642template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
5643HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
5644 TFromD<D>* HWY_RESTRICT unaligned) {
5645 const uint64_t mask_bits = detail::BitsFromMask(mask);
5646 const auto c = detail::Compress(v, mask_bits);
5647 StoreU(c, d, unaligned);
5648 return PopCount(mask_bits);
5649}
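A typical stream-filtering loop built on CompressStore, as a hedged sketch (helper name hypothetical; remainder handling omitted). Note that CompressStore stores a whole vector, so the destination needs slack beyond the `count` valid lanes per iteration.

    namespace hn = hwy::HWY_NAMESPACE;

    // Copies strictly positive elements of `in` to `out`; returns their count.
    inline size_t KeepPositives(const int32_t* in, size_t n, int32_t* out) {
      const hn::Full128<int32_t> d;
      size_t num_out = 0;
      for (size_t i = 0; i + hn::Lanes(d) <= n; i += hn::Lanes(d)) {
        const auto v = hn::LoadU(d, in + i);
        num_out += hn::CompressStore(v, hn::Gt(v, hn::Zero(d)), d, out + num_out);
      }
      return num_out;  // a scalar remainder loop would follow here
    }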
5650
5651// ------------------------------ CompressBlendedStore
5652template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
5653HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
5654 TFromD<D>* HWY_RESTRICT unaligned) {
5655 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
5656 const uint64_t mask_bits = detail::BitsFromMask(m);
5657 const size_t count = PopCount(mask_bits);
5658 const VFromD<decltype(du)> compressed =
5659 detail::Compress(BitCast(du, v), mask_bits);
5660 const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
5661 BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
5662 return count;
5663}
5664
5665// ------------------------------ CompressBitsStore
5666
5667template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
5668HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
5669 D d, TFromD<D>* HWY_RESTRICT unaligned) {
5670 uint64_t mask_bits = 0;
5671 constexpr size_t kN = MaxLanes(d);
5672 CopyBytes<(kN + 7) / 8>(bits, &mask_bits);
5673 if (kN < 8) {
5674 mask_bits &= (1ull << kN) - 1;
5675 }
5676
5677 const auto c = detail::Compress(v, mask_bits);
5678 StoreU(c, d, unaligned);
5679 return PopCount(mask_bits);
5680}
5681
5682// ------------------------------ StoreInterleaved2/3/4
5683
5684// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
5685// generic_ops-inl.h.
5686
5687// ------------------------------ Additional mask logical operations
5688template <class T>
5689HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
5690 return mask;
5691}
5692template <class T>
5693HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
5694 const FixedTag<T, 2> d;
5695 const auto vmask = VecFromMask(d, mask);
5696 return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
5697}
5698template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
5699HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
5700 const Simd<T, N, 0> d;
5701 const auto vmask = VecFromMask(d, mask);
5702 const auto neg_vmask =
5703 ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
5704 return MaskFromVec(Or(vmask, neg_vmask));
5705}
5706template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
5707HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
5708 const Full128<T> d;
5709 const Repartition<int64_t, decltype(d)> di64;
5710
5711 auto vmask = BitCast(di64, VecFromMask(d, mask));
5712 vmask = Or(vmask, Neg(vmask));
5713
5714 // Copy the sign bit of the first int64_t lane to the second int64_t lane
5715 const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask));
5716 return MaskFromVec(BitCast(d, Or(vmask, vmask2)));
5717}
5718
5719template <class T, size_t N>
5720HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
5721 return Not(SetAtOrAfterFirst(mask));
5722}
5723
5724template <class T>
5725HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
5726 return mask;
5727}
5728template <class T>
5729HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
5730 const FixedTag<T, 2> d;
5731 const RebindToSigned<decltype(d)> di;
5732
5733 const auto vmask = BitCast(di, VecFromMask(d, mask));
5734 const auto zero = Zero(di);
5735 const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
5736 return MaskFromVec(BitCast(d, And(vmask, vmask2)));
5737}
5738template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
5739HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
5740 const Simd<T, N, 0> d;
5741 const RebindToSigned<decltype(d)> di;
5742
5743 const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
5744 const auto only_first_vmask =
5745 BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
5746 return MaskFromVec(only_first_vmask);
5747}
5748template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
5749HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
5750 const Full128<T> d;
5751 const RebindToSigned<decltype(d)> di;
5752 const Repartition<int64_t, decltype(d)> di64;
5753
5754 const auto zero = Zero(di64);
5755 const auto vmask = BitCast(di64, VecFromMask(d, mask));
5756 const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
5757 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
5758 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
5759}
5760
5761template <class T>
5762HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
5763 const FixedTag<T, 1> d;
5764 const RebindToSigned<decltype(d)> di;
5765 using TI = MakeSigned<T>;
5766
5767 return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
5768}
5769template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
5770HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
5771 const Simd<T, N, 0> d;
5772 return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
5773}
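A worked example of these mask-logical operations (illustrative only; lane 0 listed first). For mask lanes {0,0,1,1}: SetAtOrAfterFirst gives {0,0,1,1}, SetBeforeFirst gives {1,1,0,0}, SetOnlyFirst gives {0,0,1,0}, and SetAtOrBeforeFirst gives {1,1,1,0}.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void FirstTrueMasksExample() {
      const hn::Full128<int32_t> d;
      const auto m = hn::Gt(hn::Iota(d, 0), hn::Set(d, 1));  // lanes 2,3 true
      HWY_ASSERT(hn::CountTrue(d, hn::SetOnlyFirst(m)) == 1);
      HWY_ASSERT(hn::FindFirstTrue(d, hn::SetOnlyFirst(m)) == 2);
      HWY_ASSERT(hn::CountTrue(d, hn::SetAtOrBeforeFirst(m)) == 3);
    }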
5774
5775// ------------------------------ MulEven/Odd (Load)
5776
5777template <class T, HWY_IF_UI64(T)>
5778HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
5779 alignas(16) T mul[2];
5780 mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
5781 static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
5782 return Load(Full128<T>(), mul);
5783}
5784
5785template <class T, HWY_IF_UI64(T)>
5786HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
5787 alignas(16) T mul[2];
5788 mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
5789 static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
5790 return Load(Full128<T>(), mul);
5791}
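A short sketch of the u64 semantics (illustrative only): MulEven returns the full 128-bit product of the even (index 0) lanes, with the low half in lane 0 and the high half in lane 1; MulOdd does the same for the odd lanes.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void MulEven64Example() {
      const hn::Full128<uint64_t> d;
      const auto a = hn::Set(d, 1ULL << 32);
      const auto p = hn::MulEven(a, a);        // (2^32)^2 = 2^64
      HWY_ASSERT(hn::GetLane(p) == 0);         // low 64 bits
      HWY_ASSERT(hn::ExtractLane(p, 1) == 1);  // high 64 bits
    }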
5792
5793// ------------------------------ I64/U64 MulHigh (GetLane)
5794template <class T, HWY_IF_UI64(T)>
5795HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
5796 T hi;
5797 Mul128(GetLane(a), GetLane(b), &hi);
5798 return Set(Full64<T>(), hi);
5799}
5800
5801template <class T, HWY_IF_UI64(T)>
5802HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
5803 T hi_0;
5804 T hi_1;
5805 Mul128(GetLane(a), GetLane(b), &hi_0);
5806 Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
5807 return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
5808}
5809
5810// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5811
5812// Generic for all vector lengths.
5813template <class D32, HWY_IF_F32_D(D32),
5814 class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5815 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
5816 const Rebind<uint32_t, decltype(df32)> du32;
5817 using VU32 = VFromD<decltype(du32)>;
5818 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5819 // Using shift/and instead of Zip leads to the odd/even order that
5820 // RearrangeToOddPlusEven prefers.
5821 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5822 const VU32 ao = And(BitCast(du32, a), odd);
5823 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5824 const VU32 bo = And(BitCast(du32, b), odd);
5825 return Mul(BitCast(df32, ae), BitCast(df32, be)) +
5826 Mul(BitCast(df32, ao), BitCast(df32, bo));
5827}
5828
5829template <class D32, HWY_IF_F32_D(D32),
5830 class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5831 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
5832 const VFromD<D32> sum0,
5833 VFromD<D32>& sum1) {
5834 const Rebind<uint32_t, decltype(df32)> du32;
5835 using VU32 = VFromD<decltype(du32)>;
5836 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5837 // Using shift/and instead of Zip leads to the odd/even order that
5838 // RearrangeToOddPlusEven prefers.
5839 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5840 const VU32 ao = And(BitCast(du32, a), odd);
5841 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5842 const VU32 bo = And(BitCast(du32, b), odd);
5843 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5844 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5845}
5846
5847// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5848// safe.
5849template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
5850 class V16 = VFromD<RepartitionToNarrow<D32>>>
5851 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
5852 return VFromD<D32>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
5853}
5854
5855template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
5856 class VU16 = VFromD<RepartitionToNarrow<DU32>>>
5857 HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
5858 const auto lo16_mask = Set(du32, 0x0000FFFFu);
5859
5860 const auto a0 = And(BitCast(du32, a), lo16_mask);
5861 const auto b0 = And(BitCast(du32, b), lo16_mask);
5862
5863 const auto a1 = ShiftRight<16>(BitCast(du32, a));
5864 const auto b1 = ShiftRight<16>(BitCast(du32, b));
5865
5866 return MulAdd(a1, b1, a0 * b0);
5867}
5868
5869// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5870// safe.
5871template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
5872 class V16 = VFromD<RepartitionToNarrow<D32>>>
5873 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b,
5874 const VFromD<D32> sum0,
5875 VFromD<D32>& /*sum1*/) {
5876 return sum0 + WidenMulPairwiseAdd(d, a, b);
5877}
5878
5879// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5880// safe.
5881template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
5882 class VU16 = VFromD<RepartitionToNarrow<DU32>>>
5883 HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
5884 const VFromD<DU32> sum0,
5885 VFromD<DU32>& /*sum1*/) {
5886 return sum0 + WidenMulPairwiseAdd(d, a, b);
5887}
5888
5889// ------------------------------ RearrangeToOddPlusEven
5890template <size_t N>
5891HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(
5892 const Vec128<int32_t, N> sum0, const Vec128<int32_t, N> /*sum1*/) {
5893 return sum0; // invariant already holds
5894}
5895
5896template <size_t N>
5897HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
5898 const Vec128<uint32_t, N> sum0, const Vec128<uint32_t, N> /*sum1*/) {
5899 return sum0; // invariant already holds
5900}
5901
5902template <size_t N>
5903HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0,
5904 const Vec128<float, N> sum1) {
5905 return Add(sum0, sum1);
5906}
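A sketch of the intended pairing (illustrative, not part of this header): accumulate with ReorderWidenMulAccumulate in whatever odd/even order is cheapest, then combine sum0 and sum1 with RearrangeToOddPlusEven before reducing.

    namespace hn = hwy::HWY_NAMESPACE;

    // Hypothetical bf16 dot-product step; reduce the returned f32 lanes afterwards.
    inline hn::VFromD<hn::Full128<float>> DotBF16Step(
        hn::VFromD<hn::Repartition<hwy::bfloat16_t, hn::Full128<float>>> a,
        hn::VFromD<hn::Repartition<hwy::bfloat16_t, hn::Full128<float>>> b) {
      const hn::Full128<float> df32;
      auto sum1 = hn::Zero(df32);
      const auto sum0 =
          hn::ReorderWidenMulAccumulate(df32, a, b, hn::Zero(df32), sum1);
      return hn::RearrangeToOddPlusEven(sum0, sum1);
    }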
5907
5908// ------------------------------ Reductions
5909
5910// Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.
5911
5912// ------------------------------ Lt128
5913
5914template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
5915 HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
5916 // Truth table of Eq and Lt for Hi and Lo u64.
5917 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
5918 // =H =L cH cL | out = cH | (=H & cL)
5919 // 0 0 0 0 | 0
5920 // 0 0 0 1 | 0
5921 // 0 0 1 0 | 1
5922 // 0 0 1 1 | 1
5923 // 0 1 0 0 | 0
5924 // 0 1 0 1 | 0
5925 // 0 1 1 0 | 1
5926 // 1 0 0 0 | 0
5927 // 1 0 0 1 | 1
5928 // 1 1 0 0 | 0
5929 const MFromD<D> eqHL = Eq(a, b);
5930 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
5931 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
5932 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
5933 // comparison result leftwards requires only 4. IfThenElse compiles to the
5934 // same code as OrAnd().
5935 const VFromD<D> ltLx = DupEven(ltHL);
5936 const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL);
5937 return MaskFromVec(DupOdd(outHx));
5938}
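For clarity, a usage sketch (illustrative only): Lt128 treats each pair of u64 lanes as one 128-bit number (lane 0 = low half, lane 1 = high half), and the result mask is replicated to both lanes of the pair, so it can drive IfThenElse directly.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void Lt128Example() {
      const hn::Full128<uint64_t> d;
      const auto a = hn::Dup128VecFromValues(d, ~0ULL, 0ULL);  // 2^64 - 1
      const auto b = hn::Dup128VecFromValues(d, 0ULL, 1ULL);   // 2^64
      HWY_ASSERT(hn::AllTrue(d, hn::Lt128(d, a, b)));  // high word decides
    }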
5939
5940template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5941HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
5942 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
5943 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
5944}
5945
5946// ------------------------------ Eq128
5947
5948template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
5949 HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
5950 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
5951 return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
5952}
5953
5954template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5955HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
5956 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
5957 return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
5958}
5959
5960// ------------------------------ Ne128
5961
5962template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
5963 HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
5964 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
5965 return MaskFromVec(Or(Reverse2(d, neHL), neHL));
5966}
5967
5968template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5969HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
5970 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
5971 return MaskFromVec(InterleaveUpper(d, neHL, neHL));
5972}
5973
5974// ------------------------------ Min128, Max128 (Lt128)
5975
5976// Without a native OddEven, it seems infeasible to go faster than Lt128.
5977template <class D>
5978HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
5979 return IfThenElse(Lt128(d, a, b), a, b);
5980}
5981
5982template <class D>
5983HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
5984 return IfThenElse(Lt128(d, b, a), a, b);
5985}
5986
5987template <class D>
5988HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
5989 return IfThenElse(Lt128Upper(d, a, b), a, b);
5990}
5991
5992template <class D>
5993HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
5994 return IfThenElse(Lt128Upper(d, b, a), a, b);
5995}
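A final sketch (illustrative only): Min128/Max128 select whole 128-bit operands, which is useful for sorting networks over 128-bit keys, or (with the Upper variants) for key-value pairs where lane 1 holds the key.

    namespace hn = hwy::HWY_NAMESPACE;

    inline void Min128Example() {
      const hn::Full128<uint64_t> d;
      const auto a = hn::Dup128VecFromValues(d, 1ULL, 5ULL);  // 5 * 2^64 + 1
      const auto b = hn::Dup128VecFromValues(d, 9ULL, 4ULL);  // 4 * 2^64 + 9
      const auto mn = hn::Min128(d, a, b);  // selects b as a whole
      HWY_ASSERT(hn::GetLane(mn) == 9);
    }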
5996
5997// NOLINTNEXTLINE(google-readability-namespace-comments)
5998} // namespace HWY_NAMESPACE
5999} // namespace hwy
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
#define HWY_IF_SIGNED(T)
Definition base.h:622
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:109
#define HWY_API
Definition base.h:171
#define HWY_IF_T_SIZE(T, bytes)
Definition base.h:639
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:110
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
Definition base.h:645
#define HWY_ASSERT(condition)
Definition base.h:237
#define HWY_IF_SPECIAL_FLOAT(T)
Definition base.h:629
#define HWY_IF_LANES_LE(kN, lanes)
Definition base.h:617
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
Definition base.h:635
#define HWY_IF_NOT_SPECIAL_FLOAT(T)
Definition base.h:631
#define HWY_IF_UNSIGNED(T)
Definition base.h:620
Definition arm_neon-inl.h:865
T PrivateT
Definition arm_neon-inl.h:870
detail::Raw128< T >::type raw
Definition wasm_128-inl.h:126
Raw raw
Definition arm_neon-inl.h:878
Definition arm_neon-inl.h:813
HWY_INLINE Vec128 & operator%=(const Vec128 other)
Definition wasm_128-inl.h:95
T PrivateT
Definition arm_neon-inl.h:816
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition wasm_128-inl.h:86
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:815
Raw raw
Definition arm_neon-inl.h:851
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition wasm_128-inl.h:92
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition wasm_128-inl.h:104
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition wasm_128-inl.h:101
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition wasm_128-inl.h:83
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition wasm_128-inl.h:98
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition wasm_128-inl.h:89
HWY_API Vec32< T > ShuffleTwo1230(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:927
HWY_API V InsertLane(const V v, TFromD< D > t)
Definition arm_neon-inl.h:1793
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition wasm_128-inl.h:2433
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition x86_128-inl.h:839
HWY_INLINE V SlideUpLanes(V v, size_t amt)
Definition arm_neon-inl.h:6201
constexpr __i8x16 BytesAbove()
Definition wasm_128-inl.h:4990
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition wasm_128-inl.h:5084
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, uint64_t mask_bits)
Definition arm_neon-inl.h:8448
HWY_INLINE VFromD< D > ReorderDemote2From64To32Combine(D dn, V a, V b)
Definition wasm_128-inl.h:4515
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1445
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition wasm_128-inl.h:1936
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:3803
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, Mask128< T > mask)
Definition arm_neon-inl.h:8141
HWY_INLINE Vec128< float16_t, N > ConcatEven(Vec128< float16_t, N > hi, Vec128< float16_t, N > lo)
Definition arm_neon-inl.h:7002
HWY_INLINE MFromD< D > LoadMaskBits(D d, uint64_t mask_bits)
Definition arm_neon-inl.h:8051
HWY_INLINE VFromD< Repartition< uint8_t, D > > IndicesFromVecBroadcastLaneBytes(D d)
Definition arm_neon-inl.h:5661
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:164
HWY_INLINE VFromD< Rebind< double, DFromV< VW > > > U64ToF64VecFast(VW w)
Definition wasm_128-inl.h:4656
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1519
HWY_INLINE VFromD< Repartition< uint8_t, D > > IndicesFromVecByteOffsets(D d)
Definition arm_neon-inl.h:5695
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:1402
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:1269
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, Mask128< T > mask)
Definition arm_neon-inl.h:8296
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, uint64_t mask_bits)
Definition arm_neon-inl.h:8851
HWY_API Vec128< T > InterleaveUpper(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6086
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:8276
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, uint64_t mask_bits)
Definition arm_neon-inl.h:8860
HWY_INLINE VFromD< Rebind< uint64_t, D > > DemoteFromU64MaskOutResult(D, VFromD< Rebind< uint64_t, D > > v)
Definition wasm_128-inl.h:4487
HWY_INLINE VFromD< D > BitCastFromByte(D, VFromD< D > v)
Definition arm_neon-inl.h:1441
HWY_INLINE VFromD< Rebind< uint64_t, D > > DemoteFromU64Saturate(D dn, VFromD< Rebind< uint64_t, D > > v)
Definition wasm_128-inl.h:4501
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE V SlideDownLanes(V v, size_t amt)
Definition arm_neon-inl.h:6346
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:2478
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, uint64_t mask_bits)
Definition arm_neon-inl.h:8600
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API V ZeroIfNegative(V v)
Definition generic_ops-inl.h:266
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:858
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
Vec128< T, 2/sizeof(T)> Vec16
Definition arm_neon-inl.h:861
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:855
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API V Per4LaneBlockShuffle(V v)
Definition generic_ops-inl.h:6904
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
Simd< typename M::PrivateT, M::kPrivateN, 0 > DFromM
Definition arm_neon-inl.h:888
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
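PromoteTo widens each lane, zero-extending unsigned inputs. A sketch (PromoteDemo is hypothetical):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void PromoteDemo() {
  const hn::Full128<uint16_t> d16;               // 8 x u16
  const hn::Rebind<uint8_t, decltype(d16)> d8;   // 8 x u8 (half vector)
  const auto bytes = hn::Set(d8, 200);
  const auto words = hn::PromoteTo(d16, bytes);  // 200 zero-extended per lane
  (void)words;
}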
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
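Set broadcasts a scalar to all lanes; GetLane reads lane 0 back. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR float SetDemo() {
  const hn::Full128<float> d;
  const auto v = hn::Set(d, 1.5f);  // 1.5 in all 4 lanes
  return hn::GetLane(v);            // 1.5
}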
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
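NearestInt converts float lanes to int32 with round-to-nearest. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void NearestIntDemo() {
  const hn::Full128<float> df;
  const auto v = hn::Set(df, 2.7f);
  const auto i = hn::NearestInt(v);  // 3 in every int32 lane
  (void)i;
}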
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
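Floor, Ceil and Trunc round per lane toward minus infinity, plus infinity and zero respectively; a negative input shows the difference. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void RoundingDemo() {
  const hn::Full128<float> d;
  const auto v = hn::Set(d, -1.5f);
  const auto f = hn::Floor(v);  // -2
  const auto c = hn::Ceil(v);   // -1
  const auto t = hn::Trunc(v);  // -1 (toward zero)
  (void)f; (void)c; (void)t;
}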
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
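SumsOf8 adds each aligned group of eight u8 lanes into one u64 lane. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void SumsOf8Demo() {
  const hn::Full128<uint8_t> d8;
  const auto v = hn::Set(d8, 1);                     // sixteen 1s
  const hn::Vec128<uint64_t> sums = hn::SumsOf8(v);  // 8 in each u64 lane
  (void)sums;
}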
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
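CopySign keeps the magnitude of the first argument and the sign of the second. Sketch:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void CopySignDemo() {
  const hn::Full128<float> d;
  const auto magn = hn::Set(d, 3.0f);
  const auto sign = hn::Set(d, -0.0f);
  const auto r = hn::CopySign(magn, sign);  // -3.0 in every lane
  (void)r;
}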
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
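Mul128 is a scalar helper: it returns the low 64 bits of a 64x64-bit product and writes the high 64 bits through the pointer. A sketch needing only hwy/base.h (Mul128Demo is hypothetical):

#include "hwy/base.h"

void Mul128Demo() {
  uint64_t upper = 0;
  const uint64_t lower = hwy::Mul128(~uint64_t{0}, 2, &upper);
  // (2^64 - 1) * 2 = 2^65 - 2: lower == 0xFFFFFFFFFFFFFFFE, upper == 1.
  (void)lower;
}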
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment)
Definition base.h:2676
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2577
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2540
HWY_API size_t PopCount(T x)
Definition base.h:2615
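These base.h bit utilities are plain scalar functions, usable without any SIMD target. A sketch exercising a few (BitUtilDemo is hypothetical):

#include "hwy/base.h"

void BitUtilDemo() {
  const size_t ones = hwy::PopCount(0xF0u);                  // 4 set bits
  constexpr size_t log2 = hwy::FloorLog2(64);                // 6
  const size_t tz = hwy::Num0BitsBelowLS1Bit_Nonzero32(8u);  // 3 trailing zeros
  (void)ones; (void)log2; (void)tz;
}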
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue()
Definition base.h:2212
#define HWY_IF_U32_D(D)
Definition ops/shared-inl.h:579
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_I64_D(D)
Definition ops/shared-inl.h:585
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_UI64_D(D)
Definition ops/shared-inl.h:592
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_I32_D(D)
Definition ops/shared-inl.h:584
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_ALIGN
Definition set_macros-inl.h:167
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
__v128_u raw
Definition wasm_128-inl.h:2815
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
HWY_INLINE __f64x2 operator()(__v128_u v)
Definition wasm_128-inl.h:188
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition wasm_128-inl.h:184
HWY_INLINE __v128_u operator()(__v128_u v)
Definition wasm_128-inl.h:180
__f64x2 type
Definition wasm_128-inl.h:68
__f32x4 type
Definition wasm_128-inl.h:64
__v128_u type
Definition wasm_128-inl.h:60
int VFromD
Definition tuple-inl.h:25
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()