#include <wasm_simd128.h>

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_i64x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif  // HWY_WASM_OLD_NAMES

#if HWY_TARGET == HWY_WASM_EMU256
template <typename T, size_t N = 16 / sizeof(T)>
  static constexpr size_t kPrivateN = N;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
template <typename T, size_t N = 16 / sizeof(T)>
  static constexpr size_t kPrivateN = N;
template <class V>
using TFromV = typename V::PrivateT;
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  return static_cast<__v128_u>(v);
  return static_cast<__v128_u>(v);
template <typename T, size_t N>
template <class D, typename FromT>
    Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
  return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
  return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
  return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
  return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)>
  return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
template <class D, typename T = TFromD<D>, typename T2>
  for (size_t i = 0; i < MaxLanes(d); ++i) {
  return Load(d, lanes);
template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                    TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                    TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                    TFromD<D> t11, TFromD<D> t12,
                                    TFromD<D> t13, TFromD<D> t14,
  return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
                                   t11, t12, t13, t14, t15)};
template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                    TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                    TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                    TFromD<D> t11, TFromD<D> t12,
                                    TFromD<D> t13, TFromD<D> t14,
  return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
                                   t11, t12, t13, t14, t15)};
template <class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                    TFromD<D> t5, TFromD<D> t6,
  return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                    TFromD<D> t5, TFromD<D> t6,
  return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                    TFromD<D> t5, TFromD<D> t6,
      di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
      BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
      BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
      BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)};
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)};
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
                                    TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)};
template <class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
  return VFromD<D>{wasm_i64x2_make(t0, t1)};
template <class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
  return VFromD<D>{wasm_u64x2_make(t0, t1)};
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
  return VFromD<D>{wasm_f64x2_make(t0, t1)};
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, size_t N>
template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  const DFromV<decltype(v)> d8;
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
template <int kBits, size_t N>
  const DFromV<decltype(v)> d8;
  return shifted & Set(d8, 0xFF >> kBits);
template <int kBits, size_t N>
  const DFromV<decltype(v)> di;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
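// Why this works (explanatory sketch, not in the original source): WASM has no
// 8-bit arithmetic shift, so the signed result is rebuilt from a logical one.
// After a logical right shift by kBits, the shifted-down sign bit sits at
// position (7 - kBits); XOR-ing with and then subtracting 0x80 >> kBits
// sign-extends it. Scalar equivalent for a single int8_t lane:
//   uint8_t u = static_cast<uint8_t>(x) >> kBits;  // logical shift
//   uint8_t m = 0x80u >> kBits;                    // shifted sign bit
//   int8_t r = static_cast<int8_t>((u ^ m) - m);   // == x >> kBits (arithmetic)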
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  const DFromV<decltype(v)> d8;
  const Vec128<T, N> shifted{
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
  const DFromV<decltype(v)> d8;
  return shifted & Set(d8, 0xFF >> bits);
  const DFromV<decltype(v)> di;
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  const uint64_t a0 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  const uint64_t a0 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
  const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
  const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
                                               17, 19, 21, 23, 25, 27, 29, 31)};
  const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
  const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
  return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
                                              17, 19, 21, 23, 25, 27, 29, 31)};
  const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw);
  const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw);
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
  const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw);
  const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw);
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
  const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
  const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
  const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
  const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
                                             Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)};
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  constexpr int kSrcBits = sizeof(T) * 8;
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, a)));
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, b)));
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const auto kEvenMask = Set(dw, LimitsMax<T>());
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  const auto ae = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw;
  const auto be = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw;
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
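// Usage sketch (illustrative comment, not part of the original header):
// MulEven multiplies the even-indexed lanes and returns the widened products:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint32_t> d32;
//   const auto a = hn::Iota(d32, 1);   // {1, 2, 3, 4}
//   const auto b = hn::Iota(d32, 5);   // {5, 6, 7, 8}
//   const auto p = hn::MulEven(a, b);  // two u64 lanes: {1*5, 3*7} = {5, 21}
// The odd lanes do not affect the result; MulOdd below uses them instead.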
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  constexpr int kSrcBits = sizeof(T) * 8;
  const auto ao = ShiftRight<kSrcBits>(BitCast(dw, a));
  const auto bo = ShiftRight<kSrcBits>(BitCast(dw, b));
template <class T, size_t N, HWY_IF_UI32(T)>
  const auto ao = ShiftRight<32>(BitCast(dw, a));
  const auto bo = ShiftRight<32>(BitCast(dw, b));
template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
template <typename T, size_t N>
  return Set(DFromV<decltype(v)>(), T{1.0}) / v;
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) {
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
  return mul * x + add;
template <typename T, size_t N, HWY_IF_FLOAT(T)>
  return add - mul * x;
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
  return mul * x - sub;
template <typename T, size_t N, HWY_IF_FLOAT(T)>
  return Neg(mul) * x - sub;
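// Semantics recap (illustrative comment, not part of the original header): on
// this target the fused-multiply ops are expressed with plain * and +/-, so
// they are not actually fused. Per lane:
//   MulAdd(m, x, a)    == m * x + a
//   NegMulAdd(m, x, a) == a - m * x
//   MulSub(m, x, s)    == m * x - s
//   NegMulSub(m, x, s) == -(m * x) - s
// For example, with float lanes m=2, x=3, a=1, MulAdd yields 7 in every lane.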
template <typename T, size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_nearest(v.raw)};
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_FLOAT(T)>
template <typename T, size_t N, HWY_IF_FLOAT(T)>
  const VFromD<decltype(di)> exp =
template <typename T, size_t N>
  return Mask128<T, N>{v.raw};
template <typename TFrom, size_t NFrom, class DTo>
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
template <typename T, size_t N>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ne(a.raw, b.raw)};
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  const auto m_gt = a32 > b32;
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
  const auto gt = Or(lo_gt, m_gt);
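// How the 64-bit compare is assembled (explanatory sketch, not in the original
// source): each 64-bit lane is treated as {lo32, hi32}. a > b iff the upper
// halves compare greater, or the upper halves are equal and the lower halves
// compare greater (the low halves are always compared as unsigned). Scalar
// equivalent for one unsigned lane:
//   bool Gt64(uint64_t a, uint64_t b) {
//     const uint32_t a_hi = static_cast<uint32_t>(a >> 32);
//     const uint32_t b_hi = static_cast<uint32_t>(b >> 32);
//     const uint32_t a_lo = static_cast<uint32_t>(a);
//     const uint32_t b_lo = static_cast<uint32_t>(b);
//     return (a_hi > b_hi) || (a_hi == b_hi && a_lo > b_lo);
//   }
// The shuffle above moves the low-half comparison into the high half so the
// final Or/And produce a full 64-bit mask per lane.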
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
template <typename T, size_t N>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  using TI = TFromD<decltype(di)>;
template <typename T, size_t N>
  return Vec128<T, N>{wasm_v128_not(v.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
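// Note on argument order (illustrative comment, not part of the original
// header): Highway's AndNot(not_mask, mask) computes (~not_mask) & mask,
// whereas the underlying wasm_v128_andnot(a, b) computes a & ~b, hence the
// swapped arguments above. For example:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint32_t> d;
//   const auto keep = hn::Set(d, 0x0Fu);
//   const auto bits = hn::Set(d, 0xFFu);
//   const auto r = hn::AndNot(keep, bits);  // ~0x0F & 0xFF = 0xF0 per lane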
template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
template <typename T, size_t N>
                                 const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
template <typename T, size_t N>
                                      const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
template <typename T, size_t N>
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
  static_assert(IsSigned<T>(), "Only works for signed/float");
template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  test = ShiftLeft<5>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<12>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
template <typename T, size_t N, HWY_IF_UI32(T)>
  test = ShiftLeft<27>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
template <typename T, size_t N, HWY_IF_UI64(T)>
  using TU = MakeUnsigned<T>;
  alignas(16) TU lanes[2] = {};
  alignas(16) TU bits_lanes[2] = {};
  lanes[0] <<= (bits_lanes[0] & 63);
  lanes[1] <<= (bits_lanes[1] & 63);
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  test = ShiftLeft<5>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  return IfThenElse(mask, ShiftRight<1>(v), v);
  test = ShiftLeft<12>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  return IfThenElse(mask, ShiftRight<1>(v), v);
template <typename T, size_t N, HWY_IF_UI32(T)>
  test = ShiftLeft<27>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  test = ShiftLeft<1>(test);
  return IfThenElse(mask, ShiftRight<1>(v), v);
template <typename T, size_t N, HWY_IF_UI64(T)>
  alignas(16) T lanes[2] = {};
  alignas(16) T bits_lanes[2] = {};
  Store(bits, d, bits_lanes);
  lanes[0] >>= (bits_lanes[0] & 63);
  lanes[1] >>= (bits_lanes[1] & 63);
  return Load(d, lanes);
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  return Vec128<T>{wasm_v128_load(aligned)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
template <class D, typename T = TFromD<D>>
template <class D, typename T = TFromD<D>>
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
  const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane);
  return static_cast<T>(lane);
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
  const uint16_t bits = ExtractLane<kLane>(BitCast(du, v));
  return BitCastScalar<T>(bits);
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
template <size_t kLane, size_t N>
  return wasm_f32x4_extract_lane(v.raw, kLane);
template <size_t kLane, size_t N>
  return wasm_f64x2_extract_lane(v.raw, kLane);
template <class D, HWY_IF_V_SIZE_D(D, 16)>
  wasm_v128_store(aligned, v.raw);
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
template <class D, HWY_IF_LANES_D(D, 1)>
  *p = detail::ExtractLane<0>(v);
  wasm_v128_store(aligned, v.raw);
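// Usage sketch (illustrative comment, not part of the original header): a
// typical aligned load/store round trip through these overloads:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;             // 4 float lanes
//   alignas(16) float in[4] = {1.f, 2.f, 3.f, 4.f};
//   alignas(16) float out[4];
//   const auto v = hn::Load(d, in);         // wasm_v128_load
//   hn::Store(v * v, d, out);               // out = {1, 4, 9, 16}
// Unaligned access is available via LoadU/StoreU; WASM v128 loads and stores
// have no alignment requirement, so both map to the same instructions here.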
template <typename T>
  return detail::ExtractLane<0>(v);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
  alignas(16) T lanes[2];
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
        return detail::ExtractLane<2>(v);
        return detail::ExtractLane<3>(v);
  alignas(16) T lanes[4];
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
        return detail::ExtractLane<2>(v);
        return detail::ExtractLane<3>(v);
        return detail::ExtractLane<4>(v);
        return detail::ExtractLane<5>(v);
        return detail::ExtractLane<6>(v);
        return detail::ExtractLane<7>(v);
  alignas(16) T lanes[8];
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
        return detail::ExtractLane<2>(v);
        return detail::ExtractLane<3>(v);
        return detail::ExtractLane<4>(v);
        return detail::ExtractLane<5>(v);
        return detail::ExtractLane<6>(v);
        return detail::ExtractLane<7>(v);
        return detail::ExtractLane<8>(v);
        return detail::ExtractLane<9>(v);
        return detail::ExtractLane<10>(v);
        return detail::ExtractLane<11>(v);
        return detail::ExtractLane<12>(v);
        return detail::ExtractLane<13>(v);
        return detail::ExtractLane<14>(v);
        return detail::ExtractLane<15>(v);
  alignas(16) T lanes[16];
template <typename T, size_t N>
  return detail::ExtractLane<0>(v);
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  static_assert(kLane < N, "Lane index out of bounds");
      wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  static_assert(kLane < N, "Lane index out of bounds");
      wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar<int16_t>(t))};
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
template <size_t kLane, size_t N>
  static_assert(kLane < N, "Lane index out of bounds");
template <size_t kLane, size_t N>
  static_assert(kLane < 2, "Lane index out of bounds");
template <typename T>
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
  alignas(16) T lanes[2];
  return Load(d, lanes);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
        return detail::InsertLane<2>(v, t);
        return detail::InsertLane<3>(v, t);
  alignas(16) T lanes[4];
  return Load(d, lanes);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
        return detail::InsertLane<2>(v, t);
        return detail::InsertLane<3>(v, t);
        return detail::InsertLane<4>(v, t);
        return detail::InsertLane<5>(v, t);
        return detail::InsertLane<6>(v, t);
        return detail::InsertLane<7>(v, t);
  alignas(16) T lanes[8];
  return Load(d, lanes);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
        return detail::InsertLane<2>(v, t);
        return detail::InsertLane<3>(v, t);
        return detail::InsertLane<4>(v, t);
        return detail::InsertLane<5>(v, t);
        return detail::InsertLane<6>(v, t);
        return detail::InsertLane<7>(v, t);
        return detail::InsertLane<8>(v, t);
        return detail::InsertLane<9>(v, t);
        return detail::InsertLane<10>(v, t);
        return detail::InsertLane<11>(v, t);
        return detail::InsertLane<12>(v, t);
        return detail::InsertLane<13>(v, t);
        return detail::InsertLane<14>(v, t);
        return detail::InsertLane<15>(v, t);
  alignas(16) T lanes[16];
  return Load(d, lanes);
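// Usage sketch (illustrative comment, not part of the original header): the
// public ExtractLane/InsertLane take a runtime index; when the index is a
// compile-time constant the branches above collapse to a single
// extract_lane/replace_lane instruction.
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<int32_t> d;
//   auto v = hn::Iota(d, 0);                  // {0, 1, 2, 3}
//   const int32_t x = hn::ExtractLane(v, 2);  // 2
//   v = hn::InsertLane(v, 3, 99);             // {0, 1, 2, 99}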
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
template <typename T, size_t N>
  return Vec128<T, N / 2>{v.raw};
template <int kBytes, class D>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                          7, 8, 9, 10, 11, 12, 13, 14)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                          6, 7, 8, 9, 10, 11, 12, 13)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                          4, 5, 6, 7, 8, 9, 10, 11, 12)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                          3, 4, 5, 6, 7, 8, 9, 10, 11)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                          2, 3, 4, 5, 6, 7, 8, 9, 10)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 0,
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 16,
template <int kBytes, typename T, size_t N>
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
template <int kLanes, class D>
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
template <int kLanes, typename T, size_t N>
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
template <int kBytes, typename T, size_t N>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                12, 13, 14, 15, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                14, 15, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
template <int kBytes, class D>
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    const VFromD<decltype(dfull)> vfull{v.raw};
  return VFromD<D>{detail::ShrBytes<kBytes>(v)};
template <int kLanes, class D>
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
template <class D, typename T = TFromD<D>>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
template <int kBytes, class D, typename T = TFromD<D>>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15, 16)};
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
                                          9, 10, 11, 12, 13, 14, 15, 16, 17)};
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
                                          10, 11, 12, 13, 14, 15, 16, 17, 18)};
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
                                          11, 12, 13, 14, 15, 16, 17, 18, 19)};
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
                                          12, 13, 14, 15, 16, 17, 18, 19, 20)};
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
                                          12, 13, 14, 15, 16, 17, 18, 19, 20,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15, 16, 17, 18, 19, 20, 21,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
                                          14, 15, 16, 17, 18, 19, 20, 21, 22,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
                                          15, 16, 17, 18, 19, 20, 21, 22, 23,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
                                          15, 16, 17, 18, 19, 20, 21, 22, 23,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
                                          16, 17, 18, 19, 20, 21, 22, 23, 24,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
                                          17, 18, 19, 20, 21, 22, 23, 24, 25,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
                                          18, 19, 20, 21, 22, 23, 24, 25, 26,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
                                          19, 20, 21, 22, 23, 24, 25, 26, 27,
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28,
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
  const V8 hi8{BitCast(d8, hi).raw};
template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane,
      kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                         kLane, kLane, kLane, kLane, kLane)};
template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
template <typename T, size_t N, typename TI, size_t NI>
                                         const Vec128<TI, NI> from) {
  return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
template <typename T, size_t N, typename TI, size_t NI>
                                           const Vec128<TI, NI> from) {
  const DFromV<decltype(from)> d;
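// Usage sketch (illustrative comment, not part of the original header):
// TableLookupBytes selects bytes of the first argument using the byte indices
// in the second; here it maps to wasm_i8x16_swizzle. For example, reversing
// the bytes of a 16-byte table:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint8_t> d8;
//   const auto table = hn::Iota(d8, 0);  // bytes 0..15
//   alignas(16) const uint8_t rev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                                        7,  6,  5,  4,  3,  2,  1, 0};
//   const auto idx = hn::Load(d8, rev);
//   const auto r = hn::TableLookupBytes(table, idx);  // bytes 15..0
// If indices may be out of range, TableLookupBytesOr0 returns 0 for indices
// with the high bit set.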
template <typename T, size_t N>
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
                                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
                                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
                                    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
                                 const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
                                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
                                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
                                    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
                                 const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
                                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
                                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
                                    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
                                 const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
template <typename T>
  static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
template <typename T>
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
template <typename T>
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
template <typename T>
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
template <typename T>
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
template <typename T, size_t N = 16 / sizeof(T)>
template <class D, HWY_IF_T_SIZE_D(D, 1)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
template <class D, HWY_IF_T_SIZE_D(D, 1)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;
  using V8 = VFromD<decltype(d8)>;
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
                                                   D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
template <typename T, size_t N>
  const Rebind<TI, decltype(d)> di;
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
                                        Indices128<T, N> idx) {
  const Twice<decltype(d)> dt;
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
  const Indices128<T, N * 2> idx2{idx.raw};
template <typename T>
                                     Indices128<T> idx) {
  const VFromD<decltype(du8)> byte_idx{idx.raw};
  const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F});
  const auto is_lo = (byte_idx == byte_idx_mod);
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
  return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
  static constexpr int kN = 16 + Lanes(d);
      kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9,
      kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)};
template <class D, HWY_IF_T_SIZE_D(D, 2)>
template <class D, HWY_IF_T_SIZE_D(D, 4)>
template <class D, HWY_IF_T_SIZE_D(D, 8)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  return VFromD<D>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)};
template <class D, HWY_IF_T_SIZE_D(D, 4)>
template <class D, HWY_IF_T_SIZE_D(D, 8)>
template <class D, HWY_IF_T_SIZE_D(D, 2)>
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D, const VFromD<D>) {
      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
template <class T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_SPECIAL_FLOAT(T)>
      26, 11, 27, 12, 28, 13, 29, 14,
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
      26, 11, 27, 12, 28, 13, 29, 14,
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
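// Usage sketch (illustrative comment, not part of the original header): the
// shuffles above implement InterleaveLower/InterleaveUpper, which alternate
// lanes from the lower (respectively upper) halves of two vectors:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint32_t> d;
//   const auto a = hn::Iota(d, 0);                 // {0, 1, 2, 3}
//   const auto b = hn::Iota(d, 4);                 // {4, 5, 6, 7}
//   const auto lo = hn::InterleaveLower(a, b);     // {0, 4, 1, 5}
//   const auto hi = hn::InterleaveUpper(d, a, b);  // {2, 6, 3, 7}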
template <class D, typename T = TFromD<D>>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
template <class V, class DW = RepartitionToWide<DFromV<V>>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
template <size_t kIdx3210, size_t kVectSize, class V,
  constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
  constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
  constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
  constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
  return V{wasm_i8x16_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3,
                              kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4,
                              kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
                              kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
template <size_t kIdx3210, size_t kVectSize, class V,
  constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
  constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
  constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
  constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
  return V{wasm_i16x8_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3,
                              kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
template <size_t kIdx3210, size_t kVectSize, class V,
  constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
  constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
  constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
  constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
  return V{wasm_i32x4_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
  const Full64<uint64_t> du64;
      d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
template <class V, HWY_IF_V_SIZE_V(V, 16)>
      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
template <class D, HWY_IF_LANES_D(D, 1)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
        return ShiftLeftLanes<2>(d, v);
        return ShiftLeftLanes<3>(d, v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
        return ShiftLeftLanes<2>(d, v);
        return ShiftLeftLanes<3>(d, v);
        return ShiftLeftLanes<4>(d, v);
        return ShiftLeftLanes<5>(d, v);
        return ShiftLeftLanes<6>(d, v);
        return ShiftLeftLanes<7>(d, v);
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftLeftLanes<1>(d, v);
        return ShiftLeftLanes<2>(d, v);
        return ShiftLeftLanes<3>(d, v);
        return ShiftLeftLanes<4>(d, v);
        return ShiftLeftLanes<5>(d, v);
        return ShiftLeftLanes<6>(d, v);
        return ShiftLeftLanes<7>(d, v);
        return ShiftLeftLanes<8>(d, v);
        return ShiftLeftLanes<9>(d, v);
        return ShiftLeftLanes<10>(d, v);
        return ShiftLeftLanes<11>(d, v);
        return ShiftLeftLanes<12>(d, v);
        return ShiftLeftLanes<13>(d, v);
        return ShiftLeftLanes<14>(d, v);
        return ShiftLeftLanes<15>(d, v);
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
                             static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
template <class V, HWY_IF_V_SIZE_V(V, 16)>
  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
template <class D, HWY_IF_LANES_D(D, 1)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
        return ShiftRightLanes<2>(d, v);
        return ShiftRightLanes<3>(d, v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
        return ShiftRightLanes<2>(d, v);
        return ShiftRightLanes<3>(d, v);
        return ShiftRightLanes<4>(d, v);
        return ShiftRightLanes<5>(d, v);
        return ShiftRightLanes<6>(d, v);
        return ShiftRightLanes<7>(d, v);
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
        return ShiftRightLanes<1>(d, v);
        return ShiftRightLanes<2>(d, v);
        return ShiftRightLanes<3>(d, v);
        return ShiftRightLanes<4>(d, v);
        return ShiftRightLanes<5>(d, v);
        return ShiftRightLanes<6>(d, v);
        return ShiftRightLanes<7>(d, v);
        return ShiftRightLanes<8>(d, v);
        return ShiftRightLanes<9>(d, v);
        return ShiftRightLanes<10>(d, v);
        return ShiftRightLanes<11>(d, v);
        return ShiftRightLanes<12>(d, v);
        return ShiftRightLanes<13>(d, v);
        return ShiftRightLanes<14>(d, v);
        return ShiftRightLanes<15>(d, v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
  const Half<decltype(d)> dh;
  const VU lo{BitCast(duh, lo_half).raw};
  const VU hi{BitCast(duh, hi_half).raw};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
template <class D, typename T = TFromD<D>>
  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
template <class D, typename T = TFromD<D>>
  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
template <class D, typename T = TFromD<D>>
  return CombineShiftRightBytes<8>(d, hi, lo);
template <class D, typename T = TFromD<D>>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  const Half<decltype(d)> d2;
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
                                      17, 19, 21, 23, 25, 27, 29, 31)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
                                         23, 1, 3, 5, 7, 17, 19, 21, 23)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
                                         19, 1, 3, 17, 19, 1, 3, 17, 19)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
      wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
  return Vec128<T, 4>{
      wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
                                      16, 18, 20, 22, 24, 26, 28, 30)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  return Vec64<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22,
                                     0, 2, 4, 6, 16, 18, 20, 22)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  return Vec32<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18,
                                     0, 2, 16, 18, 0, 2, 16, 18)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
      wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
  return Vec64<T>{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6,
                                         8, 8, 10, 10, 12, 12, 14, 14)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7,
                                         9, 9, 11, 11, 13, 13, 15, 15)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
template <typename T, size_t N>
  alignas(16) static constexpr uint8_t mask[16] = {
      0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
template <typename T, size_t N>
      wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
  return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
                                      8, 24, 10, 26, 12, 28, 14, 30)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
  return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
  return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
  return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
                                      9, 25, 11, 27, 13, 29, 15, 31)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
  return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
  return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
template <typename T, size_t N>
template <typename T, size_t N>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
  return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
  return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
  return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
  return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
  return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
  return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
  const Rebind<uint32_t, decltype(d)> du32;
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
  return VFromD<D>{wasm_i16x8_extend_low_i8x16(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
  return VFromD<D>{wasm_i32x4_extend_low_i16x8(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
  return VFromD<D>{wasm_i64x2_extend_low_i32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
  const Rebind<int32_t, decltype(d)> di32;
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  const Rebind<uint16_t, decltype(df32)> du16;
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  return VFromD<D>{wasm_f64x2_convert_low_i32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  return VFromD<D>{wasm_f64x2_convert_low_u32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  return VFromD<D>{wasm_f64x2_promote_low_f32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
  const Rebind<int32_t, decltype(di64)> di32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto exponent_adj = BitCast(
          BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
      BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  const auto f32_to_i32_result = ConvertTo(di32, adj_v);
                      Set(di32, LimitsMax<int32_t>())))));
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
  const Rebind<uint32_t, decltype(du64)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  const auto exponent_adj = BitCast(
          BitCast(du32_as_du8, Set(du32, uint32_t{158}))),
      BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  const auto f32_to_u32_result = ConvertTo(du32, adj_v);
      VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>())));
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif
4068template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
4071 return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)};
4073template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
4076 return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)};
4078template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
4081 return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)};
4084template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
4086 VFromD<Repartition<uint8_t, D>> v) {
4087 return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)};
4089template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4091 VFromD<Repartition<uint16_t, D>> v) {
4092 return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)};
4094template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
4096 VFromD<Repartition<uint32_t, D>> v) {
4097 return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)};
4101template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
4104 return VFromD<D>{wasm_i16x8_extend_high_i8x16(v.raw)};
4106template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
4109 return VFromD<D>{wasm_i32x4_extend_high_i16x8(v.raw)};
4111template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
4114 return VFromD<D>{wasm_i64x2_extend_high_i32x4(v.raw)};
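// Example sketch (not part of this header): widening a full vector into two halves,
// PromoteTo for the lower lanes and the PromoteUpperTo overloads above for the upper
// lanes. Assumes hwy/highway.h is included.
namespace hn = hwy::HWY_NAMESPACE;
HWY_ATTR void WidenU8ToU16(const uint8_t* HWY_RESTRICT in, uint16_t* HWY_RESTRICT out) {
  const hn::Full128<uint8_t> d8;    // 16 x uint8_t
  const hn::Full128<uint16_t> d16;  // 8 x uint16_t
  const auto v8 = hn::Load(d8, in);
  hn::Store(hn::PromoteTo(d16, hn::LowerHalf(v8)), d16, out);
  hn::Store(hn::PromoteUpperTo(d16, v8), d16, out + 8);
}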
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
  const Rebind<TFromV<V>, decltype(d)> dh;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  return VFromD<D>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
  return VFromD<D>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return VFromD<D>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  return VFromD<D>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return VFromD<D>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
  return VFromD<D>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
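// Example sketch (not part of this header): DemoteTo narrows with saturation, so an
// out-of-range lane clamps to the destination type's limit. Assumes hwy/highway.h.
namespace hn = hwy::HWY_NAMESPACE;
HWY_ATTR int16_t SaturatingDemoteExample() {
  const hn::Full128<int32_t> d32;
  const hn::Rebind<int16_t, decltype(d32)> d16;  // 4 x int16_t
  return hn::GetLane(hn::DemoteTo(d16, hn::Set(d32, 70000)));  // 32767
}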
  const DFromV<decltype(v)> du32;
template <class D, HWY_IF_U8_D(D)>
  const DFromV<decltype(v)> du16;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  return VFromD<D>{wasm_u32x4_trunc_sat_f64x2_zero(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  return VFromD<D>{wasm_f32x4_demote_f64x2_zero(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  const Rebind<double, decltype(df32)> df64;
  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
  const auto f64_lo12 =
      Set(du32, uint32_t{0x00000FFF}))));
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
  const auto f64_bits_decrement =
      f64_sum_is_inexact);
  const auto adj_f64_val = BitCast(
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
  return DemoteTo(df32, adj_f64_val);
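// Standalone sketch (an assumption about intent, inferred from the carry/inexact handling
// above): demoting int64_t to float through a plain double intermediate can round twice.
// With round-to-nearest, the value below rounds to 2^53 + 2^29 in double and then to 2^53
// in float, whereas a single correctly rounded int64 -> float conversion gives 2^53 + 2^30.
#include <cstdint>
#include <cstdio>
int main() {
  const int64_t x = (int64_t{1} << 53) + (int64_t{1} << 29) + 1;
  const float direct = static_cast<float>(x);                           // typically 2^53 + 2^30
  const float via_double = static_cast<float>(static_cast<double>(x));  // 2^53
  std::printf("%.0f vs %.0f\n", direct, via_double);
  return 0;
}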
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  const Rebind<double, decltype(df32)> df64;
  const auto k2p64 = Set(df64, 18446744073709551616.0);
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
      Set(du32, uint32_t{0x00000FFF}))));
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
  const auto adj_f64_val = BitCast(
      f64_sum_is_inexact));
  return DemoteTo(df32, adj_f64_val);
template <class D, HWY_IF_I16_D(D)>
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_I16_D(D)>
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const Vec128<int16_t> v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
template <class D, HWY_IF_I16_D(D)>
                                          Vec128<int32_t> b) {
  return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
template <class D, HWY_IF_U16_D(D)>
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_U16_D(D)>
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const Vec128<int16_t> v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
template <class D, HWY_IF_U16_D(D)>
                                          Vec128<int32_t> b) {
  return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(a.raw, b.raw)};
template <class D, HWY_IF_U16_D(D)>
  const DFromV<decltype(a)> du32;
  const auto max_i32 = Set(du32, 0x7FFFFFFFu);
  const auto clamped_a = BitCast(di32, Min(a, max_i32));
  const auto clamped_b = BitCast(di32, Min(b, max_i32));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
                                     VFromD<Repartition<int16_t, D>> b) {
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_I8_D(D)>
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const auto vu32_full = BitCast(du32_full, v_full);
template <class D, HWY_IF_I8_D(D)>
                                          Vec128<int16_t> b) {
  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(a.raw, b.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
                                     VFromD<Repartition<int16_t, D>> b) {
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_U8_D(D)>
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const auto vu32_full = BitCast(du32_full, v_full);
template <class D, HWY_IF_U8_D(D)>
                                          Vec128<int16_t> b) {
  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(a.raw, b.raw)};
template <class D, HWY_IF_U8_D(D)>
  const DFromV<decltype(a)> du16;
  const auto max_i16 = Set(du16, 0x7FFFu);
  const auto clamped_a = BitCast(di16, Min(a, max_i16));
  const auto clamped_b = BitCast(di16, Min(b, max_i16));
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
                                     VFromD<Repartition<uint16_t, D>> b) {
  const Twice<decltype(d)> dt;
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint8_t, N>{
      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
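// Example sketch (not part of this header): U8FromU32 packs 32-bit lanes that are already
// known to fit in a byte, using the same double narrowing shown above. Assumes hwy/highway.h.
namespace hn = hwy::HWY_NAMESPACE;
HWY_ATTR void PackU32ToU8(const uint32_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT out) {
  const hn::Full128<uint32_t> d32;
  const hn::Rebind<uint8_t, decltype(d32)> d8;  // 4 x uint8_t
  hn::Store(hn::U8FromU32(hn::Load(d32, in)), d8, out);
}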
template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
template <class D, HWY_IF_U8_D(D)>
  const Full128<uint8_t> d;
template <class D, HWY_IF_U16_D(D)>
  const Full128<uint16_t> d;
template <class D, HWY_IF_U32_D(D)>
  const Full128<uint32_t> d;
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
  const auto v1 = Vec128<uint8_t>{v.raw};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
  const auto v1 = Vec128<uint16_t>{v.raw};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
  const auto v1 = Vec128<uint8_t>{v.raw};
template <class D, HWY_IF_UNSIGNED_D(D)>
template <class D, HWY_IF_SIGNED_D(D)>
  const DFromV<decltype(v)> du64;
  constexpr int kShiftAmt =
      static_cast<int>(sizeof(TFromD<D>) * 8) -
      static_cast<int>(hwy::IsSigned<TFromD<D>>());
      di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
template <class D, class V>
  const DFromV<decltype(v)> di64;
  const auto saturated_vals = Xor(
  const DFromV<decltype(v)> di64;
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
                                     VFromD<Repartition<uint64_t, D>> b) {
  const Twice<decltype(d)> dt;
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
  const DFromV<decltype(a)> di64;
  const Half<decltype(dn)> dnh;
  const auto saturated_a = Xor(
  const auto saturated_b = Xor(
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  const DFromV<decltype(a)> di64;
  const Half<decltype(dn)> dnh;
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  const Half<decltype(dn)> dnh;
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  return VFromD<D>{wasm_f32x4_convert_i32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  return VFromD<D>{wasm_f32x4_convert_u32x4(v.raw)};
template <class D, HWY_IF_F64_D(D)>
  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
  const auto k52 = Set(d32, 0x43300000);
  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
  return (v_upper - k84_63_52) + v_lower;
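// Standalone scalar model (not from this header) of the constant-based int64 -> double
// conversion above: each 32-bit half is planted in the mantissa of a large power of two,
// then the biases (2^84 + 2^63 and 2^52) cancel out of the final sum.
#include <cstdint>
#include <cstring>
double Int64ToDouble(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  const uint64_t hi_bits = (u >> 32) ^ 0x4530000080000000ULL;            // 2^84 + 2^63 + hi word
  const uint64_t lo_bits = (u & 0xFFFFFFFFULL) | 0x4330000000000000ULL;  // 2^52 + lo word
  const uint64_t bias_bits = 0x4530000080100000ULL;                      // 2^84 + 2^63 + 2^52
  double hi, lo, bias;
  std::memcpy(&hi, &hi_bits, 8);
  std::memcpy(&lo, &lo_bits, 8);
  std::memcpy(&bias, &bias_bits, 8);
  return (hi - bias) + lo;
}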
  const DFromV<decltype(w)> d64;
  const auto cnst2_52_dbl = Set(dd, 0x0010000000000000);
template <class D, HWY_IF_F64_D(D)>
  using VU = VFromD<decltype(d64)>;
  const VU msk_lo = Set(d64, 0xFFFFFFFF);
  const auto cnst2_32_dbl = Set(dd, 4294967296.0);
  const VU v_lo = And(v, msk_lo);
  const VU v_hi = ShiftRight<32>(v);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
  return VFromD<D>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
  return VFromD<D>{wasm_u32x4_trunc_sat_f32x4(v.raw)};
template <class DI, HWY_IF_I64_D(DI)>
  using VI = VFromD<decltype(di)>;
  using MI = MFromD<decltype(di)>;
  using VU = VFromD<decltype(du)>;
  const VI k1075 = Set(di, 1075);
  const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF);
  const MI in_range = BitCast(di, biased_exp) < Set(di, 1086);
  const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1);
  VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
  const MI tiny = BitCast(di, shift_mnt) > Set(di, 63);
  const VU shifted = int53 << shift_int;
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  return (magnitude ^ sign_mask) - sign_mask;
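// Standalone scalar model (not from this header) of the final line above: with sign_mask
// equal to 0 for non-negative inputs and all-ones (-1) for negative inputs,
// (magnitude ^ sign_mask) - sign_mask negates without a branch.
#include <cstdint>
int64_t ApplySign(int64_t magnitude, int64_t sign_mask /* 0 or -1 */) {
  return (magnitude ^ sign_mask) - sign_mask;  // magnitude, or -magnitude when mask is -1
}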
template <class DU, HWY_IF_U64_D(DU)>
  using MI = MFromD<decltype(di)>;
  using VU = VFromD<decltype(du)>;
  const VU k1075 = Set(du, 1075);
  const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v));
  const VU out_of_range =
  const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1);
  VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
  const MI tiny = BitCast(di, shift_mnt) > Set(di, 63);
  const VU shifted = int53 << shift_int;
  return (shifted | out_of_range);
  const DFromV<decltype(v)> du8;
  using VU16 = VFromD<decltype(du16)>;
  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
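// Standalone scalar model (not from this header) of what the shift/add ladder above
// computes: each 64-bit result lane holds the sum of its eight consecutive input bytes.
#include <cstddef>
#include <cstdint>
uint64_t SumOf8Bytes(const uint8_t* bytes) {
  uint64_t sum = 0;
  for (size_t i = 0; i < 8; ++i) sum += bytes[i];  // at most 8 * 255 = 2040, fits in 16 bits
  return sum;
}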
  const DFromV<decltype(v)> di8;
  using VI16 = VFromD<decltype(di16)>;
  const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
  const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
  const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
  const VI16 sDC_zz_98_zz_54_zz_10_zz =
      BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VI16 sFC_xx_B8_xx_74_xx_30_xx =
      Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
  const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
      BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
  const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
      Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
  return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
template <class D, HWY_IF_T_SIZE_D(D, 1)>
  const VFromD<D> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                                    1, 1, 1, 1, 1, 1, 1, 1};
  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                                   1, 2, 4, 8, 16, 32, 64, 128};
template <class D, HWY_IF_T_SIZE_D(D, 2)>
  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
template <class D, HWY_IF_T_SIZE_D(D, 4)>
  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
template <class D, HWY_IF_T_SIZE_D(D, 8)>
  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  uint64_t mask_bits = 0;
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  if (kN < 8) mask_bits &= (1u << kN) - 1;
template <typename T>
                                     const Mask128<T> mask) {
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, mask.raw);
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
template <typename T>
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
  uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
  bytes &= (1ULL << (N * 8)) - 1;
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (bytes * kMagic) >> 56;
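// Standalone check (not from this header) of the multiply-and-shift used above: for a
// packed byte mask (each byte 0x00 or 0xFF), multiplying by 0x103070F1F3F80 and keeping
// the top byte yields one bit per byte, because each 0xFF byte contributes a distinct
// power of two to byte 7 of the product and no per-byte sum can carry.
#include <cstdint>
#include <cstdio>
int main() {
  const uint64_t kMagic = 0x103070F1F3F80ULL;
  for (uint32_t m = 0; m < 256; ++m) {
    uint64_t bytes = 0;
    for (int i = 0; i < 8; ++i) {
      if (m & (1u << i)) bytes |= 0xFFull << (8 * i);
    }
    if (((bytes * kMagic) >> 56) != m) return 1;  // does not happen
  }
  std::puts("all 256 byte masks verified");
  return 0;
}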
template <typename T, size_t N>
                                     const Mask128<T, N> mask) {
  const __i16x8 zero = wasm_i16x8_splat(0);
  const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
template <typename T, size_t N>
                                     const Mask128<T, N> mask) {
  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
  alignas(16) uint32_t lanes[4];
  wasm_v128_store(lanes, sliced_mask);
  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
template <typename T, size_t N>
                                     const Mask128<T, N> mask) {
  const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
  const __i64x2 slice = wasm_i64x2_make(1, 2);
  const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, sliced_mask);
  return lanes[0] | lanes[1];
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
4992 (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
4993 : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
4994 : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
4995 : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
4996 : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
4997 : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
4998 : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
4999 : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
5000 : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
5001 : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
5003 : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
5005 : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
5007 : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
5009 : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
5012 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
5014 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
5015 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
template <typename T, size_t N>
template <typename T>
template <typename T>
template <typename T>
  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, shifted_bits);
  return PopCount(lanes[0] | lanes[1]);
template <typename T>
  alignas(16) int64_t lanes[2];
  wasm_v128_store(lanes, m.raw);
  return static_cast<size_t>(-(lanes[0] + lanes[1]));
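// Example sketch (not part of this header): counting lanes that satisfy a predicate via
// the CountTrue specializations above. Assumes hwy/highway.h is included.
namespace hn = hwy::HWY_NAMESPACE;
HWY_ATTR size_t CountNegative(const int32_t* HWY_RESTRICT in) {
  const hn::Full128<int32_t> d;
  const auto v = hn::Load(d, in);
  return hn::CountTrue(d, hn::Lt(v, hn::Zero(d)));  // number of negative lanes, 0..4
}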
  const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);
template <class D, HWY_IF_V_SIZE_D(D, 16)>
template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
template <class D, HWY_IF_V_SIZE_D(D, 16)>
  return !wasm_v128_any_true(v8.raw);
template <typename T>
  return wasm_i8x16_all_true(m.raw);
template <typename T>
  return wasm_i16x8_all_true(m.raw);
template <typename T>
  return wasm_i32x4_all_true(m.raw);
template <typename T>
  return wasm_i64x2_all_true(m.raw);
template <class D, typename T = TFromD<D>>
template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  const Rebind<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t table[256 * 8] = {
5167 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5168 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5169 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5170 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5171 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5172 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5173 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5174 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5175 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5176 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5177 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5178 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5179 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5180 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5181 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5182 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5183 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5184 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5185 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5186 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5187 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5188 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5189 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5190 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5191 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5192 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5193 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5194 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5195 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5196 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5197 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5198 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5199 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5200 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5201 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5202 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5203 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5204 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5205 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5206 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5207 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5208 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5209 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5210 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5211 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5212 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5213 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5214 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5215 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5216 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5217 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5218 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5219 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5220 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5221 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5222 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5223 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5224 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5225 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5226 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5227 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5228 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5229 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5230 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5231 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5232 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5233 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5234 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5235 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5236 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5237 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5238 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5239 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5240 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5241 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5242 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5243 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5244 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5245 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5246 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5247 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5248 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5249 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5250 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5251 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5252 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5253 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5254 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5255 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5256 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5257 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5258 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5259 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5260 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5261 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5262 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5263 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5264 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5265 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5266 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5267 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5268 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5269 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5270 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5271 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5272 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5273 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5274 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5275 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5276 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5277 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5278 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5279 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5280 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5281 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5282 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5283 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5284 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5285 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5286 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5287 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5288 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5289 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5290 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5291 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5292 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5293 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5294 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
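// Standalone sketch (not from this header) of how one 8-entry row of the table above can
// be derived: byte offsets (2 * lane, because lanes are 2 bytes wide) of the selected
// lanes come first, in order, followed by the offsets of the unselected lanes.
#include <cstdint>
#include <cstdio>
int main() {
  const unsigned mask_bits = 0x05;  // lanes 0 and 2 selected
  uint8_t row[8];
  int pos = 0;
  for (int lane = 0; lane < 8; ++lane)
    if (mask_bits & (1u << lane)) row[pos++] = static_cast<uint8_t>(2 * lane);
  for (int lane = 0; lane < 8; ++lane)
    if (!(mask_bits & (1u << lane))) row[pos++] = static_cast<uint8_t>(2 * lane);
  for (int i = 0; i < 8; ++i) std::printf("%d ", row[i]);  // 0 4 2 6 8 10 12 14
  return 0;
}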
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  const Rebind<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t table[256 * 8] = {
5315 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
5316 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
5317 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
5318 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5319 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
5320 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
5321 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
5322 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
5323 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
5324 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
5325 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
5326 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
5327 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
5328 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
5329 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
5330 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
5331 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
5332 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
5333 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
5334 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
5335 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
5336 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
5337 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
5338 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
5339 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
5340 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
5341 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
5342 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
5343 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
5344 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
5345 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
5346 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
5347 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
5348 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
5349 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
5350 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
5351 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
5352 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
5353 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
5354 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
5355 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
5356 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
5357 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
5358 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
5359 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
5360 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
5361 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
5362 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
5363 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
5364 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
5365 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
5366 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
5367 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
5368 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
5369 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
5370 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
5371 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
5372 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
5373 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
5374 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
5375 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
5376 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
5377 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
5378 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
5379 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
5380 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
5381 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
5382 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
5383 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
5384 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
5385 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
5386 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
5387 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
5388 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
5389 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
5390 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
5391 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
5392 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
5393 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
5394 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
5395 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
5396 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
5397 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
5398 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
5399 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
5400 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
5401 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
5402 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
5403 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
5404 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
5405 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
5406 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
5407 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
5408 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
5409 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
5410 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
5411 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
5412 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
5413 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
5414 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
5415 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
5416 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
5417 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
5418 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
5419 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
5420 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
5421 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
5422 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
5423 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
5424 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
5425 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
5426 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
5427 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
5428 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
5429 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
5430 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
5431 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
5432 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
5433 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
5434 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
5435 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
5436 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
5437 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
5438 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
5439 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
5440 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
5441 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
5442 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
5456 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5457 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5458 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
5459 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5460 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
5461 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
5462 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
5463 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5464 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5465 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
5466 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
5467 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5468 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5469 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
5470 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5471 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd<T, N, 0> d;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
5484 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5485 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5486 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5487 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5488 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5489 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5490 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5491 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5492 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5493 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5494 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5495 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5496 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5497 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
  const Simd<T, N, 0> d;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
5511 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5512 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5513 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5514 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd<T, N, 0> d;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
  alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
5528 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5529 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5530 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5531 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd<T, N, 0> d;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
template <typename T, size_t N>
  const auto idx = detail::IdxFromBits<T, N>(mask_bits);
template <typename T, size_t N>
  const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
template <typename T>
struct CompressIsPartition {
#if HWY_TARGET == HWY_WASM_EMU256
  enum { value = (sizeof(T) != 1) };
template <typename T>
template <typename T, HWY_IF_T_SIZE(T, 8)>
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskL, maskH);
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  return detail::Compress(v, detail::BitsFromMask(mask));
template <typename T>
template <typename T, HWY_IF_T_SIZE(T, 8)>
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskH, maskL);
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  if (N < 16 / sizeof(T)) {
    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
  return detail::CompressNot(v, detail::BitsFromMask(mask));
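// Example sketch (not part of this header): left-packing the lanes selected by a mask
// with Compress, then counting them to know how many output lanes are meaningful.
// Assumes hwy/highway.h is included.
namespace hn = hwy::HWY_NAMESPACE;
HWY_ATTR size_t KeepPositive(const float* HWY_RESTRICT in, float* HWY_RESTRICT out) {
  const hn::Full128<float> d;
  const auto v = hn::Load(d, in);
  const auto keep = hn::Gt(v, hn::Zero(d));
  hn::StoreU(hn::Compress(v, keep), d, out);  // selected lanes first
  return hn::CountTrue(d, keep);
}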
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> ) {
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  mask_bits &= (1ull << N) - 1;
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
  const size_t count = PopCount(mask_bits);
  const VFromD<decltype(du)> compressed =
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
  uint64_t mask_bits = 0;
  CopyBytes<(kN + 7) / 8>(bits, &mask_bits);
  mask_bits &= (1ull << kN) - 1;
  const FixedTag<T, 2> d;
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
  const Simd<T, N, 0> d;
  const auto neg_vmask =
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
  vmask = Or(vmask, Neg(vmask));
template <class T, size_t N>
  const FixedTag<T, 2> d;
  const auto zero = Zero(di);
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
  const Simd<T, N, 0> d;
  const auto only_first_vmask =
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
  const auto zero = Zero(di64);
  const FixedTag<T, 1> d;
  using TI = MakeSigned<T>;
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
  const Simd<T, N, 0> d;
template <class T, HWY_IF_UI64(T)>
  alignas(16) T mul[2];
  mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
                  static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
  return Load(Full128<T>(), mul);
template <class T, HWY_IF_UI64(T)>
  alignas(16) T mul[2];
  mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
                  static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
  return Load(Full128<T>(), mul);
template <class T, HWY_IF_UI64(T)>
  return Set(Full64<T>(), hi);
template <class T, HWY_IF_UI64(T)>
  Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
  const Rebind<uint32_t, decltype(df32)> du32;
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const Rebind<uint32_t, decltype(df32)> du32;
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  return VFromD<D32>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
  const auto lo16_mask = Set(du32, 0x0000FFFFu);
  const auto a0 = And(BitCast(du32, a), lo16_mask);
  const auto b0 = And(BitCast(du32, b), lo16_mask);
  const auto a1 = ShiftRight<16>(BitCast(du32, a));
  const auto b1 = ShiftRight<16>(BitCast(du32, b));
  return MulAdd(a1, b1, a0 * b0);
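// Standalone scalar model (not from this header) of the pairwise widening multiply-add
// implemented above: each 32-bit output lane is a[2i]*b[2i] + a[2i+1]*b[2i+1].
#include <cstdint>
uint32_t WidenMulPairwiseAddScalar(const uint16_t* a, const uint16_t* b, int i) {
  return static_cast<uint32_t>(a[2 * i]) * b[2 * i] +
         static_cast<uint32_t>(a[2 * i + 1]) * b[2 * i + 1];
}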
                                     const Vec128<int32_t, N> sum0,
                                     const Vec128<int32_t, N> ) {
                                     const Vec128<uint32_t, N> sum0,
                                     const Vec128<uint32_t, N> ) {
                                     const Vec128<float, N> sum1) {
  return Add(sum0, sum1);
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment)
Definition base.h:2676
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2577
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2540
HWY_API size_t PopCount(T x)
Definition base.h:2615
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue()
Definition base.h:2212
#define HWY_IF_U32_D(D)
Definition ops/shared-inl.h:579
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_I64_D(D)
Definition ops/shared-inl.h:585
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_UI64_D(D)
Definition ops/shared-inl.h:592
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_I32_D(D)
Definition ops/shared-inl.h:584
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_SIGNED_D(D)
Definition ops/shared-inl.h:534
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_ALIGN
Definition set_macros-inl.h:167
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
@ value
Definition arm_neon-inl.h:8429
Definition arm_neon-inl.h:5654
__v128_u raw
Definition wasm_128-inl.h:2815
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:5655
Definition ops/shared-inl.h:198
HWY_INLINE __f64x2 operator()(__v128_u v)
Definition wasm_128-inl.h:188
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition wasm_128-inl.h:184
Definition wasm_128-inl.h:179
HWY_INLINE __v128_u operator()(__v128_u v)
Definition wasm_128-inl.h:180
__f64x2 type
Definition wasm_128-inl.h:68
__f32x4 type
Definition wasm_128-inl.h:64
Definition x86_128-inl.h:67
__v128_u type
Definition wasm_128-inl.h:60
int VFromD
Definition tuple-inl.h:25