35 return *
this = (*
this * other);
38 return *
this = (*
this / other);
41 return *
this = (*
this + other);
44 return *
this = (*
this - other);
47 return *
this = (*
this % other);
50 return *
this = (*
this & other);
53 return *
this = (*
this | other);
56 return *
this = (*
this ^ other);
72template <
class D, HWY_IF_V_SIZE_D(D, 32)>
74 const Half<
decltype(
d)> dh;
76 ret.
v0 = ret.v1 =
Zero(dh);
81template <
class D,
typename TFrom>
83 const Half<
decltype(
d)> dh;
103 const Half<
decltype(
d)> dh;
118template <
class D, HWY_IF_V_SIZE_D(D, 32),
typename T2>
120 const Half<
decltype(
d)> dh;
129template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
131 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
132 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
133 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
134 TFromD<D> t11, TFromD<D> t12,
135 TFromD<D> t13, TFromD<D> t14,
137 const Half<
decltype(
d)> dh;
139 ret.v0 = ret.v1 =
Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t8,
140 t9, t10, t11, t12, t13, t14, t15);
144template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
146 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
147 TFromD<D> t5, TFromD<D> t6,
149 const Half<
decltype(
d)> dh;
155template <
class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
157 TFromD<D> t2, TFromD<D> t3) {
158 const Half<
decltype(
d)> dh;
164template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
166 const Half<
decltype(
d)> dh;
233template <
int kBits,
typename T>
235 v.
v0 = ShiftLeft<kBits>(v.
v0);
236 v.
v1 = ShiftLeft<kBits>(v.
v1);
240template <
int kBits,
typename T>
242 v.
v0 = ShiftRight<kBits>(v.
v0);
243 v.
v1 = ShiftRight<kBits>(v.
v1);
248template <
int kBits,
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
253 constexpr size_t kSizeInBits =
sizeof(T) * 8;
254 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
255 if (kBits == 0)
return v;
324template <
class T, HWY_IF_UI64(T)>
340template <
class T, HWY_IF_UI64(T)>
358template <
typename T, HWY_IF_FLOAT(T)>
365template <
typename T, HWY_IF_FLOAT(T)>
421 return one /
Sqrt(v);
461template <
typename T, HWY_IF_FLOAT(T)>
471template <
typename T, HWY_IF_FLOAT(T)>
480 const VFromD<
decltype(di)> exp =
489template <
class DTo,
typename TFrom,
typename TTo = TFromD<DTo>>
491 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
497 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
498 return (v & bit) == bit;
551template <
class D, HWY_IF_V_SIZE_D(D, 32)>
554 using TI =
TFromD<
decltype(di)>;
597 return Xor(x1,
Xor(x2, x3));
602 return Or(o1,
Or(o2, o3));
607 return Or(o,
And(a1, a2));
635 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
636 const DFromV<
decltype(magn)>
d;
643 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
644 const DFromV<
decltype(sign)>
d;
659template <
class D,
typename T = TFromD<D>>
661 const Half<
decltype(
d)> dh;
733template <
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
741template <
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
750template <
typename T, HWY_IF_NOT_T_SIZE(T, 1)>
763template <
class D, HWY_IF_V_SIZE_D(D, 32)>
765 const Half<
decltype(
d)> dh;
767 ret.v0 =
Load(dh, aligned);
772template <
class D,
typename T = TFromD<D>>
777template <
class D,
typename T = TFromD<D>>
784template <
class D, HWY_IF_V_SIZE_D(D, 32)>
789template <
class D, HWY_IF_V_SIZE_D(D, 32)>
791 const Half<
decltype(
d)> dh;
793 ret.v0 = ret.v1 =
Load(dh,
p);
799template <
class D,
typename T = TFromD<D>>
801 const Half<
decltype(
d)> dh;
807template <
class D,
typename T = TFromD<D>>
812template <
class D,
typename T = TFromD<D>>
818template <
class D,
typename T = TFromD<D>>
831 alignas(32) T lanes[32 /
sizeof(T)];
840 alignas(32) T lanes[32 /
sizeof(T)];
843 return Load(
d, lanes);
847template <
int kBlockIdx,
class T>
849 static_assert(kBlockIdx == 0 || kBlockIdx == 1,
"Invalid block index");
850 return (kBlockIdx == 0) ? v.
v0 : v.
v1;
854template <
int kBlockIdx,
class T>
856 static_assert(kBlockIdx == 0 || kBlockIdx == 1,
"Invalid block index");
858 if (kBlockIdx == 0) {
859 result.
v0 = blk_to_insert;
863 result.
v1 = blk_to_insert;
869template <
int kBlockIdx,
class T>
871 static_assert(kBlockIdx == 0 || kBlockIdx == 1,
"Invalid block index");
873 result.
v0 = result.
v1 = (kBlockIdx == 0 ? v.
v0 : v.
v1);
879template <
class D,
typename T = TFromD<D>>
897template <
int kBytes,
class D,
typename T = TFromD<D>>
899 const Half<
decltype(
d)> dh;
900 v.
v0 = ShiftLeftBytes<kBytes>(dh, v.
v0);
901 v.
v1 = ShiftLeftBytes<kBytes>(dh, v.
v1);
905template <
int kBytes,
typename T>
907 return ShiftLeftBytes<kBytes>(
DFromV<
decltype(v)>(), v);
912template <
int kLanes,
class D,
typename T = TFromD<D>>
918template <
int kLanes,
typename T>
920 return ShiftLeftLanes<kLanes>(
DFromV<
decltype(v)>(), v);
924template <
int kBytes,
class D,
typename T = TFromD<D>>
926 const Half<
decltype(
d)> dh;
927 v.
v0 = ShiftRightBytes<kBytes>(dh, v.
v0);
928 v.
v1 = ShiftRightBytes<kBytes>(dh, v.
v1);
933template <
int kLanes,
class D,
typename T = TFromD<D>>
940template <
class D,
typename T = TFromD<D>>
947template <
int kBytes,
class D,
typename T = TFromD<D>>
949 const Half<
decltype(
d)> dh;
950 hi.
v0 = CombineShiftRightBytes<kBytes>(dh, hi.
v0, lo.
v0);
951 hi.
v1 = CombineShiftRightBytes<kBytes>(dh, hi.
v1, lo.
v1);
957template <
int kLane,
typename T>
960 ret.
v0 = Broadcast<kLane>(v.
v0);
961 ret.
v1 = Broadcast<kLane>(v.
v1);
965template <
int kLane,
typename T>
967 constexpr int kLanesPerBlock =
static_cast<int>(16 /
sizeof(T));
968 static_assert(0 <= kLane && kLane < kLanesPerBlock * 2,
"Invalid lane");
969 constexpr int kLaneInBlkIdx = kLane & (kLanesPerBlock - 1);
972 Broadcast<kLaneInBlkIdx>(kLane >= kLanesPerBlock ? v.
v1 : v.
v0);
979template <
typename T,
typename TI>
987template <
typename T,
typename TI,
size_t NI>
998template <
typename T,
size_t N,
typename TI>
1007template <
class V,
class VI>
1015template <
typename T>
1022template <
typename T>
1029template <
typename T>
1036template <
typename T>
1043template <
typename T>
1050template <
typename T>
1060template <
typename T, HWY_IF_T_SIZE(T, 4)>
1066template <
typename T, HWY_IF_T_SIZE(T, 4)>
1072template <
typename T, HWY_IF_T_SIZE(T, 4)>
1084template <
typename T>
1090template <
class D,
typename T = TFromD<D>,
typename TI>
1092 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
1094 ret.
i0 = vec.
v0.raw;
1095 ret.
i1 = vec.
v1.raw;
1099template <
class D, HWY_IF_V_SIZE_D(D, 32),
typename TI>
1101 const Rebind<TI,
decltype(
d)> di;
1105template <
typename T>
1108 const Half<
decltype(
d)> dh;
1118template <
typename T>
1124template <
typename T>
1128 const Half<
decltype(
d)> dh;
1131 constexpr size_t kLanesPerVect = 32 /
sizeof(TU);
1136 const auto vmod = vi &
Set(du, TU{kLanesPerVect - 1});
1148 return IfThenElse(is_lo, result_lo, result_hi);
1152template <
class D,
typename T = TFromD<D>>
1154 const Half<
decltype(
d)> dh;
1162template <
class D,
typename T = TFromD<D>>
1164 const Half<
decltype(
d)> dh;
1173template <
class D,
typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
1175 const Half<
decltype(
d)> dh;
1182template <
class D,
typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 8)>
1184 const Half<
decltype(
d)> dh;
1192template <
class D,
typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
1198template <
class D,
typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
1200 const Half<
decltype(
d)> dh;
1207template <
class D,
typename T = TFromD<D>,
1208 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1209HWY_API Vec256<T> Reverse8(D d, Vec256<T> v) {
1210 const Half<decltype(d)> dh;
1211 v.v0 = Reverse8(dh, v.v0);
1212 v.v1 = Reverse8(dh, v.v1);
1218template <
typename T>
1229template <
class D,
typename T = TFromD<D>>
1231 const Half<
decltype(
d)> dh;
1238template <
class D, HWY_IF_V_SIZE_D(D, 32)>
1240 const Half<
decltype(
d)> dh;
1248template <
class D, HWY_IF_V_SIZE_D(D, 32)>
1250 const Half<
decltype(
d)> dh;
1262template <
class D,
typename T = TFromD<D>>
1271template <
class D,
typename T = TFromD<D>>
1273 const Half<
decltype(
d)> dh;
1281template <
size_t kFromVectSize,
class DTo,
class DFrom,
1287 const Half<
decltype(d_to)> dh_to;
1294template <
class D,
typename T = TFromD<D>>
1303template <
class D,
typename T = TFromD<D>>
1312template <
class D,
typename T = TFromD<D>>
1321template <
class D,
typename T = TFromD<D>>
1330template <
class D,
typename T = TFromD<D>>
1332 const Half<
decltype(
d)> dh;
1340template <
class D,
typename T = TFromD<D>>
1342 const Half<
decltype(
d)> dh;
1350template <
typename T>
1358template <
typename T>
1366template <
typename T>
1374template <
class D, HWY_IF_V_SIZE_D(D, 32)>
1376 const Half<
decltype(
d)> dh;
1383template <
class D, HWY_IF_V_SIZE_D(D, 32)>
1385 const Half<
decltype(
d)> dh;
1392template <
typename T>
1399template <
typename T>
1408template <
class D,
typename T = TFromD<D>>
1416template <
size_t kIdx3210,
class V>
1421 const Half<
decltype(
d)> dh;
1422 using VH =
VFromD<
decltype(dh)>;
1424 constexpr int kIdx3 =
static_cast<int>((kIdx3210 >> 6) & 3);
1425 constexpr int kIdx2 =
static_cast<int>((kIdx3210 >> 4) & 3);
1426 constexpr int kIdx1 =
static_cast<int>((kIdx3210 >> 2) & 3);
1427 constexpr int kIdx0 =
static_cast<int>(kIdx3210 & 3);
1430 ret.v0 = VH{wasm_i8x16_shuffle(
1431 v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4,
1432 kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
1433 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
1434 ret.v1 = VH{wasm_i8x16_shuffle(
1435 v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4,
1436 kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
1437 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
1441template <
size_t kIdx3210,
class V>
1446 const Half<
decltype(
d)> dh;
1447 using VH =
VFromD<
decltype(dh)>;
1449 constexpr int kIdx3 =
static_cast<int>((kIdx3210 >> 6) & 3);
1450 constexpr int kIdx2 =
static_cast<int>((kIdx3210 >> 4) & 3);
1451 constexpr int kIdx1 =
static_cast<int>((kIdx3210 >> 2) & 3);
1452 constexpr int kIdx0 =
static_cast<int>(kIdx3210 & 3);
1455 ret.v0 = VH{wasm_i16x8_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3,
1456 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
1457 ret.v1 = VH{wasm_i16x8_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3,
1458 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
1462template <
size_t kIdx3210,
class V>
1467 const Half<
decltype(
d)> dh;
1468 using VH =
VFromD<
decltype(dh)>;
1470 constexpr int kIdx3 =
static_cast<int>((kIdx3210 >> 6) & 3);
1471 constexpr int kIdx2 =
static_cast<int>((kIdx3210 >> 4) & 3);
1472 constexpr int kIdx1 =
static_cast<int>((kIdx3210 >> 2) & 3);
1473 constexpr int kIdx0 =
static_cast<int>(kIdx3210 & 3);
1477 VH{wasm_i32x4_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
1479 VH{wasm_i32x4_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
1483template <
size_t kIdx3210,
class V>
1488 const Half<
decltype(
d)> dh;
1489 using VH =
VFromD<
decltype(dh)>;
1491 constexpr int kIdx3 =
static_cast<int>((kIdx3210 >> 6) & 3);
1492 constexpr int kIdx2 =
static_cast<int>((kIdx3210 >> 4) & 3);
1493 constexpr int kIdx1 =
static_cast<int>((kIdx3210 >> 2) & 3);
1494 constexpr int kIdx0 =
static_cast<int>(kIdx3210 & 3);
1497 ret.v0 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx0, kIdx1)};
1498 ret.v1 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx2, kIdx3)};
1505template <
int kBlocks,
class D, HWY_IF_V_SIZE_D(D, 32)>
1507 static_assert(0 <= kBlocks && kBlocks <= 1,
1508 "kBlocks must be between 0 and 1");
1513template <
int kBlocks,
class D, HWY_IF_V_SIZE_D(D, 32)>
1515 static_assert(0 <= kBlocks && kBlocks <= 1,
1516 "kBlocks must be between 0 and 1");
1517 const Half<
decltype(
d)> dh;
1523template <
class D, HWY_IF_V_SIZE_D(D, 32)>
1525 const Half<
decltype(
d)> dh;
1528 const auto vu =
BitCast(du, v);
1531#if !HWY_IS_DEBUG_BUILD
1532 constexpr size_t kLanesPerBlock = 16 /
sizeof(TFromD<D>);
1533 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) {
1534 switch (amt *
sizeof(TFromD<D>)) {
1538 ret.v0 =
BitCast(dh, ShiftLeftBytes<1>(dh_u, vu.v0));
1539 ret.v1 =
BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0));
1542 ret.v0 =
BitCast(dh, ShiftLeftBytes<2>(dh_u, vu.v0));
1543 ret.v1 =
BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0));
1546 ret.v0 =
BitCast(dh, ShiftLeftBytes<3>(dh_u, vu.v0));
1547 ret.v1 =
BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0));
1550 ret.v0 =
BitCast(dh, ShiftLeftBytes<4>(dh_u, vu.v0));
1551 ret.v1 =
BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0));
1554 ret.v0 =
BitCast(dh, ShiftLeftBytes<5>(dh_u, vu.v0));
1555 ret.v1 =
BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0));
1558 ret.v0 =
BitCast(dh, ShiftLeftBytes<6>(dh_u, vu.v0));
1559 ret.v1 =
BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0));
1562 ret.v0 =
BitCast(dh, ShiftLeftBytes<7>(dh_u, vu.v0));
1563 ret.v1 =
BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0));
1566 ret.v0 =
BitCast(dh, ShiftLeftBytes<8>(dh_u, vu.v0));
1567 ret.v1 =
BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0));
1570 ret.v0 =
BitCast(dh, ShiftLeftBytes<9>(dh_u, vu.v0));
1571 ret.v1 =
BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0));
1574 ret.v0 =
BitCast(dh, ShiftLeftBytes<10>(dh_u, vu.v0));
1575 ret.v1 =
BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0));
1578 ret.v0 =
BitCast(dh, ShiftLeftBytes<11>(dh_u, vu.v0));
1579 ret.v1 =
BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0));
1582 ret.v0 =
BitCast(dh, ShiftLeftBytes<12>(dh_u, vu.v0));
1583 ret.v1 =
BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0));
1586 ret.v0 =
BitCast(dh, ShiftLeftBytes<13>(dh_u, vu.v0));
1587 ret.v1 =
BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0));
1590 ret.v0 =
BitCast(dh, ShiftLeftBytes<14>(dh_u, vu.v0));
1591 ret.v1 =
BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0));
1594 ret.v0 =
BitCast(dh, ShiftLeftBytes<15>(dh_u, vu.v0));
1595 ret.v1 =
BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0));
1600 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
1609 const Half<
decltype(di8)> dh_i8;
1611 const auto lo_byte_idx =
BitCast(
1613 Iota(du8,
static_cast<uint8_t
>(
size_t{0} - amt *
sizeof(TFromD<D>))));
1615 const auto hi_byte_idx =
1616 UpperHalf(dh_i8, lo_byte_idx) -
Set(dh_i8, int8_t{16});
1617 const auto hi_sel_mask =
1618 UpperHalf(dh_i8, lo_byte_idx) >
Set(dh_i8, int8_t{15});
1630template <
typename D, HWY_IF_V_SIZE_D(D, 32)>
1633 const Half<
decltype(
d)> dh;
1634 constexpr int kShrByteAmt =
static_cast<int>(16 -
sizeof(TFromD<D>));
1635 ret.v0 = ShiftLeftLanes<1>(dh, v.v0);
1636 ret.v1 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0);
1642template <
class D, HWY_IF_V_SIZE_D(D, 32)>
1644 const Half<
decltype(
d)> dh;
1649 const auto vu =
BitCast(du, v);
1651#if !HWY_IS_DEBUG_BUILD
1652 constexpr size_t kLanesPerBlock = 16 /
sizeof(TFromD<D>);
1653 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) {
1654 switch (amt *
sizeof(TFromD<D>)) {
1658 ret.v0 =
BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0));
1659 ret.v1 =
BitCast(dh, ShiftRightBytes<1>(dh_u, vu.v1));
1662 ret.v0 =
BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0));
1663 ret.v1 =
BitCast(dh, ShiftRightBytes<2>(dh_u, vu.v1));
1666 ret.v0 =
BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0));
1667 ret.v1 =
BitCast(dh, ShiftRightBytes<3>(dh_u, vu.v1));
1670 ret.v0 =
BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0));
1671 ret.v1 =
BitCast(dh, ShiftRightBytes<4>(dh_u, vu.v1));
1674 ret.v0 =
BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0));
1675 ret.v1 =
BitCast(dh, ShiftRightBytes<5>(dh_u, vu.v1));
1678 ret.v0 =
BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0));
1679 ret.v1 =
BitCast(dh, ShiftRightBytes<6>(dh_u, vu.v1));
1682 ret.v0 =
BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0));
1683 ret.v1 =
BitCast(dh, ShiftRightBytes<7>(dh_u, vu.v1));
1686 ret.v0 =
BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0));
1687 ret.v1 =
BitCast(dh, ShiftRightBytes<8>(dh_u, vu.v1));
1690 ret.v0 =
BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0));
1691 ret.v1 =
BitCast(dh, ShiftRightBytes<9>(dh_u, vu.v1));
1694 ret.v0 =
BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0));
1695 ret.v1 =
BitCast(dh, ShiftRightBytes<10>(dh_u, vu.v1));
1698 ret.v0 =
BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0));
1699 ret.v1 =
BitCast(dh, ShiftRightBytes<11>(dh_u, vu.v1));
1702 ret.v0 =
BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0));
1703 ret.v1 =
BitCast(dh, ShiftRightBytes<12>(dh_u, vu.v1));
1706 ret.v0 =
BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0));
1707 ret.v1 =
BitCast(dh, ShiftRightBytes<13>(dh_u, vu.v1));
1710 ret.v0 =
BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0));
1711 ret.v1 =
BitCast(dh, ShiftRightBytes<14>(dh_u, vu.v1));
1714 ret.v0 =
BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0));
1715 ret.v1 =
BitCast(dh, ShiftRightBytes<15>(dh_u, vu.v1));
1720 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
1728 const Half<
decltype(du8)> dh_u8;
1730 const auto lo_byte_idx =
1731 Iota(du8,
static_cast<uint8_t
>(amt *
sizeof(TFromD<D>)));
1732 const auto u8_16 =
Set(du8, uint8_t{16});
1733 const auto hi_byte_idx = lo_byte_idx - u8_16;
1735 const auto lo_sel_mask =
1749template <
typename D, HWY_IF_V_SIZE_D(D, 32)>
1752 const Half<
decltype(
d)> dh;
1753 constexpr int kShrByteAmt =
static_cast<int>(
sizeof(TFromD<D>));
1754 ret.v0 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0);
1755 ret.v1 = ShiftRightBytes<kShrByteAmt>(dh, v.v1);
1766 const Half<
decltype(
d)> dh;
1780 const Half<
decltype(
d)> dh;
1795 const Half<
decltype(
d)> dh;
1809template <
class D,
class T>
1813 const Rebind<T,
decltype(
d)> dh;
1819template <
class D, HWY_IF_U16_D(D)>
1824template <
class D, HWY_IF_I16_D(D)>
1829template <
class D, HWY_IF_U8_D(D)>
1831 const auto intermediate = wasm_i16x8_narrow_i32x4(v.
v0.raw, v.
v1.raw);
1832 return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
1835template <
class D, HWY_IF_U8_D(D)>
1840template <
class D, HWY_IF_I8_D(D)>
1842 const auto intermediate = wasm_i16x8_narrow_i32x4(v.
v0.raw, v.
v1.raw);
1843 return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
1846template <
class D, HWY_IF_I8_D(D)>
1851template <
class D, HWY_IF_I32_D(D)>
1858template <
class D, HWY_IF_U32_D(D)>
1865template <
class D, HWY_IF_F32_D(D)>
1872template <
class D, HWY_IF_F32_D(D)>
1879template <
class D, HWY_IF_F16_D(D)>
1881 const Half<
decltype(d16)> d16h;
1896template <
class D, HWY_IF_U8_D(D)>
1899 8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
1903template <
class D, HWY_IF_U16_D(D)>
1906 17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
1910template <
class D, HWY_IF_U32_D(D)>
1913 9, 10, 11, 16, 17, 18, 19, 24, 25,
1917template <
class D, HWY_IF_U8_D(D)>
1920 20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
1924template <
class D, HWY_IF_U16_D(D)>
1927 9, 12, 13, 16, 17, 20, 21, 24, 25,
1931template <
class D, HWY_IF_U8_D(D)>
1934 10, 12, 14, 16, 18, 20, 22, 24, 26,
1944 const Half<
decltype(dn)> dnh;
1956 const Half<
decltype(dn)> dnh;
1965template <
class DTo,
typename TFrom,
typename TTo = TFromD<DTo>>
1967 const Half<
decltype(
d)> dh;
1986 const Half<
decltype(
d)> dh;
1991 constexpr size_t kBitsPerHalf = 16 /
sizeof(
TFromD<D>);
1992 const uint8_t bits_upper[8] = {
static_cast<uint8_t
>(bits[0] >> kBitsPerHalf)};
2000 const Half<
decltype(
d)> dh;
2003 constexpr size_t kLanesPerHalf = 16 /
sizeof(TFromD<D>);
2004 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
2005 static_assert(kBytesPerHalf != 0,
"Lane size <= 16 bits => at least 8 lanes");
2010template <
class D, HWY_IF_V_SIZE_D(D, 32)>
2012 const Half<
decltype(
d)> dh;
2021template <
class D,
typename T = TFromD<D>,
2022 HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
2023HWY_API
size_t StoreMaskBits(D d, const Mask256<T> mask, u
int8_t* bits) {
2024 const Half<decltype(d)> dh;
2025 StoreMaskBits(dh, mask.m0, bits);
2026 const u
int8_t lo = bits[0];
2027 StoreMaskBits(dh, mask.m1, bits);
2030 constexpr
size_t kBitsPerHalf = 16 / sizeof(T);
2031 bits[0] = static_cast<u
int8_t>(lo | (bits[0] << kBitsPerHalf));
2032 return (kBitsPerHalf * 2 + 7) / 8;
2035template <
class D,
typename T = TFromD<D>,
2036 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2037HWY_API
size_t StoreMaskBits(D d, const Mask256<T> mask, u
int8_t* bits) {
2038 const Half<decltype(d)> dh;
2039 constexpr
size_t kLanesPerHalf = 16 / sizeof(T);
2040 constexpr
size_t kBytesPerHalf = kLanesPerHalf / 8;
2041 static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
2042 StoreMaskBits(dh, mask.m0, bits);
2043 StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf);
2044 return kBytesPerHalf * 2;
2047template <
class D,
typename T = TFromD<D>>
2049 const Half<
decltype(
d)> dh;
2053template <
class D,
typename T = TFromD<D>>
2055 const Half<
decltype(
d)> dh;
2059template <
class D,
typename T = TFromD<D>>
2061 const Half<
decltype(
d)> dh;
2065template <
class D,
typename T = TFromD<D>>
2067 const Half<
decltype(
d)> dh;
2069 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
2070 return lo >= 0 ?
static_cast<size_t>(lo)
2074template <
class D,
typename T = TFromD<D>>
2076 const Half<
decltype(
d)> dh;
2078 constexpr int kLanesPerHalf = 16 /
sizeof(T);
2079 if (lo >= 0)
return lo;
2082 return hi + (hi >= 0 ? kLanesPerHalf : 0);
2085template <
class D,
typename T = TFromD<D>>
2087 const Half<
decltype(
d)> dh;
2089 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
2090 return hi >= 0 ? kLanesPerHalf +
static_cast<size_t>(hi)
2094template <
class D,
typename T = TFromD<D>>
2096 const Half<
decltype(
d)> dh;
2097 constexpr int kLanesPerHalf = 16 /
sizeof(T);
2103template <
class D,
typename T = TFromD<D>>
2106 const Half<
decltype(
d)> dh;
2109 return count + count2;
2113template <
class D,
typename T = TFromD<D>>
2116 const Half<
decltype(
d)> dh;
2119 return count + count2;
2124template <
class D,
typename T = TFromD<D>>
2132template <
typename T>
2135 alignas(32) T lanes[32 /
sizeof(T)] = {};
2137 return Load(
d, lanes);
2141template <
typename T>
2156template <
typename T>
2163template <
typename T>
2167 const Half<
decltype(
d)> dh;
2168 alignas(32) T lanes[32 /
sizeof(T)] = {};
2176template <
class D, HWY_IF_V_SIZE_D(D, 32)>
2196template <
class D,
typename T = TFromD<D>>
2218template <
class D,
typename T = TFromD<D>>
2247template <
class D,
typename T = TFromD<D>>
2264template <
class D,
typename T = TFromD<D>>
2285template <
class D,
typename T = TFromD<D>>
2307 const Half<
decltype(
d)> dh;
2318 dh_i64, vmask_lo, vmask_lo)))));
2333 const Half<
decltype(di64)> dh_i64;
2335 const auto zero =
Zero(di64);
2338 const auto vmask_eq_0 =
VecFromMask(di64, vmask == zero);
2339 auto vmask2_lo =
LowerHalf(dh_i64, vmask_eq_0);
2340 auto vmask2_hi =
UpperHalf(dh_i64, vmask_eq_0);
2347 const auto vmask2 =
Combine(di64, vmask2_hi, vmask2_lo);
2355 constexpr size_t kLanesPerBlock =
MaxLanes(
d) / 2;
2361 d, vmask, vmask_lo)));
2365template <
class D32,
typename T16,
typename T32 = TFromD<D32>>
2367 const Half<
decltype(d32)> d32h;
2375template <
class D32,
typename T16,
typename T32 = TFromD<D32>>
2379 const Half<
decltype(d32)> d32h;
2386template <
typename TW>
2397template <
class D,
typename T = TFromD<D>>
2399 const Half<
decltype(
d)> dh;
2406template <
class D,
typename T = TFromD<D>>
2408 const Half<
decltype(
d)> dh;
2415template <
class D,
typename T = TFromD<D>>
2417 const Half<
decltype(
d)> dh;
2424template <
class D,
typename T = TFromD<D>>
2426 const Half<
decltype(
d)> dh;
2433template <
class D,
typename T = TFromD<D>>
2435 const Half<
decltype(
d)> dh;
2442template <
class D,
typename T = TFromD<D>>
2444 const Half<
decltype(
d)> dh;
2451template <
class D,
typename T = TFromD<D>>
2453 const Half<
decltype(
d)> dh;
2460template <
class D,
typename T = TFromD<D>>
2462 const Half<
decltype(
d)> dh;
2469template <
class D,
typename T = TFromD<D>>
2471 const Half<
decltype(
d)> dh;
2478template <
class D,
typename T = TFromD<D>>
2480 const Half<
decltype(
d)> dh;
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_IF_T_SIZE(T, bytes)
Definition base.h:639
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
Definition base.h:645
#define HWY_ASSERT(condition)
Definition base.h:237
#define HWY_IF_LANES_LE(kN, lanes)
Definition base.h:617
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
Definition base.h:635
Definition arm_neon-inl.h:865
Raw raw
Definition arm_neon-inl.h:878
Definition arm_neon-inl.h:813
Raw raw
Definition arm_neon-inl.h:851
Definition wasm_256-inl.h:27
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition wasm_256-inl.h:55
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition wasm_256-inl.h:49
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition wasm_256-inl.h:43
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition wasm_256-inl.h:40
HWY_INLINE Vec256 & operator%=(const Vec256 other)
Definition wasm_256-inl.h:46
Vec128< T > v1
Definition wasm_256-inl.h:60
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition wasm_256-inl.h:52
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition wasm_256-inl.h:37
static constexpr size_t kPrivateN
Definition wasm_256-inl.h:30
Vec128< T > v0
Definition wasm_256-inl.h:59
T PrivateT
Definition wasm_256-inl.h:29
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition wasm_256-inl.h:34
HWY_API Vec32< T > ShuffleTwo1230(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:927
HWY_INLINE void LoadTransposedBlocks4(D d, const TFromD< D > *HWY_RESTRICT unaligned, VFromD< D > &vA, VFromD< D > &vB, VFromD< D > &vC, VFromD< D > &vD)
Definition generic_ops-inl.h:1477
HWY_INLINE void StoreTransposedBlocks3(VFromD< D > A, VFromD< D > B, VFromD< D > C, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1652
HWY_INLINE void StoreTransposedBlocks2(VFromD< D > A, VFromD< D > B, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1616
HWY_INLINE void StoreTransposedBlocks4(VFromD< D > vA, VFromD< D > vB, VFromD< D > vC, VFromD< D > vD, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:2003
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD< D > *HWY_RESTRICT unaligned, VFromD< D > &A, VFromD< D > &B, VFromD< D > &C)
Definition generic_ops-inl.h:1279
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_INLINE VFromD< D > Max128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9480
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API Vec256< T > TableLookupLanesOr0(Vec256< T > v, Indices256< T > idx)
Definition wasm_256-inl.h:1119
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API VFromD< DTo > ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD< DFrom > v)
Definition generic_ops-inl.h:162
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > ShiftLeftLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5268
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API VFromD< D > SlideDownBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7046
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API V BitwiseIfThenElse(V mask, V yes, V no)
Definition arm_neon-inl.h:2799
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > LoadExpand(MFromD< D > mask, D d, const TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_sve-inl.h:5655
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_INLINE VFromD< D > Min128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9475
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API VFromD< D > PromoteUpperTo(D d, V v)
Definition arm_sve-inl.h:2228
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
HWY_API VFromD< D > SlideUpBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7028
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_SIGNED_V(V)
Definition ops/shared-inl.h:616
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_V_SIZE_V(V, bytes)
Definition ops/shared-inl.h:632
#define HWY_IF_V_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:607
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_D(D)
Definition ops/shared-inl.h:536
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition wasm_256-inl.h:1085
__v128_u i0
Definition wasm_256-inl.h:1086
__v128_u i1
Definition wasm_256-inl.h:1087
Definition wasm_256-inl.h:64
Mask128< T > m1
Definition wasm_256-inl.h:66
Mask128< T > m0
Definition wasm_256-inl.h:65
Definition ops/shared-inl.h:198
int VFromD
Definition tuple-inl.h:25