    return *this = (*this * other);
    return *this = (*this / other);
    return *this = (*this + other);
    return *this = (*this - other);
    return *this = (*this % other);
    return *this = (*this & other);
    return *this = (*this | other);
    return *this = (*this ^ other);
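// [Illustrative sketch, not from the Highway sources] Each compound
// assignment above forwards to the corresponding free binary operator on
// Vec1. A minimal usage example, assuming the single-lane (scalar) target:
//
//   const hwy::HWY_NAMESPACE::Sisd<float> d;
//   auto v = Set(d, 2.0f);
//   v *= Set(d, 3.0f);  // now 6.0f, via operator*
//   v += Set(d, 1.0f);  // now 7.0f, via operator+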
using TFromV = typename V::PrivateT;

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
  static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined");
  CopyBytes<sizeof(TTo)>(&v.raw, &to);
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
  return Vec1<T>(ConvertScalarTo<T>(0));

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
  return Vec1<T>(static_cast<T>(t));

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
  return Vec1<T>(static_cast<T>(first));
template <class D, typename FromV>
  using TFrom = TFromV<FromV>;
  using TTo = TFromD<D>;
  constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
  CopyBytes<kCopyLen>(&v.raw, &to);

template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
template <class D, HWY_IF_T_SIZE_D(D, 1)>
    TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>,
    TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>,
    TFromD<D>, TFromD<D>) {

template <class D, HWY_IF_T_SIZE_D(D, 2)>
    TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>, TFromD<D>) {

template <class D, HWY_IF_T_SIZE_D(D, 4)>
    TFromD<D>, TFromD<D>) {

template <class D, HWY_IF_T_SIZE_D(D, 8)>
  return Xor(x1, Xor(x2, x3));

  return Or(o1, Or(o2, o3));

  return Or(o, And(a1, a2));
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

template <class D, typename T = TFromD<D>>

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;

  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

  return mask.bits ? yes : no;

  return mask.bits ? yes : Vec1<T>(ConvertScalarTo<T>(0));

  return mask.bits ? Vec1<T>(ConvertScalarTo<T>(0)) : no;

  const auto vi = BitCast(di, v);
  return vi.raw < 0 ? yes : no;
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

template <int kBits, typename T>
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");

template <int kBits, typename T>
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");

template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));

  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
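// [Illustrative note, not from the Highway sources] The additions and
// subtractions above are performed in uint64_t so that narrow or signed lane
// types cannot trigger signed-overflow undefined behavior; the final cast
// back to T wraps modulo 2^(8 * sizeof(T)), matching SIMD wraparound
// semantics. For example, with T = uint8_t: 200 + 100 = 300, and
// 300 & 0xFF = 44.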
template <typename T, HWY_IF_NOT_FLOAT(T)>

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MIN(a.raw, b.raw));

template <typename T, HWY_IF_NOT_FLOAT(T)>

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MAX(a.raw, b.raw));

template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>

template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  return Zero(Sisd<T>()) - v;
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif

#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

template <typename T, HWY_IF_FLOAT(T)>

template <typename T, HWY_IF_NOT_FLOAT(T)>
  return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) *
                                static_cast<uint64_t>(b.raw)));

template <typename T, HWY_IF_FLOAT(T)>
  return Vec1<T>(a.raw / b.raw);

      (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8)));

template <class T, HWY_IF_UI64(T)>

  const TW a_wide = a.raw;
template <typename T, HWY_IF_FLOAT(T)>

template <typename T, HWY_IF_FLOAT(T)>
  return mul * x + add;

template <typename T, HWY_IF_FLOAT(T)>
  return add - mul * x;

template <typename T, HWY_IF_FLOAT(T)>
  return mul * x - sub;

template <typename T, HWY_IF_FLOAT(T)>
  return Neg(mul) * x - sub;
  const float half = f * 0.5f;
  bits = 0x5F3759DF - (bits >> 1);
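// [Illustrative note, not from the Highway sources] 0x5F3759DF is the classic
// "fast inverse square root" constant: reinterpreting the float as an integer
// and computing magic - (bits >> 1) gives a rough estimate of 1/sqrt(f). The
// surrounding function (elided in this listing) then refines the estimate,
// typically with one Newton-Raphson step of the form
// estimate * (1.5f - half * estimate * estimate).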
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  CopyBytes<sizeof(bits)>(&v, &bits);
  bits = (1 << 29) + (bits >> 1) - (1 << 22);
  CopyBytes<sizeof(bits)>(&bits, &v);

#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  CopyBytes<sizeof(bits)>(&v, &bits);
  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
  CopyBytes<sizeof(bits)>(&bits, &v);
  if (!(Abs(v).raw < MantissaEnd<T>())) {
  const T k0 = ConvertScalarTo<T>(0);
  const T bias = ConvertScalarTo<T>(v.raw < k0 ? -0.5 : 0.5);
  const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
  if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
                           ConvertScalarTo<T>(0.5)) {
    offset = v.raw < k0 ? -1 : 1;
  return Vec1<T>(ConvertScalarTo<T>(rounded - offset));
  if (!(abs < MantissaEnd<T>())) {
    if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
      return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
      ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
  const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
  if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
                           ConvertScalarTo<T>(0.5)) {
    offset = is_sign ? -1 : 1;

  if (!(Abs(v).raw <= MantissaEnd<T>())) {
  const TI truncated = ConvertScalarTo<TI>(v.raw);
  return Vec1<T>(ConvertScalarTo<T>(truncated));
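// [Illustrative note, not from the Highway sources] The Round/NearestInt
// logic above implements round-half-to-even: adding a +/-0.5 bias and
// truncating rounds halfway cases away from zero, so the (rounded & 1) check
// subtracts the offset again whenever the result is odd and the input was
// exactly halfway. Examples: 1.5 -> 2 (already even), 2.5 -> 3 - 1 = 2,
// 0.5 -> 1 - 1 = 0.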
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;
  const bool positive = f > Float(0.0);
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  if (exponent >= kMantissaBits) return v;
  if (exponent < 0) return positive ? V(1) : V(-0.0);
  const Bits mantissa_mask = kMantissaMask >> exponent;
  if ((bits & mantissa_mask) == 0) return v;
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;
  const bool negative = f < Float(0.0);
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  if (exponent >= kMantissaBits) return v;
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
  const Bits mantissa_mask = kMantissaMask >> exponent;
  if ((bits & mantissa_mask) == 0) return v;
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;
  return Ceiling<float, uint32_t, 23, 8>(v);
  return Ceiling<double, uint64_t, 52, 11>(v);
  return Floor<float, uint32_t, 23, 8>(v);
  return Floor<double, uint64_t, 52, 11>(v);
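// [Illustrative worked example, not from the Highway sources] Ceiling for
// float 2.5f (bits 0x40200000): the unbiased exponent is 1, so
// mantissa_mask = 0x7FFFFF >> 1 = 0x3FFFFF and the low mantissa bits are
// nonzero, i.e. the value is not yet integral. Because it is positive,
// bits += 0x800000 >> 1 = 0x400000 gives 0x40600000 (3.5f), and clearing the
// masked mantissa bits yields 0x40400000, which is 3.0f.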
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

  return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
  return Load(d, aligned);
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

template <class D, typename T = TFromD<D>>
                       size_t max_lanes_to_load) {
  return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d);

template <class D, typename T = TFromD<D>>
                         size_t max_lanes_to_load) {
  return (max_lanes_to_load > 0) ? Load(d, p) : no;
template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
  if (!m.bits) return;
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

template <class D, typename T = TFromD<D>>
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 0) {
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <class D, typename T = TFromD<D>>
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);

template <class D, typename T = TFromD<D>>
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);

template <class D, typename T = TFromD<D>>
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
  return Store(v, d, aligned);
#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

template <class D, typename T = TFromD<D>, typename TI>
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  Store(v, d, reinterpret_cast<T*>(addr));

template <class D, typename T = TFromD<D>, typename TI>
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

template <class D, typename T = TFromD<D>, typename TI>
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif

template <class D, typename T = TFromD<D>>
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  return Load(d, reinterpret_cast<const T*>(addr));

template <class D, typename T = TFromD<D>>
  return Load(d, base + index.raw);

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
template <class ToT, class FromT>
  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
template <class ToT, class ToTypeTag, class FromT>
  return ConvertScalarTo<ToT>(val);

  return CastValueForF2IConv<ToT>(val);

  return CastValueForF2IConv<ToT>(val);
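// [Illustrative note, not from the Highway sources] CastValueForF2IConv above
// decides whether a floating-point value fits in the target integer type by
// comparing the biased exponent of |val| against the smallest exponent that
// is already out of range (kMinOutOfRangeExpField). In-range values are cast
// directly; anything else (including NaN) takes the other branch, which
// builds a saturated result from LimitsMax<ToT>() instead of performing an
// undefined float-to-integer cast.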
template <class ToT, class FromT>
  using FromTU = MakeUnsigned<FromT>;
  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
  static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
      detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

template <class DTo, HWY_IF_UI64_D(DTo)>
  return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));

template <class D, HWY_IF_F32_D(D)>
  if (IsInf(from).bits ||

template <class D, HWY_IF_UI32_D(D)>
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
  return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
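// [Illustrative note, not from the Highway sources] This unsigned demotion
// saturates via HWY_MIN: demoting a uint16_t lane holding 300 to uint8_t
// yields 255 rather than wrapping to 44.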
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)>
  return Vec1<TTo>(static_cast<TTo>(from.raw));
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif

template <class D32, HWY_IF_UI32_D(D32)>
                                      VFromD<Rebind<double, D32>> v) {
  using TTo = TFromD<D32>;
  return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>

template <class D, HWY_IF_F32_D(D)>

template <class DTo, typename TFrom>

template <class D, HWY_IF_F16_D(D)>

#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

template <class D, HWY_IF_BF16_D(D)>
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_FLOAT(TFrom)>
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  return Vec1<TTo>(detail::CastValueForF2IConv<TTo>(from.raw));

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_NOT_FLOAT(TFrom)>
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  return Vec1<TTo>(static_cast<TTo>(from.raw));
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#else
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#endif

  using TTo = TFromD<DI>;
  return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
template <class D, HWY_IF_U8_D(D)>

template <class D, HWY_IF_U16_D(D)>

template <class D, HWY_IF_U32_D(D)>

template <class D, HWY_IF_U8_D(D)>

template <class D, HWY_IF_U16_D(D)>

template <class D, HWY_IF_U8_D(D)>
template <typename T>

template <class D, typename T = TFromD<D>>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <typename T>

template <class D, typename T = TFromD<D>, typename TI>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI>

template <typename T>

template <typename T>
  return (idx.raw == 0) ? a : b;

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>

#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif
  const uint32_t val{v.raw};
      static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu)));

  const uint32_t val = v.raw;
      ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) |
      ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu)));

  const uint64_t val = v.raw;
      ((val << 56) & 0xFF00000000000000u) |
      ((val << 40) & 0x00FF000000000000u) |
      ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) |
      ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) |
      ((val >> 40) & 0x000000000000FF00u) |
      ((val >> 56) & 0x00000000000000FFu)));
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#else
#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
#endif

  constexpr TU kMaxUnsignedVal{LimitsMax<TU>()};
  constexpr TU kShrMask1 =
      static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal);
  constexpr TU kShrMask2 =
      static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal);
  constexpr TU kShrMask3 =
      static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal);
  constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1);
  constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2);
  constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3);
  TU result = static_cast<TU>(val);
  result = static_cast<TU>(((result << 1) & kShlMask1) |
                           ((result >> 1) & kShrMask1));
  result = static_cast<TU>(((result << 2) & kShlMask2) |
                           ((result >> 2) & kShrMask2));
  result = static_cast<TU>(((result << 4) & kShlMask3) |
                           ((result >> 4) & kShrMask3));
  return static_cast<T>(result);
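// [Illustrative worked example, not from the Highway sources] The three
// mask/shift stages above reverse the bit order within each byte: adjacent
// bits are swapped, then adjacent 2-bit pairs, then the two nibbles. For the
// byte 0xB1 (1011'0001) the intermediate values are 0x72, then 0xD8, and
// finally 0x8D (1000'1101), i.e. 0xB1 with its bits reversed.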
template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)>

template <class V, HWY_IF_SIGNED_V(V)>

template <typename D>

template <typename D>

template <int kLane, typename T>
  static_assert(kLane == 0, "Scalar only has one lane");
template <typename T, typename TI>
  uint8_t in_bytes[sizeof(T)];
  uint8_t idx_bytes[sizeof(T)];
  uint8_t out_bytes[sizeof(T)];
  CopyBytes<sizeof(T)>(&in, &in_bytes);
  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
  for (size_t i = 0; i < sizeof(T); ++i) {
    out_bytes[i] = in_bytes[idx_bytes[i]];
  CopyBytes<sizeof(TI)>(&out_bytes, &out);
template <typename T, typename TI>
  uint8_t in_bytes[sizeof(T)];
  uint8_t idx_bytes[sizeof(T)];
  uint8_t out_bytes[sizeof(T)];
  CopyBytes<sizeof(T)>(&in, &in_bytes);
  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
  for (size_t i = 0; i < sizeof(T); ++i) {
    out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
  CopyBytes<sizeof(TI)>(&out_bytes, &out);
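// [Illustrative note, not from the Highway sources] The `& 0x80` test mirrors
// byte-shuffle instructions such as x86 PSHUFB, where an index byte with its
// high bit set produces a zero output byte instead of a table lookup; this is
// what distinguishes TableLookupBytesOr0 from TableLookupBytes above.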
template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>>
  return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw));

template <class D, typename T = TFromD<D>>
  return mask.bits == 0;

template <class D, typename T = TFromD<D>>
  return mask.bits != 0;
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>

template <class D, HWY_IF_LANES_D(D, 1)>
  return MFromD<D>::FromBool((mask_bits & 1) != 0);

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
  return mask.bits == 0 ? 0 : 1;

template <class D, typename T = TFromD<D>>
  return mask.bits == 0 ? -1 : 0;

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
  return mask.bits == 0 ? -1 : 0;

template <class D, typename T = TFromD<D>>
template <typename T>
struct CompressIsPartition {

template <typename T>

template <typename T>

template <class D, typename T = TFromD<D>>

template <class D, typename T = TFromD<D>>
  if (!mask.bits) return 0;

template <typename T>

template <class D, typename T = TFromD<D>>
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif

template <typename T>

template <class D32, HWY_IF_F32_D(D32)>

template <class D32, HWY_IF_I32_D(D32)>
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#else
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#endif

template <class DI32, HWY_IF_I32_D(DI32)>
                                               VFromD<Rebind<int16_t, DI32>> a,
                                               VFromD<Rebind<int16_t, DI32>> b,
  const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
                             static_cast<int32_t>(b.raw));
  const auto mul_overflow =
                                 Add(product2, mul_overflow));
#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#else
#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#endif

template <class DI16, HWY_IF_I16_D(DI16)>
                                  static_cast<int16_t>(b.raw));

template <class D32, HWY_IF_F32_D(D32)>

template <class D32, HWY_IF_I32_D(D32)>

template <class DU32, HWY_IF_U32_D(DU32)>

template <typename TW>