#if HWY_COMPILER_GCC_ACTUAL
                    ignored "-Wmaybe-uninitialized")

#include <avx2intrin.h>
#include <bmi2intrin.h>
#include <f16cintrin.h>

#include "hwy/ops/shared-inl.h"

#include "hwy/ops/x86_128-inl.h"
  static constexpr size_t kPrivateN = 32 / sizeof(T);

  HWY_INLINE Vec256& operator*=(const Vec256 other) { return *this = (*this * other); }
  HWY_INLINE Vec256& operator/=(const Vec256 other) { return *this = (*this / other); }
  HWY_INLINE Vec256& operator+=(const Vec256 other) { return *this = (*this + other); }
  HWY_INLINE Vec256& operator-=(const Vec256 other) { return *this = (*this - other); }
  HWY_INLINE Vec256& operator%=(const Vec256 other) { return *this = (*this % other); }
  HWY_INLINE Vec256& operator&=(const Vec256 other) { return *this = (*this & other); }
  HWY_INLINE Vec256& operator|=(const Vec256 other) { return *this = (*this | other); }
  HWY_INLINE Vec256& operator^=(const Vec256 other) { return *this = (*this ^ other); }
#if HWY_TARGET <= HWY_AVX3
template <size_t size>

#if HWY_TARGET <= HWY_AVX3

using Full256 = Simd<T, 32 / sizeof(T), 0>;

  return _mm256_castph_si256(v);
  return _mm256_castpd_si256(v);

#if HWY_AVX3_HAVE_F32_TO_BF16C
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
  return reinterpret_cast<__m256i>(v);
  return BitCastScalar<__m256i>(v);
template <class D, HWY_IF_V_SIZE_D(D, 32)>
template <class D, HWY_IF_V_SIZE_D(D, 32), typename FromT>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
  return VFromD<D>{_mm256_set1_epi8(static_cast<char>(t))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
  return VFromD<D>{_mm256_set1_epi16(static_cast<short>(t))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  return VFromD<D>{_mm256_set1_epi32(static_cast<int>(t))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  return VFromD<D>{_mm256_set1_epi64x(static_cast<long long>(t))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
HWY_API Vec256<float16_t> Set(D, float16_t t) {
  return Vec256<float16_t>{_mm256_set1_ph(t)};
}
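// Example (added for illustration, not part of the original header): typical
// use of the Set overloads above through Highway's public API. Assumes the
// usual alias `namespace hn = hwy::HWY_NAMESPACE` and an AVX2-capable target.
//   const hn::Full256<int32_t> d;
//   const auto v42 = hn::Set(d, 42);                  // all 8 int32 lanes = 42
//   const auto half = hn::Set(hn::Full256<float>(), 0.5f);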
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

  return VFromD<D>{_mm256_undefined_si256()};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  const DFromV<decltype(v)> d_from;
  const Half<decltype(d_from)> dh_from;

  return BitCast(d, Vec256<uint8_t>{_mm256_castsi128_si256(
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, TFromD<D> t5, TFromD<D> t6,
    TFromD<D> t7, TFromD<D> t8, TFromD<D> t9, TFromD<D> t10, TFromD<D> t11,
    TFromD<D> t12, TFromD<D> t13, TFromD<D> t14,
      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
      static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
      static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
      static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
      static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
      static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
      static_cast<char>(t14), static_cast<char>(t15))};
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, TFromD<D> t5, TFromD<D> t6,
      _mm256_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
                        static_cast<int16_t>(t6), static_cast<int16_t>(t7),
                        static_cast<int16_t>(t0), static_cast<int16_t>(t1),
                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
                        static_cast<int16_t>(t6), static_cast<int16_t>(t7))};

template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 32)>
    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, TFromD<D> t5, TFromD<D> t6,
  return VFromD<D>{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
                                  t3, t4, t5, t6, t7)};
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
    TFromD<D> t2, TFromD<D> t3) {
      _mm256_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
    TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)};

template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
      _mm256_setr_epi64x(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
                         static_cast<int64_t>(t0), static_cast<int64_t>(t1))};

template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
  return VFromD<D>{_mm256_setr_pd(t0, t1, t0, t1)};
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {

HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  const DFromV<decltype(mask)> d;

HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {

HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {

#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
  return Xor(v, BitCast(d, Vec256<TU>{_mm256_set1_epi32(-1)}));

HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
  return Xor(x1, Xor(x2, x3));

HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
  return Or(o1, Or(o2, o3));

HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
  return Or(o, And(a1, a2));
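// Illustrative sketch (not in the original source): the bitwise helpers above
// compose like scalar expressions, and Xor3/Or3/OrAnd fold to a single
// ternary-logic instruction on AVX3. `a`, `b`, `c`, `o`, `a1`, `a2` below are
// hypothetical Vec256 values of the same lane type.
//   const auto mixed = hn::Xor3(a, b, c);       // a ^ b ^ c
//   const auto merged = hn::OrAnd(o, a1, a2);   // o | (a1 & a2)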
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  using VU = VFromD<decltype(du)>;

#if HWY_TARGET <= HWY_AVX3_DL
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#define HWY_NATIVE_POPCNT

#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>

                                     Vec256<float16_t> yes,
                                     Vec256<float16_t> no) {
  return Vec256<float16_t>{_mm256_mask_blend_ph(mask.raw, no.raw, yes.raw)};

template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>

#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, HWY_IF_T_SIZE(T, 1)>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask256<T>{static_cast<__mmask32>(_knot_mask32(m.raw))};
  return Mask256<T>{static_cast<__mmask32>(~m.raw)};

template <typename T, HWY_IF_T_SIZE(T, 2)>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask256<T>{static_cast<__mmask16>(_knot_mask16(m.raw))};
  return Mask256<T>{static_cast<__mmask16>(~m.raw)};

template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask256<T>{static_cast<__mmask8>(_knot_mask8(m.raw))};
  return Mask256<T>{static_cast<__mmask8>(~m.raw)};
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {

template <typename T>
template <typename T>

template <class D, HWY_IF_LANES_D(D, 32)>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const __mmask32 combined_mask = _mm512_kunpackw(
      static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw));
  const auto combined_mask =
      ((static_cast<uint32_t>(hi.raw) << 16) | (lo.raw & 0xFFFFu));
  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
template <class D, HWY_IF_LANES_D(D, 16)>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16);
  const auto shifted_mask = static_cast<uint32_t>(m.raw) >> 16;
  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};

template <class D, HWY_IF_LANES_D(D, 32)>
  using RawM = decltype(MFromD<D>().raw);
#if HWY_COMPILER_HAS_MASK_INTRINSICS
      static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
  return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};

template <class D, HWY_IF_LANES_D(D, 32)>
  using RawM = decltype(MFromD<D>().raw);
#if HWY_COMPILER_HAS_MASK_INTRINSICS
      static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
  return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
template <typename T>
  return Mask256<T>{v.raw};

template <typename T>
  return Vec256<T>{v.raw};

template <typename T, HWY_IF_NOT_FLOAT3264(T)>
  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};

  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};

                                  Vec256<double> no) {
  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
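// Illustrative sketch (not part of the original header): selecting lanes with
// a comparison mask; `d`, `x` and `limit` below are hypothetical.
//   const auto m = hn::Lt(x, limit);                 // per-lane x < limit
//   const auto clamped = hn::IfThenElse(m, x, limit);
// On AVX2 this lowers to the blendv instructions defined just above.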
template <typename T>
  const DFromV<decltype(yes)> d;

template <typename T>

template <typename T>
  static_assert(IsSigned<T>(), "Only for float");
  const auto zero = Zero(d);
template <typename T>

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {

template <typename T>
#if HWY_TARGET <= HWY_AVX3

template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};

template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");

template <typename T, HWY_IF_T_SIZE(T, 1)>

template <typename T, HWY_IF_UI16(T)>
  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};

template <typename T, HWY_IF_UI32(T)>
  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};

template <typename T, HWY_IF_UI64(T)>
  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};

                                        Vec256<float16_t> b) {
  return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
template <typename T, HWY_IF_T_SIZE(T, 1)>

template <typename T, HWY_IF_UI16(T)>
  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};

template <typename T, HWY_IF_UI32(T)>
  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};

template <typename T, HWY_IF_UI64(T)>
  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};

                                        Vec256<float16_t> b) {
  return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};

HWY_API Mask256<float16_t> operator>(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};

                                      Vec256<float16_t> b) {
  return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T, HWY_IF_NOT_FLOAT(T)>
template <typename T, HWY_IF_FLOAT(T)>

template <typename T, HWY_IF_T_SIZE(T, 1)>

template <typename T, HWY_IF_UI16(T)>
  return Vec256<T>{_mm256_movm_epi16(v.raw)};

template <typename T, HWY_IF_UI32(T)>
  return Vec256<T>{_mm256_movm_epi32(v.raw)};

template <typename T, HWY_IF_UI64(T)>
  return Vec256<T>{_mm256_movm_epi64(v.raw)};
  return Vec256<float16_t>{_mm256_castsi256_ph(_mm256_movm_epi16(v.raw))};

template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  const Full256<TFrom> dfrom;

template <typename T>
HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;

template <typename T, HWY_IF_T_SIZE(T, 1)>
  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};

template <typename T, HWY_IF_UI16(T)>
  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};

template <typename T, HWY_IF_UI32(T)>
  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};

template <typename T, HWY_IF_UI64(T)>
  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};

  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};

  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
template <typename T, HWY_IF_NOT_FLOAT3264(T)>

  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};

  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};

#if HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 903
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0

#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
  using i8x32 = signed char __attribute__((__vector_size__(32)));
  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
                                              reinterpret_cast<i8x32>(b.raw))};
  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};

                                 Vec256<int16_t> b) {
  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};

                                 Vec256<int32_t> b) {
  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};

                                 Vec256<int64_t> b) {
  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
template <typename T>
  const Full256<T> du;
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);

  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};

  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};

template <typename T>
  return detail::Gt(hwy::TypeTag<T>(), a, b);

template <typename T>
  return Not(Gt(tag, b, a));

template <typename T>
  return Not(Gt(tag, b, a));

  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};

  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};

template <typename T>
  return detail::Ge(hwy::TypeTag<T>(), a, b);

template <typename T>
template <typename T>
#if HWY_TARGET <= HWY_AVX3
  const auto msb = Set(du, 1ull << 63);

#if HWY_TARGET <= HWY_AVX3

HWY_API Vec256<float16_t> Min(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_min_ph(a.raw, b.raw)};

#if HWY_TARGET <= HWY_AVX3
  const auto msb = Set(du, 1ull << 63);

#if HWY_TARGET <= HWY_AVX3

HWY_API Vec256<float16_t> Max(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_max_ph(a.raw, b.raw)};
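// Illustrative sketch (not in the original source): Min/Max operate per lane;
// for 64-bit unsigned lanes on AVX2 they are emulated via the sign-bit flip
// shown above. `a` and `b` are hypothetical Vec256 values.
//   const auto lo = hn::Min(a, b);
//   const auto hi = hn::Max(a, b);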
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
      static_cast<char>(31), static_cast<char>(30), static_cast<char>(29),
      static_cast<char>(28), static_cast<char>(27), static_cast<char>(26),
      static_cast<char>(25), static_cast<char>(24), static_cast<char>(23),
      static_cast<char>(22), static_cast<char>(21), static_cast<char>(20),
      static_cast<char>(19), static_cast<char>(18), static_cast<char>(17),
      static_cast<char>(16), static_cast<char>(15), static_cast<char>(14),
      static_cast<char>(13), static_cast<char>(12), static_cast<char>(11),
      static_cast<char>(10), static_cast<char>(9), static_cast<char>(8),
      static_cast<char>(7), static_cast<char>(6), static_cast<char>(5),
      static_cast<char>(4), static_cast<char>(3), static_cast<char>(2),
      static_cast<char>(1), static_cast<char>(0))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
      int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, int16_t{11},
      int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, int16_t{5},
      int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
      _mm256_set_ph(float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12},
                    float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8},
                    float16_t{7}, float16_t{6}, float16_t{5}, float16_t{4},
                    float16_t{3}, float16_t{2}, float16_t{1}, float16_t{0})};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  return VFromD<D>{_mm256_set_epi32(int32_t{7}, int32_t{6}, int32_t{5},
                                    int32_t{4}, int32_t{3}, int32_t{2},
                                    int32_t{1}, int32_t{0})};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
      _mm256_set_epi64x(int64_t{3}, int64_t{2}, int64_t{1}, int64_t{0})};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
      _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  return VFromD<D>{_mm256_set_pd(3.0, 2.0, 1.0, 0.0)};
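// Illustrative sketch (not part of the original header): Iota builds the lane
// index vectors defined above and adds a starting value, e.g. for ramps.
//   const hn::Full256<float> df;
//   const auto ramp = hn::Iota(df, 10.0f);   // {10, 11, 12, ..., 17}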
template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>

template <class D, HWY_IF_V_SIZE_D(D, 32), class M = MFromD<D>>
#if HWY_TARGET <= HWY_AVX3
  const uint64_t all = (1ull << kN) - 1;
  return M::FromBits(_bzhi_u64(all, n));
  const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1);
  return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(n)));

  using TI = TFromD<decltype(di)>;
HWY_API Vec256<float16_t> operator+(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_add_ph(a.raw, b.raw)};

HWY_API Vec256<float16_t> operator-(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_sub_ph(a.raw, b.raw)};

  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};

#if HWY_TARGET <= HWY_AVX3
      static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
template <int kAOffset, int kBOffset>
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
      (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};

#if HWY_TARGET <= HWY_AVX3
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
      _mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const auto sum = a + b;
  const auto i32_max = Set(d, LimitsMax<int32_t>());
      i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, sum);

  const auto sum = a + b;
  const auto i64_max = Set(d, LimitsMax<int64_t>());
      i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, sum);

#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const auto diff = a - b;
  const auto i32_max = Set(d, LimitsMax<int32_t>());
      i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, diff);

  const auto diff = a - b;
  const auto i64_max = Set(d, LimitsMax<int64_t>());
      i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, diff);
#if HWY_COMPILER_MSVC
  const auto zero = Zero(d);

#if HWY_TARGET <= HWY_AVX3

#if HWY_TARGET <= HWY_AVX3_DL
template <typename T>
  return Vec256<T>{_mm256_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
#if HWY_TARGET > HWY_AVX3_DL
template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)>
  const Full256<T> d8;
  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
               : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

#if HWY_TARGET > HWY_AVX3_DL
  const Full256<uint8_t> d8;
  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);

  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
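// Illustrative sketch (not in the original source): 8-bit shifts have no
// dedicated AVX2 instruction, so the helpers above widen to 16-bit lanes,
// shift, then mask off bits that crossed byte boundaries. Usage, with a
// hypothetical uint8_t vector v8:
//   const auto doubled = hn::ShiftLeft<1>(v8);   // per-lane v << 1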
#if HWY_TARGET > HWY_AVX3_DL
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
  if (kBits == 0) return v;

  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
  if (kBits == 0) return v;

  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  if (kBits == 0) return v;

  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  if (kBits == 0) return v;
#if HWY_TARGET <= HWY_AVX3

template <class T, HWY_IF_UI32(T)>
template <class T, HWY_IF_UI32(T)>

template <class T, HWY_IF_UI64(T)>
HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};

template <class T, HWY_IF_UI64(T)>
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};

  return ShiftRight<15>(v);

  return ShiftRight<31>(v);
#if HWY_TARGET == HWY_AVX2

#if HWY_TARGET <= HWY_AVX3
  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  return right | sign;

template <typename T, HWY_IF_T_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");
#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
#if HWY_TARGET <= HWY_AVX3
  return IfThenElse(MaskFromVec(v), yes, no);
  const DFromV<decltype(v)> d;
  const RebindToFloat<decltype(d)> df;
  const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
  return Vec256<int8_t>{_mm256_sign_epi8(v.raw, mask.raw)};

HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
                                                       Vec256<int16_t> v) {
  return Vec256<int16_t>{_mm256_sign_epi16(v.raw, mask.raw)};

HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
                                                       Vec256<int32_t> v) {
  return Vec256<int32_t>{_mm256_sign_epi32(v.raw, mask.raw)};
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
  if (__builtin_constant_p(bits)) {
    return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, bits)};
  return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};

HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
  if (__builtin_constant_p(bits)) {
    return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, bits)};
  return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};

HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v,
  if (__builtin_constant_p(bits)) {
    return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, bits)};
  return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};

HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
  if (__builtin_constant_p(bits)) {
    return Vec256<int16_t>{_mm256_slli_epi16(v.raw, bits)};
  return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};

HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
  if (__builtin_constant_p(bits)) {
    return Vec256<int32_t>{_mm256_slli_epi32(v.raw, bits)};
  return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};

HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) {
  if (__builtin_constant_p(bits)) {
    return Vec256<int64_t>{_mm256_slli_epi64(v.raw, bits)};
  return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
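// Illustrative sketch (not part of the original header): ShiftLeftSame takes a
// run-time shift count shared by all lanes; the __builtin_constant_p branches
// above let compile-time-constant counts use the immediate-form instructions.
//   const int bits = ComputeShift();              // hypothetical runtime value
//   const auto shifted = hn::ShiftLeftSame(v, bits);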
template <typename T, HWY_IF_T_SIZE(T, 1)>
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));

  if (__builtin_constant_p(bits)) {
  if (__builtin_constant_p(bits)) {
  if (__builtin_constant_p(bits)) {

  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));

  if (__builtin_constant_p(bits)) {
  if (__builtin_constant_p(bits)) {

#if HWY_TARGET <= HWY_AVX3
  if (__builtin_constant_p(bits)) {
  return right | sign;

  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;
template <typename T>
template <typename T>
template <typename T>
template <typename T>

HWY_API Vec256<float16_t> operator*(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_mul_ph(a.raw, b.raw)};

HWY_API Vec256<float16_t> operator/(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_div_ph(a.raw, b.raw)};

  return Vec256<float16_t>{_mm256_rcp_ph(v.raw)};

  return Vec256<float>{_mm256_rcp_ps(v.raw)};
#if HWY_TARGET <= HWY_AVX3

#if HWY_TARGET <= HWY_AVX3
template <typename T, HWY_IF_U8(T)>
template <typename T, HWY_IF_I8(T)>
  return Vec256<T>{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U16(T)>
  return Vec256<T>{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I16(T)>
  return Vec256<T>{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U32(T)>
  return Vec256<T>{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I32(T)>
  return Vec256<T>{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U64(T)>
  return Vec256<T>{_mm256_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I64(T)>
  return Vec256<T>{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F32(T)>
  return Vec256<T>{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F64(T)>
  return Vec256<T>{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F16(T)>
  return Vec256<T>{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};

template <typename T, HWY_IF_U8(T)>
template <typename T, HWY_IF_I8(T)>
  return Vec256<T>{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U16(T)>
  return Vec256<T>{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I16(T)>
  return Vec256<T>{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U32(T)>
  return Vec256<T>{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I32(T)>
  return Vec256<T>{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U64(T)>
  return Vec256<T>{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I64(T)>
  return Vec256<T>{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F32(T)>
  return Vec256<T>{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F64(T)>
  return Vec256<T>{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F16(T)>
  return Vec256<T>{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_UI8(T)>
template <typename T, HWY_IF_UI16(T)>
  return Vec256<T>{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_UI32(T)>
  return Vec256<T>{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_UI64(T)>
  return Vec256<T>{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F32(T)>
  return Vec256<T>{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F64(T)>
  return Vec256<T>{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F16(T)>
  return Vec256<T>{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};

template <typename T, HWY_IF_UI8(T)>
template <typename T, HWY_IF_UI16(T)>
  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_UI32(T)>
  return Vec256<T>{_mm256_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_UI64(T)>
  return Vec256<T>{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F32(T)>
  return Vec256<T>{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F64(T)>
  return Vec256<T>{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_F16(T)>
  return Vec256<T>{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
                                         Mask256<float16_t> m,
                                         Vec256<float16_t> a,
                                         Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};

                                         Mask256<float16_t> m,
                                         Vec256<float16_t> a,
                                         Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};

template <typename T, HWY_IF_I8(T)>
template <typename T, HWY_IF_U8(T)>
  return Vec256<T>{_mm256_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I16(T)>
  return Vec256<T>{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U16(T)>
  return Vec256<T>{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};

template <typename T, HWY_IF_I8(T)>
template <typename T, HWY_IF_U8(T)>
  return Vec256<T>{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_I16(T)>
  return Vec256<T>{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
template <typename T, HWY_IF_U16(T)>
  return Vec256<T>{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
HWY_API Vec256<float16_t> MulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
                                 Vec256<float16_t> add) {
  return Vec256<float16_t>{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)};

HWY_API Vec256<float16_t> NegMulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
                                    Vec256<float16_t> add) {
  return Vec256<float16_t>{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)};

HWY_API Vec256<float16_t> MulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
                                 Vec256<float16_t> sub) {
  return Vec256<float16_t>{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)};

HWY_API Vec256<float16_t> NegMulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
                                    Vec256<float16_t> sub) {
  return Vec256<float16_t>{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)};

HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
                             Vec256<float> add) {
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x + add;
  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};

#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x + add;

                                Vec256<float> add) {
#ifdef HWY_DISABLE_BMI2_FMA
  return add - mul * x;
  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};

#ifdef HWY_DISABLE_BMI2_FMA
  return add - mul * x;

HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
                             Vec256<float> sub) {
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x - sub;
  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};

#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x - sub;

                                Vec256<float> sub) {
#ifdef HWY_DISABLE_BMI2_FMA
  return Neg(mul * x) - sub;
  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};

#ifdef HWY_DISABLE_BMI2_FMA
  return Neg(mul * x) - sub;

HWY_API Vec256<float16_t> MulAddSub(Vec256<float16_t> mul, Vec256<float16_t> x,
                                    Vec256<float16_t> sub_or_add) {
  return Vec256<float16_t>{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};

#ifdef HWY_DISABLE_BMI2_FMA
  return AddSub(mul * x, sub_or_add);

#ifdef HWY_DISABLE_BMI2_FMA
  return AddSub(mul * x, sub_or_add);

HWY_API Vec256<float16_t> Sqrt(Vec256<float16_t> v) {
  return Vec256<float16_t>{_mm256_sqrt_ph(v.raw)};

  return Vec256<float16_t>{_mm256_rsqrt_ph(v.raw)};

  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
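// Illustrative sketch (not in the original source): MulAdd computes mul*x+add
// in a single rounding step when FMA is available; otherwise the
// HWY_DISABLE_BMI2_FMA fallbacks above are used. `w`, `x`, `acc` are
// hypothetical vectors of the same float type.
//   acc = hn::MulAdd(w, x, acc);   // acc += w * x, fused when possible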
#if HWY_TARGET <= HWY_AVX3
#if HWY_COMPILER_MSVC

  return Vec256<float16_t>{_mm256_roundscale_ph(
      v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};

  return Vec256<float16_t>{
      _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};

HWY_API Vec256<float16_t> Ceil(Vec256<float16_t> v) {
  return Vec256<float16_t>{
      _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};

  return Vec256<float16_t>{
      _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
#if HWY_HAVE_FLOAT16 || HWY_IDE
  return Mask256<float16_t>{_mm256_fpclass_ph_mask(

                                      Vec256<float16_t> b) {
  return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};

  return Mask256<float16_t>{_mm256_fpclass_ph_mask(

  return Not(Mask256<float16_t>{_mm256_fpclass_ph_mask(

#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
      _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
  return Vec256<float16_t>{_mm256_load_ph(aligned)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  return VFromD<D>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
  return Vec256<float16_t>{_mm256_loadu_ph(p)};
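// Illustrative sketch (not part of the original header): Load requires a
// 32-byte-aligned pointer, LoadU accepts any alignment. `ptr` is hypothetical.
//   HWY_ALIGN float buf[8] = {0};
//   const hn::Full256<float> df;
//   const auto v_aligned = hn::Load(df, buf);
//   const auto v_any = hn::LoadU(df, ptr);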
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

#if HWY_TARGET <= HWY_AVX3

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
  return VFromD<D>{_mm256_maskz_loadu_epi8(m.raw, p)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
  return BitCast(d, VFromD<decltype(du)>{_mm256_maskz_loadu_epi16(m.raw, p)});

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  return VFromD<D>{_mm256_maskz_loadu_epi32(m.raw, p)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  return VFromD<D>{_mm256_maskz_loadu_epi64(m.raw, p)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
  return VFromD<D>{_mm256_mask_loadu_epi8(v.raw, m.raw, p)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
      _mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  return VFromD<D>{_mm256_mask_loadu_epi32(v.raw, m.raw, p)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  return VFromD<D>{_mm256_mask_loadu_epi64(v.raw, m.raw, p)};
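// Illustrative sketch (not in the original source): a common pattern combines
// FirstN with MaskedLoad to read a partial tail without overrunning a buffer;
// `remaining` and `ptr` below are hypothetical.
//   const auto m = hn::FirstN(d, remaining);
//   const auto tail = hn::MaskedLoad(m, d, ptr);   // masked-off lanes are zero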
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  auto pi = reinterpret_cast<const int*>(p);
  return VFromD<D>{_mm256_maskload_epi32(pi, m.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  auto pi = reinterpret_cast<const long long*>(p);
  return VFromD<D>{_mm256_maskload_epi64(pi, m.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  const Vec256<int32_t> mi =
  return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  const Vec256<int64_t> mi =
  return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
          _mm256_castsi128_si256(v128), v128, 1)});
  return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastsi128_si256(v128)});

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
  const __m128 v128 = LoadU(d128, p).raw;
      _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
  return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
  const __m128d v128 = LoadU(d128, p).raw;
      _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
      _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
  _mm256_store_ph(aligned, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_store_ps(aligned, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_store_pd(aligned, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
  _mm256_storeu_ph(p, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_storeu_ps(p, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_storeu_pd(p, v.raw);
#if HWY_TARGET <= HWY_AVX3

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
  _mm256_mask_storeu_epi8(p, m.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
  _mm256_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p),

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  _mm256_mask_storeu_epi32(p, m.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  _mm256_mask_storeu_epi64(p, m.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_mask_storeu_ps(p, m.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_mask_storeu_pd(p, m.raw, v.raw);
  using TU = TFromD<decltype(du)>;
  for (size_t i = 0; i < MaxLanes(d); ++i) {

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  auto pi = reinterpret_cast<int*>(p);
  _mm256_maskstore_epi32(pi, m.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  auto pi = reinterpret_cast<long long*>(p);
  _mm256_maskstore_epi64(pi, m.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  const Vec256<int32_t> mi =
  _mm256_maskstore_ps(p, mi.raw, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  const Vec256<int64_t> mi =
  _mm256_maskstore_pd(p, mi.raw, v.raw);
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), BitCast(du, v).raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_stream_ps(aligned, v.raw);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_stream_pd(aligned, v.raw);
                           Vec256<int32_t> offset) {
  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
                            VFromD<RebindToSigned<D>> index) {
  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
  _mm256_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, 4);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
                                  VFromD<RebindToSigned<D>> index) {
  _mm256_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, 8);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  _mm256_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, 4);

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
  _mm256_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, 8);
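// Illustrative sketch (not in the original source): ScatterIndex stores lane i
// of v to base[index[i]]; the masked variant skips masked-off lanes. `base`
// and `indices` are hypothetical.
//   hn::ScatterIndex(v, d, base, indices);   // indices use the signed lane type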
template <int kScale, typename T, HWY_IF_UI32(T)>
  return Vec256<T>{_mm256_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};

template <int kScale, typename T, HWY_IF_UI64(T)>
  return Vec256<T>{_mm256_i64gather_epi64(

template <int kScale>
template <int kScale>

template <class D, HWY_IF_V_SIZE_D(D, 32)>
                              VFromD<RebindToSigned<D>> offsets) {
  return detail::NativeGather256<1>(base, offsets);

template <class D, HWY_IF_V_SIZE_D(D, 32)>

template <int kScale, typename T, HWY_IF_UI32(T)>
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_mmask_i32gather_epi32(
      no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
  return Vec256<T>{_mm256_mask_i32gather_epi32(
      no.raw, reinterpret_cast<const int32_t*>(base), indices.raw, m.raw,
template <int kScale, typename T, HWY_IF_UI64(T)>
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_mmask_i64gather_epi64(
      BitCast(dd, no).raw, reinterpret_cast<const double*>(base),

template <int kScale>
#if HWY_TARGET <= HWY_AVX3
      _mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
      _mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};

template <int kScale>
#if HWY_TARGET <= HWY_AVX3
      _mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
      _mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
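// Illustrative sketch (not part of the original header): GatherIndex loads
// base[index[i]] into lane i; masked variants substitute `no` for lanes whose
// mask bit is clear. `base` and `indices` are hypothetical.
//   const auto gathered = hn::GatherIndex(d, base, indices);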
template <class D, HWY_IF_V_SIZE_D(D, 32)>

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  return VFromD<D>{_mm256_castsi256_si128(v.raw)};

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>

template <typename T>
  const Full128<T> dh;

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
  const Twice<decltype(du)> dut;
      _mm256_extracti128_si256(BitCast(dut, v).raw, 1)});

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>

template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  constexpr size_t kLanesPerBlock = 16 / sizeof(T);
  if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
  alignas(32) T lanes[32 / sizeof(T)];
template <typename T>
template <typename T>

template <int kBlockIdx, class T>
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");

#if !defined(HWY_HAVE_ZEXT)
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
#define HWY_HAVE_ZEXT 1
#define HWY_HAVE_ZEXT 0

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
#elif HWY_COMPILER_MSVC
  return VFromD<D>{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)};
  return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
  return Vec256<float16_t>{_mm256_zextph128_ph256(lo.raw)};
  const RebindToUnsigned<D> du;

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>

template <class DTo, class DFrom>
  const Twice<decltype(d_from)> dt_from;
  const Twice<decltype(dt_from)> dq_from;

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
  const Half<decltype(du)> dh_u;
                           lo256.raw, BitCast(dh_u, hi).raw, 1)});

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return VFromD<D>{_mm256_slli_si256(v.raw, kBytes)};

template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return VFromD<D>{_mm256_srli_si256(v.raw, kBytes)};

template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(

template <int kLane, typename T, HWY_IF_T_SIZE(T, 2)>
  using VU = VFromD<decltype(du)>;
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
    const __m256i lo = _mm256_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
    return BitCast(d, VU{_mm256_unpacklo_epi64(lo, lo)});
        _mm256_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
    return BitCast(d, VU{_mm256_unpackhi_epi64(hi, hi)});
template <int kLane, typename T, HWY_IF_UI32(T)>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};

template <int kLane, typename T, HWY_IF_UI64(T)>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};

  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

template <int kBlockIdx, class T>
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");

template <class T, HWY_IF_T_SIZE(T, 1)>

template <class T, HWY_IF_T_SIZE(T, 2)>
  const Half<decltype(d)> dh;

template <class T, HWY_IF_UI32(T)>
  return Vec256<T>{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)};

template <class T, HWY_IF_UI64(T)>
  return Vec256<T>{_mm256_broadcastq_epi64(LowerHalf(dh, v).raw)};
template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr,
          HWY_IF_NOT_T_SIZE(T, 8)>
  constexpr size_t kLanesPerBlock = 16 / sizeof(T);
  constexpr int kBlockIdx = static_cast<int>(kLaneIdx / kLanesPerBlock);
  constexpr int kLaneInBlkIdx =
      static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
  return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));

template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr,
  static_assert(kLaneIdx <= 3, "Invalid lane");
      _mm256_permute4x64_epi64(v.raw, static_cast<int>(0x55 * kLaneIdx))};

template <size_t kLaneIdx, hwy::EnableIf<kLaneIdx != 0>* = nullptr>
  static_assert(kLaneIdx <= 3, "Invalid lane");
      _mm256_permute4x64_pd(v.raw, static_cast<int>(0x55 * kLaneIdx))};

template <int kLaneIdx, class T>
  static_assert(kLaneIdx >= 0, "Invalid lane");
template <typename T, HWY_IF_UI32(T)>

template <typename T, HWY_IF_T_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);

template <typename T, HWY_IF_T_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);

template <typename T, HWY_IF_T_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);

template <typename T>

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1), typename TI>
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2), typename TI>
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
  const Full256<TI> di;
#if HWY_IS_DEBUG_BUILD

#if HWY_TARGET <= HWY_AVX3
  return Indices256<TFromD<D>>{vec.raw};

  using V8 = VFromD<decltype(d8)>;
  alignas(32) static constexpr uint8_t kByteOffsets[32] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};

  alignas(32) static constexpr uint8_t kBroadcastLaneBytes[32] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

  const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices)));
  return Indices256<TFromD<D>>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4), typename TI>
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Full256<TI> di;
  return Indices256<TFromD<D>>{vec.raw};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8), typename TI>
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
  const Rebind<TI, decltype(d)> di;
#if HWY_IS_DEBUG_BUILD

#if HWY_TARGET <= HWY_AVX3
  return Indices256<TFromD<D>>{idx64.raw};

  const Vec256<TI> dup =
      BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
  const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
  return Indices256<TFromD<D>>{idx32.raw};
template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
  const Rebind<TI, decltype(d)> di;

template <typename T, HWY_IF_T_SIZE(T, 1)>
#if HWY_TARGET <= HWY_AVX3_DL
  return Vec256<T>{_mm256_permutexvar_epi8(idx.raw, v.raw)};
  const auto sel_hi_mask =
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_mask_shuffle_epi8(
      lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)};
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);

template <typename T, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_SPECIAL_FLOAT(T)>
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutexvar_epi16(idx.raw, v.raw)};

                                        Indices256<float16_t> idx) {
  return Vec256<float16_t>{_mm256_permutexvar_ph(idx.raw, v.raw)};

template <typename T, HWY_IF_T_SIZE(T, 4)>
  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};

template <typename T, HWY_IF_T_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
#if HWY_TARGET <= HWY_AVX3
      BitCast(du, v).raw, idx.raw)});

template <typename T, HWY_IF_T_SIZE(T, 1)>
#if HWY_TARGET <= HWY_AVX3_DL
  const auto sel_hi_mask =
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);

template <typename T, HWY_IF_T_SIZE(T, 2)>
                                          Indices256<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutex2var_epi16(a.raw, idx.raw, b.raw)};
                                 Indices256<uint8_t>{idx.raw}));

template <typename T, HWY_IF_UI32(T)>
                                          Indices256<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutex2var_epi32(a.raw, idx.raw, b.raw)};
  const Vec256<T> idx_vec{idx.raw};
                IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));

                                                  Vec256<float16_t> b,
                                                  Indices256<float16_t> idx) {
  return Vec256<float16_t>{_mm256_permutex2var_ph(a.raw, idx.raw, b.raw)};

#if HWY_TARGET <= HWY_AVX3
  const auto sel_hi_mask =
  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);

template <typename T, HWY_IF_UI64(T)>
                                          Indices256<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutex2var_epi64(a.raw, idx.raw, b.raw)};
                                 Indices256<uint32_t>{idx.raw}));
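// Illustrative sketch (not part of the original header): TableLookupLanes
// permutes lanes across the whole 256-bit vector using indices built by
// SetTableIndices/IndicesFromVec above. `d` and `v` are hypothetical.
//   static constexpr int32_t kRev[8] = {7, 6, 5, 4, 3, 2, 1, 0};
//   const auto rev = hn::TableLookupLanes(v, hn::SetTableIndices(d, kRev));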
#if HWY_TARGET <= HWY_AVX3

template <typename T>
      BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
  alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
  alignas(32) static constexpr int64_t kReverse[4] = {3, 2, 1, 0};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) static constexpr int16_t kReverse[16] = {
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  const Vec256<int16_t> idx = Load(di, kReverse);
      _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
      _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
#if HWY_TARGET <= HWY_AVX3_DL
  alignas(32) static constexpr TFromD<D> kReverse[32] = {
      31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
      15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
  alignas(32) static constexpr TFromD<D> kReverse[32] = {
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4705template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4709 di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
4715template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
4723template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4727 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
4731template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
4736template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_unpacklo_epi8(a.raw, b.raw)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;  // for float16_t
  return BitCast(
      d, VU{_mm256_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
}
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_unpacklo_epi32(a.raw, b.raw)};
}
template <typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_unpacklo_epi64(a.raw, b.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{_mm256_unpackhi_epi8(a.raw, b.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;  // for float16_t
  return BitCast(
      d, VU{_mm256_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{_mm256_unpackhi_epi32(a.raw, b.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{_mm256_unpackhi_epi64(a.raw, b.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{_mm256_unpackhi_ps(a.raw, b.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{_mm256_unpackhi_pd(a.raw, b.raw)};
}
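// Illustrative behavior (not part of the original header): interleaving is
// per 128-bit block, not across the whole vector.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full256<uint32_t> d;
//   const auto a = hn::Iota(d, 0);   // 0 1 2 3 | 4 5 6 7
//   const auto b = hn::Iota(d, 8);   // 8 9 10 11 | 12 13 14 15
//   // InterleaveLower(a, b)    -> 0 8 1 9  | 4 12 5 13
//   // InterleaveUpper(d, a, b) -> 2 10 3 11 | 6 14 7 15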
4815template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4818 const Half<
decltype(
d)> d2;
4821 d,
VFromD<
decltype(du)>{_mm256_inserti128_si256(
4824template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4827 const Half<
decltype(
d)> d2;
4830template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4833 const Half<
decltype(
d)> d2;
4838template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4841 return BitCast(
d,
VFromD<
decltype(du)>{_mm256_permute2x128_si256(
4844template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4849template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4856template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4862template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4867template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4874template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4877 return BitCast(
d,
VFromD<
decltype(du)>{_mm256_permute2x128_si256(
4880template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4885template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4892template <
int kBlockIdx,
class T>
4894 static_assert(kBlockIdx == 0 || kBlockIdx == 1,
"Invalid block index");
4904template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
4907#if HWY_TARGET <= HWY_AVX3_DL
4908 alignas(32)
static constexpr uint8_t kIdx[32] = {
4909 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
4910 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
4912 d, Vec256<uint16_t>{_mm256_permutex2var_epi8(
4917 const Vec256<uint16_t> uH = ShiftRight<8>(
BitCast(dw, hi));
4918 const Vec256<uint16_t> uL = ShiftRight<8>(
BitCast(dw, lo));
4919 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
4920 return VFromD<D>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
4924template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4927#if HWY_TARGET <= HWY_AVX3
4928 alignas(32)
static constexpr uint16_t kIdx[16] = {
4929 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
4931 d, Vec256<uint16_t>{_mm256_permutex2var_epi16(
4936 const Vec256<uint32_t> uH = ShiftRight<16>(
BitCast(dw, hi));
4937 const Vec256<uint32_t> uL = ShiftRight<16>(
BitCast(dw, lo));
4938 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
4940 u16, _MM_SHUFFLE(3, 1, 2, 0))});
4944template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
4947#if HWY_TARGET <= HWY_AVX3
4948 alignas(32)
static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4950 d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
4954 const Vec256<float> v3131{_mm256_shuffle_ps(
4955 BitCast(df, lo).raw,
BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
4957 _MM_SHUFFLE(3, 1, 2, 0))};
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_TARGET <= HWY_AVX3
  alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
#else
  const VFromD<D> v3131{
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
                        BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif
}
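// Illustrative usage (not part of the original header):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full256<float> d;
//   const auto lo = hn::Iota(d, 0.0f);  // 0 1 2 3 4 5 6 7
//   const auto hi = hn::Iota(d, 8.0f);  // 8 9 10 11 12 13 14 15
//   // ConcatOdd(d, hi, lo)  -> 1 3 5 7 9 11 13 15
//   // ConcatEven(d, hi, lo) -> 0 2 4 6 8 10 12 14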
4975template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
4978#if HWY_TARGET <= HWY_AVX3
4979 alignas(64)
static constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
4981 d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
4985 const Vec256<double> v31{
4986 _mm256_shuffle_pd(
BitCast(df, lo).raw,
BitCast(df, hi).raw, 15)};
4988 _mm256_permute4x64_epi64(
BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
4992template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4994#if HWY_TARGET <= HWY_AVX3
4996 alignas(64)
static constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
4998 _mm256_permutex2var_pd(lo.
raw,
Load(du, kIdx).raw, hi.
raw)};
5003 _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
5009template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5012#if HWY_TARGET <= HWY_AVX3_DL
5013 alignas(64)
static constexpr uint8_t kIdx[32] = {
5014 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5015 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
5017 d, Vec256<uint32_t>{_mm256_permutex2var_epi8(
5022 const Vec256<uint16_t> mask =
Set(dw, 0x00FF);
5023 const Vec256<uint16_t> uH =
And(
BitCast(dw, hi), mask);
5024 const Vec256<uint16_t> uL =
And(
BitCast(dw, lo), mask);
5025 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
5026 return VFromD<D>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
5030template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5033#if HWY_TARGET <= HWY_AVX3
5034 alignas(64)
static constexpr uint16_t kIdx[16] = {
5035 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
5037 d, Vec256<uint32_t>{_mm256_permutex2var_epi16(
5042 const Vec256<uint32_t> mask =
Set(dw, 0x0000FFFF);
5043 const Vec256<uint32_t> uH =
And(
BitCast(dw, hi), mask);
5044 const Vec256<uint32_t> uL =
And(
BitCast(dw, lo), mask);
5045 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
5047 u16, _MM_SHUFFLE(3, 1, 2, 0))});
5051template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5054#if HWY_TARGET <= HWY_AVX3
5055 alignas(64)
static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
5057 d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
5061 const Vec256<float> v2020{_mm256_shuffle_ps(
5062 BitCast(df, lo).raw,
BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5064 _MM_SHUFFLE(3, 1, 2, 0))};
5069template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5072#if HWY_TARGET <= HWY_AVX3
5073 alignas(64)
static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
5074 return VFromD<D>{_mm256_permutex2var_ps(lo.raw,
Load(du, kIdx).raw, hi.raw)};
5077 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
5078 return BitCast(
d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
5079 BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
5084template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5087#if HWY_TARGET <= HWY_AVX3
5088 alignas(64)
static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
5090 d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
5094 const Vec256<double> v20{
5097 _mm256_permute4x64_epi64(
BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
5102template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5104#if HWY_TARGET <= HWY_AVX3
5106 alignas(64)
static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
5108 _mm256_permutex2var_pd(lo.
raw,
Load(du, kIdx).raw, hi.
raw)};
5113 _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
5119#if HWY_TARGET <= HWY_AVX3
5120template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5122#if HWY_TARGET <= HWY_AVX3_DL
5124 alignas(32)
static constexpr uint8_t kIdx[32] = {
5125 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
5126 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
5127 return VFromD<D>{_mm256_permutex2var_epi8(a.raw,
Load(du, kIdx).raw, b.raw)};
5133template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5136 alignas(32)
static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
5137 4, 20, 5, 21, 6, 22, 7, 23};
5139 d,
VFromD<
decltype(du)>{_mm256_permutex2var_epi16(
5143template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5146 alignas(32)
static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5147 return VFromD<D>{_mm256_permutex2var_epi32(a.raw,
Load(du, kIdx).raw, b.raw)};
5150template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5153 alignas(32)
static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5154 return VFromD<D>{_mm256_permutex2var_ps(a.raw,
Load(du, kIdx).raw, b.raw)};
5157template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5160 alignas(32)
static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
5161 return VFromD<D>{_mm256_permutex2var_epi64(a.raw,
Load(du, kIdx).raw, b.raw)};
5164template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5167 alignas(32)
static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
5168 return VFromD<D>{_mm256_permutex2var_pd(a.raw,
Load(du, kIdx).raw, b.raw)};
5171template <
class D, HWY_IF_V_SIZE_D(D, 32)>
5179#if HWY_TARGET <= HWY_AVX3
5180template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5182#if HWY_TARGET <= HWY_AVX3_DL
5184 alignas(32)
static constexpr uint8_t kIdx[32] = {
5185 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
5186 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
5187 return VFromD<D>{_mm256_permutex2var_epi8(a.raw,
Load(du, kIdx).raw, b.raw)};
5193template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5196 alignas(32)
static constexpr uint16_t kIdx[16] = {
5197 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
5199 d,
VFromD<
decltype(du)>{_mm256_permutex2var_epi16(
5203template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5206 alignas(32)
static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
5207 return VFromD<D>{_mm256_permutex2var_epi32(a.raw,
Load(du, kIdx).raw, b.raw)};
5210template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5213 alignas(32)
static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
5214 return VFromD<D>{_mm256_permutex2var_ps(a.raw,
Load(du, kIdx).raw, b.raw)};
5217template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5220 alignas(32)
static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
5221 return VFromD<D>{_mm256_permutex2var_epi64(a.raw,
Load(du, kIdx).raw, b.raw)};
5224template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5227 alignas(32)
static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
5228 return VFromD<D>{_mm256_permutex2var_pd(a.raw,
Load(du, kIdx).raw, b.raw)};
5231template <
class D, HWY_IF_V_SIZE_D(D, 32)>
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
HWY_API Vec256<float> DupEven(Vec256<float> v) {
  return Vec256<float>{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
HWY_API Vec256<float> DupOdd(Vec256<float> v) {
  return Vec256<float>{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const Full256<uint8_t> d8;
  const VFromD<decltype(d8)> mask =
      Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                          0xFF, 0, 0xFF, 0, 0xFF, 0);
  return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
}
template <typename T, HWY_IF_UI16(T)>
HWY_API Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
}
#if HWY_HAVE_FLOAT16
HWY_API Vec256<float16_t> OddEven(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{
      _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
}
#endif  // HWY_HAVE_FLOAT16
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
}
template <typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
}
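// Illustrative usage (not part of the original header): OddEven returns the
// odd lanes of `a` and the even lanes of `b`.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full256<uint32_t> d;
//   const auto a = hn::Set(d, 1u);
//   const auto b = hn::Set(d, 2u);
//   const auto r = hn::OddEven(a, b);  // lanes: 2 1 2 1 2 1 2 1 (lane 0 first)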
5318#if HWY_TARGET <= HWY_AVX3
5319template <
class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5321 return VFromD<D>{_mm256_mask_shuffle_epi32(
5322 a.raw,
static_cast<__mmask8
>(0xAA), b.raw,
5323 static_cast<_MM_PERM_ENUM
>(_MM_SHUFFLE(2, 2, 0, 0)))};
5325template <
class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5327 return VFromD<D>{_mm256_mask_shuffle_ps(a.raw,
static_cast<__mmask8
>(0xAA),
5329 _MM_SHUFFLE(2, 2, 0, 0))};
5332template <
class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5335 const VFromD<
decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
5336 BitCast(df, a).raw,
BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5338 d,
VFromD<
decltype(df)>{_mm256_shuffle_ps(
5339 b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5344template <
class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5351#if HWY_TARGET <= HWY_AVX3
5352template <
class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5354 return VFromD<D>{_mm256_mask_shuffle_epi32(
5355 b.raw,
static_cast<__mmask8
>(0x55), a.raw,
5356 static_cast<_MM_PERM_ENUM
>(_MM_SHUFFLE(3, 3, 1, 1)))};
5358template <
class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5360 return VFromD<D>{_mm256_mask_shuffle_ps(b.raw,
static_cast<__mmask8
>(0x55),
5362 _MM_SHUFFLE(3, 3, 1, 1))};
5365template <
class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5368 const VFromD<
decltype(df)> b3_b1_a3_a3{_mm256_shuffle_ps(
5369 BitCast(df, a).raw,
BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
5371 d,
VFromD<
decltype(df)>{_mm256_shuffle_ps(
5372 b3_b1_a3_a3.raw, b3_b1_a3_a3.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5377template <
class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5384template <
typename T, HWY_IF_NOT_FLOAT3264(T)>
5386 const DFromV<
decltype(odd)>
d;
5402template <
class D, HWY_IF_V_SIZE_D(D, 32)>
5410template <
typename T,
typename TI>
5412 const DFromV<
decltype(from)>
d;
5413 return BitCast(
d, Vec256<uint8_t>{_mm256_shuffle_epi8(
5414 BitCast(Full256<uint8_t>(), bytes).raw,
5415 BitCast(Full256<uint8_t>(), from).raw)});
5419template <
typename T,
typename TI,
size_t NI>
5421 const Full256<TI> di;
5422 const Half<
decltype(di)> dih;
5427 return Vec128<TI, NI>{
LowerHalf(dih, tbl_full).raw};
5431template <
typename T,
size_t N,
typename TI>
5443template <
int kLane,
class T, HWY_IF_T_SIZE(T, 1)>
5445 static_assert(0 <= kLane && kLane < 16,
"Invalid lane");
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  return BitCast(d, Vec256<uint32_t>{_mm256_set_epi32(
                        static_cast<int32_t>(x3), static_cast<int32_t>(x2),
                        static_cast<int32_t>(x1), static_cast<int32_t>(x0),
                        static_cast<int32_t>(x3), static_cast<int32_t>(x2),
                        static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
}
template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
  return V{_mm256_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))};

template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
  return V{_mm256_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))};

template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
  return V{_mm256_permute4x64_epi64(v.raw, static_cast<int>(kIdx3210 & 0xFF))};

template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
  return V{_mm256_permute4x64_pd(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
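// Note on the shuffle-control byte (illustrative, not part of the original
// header): _MM_SHUFFLE(i3, i2, i1, i0) packs four 2-bit lane indices into a
// single byte, with result lane n taking source lane i_n within each group of
// four. For example, the public Per4LaneBlockShuffle swaps adjacent lanes:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full256<uint32_t> d;
//   const auto v = hn::Iota(d, 0);                          // 0 1 2 3 | 4 5 6 7
//   const auto r = hn::Per4LaneBlockShuffle<2, 3, 0, 1>(v);  // 1 0 3 2 | 5 4 7 6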
5515#if HWY_TARGET <= HWY_AVX3
5516template <
int kI32Lanes,
class V, HWY_IF_V_SIZE_V(V, 32)>
5525template <
int kI64Lanes,
class V, HWY_IF_V_SIZE_V(V, 32)>
5534template <
int kI64Lanes,
class V, HWY_IF_V_SIZE_V(V, 32)>
5536 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5537 "kI64Lanes must be between 0 and 3");
5545 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5546 "kI64Lanes must be between 0 and 3");
5547 constexpr int kIdx0 = (-kI64Lanes) & 3;
5548 constexpr int kIdx1 = (-kI64Lanes + 1) & 3;
5549 constexpr int kIdx2 = (-kI64Lanes + 2) & 3;
5550 constexpr int kIdx3 = (-kI64Lanes + 3) & 3;
5551 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0);
5552 constexpr int kBlendMask = (1 << (kI64Lanes * 2)) - 1;
5555 return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210),
5556 Zero(
d).raw, kBlendMask)};
5562 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5563 "kI64Lanes must be between 0 and 3");
5564 constexpr int kIdx0 = (-kI64Lanes) & 3;
5565 constexpr int kIdx1 = (-kI64Lanes + 1) & 3;
5566 constexpr int kIdx2 = (-kI64Lanes + 2) & 3;
5567 constexpr int kIdx3 = (-kI64Lanes + 3) & 3;
5568 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0);
5569 constexpr int kBlendMask = (1 << kI64Lanes) - 1;
5573 return BitCast(
d, Vec256<double>{_mm256_blend_pd(
5574 _mm256_permute4x64_pd(
BitCast(dd, v).raw, kIdx3210),
5575 Zero(dd).raw, kBlendMask)});
5585 const auto idx_vec =
5586 Iota(du8,
static_cast<uint8_t
>(
size_t{0} - amt *
sizeof(
TFromD<D>)));
5589#if HWY_TARGET <= HWY_AVX3_DL
5598 ? ((1 << 2) | (1 << 8))
5602 using TU =
TFromD<
decltype(du)>;
5604 const auto idx =
Iota(du,
static_cast<TU
>(
size_t{0} - amt));
5605#if HWY_TARGET <= HWY_AVX3
5606 const auto masked_idx =
5610 const auto masked_idx =
And(idx,
Set(du,
static_cast<TU
>(
MaxLanes(
d) - 1)));
5616#if HWY_TARGET > HWY_AVX3
5617template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5619 const RepartitionToNarrow<D> dn;
5626template <
int kBlocks,
class D, HWY_IF_V_SIZE_D(D, 32)>
5628 static_assert(0 <= kBlocks && kBlocks <= 1,
5629 "kBlocks must be between 0 and 1");
5633template <
class D, HWY_IF_V_SIZE_D(D, 32)>
5635#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5636 constexpr size_t kLanesPerBlock = 16 /
sizeof(TFromD<D>);
5637 if (__builtin_constant_p(amt)) {
5639 switch (amt *
sizeof(TFromD<D>)) {
5643 return CombineShiftRightBytes<15>(
d, v, v_lo);
5645 return CombineShiftRightBytes<14>(
d, v, v_lo);
5647 return CombineShiftRightBytes<13>(
d, v, v_lo);
5649#if HWY_TARGET <= HWY_AVX3
5650 return detail::CombineShiftRightI32Lanes<7>(v,
Zero(
d));
5652 return CombineShiftRightBytes<12>(
d, v, v_lo);
5655 return CombineShiftRightBytes<11>(
d, v, v_lo);
5657 return CombineShiftRightBytes<10>(
d, v, v_lo);
5659 return CombineShiftRightBytes<9>(
d, v, v_lo);
5661 return detail::SlideUpI64Lanes<1>(v);
5663 return CombineShiftRightBytes<7>(
d, v, v_lo);
5665 return CombineShiftRightBytes<6>(
d, v, v_lo);
5667 return CombineShiftRightBytes<5>(
d, v, v_lo);
5669#if HWY_TARGET <= HWY_AVX3
5670 return detail::CombineShiftRightI32Lanes<5>(v,
Zero(
d));
5672 return CombineShiftRightBytes<4>(
d, v, v_lo);
5675 return CombineShiftRightBytes<3>(
d, v, v_lo);
5677 return CombineShiftRightBytes<2>(
d, v, v_lo);
5679 return CombineShiftRightBytes<1>(
d, v, v_lo);
5682#if HWY_TARGET <= HWY_AVX3
5684 return detail::CombineShiftRightI32Lanes<3>(v,
Zero(
d));
5687 return detail::SlideUpI64Lanes<3>(v);
5688#if HWY_TARGET <= HWY_AVX3
5690 return detail::CombineShiftRightI32Lanes<1>(v,
Zero(
d));
5695 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
5696 const Half<
decltype(
d)> dh;
5707template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5710 return CombineShiftRightBytes<15>(
d, v, v_lo);
5713template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5716 return CombineShiftRightBytes<14>(
d, v, v_lo);
5719template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
5721#if HWY_TARGET <= HWY_AVX3
5722 return detail::CombineShiftRightI32Lanes<7>(v,
Zero(
d));
5725 return CombineShiftRightBytes<12>(
d, v, v_lo);
5729template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5731 return detail::SlideUpI64Lanes<1>(v);
5738#if HWY_TARGET <= HWY_AVX3
5739template <
int kI64Lanes,
class V, HWY_IF_V_SIZE_V(V, 32)>
5741 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5742 "kI64Lanes must be between 0 and 3");
5744 return CombineShiftRightI64Lanes<kI64Lanes>(
Zero(
d), v);
5750 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5751 "kI64Lanes must be between 0 and 3");
5752 constexpr int kIdx1 = (kI64Lanes + 1) & 3;
5753 constexpr int kIdx2 = (kI64Lanes + 2) & 3;
5754 constexpr int kIdx3 = (kI64Lanes + 3) & 3;
5755 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes);
5756 constexpr int kBlendMask =
5757 static_cast<int>((0xFFu << ((4 - kI64Lanes) * 2)) & 0xFFu);
5760 return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210),
5761 Zero(
d).raw, kBlendMask)};
5767 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5768 "kI64Lanes must be between 0 and 3");
5769 constexpr int kIdx1 = (kI64Lanes + 1) & 3;
5770 constexpr int kIdx2 = (kI64Lanes + 2) & 3;
5771 constexpr int kIdx3 = (kI64Lanes + 3) & 3;
5772 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes);
5773 constexpr int kBlendMask = (0x0F << (4 - kI64Lanes)) & 0x0F;
5777 return BitCast(
d, Vec256<double>{_mm256_blend_pd(
5778 _mm256_permute4x64_pd(
BitCast(dd, v).raw, kIdx3210),
5779 Zero(dd).raw, kBlendMask)});
5789 auto idx_vec =
Iota(du8,
static_cast<uint8_t
>(amt *
sizeof(
TFromD<D>)));
5791#if HWY_TARGET <= HWY_AVX3_DL
5792 const auto result_mask = idx_vec <
Set(du8, uint8_t{32});
5794 _mm256_maskz_permutexvar_epi8(result_mask.raw, idx_vec.raw, v.raw)};
5799 Set(di8, int8_t{31}))));
5806 ? ((1 << 2) | (1 << 8))
5810 using TU =
TFromD<
decltype(du)>;
5812 const auto idx =
Iota(du,
static_cast<TU
>(amt));
5813 const auto masked_idx =
And(idx,
Set(du,
static_cast<TU
>(
MaxLanes(
d) - 1)));
5819#if HWY_TARGET > HWY_AVX3
5820template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5822 const RepartitionToNarrow<D> dn;
5823 return BitCast(
d, TableLookupSlideDownLanes(dn,
BitCast(dn, v), amt * 2));
5829template <
int kBlocks,
class D, HWY_IF_V_SIZE_D(D, 32)>
5831 static_assert(0 <= kBlocks && kBlocks <= 1,
5832 "kBlocks must be between 0 and 1");
5833 const Half<
decltype(
d)> dh;
5837template <
class D, HWY_IF_V_SIZE_D(D, 32)>
5839#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5840 constexpr size_t kLanesPerBlock = 16 /
sizeof(TFromD<D>);
5841 const Half<
decltype(
d)> dh;
5842 if (__builtin_constant_p(amt)) {
5844 switch (amt *
sizeof(TFromD<D>)) {
5848 return CombineShiftRightBytes<1>(
d, v_hi, v);
5850 return CombineShiftRightBytes<2>(
d, v_hi, v);
5852 return CombineShiftRightBytes<3>(
d, v_hi, v);
5854#if HWY_TARGET <= HWY_AVX3
5855 return detail::CombineShiftRightI32Lanes<1>(
Zero(
d), v);
5857 return CombineShiftRightBytes<4>(
d, v_hi, v);
5860 return CombineShiftRightBytes<5>(
d, v_hi, v);
5862 return CombineShiftRightBytes<6>(
d, v_hi, v);
5864 return CombineShiftRightBytes<7>(
d, v_hi, v);
5866 return detail::SlideDownI64Lanes<1>(v);
5868 return CombineShiftRightBytes<9>(
d, v_hi, v);
5870 return CombineShiftRightBytes<10>(
d, v_hi, v);
5872 return CombineShiftRightBytes<11>(
d, v_hi, v);
5874#if HWY_TARGET <= HWY_AVX3
5875 return detail::CombineShiftRightI32Lanes<3>(
Zero(
d), v);
5877 return CombineShiftRightBytes<12>(
d, v_hi, v);
5880 return CombineShiftRightBytes<13>(
d, v_hi, v);
5882 return CombineShiftRightBytes<14>(
d, v_hi, v);
5884 return CombineShiftRightBytes<15>(
d, v_hi, v);
5887#if HWY_TARGET <= HWY_AVX3
5889 return detail::CombineShiftRightI32Lanes<5>(
Zero(
d), v);
5892 return detail::SlideDownI64Lanes<3>(v);
5893#if HWY_TARGET <= HWY_AVX3
5895 return detail::CombineShiftRightI32Lanes<7>(
Zero(
d), v);
5900 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
5911template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5913 const Half<
decltype(
d)> dh;
5915 return CombineShiftRightBytes<1>(
d, v_hi, v);
5918template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5920 const Half<
decltype(
d)> dh;
5922 return CombineShiftRightBytes<2>(
d, v_hi, v);
5925template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
5927#if HWY_TARGET <= HWY_AVX3
5928 return detail::CombineShiftRightI32Lanes<1>(
Zero(
d), v);
5930 const Half<
decltype(
d)> dh;
5932 return CombineShiftRightBytes<4>(
d, v_hi, v);
5936template <
typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5938 return detail::SlideDownI64Lanes<1>(v);
5945#if HWY_TARGET > HWY_AVX3 && !HWY_IDE
5949 const Half<
decltype(
d)> dh;
5950 const Rebind<uint32_t,
decltype(dh)> du32;
5962#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
5965 return AVX2ShlU16Vec256(v, bits);
5973#if HWY_TARGET <= HWY_AVX3_DL
5976 const VFromD<
decltype(
d)> masks =
5977 Dup128VecFromValues(
d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
5978 0, 0, 0, 0, 0, 0, 0);
5981 d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
5984 return VFromD<
decltype(
d)>{_mm256_gf2p8mul_epi8(v.
raw, mul.raw)};
5987 using VW =
VFromD<
decltype(dw)>;
5988 const VW even_mask =
Set(dw, 0x00FF);
5989 const VW odd_mask =
Set(dw, 0xFF00);
5991 const VW bits16 =
BitCast(dw, bits);
5993 const VW evens =
Shl(tag, vw,
And(bits16, even_mask));
5994 const VW odds =
Shl(tag,
And(vw, odd_mask), ShiftRight<8>(bits16));
6009template <
typename T>
6020template <
typename T>
6027#if HWY_TARGET > HWY_AVX3
6033 const Half<
decltype(
d)> dh;
6034 const Rebind<int32_t,
decltype(dh)> di32;
6035 const Rebind<uint32_t,
decltype(dh)> du32;
6037 const auto lo_shr_result =
6039 const auto hi_shr_result =
6042 BitCast(di32, hi_shr_result));
6049#if HWY_TARGET <= HWY_AVX3
6052 return detail::AVX2ShrU16Vec256(v, bits);
6060 using VW =
VFromD<
decltype(dw)>;
6061 const VW mask =
Set(dw, 0x00FF);
6063 const VW bits16 =
BitCast(dw, bits);
6064 const VW evens =
And(vw, mask) >>
And(bits16, mask);
6066 const VW odds = vw >> ShiftRight<8>(bits16);
6078#if HWY_TARGET > HWY_AVX3
6084 const Half<
decltype(
d)> dh;
6085 const Rebind<int32_t,
decltype(dh)> di32;
6087 const auto lo_shr_result =
6089 const auto hi_shr_result =
6098#if HWY_TARGET <= HWY_AVX3
6101 return detail::AVX2ShrI16Vec256(v, bits);
6110 using VW =
VFromD<
decltype(dw)>;
6111 const VW mask =
Set(dw, 0x00FF);
6113 const VW bits16 =
BitCast(dw, bits);
6114 const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >>
And(bits16, mask);
6116 const VW odds = vw >>
BitCast(dw, ShiftRight<8>(
BitCast(dw_u, bits16)));
6125#if HWY_TARGET <= HWY_AVX3
6129 return detail::SignedShr(
d, v, bits);
6134template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6142template <
class DI16, HWY_IF_V_SIZE_D(DI16, 32), HWY_IF_I16_D(DI16)>
6144 DI16 ,
VFromD<Repartition<uint8_t, DI16>> a,
6145 VFromD<Repartition<int8_t, DI16>> b) {
6146 return VFromD<DI16>{_mm256_maddubs_epi16(a.raw, b.raw)};
6151#if HWY_TARGET <= HWY_AVX3_DL
6152template <
class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)>
6154 DI32 ,
VFromD<Repartition<int16_t, DI32>> a,
6156 return VFromD<DI32>{_mm256_dpwssds_epi32(sum.raw, a.raw, b.raw)};
6161template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6167#if HWY_TARGET <= HWY_AVX3_DL
6187#if HWY_TARGET <= HWY_AVX3_DL
6189template <
class DI32, HWY_IF_V_SIZE_D(DI32, 32)>
6191 DI32 ,
VFromD<Repartition<uint8_t, DI32>> a_u,
6193 return VFromD<DI32>{_mm256_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
6202template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6207template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6212#if HWY_TARGET <= HWY_AVX3
6213template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6222template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6226template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6230template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6234template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6238template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6242template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6251template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6255template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6259template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6263template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6265 return VFromD<D>{_mm256_cvtepi32_epi64(v.raw)};
6267template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6271template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6276#if HWY_TARGET <= HWY_AVX3
6277template <
class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_I64_D(D)>
6279 const Rebind<float,
decltype(di64)> df32;
6287template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6289 return VFromD<D>{_mm256_cvttps_epi64(v.raw)};
6291template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6293 return VFromD<D>{_mm256_cvttps_epu64(v.raw)};
6295template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6297 return VFromD<D>{_mm256_maskz_cvttps_epu64(
6303#if HWY_TARGET > HWY_AVX3
6308template <
class D, HWY_IF_LANES_D(D, 4)>
6312 Vec256<int32_t> v) {
6316template <
class D, HWY_IF_LANES_D(D, 4)>
6320 Vec256<int32_t> v) {
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int32_t> v) {
  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
  // Concatenate the lower halves of the two 128-bit blocks.
  return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
}
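// Illustrative usage (not part of the original header): DemoteTo saturates.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full256<int32_t> d32;
//   const hn::Rebind<uint16_t, decltype(d32)> du16;  // same lane count
//   const auto v = hn::Set(d32, 100000);             // exceeds 65535
//   const auto u = hn::DemoteTo(du16, v);            // all lanes == 65535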
6337template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6344template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
6346 const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
6347 return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
6350template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6352 const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
6354 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
6355 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
6356 return VFromD<D>{_mm_packus_epi16(i16, i16)};
6359template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6361#if HWY_TARGET <= HWY_AVX3
6363 return VFromD<D>{_mm256_cvtusepi32_epi8(v.raw)};
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int16_t> v) {
  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
  return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
}
6377template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6384template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
6386 const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
6388 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
6389 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
6390 return VFromD<D>{_mm_packs_epi16(i16, i16)};
6393template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
6395 const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
6396 return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
6399#if HWY_TARGET <= HWY_AVX3
6400template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
6404template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
6406 return VFromD<D>{_mm256_cvtsepi64_epi16(v.raw)};
6408template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
6410 return VFromD<D>{_mm256_cvtsepi64_epi8(v.raw)};
6413template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6416 return VFromD<D>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
6418template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6421 return VFromD<D>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
6423template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6426 return VFromD<D>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
6429template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6433template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6435 return VFromD<D>{_mm256_cvtusepi64_epi16(v.raw)};
6437template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6439 return VFromD<D>{_mm256_cvtusepi64_epi8(v.raw)};
6443#ifndef HWY_DISABLE_F16C
6454 df16,
VFromD<
decltype(du16)>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
6462template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
6464 return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
6468#if HWY_AVX3_HAVE_F32_TO_BF16C
6469template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
6471#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
6474 __asm__(
"vcvtneps2bf16 %1, %0" :
"=v"(raw_result) :
"v"(v.raw));
6483template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
6486#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
6489 __asm__(
"vcvtne2ps2bf16 %2, %1, %0"
6491 :
"v"(b.raw),
"v"(a.raw));
6501template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6507template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6509 Vec256<int32_t> b) {
6510 return VFromD<D>{_mm256_packus_epi32(a.raw, b.raw)};
6513template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6518 const auto max_i32 =
Set(
d, 0x7FFFFFFFu);
6523template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
6529template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
6531 Vec256<int16_t> b) {
6532 return VFromD<D>{_mm256_packus_epi16(a.raw, b.raw)};
6535template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
6540 const auto max_i16 =
Set(
d, 0x7FFFu);
6545#if HWY_TARGET > HWY_AVX3
6546template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6548 Vec256<int64_t> b) {
6549 const DFromV<
decltype(a)> di64;
6551 const Half<
decltype(dn)> dnh;
6558 const auto saturated_a =
Xor(
6561 const auto saturated_b =
Xor(
6566 Vec256<float>{_mm256_shuffle_ps(
BitCast(dn_f, saturated_a).raw,
6567 BitCast(dn_f, saturated_b).raw,
6568 _MM_SHUFFLE(2, 0, 2, 0))});
6571template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6573 Vec256<int64_t> b) {
6574 const DFromV<
decltype(a)> di64;
6576 const Half<
decltype(dn)> dnh;
6585 Vec256<float>{_mm256_shuffle_ps(
BitCast(dn_f, saturated_a).raw,
6586 BitCast(dn_f, saturated_b).raw,
6587 _MM_SHUFFLE(2, 0, 2, 0))});
6590template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
6592 Vec256<uint64_t> b) {
6593 const Half<
decltype(dn)> dnh;
6600 Vec256<float>{_mm256_shuffle_ps(
BitCast(dn_f, saturated_a).raw,
6601 BitCast(dn_f, saturated_b).raw,
6602 _MM_SHUFFLE(2, 0, 2, 0))});
6606template <
class D,
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
6607 HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
6608 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
6609 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
6610 HWY_IF_T_SIZE_ONE_OF_V(V,
6611 (1 << 1) | (1 << 2) | (1 << 4) |
6612 ((HWY_TARGET > HWY_AVX3) ? (1 << 8) : 0))>
6613HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
6614 return VFromD<D>{_mm256_permute4x64_epi64(ReorderDemote2To(d, a, b).raw,
6615 _MM_SHUFFLE(3, 1, 2, 0))};
6618template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6623template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
6628#if HWY_TARGET <= HWY_AVX3
6629template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6631 return VFromD<D>{_mm256_cvttpd_epu32(v.raw)};
6633template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6635 return VFromD<D>{_mm256_maskz_cvttpd_epu32(
6639template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6641 return VFromD<D>{_mm256_cvtepi64_ps(v.raw)};
6643template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6645 return VFromD<D>{_mm256_cvtepu64_ps(v.raw)};
6651 const Full256<uint32_t> d32;
6652 const Full64<uint8_t> d8;
6653 alignas(32)
static constexpr uint32_t k8From32[8] = {
6654 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
template <uint32_t LO, uint32_t HI, typename T>
HWY_INLINE Vec128<T> LookupAndConcatHalves(Vec256<T> v) {
  const Full256<uint32_t> d32;
#if HWY_TARGET <= HWY_AVX3_DL
  alignas(32) static constexpr uint32_t kMap[8] = {
      LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
  const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
#else
  alignas(32) static constexpr uint32_t kMap[8] = {LO,  HI,  ~0u, ~0u,
                                                   ~0u, ~0u, LO,  HI};
  const auto quad = TableLookupBytes(v, Load(d32, kMap));
  const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
#endif
  return Vec128<T>{_mm256_castsi256_si128(result)};
}
6691template <u
int16_t LO, u
int16_t HI,
typename T>
6695#if HWY_TARGET <= HWY_AVX3_DL
6696 alignas(32)
static constexpr uint16_t kMap[16] = {
6697 LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
6698 const auto result = _mm256_permutexvar_epi8(
Load(d16, kMap).raw, v.
raw);
6701 constexpr uint16_t ff =
static_cast<uint16_t
>(~0u);
6702 alignas(32)
static constexpr uint16_t kMap[16] = {
6703 LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
6705 const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
6706 const auto half = _mm256_castsi256_si128(mixed);
6713template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6716#if HWY_TARGET <= HWY_AVX3_DL
6717 alignas(32)
static constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0,
6719 const auto result = _mm256_permutexvar_epi8(
Load(d32, kMap).raw, v.
raw);
6722 alignas(32)
static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
6723 0x0800FFFFu, ~0u, ~0u, ~0u};
6727 const auto result = lo | hi;
6732template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6734 const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);
6738template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6740 const Full256<uint32_t> d32;
6741 alignas(32)
static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
6744 return LowerHalf(Vec256<uint32_t>{v32.raw});
6747template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6749 const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);
6753template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6755 const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);
6759template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6761 const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);
6768template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
6770 return VFromD<D>{_mm256_cvtepu16_ph(v.raw)};
6772template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
6774 return VFromD<D>{_mm256_cvtepi16_ph(v.raw)};
6778template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6783#if HWY_TARGET <= HWY_AVX3
6784template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6789template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6794template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6803template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6805 return VFromD<D>{_mm256_cvttph_epi16(v.raw)};
6807template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6809 return VFromD<D>{_mm256_cvttph_epu16(v.raw)};
6811template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6813 return VFromD<D>{_mm256_maskz_cvttph_epu16(
6818template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6823#if HWY_TARGET <= HWY_AVX3
6824template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6828template <
class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
6830 return VFromD<DU>{_mm256_cvttps_epu32(v.raw)};
6832template <
class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
6837template <
class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
6839 return VFromD<DU>{_mm256_cvttpd_epu64(v.raw)};
6841template <
class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
6849 const Full256<int32_t> di;
6851 di, v, Vec256<int32_t>{_mm256_cvtps_epi32(v.raw)});
6854#ifndef HWY_DISABLE_F16C
6856template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6871template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6873 return VFromD<D>{_mm256_cvtph_pd(v.raw)};
6878template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6880 const Rebind<uint16_t,
decltype(df32)> du16;
6887#if !defined(HWY_DISABLE_PCLMUL_AES)
6891#if HWY_TARGET <= HWY_AVX3_DL
6895 const Half<
decltype(
d)> d2;
6903#if HWY_TARGET <= HWY_AVX3_DL
6907 const Half<
decltype(
d)> d2;
6916#if HWY_TARGET <= HWY_AVX3_DL
6920 const Half<
decltype(
d)> d2;
6928#if HWY_TARGET <= HWY_AVX3_DL
6932 const Half<
decltype(
d)> d2;
6939template <
class V, HWY_IF_V_SIZE_GT_V(V, 16), HWY_IF_U8_D(DFromV<V>)>
6941 const DFromV<
decltype(state)>
d;
6942#if HWY_TARGET <= HWY_AVX3_DL
6950 const auto zero =
Zero(
d);
6953 const Half<
decltype(
d)> dh;
6959template <u
int8_t kRcon>
6962#if HWY_TARGET <= HWY_AVX3_DL
6964 d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
6966 d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
6969 const auto sub_word_result =
AESLastRound(w13, rconXorMask);
6972 const Half<
decltype(
d)> d2;
6979#if HWY_TARGET <= HWY_AVX3_DL
6983 const Half<
decltype(
d)> d2;
6990#if HWY_TARGET <= HWY_AVX3_DL
6994 const Half<
decltype(
d)> d2;
7004#if HWY_TARGET <= HWY_AVX3
7009template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7012 constexpr size_t kNumBytes = (kN + 7) / 8;
7014 uint64_t mask_bits = 0;
7015 CopyBytes<kNumBytes>(bits, &mask_bits);
7018 mask_bits &= (1ull << kN) - 1;
7021 return MFromD<D>::FromBits(mask_bits);
7027template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7030 constexpr size_t kNumBytes = (kN + 7) / 8;
7032 CopyBytes<kNumBytes>(&mask.raw, bits);
7036 const int mask_bits =
static_cast<int>((1ull << kN) - 1);
7037 bits[0] =
static_cast<uint8_t
>(bits[0] & mask_bits);
7044template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7046 return PopCount(
static_cast<uint64_t
>(mask.raw));
7049template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7054template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7060template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7065template <
class D, HWY_IF_V_SIZE_D(D, 32)>
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
}
template <typename T>
HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
  return (uint64_t{mask.raw} & 0xF) == 0;
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
  return detail::AllFalse(hwy::SizeTag<sizeof(TFromD<D>)>(), mask);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFu;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFu;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFu;
#endif
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
  return mask.raw == 0xFu;
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API bool AllTrue(D /* tag */, MFromD<D> mask) {
  return detail::AllTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), mask);
}
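// Illustrative usage (not part of the original header):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full256<int32_t> d;
//   const auto m = hn::Eq(hn::Zero(d), hn::Zero(d));  // all lanes true
//   // hn::AllTrue(d, m) == true, hn::AllFalse(d, m) == false,
//   // hn::CountTrue(d, m) == 8.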
7154template <
typename T, HWY_IF_T_SIZE(T, 4)>
7163template <
typename T, HWY_IF_T_SIZE(T, 8)>
7166 alignas(16)
static constexpr uint64_t packed_array[16] = {
7168 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
7169 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
7170 0x00001032, 0x00001320, 0x00000321, 0x00003210};
7176 const auto packed =
Set(du64, packed_array[mask.raw]);
7177 alignas(64)
static constexpr uint64_t shifts[4] = {0, 4, 8, 12};
7178 const auto indices = Indices256<T>{(packed >>
Load(du64, shifts)).raw};
7186template <
typename T, HWY_IF_T_SIZE(T, 8)>
7189 alignas(16)
static constexpr uint64_t packed_array[16] = {
7191 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
7192 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
7193 0x00003210, 0x00003201, 0x00003210, 0x00003210};
7199 const auto packed =
Set(du64, packed_array[mask.
raw]);
7200 alignas(32)
static constexpr uint64_t shifts[4] = {0, 4, 8, 12};
7216template <
typename T, HWY_IF_T_SIZE(T, 1)>
7217HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7221 const auto vbits =
BitCast(du,
Set(du32,
static_cast<uint32_t
>(mask_bits)));
7225 alignas(32)
static constexpr uint64_t kRep8[4] = {
7226 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
7227 0x0303030303030303ull};
7231 du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
7235template <
typename T, HWY_IF_T_SIZE(T, 2)>
7236HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7239 alignas(32)
static constexpr uint16_t kBit[16] = {
7240 1, 2, 4, 8, 16, 32, 64, 128,
7241 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
7242 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
7246template <
typename T, HWY_IF_T_SIZE(T, 4)>
7247HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7250 alignas(32)
static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
7251 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
7255template <
typename T, HWY_IF_T_SIZE(T, 8)>
7256HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7259 alignas(32)
static constexpr uint64_t kBit[8] = {1, 2, 4, 8};
7266template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7269 constexpr size_t kNumBytes = (kN + 7) / 8;
7271 uint64_t mask_bits = 0;
7272 CopyBytes<kNumBytes>(bits, &mask_bits);
7275 mask_bits &= (1ull << kN) - 1;
7278 return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
7285template <
typename T, HWY_IF_T_SIZE(T, 1)>
7288 const Full256<uint8_t> d8;
7291 return static_cast<uint32_t
>(_mm256_movemask_epi8(sign_bits));
7294template <
typename T, HWY_IF_T_SIZE(T, 2)>
7296#if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2)
7298 const Full256<uint8_t> d8;
7303 return _pext_u32(
static_cast<uint32_t
>(sign_bits8), 0xAAAAAAAAu);
7308 const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
7310 const auto compressed = _mm256_castsi256_si128(
7311 _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)));
7312 return static_cast<unsigned>(_mm_movemask_epi8(compressed));
7316template <
typename T, HWY_IF_T_SIZE(T, 4)>
7319 const Full256<float> df;
7321 return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
7324template <
typename T, HWY_IF_T_SIZE(T, 8)>
7327 const Full256<double> df;
7329 return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
7335template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7337 constexpr size_t N =
Lanes(
d);
7338 constexpr size_t kNumBytes = (N + 7) / 8;
7341 CopyBytes<kNumBytes>(&mask_bits, bits);
7349template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7356template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
7362template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7368template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
7370 constexpr uint64_t kAllBits = (1ull <<
Lanes(
d)) - 1;
7374template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7380template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
7385template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7391template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7397template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7403template <
class D, HWY_IF_V_SIZE_D(D, 32)>
7414template <
typename T, HWY_IF_T_SIZE(T, 4)>
7415HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
7416 const Full256<uint32_t> d32;
7422 alignas(16)
static constexpr uint32_t packed_array[256] = {
7424 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
7425 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
7426 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
7427 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
7428 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
7429 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
7430 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
7431 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
7432 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
7433 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
7434 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
7435 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
7436 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
7437 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
7438 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
7439 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
7440 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
7441 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
7442 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
7443 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
7444 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
7445 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
7446 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
7447 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
7448 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
7449 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
7450 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
7451 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
7452 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
7453 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
7454 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
7455 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
7456 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
7457 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
7458 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
7459 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
7460 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
7461 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
7462 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
7463 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
7464 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
7465 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
7466 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
7472 const auto packed =
Set(d32, packed_array[mask_bits]);
7473 alignas(32)
static constexpr uint32_t shifts[8] = {0, 4, 8, 12,
7475 return packed >>
Load(d32, shifts);
7478template <
typename T, HWY_IF_T_SIZE(T, 8)>
7479HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
7480 const Full256<uint32_t> d32;
7485 alignas(32)
static constexpr uint32_t u32_indices[128] = {
7487 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
7488 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
7489 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
7490 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
7491 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
7492 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
7493 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
7494 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
7495 return Load(d32, u32_indices + 8 * mask_bits);
7498template <
typename T, HWY_IF_T_SIZE(T, 4)>
7499HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
7500 const Full256<uint32_t> d32;
7506 alignas(16)
static constexpr uint32_t packed_array[256] = {
7508 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
7509 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
7510 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
7511 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
7512 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
7513 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
7514 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
7515 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
7516 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
7517 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
7518 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
7519 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
7520 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
7521 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
7522 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
7523 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
7524 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
7525 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
7526 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
7527 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
7528 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
7529 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
7530 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
7531 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
7532 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
7533 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
7534 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
7535 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
7536 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
7537 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
7538 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
7539 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
7540 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
7541 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
7542 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
7543 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
7544 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
7545 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
7546 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
7547 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
7548 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
7549 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
7550 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
7556 const Vec256<uint32_t> packed =
Set(d32, packed_array[mask_bits]);
7557 alignas(32)
static constexpr uint32_t shifts[8] = {0, 4, 8, 12,
7559 return packed >>
Load(d32, shifts);
7562template <
typename T, HWY_IF_T_SIZE(T, 8)>
7563HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
7564 const Full256<uint32_t> d32;
7569 alignas(32)
static constexpr uint32_t u32_indices[128] = {
7571 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
7572 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
7573 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
7574 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
7575 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
7576 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
7577 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
7578 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
7579 return Load(d32, u32_indices + 8 * mask_bits);
7582template <
typename T, HWY_IF_NOT_T_SIZE(T, 2)>
7590 const Indices256<uint32_t>
indices{IndicesFromBits256<T>(mask_bits).raw};
template <typename T, HWY_IF_T_SIZE(T, 2)>
  const auto vu16 = BitCast(du, v);
  const Half<decltype(du)> duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);

  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;

  alignas(32) uint16_t all_true[16] = {};
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  alignas(32) uint16_t all_false[16] = {};
  const size_t num_true1 = PopCount(mask_bits1);
  Store(compressed1, duh, all_false + 8);
  StoreU(compressed0, duh, all_false + num_true1);

  const auto mask = FirstN(du, num_true0 + num_true1);
template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const Repartition<uint32_t, decltype(d)> du32;

  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));

  const Indices256<uint32_t> indices{IndicesFromNotBits256<T>(mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
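// 16-bit lanes: CompressNot is simply Compress with all 16 mask bits inverted.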
template <typename T, HWY_IF_T_SIZE(T, 2)>
  return Compress(v, mask_bits ^ 0xFFFF);
}
template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>

template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>

                                           Mask256<uint64_t> mask) {

template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  mask_bits &= (1ull << N) - 1;
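// CompressBits reads ceil(N/8) bytes of mask bits from memory and clears any
// bits at or above lane N before using them.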
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
  const size_t count = PopCount(mask_bits);

  const size_t count = PopCount(mask_bits);

  const Vec256<uint32_t> idx_mask =
      detail::IndicesFromBits256<TFromD<D>>(mask_bits);
  const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_mask));
  const MFromD<decltype(du)> mask_u{mask32.raw};
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
  const size_t count = PopCount(mask_bits);

#if HWY_MEM_OPS_MIGHT_FAULT
  alignas(32) TFromD<D> buf[16];
  Store(compressed, d, buf);
  CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
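// If partial-vector memory ops might fault, write the whole compressed vector
// to an aligned stack buffer and copy only the `count` valid lanes out.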
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
  constexpr size_t N = Lanes(d);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  mask_bits &= (1ull << N) - 1;

  const size_t count = PopCount(mask_bits);
template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
  const Half<decltype(d)> dh;
#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
  return VFromD<D>{_mm256_maskz_expandloadu_epi8(mask.raw, unaligned)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
  return VFromD<D>{_mm256_maskz_expandloadu_epi16(mask.raw, unaligned)};
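// _mm256_maskz_expandloadu_epi8/epi16 require AVX-512 VBMI2, hence the
// AVX3_DL guard; the 32/64-bit expanding loads below only need base AVX3.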
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
  return VFromD<D>{_mm256_maskz_expandloadu_epi32(mask.raw, unaligned)};

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
  return VFromD<D>{_mm256_maskz_expandloadu_epi64(mask.raw, unaligned)};
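// Expand for 8/16-bit lanes: the AVX3_DL path is native; otherwise expand the
// lower 128-bit half with the low half of the mask bits, fetch the upper
// half's source lanes (which start countL lanes into v, via an aligned stack
// buffer), expand them with the high half of the mask, and Combine the two
// 128-bit results.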
template <typename T, HWY_IF_T_SIZE(T, 1)>
#if HWY_TARGET <= HWY_AVX3_DL

  const Half<decltype(d)> dh;
  constexpr size_t N = 32 / sizeof(T);
  const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));

  alignas(32) T lanes[N];
  return Combine(d, expandH, expandL);
#if HWY_TARGET != HWY_AVX3

template <typename T, HWY_IF_T_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3_DL

  const Half<decltype(d)> dh;

  alignas(32) T lanes[32 / sizeof(T)];
  const Vec128<T> expandH = Expand(vH, maskH);
  return Combine(d, expandH, expandL);
template <typename T, HWY_IF_T_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
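// Without a native expand, use a 256-entry table: for each 8-bit mask the
// packed uint32_t holds one nibble per destination lane, giving the source
// lane it takes; 0xf marks destination lanes whose mask bit is clear.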
  alignas(16) constexpr uint32_t packed_array[256] = {
      0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
      0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
      0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
      0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
      0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
      0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
      0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
      0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
      0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
      0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
      0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
      0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
      0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
      0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
      0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
      0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
      0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
      0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
      0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
      0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
      0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
      0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
      0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
      0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
      0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
      0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
      0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
      0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
      0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
      0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
      0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
      0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
      0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
      0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
      0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
      0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
      0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
      0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
      0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
      0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
      0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
      0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
      0x543210ff, 0x654321f0, 0x6543210f, 0x76543210};
  const Vec256<uint32_t> packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  const Indices256<uint32_t> indices{(packed >> Load(du, shifts)).raw};
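// 64-bit Expand uses the same nibble scheme, with only 16 mask cases and 4
// destination lanes; the upper four nibbles of each entry are unused.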
template <typename T, HWY_IF_T_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3

  alignas(16) constexpr uint64_t packed_array[16] = {
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  const Vec256<uint64_t> packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
#if HWY_TARGET <= HWY_AVX3
  const Indices256<uint64_t> indices{(packed >> Load(du, shifts)).raw};

  const Vec256<uint64_t> masked = And(packed >> Load(du, shifts), Set(du, 3));
#if HWY_TARGET <= HWY_AVX3_DL
  using TU = TFromD<decltype(du)>;
  const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);

#if HWY_TARGET <= HWY_AVX3
  using TU = TFromD<decltype(du)>;
  const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
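// These LoadExpand overloads reinterpret the source as the unsigned lane type
// so that the native expanding loads above can be used where the target
// supports them.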
template <class D, HWY_IF_V_SIZE_D(D, 32)>
  constexpr size_t N = Lanes(d);

template <class D, HWY_IF_V_SIZE_D(D, 32)>
  constexpr size_t N = Lanes(d);

template <class D, HWY_IF_V_SIZE_D(D, 32)>
  constexpr size_t N = Lanes(d);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);

template <class D, HWY_IF_V_SIZE_D(D, 32)>
  constexpr size_t N = Lanes(d);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);

template <class D, HWY_IF_V_SIZE_D(D, 32)>
  constexpr size_t N = Lanes(d);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
#if HWY_TARGET <= HWY_AVX3
  constexpr size_t N = Lanes(Full256<T>());
  constexpr uint32_t kActiveElemMask =
      static_cast<uint32_t>((uint64_t{1} << N) - 1);

  constexpr size_t N = Lanes(Full256<T>());
  constexpr uint32_t kActiveElemMask =
      static_cast<uint32_t>((uint64_t{1} << N) - 1);

  constexpr size_t N = Lanes(Full256<T>());
  constexpr uint32_t kActiveElemMask =
      static_cast<uint32_t>((uint64_t{1} << N) - 1);
  const Half<decltype(di64)> dh_i64;
  const Half<decltype(di32)> dh_i32;
  using VF32 = VFromD<decltype(df32)>;

  vmask = Or(vmask, Neg(vmask));
      di32, VF32{_mm256_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw,
                                   _MM_SHUFFLE(1, 1, 0, 0))});

  const Half<decltype(di64)> dh_i64;

  const auto zero = Zero(di64);
  const auto vmask_eq_0 = VecFromMask(di64, vmask == zero);
  auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0);
  auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0);

  const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo);

  constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;
      d, vmask, vmask_lo)));
#if HWY_TARGET <= HWY_AVX3
template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
  return V{_mm256_lzcnt_epi32(v.raw)};

template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
  return V{_mm256_lzcnt_epi64(v.raw)};
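// Per-lane leading-zero counts map directly to _mm256_lzcnt_epi32/epi64,
// which are available because the AVX3 target includes AVX-512CD + VL.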