28#if HWY_COMPILER_GCC_ACTUAL
31 ignored
"-Wmaybe-uninitialized")
45#include <avx2intrin.h>
46#include <f16cintrin.h>
49#include <avx512fintrin.h>
50#include <avx512vlintrin.h>
51#include <avx512bwintrin.h>
52#include <avx512vlbwintrin.h>
53#include <avx512dqintrin.h>
54#include <avx512vldqintrin.h>
55#include <avx512cdintrin.h>
56#include <avx512vlcdintrin.h>
59#include <avx512bitalgintrin.h>
60#include <avx512vlbitalgintrin.h>
61#include <avx512vbmiintrin.h>
62#include <avx512vbmivlintrin.h>
63#include <avx512vbmi2intrin.h>
64#include <avx512vlvbmi2intrin.h>
65#include <avx512vpopcntdqintrin.h>
66#include <avx512vpopcntdqvlintrin.h>
67#include <avx512vnniintrin.h>
68#include <avx512vlvnniintrin.h>
70#include <vaesintrin.h>
71#include <vpclmulqdqintrin.h>
72#include <gfniintrin.h>
76#include <avx512fp16intrin.h>
77#include <avx512vlfp16intrin.h>
84#include
"hwy/ops/x86_256-inl.h"
112template <
size_t size>
144 return *
this = (*
this * other);
147 return *
this = (*
this / other);
150 return *
this = (*
this + other);
153 return *
this = (*
this - other);
156 return *
this = (*
this % other);
159 return *
this = (*
this & other);
162 return *
this = (*
this | other);
165 return *
this = (*
this ^ other);
188 return _mm512_castph_si512(v);
193 return _mm512_castpd_si512(v);
196#if HWY_AVX3_HAVE_F32_TO_BF16C
203#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
205 return reinterpret_cast<__m512i
>(v);
210 return BitCastScalar<__m512i>(v);
240template <
class D, HWY_IF_V_SIZE_D(D, 64)>
247template <
class D, HWY_IF_V_SIZE_D(D, 64),
typename FromT>
254template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
256 return VFromD<D>{_mm512_set1_epi8(
static_cast<char>(t))};
258template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI16_D(D)>
260 return VFromD<D>{_mm512_set1_epi16(
static_cast<short>(t))};
262template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
264 return VFromD<D>{_mm512_set1_epi32(
static_cast<int>(t))};
266template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
268 return VFromD<D>{_mm512_set1_epi64(
static_cast<long long>(t))};
272template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
273HWY_API Vec512<float16_t>
Set(D , float16_t t) {
274 return Vec512<float16_t>{_mm512_set1_ph(t)};
277template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
281template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
289#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
292template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
294 return Set(
d, TFromD<D>{0});
297template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
299 const RebindToUnsigned<D> du;
300 return Vec512<bfloat16_t>{
Set(du, 0).
raw};
302template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
304 const RebindToUnsigned<D> du;
305 return Vec512<float16_t>{
Set(du, 0).raw};
310template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
314template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
318template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
326template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
330template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
349template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
353template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
361template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
365template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
378 return BitCast(
d, Vec128<uint8_t>{_mm512_castsi512_si128(
379 BitCast(Full512<uint8_t>(), v).raw)});
386 return BitCast(
d, Vec512<uint8_t>{_mm512_castsi128_si512(
394 return BitCast(
d, Vec512<uint8_t>{_mm512_castsi256_si512(
395 BitCast(Full256<uint8_t>(), v).raw)});
400template <
class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 64)>
402 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
403 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
404 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
405 TFromD<D> t11, TFromD<D> t12,
406 TFromD<D> t13, TFromD<D> t14,
408#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
412 t7, t8, t9, t10, t11, t12, t13, t14, t15)));
418 static_cast<char>(t15),
static_cast<char>(t14),
static_cast<char>(t13),
419 static_cast<char>(t12),
static_cast<char>(t11),
static_cast<char>(t10),
420 static_cast<char>(t9),
static_cast<char>(t8),
static_cast<char>(t7),
421 static_cast<char>(t6),
static_cast<char>(t5),
static_cast<char>(t4),
422 static_cast<char>(t3),
static_cast<char>(t2),
static_cast<char>(t1),
423 static_cast<char>(t0),
static_cast<char>(t15),
static_cast<char>(t14),
424 static_cast<char>(t13),
static_cast<char>(t12),
static_cast<char>(t11),
425 static_cast<char>(t10),
static_cast<char>(t9),
static_cast<char>(t8),
426 static_cast<char>(t7),
static_cast<char>(t6),
static_cast<char>(t5),
427 static_cast<char>(t4),
static_cast<char>(t3),
static_cast<char>(t2),
428 static_cast<char>(t1),
static_cast<char>(t0),
static_cast<char>(t15),
429 static_cast<char>(t14),
static_cast<char>(t13),
static_cast<char>(t12),
430 static_cast<char>(t11),
static_cast<char>(t10),
static_cast<char>(t9),
431 static_cast<char>(t8),
static_cast<char>(t7),
static_cast<char>(t6),
432 static_cast<char>(t5),
static_cast<char>(t4),
static_cast<char>(t3),
433 static_cast<char>(t2),
static_cast<char>(t1),
static_cast<char>(t0),
434 static_cast<char>(t15),
static_cast<char>(t14),
static_cast<char>(t13),
435 static_cast<char>(t12),
static_cast<char>(t11),
static_cast<char>(t10),
436 static_cast<char>(t9),
static_cast<char>(t8),
static_cast<char>(t7),
437 static_cast<char>(t6),
static_cast<char>(t5),
static_cast<char>(t4),
438 static_cast<char>(t3),
static_cast<char>(t2),
static_cast<char>(t1),
439 static_cast<char>(t0))};
443template <
class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 64)>
445 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
446 TFromD<D> t5, TFromD<D> t6,
448#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
450 return BroadcastBlock<0>(
458 _mm512_set_epi16(
static_cast<int16_t
>(t7),
static_cast<int16_t
>(t6),
459 static_cast<int16_t
>(t5),
static_cast<int16_t
>(t4),
460 static_cast<int16_t
>(t3),
static_cast<int16_t
>(t2),
461 static_cast<int16_t
>(t1),
static_cast<int16_t
>(t0),
462 static_cast<int16_t
>(t7),
static_cast<int16_t
>(t6),
463 static_cast<int16_t
>(t5),
static_cast<int16_t
>(t4),
464 static_cast<int16_t
>(t3),
static_cast<int16_t
>(t2),
465 static_cast<int16_t
>(t1),
static_cast<int16_t
>(t0),
466 static_cast<int16_t
>(t7),
static_cast<int16_t
>(t6),
467 static_cast<int16_t
>(t5),
static_cast<int16_t
>(t4),
468 static_cast<int16_t
>(t3),
static_cast<int16_t
>(t2),
469 static_cast<int16_t
>(t1),
static_cast<int16_t
>(t0),
470 static_cast<int16_t
>(t7),
static_cast<int16_t
>(t6),
471 static_cast<int16_t
>(t5),
static_cast<int16_t
>(t4),
472 static_cast<int16_t
>(t3),
static_cast<int16_t
>(t2),
473 static_cast<int16_t
>(t1),
static_cast<int16_t
>(t0))};
478template <
class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 64)>
480 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
481 TFromD<D> t5, TFromD<D> t6,
483 return VFromD<D>{_mm512_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
484 t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5,
485 t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)};
489template <
class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 64)>
491 TFromD<D> t2, TFromD<D> t3) {
493 _mm512_setr_epi32(
static_cast<int32_t
>(t0),
static_cast<int32_t
>(t1),
494 static_cast<int32_t
>(t2),
static_cast<int32_t
>(t3),
495 static_cast<int32_t
>(t0),
static_cast<int32_t
>(t1),
496 static_cast<int32_t
>(t2),
static_cast<int32_t
>(t3),
497 static_cast<int32_t
>(t0),
static_cast<int32_t
>(t1),
498 static_cast<int32_t
>(t2),
static_cast<int32_t
>(t3),
499 static_cast<int32_t
>(t0),
static_cast<int32_t
>(t1),
500 static_cast<int32_t
>(t2),
static_cast<int32_t
>(t3))};
503template <
class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 64)>
505 TFromD<D> t2, TFromD<D> t3) {
506 return VFromD<D>{_mm512_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2,
507 t3, t0, t1, t2, t3)};
510template <
class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 64)>
513 _mm512_setr_epi64(
static_cast<int64_t
>(t0),
static_cast<int64_t
>(t1),
514 static_cast<int64_t
>(t0),
static_cast<int64_t
>(t1),
515 static_cast<int64_t
>(t0),
static_cast<int64_t
>(t1),
516 static_cast<int64_t
>(t0),
static_cast<int64_t
>(t1))};
519template <
class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 64)>
521 return VFromD<D>{_mm512_setr_pd(t0, t1, t0, t1, t0, t1, t0, t1)};
528template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
530#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
532 alignas(64)
static constexpr TFromD<D> kIota[64] = {
533 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
534 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
535 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
536 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
537 return Load(
d, kIota);
541 static_cast<char>(63),
static_cast<char>(62),
static_cast<char>(61),
542 static_cast<char>(60),
static_cast<char>(59),
static_cast<char>(58),
543 static_cast<char>(57),
static_cast<char>(56),
static_cast<char>(55),
544 static_cast<char>(54),
static_cast<char>(53),
static_cast<char>(52),
545 static_cast<char>(51),
static_cast<char>(50),
static_cast<char>(49),
546 static_cast<char>(48),
static_cast<char>(47),
static_cast<char>(46),
547 static_cast<char>(45),
static_cast<char>(44),
static_cast<char>(43),
548 static_cast<char>(42),
static_cast<char>(41),
static_cast<char>(40),
549 static_cast<char>(39),
static_cast<char>(38),
static_cast<char>(37),
550 static_cast<char>(36),
static_cast<char>(35),
static_cast<char>(34),
551 static_cast<char>(33),
static_cast<char>(32),
static_cast<char>(31),
552 static_cast<char>(30),
static_cast<char>(29),
static_cast<char>(28),
553 static_cast<char>(27),
static_cast<char>(26),
static_cast<char>(25),
554 static_cast<char>(24),
static_cast<char>(23),
static_cast<char>(22),
555 static_cast<char>(21),
static_cast<char>(20),
static_cast<char>(19),
556 static_cast<char>(18),
static_cast<char>(17),
static_cast<char>(16),
557 static_cast<char>(15),
static_cast<char>(14),
static_cast<char>(13),
558 static_cast<char>(12),
static_cast<char>(11),
static_cast<char>(10),
559 static_cast<char>(9),
static_cast<char>(8),
static_cast<char>(7),
560 static_cast<char>(6),
static_cast<char>(5),
static_cast<char>(4),
561 static_cast<char>(3),
static_cast<char>(2),
static_cast<char>(1),
562 static_cast<char>(0))};
566template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI16_D(D)>
568#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
570 alignas(64)
static constexpr TFromD<D> kIota[32] = {
571 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
572 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
573 return Load(
d, kIota);
577 int16_t{31}, int16_t{30}, int16_t{29}, int16_t{28}, int16_t{27},
578 int16_t{26}, int16_t{25}, int16_t{24}, int16_t{23}, int16_t{22},
579 int16_t{21}, int16_t{20}, int16_t{19}, int16_t{18}, int16_t{17},
580 int16_t{16}, int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12},
581 int16_t{11}, int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6},
582 int16_t{5}, int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})};
587template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
590 float16_t{31}, float16_t{30}, float16_t{29}, float16_t{28}, float16_t{27},
591 float16_t{26}, float16_t{25}, float16_t{24}, float16_t{23}, float16_t{22},
592 float16_t{21}, float16_t{20}, float16_t{19}, float16_t{18}, float16_t{17},
593 float16_t{16}, float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12},
594 float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8}, float16_t{7},
595 float16_t{6}, float16_t{5}, float16_t{4}, float16_t{3}, float16_t{2},
596 float16_t{1}, float16_t{0})};
600template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
603 int32_t{15}, int32_t{14}, int32_t{13}, int32_t{12}, int32_t{11},
604 int32_t{10}, int32_t{9}, int32_t{8}, int32_t{7}, int32_t{6}, int32_t{5},
605 int32_t{4}, int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})};
608template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
610 return VFromD<D>{_mm512_set_epi64(int64_t{7}, int64_t{6}, int64_t{5},
611 int64_t{4}, int64_t{3}, int64_t{2},
612 int64_t{1}, int64_t{0})};
615template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
617 return VFromD<D>{_mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f,
618 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f,
622template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
624 return VFromD<D>{_mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)};
629template <
class D,
typename T2, HWY_IF_V_SIZE_D(D, 64)>
642 using VU =
VFromD<
decltype(du)>;
643 const __m512i vu =
BitCast(du, v).raw;
644 return BitCast(
d, VU{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
669 const DFromV<
decltype(mask)>
d;
723 using VU =
VFromD<
decltype(du)>;
724 const __m512i ret = _mm512_ternarylogic_epi64(
728 return Xor(x1,
Xor(x2, x3));
738 using VU =
VFromD<
decltype(du)>;
739 const __m512i ret = _mm512_ternarylogic_epi64(
743 return Or(o1,
Or(o2, o3));
753 using VU =
VFromD<
decltype(du)>;
754 const __m512i ret = _mm512_ternarylogic_epi64(
758 return Or(o,
And(a1, a2));
768 using VU =
VFromD<
decltype(du)>;
797#if HWY_TARGET <= HWY_AVX3_DL
799#ifdef HWY_NATIVE_POPCNT
800#undef HWY_NATIVE_POPCNT
802#define HWY_NATIVE_POPCNT
848template <
typename T, HWY_IF_NOT_T_SIZE(T, 1)>
851 const uint32_t all = ~uint32_t{0};
854 m.raw =
static_cast<decltype(
m.raw)
>(_bzhi_u32(all, n));
858#if HWY_COMPILER_MSVC >= 1920 || HWY_COMPILER_GCC_ACTUAL >= 900 || \
859 HWY_COMPILER_CLANG || HWY_COMPILER_ICC
860template <
typename T, HWY_IF_T_SIZE(T, 1)>
864 uint32_t hi_mask_len;
866 if (__builtin_constant_p(n >= 32) && n >= 32) {
867 if (__builtin_constant_p(n >= 64) && n >= 64) {
870 hi_mask_len =
static_cast<uint32_t
>(n) - 32u;
872 lo_mask = hi_mask = 0xFFFFFFFFu;
876 const uint32_t lo_mask_len =
static_cast<uint32_t
>(n);
877 lo_mask = _bzhi_u32(0xFFFFFFFFu, lo_mask_len);
880 if (__builtin_constant_p(lo_mask_len <= 32) && lo_mask_len <= 32) {
881 return Mask512<T>{
static_cast<__mmask64
>(lo_mask)};
885 _addcarry_u32(_subborrow_u32(0, lo_mask_len, 32u, &hi_mask_len),
886 0xFFFFFFFFu, 0u, &hi_mask);
888 hi_mask = _bzhi_u32(hi_mask, hi_mask_len);
889#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC
890 if (__builtin_constant_p((
static_cast<uint64_t
>(hi_mask) << 32) | lo_mask))
892 return Mask512<T>{
static_cast<__mmask64
>(
893 (
static_cast<uint64_t
>(hi_mask) << 32) | lo_mask)};
894#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC
896 return Mask512<T>{_mm512_kunpackd(
static_cast<__mmask64
>(hi_mask),
897 static_cast<__mmask64
>(lo_mask))};
901template <
typename T, HWY_IF_T_SIZE(T, 1)>
903 const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
904 return Mask512<T>{
static_cast<__mmask64
>(bits)};
910template <
class D, HWY_IF_V_SIZE_D(D, 64)>
918 const uint64_t all = ~uint64_t{0};
919 m.raw =
static_cast<decltype(
m.raw)
>(_bzhi_u64(all, n));
922 return detail::FirstN<TFromD<D>>(n);
960template <
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
967 Vec512<float16_t> yes,
968 Vec512<float16_t> no) {
969 return Vec512<float16_t>{_mm512_mask_blend_ph(mask.raw, no.raw, yes.raw)};
1001template <
typename T>
1010template <
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1024template <
typename T>
1030template <
typename T>
1035template <
typename T>
1040template <
typename T>
1048template <
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1059template <
typename T>
1061 static_assert(IsSigned<T>(),
"Only works for signed/float");
1108HWY_API Vec512<float16_t>
operator+(Vec512<float16_t> a, Vec512<float16_t> b) {
1109 return Vec512<float16_t>{_mm512_add_ph(a.raw, b.raw)};
1151HWY_API Vec512<float16_t>
operator-(Vec512<float16_t> a, Vec512<float16_t> b) {
1152 return Vec512<float16_t>{_mm512_sub_ph(a.raw, b.raw)};
1184 static_cast<__mmask32
>(0x55555555), v.
raw,
Zero(
d).raw, 0)};
1205 Set(di32, int32_t{-512});
1212#if HWY_TARGET <= HWY_AVX3
1213template <
int kIdx3,
int kIdx2,
int kIdx1,
int kIdx0>
1216 static_assert(0 <= kIdx0 && kIdx0 <= 3,
"kIdx0 must be between 0 and 3");
1217 static_assert(0 <= kIdx1 && kIdx1 <= 3,
"kIdx1 must be between 0 and 3");
1218 static_assert(0 <= kIdx2 && kIdx2 <= 3,
"kIdx2 must be between 0 and 3");
1219 static_assert(0 <= kIdx3 && kIdx3 <= 3,
"kIdx3 must be between 0 and 3");
1221 _mm512_dbsad_epu8(b.
raw, a.
raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
1281#if HWY_COMPILER_MSVC
1284 const auto zero =
Zero(
d);
1302#if HWY_TARGET <= HWY_AVX3_DL
1304template <
typename T>
1306 return Vec512<T>{_mm512_gf2p8affine_epi64_epi8(v.
raw, matrix.
raw, 0)};
1341#if HWY_TARGET <= HWY_AVX3_DL
1344template <
int kBits,
class V, HWY_IF_T_SIZE_V(V, 1)>
1347 if (kBits == 0)
return v;
1348 if (kBits == 1)
return v + v;
1349 constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
1350 (0x0101010101010101ULL * (0xFF >> kBits));
1356template <
int kBits,
typename T, HWY_IF_T_SIZE(T, 1)>
1358 const DFromV<
decltype(v)> d8;
1360 const auto shifted =
BitCast(d8, ShiftLeft<kBits>(
BitCast(d16, v)));
1363 : (shifted &
Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1400#if HWY_TARGET <= HWY_AVX3_DL
1403template <
int kBits,
class V, HWY_IF_U8_D(DFromV<V>)>
1406 if (kBits == 0)
return v;
1407 constexpr uint64_t kMatrix =
1408 (0x0102040810204080ULL << kBits) &
1409 (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1414template <
int kBits,
class V, HWY_IF_I8_D(DFromV<V>)>
1416 const Repartition<uint64_t, DFromV<V>> du64;
1417 if (kBits == 0)
return v;
1418 constexpr uint64_t kShift =
1419 (0x0102040810204080ULL << kBits) &
1420 (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1421 constexpr uint64_t kSign =
1422 kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
1430 const DFromV<
decltype(v)> d8;
1432 const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
1433 return shifted &
Set(d8, 0xFF >> kBits);
1438 const DFromV<
decltype(v)> di;
1440 const auto shifted =
BitCast(di, ShiftRight<kBits>(
BitCast(du, v)));
1441 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
1442 return (shifted ^ shifted_sign) - shifted_sign;
1449#if HWY_TARGET <= HWY_AVX3_DL
1451template <
int kBits,
class V, HWY_IF_U8(TFromV<V>)>
1453 static_assert(0 <= kBits && kBits < 8,
"Invalid shift count");
1455 const Repartition<uint64_t, DFromV<V>> du64;
1456 if (kBits == 0)
return v;
1458 constexpr uint64_t kShrMatrix =
1459 (0x0102040810204080ULL << kBits) &
1460 (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1461 constexpr int kShlBits = (-kBits) & 7;
1462 constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
1463 (0x0101010101010101ULL * (0xFF >> kShlBits));
1464 constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
1471 static_assert(0 <= kBits && kBits < 8,
"Invalid shift count");
1472 if (kBits == 0)
return v;
1480 static_assert(0 <= kBits && kBits < 16,
"Invalid shift count");
1481 if (kBits == 0)
return v;
1488 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1489 if (kBits == 0)
return v;
1495 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1496 if (kBits == 0)
return v;
1502#if HWY_TARGET <= HWY_AVX3
1504template <
class T, HWY_IF_UI32(T)>
1509template <
class T, HWY_IF_UI32(T)>
1514template <
class T, HWY_IF_UI64(T)>
1515HWY_API Vec512<T>
Rol(Vec512<T> a, Vec512<T> b) {
1516 return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
1519template <
class T, HWY_IF_UI64(T)>
1520HWY_API Vec512<T>
Ror(Vec512<T> a, Vec512<T> b) {
1521 return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
1530#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100
1533#elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
1546 if (__builtin_constant_p(bits)) {
1556 if (__builtin_constant_p(bits)) {
1566 if (__builtin_constant_p(bits)) {
1576 if (__builtin_constant_p(bits)) {
1586 if (__builtin_constant_p(bits)) {
1596 if (__builtin_constant_p(bits)) {
1604template <
typename T, HWY_IF_T_SIZE(T, 1)>
1606 const DFromV<
decltype(v)> d8;
1609 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
1617 if (__builtin_constant_p(bits)) {
1627 if (__builtin_constant_p(bits)) {
1637 if (__builtin_constant_p(bits)) {
1646 const DFromV<
decltype(v)> d8;
1649 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >> bits));
1655 if (__builtin_constant_p(bits)) {
1666 if (__builtin_constant_p(bits)) {
1676 if (__builtin_constant_p(bits)) {
1685 const DFromV<
decltype(v)> di;
1688 const auto shifted_sign =
1689 BitCast(di,
Set(du,
static_cast<uint8_t
>(0x80 >> bits)));
1690 return (shifted ^ shifted_sign) - shifted_sign;
1725HWY_API Vec512<float16_t>
Min(Vec512<float16_t> a, Vec512<float16_t> b) {
1726 return Vec512<float16_t>{_mm512_min_ph(a.raw, b.raw)};
1768HWY_API Vec512<float16_t>
Max(Vec512<float16_t> a, Vec512<float16_t> b) {
1769 return Vec512<float16_t>{_mm512_max_ph(a.raw, b.raw)};
1782#ifdef HWY_NATIVE_MUL_64
1783#undef HWY_NATIVE_MUL_64
1785#define HWY_NATIVE_MUL_64
1848template <
typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
1854template <
typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1863HWY_API Vec512<float16_t>
operator*(Vec512<float16_t> a, Vec512<float16_t> b) {
1864 return Vec512<float16_t>{_mm512_mul_ph(a.raw, b.raw)};
1875HWY_API Vec512<float16_t>
operator/(Vec512<float16_t> a, Vec512<float16_t> b) {
1876 return Vec512<float16_t>{_mm512_div_ph(a.raw, b.raw)};
1889 return Vec512<float16_t>{_mm512_rcp_ph(v.raw)};
1902template <
typename T, HWY_IF_U8(T)>
1907template <
typename T, HWY_IF_I8(T)>
1910 return Vec512<T>{_mm512_mask_min_epi8(no.raw,
m.raw, a.raw, b.raw)};
1913template <
typename T, HWY_IF_U16(T)>
1916 return Vec512<T>{_mm512_mask_min_epu16(no.raw,
m.raw, a.raw, b.raw)};
1918template <
typename T, HWY_IF_I16(T)>
1921 return Vec512<T>{_mm512_mask_min_epi16(no.raw,
m.raw, a.raw, b.raw)};
1924template <
typename T, HWY_IF_U32(T)>
1927 return Vec512<T>{_mm512_mask_min_epu32(no.raw,
m.raw, a.raw, b.raw)};
1929template <
typename T, HWY_IF_I32(T)>
1932 return Vec512<T>{_mm512_mask_min_epi32(no.raw,
m.raw, a.raw, b.raw)};
1935template <
typename T, HWY_IF_U64(T)>
1938 return Vec512<T>{_mm512_mask_min_epu64(no.raw,
m.raw, a.raw, b.raw)};
1940template <
typename T, HWY_IF_I64(T)>
1943 return Vec512<T>{_mm512_mask_min_epi64(no.raw,
m.raw, a.raw, b.raw)};
1946template <
typename T, HWY_IF_F32(T)>
1949 return Vec512<T>{_mm512_mask_min_ps(no.raw,
m.raw, a.raw, b.raw)};
1952template <
typename T, HWY_IF_F64(T)>
1955 return Vec512<T>{_mm512_mask_min_pd(no.raw,
m.raw, a.raw, b.raw)};
1959template <
typename T, HWY_IF_F16(T)>
1962 return Vec512<T>{_mm512_mask_min_ph(no.raw,
m.raw, a.raw, b.raw)};
1968template <
typename T, HWY_IF_U8(T)>
1973template <
typename T, HWY_IF_I8(T)>
1976 return Vec512<T>{_mm512_mask_max_epi8(no.raw,
m.raw, a.raw, b.raw)};
1979template <
typename T, HWY_IF_U16(T)>
1982 return Vec512<T>{_mm512_mask_max_epu16(no.raw,
m.raw, a.raw, b.raw)};
1984template <
typename T, HWY_IF_I16(T)>
1987 return Vec512<T>{_mm512_mask_max_epi16(no.raw,
m.raw, a.raw, b.raw)};
1990template <
typename T, HWY_IF_U32(T)>
1993 return Vec512<T>{_mm512_mask_max_epu32(no.raw,
m.raw, a.raw, b.raw)};
1995template <
typename T, HWY_IF_I32(T)>
1998 return Vec512<T>{_mm512_mask_max_epi32(no.raw,
m.raw, a.raw, b.raw)};
2001template <
typename T, HWY_IF_U64(T)>
2004 return Vec512<T>{_mm512_mask_max_epu64(no.raw,
m.raw, a.raw, b.raw)};
2006template <
typename T, HWY_IF_I64(T)>
2009 return Vec512<T>{_mm512_mask_max_epi64(no.raw,
m.raw, a.raw, b.raw)};
2012template <
typename T, HWY_IF_F32(T)>
2015 return Vec512<T>{_mm512_mask_max_ps(no.raw,
m.raw, a.raw, b.raw)};
2018template <
typename T, HWY_IF_F64(T)>
2021 return Vec512<T>{_mm512_mask_max_pd(no.raw,
m.raw, a.raw, b.raw)};
2025template <
typename T, HWY_IF_F16(T)>
2028 return Vec512<T>{_mm512_mask_max_ph(no.raw,
m.raw, a.raw, b.raw)};
2034template <
typename T, HWY_IF_UI8(T)>
2040template <
typename T, HWY_IF_UI16(T)>
2043 return Vec512<T>{_mm512_mask_add_epi16(no.raw,
m.raw, a.raw, b.raw)};
2046template <
typename T, HWY_IF_UI32(T)>
2049 return Vec512<T>{_mm512_mask_add_epi32(no.raw,
m.raw, a.raw, b.raw)};
2052template <
typename T, HWY_IF_UI64(T)>
2055 return Vec512<T>{_mm512_mask_add_epi64(no.raw,
m.raw, a.raw, b.raw)};
2058template <
typename T, HWY_IF_F32(T)>
2061 return Vec512<T>{_mm512_mask_add_ps(no.raw,
m.raw, a.raw, b.raw)};
2064template <
typename T, HWY_IF_F64(T)>
2067 return Vec512<T>{_mm512_mask_add_pd(no.raw,
m.raw, a.raw, b.raw)};
2071template <
typename T, HWY_IF_F16(T)>
2074 return Vec512<T>{_mm512_mask_add_ph(no.raw,
m.raw, a.raw, b.raw)};
2080template <
typename T, HWY_IF_UI8(T)>
2086template <
typename T, HWY_IF_UI16(T)>
2089 return Vec512<T>{_mm512_mask_sub_epi16(no.raw,
m.raw, a.raw, b.raw)};
2092template <
typename T, HWY_IF_UI32(T)>
2095 return Vec512<T>{_mm512_mask_sub_epi32(no.raw,
m.raw, a.raw, b.raw)};
2098template <
typename T, HWY_IF_UI64(T)>
2101 return Vec512<T>{_mm512_mask_sub_epi64(no.raw,
m.raw, a.raw, b.raw)};
2104template <
typename T, HWY_IF_F32(T)>
2107 return Vec512<T>{_mm512_mask_sub_ps(no.raw,
m.raw, a.raw, b.raw)};
2110template <
typename T, HWY_IF_F64(T)>
2113 return Vec512<T>{_mm512_mask_sub_pd(no.raw,
m.raw, a.raw, b.raw)};
2117template <
typename T, HWY_IF_F16(T)>
2120 return Vec512<T>{_mm512_mask_sub_ph(no.raw,
m.raw, a.raw, b.raw)};
2138 Mask512<float16_t>
m, Vec512<float16_t> a,
2139 Vec512<float16_t> b) {
2140 return Vec512<float16_t>{_mm512_mask_mul_ph(no.raw,
m.raw, a.raw, b.raw)};
2158 Mask512<float16_t>
m, Vec512<float16_t> a,
2159 Vec512<float16_t> b) {
2160 return Vec512<float16_t>{_mm512_mask_div_ph(no.raw,
m.raw, a.raw, b.raw)};
2166template <
typename T, HWY_IF_I8(T)>
2172template <
typename T, HWY_IF_U8(T)>
2175 return Vec512<T>{_mm512_mask_adds_epu8(no.raw,
m.raw, a.raw, b.raw)};
2178template <
typename T, HWY_IF_I16(T)>
2181 return Vec512<T>{_mm512_mask_adds_epi16(no.raw,
m.raw, a.raw, b.raw)};
2184template <
typename T, HWY_IF_U16(T)>
2187 return Vec512<T>{_mm512_mask_adds_epu16(no.raw,
m.raw, a.raw, b.raw)};
2192template <
typename T, HWY_IF_I8(T)>
2198template <
typename T, HWY_IF_U8(T)>
2201 return Vec512<T>{_mm512_mask_subs_epu8(no.raw,
m.raw, a.raw, b.raw)};
2204template <
typename T, HWY_IF_I16(T)>
2207 return Vec512<T>{_mm512_mask_subs_epi16(no.raw,
m.raw, a.raw, b.raw)};
2210template <
typename T, HWY_IF_U16(T)>
2213 return Vec512<T>{_mm512_mask_subs_epu16(no.raw,
m.raw, a.raw, b.raw)};
2220HWY_API Vec512<float16_t>
MulAdd(Vec512<float16_t> mul, Vec512<float16_t> x,
2221 Vec512<float16_t> add) {
2222 return Vec512<float16_t>{_mm512_fmadd_ph(mul.raw, x.raw, add.raw)};
2225HWY_API Vec512<float16_t>
NegMulAdd(Vec512<float16_t> mul, Vec512<float16_t> x,
2226 Vec512<float16_t> add) {
2227 return Vec512<float16_t>{_mm512_fnmadd_ph(mul.raw, x.raw, add.raw)};
2230HWY_API Vec512<float16_t>
MulSub(Vec512<float16_t> mul, Vec512<float16_t> x,
2231 Vec512<float16_t> sub) {
2232 return Vec512<float16_t>{_mm512_fmsub_ph(mul.raw, x.raw, sub.raw)};
2235HWY_API Vec512<float16_t>
NegMulSub(Vec512<float16_t> mul, Vec512<float16_t> x,
2236 Vec512<float16_t> sub) {
2237 return Vec512<float16_t>{_mm512_fnmsub_ph(mul.raw, x.raw, sub.raw)};
2283HWY_API Vec512<float16_t>
MulAddSub(Vec512<float16_t> mul, Vec512<float16_t> x,
2284 Vec512<float16_t> sub_or_add) {
2285 return Vec512<float16_t>{_mm512_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
2303HWY_API Vec512<float16_t>
Sqrt(
const Vec512<float16_t> v) {
2304 return Vec512<float16_t>{_mm512_sqrt_ph(v.raw)};
2317 return Vec512<float16_t>{_mm512_rsqrt_ph(v.raw)};
2337 return Vec512<float16_t>{_mm512_roundscale_ph(
2338 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2343 v.
raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2347 v.
raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2353 return Vec512<float16_t>{
2354 _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2359 _mm512_roundscale_ps(v.
raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2363 _mm512_roundscale_pd(v.
raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2368HWY_API Vec512<float16_t>
Ceil(Vec512<float16_t> v) {
2369 return Vec512<float16_t>{
2370 _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2375 _mm512_roundscale_ps(v.
raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2379 _mm512_roundscale_pd(v.
raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2385 return Vec512<float16_t>{
2386 _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2391 _mm512_roundscale_ps(v.
raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2395 _mm512_roundscale_pd(v.
raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2404template <
class DTo,
typename TFrom>
2406 static_assert(
sizeof(TFrom) ==
sizeof(
TFromD<DTo>),
"Must have same size");
2412template <
typename T>
2417template <
typename T>
2422template <
typename T>
2427template <
typename T>
2435template <
typename T>
2437 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
2443template <
typename T, HWY_IF_T_SIZE(T, 1)>
2447template <
typename T, HWY_IF_T_SIZE(T, 2)>
2449 return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
2451template <
typename T, HWY_IF_UI32(T)>
2453 return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
2455template <
typename T, HWY_IF_UI64(T)>
2457 return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
2462 Vec512<float16_t> b) {
2466 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2481template <
typename T, HWY_IF_T_SIZE(T, 1)>
2485template <
typename T, HWY_IF_T_SIZE(T, 2)>
2487 return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
2489template <
typename T, HWY_IF_UI32(T)>
2491 return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
2493template <
typename T, HWY_IF_UI64(T)>
2495 return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
2500 Vec512<float16_t> b) {
2504 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2546HWY_API Mask512<float16_t>
operator>(Vec512<float16_t> a, Vec512<float16_t> b) {
2550 return Mask512<
float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
2566 Vec512<float16_t> b) {
2570 return Mask512<
float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
2610template <
typename T>
2615template <
typename T>
2624template <
typename T>
2628template <
typename T>
2632template <
typename T>
2636template <
typename T>
2643template <
typename T, HWY_IF_NOT_FLOAT(T)>
2647template <
typename T, HWY_IF_FLOAT(T)>
2668 return Vec512<float16_t>{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))};
2696template <
typename T>
2698#if HWY_COMPILER_HAS_MASK_INTRINSICS
2704template <
typename T>
2706#if HWY_COMPILER_HAS_MASK_INTRINSICS
2712template <
typename T>
2714#if HWY_COMPILER_HAS_MASK_INTRINSICS
2717 return Mask512<T>{
static_cast<uint16_t
>(~m.raw & 0xFFFF)};
2720template <
typename T>
2722#if HWY_COMPILER_HAS_MASK_INTRINSICS
2725 return Mask512<T>{
static_cast<uint8_t
>(~m.raw & 0xFF)};
2729template <
typename T>
2731#if HWY_COMPILER_HAS_MASK_INTRINSICS
2737template <
typename T>
2739#if HWY_COMPILER_HAS_MASK_INTRINSICS
2745template <
typename T>
2747#if HWY_COMPILER_HAS_MASK_INTRINSICS
2753template <
typename T>
2755#if HWY_COMPILER_HAS_MASK_INTRINSICS
2762template <
typename T>
2765#if HWY_COMPILER_HAS_MASK_INTRINSICS
2771template <
typename T>
2774#if HWY_COMPILER_HAS_MASK_INTRINSICS
2780template <
typename T>
2783#if HWY_COMPILER_HAS_MASK_INTRINSICS
2789template <
typename T>
2792#if HWY_COMPILER_HAS_MASK_INTRINSICS
2799template <
typename T>
2801#if HWY_COMPILER_HAS_MASK_INTRINSICS
2807template <
typename T>
2809#if HWY_COMPILER_HAS_MASK_INTRINSICS
2815template <
typename T>
2817#if HWY_COMPILER_HAS_MASK_INTRINSICS
2823template <
typename T>
2825#if HWY_COMPILER_HAS_MASK_INTRINSICS
2832template <
typename T>
2834#if HWY_COMPILER_HAS_MASK_INTRINSICS
2840template <
typename T>
2842#if HWY_COMPILER_HAS_MASK_INTRINSICS
2848template <
typename T>
2850#if HWY_COMPILER_HAS_MASK_INTRINSICS
2856template <
typename T>
2858#if HWY_COMPILER_HAS_MASK_INTRINSICS
2865template <
typename T>
2868#if HWY_COMPILER_HAS_MASK_INTRINSICS
2874template <
typename T>
2877#if HWY_COMPILER_HAS_MASK_INTRINSICS
2883template <
typename T>
2886#if HWY_COMPILER_HAS_MASK_INTRINSICS
2892template <
typename T>
2895#if HWY_COMPILER_HAS_MASK_INTRINSICS
2904template <
typename T>
2909template <
typename T>
2914template <
typename T>
2919template <
typename T>
2924template <
typename T>
2929template <
typename T>
2934template <
class D, HWY_IF_LANES_D(D, 64)>
2937#if HWY_COMPILER_HAS_MASK_INTRINSICS
2938 const __mmask64 combined_mask = _mm512_kunpackd(
2939 static_cast<__mmask64
>(hi.raw),
static_cast<__mmask64
>(lo.raw));
2941 const __mmask64 combined_mask =
static_cast<__mmask64
>(
2942 ((
static_cast<uint64_t
>(hi.raw) << 32) | (lo.raw & 0xFFFFFFFFULL)));
2945 return MFromD<D>{combined_mask};
2948template <
class D, HWY_IF_LANES_D(D, 32)>
2950#if HWY_COMPILER_HAS_MASK_INTRINSICS
2951 const auto shifted_mask = _kshiftri_mask64(
static_cast<__mmask64
>(
m.raw), 32);
2953 const auto shifted_mask =
static_cast<uint64_t
>(
m.raw) >> 32;
2956 return MFromD<D>{
static_cast<decltype(MFromD<D>().raw)
>(shifted_mask)};
2959template <
class D, HWY_IF_LANES_D(D, 64)>
2961 using RawM =
decltype(MFromD<D>().raw);
2962#if HWY_COMPILER_HAS_MASK_INTRINSICS
2964 static_cast<RawM
>(_kshiftli_mask64(
static_cast<__mmask64
>(
m.raw), 1))};
2966 return MFromD<D>{
static_cast<RawM
>(
static_cast<uint64_t
>(
m.raw) << 1)};
2970template <
class D, HWY_IF_LANES_D(D, 64)>
2972 using RawM =
decltype(MFromD<D>().raw);
2973#if HWY_COMPILER_HAS_MASK_INTRINSICS
2975 static_cast<RawM
>(_kshiftri_mask64(
static_cast<__mmask64
>(
m.raw), 1))};
2977 return MFromD<D>{
static_cast<RawM
>(
static_cast<uint64_t
>(
m.raw) >> 1)};
2984#if HWY_TARGET <= HWY_AVX3_DL
2994 return ShiftRight<15>(v);
2998 return ShiftRight<31>(v);
3002 return ShiftRight<63>(v);
3007#if HWY_HAVE_FLOAT16 || HWY_IDE
3010 return Mask512<float16_t>{_mm512_fpclass_ph_mask(
3015 Vec512<float16_t> b) {
3019 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3024 return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
3030 return Not(Mask512<float16_t>{_mm512_fpclass_ph_mask(
3080template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3082 return VFromD<D>{_mm512_load_si512(aligned)};
3086template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3089 return Vec512<float16_t>{_mm512_load_ph(aligned)};
3092template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3096template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3098 return VFromD<D>{_mm512_load_pd(aligned)};
3101template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3108template <
class D, HWY_IF_V_SIZE_D(D, 64)>
3110 return Vec512<float16_t>{_mm512_loadu_ph(
p)};
3113template <
class D, HWY_IF_V_SIZE_D(D, 64)>
3117template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3124template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
3127 return VFromD<D>{_mm512_maskz_loadu_epi8(
m.raw,
p)};
3130template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3133 const RebindToUnsigned<D> du;
3135 m.raw,
reinterpret_cast<const uint16_t*
>(
p))});
3138template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3141 return VFromD<D>{_mm512_maskz_loadu_epi32(
m.raw,
p)};
3144template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3147 return VFromD<D>{_mm512_maskz_loadu_epi64(
m.raw,
p)};
3150template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3156template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3164template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
3167 return VFromD<D>{_mm512_mask_loadu_epi8(v.raw,
m.raw,
p)};
3170template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3175 d,
VFromD<
decltype(du)>{_mm512_mask_loadu_epi16(
3176 BitCast(du, v).raw,
m.raw,
reinterpret_cast<const uint16_t*
>(
p))});
3179template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3182 return VFromD<D>{_mm512_mask_loadu_epi32(v.raw,
m.raw,
p)};
3185template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3188 return VFromD<D>{_mm512_mask_loadu_epi64(v.raw,
m.raw,
p)};
3191template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3194 return VFromD<D>{_mm512_mask_loadu_ps(v.raw,
m.raw,
p)};
3197template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3200 return VFromD<D>{_mm512_mask_loadu_pd(v.raw,
m.raw,
p)};
3207template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3210 const Full128<TFromD<D>> d128;
3215template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3217 const __m128 x4 = _mm_loadu_ps(
p);
3218 return VFromD<D>{_mm512_broadcast_f32x4(x4)};
3221template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3223 const __m128d x2 = _mm_loadu_pd(
p);
3224 return VFromD<D>{_mm512_broadcast_f64x2(x2)};
3229template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3231 _mm512_store_si512(
reinterpret_cast<__m512i*
>(aligned), v.raw);
3235template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3238 _mm512_store_ph(aligned, v.raw);
3241template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3243 _mm512_store_ps(aligned, v.
raw);
3245template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3247 _mm512_store_pd(aligned, v.raw);
3250template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3252 _mm512_storeu_si512(
reinterpret_cast<__m512i*
>(
p), v.raw);
3256template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3259 _mm512_storeu_ph(
p, v.raw);
3263template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3265 _mm512_storeu_ps(
p, v.
raw);
3267template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3269 _mm512_storeu_pd(
p, v.
raw);
3274template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
3277 _mm512_mask_storeu_epi8(
p,
m.raw, v.raw);
3280template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3284 _mm512_mask_storeu_epi16(
reinterpret_cast<uint16_t*
>(
p),
m.raw,
3288template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3291 _mm512_mask_storeu_epi32(
p,
m.raw, v.raw);
3294template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3297 _mm512_mask_storeu_epi64(
p,
m.raw, v.raw);
3300template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3303 _mm512_mask_storeu_ps(
p,
m.raw, v.
raw);
3306template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3309 _mm512_mask_storeu_pd(
p,
m.raw, v.
raw);
3314template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3317 _mm512_stream_si512(
reinterpret_cast<__m512i*
>(aligned),
BitCast(du, v).raw);
3319template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3321 _mm512_stream_ps(aligned, v.raw);
3323template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3325 _mm512_stream_pd(aligned, v.raw);
3338 _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
3341template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3344 VFromD<RebindToSigned<D>> offset) {
3345 _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
3348template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3351 _mm512_i32scatter_ps(base, offset.
raw, v.raw, 1);
3354template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3357 _mm512_i64scatter_pd(base, offset.
raw, v.raw, 1);
3362template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3365 VFromD<RebindToSigned<D>> index) {
3366 _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
3369template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3372 VFromD<RebindToSigned<D>> index) {
3373 _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
3376template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3379 _mm512_i32scatter_ps(base, index.
raw, v.raw, 4);
3382template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3385 _mm512_i64scatter_pd(base, index.
raw, v.raw, 8);
3390template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3393 VFromD<RebindToSigned<D>> index) {
3394 _mm512_mask_i32scatter_epi32(base,
m.raw, index.raw, v.raw, 4);
3397template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3400 VFromD<RebindToSigned<D>> index) {
3401 _mm512_mask_i64scatter_epi64(base,
m.raw, index.raw, v.raw, 8);
3404template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3408 _mm512_mask_i32scatter_ps(base,
m.raw, index.
raw, v.raw, 4);
3411template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3415 _mm512_mask_i64scatter_pd(base,
m.raw, index.
raw, v.raw, 8);
3422template <
int kScale,
typename T, HWY_IF_UI32(T)>
3428template <
int kScale,
typename T, HWY_IF_UI64(T)>
3434template <
int kScale>
3440template <
int kScale>
3446template <
int kScale,
typename T, HWY_IF_UI32(T)>
3451 _mm512_mask_i32gather_epi32(no.
raw,
m.raw,
indices.raw, base, kScale)};
3454template <
int kScale,
typename T, HWY_IF_UI64(T)>
3459 _mm512_mask_i64gather_epi64(no.
raw,
m.raw,
indices.raw, base, kScale)};
3462template <
int kScale>
3468 _mm512_mask_i32gather_ps(no.
raw,
m.raw,
indices.raw, base, kScale)};
3471template <
int kScale>
3476 _mm512_mask_i64gather_pd(no.
raw,
m.raw,
indices.raw, base, kScale)};
3480template <
class D, HWY_IF_V_SIZE_D(D, 64)>
3482 VFromD<RebindToSigned<D>> offsets) {
3486 return detail::NativeGather512<1>(base, offsets);
3489template <
class D, HWY_IF_V_SIZE_D(D, 64)>
3498template <
class D, HWY_IF_V_SIZE_D(D, 64)>
3515template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3517 return VFromD<D>{_mm512_castsi512_si256(v.raw)};
3519template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
3523template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3531template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3535template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3540template <
typename T>
3548template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
3551 const Twice<
decltype(du)> dut;
3553 _mm512_extracti32x8_epi32(
BitCast(dut, v).raw, 1)});
3555template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3557 return VFromD<D>{_mm512_extractf32x8_ps(v.raw, 1)};
3559template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3561 return VFromD<D>{_mm512_extractf64x4_pd(v.raw, 1)};
3565template <
typename T>
3570#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3571 constexpr size_t kLanesPerBlock = 16 /
sizeof(T);
3572 if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
3577 alignas(64) T lanes[
Lanes(
d)];
3583template <
int kBlockIdx,
class T, hwy::EnableIf<(kBlockIdx <= 1)>* =
nullptr>
3584HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
3585 const DFromV<decltype(v)> d;
3586 const Half<decltype(d)> dh;
3587 return ExtractBlock<kBlockIdx>(LowerHalf(dh, v));
3590template <
int kBlockIdx,
class T, hwy::EnableIf<(kBlockIdx > 1)>* =
nullptr>
3592 static_assert(kBlockIdx <= 3,
"Invalid block index");
3597 _mm512_extracti32x4_epi32(
BitCast(du, v).raw, kBlockIdx)});
3600template <
int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* =
nullptr>
3602 static_assert(kBlockIdx <= 3,
"Invalid block index");
3606template <
int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* =
nullptr>
3608 static_assert(kBlockIdx <= 3,
"Invalid block index");
3613template <
typename T>
3621template <
typename T>
3625 const auto insert_mask =
FirstN(
d, 16 /
sizeof(T));
3629template <
size_t kBlockIdx,
typename T>
3636 d,
VFromD<
decltype(du)>{_mm512_inserti32x4(
3637 BitCast(du, v).raw,
BitCast(du_blk_to_insert, blk_to_insert).raw,
3638 static_cast<int>(kBlockIdx & 3))});
3641template <
size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* =
nullptr>
3646 static_cast<int>(kBlockIdx & 3))};
3649template <
size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* =
nullptr>
3654 static_cast<int>(kBlockIdx & 3))};
3659template <
int kBlockIdx,
class T>
3661 static_assert(0 <= kBlockIdx && kBlockIdx <= 3,
"Invalid block index");
3667template <
typename T>
3674template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3678 return VFromD<D>{_mm512_zextsi256_si512(lo.raw)};
3684template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3688 return VFromD<D>{_mm512_zextph256_ph512(lo.raw)};
3690 const RebindToUnsigned<D> du;
3695template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3699 return VFromD<D>{_mm512_zextps256_ps512(lo.raw)};
3704template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3708 return VFromD<D>{_mm512_zextpd256_pd512(lo.raw)};
3718template <
class DTo,
class DFrom, HWY_IF_NOT_FLOAT3264_D(DTo)>
3722 const Repartition<uint8_t,
decltype(d_from)> du8_from;
3723 const auto vu8 =
BitCast(du8_from, v);
3727 VFromD<
decltype(du_to)>{_mm512_zextsi128_si512(vu8.raw)});
3730 _mm512_inserti32x4(
Zero(du_to).raw, vu8.raw, 0)});
3734template <
class DTo,
class DFrom, HWY_IF_F32_D(DTo)>
3738 const Repartition<float,
decltype(d_from)> df32_from;
3739 const auto vf32 =
BitCast(df32_from, v);
3744 return Vec512<float>{_mm512_insertf32x4(
Zero(d_to).raw, vf32.raw, 0)};
3748template <
class DTo,
class DFrom, HWY_IF_F64_D(DTo)>
3752 const Repartition<double,
decltype(d_from)> df64_from;
3753 const auto vf64 =
BitCast(df64_from, v);
3762template <
class DTo,
class DFrom>
3766 const Twice<
decltype(d_from)> dt_from;
3775template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3778 const Half<
decltype(du)> duh;
3781 _mm512_inserti32x8(lo512,
BitCast(duh, hi).raw, 1)});
3783template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3787template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3793template <
int kBytes,
class D, HWY_IF_V_SIZE_D(D, 64)>
3795 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3796 return VFromD<D>{_mm512_bslli_epi128(v.raw, kBytes)};
3800template <
int kBytes,
class D, HWY_IF_V_SIZE_D(D, 64)>
3802 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3803 return VFromD<D>{_mm512_bsrli_epi128(v.raw, kBytes)};
3808template <
int kBytes,
class D, HWY_IF_V_SIZE_D(D, 64)>
3811 return BitCast(
d, Vec512<uint8_t>{_mm512_alignr_epi8(
3817template <
int kLane,
typename T, HWY_IF_T_SIZE(T, 2)>
3821 using VU =
VFromD<
decltype(du)>;
3823 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3825 const __m512i lo = _mm512_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
3826 return BitCast(
d, VU{_mm512_unpacklo_epi64(lo, lo)});
3829 _mm512_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
3830 return BitCast(
d, VU{_mm512_unpackhi_epi64(hi, hi)});
3834template <
int kLane,
typename T, HWY_IF_UI32(T)>
3836 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3837 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0x55 * kLane);
3838 return Vec512<T>{_mm512_shuffle_epi32(v.raw, perm)};
3841template <
int kLane,
typename T, HWY_IF_UI64(T)>
3843 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3844 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
3845 return Vec512<T>{_mm512_shuffle_epi32(v.raw, perm)};
3850 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3851 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0x55 * kLane);
3857 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3858 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0xFF * kLane);
3863template <
int kBlockIdx,
class T>
3865 static_assert(0 <= kBlockIdx && kBlockIdx <= 3,
"Invalid block index");
3869 d,
VFromD<
decltype(du)>{_mm512_shuffle_i32x4(
3873template <
int kBlockIdx>
3875 static_assert(0 <= kBlockIdx && kBlockIdx <= 3,
"Invalid block index");
3879template <
int kBlockIdx>
3881 static_assert(0 <= kBlockIdx && kBlockIdx <= 3,
"Invalid block index");
3889template <
class T, HWY_IF_T_SIZE(T, 1)>
3895template <
class T, HWY_IF_T_SIZE(T, 2)>
3904template <
class T, HWY_IF_UI32(T)>
3907 return Vec512<T>{_mm512_broadcastd_epi32(
ResizeBitCast(Full128<T>(), v).raw)};
3910template <
class T, HWY_IF_UI64(T)>
3913 return Vec512<T>{_mm512_broadcastq_epi64(
ResizeBitCast(Full128<T>(), v).raw)};
3928template <
size_t kLaneIdx,
class T, hwy::EnableIf<kLaneIdx != 0>* =
nullptr>
3931 constexpr size_t kLanesPerBlock = 16 /
sizeof(T);
3932 constexpr int kBlockIdx =
static_cast<int>(kLaneIdx / kLanesPerBlock);
3933 constexpr int kLaneInBlkIdx =
3934 static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
3935 return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));
3940template <
int kLaneIdx,
class T>
3942 static_assert(0 <= kLaneIdx,
"Invalid lane");
3956template <
typename T, HWY_IF_UI32(T)>
3958 return Vec512<T>{_mm512_shuffle_epi32(v.
raw, _MM_PERM_CDAB)};
3966template <
typename T, HWY_IF_T_SIZE(T, 4)>
3974template <
typename T, HWY_IF_T_SIZE(T, 4)>
3982template <
typename T, HWY_IF_T_SIZE(T, 4)>
4050template <
typename T>
4055template <
class D,
typename T = TFromD<D>,
typename TI>
4057 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
4058#if HWY_IS_DEBUG_BUILD
4059 const DFromV<
decltype(vec)> di;
4062 const auto vec_u =
BitCast(du, vec);
4064 AllTrue(du,
Lt(vec_u,
Set(du,
static_cast<TU
>(128 /
sizeof(T))))));
4069template <
class D, HWY_IF_V_SIZE_D(D, 64),
typename TI>
4071 const Rebind<TI,
decltype(
d)> di;
4075template <
typename T, HWY_IF_T_SIZE(T, 1)>
4077#if HWY_TARGET <= HWY_AVX3_DL
4084 const auto bd_sel_mask =
4086 const auto cd_sel_mask =
4096 const Vec512<T> shuf_ab{_mm512_mask_shuffle_epi8(shuf_a.raw, bd_sel_mask.raw,
4097 v_b.raw, idx_vec.raw)};
4098 const Vec512<T> shuf_cd{_mm512_mask_shuffle_epi8(shuf_c.raw, bd_sel_mask.raw,
4099 v_d.raw, idx_vec.raw)};
4100 return IfThenElse(cd_sel_mask, shuf_cd, shuf_ab);
4104template <
typename T, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_SPECIAL_FLOAT(T)>
4110 Indices512<float16_t> idx) {
4111 return Vec512<float16_t>{_mm512_permutexvar_ph(idx.raw, v.raw)};
4114template <
typename T, HWY_IF_T_SIZE(T, 4)>
4116 return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
4119template <
typename T, HWY_IF_T_SIZE(T, 8)>
4121 return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
4133template <
typename T, HWY_IF_T_SIZE(T, 1)>
4136#if HWY_TARGET <= HWY_AVX3_DL
4140 const auto b_sel_mask =
4147template <
typename T, HWY_IF_T_SIZE(T, 2)>
4149 Indices512<T> idx) {
4150 return Vec512<T>{_mm512_permutex2var_epi16(a.raw, idx.raw, b.raw)};
4153template <
typename T, HWY_IF_UI32(T)>
4155 Indices512<T> idx) {
4156 return Vec512<T>{_mm512_permutex2var_epi32(a.raw, idx.raw, b.raw)};
4161 Vec512<float16_t> b,
4162 Indices512<float16_t> idx) {
4163 return Vec512<float16_t>{_mm512_permutex2var_ph(a.raw, idx.raw, b.raw)};
4171template <
typename T, HWY_IF_UI64(T)>
4173 Indices512<T> idx) {
4174 return Vec512<T>{_mm512_permutex2var_epi64(a.raw, idx.raw, b.raw)};
4184template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4186#if HWY_TARGET <= HWY_AVX3_DL
4188 alignas(64)
static constexpr int8_t kReverse[64] = {
4189 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
4190 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
4191 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
4192 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4193 const Vec512<int8_t> idx =
Load(di, kReverse);
4195 d, Vec512<int8_t>{_mm512_permutexvar_epi8(idx.raw,
BitCast(di, v).raw)});
4202template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4205 alignas(64)
static constexpr int16_t kReverse[32] = {
4206 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
4207 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4208 const Vec512<int16_t> idx =
Load(di, kReverse);
4210 _mm512_permutexvar_epi16(idx.raw,
BitCast(di, v).raw)});
4213template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
4215 alignas(64)
static constexpr int32_t kReverse[16] = {
4216 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4220template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
4222 alignas(64)
static constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4230template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4233 alignas(64)
static constexpr int16_t kReverse4[32] = {
4234 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
4235 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
4236 const Vec512<int16_t> idx =
Load(di, kReverse4);
4238 _mm512_permutexvar_epi16(idx.raw,
BitCast(di, v).raw)});
4243template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4245 return VFromD<D>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
4247template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4249 return VFromD<D>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
4254template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4257 alignas(64)
static constexpr int16_t kReverse8[32] = {
4258 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
4259 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
4260 const Vec512<int16_t> idx =
Load(di, kReverse8);
4262 _mm512_permutexvar_epi16(idx.raw,
BitCast(di, v).raw)});
4265template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
4268 alignas(64)
static constexpr int32_t kReverse8[16] = {
4269 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
4270 const Vec512<int32_t> idx =
Load(di, kReverse8);
4272 _mm512_permutexvar_epi32(idx.raw,
BitCast(di, v).raw)});
4275template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
4282#if HWY_TARGET <= HWY_AVX3_DL
4284#ifdef HWY_NATIVE_REVERSE_BITS_UI8
4285#undef HWY_NATIVE_REVERSE_BITS_UI8
4287#define HWY_NATIVE_REVERSE_BITS_UI8
4291template <
class V, HWY_IF_T_SIZE_V(V, 1)>
4293 const Repartition<uint64_t, DFromV<V>> du64;
4301template <
typename T, HWY_IF_T_SIZE(T, 1)>
4305template <
typename T, HWY_IF_T_SIZE(T, 2)>
4309 using VU =
VFromD<
decltype(du)>;
4311 d, VU{_mm512_unpacklo_epi16(
BitCast(du, a).raw,
BitCast(du, b).raw)});
4313template <
typename T, HWY_IF_T_SIZE(T, 4)>
4315 return Vec512<T>{_mm512_unpacklo_epi32(a.raw, b.raw)};
4317template <
typename T, HWY_IF_T_SIZE(T, 8)>
4319 return Vec512<T>{_mm512_unpacklo_epi64(a.raw, b.raw)};
4330template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4332 return VFromD<D>{_mm512_unpackhi_epi8(a.raw, b.raw)};
4334template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4337 using VU =
VFromD<
decltype(du)>;
4339 d, VU{_mm512_unpackhi_epi16(
BitCast(du, a).raw,
BitCast(du, b).raw)});
4341template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4343 return VFromD<D>{_mm512_unpackhi_epi32(a.raw, b.raw)};
4345template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4347 return VFromD<D>{_mm512_unpackhi_epi64(a.raw, b.raw)};
4350template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4352 return VFromD<D>{_mm512_unpackhi_ps(a.raw, b.raw)};
4354template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4356 return VFromD<D>{_mm512_unpackhi_pd(a.raw, b.raw)};
4362template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4366 VFromD<
decltype(du)>{_mm512_shuffle_i32x4(
4369template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4371 return VFromD<D>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
4373template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4380template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4384 VFromD<
decltype(du)>{_mm512_shuffle_i32x4(
4387template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4389 return VFromD<D>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
4391template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4398template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4402 VFromD<
decltype(du)>{_mm512_shuffle_i32x4(
4405template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4407 return VFromD<D>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
4409template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4416template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4420 const __mmask32 mask = (0x0000FFFF);
4425template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4427 const __mmask16 mask = (0x00FF);
4428 return VFromD<D>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
4430template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4433 const __mmask8 mask = (0x0F);
4439template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4442#if HWY_TARGET <= HWY_AVX3_DL
4443 alignas(64)
static constexpr uint8_t kIdx[64] = {
4444 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
4445 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
4446 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
4447 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
4448 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
4450 d, Vec512<uint8_t>{_mm512_permutex2var_epi8(
4455 const Vec512<uint16_t> uH = ShiftRight<8>(
BitCast(dw, hi));
4456 const Vec512<uint16_t> uL = ShiftRight<8>(
BitCast(dw, lo));
4457 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
4459 const Full512<uint64_t> du64;
4460 alignas(64)
static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
4465template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4468 alignas(64)
static constexpr uint16_t kIdx[32] = {
4469 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
4470 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
4472 d, Vec512<uint16_t>{_mm512_permutex2var_epi16(
4476template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4479 alignas(64)
static constexpr uint32_t kIdx[16] = {
4480 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
4482 d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
4486template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4489 alignas(64)
static constexpr uint32_t kIdx[16] = {
4490 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
4491 return VFromD<D>{_mm512_permutex2var_ps(lo.raw,
Load(du, kIdx).raw, hi.raw)};
4494template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4497 alignas(64)
static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4499 d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
4503template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4506 alignas(64)
static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4507 return VFromD<D>{_mm512_permutex2var_pd(lo.raw,
Load(du, kIdx).raw, hi.raw)};
4512template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4515#if HWY_TARGET <= HWY_AVX3_DL
4516 alignas(64)
static constexpr uint8_t kIdx[64] = {
4517 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
4518 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
4519 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
4520 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
4521 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
4523 d, Vec512<uint32_t>{_mm512_permutex2var_epi8(
4528 const Vec512<uint16_t> mask =
Set(dw, 0x00FF);
4529 const Vec512<uint16_t> uH =
And(
BitCast(dw, hi), mask);
4530 const Vec512<uint16_t> uL =
And(
BitCast(dw, lo), mask);
4531 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
4533 const Full512<uint64_t> du64;
4534 alignas(64)
static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
4539template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4542 alignas(64)
static constexpr uint16_t kIdx[32] = {
4543 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
4544 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
4546 d, Vec512<uint32_t>{_mm512_permutex2var_epi16(
4550template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4553 alignas(64)
static constexpr uint32_t kIdx[16] = {
4554 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
4556 d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
4560template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4563 alignas(64)
static constexpr uint32_t kIdx[16] = {
4564 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
4565 return VFromD<D>{_mm512_permutex2var_ps(lo.raw,
Load(du, kIdx).raw, hi.raw)};
4568template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4571 alignas(64)
static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4573 d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
4577template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4580 alignas(64)
static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4581 return VFromD<D>{_mm512_permutex2var_pd(lo.raw,
Load(du, kIdx).raw, hi.raw)};
4586template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4588#if HWY_TARGET <= HWY_AVX3_DL
4590 alignas(64)
static constexpr uint8_t kIdx[64] = {
4591 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71,
4592 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
4593 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
4594 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95};
4595 return VFromD<D>{_mm512_permutex2var_epi8(a.raw,
Load(du, kIdx).raw, b.raw)};
4597 alignas(64)
static constexpr uint64_t kIdx2[8] = {0, 1, 8, 9, 2, 3, 10, 11};
4600 Load(du64, kIdx2).raw,
4605template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4608 alignas(64)
static constexpr uint16_t kIdx[32] = {
4609 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
4610 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
4612 d,
VFromD<
decltype(du)>{_mm512_permutex2var_epi16(
4616template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4619 alignas(64)
static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
4620 4, 20, 5, 21, 6, 22, 7, 23};
4621 return VFromD<D>{_mm512_permutex2var_epi32(a.raw,
Load(du, kIdx).raw, b.raw)};
4624template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4627 alignas(64)
static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
4628 4, 20, 5, 21, 6, 22, 7, 23};
4629 return VFromD<D>{_mm512_permutex2var_ps(a.raw,
Load(du, kIdx).raw, b.raw)};
4632template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4635 alignas(64)
static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4636 return VFromD<D>{_mm512_permutex2var_epi64(a.raw,
Load(du, kIdx).raw, b.raw)};
4639template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4642 alignas(64)
static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4643 return VFromD<D>{_mm512_permutex2var_pd(a.raw,
Load(du, kIdx).raw, b.raw)};
4648template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4650#if HWY_TARGET <= HWY_AVX3_DL
4652 alignas(64)
static constexpr uint8_t kIdx[64] = {
4653 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103,
4654 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
4655 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
4656 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127};
4657 return VFromD<D>{_mm512_permutex2var_epi8(a.raw,
Load(du, kIdx).raw, b.raw)};
4659 alignas(64)
static constexpr uint64_t kIdx2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
4662 Load(du64, kIdx2).raw,
4667template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4670 alignas(64)
static constexpr uint16_t kIdx[32] = {
4671 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
4672 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
4674 d,
VFromD<
decltype(du)>{_mm512_permutex2var_epi16(
4678template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4681 alignas(64)
static constexpr uint32_t kIdx[16] = {
4682 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4683 return VFromD<D>{_mm512_permutex2var_epi32(a.raw,
Load(du, kIdx).raw, b.raw)};
4686template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4689 alignas(64)
static constexpr uint32_t kIdx[16] = {
4690 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4691 return VFromD<D>{_mm512_permutex2var_ps(a.raw,
Load(du, kIdx).raw, b.raw)};
4694template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4697 alignas(64)
static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
4698 return VFromD<D>{_mm512_permutex2var_epi64(a.raw,
Load(du, kIdx).raw, b.raw)};
4701template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4704 alignas(64)
static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
4705 return VFromD<D>{_mm512_permutex2var_pd(a.raw,
Load(du, kIdx).raw, b.raw)};
4710template <
typename T, HWY_IF_T_SIZE(T, 4)>
4712 return Vec512<T>{_mm512_shuffle_epi32(v.
raw, _MM_PERM_CCAA)};
4718template <
typename T, HWY_IF_T_SIZE(T, 8)>
4726template <
typename T, HWY_IF_T_SIZE(T, 4)>
4728 return Vec512<T>{_mm512_shuffle_epi32(v.
raw, _MM_PERM_DDBB)};
4734template <
typename T, HWY_IF_T_SIZE(T, 8)>
4742template <
typename T>
4744 constexpr size_t s =
sizeof(T);
4745 constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
4751template <
class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4753 return VFromD<D>{_mm512_mask_shuffle_epi32(
4754 a.raw,
static_cast<__mmask16
>(0xAAAA), b.raw,
4755 static_cast<_MM_PERM_ENUM
>(_MM_SHUFFLE(2, 2, 0, 0)))};
4757template <
class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4759 return VFromD<D>{_mm512_mask_shuffle_ps(a.raw,
static_cast<__mmask16
>(0xAAAA),
4761 _MM_SHUFFLE(2, 2, 0, 0))};
4765template <
class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4767 return VFromD<D>{_mm512_mask_shuffle_epi32(
4768 b.raw,
static_cast<__mmask16
>(0x5555), a.raw,
4769 static_cast<_MM_PERM_ENUM
>(_MM_SHUFFLE(3, 3, 1, 1)))};
4771template <
class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4773 return VFromD<D>{_mm512_mask_shuffle_ps(b.raw,
static_cast<__mmask16
>(0x5555),
4775 _MM_SHUFFLE(3, 3, 1, 1))};
4780template <
typename T>
4782 const DFromV<
decltype(odd)>
d;
4785 d,
VFromD<
decltype(du)>{_mm512_mask_blend_epi64(
4791 _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.
raw, even.
raw)};
4796 _mm512_mask_blend_pd(__mmask8{0x33u}, odd.
raw, even.
raw)};
4801template <
typename T>
4806 VFromD<
decltype(du)>{_mm512_shuffle_i32x4(
4820template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4824 VFromD<
decltype(du)>{_mm512_shuffle_i32x4(
4827template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4829 return VFromD<D>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
4831template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4833 return VFromD<D>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
4839template <
typename T,
typename TI>
4848template <
typename T,
typename TI,
size_t NI>
4851 const Half<
decltype(d512)> d256;
4852 const Half<
decltype(d256)> d128;
4855 const auto from_512 =
4861template <
typename T,
typename TI>
4863 const DFromV<
decltype(from)> dih;
4864 const Twice<
decltype(dih)> di;
4870template <
typename T,
size_t N,
typename TI>
4872 const DFromV<
decltype(from)> d512;
4873 const Half<
decltype(d512)> d256;
4874 const Half<
decltype(d256)> d128;
4877 const auto bytes_512 =
4881template <
typename T,
typename TI>
4891template <
int kLane,
class T, HWY_IF_T_SIZE(T, 1)>
4893 static_assert(0 <= kLane && kLane < 16,
"Invalid lane");
4901template <
class D, HWY_IF_V_SIZE_D(D, 64)>
4905 const uint32_t x0) {
4906 return BitCast(
d, Vec512<uint32_t>{_mm512_set_epi32(
4907 static_cast<int32_t
>(x3),
static_cast<int32_t
>(x2),
4908 static_cast<int32_t
>(x1),
static_cast<int32_t
>(x0),
4909 static_cast<int32_t
>(x3),
static_cast<int32_t
>(x2),
4910 static_cast<int32_t
>(x1),
static_cast<int32_t
>(x0),
4911 static_cast<int32_t
>(x3),
static_cast<int32_t
>(x2),
4912 static_cast<int32_t
>(x1),
static_cast<int32_t
>(x0),
4913 static_cast<int32_t
>(x3),
static_cast<int32_t
>(x2),
4914 static_cast<int32_t
>(x1),
static_cast<int32_t
>(x0))});
4917template <
size_t kIdx3210,
class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
4922 _mm512_shuffle_epi32(v.raw,
static_cast<_MM_PERM_ENUM
>(kIdx3210 & 0xFF))};
4925template <
size_t kIdx3210,
class V, HWY_IF_FLOAT(TFromV<V>)>
4929 return V{_mm512_shuffle_ps(v.raw, v.raw,
static_cast<int>(kIdx3210 & 0xFF))};
4932template <
size_t kIdx3210,
class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
4936 return V{_mm512_permutex_epi64(v.raw,
static_cast<int>(kIdx3210 & 0xFF))};
4939template <
size_t kIdx3210,
class V, HWY_IF_FLOAT(TFromV<V>)>
4943 return V{_mm512_permutex_pd(v.raw,
static_cast<int>(kIdx3210 & 0xFF))};
4952template <
int kI32Lanes,
class V, HWY_IF_V_SIZE_V(V, 64)>
4957 Vec512<uint32_t>{_mm512_alignr_epi32(
4961template <
int kI64Lanes,
class V, HWY_IF_V_SIZE_V(V, 64)>
4966 Vec512<uint64_t>{_mm512_alignr_epi64(
4970template <
int kI32Lanes,
class V, HWY_IF_V_SIZE_V(V, 64)>
4972 static_assert(0 <= kI32Lanes && kI32Lanes <= 15,
4973 "kI32Lanes must be between 0 and 15");
4978template <
int kI64Lanes,
class V, HWY_IF_V_SIZE_V(V, 64)>
4980 static_assert(0 <= kI64Lanes && kI64Lanes <= 7,
4981 "kI64Lanes must be between 0 and 7");
4986template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4990#if HWY_TARGET <= HWY_AVX3_DL
4991 const auto byte_idx =
Iota(du8,
static_cast<uint8_t
>(
size_t{0} - amt));
4996 const auto byte_idx =
Iota(du8,
static_cast<uint8_t
>(
size_t{0} - (amt & 15)));
4997 const auto blk_u64_idx =
4998 Iota(du64,
static_cast<uint64_t
>(uint64_t{0} - ((amt >> 4) << 1)));
5001 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
5003 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(3, 1, 1, 3))};
5004 const auto odd_sel_mask =
5006 const auto even_blk_lookup_result =
5008 const VFromD<D> blockwise_slide_up_result{
5009 _mm512_mask_shuffle_epi8(even_blk_lookup_result.raw, odd_sel_mask.raw,
5010 odd_blocks.raw, byte_idx.raw)};
5012 BitCast(du64, blockwise_slide_up_result),
Zero(du64),
5013 Indices512<uint64_t>{blk_u64_idx.raw}));
5019template <
int kBlocks,
class D, HWY_IF_V_SIZE_D(D, 64)>
5021 static_assert(0 <= kBlocks && kBlocks <= 3,
5022 "kBlocks must be between 0 and 3");
5027 return detail::SlideUpI64Lanes<2>(v);
5031 return detail::SlideUpI64Lanes<6>(v);
5037template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5039#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5040 if (__builtin_constant_p(amt)) {
5045 return detail::SlideUpI32Lanes<1>(v);
5047 return detail::SlideUpI64Lanes<1>(v);
5049 return detail::SlideUpI32Lanes<3>(v);
5051 return detail::SlideUpI64Lanes<2>(v);
5053 return detail::SlideUpI32Lanes<5>(v);
5055 return detail::SlideUpI64Lanes<3>(v);
5057 return detail::SlideUpI32Lanes<7>(v);
5061 return detail::SlideUpI32Lanes<9>(v);
5063 return detail::SlideUpI64Lanes<5>(v);
5065 return detail::SlideUpI32Lanes<11>(v);
5067 return detail::SlideUpI64Lanes<6>(v);
5069 return detail::SlideUpI32Lanes<13>(v);
5071 return detail::SlideUpI64Lanes<7>(v);
5073 return detail::SlideUpI32Lanes<15>(v);
5081template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5083#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5084 if (__builtin_constant_p(amt)) {
5089 return detail::SlideUpI64Lanes<1>(v);
5091 return detail::SlideUpI64Lanes<2>(v);
5093 return detail::SlideUpI64Lanes<3>(v);
5097 return detail::SlideUpI64Lanes<5>(v);
5099 return detail::SlideUpI64Lanes<6>(v);
5101 return detail::SlideUpI64Lanes<7>(v);
5109template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5111#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5112 if (__builtin_constant_p(amt)) {
5113 if ((amt & 3) == 0) {
5116 }
else if ((amt & 1) == 0) {
5121#if HWY_TARGET > HWY_AVX3_DL
5122 else if (amt <= 63) {
5124 const size_t blk_u64_slideup_amt = (amt >> 4) << 1;
5125 const auto vu64 =
BitCast(du64, v);
5129 (blk_u64_slideup_amt <= 4)
5134 return CombineShiftRightBytes<15>(
d, v_hi, v_lo);
5136 return CombineShiftRightBytes<13>(
d, v_hi, v_lo);
5138 return CombineShiftRightBytes<11>(
d, v_hi, v_lo);
5140 return CombineShiftRightBytes<9>(
d, v_hi, v_lo);
5142 return CombineShiftRightBytes<7>(
d, v_hi, v_lo);
5144 return CombineShiftRightBytes<5>(
d, v_hi, v_lo);
5146 return CombineShiftRightBytes<3>(
d, v_hi, v_lo);
5148 return CombineShiftRightBytes<1>(
d, v_hi, v_lo);
5158template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5160#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5161 if (__builtin_constant_p(amt) && (amt & 1) == 0) {
5172template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5174#if HWY_TARGET <= HWY_AVX3_DL
5177 const auto v_lo = detail::SlideUpI64Lanes<2>(v);
5178 return CombineShiftRightBytes<15>(
d, v, v_lo);
5182template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5187template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5189 return detail::SlideUpI32Lanes<1>(v);
5192template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5194 return detail::SlideUpI64Lanes<1>(v);
5201template <
int kI32Lanes,
class V, HWY_IF_V_SIZE_V(V, 64)>
5203 static_assert(0 <= kI32Lanes && kI32Lanes <= 15,
5204 "kI32Lanes must be between 0 and 15");
5206 return CombineShiftRightI32Lanes<kI32Lanes>(
Zero(
d), v);
5209template <
int kI64Lanes,
class V, HWY_IF_V_SIZE_V(V, 64)>
5211 static_assert(0 <= kI64Lanes && kI64Lanes <= 7,
5212 "kI64Lanes must be between 0 and 7");
5214 return CombineShiftRightI64Lanes<kI64Lanes>(
Zero(
d), v);
5217template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5221#if HWY_TARGET <= HWY_AVX3_DL
5222 auto byte_idx =
Iota(du8,
static_cast<uint8_t
>(amt));
5227 const auto byte_idx =
Iota(du8,
static_cast<uint8_t
>(amt & 15));
5228 const auto blk_u64_idx =
Iota(du64,
static_cast<uint64_t
>(((amt >> 4) << 1)));
5231 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(0, 2, 2, 0))};
5233 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
5234 const auto odd_sel_mask =
5237 _mm512_maskz_shuffle_epi8(
static_cast<__mmask64
>(0x0000FFFFFFFFFFFFULL),
5238 even_blocks.raw, byte_idx.raw)};
5239 const VFromD<D> blockwise_slide_up_result{
5240 _mm512_mask_shuffle_epi8(even_blk_lookup_result.raw, odd_sel_mask.raw,
5241 odd_blocks.raw, byte_idx.raw)};
5243 BitCast(du64, blockwise_slide_up_result),
Zero(du64),
5244 Indices512<uint64_t>{blk_u64_idx.raw}));
5250template <
int kBlocks,
class D, HWY_IF_V_SIZE_D(D, 64)>
5252 static_assert(0 <= kBlocks && kBlocks <= 3,
5253 "kBlocks must be between 0 and 3");
5254 const Half<
decltype(
d)> dh;
5259 return detail::SlideDownI64Lanes<2>(v);
5263 return detail::SlideDownI64Lanes<6>(v);
5269template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5271#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5272 if (__builtin_constant_p(amt)) {
5273 const Half<
decltype(
d)> dh;
5276 return detail::SlideDownI32Lanes<1>(v);
5278 return detail::SlideDownI64Lanes<1>(v);
5280 return detail::SlideDownI32Lanes<3>(v);
5282 return detail::SlideDownI64Lanes<2>(v);
5284 return detail::SlideDownI32Lanes<5>(v);
5286 return detail::SlideDownI64Lanes<3>(v);
5288 return detail::SlideDownI32Lanes<7>(v);
5292 return detail::SlideDownI32Lanes<9>(v);
5294 return detail::SlideDownI64Lanes<5>(v);
5296 return detail::SlideDownI32Lanes<11>(v);
5298 return detail::SlideDownI64Lanes<6>(v);
5300 return detail::SlideDownI32Lanes<13>(v);
5302 return detail::SlideDownI64Lanes<7>(v);
5304 return detail::SlideDownI32Lanes<15>(v);
5312template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5314#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5315 if (__builtin_constant_p(amt)) {
5316 const Half<
decltype(
d)> dh;
5321 return detail::SlideDownI64Lanes<1>(v);
5323 return detail::SlideDownI64Lanes<2>(v);
5325 return detail::SlideDownI64Lanes<3>(v);
5329 return detail::SlideDownI64Lanes<5>(v);
5331 return detail::SlideDownI64Lanes<6>(v);
5333 return detail::SlideDownI64Lanes<7>(v);
5341template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5343#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5344 if (__builtin_constant_p(amt)) {
5345 if ((amt & 3) == 0) {
5348 }
else if ((amt & 1) == 0) {
5351 du16,
BitCast(du16, v), amt >> 1));
5353#if HWY_TARGET > HWY_AVX3_DL
5354 else if (amt <= 63) {
5356 const size_t blk_u64_slidedown_amt = (amt >> 4) << 1;
5357 const auto vu64 =
BitCast(du64, v);
5361 (blk_u64_slidedown_amt <= 4)
5367 return CombineShiftRightBytes<1>(
d, v_hi, v_lo);
5369 return CombineShiftRightBytes<3>(
d, v_hi, v_lo);
5371 return CombineShiftRightBytes<5>(
d, v_hi, v_lo);
5373 return CombineShiftRightBytes<7>(
d, v_hi, v_lo);
5375 return CombineShiftRightBytes<9>(
d, v_hi, v_lo);
5377 return CombineShiftRightBytes<11>(
d, v_hi, v_lo);
5379 return CombineShiftRightBytes<13>(
d, v_hi, v_lo);
5381 return CombineShiftRightBytes<15>(
d, v_hi, v_lo);
5391template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5393#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
5394 if (__builtin_constant_p(amt) && (amt & 1) == 0) {
5405template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5407#if HWY_TARGET <= HWY_AVX3_DL
5410 const auto v_hi = detail::SlideDownI64Lanes<2>(v);
5411 return CombineShiftRightBytes<1>(
d, v_hi, v);
5415template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5420template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5422 return detail::SlideDownI32Lanes<1>(v);
5425template <
typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5427 return detail::SlideDownI64Lanes<1>(v);
5437template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5441template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5443 return VFromD<D>{_mm512_cvtepu8_epi32(v.raw)};
5445template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5449template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5453template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5455 return VFromD<D>{_mm512_cvtepu16_epi64(v.raw)};
5457template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5466template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5470template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
5472 return VFromD<D>{_mm512_cvtepi8_epi32(v.raw)};
5474template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
5478template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5482template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5484 return VFromD<D>{_mm512_cvtepi16_epi64(v.raw)};
5486template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5492template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5504template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5506 return VFromD<D>{_mm512_cvtph_pd(v.raw)};
5511template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5513 const Rebind<uint16_t,
decltype(df32)> du16;
5518template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5523template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5525 return VFromD<D>{_mm512_cvtepi32_pd(v.raw)};
5528template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5530 return VFromD<D>{_mm512_cvtepu32_pd(v.raw)};
5533template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5535 return VFromD<D>{_mm512_cvttps_epi64(v.raw)};
5537template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5539 return VFromD<D>{_mm512_cvttps_epu64(v.raw)};
5541template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5548template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
5554 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5555 const auto idx64 =
Load(du64, kLanes);
5560template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
5567template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
5569 const Full512<uint64_t> du64;
5570 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
5573 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5574 const auto idx64 =
Load(du64, kLanes);
5575 const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
5579template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5581 const Full512<uint32_t> du32;
5582 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
5583 const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
5586 const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
5590template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5592 return VFromD<D>{_mm512_cvtusepi32_epi8(v.raw)};
5595template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
5601 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5602 const auto idx64 =
Load(du64, kLanes);
5603 const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
5607template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
5614template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
5616 const Full512<uint32_t> du32;
5617 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
5618 const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
5621 const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
5625template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
5627 const Full512<uint64_t> du64;
5628 const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)};
5631 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5632 const auto idx64 =
Load(du64, kLanes);
5633 const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
5637template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5641template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
5643 return VFromD<D>{_mm512_cvtsepi64_epi16(v.raw)};
5645template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
5647 return VFromD<D>{_mm512_cvtsepi64_epi8(v.raw)};
5650template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5653 return VFromD<D>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
5655template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
5658 return VFromD<D>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
5660template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
5663 return VFromD<D>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
5666template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5670template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
5672 return VFromD<D>{_mm512_cvtusepi64_epi16(v.raw)};
5674template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
5676 return VFromD<D>{_mm512_cvtusepi64_epi8(v.raw)};
5679template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
5686 df16,
VFromD<
decltype(du16)>{_mm512_cvtps_ph(v.
raw, _MM_FROUND_NO_EXC)});
5691template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
5693 return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
5697#if HWY_AVX3_HAVE_F32_TO_BF16C
5698template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
5700#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5703 __asm__(
"vcvtneps2bf16 %1, %0" :
"=v"(raw_result) :
"v"(v.raw));
5712template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
5715#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5718 __asm__(
"vcvtne2ps2bf16 %2, %1, %0"
5720 :
"v"(b.raw),
"v"(a.raw));
5730template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5736template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5738 Vec512<int32_t> b) {
5739 return VFromD<D>{_mm512_packus_epi32(a.raw, b.raw)};
5742template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5745 const DFromV<
decltype(a)> du32;
5747 const auto max_i32 =
Set(du32, 0x7FFFFFFFu);
5753template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I8_D(D)>
5759template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
5761 Vec512<int16_t> b) {
5762 return VFromD<D>{_mm512_packus_epi16(a.raw, b.raw)};
5765template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
5768 const DFromV<
decltype(a)> du16;
5770 const auto max_i16 =
Set(du16, 0x7FFFu);
5776template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
5778 const Half<
decltype(dn)> dnh;
5782template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5785 const Half<
decltype(dn)> dnh;
5789template <
class D,
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
5790 HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5791 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
5792 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
5793 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
5794HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
5795 const Full512<u
int64_t> du64;
5796 alignas(64) static constexpr u
int64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
5797 return BitCast(d, TableLookupLanes(BitCast(du64, ReorderDemote2To(d, a, b)),
5798 SetTableIndices(du64, kIdx)));
5801template <
class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
5802 HWY_IF_V_SIZE_GT_D(D, 16),
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5803 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
5804 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
5805 HWY_IF_T_SIZE_V(V, 8)>
5810template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5815template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5820template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5822 return VFromD<D>{_mm512_cvttpd_epu32(v.raw)};
5825template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5830template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5832 return VFromD<D>{_mm512_cvtepi64_ps(v.raw)};
5835template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5837 return VFromD<D>{_mm512_cvtepu64_ps(v.raw)};
5842 const DFromV<
decltype(v)> d32;
5845 const VFromD<
decltype(d32)> v8From32 =
5850 const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
5856template <
class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
5858#if HWY_TARGET <= HWY_AVX3_DL
5862 d8, 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56);
5867 alignas(64)
static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
5868 0, 2, 4, 6, 8, 10, 12, 14};
5870 _mm512_permutexvar_epi32(
Load(d32, kEven).raw, v.
raw)};
5875template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
5877 const Full512<uint16_t> d16;
5878 alignas(16)
static constexpr uint16_t k16From64[8] = {0, 4, 8, 12,
5880 const Vec512<uint16_t> bytes{
5881 _mm512_permutexvar_epi16(
LoadDup128(d16, k16From64).raw, v.raw)};
5885template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5887 const Full512<uint32_t> d32;
5888 alignas(64)
static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
5889 0, 2, 4, 6, 8, 10, 12, 14};
5890 const Vec512<uint32_t> even{
5891 _mm512_permutexvar_epi32(
Load(d32, kEven).raw, v.raw)};
5895template <
class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5897#if HWY_TARGET <= HWY_AVX3_DL
5900 d8, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
5906 const VFromD<
decltype(d32)> v8From32 =
5911 const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
5916template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
5918 const Full512<uint16_t> d16;
5919 alignas(64)
static constexpr uint16_t k16From32[32] = {
5920 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5921 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
5922 const Vec512<uint16_t> bytes{
5923 _mm512_permutexvar_epi16(
Load(d16, k16From32).raw, v.raw)};
5927template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
5929#if HWY_TARGET <= HWY_AVX3_DL
5931 alignas(64)
static constexpr uint8_t k8From16[64] = {
5932 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5933 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
5934 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5935 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
5937 _mm512_permutexvar_epi8(
Load(d8, k8From16).raw, v.
raw)};
5941 d32, 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u);
5943 alignas(64)
static constexpr uint32_t kIndex32[16] = {
5944 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
5946 _mm512_permutexvar_epi32(
Load(d32, kIndex32).raw, quads.raw)};
5954template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
5956 return VFromD<D>{_mm512_cvtepu16_ph(v.raw)};
5958template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
5960 return VFromD<D>{_mm512_cvtepi16_ph(v.raw)};
5964template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5969template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5974template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5979template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5986template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5988 return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
5990template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5992 return VFromD<D>{_mm512_cvttph_epu16(v.raw)};
5994template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5999template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
6003template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
6007template <
class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
6009 return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
6011template <
class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
6015template <
class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
6017 return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
6019template <
class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
6032#if !defined(HWY_DISABLE_PCLMUL_AES)
6036#if HWY_TARGET <= HWY_AVX3_DL
6039 const DFromV<
decltype(state)>
d;
6040 const Half<
decltype(
d)> d2;
6048#if HWY_TARGET <= HWY_AVX3_DL
6051 const DFromV<
decltype(state)>
d;
6052 const Half<
decltype(
d)> d2;
6061#if HWY_TARGET <= HWY_AVX3_DL
6065 const Half<
decltype(
d)> d2;
6073#if HWY_TARGET <= HWY_AVX3_DL
6077 const Half<
decltype(
d)> d2;
6084template <u
int8_t kRcon>
6087#if HWY_TARGET <= HWY_AVX3_DL
6089 d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
6091 d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
6094 const auto sub_word_result =
AESLastRound(w13, rconXorMask);
6097 const Half<
decltype(
d)> d2;
6104#if HWY_TARGET <= HWY_AVX3_DL
6107 alignas(64) uint64_t a[8];
6108 alignas(64) uint64_t b[8];
6113 for (
size_t i = 0; i < 8; i += 2) {
6115 Store(mul, d128, a + i);
6122#if HWY_TARGET <= HWY_AVX3_DL
6125 alignas(64) uint64_t a[8];
6126 alignas(64) uint64_t b[8];
6131 for (
size_t i = 0; i < 8; i += 2) {
6133 Store(mul, d128, a + i);
6146template <
int kAOffset,
int kBOffset>
6149 static_assert(0 <= kAOffset && kAOffset <= 1,
6150 "kAOffset must be between 0 and 1");
6151 static_assert(0 <= kBOffset && kBOffset <= 3,
6152 "kBOffset must be between 0 and 3");
6170 const auto sum = a + b;
6173 const auto i32_max =
Set(
d, LimitsMax<int32_t>());
6175 i32_max.raw,
MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
6176 return IfThenElse(overflow_mask, overflow_result, sum);
6181 const auto sum = a + b;
6184 const auto i64_max =
Set(
d, LimitsMax<int64_t>());
6186 i64_max.raw,
MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
6187 return IfThenElse(overflow_mask, overflow_result, sum);
6194 const auto diff = a - b;
6197 const auto i32_max =
Set(
d, LimitsMax<int32_t>());
6199 i32_max.raw,
MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
6200 return IfThenElse(overflow_mask, overflow_result, diff);
6205 const auto diff = a - b;
6208 const auto i64_max =
Set(
d, LimitsMax<int64_t>());
6210 i64_max.raw,
MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
6211 return IfThenElse(overflow_mask, overflow_result, diff);
6221template <
typename T>
6223#if HWY_COMPILER_HAS_MASK_INTRINSICS
6224 return _kortestz_mask64_u8(mask.
raw, mask.
raw);
6226 return mask.
raw == 0;
6229template <
typename T>
6231#if HWY_COMPILER_HAS_MASK_INTRINSICS
6232 return _kortestz_mask32_u8(mask.
raw, mask.
raw);
6234 return mask.
raw == 0;
6237template <
typename T>
6239#if HWY_COMPILER_HAS_MASK_INTRINSICS
6240 return _kortestz_mask16_u8(mask.
raw, mask.
raw);
6242 return mask.
raw == 0;
6245template <
typename T>
6247#if HWY_COMPILER_HAS_MASK_INTRINSICS
6248 return _kortestz_mask8_u8(mask.
raw, mask.
raw);
6250 return mask.
raw == 0;
6256template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6263template <
typename T>
6265#if HWY_COMPILER_HAS_MASK_INTRINSICS
6266 return _kortestc_mask64_u8(mask.
raw, mask.
raw);
6268 return mask.
raw == 0xFFFFFFFFFFFFFFFFull;
6271template <
typename T>
6273#if HWY_COMPILER_HAS_MASK_INTRINSICS
6274 return _kortestc_mask32_u8(mask.
raw, mask.
raw);
6276 return mask.
raw == 0xFFFFFFFFull;
6279template <
typename T>
6281#if HWY_COMPILER_HAS_MASK_INTRINSICS
6282 return _kortestc_mask16_u8(mask.
raw, mask.
raw);
6284 return mask.
raw == 0xFFFFull;
6287template <
typename T>
6289#if HWY_COMPILER_HAS_MASK_INTRINSICS
6290 return _kortestc_mask8_u8(mask.
raw, mask.
raw);
6292 return mask.
raw == 0xFFull;
6298template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6304template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6307 CopyBytes<8 /
sizeof(TFromD<D>)>(bits, &mask.raw);
6313template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6315 const size_t kNumBytes = 8 /
sizeof(TFromD<D>);
6316 CopyBytes<kNumBytes>(&mask.raw, bits);
6321template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6323 return PopCount(
static_cast<uint64_t
>(mask.raw));
6326template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_T_SIZE_D(D, 1)>
6331template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
6336template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6342template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_T_SIZE_D(D, 1)>
6347template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
6352template <
class D, HWY_IF_V_SIZE_D(D, 64)>
6362#ifdef HWY_NATIVE_COMPRESS8
6363#undef HWY_NATIVE_COMPRESS8
6365#define HWY_NATIVE_COMPRESS8
6370#if HWY_TARGET <= HWY_AVX3_DL
6400#if HWY_TARGET != HWY_AVX3_ZEN4
6406 _mm_mask_compressstoreu_epi8(unaligned, mask.
raw, v.
raw);
6408HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
6410 _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6412HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
6414 _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6418HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
6419 Mask128<uint16_t, N> mask,
6421 _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6423HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
6425 _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6427HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
6429 _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6444template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
6447 return VFromD<D>{_mm512_maskz_expandloadu_epi8(mask.
raw, unaligned)};
6450template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
6453 return VFromD<D>{_mm512_maskz_expandloadu_epi16(mask.
raw, unaligned)};
6474#if HWY_TARGET != HWY_AVX3_ZEN4
6480 _mm_mask_compressstoreu_epi32(unaligned, mask.
raw, v.
raw);
6482HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
6484 _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6486HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
6488 _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6492HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
6493 Mask128<uint64_t, N> mask,
6495 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6497HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
6499 _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6501HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
6503 _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6507HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
6509 _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6511HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
6513 _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6515HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
6517 _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6521HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
6522 Mask128<double, N> mask,
6524 _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6526HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
6528 _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6530HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
6532 _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6547template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
6550 return VFromD<D>{_mm512_maskz_expandloadu_epi32(mask.
raw, unaligned)};
6553template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
6556 return VFromD<D>{_mm512_maskz_expandloadu_epi64(mask.
raw, unaligned)};
6566 const Rebind<uint32_t,
decltype(
d)> d32;
6569 const uint64_t mask_bits{mask.
raw};
6571 using M32 =
MFromD<
decltype(d32)>;
6572 const M32 m0{
static_cast<typename M32::Raw
>(mask_bits)};
6580 const Rebind<int32_t,
decltype(
d)> di32;
6582 const MFromD<
decltype(du32)> mask32{
static_cast<__mmask8
>(mask.
raw)};
6592 const Rebind<int32_t,
decltype(
d)> di32;
6600template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6606template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6615template <
class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
6618 const uint64_t mask_bits{mask.raw};
6619 const Half<
decltype(
d)> dh;
6620 const Rebind<uint32_t,
decltype(dh)> d32;
6623 const Mask512<uint32_t> m0{
static_cast<__mmask16
>(mask_bits & 0xFFFFu)};
6624 const Mask512<uint32_t> m1{
static_cast<__mmask16
>(mask_bits >> 16)};
6632template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
6635 const uint64_t mask_bits{mask.raw};
6637 const Rebind<uint32_t,
decltype(dq)> d32;
6638 alignas(64) uint8_t lanes[64];
6641 const Vec512<uint32_t> v1 =
PromoteTo(d32,
Load(dq, lanes + 16));
6642 const Vec512<uint32_t> v2 =
PromoteTo(d32,
Load(dq, lanes + 32));
6643 const Vec512<uint32_t> v3 =
PromoteTo(d32,
Load(dq, lanes + 48));
6644 const Mask512<uint32_t> m0{
static_cast<__mmask16
>(mask_bits & 0xFFFFu)};
6645 const Mask512<uint32_t> m1{
6646 static_cast<uint16_t
>((mask_bits >> 16) & 0xFFFFu)};
6647 const Mask512<uint32_t> m2{
6648 static_cast<uint16_t
>((mask_bits >> 32) & 0xFFFFu)};
6649 const Mask512<uint32_t> m3{
static_cast<__mmask16
>(mask_bits >> 48)};
6664template <
class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
6669 const Half<
decltype(
d)> dh;
6670 const Vec512<uint32_t> promoted0 =
6672 const Vec512<uint32_t> promoted1 =
6675 const uint64_t mask_bits{mask.raw};
6676 const uint64_t maskL = mask_bits & 0xFFFF;
6677 const uint64_t maskH = mask_bits >> 16;
6678 const Mask512<uint32_t> mask0{
static_cast<__mmask16
>(maskL)};
6679 const Mask512<uint32_t> mask1{
static_cast<__mmask16
>(maskH)};
6680 const Vec512<uint32_t> compressed0 =
NativeCompress(promoted0, mask0);
6681 const Vec512<uint32_t> compressed1 =
NativeCompress(promoted1, mask1);
6683 const Vec256<uint16_t> demoted0 =
DemoteTo(dh,
BitCast(di32, compressed0));
6684 const Vec256<uint16_t> demoted1 =
DemoteTo(dh,
BitCast(di32, compressed1));
6687 StoreU(demoted0, dh, unaligned);
6692template <
typename T>
6695 alignas(64) T buf[2 *
Lanes(
d)];
6697 return Load(
d, buf);
6703 alignas(32) uint8_t buf[2 * 32 /
sizeof(uint8_t)];
6705 return Load(
d, buf);
6710template <
class V,
class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
6711HWY_API V Compress(V v, const M mask) {
6712 const DFromV<decltype(v)> d;
6713 const RebindToUn
signed<decltype(d)> du;
6714 const auto mu = RebindMask(du, mask);
6715#if HWY_TARGET <= HWY_AVX3_DL
6716 return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
6718 return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
6722template <
class V,
class M, HWY_IF_T_SIZE_V(V, 4)>
6730template <
typename T, HWY_IF_T_SIZE(T, 8)>
6733 alignas(16)
static constexpr uint64_t packed_array[256] = {
6737 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
6738 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
6739 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
6740 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
6741 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
6742 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
6743 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
6744 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
6745 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
6746 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
6747 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
6748 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
6749 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
6750 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
6751 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
6752 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
6753 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
6754 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
6755 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
6756 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
6757 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
6758 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
6759 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
6760 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
6761 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
6762 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
6763 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
6764 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
6765 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
6766 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
6767 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
6768 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
6769 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
6770 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
6771 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
6772 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
6773 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
6774 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
6775 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
6776 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
6777 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
6778 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
6779 0x10765432, 0x17654320, 0x07654321, 0x76543210};
6785 const auto packed =
Set(du64, packed_array[mask.
raw]);
6786 alignas(64)
static constexpr uint64_t shifts[8] = {0, 4, 8, 12,
6794template <
typename T, HWY_IF_T_SIZE(T, 1)>
6797#if HWY_TARGET <= HWY_AVX3_DL
6805 constexpr size_t N =
Lanes(
d);
6809 alignas(64) T lanes[N];
6813 static_cast<Bits
>(mask.
raw & Bits{(1ULL << (N / 2)) - 1})};
6814 const Mask256<T> maskH{
static_cast<Bits
>(mask.
raw >> (N / 2))};
6815 const size_t countL =
CountTrue(dh, maskL);
6818 return Combine(
d, expandH, expandL);
6822template <
typename T, HWY_IF_T_SIZE(T, 2)>
6823HWY_API Vec512<T>
Expand(Vec512<T> v,
const Mask512<T> mask) {
6826 const Vec512<uint16_t> vu =
BitCast(du, v);
6827#if HWY_TARGET <= HWY_AVX3_DL
6832 const Full256<T> dh;
6833 constexpr size_t N =
Lanes(
d);
6835 const Mask256<T> maskL{
6836 static_cast<Bits
>(mask.raw & Bits{(1ULL << (N / 2)) - 1})};
6837 const Mask256<T> maskH{
static_cast<Bits
>(mask.raw >> (N / 2))};
6840 alignas(64) uint16_t iota[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
6841 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
6842 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
6844 const Vec512<uint16_t> shifted{_mm512_permutexvar_epi16(
indices.raw, vu.raw)};
6847 return Combine(
d, expandH, expandL);
6851template <
class V,
class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
6852HWY_API V Expand(V v, const M mask) {
6853 const DFromV<decltype(v)> d;
6854 const RebindToUn
signed<decltype(d)> du;
6855 const auto mu = RebindMask(du, mask);
6856 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
6862#if HWY_TARGET > HWY_AVX3_DL
6864template <
class V,
class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
6865 HWY_IF_LANES_LE_D(DFromV<V>, 16)>
6866HWY_API V Expand(V v, M mask) {
6868 const RebindToUn
signed<decltype(d)> du;
6869 const Rebind<u
int32_t, decltype(d)> du32;
6870 const VFromD<decltype(du)> vu = BitCast(du, v);
6871 using M32 = MFromD<decltype(du32)>;
6872 const M32 m32{static_cast<
typename M32::Raw>(mask.raw)};
6873 return BitCast(d, TruncateTo(du, Expand(PromoteTo(du32, vu), m32)));
6880template <
class D, HWY_IF_V_SIZE_D(D, 64),
6881 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
6882HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
6883 const TFromD<D>* HWY_RESTRICT unaligned) {
6884#if HWY_TARGET <= HWY_AVX3_DL
6885 const RebindToUn
signed<decltype(d)> du;
6886 using TU = TFromD<decltype(du)>;
6887 const TU* HWY_RESTRICT pu = re
interpret_cast<const TU*>(unaligned);
6888 const MFromD<decltype(du)> mu = RebindMask(du, mask);
6889 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
6891 return Expand(LoadU(d, unaligned), mask);
6895template <
class D, HWY_IF_V_SIZE_D(D, 64),
6896 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
6897HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
6898 const TFromD<D>* HWY_RESTRICT unaligned) {
6899 const RebindToUn
signed<decltype(d)> du;
6900 using TU = TFromD<decltype(du)>;
6901 const TU* HWY_RESTRICT pu = re
interpret_cast<const TU*>(unaligned);
6902 const MFromD<decltype(du)> mu = RebindMask(du, mask);
6903 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
6908template <
class V,
class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
6913template <
typename T, HWY_IF_T_SIZE(T, 8)>
6916 alignas(16)
static constexpr uint64_t packed_array[256] = {
6920 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
6921 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
6922 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
6923 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
6924 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
6925 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
6926 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
6927 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
6928 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
6929 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
6930 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
6931 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
6932 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
6933 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
6934 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
6935 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
6936 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
6937 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
6938 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
6939 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
6940 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
6941 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
6942 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
6943 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
6944 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
6945 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
6946 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
6947 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
6948 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
6949 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
6950 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
6951 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
6952 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
6953 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
6954 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
6955 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
6956 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
6957 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
6958 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
6959 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
6960 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
6961 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
6962 0x76543210, 0x76543201, 0x76543210, 0x76543210};
6968 const auto packed =
Set(du64, packed_array[mask.
raw]);
6969 alignas(64)
static constexpr uint64_t shifts[8] = {0, 4, 8, 12,
6977template <
class V,
class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
6992template <
class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
6993HWY_API
size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
6994 TFromD<D>* HWY_RESTRICT unaligned) {
6995#if HWY_TARGET == HWY_AVX3_ZEN4
6996 StoreU(Compress(v, mask), d, unaligned);
6998 const RebindToUn
signed<decltype(d)> du;
6999 const auto mu = RebindMask(du, mask);
7000 auto pu = re
interpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);
7002#if HWY_TARGET <= HWY_AVX3_DL
7003 detail::NativeCompressStore(BitCast(du, v), mu, pu);
7005 detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
7008 const
size_t count = CountTrue(d, mask);
7009 detail::MaybeUnpoison(unaligned, count);
7013template <
class D, HWY_IF_NOT_FLOAT_D(D),
7014 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
7015HWY_API
size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
7016 TFromD<D>* HWY_RESTRICT unaligned) {
7017#if HWY_TARGET == HWY_AVX3_ZEN4
7018 StoreU(Compress(v, mask), d, unaligned);
7020 const RebindToUn
signed<decltype(d)> du;
7021 const auto mu = RebindMask(du, mask);
7022 using TU = TFromD<decltype(du)>;
7023 TU* HWY_RESTRICT pu = re
interpret_cast<TU*>(unaligned);
7024 detail::NativeCompressStore(BitCast(du, v), mu, pu);
7026 const
size_t count = CountTrue(d, mask);
7027 detail::MaybeUnpoison(unaligned, count);
7032template <
class D, HWY_IF_FLOAT3264_D(D)>
7035#if HWY_TARGET == HWY_AVX3_ZEN4
7039 detail::NativeCompressStore(v, mask, unaligned);
7041 const size_t count =
PopCount(uint64_t{mask.raw});
7047template <
class D, HWY_IF_V_SIZE_GT_D(D, 8)>
7077template <_MM_PERM_ENUM kPerm,
typename T>
7084template <_MM_PERM_ENUM kPerm>
7088template <_MM_PERM_ENUM kPerm>
7101template <
class D, HWY_IF_V_SIZE_D(D, 64)>
7104 constexpr size_t N =
Lanes(
d);
7109 const VFromD<D> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
7110 const VFromD<D> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
7112 A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
7113 B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
7114 C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
7127template <
class D, HWY_IF_V_SIZE_D(D, 64)>
7131 constexpr size_t N =
Lanes(
d);
7137 const VFromD<D> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
7138 const VFromD<D> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
7139 const VFromD<D> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
7140 const VFromD<D> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
7141 vA = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
7142 vB = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
7143 vC = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
7144 vD = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
7161template <
class D, HWY_IF_V_SIZE_D(D, 64)>
7164 constexpr size_t N =
Lanes(
d);
7165 const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
7166 const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
7167 const auto j1_i1_j0_i0 =
7168 detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
7169 const auto j3_i3_j2_i2 =
7170 detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
7171 StoreU(j1_i1_j0_i0,
d, unaligned + 0 * N);
7172 StoreU(j3_i3_j2_i2,
d, unaligned + 1 * N);
7183template <
class D, HWY_IF_V_SIZE_D(D, 64)>
7187 constexpr size_t N =
Lanes(
d);
7188 const VFromD<D> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
7189 const VFromD<D> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
7190 const VFromD<D> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
7193 detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
7195 detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
7197 detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
7199 StoreU(out0,
d, unaligned + 0 * N);
7200 StoreU(out1,
d, unaligned + 1 * N);
7201 StoreU(out2,
d, unaligned + 2 * N);
template <class D, HWY_IF_V_SIZE_D(D, 64)>
HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
                                    VFromD<D> l, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t N = Lanes(d);
  const VFromD<D> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
  const VFromD<D> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
  const VFromD<D> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
  const VFromD<D> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
  const VFromD<D> out0 =
      detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
  const VFromD<D> out1 =
      detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
  const VFromD<D> out2 =
      detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
  const VFromD<D> out3 =
      detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
}
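// Usage sketch (illustrative): the 4-vector transpose above backs
// StoreInterleaved4, e.g. packing planar R/G/B/A into RGBA pixels.
//
//   void PackRGBA(const uint8_t* r, const uint8_t* g, const uint8_t* b,
//                 const uint8_t* a, uint8_t* HWY_RESTRICT rgba) {
//     const hn::ScalableTag<uint8_t> d;
//     hn::StoreInterleaved4(hn::LoadU(d, r), hn::LoadU(d, g),
//                           hn::LoadU(d, b), hn::LoadU(d, a), d, rgba);
//   }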
HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
  const DFromV<decltype(v)> d;
#if HWY_TARGET <= HWY_AVX3_DL
  // masks[i] = 0xFF >> i: zero out bits that would be shifted past bit 7.
  const VFromD<decltype(d)> masks =
      Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
                          0, 0, 0, 0, 0, 0, 0);
  // shl[i] = 1 << i: multiplying by 2^i in GF(2^8) is then a left shift by i.
  const VFromD<decltype(d)> shl =
      Dup128VecFromValues(d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0,
                          0, 0, 0, 0, 0, 0, 0);
  v = And(v, TableLookupBytes(masks, bits));
  const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
  return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)};
#else
  const Repartition<uint16_t, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW even_mask = Set(dw, 0x00FF);
  const VW odd_mask = Set(dw, 0xFF00);
  const VW vw = BitCast(dw, v);
  const VW bits16 = BitCast(dw, bits);
  // Shift even lanes in-place; the odd half of each 16-bit lane is discarded
  // by the final blend, so bits spilling upward are harmless.
  const VW evens = vw << And(bits16, even_mask);
  // Shift odd lanes in-place: mask off the low byte so it cannot spill into
  // the result, and move the shift count down into the low byte.
  const VW odds = And(vw, odd_mask) << ShiftRight<8>(bits16);
  return BitCast(d, IfVecThenElse(odd_mask, odds, evens));
#endif
}
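// Scalar model of the AVX3_DL branch above (a sketch; `VarShlByteModel` is
// illustrative, not a library function). GF(2^8) multiplication is carry-less
// polynomial multiplication reduced mod x^8+x^4+x^3+x+1; after masking v with
// (0xFF >> n), the product v * (1 << n) never exceeds 8 bits, so no reduction
// occurs and the result equals an ordinary left shift.
//
//   uint8_t VarShlByteModel(uint8_t v, uint8_t n) {
//     if (n > 7) return 0;  // table lookup yields mask = mul = 0
//     const uint8_t masked = v & (0xFF >> n);  // drop bits that would overflow
//     return static_cast<uint8_t>(masked << n);
//   }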
// Signed left shifts are the same as unsigned.
template <typename T, HWY_IF_SIGNED(T)>
HWY_API Vec512<T> operator<<(Vec512<T> v, Vec512<T> bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
}
HWY_API Vec512<uint8_t> operator>>(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
  const DFromV<decltype(v)> d;
  const Repartition<uint16_t, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW mask = Set(dw, 0x00FF);
  const VW vw = BitCast(dw, v);
  const VW bits16 = BitCast(dw, bits);
  // Shift even lanes in-place: mask off the odd neighbor so its bits cannot
  // shift down into the result.
  const VW evens = And(vw, mask) >> And(bits16, mask);
  // Shift odd lanes in-place; bits entering the low byte are discarded by
  // OddEven below.
  const VW odds = vw >> ShiftRight<8>(bits16);
  return OddEven(BitCast(d, odds), BitCast(d, evens));
}
HWY_API Vec512<int8_t> operator>>(Vec512<int8_t> v, Vec512<int8_t> bits) {
  const DFromV<decltype(v)> d;
  const Repartition<int16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(dw)> dw_u;
  using VW = VFromD<decltype(dw)>;
  const VW mask = Set(dw, 0x00FF);
  const VW vw = BitCast(dw, v);
  const VW bits16 = BitCast(dw, bits);
  // Shift even lanes in-place: sign-extend the low byte to 16 bits first so
  // the arithmetic shift brings in copies of its sign bit.
  const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >> And(bits16, mask);
  // Shift odd lanes in-place; shift the counts as unsigned so the odd lane's
  // sign bit does not contaminate the count.
  const VW odds = vw >> BitCast(dw, ShiftRight<8>(BitCast(dw_u, bits16)));
  return OddEven(BitCast(d, odds), BitCast(d, evens));
}
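// Scalar model of the even-lane path above (illustrative): within a 16-bit
// lane holding bytes [odd:even], ShiftLeft<8> then arithmetic ShiftRight<8>
// sign-extends the even byte to 16 bits, so the subsequent shift brings in
// its sign bit rather than bits of the odd neighbor.
//
//   int8_t EvenLaneModel(uint16_t lane, int n) {  // n in [0, 7]
//     const int16_t ext =
//         static_cast<int16_t>(lane << 8) >> 8;   // sign-extend low byte
//     return static_cast<int8_t>(ext >> n);
//   }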
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
                                      Vec512<int16_t> b) {
  return VFromD<D>{_mm512_madd_epi16(a.raw, b.raw)};
}

template <class DI16, HWY_IF_V_SIZE_D(DI16, 64), HWY_IF_I16_D(DI16)>
HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
    DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
    VFromD<Repartition<int8_t, DI16>> b) {
  return VFromD<DI16>{_mm512_maddubs_epi16(a.raw, b.raw)};
}
#if HWY_TARGET <= HWY_AVX3_DL
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)>
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
  return VFromD<DI32>{_mm512_dpwssds_epi32(sum.raw, a.raw, b.raw)};
}
#endif  // HWY_TARGET <= HWY_AVX3_DL
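// Scalar model of VPDPWSSDS (illustrative): each i32 lane accumulates two
// adjacent i16 products, with signed saturation of the final addition.
//
//   int32_t DpwssdsModel(int32_t sum, int16_t a0, int16_t b0,
//                        int16_t a1, int16_t b1) {
//     const int64_t t = int64_t{sum} + a0 * b0 + a1 * b1;
//     if (t > INT32_MAX) return INT32_MAX;
//     if (t < INT32_MIN) return INT32_MIN;
//     return static_cast<int32_t>(t);
//   }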
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a,
                                            Vec512<int16_t> b,
                                            const VFromD<D> sum0,
                                            VFromD<D>& /*sum1*/) {
#if HWY_TARGET <= HWY_AVX3_DL
  (void)d;
  return VFromD<D>{_mm512_dpwssd_epi32(sum0.raw, a.raw, b.raw)};
#else
  return sum0 + WidenMulPairwiseAdd(d, a, b);
#endif
}

#if HWY_TARGET <= HWY_AVX3_DL
template <class DI32, HWY_IF_V_SIZE_D(DI32, 64)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 /* tag */, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  return VFromD<DI32>{_mm512_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
}
#endif  // HWY_TARGET <= HWY_AVX3_DL
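// Usage sketch (illustrative): VPDPBUSD accumulates four u8*i8 products per
// i32 lane, the core of int8 dot-product/GEMM kernels. Assumes
// `namespace hn = hwy::HWY_NAMESPACE` and n divisible by the vector size.
//
//   int32_t DotU8I8(const uint8_t* HWY_RESTRICT a,
//                   const int8_t* HWY_RESTRICT b, size_t n) {
//     const hn::ScalableTag<int32_t> d32;
//     const hn::Repartition<uint8_t, decltype(d32)> d8u;
//     const hn::Repartition<int8_t, decltype(d32)> d8i;
//     auto sum = hn::Zero(d32);
//     for (size_t i = 0; i < n; i += hn::Lanes(d8u)) {
//       sum = hn::SumOfMulQuadAccumulate(d32, hn::LoadU(d8u, a + i),
//                                        hn::LoadU(d8i, b + i), sum);
//     }
//     return hn::ReduceSum(d32, sum);
//   }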
template <class D, class Func, HWY_IF_V_SIZE_D(D, 64)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
  // Combine adjacent 128-bit blocks, then the two remaining halves, so that
  // every block holds the reduction result.
  v = f(v, SwapAdjacentBlocks(v));
  return f(v, ReverseBlocks(d, v));
}
template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm512_lzcnt_epi32(v.raw)};
}

template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm512_lzcnt_epi64(v.raw)};
}
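// Usage sketch (illustrative): floor(log2(x)) for nonzero lanes via LZCNT,
// i.e. 31 - lzcnt(x) for u32 (HighestSetBitIndex provides this directly).
//
//   hn::Vec<hn::ScalableTag<uint32_t>> FloorLog2(
//       hn::Vec<hn::ScalableTag<uint32_t>> v) {
//     const hn::ScalableTag<uint32_t> d;
//     return hn::Set(d, 31u) - hn::LeadingZeroCount(v);  // v must be nonzero
//   }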
// Returns the 32-bit leading zero count of each zero-extended u8/u16 lane.
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE V Lzcnt32ForU8OrU16(V v) {
  const DFromV<decltype(v)> d;
  const Rebind<int32_t, decltype(d)> di32;
  const Rebind<uint32_t, decltype(d)> du32;
  return DemoteTo(d, BitCast(di32, LeadingZeroCount(PromoteTo(du32, v))));
}

template <class V>
HWY_INLINE VFromD<Rebind<uint16_t, DFromV<V>>> Lzcnt32ForU8OrU16AsU16(V v) {
  const DFromV<decltype(v)> d;
  const Half<decltype(d)> dh;
  const Rebind<int32_t, decltype(dh)> di32;
  const Rebind<uint32_t, decltype(dh)> du32;
  const Rebind<uint16_t, decltype(d)> du16;
  const auto lo_v_lz_count =
      LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v)));
  const auto hi_v_lz_count =
      LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v)));
  return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count),
                          BitCast(di32, hi_v_lz_count));
}

HWY_INLINE Vec256<uint8_t> Lzcnt32ForU8OrU16(Vec256<uint8_t> v) {
  const DFromV<decltype(v)> d;
  const Rebind<int16_t, decltype(d)> di16;
  return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v)));
}

HWY_INLINE Vec512<uint8_t> Lzcnt32ForU8OrU16(Vec512<uint8_t> v) {
  const DFromV<decltype(v)> d;
  const Half<decltype(d)> dh;
  const Rebind<int16_t, decltype(dh)> di16;
  const auto lo_lz16 = BitCast(di16, Lzcnt32ForU8OrU16AsU16(LowerHalf(dh, v)));
  const auto hi_lz16 = BitCast(di16, Lzcnt32ForU8OrU16AsU16(UpperHalf(dh, v)));
  return OrderedDemote2To(d, lo_lz16, hi_lz16);
}
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API V LeadingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;

  constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
  const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v));
  // The 32-bit count includes the (32 - kNumOfBitsInT) zero-extension bits;
  // subtract them and clamp all-zero lanes to kNumOfBitsInT.
  return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}),
                        Set(du, TU{kNumOfBitsInT})));
}

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;

  const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v));
  return BitCast(d, Set(du, TU{31}) - v_lzcnt32);
}

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}
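// Scalar model of the width correction above (illustrative): the helper
// counts leading zeros in 32-bit lanes, so a zero-extended 8-bit value always
// reports at least 24 extra zeros; subtracting (32 - 8) and clamping to 8
// recovers the 8-bit count.
//
//   uint8_t Lzcnt8Model(uint8_t x) {
//     const size_t lz32 = x ? hwy::Num0BitsAboveMS1Bit_Nonzero32(x) : 32;
//     return static_cast<uint8_t>(HWY_MIN(lz32 - (32 - 8), size_t{8}));
//   }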
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using T = TFromD<decltype(d)>;

  const auto vi = BitCast(di, v);
  // Isolate the lowest set bit: x & (-x).
  const auto lowest_bit = BitCast(d, And(vi, Neg(vi)));
  constexpr T kNumOfBitsInT{sizeof(T) * 8};
  const auto bit_idx = HighestSetBitIndex(lowest_bit);
  // All-zero lanes yield a negative (invalid) index; replace it with the
  // lane width.
  return IfThenElse(
      MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, bit_idx)))),
      Set(d, kNumOfBitsInT), bit_idx);
}
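// Scalar model of the reduction above (illustrative): isolate the lowest set
// bit with x & -x, find its index, and map the all-zero case (where the
// index would be invalid) to the lane width.
//
//   uint32_t TzcntModel(uint32_t x) {
//     if (x == 0) return 32;
//     const uint32_t lowest = x & (0u - x);
//     return static_cast<uint32_t>(
//         31u - hwy::Num0BitsAboveMS1Bit_Nonzero32(lowest));
//   }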