x86_512-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 512-bit AVX512 vectors and operations.
17// External include guard in highway.h - see comment there.
18
19// WARNING: most operations do not cross 128-bit block boundaries. In
20// particular, "Broadcast", pack and zip behavior may be surprising.
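// For example, with 16 u32 lanes, Broadcast<0> copies lane 0 of *each* 128-bit
// block into the other lanes of that block (lanes 4..7 receive lane 4, and so
// on), not lane 0 of the whole 512-bit vector.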
21
22// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
23#include "hwy/base.h"
24
25// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
26// https://github.com/google/highway/issues/710)
27HWY_DIAGNOSTICS(push)
28#if HWY_COMPILER_GCC_ACTUAL
29HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
30HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
31 ignored "-Wmaybe-uninitialized")
32#endif
33
34#include <immintrin.h> // AVX2+
35
36#if HWY_COMPILER_CLANGCL
37// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
38// including these headers when _MSC_VER is defined, like when using clang-cl.
39// Include these directly here.
40// clang-format off
41#include <smmintrin.h>
42
43#include <avxintrin.h>
44// avxintrin defines __m256i and must come before avx2intrin.
45#include <avx2intrin.h>
46#include <f16cintrin.h>
47#include <fmaintrin.h>
48
49#include <avx512fintrin.h>
50#include <avx512vlintrin.h>
51#include <avx512bwintrin.h>
52#include <avx512vlbwintrin.h>
53#include <avx512dqintrin.h>
54#include <avx512vldqintrin.h>
55#include <avx512cdintrin.h>
56#include <avx512vlcdintrin.h>
57
58#if HWY_TARGET <= HWY_AVX3_DL
59#include <avx512bitalgintrin.h>
60#include <avx512vlbitalgintrin.h>
61#include <avx512vbmiintrin.h>
62#include <avx512vbmivlintrin.h>
63#include <avx512vbmi2intrin.h>
64#include <avx512vlvbmi2intrin.h>
65#include <avx512vpopcntdqintrin.h>
66#include <avx512vpopcntdqvlintrin.h>
67#include <avx512vnniintrin.h>
68#include <avx512vlvnniintrin.h>
69// Must come after avx512fintrin, else will not define 512-bit intrinsics.
70#include <vaesintrin.h>
71#include <vpclmulqdqintrin.h>
72#include <gfniintrin.h>
73#endif // HWY_TARGET <= HWY_AVX3_DL
74
75#if HWY_TARGET <= HWY_AVX3_SPR
76#include <avx512fp16intrin.h>
77#include <avx512vlfp16intrin.h>
78#endif // HWY_TARGET <= HWY_AVX3_SPR
79
80// clang-format on
81#endif // HWY_COMPILER_CLANGCL
82
83// For half-width vectors. Already includes base.h and shared-inl.h.
84#include "hwy/ops/x86_256-inl.h"
85
86HWY_BEFORE_NAMESPACE();
87namespace hwy {
88namespace HWY_NAMESPACE {
89
90namespace detail {
91
92template <typename T>
93struct Raw512 {
94 using type = __m512i;
95};
96#if HWY_HAVE_FLOAT16
97template <>
98struct Raw512<float16_t> {
99 using type = __m512h;
100};
101#endif // HWY_HAVE_FLOAT16
102template <>
103struct Raw512<float> {
104 using type = __m512;
105};
106template <>
107struct Raw512<double> {
108 using type = __m512d;
109};
110
111// Template arg: sizeof(lane type)
112template <size_t size>
113struct RawMask512 {};
114template <>
115struct RawMask512<1> {
116 using type = __mmask64;
117};
118template <>
119struct RawMask512<2> {
120 using type = __mmask32;
121};
122template <>
123struct RawMask512<4> {
124 using type = __mmask16;
125};
126template <>
127struct RawMask512<8> {
128 using type = __mmask8;
129};
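// In other words, the mask width always matches the 512-bit lane count:
// 64 x 1-byte lanes -> __mmask64, 32 x 2-byte -> __mmask32,
// 16 x 4-byte -> __mmask16 and 8 x 8-byte -> __mmask8. A minimal sanity check:
static_assert(sizeof(RawMask512<1>::type) * 8 == 64, "one mask bit per u8 lane");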
130
131} // namespace detail
132
133template <typename T>
134class Vec512 {
135 using Raw = typename detail::Raw512<T>::type;
136
137 public:
138 using PrivateT = T; // only for DFromV
139 static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV
140
141 // Compound assignment. Only usable if there is a corresponding non-member
142 // binary operator overload. For example, only f32 and f64 support division.
143 HWY_INLINE Vec512& operator*=(const Vec512 other) {
144 return *this = (*this * other);
145 }
146 HWY_INLINE Vec512& operator/=(const Vec512 other) {
147 return *this = (*this / other);
148 }
149 HWY_INLINE Vec512& operator+=(const Vec512 other) {
150 return *this = (*this + other);
151 }
152 HWY_INLINE Vec512& operator-=(const Vec512 other) {
153 return *this = (*this - other);
154 }
155 HWY_INLINE Vec512& operator%=(const Vec512 other) {
156 return *this = (*this % other);
157 }
158 HWY_INLINE Vec512& operator&=(const Vec512 other) {
159 return *this = (*this & other);
160 }
161 HWY_INLINE Vec512& operator|=(const Vec512 other) {
162 return *this = (*this | other);
163 }
164 HWY_INLINE Vec512& operator^=(const Vec512 other) {
165 return *this = (*this ^ other);
166 }
167
168 Raw raw;
169};
170
171// Mask register: one bit per lane.
172template <typename T>
173struct Mask512 {
174 using Raw = typename detail::RawMask512<sizeof(T)>::type;
175 Raw raw;
176};
177
178template <typename T>
179using Full512 = Simd<T, 64 / sizeof(T), 0>;
180
181// ------------------------------ BitCast
182
183namespace detail {
184
185HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
186#if HWY_HAVE_FLOAT16
187HWY_INLINE __m512i BitCastToInteger(__m512h v) {
188 return _mm512_castph_si512(v);
189}
190#endif // HWY_HAVE_FLOAT16
191HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
192HWY_INLINE __m512i BitCastToInteger(__m512d v) {
193 return _mm512_castpd_si512(v);
194}
195
196#if HWY_AVX3_HAVE_F32_TO_BF16C
197HWY_INLINE __m512i BitCastToInteger(__m512bh v) {
198 // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
199 // bit cast a __m512bh to a __m512i as there is currently no intrinsic
200 // available (as of GCC 13 and Clang 17) that can bit cast a __m512bh vector
201 // to a __m512i vector
202
203#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
204 // On GCC or Clang, use reinterpret_cast to bit cast a __m512bh to a __m512i
205 return reinterpret_cast<__m512i>(v);
206#else
207 // On MSVC, use BitCastScalar to bit cast a __m512bh to a __m512i as MSVC does
208 // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
209 // bit cast from one AVX vector type to a different AVX vector type
210 return BitCastScalar<__m512i>(v);
211#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
212}
213#endif // HWY_AVX3_HAVE_F32_TO_BF16C
214
215template <typename T>
216HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
217 return Vec512<uint8_t>{BitCastToInteger(v.raw)};
218}
219
220// Cannot rely on function overloading because return types differ.
221template <typename T>
222struct BitCastFromInteger512 {
223 HWY_INLINE __m512i operator()(__m512i v) { return v; }
224};
225#if HWY_HAVE_FLOAT16
226template <>
227struct BitCastFromInteger512<float16_t> {
228 HWY_INLINE __m512h operator()(__m512i v) { return _mm512_castsi512_ph(v); }
229};
230#endif // HWY_HAVE_FLOAT16
231template <>
232struct BitCastFromInteger512<float> {
233 HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
234};
235template <>
236struct BitCastFromInteger512<double> {
237 HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
238};
239
240template <class D, HWY_IF_V_SIZE_D(D, 64)>
241HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, Vec512<uint8_t> v) {
242 return VFromD<D>{BitCastFromInteger512<TFromD<D>>()(v.raw)};
243}
244
245} // namespace detail
246
247template <class D, HWY_IF_V_SIZE_D(D, 64), typename FromT>
248HWY_API VFromD<D> BitCast(D d, Vec512<FromT> v) {
249 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
250}
251
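// Usage sketch: BitCast reinterprets the 512 bits without converting them.
// For example, viewing float lanes as their IEEE-754 bit patterns:
//   const Full512<float> df;
//   const RebindToUnsigned<decltype(df)> du;       // Full512<uint32_t>
//   const Vec512<uint32_t> bits = BitCast(du, Set(df, 1.0f));  // 0x3F800000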
252// ------------------------------ Set
253
254template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
255HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
256 return VFromD<D>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
257}
258template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI16_D(D)>
259HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
260 return VFromD<D>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
261}
262template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
263HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
264 return VFromD<D>{_mm512_set1_epi32(static_cast<int>(t))};
265}
266template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
267HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
268 return VFromD<D>{_mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
269}
270// bfloat16_t is handled by x86_128-inl.h.
271#if HWY_HAVE_FLOAT16
272template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
273HWY_API Vec512<float16_t> Set(D /* tag */, float16_t t) {
274 return Vec512<float16_t>{_mm512_set1_ph(t)};
275}
276#endif // HWY_HAVE_FLOAT16
277template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
278HWY_API Vec512<float> Set(D /* tag */, float t) {
279 return Vec512<float>{_mm512_set1_ps(t)};
280}
281template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
282HWY_API Vec512<double> Set(D /* tag */, double t) {
283 return Vec512<double>{_mm512_set1_pd(t)};
284}
285
286// ------------------------------ Zero (Set)
287
288// GCC pre-9.1 lacked setzero, so use Set instead.
289#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
290
291// Cannot use VFromD here because it is defined in terms of Zero.
292template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
293HWY_API Vec512<TFromD<D>> Zero(D d) {
294 return Set(d, TFromD<D>{0});
295}
296// BitCast is defined below, but the Raw type is the same, so use that.
297template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
298HWY_API Vec512<bfloat16_t> Zero(D /* tag */) {
299 const RebindToUnsigned<D> du;
300 return Vec512<bfloat16_t>{Set(du, 0).raw};
301}
302template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
303HWY_API Vec512<float16_t> Zero(D /* tag */) {
304 const RebindToUnsigned<D> du;
305 return Vec512<float16_t>{Set(du, 0).raw};
306}
307
308#else
309
310template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
311HWY_API Vec512<TFromD<D>> Zero(D /* tag */) {
312 return Vec512<TFromD<D>>{_mm512_setzero_si512()};
313}
314template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
315HWY_API Vec512<bfloat16_t> Zero(D /* tag */) {
316 return Vec512<bfloat16_t>{_mm512_setzero_si512()};
317}
318template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
319HWY_API Vec512<float16_t> Zero(D /* tag */) {
320#if HWY_HAVE_FLOAT16
321 return Vec512<float16_t>{_mm512_setzero_ph()};
322#else
323 return Vec512<float16_t>{_mm512_setzero_si512()};
324#endif
325}
326template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
327HWY_API Vec512<float> Zero(D /* tag */) {
328 return Vec512<float>{_mm512_setzero_ps()};
329}
330template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
331HWY_API Vec512<double> Zero(D /* tag */) {
332 return Vec512<double>{_mm512_setzero_pd()};
333}
334
335#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
336
337// ------------------------------ Undefined
338
339HWY_DIAGNOSTICS(push)
340HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
341
342// Returns a vector with uninitialized elements.
343template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
344HWY_API Vec512<TFromD<D>> Undefined(D /* tag */) {
345 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
346 // generate an XOR instruction.
347 return Vec512<TFromD<D>>{_mm512_undefined_epi32()};
348}
349template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
350HWY_API Vec512<bfloat16_t> Undefined(D /* tag */) {
351 return Vec512<bfloat16_t>{_mm512_undefined_epi32()};
352}
353template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
354HWY_API Vec512<float16_t> Undefined(D /* tag */) {
355#if HWY_HAVE_FLOAT16
356 return Vec512<float16_t>{_mm512_undefined_ph()};
357#else
358 return Vec512<float16_t>{_mm512_undefined_epi32()};
359#endif
360}
361template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
362HWY_API Vec512<float> Undefined(D /* tag */) {
363 return Vec512<float>{_mm512_undefined_ps()};
364}
365template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
366HWY_API Vec512<double> Undefined(D /* tag */) {
367 return Vec512<double>{_mm512_undefined_pd()};
368}
369
370HWY_DIAGNOSTICS(pop)
371
372// ------------------------------ ResizeBitCast
373
374// 64-byte vector to 16-byte vector
375template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 64),
376 HWY_IF_V_SIZE_D(D, 16)>
377HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
378 return BitCast(d, Vec128<uint8_t>{_mm512_castsi512_si128(
379 BitCast(Full512<uint8_t>(), v).raw)});
380}
381
382// <= 16-byte vector to 64-byte vector
383template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
384 HWY_IF_V_SIZE_D(D, 64)>
385HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
386 return BitCast(d, Vec512<uint8_t>{_mm512_castsi128_si512(
387 ResizeBitCast(Full128<uint8_t>(), v).raw)});
388}
389
390// 32-byte vector to 64-byte vector
391template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 32),
392 HWY_IF_V_SIZE_D(D, 64)>
393HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
394 return BitCast(d, Vec512<uint8_t>{_mm512_castsi256_si512(
395 BitCast(Full256<uint8_t>(), v).raw)});
396}
397
398// ------------------------------ Dup128VecFromValues
399
400template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 64)>
401HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
402 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
403 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
404 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
405 TFromD<D> t11, TFromD<D> t12,
406 TFromD<D> t13, TFromD<D> t14,
407 TFromD<D> t15) {
408#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
409 // Missing set_epi8/16.
410 return BroadcastBlock<0>(ResizeBitCast(
411 d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3, t4, t5, t6,
412 t7, t8, t9, t10, t11, t12, t13, t14, t15)));
413#else
414 (void)d;
415 // Need to use _mm512_set_epi8 as there is no _mm512_setr_epi8 intrinsic
416 // available
417 return VFromD<D>{_mm512_set_epi8(
418 static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
419 static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
420 static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
421 static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
422 static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
423 static_cast<char>(t0), static_cast<char>(t15), static_cast<char>(t14),
424 static_cast<char>(t13), static_cast<char>(t12), static_cast<char>(t11),
425 static_cast<char>(t10), static_cast<char>(t9), static_cast<char>(t8),
426 static_cast<char>(t7), static_cast<char>(t6), static_cast<char>(t5),
427 static_cast<char>(t4), static_cast<char>(t3), static_cast<char>(t2),
428 static_cast<char>(t1), static_cast<char>(t0), static_cast<char>(t15),
429 static_cast<char>(t14), static_cast<char>(t13), static_cast<char>(t12),
430 static_cast<char>(t11), static_cast<char>(t10), static_cast<char>(t9),
431 static_cast<char>(t8), static_cast<char>(t7), static_cast<char>(t6),
432 static_cast<char>(t5), static_cast<char>(t4), static_cast<char>(t3),
433 static_cast<char>(t2), static_cast<char>(t1), static_cast<char>(t0),
434 static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
435 static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
436 static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
437 static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
438 static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
439 static_cast<char>(t0))};
440#endif
441}
442
443template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 64)>
444HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
445 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
446 TFromD<D> t5, TFromD<D> t6,
447 TFromD<D> t7) {
448#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
449 // Missing set_epi8/16.
450 return BroadcastBlock<0>(
451 ResizeBitCast(d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3,
452 t4, t5, t6, t7)));
453#else
454 (void)d;
455 // Need to use _mm512_set_epi16 as there is no _mm512_setr_epi16 intrinsic
456 // available
457 return VFromD<D>{
458 _mm512_set_epi16(static_cast<int16_t>(t7), static_cast<int16_t>(t6),
459 static_cast<int16_t>(t5), static_cast<int16_t>(t4),
460 static_cast<int16_t>(t3), static_cast<int16_t>(t2),
461 static_cast<int16_t>(t1), static_cast<int16_t>(t0),
462 static_cast<int16_t>(t7), static_cast<int16_t>(t6),
463 static_cast<int16_t>(t5), static_cast<int16_t>(t4),
464 static_cast<int16_t>(t3), static_cast<int16_t>(t2),
465 static_cast<int16_t>(t1), static_cast<int16_t>(t0),
466 static_cast<int16_t>(t7), static_cast<int16_t>(t6),
467 static_cast<int16_t>(t5), static_cast<int16_t>(t4),
468 static_cast<int16_t>(t3), static_cast<int16_t>(t2),
469 static_cast<int16_t>(t1), static_cast<int16_t>(t0),
470 static_cast<int16_t>(t7), static_cast<int16_t>(t6),
471 static_cast<int16_t>(t5), static_cast<int16_t>(t4),
472 static_cast<int16_t>(t3), static_cast<int16_t>(t2),
473 static_cast<int16_t>(t1), static_cast<int16_t>(t0))};
474#endif
475}
476
477#if HWY_HAVE_FLOAT16
478template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 64)>
479HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
480 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
481 TFromD<D> t5, TFromD<D> t6,
482 TFromD<D> t7) {
483 return VFromD<D>{_mm512_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
484 t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5,
485 t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)};
486}
487#endif
488
489template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 64)>
490HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
491 TFromD<D> t2, TFromD<D> t3) {
492 return VFromD<D>{
493 _mm512_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
494 static_cast<int32_t>(t2), static_cast<int32_t>(t3),
495 static_cast<int32_t>(t0), static_cast<int32_t>(t1),
496 static_cast<int32_t>(t2), static_cast<int32_t>(t3),
497 static_cast<int32_t>(t0), static_cast<int32_t>(t1),
498 static_cast<int32_t>(t2), static_cast<int32_t>(t3),
499 static_cast<int32_t>(t0), static_cast<int32_t>(t1),
500 static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
501}
502
503template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 64)>
504HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
505 TFromD<D> t2, TFromD<D> t3) {
506 return VFromD<D>{_mm512_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2,
507 t3, t0, t1, t2, t3)};
508}
509
510template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 64)>
511HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
512 return VFromD<D>{
513 _mm512_setr_epi64(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
514 static_cast<int64_t>(t0), static_cast<int64_t>(t1),
515 static_cast<int64_t>(t0), static_cast<int64_t>(t1),
516 static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
517}
518
519template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 64)>
520HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
521 return VFromD<D>{_mm512_setr_pd(t0, t1, t0, t1, t0, t1, t0, t1)};
522}
523
524// ----------------------------- Iota
525
526namespace detail {
527
528template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
529HWY_INLINE VFromD<D> Iota0(D d) {
530#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
531 // Missing set_epi8/16.
532 alignas(64) static constexpr TFromD<D> kIota[64] = {
533 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
534 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
535 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
536 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
537 return Load(d, kIota);
538#else
539 (void)d;
540 return VFromD<D>{_mm512_set_epi8(
541 static_cast<char>(63), static_cast<char>(62), static_cast<char>(61),
542 static_cast<char>(60), static_cast<char>(59), static_cast<char>(58),
543 static_cast<char>(57), static_cast<char>(56), static_cast<char>(55),
544 static_cast<char>(54), static_cast<char>(53), static_cast<char>(52),
545 static_cast<char>(51), static_cast<char>(50), static_cast<char>(49),
546 static_cast<char>(48), static_cast<char>(47), static_cast<char>(46),
547 static_cast<char>(45), static_cast<char>(44), static_cast<char>(43),
548 static_cast<char>(42), static_cast<char>(41), static_cast<char>(40),
549 static_cast<char>(39), static_cast<char>(38), static_cast<char>(37),
550 static_cast<char>(36), static_cast<char>(35), static_cast<char>(34),
551 static_cast<char>(33), static_cast<char>(32), static_cast<char>(31),
552 static_cast<char>(30), static_cast<char>(29), static_cast<char>(28),
553 static_cast<char>(27), static_cast<char>(26), static_cast<char>(25),
554 static_cast<char>(24), static_cast<char>(23), static_cast<char>(22),
555 static_cast<char>(21), static_cast<char>(20), static_cast<char>(19),
556 static_cast<char>(18), static_cast<char>(17), static_cast<char>(16),
557 static_cast<char>(15), static_cast<char>(14), static_cast<char>(13),
558 static_cast<char>(12), static_cast<char>(11), static_cast<char>(10),
559 static_cast<char>(9), static_cast<char>(8), static_cast<char>(7),
560 static_cast<char>(6), static_cast<char>(5), static_cast<char>(4),
561 static_cast<char>(3), static_cast<char>(2), static_cast<char>(1),
562 static_cast<char>(0))};
563#endif
564}
565
566template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI16_D(D)>
567HWY_INLINE VFromD<D> Iota0(D d) {
568#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
569 // Missing set_epi8/16.
570 alignas(64) static constexpr TFromD<D> kIota[32] = {
571 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
572 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
573 return Load(d, kIota);
574#else
575 (void)d;
576 return VFromD<D>{_mm512_set_epi16(
577 int16_t{31}, int16_t{30}, int16_t{29}, int16_t{28}, int16_t{27},
578 int16_t{26}, int16_t{25}, int16_t{24}, int16_t{23}, int16_t{22},
579 int16_t{21}, int16_t{20}, int16_t{19}, int16_t{18}, int16_t{17},
580 int16_t{16}, int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12},
581 int16_t{11}, int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6},
582 int16_t{5}, int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})};
583#endif
584}
585
586#if HWY_HAVE_FLOAT16
587template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
588HWY_INLINE VFromD<D> Iota0(D /*d*/) {
589 return VFromD<D>{_mm512_set_ph(
590 float16_t{31}, float16_t{30}, float16_t{29}, float16_t{28}, float16_t{27},
591 float16_t{26}, float16_t{25}, float16_t{24}, float16_t{23}, float16_t{22},
592 float16_t{21}, float16_t{20}, float16_t{19}, float16_t{18}, float16_t{17},
593 float16_t{16}, float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12},
594 float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8}, float16_t{7},
595 float16_t{6}, float16_t{5}, float16_t{4}, float16_t{3}, float16_t{2},
596 float16_t{1}, float16_t{0})};
597}
598#endif // HWY_HAVE_FLOAT16
599
600template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
601HWY_INLINE VFromD<D> Iota0(D /*d*/) {
602 return VFromD<D>{_mm512_set_epi32(
603 int32_t{15}, int32_t{14}, int32_t{13}, int32_t{12}, int32_t{11},
604 int32_t{10}, int32_t{9}, int32_t{8}, int32_t{7}, int32_t{6}, int32_t{5},
605 int32_t{4}, int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})};
606}
607
608template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
609HWY_INLINE VFromD<D> Iota0(D /*d*/) {
610 return VFromD<D>{_mm512_set_epi64(int64_t{7}, int64_t{6}, int64_t{5},
611 int64_t{4}, int64_t{3}, int64_t{2},
612 int64_t{1}, int64_t{0})};
613}
614
615template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
616HWY_INLINE VFromD<D> Iota0(D /*d*/) {
617 return VFromD<D>{_mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f,
618 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f,
619 0.0f)};
620}
621
622template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
623HWY_INLINE VFromD<D> Iota0(D /*d*/) {
624 return VFromD<D>{_mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)};
625}
626
627} // namespace detail
628
629template <class D, typename T2, HWY_IF_V_SIZE_D(D, 64)>
630HWY_API VFromD<D> Iota(D d, const T2 first) {
631 return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
632}
633
634// ================================================== LOGICAL
635
636// ------------------------------ Not
637
638template <typename T>
639HWY_API Vec512<T> Not(const Vec512<T> v) {
640 const DFromV<decltype(v)> d;
641 const RebindToUnsigned<decltype(d)> du;
642 using VU = VFromD<decltype(du)>;
643 const __m512i vu = BitCast(du, v).raw;
644 return BitCast(d, VU{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
645}
646
647// ------------------------------ And
648
649template <typename T>
650HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
651 const DFromV<decltype(a)> d; // for float16_t
652 const RebindToUnsigned<decltype(d)> du;
653 return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(BitCast(du, a).raw,
654 BitCast(du, b).raw)});
655}
656
657HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
658 return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
659}
660HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) {
661 return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
662}
663
664// ------------------------------ AndNot
665
666// Returns ~not_mask & mask.
667template <typename T>
668HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
669 const DFromV<decltype(mask)> d; // for float16_t
670 const RebindToUnsigned<decltype(d)> du;
671 return BitCast(d, VFromD<decltype(du)>{_mm512_andnot_si512(
672 BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
673}
674HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
675 const Vec512<float> mask) {
676 return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
677}
678HWY_API Vec512<double> AndNot(const Vec512<double> not_mask,
679 const Vec512<double> mask) {
680 return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
681}
682
683// ------------------------------ Or
684
685template <typename T>
686HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
687 const DFromV<decltype(a)> d; // for float16_t
688 const RebindToUnsigned<decltype(d)> du;
689 return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(BitCast(du, a).raw,
690 BitCast(du, b).raw)});
691}
692
693HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
694 return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
695}
696HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) {
697 return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
698}
699
700// ------------------------------ Xor
701
702template <typename T>
703HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
704 const DFromV<decltype(a)> d; // for float16_t
705 const RebindToUnsigned<decltype(d)> du;
706 return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(BitCast(du, a).raw,
707 BitCast(du, b).raw)});
708}
709
710HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
711 return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
712}
713HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
714 return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
715}
716
717// ------------------------------ Xor3
718template <typename T>
719HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) {
720#if !HWY_IS_MSAN
721 const DFromV<decltype(x1)> d;
722 const RebindToUnsigned<decltype(d)> du;
723 using VU = VFromD<decltype(du)>;
724 const __m512i ret = _mm512_ternarylogic_epi64(
725 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
726 return BitCast(d, VU{ret});
727#else
728 return Xor(x1, Xor(x2, x3));
729#endif
730}
731
732// ------------------------------ Or3
733template <typename T>
734HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
735#if !HWY_IS_MSAN
736 const DFromV<decltype(o1)> d;
737 const RebindToUnsigned<decltype(d)> du;
738 using VU = VFromD<decltype(du)>;
739 const __m512i ret = _mm512_ternarylogic_epi64(
740 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
741 return BitCast(d, VU{ret});
742#else
743 return Or(o1, Or(o2, o3));
744#endif
745}
746
747// ------------------------------ OrAnd
748template <typename T>
749HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
750#if !HWY_IS_MSAN
751 const DFromV<decltype(o)> d;
752 const RebindToUnsigned<decltype(d)> du;
753 using VU = VFromD<decltype(du)>;
754 const __m512i ret = _mm512_ternarylogic_epi64(
755 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
756 return BitCast(d, VU{ret});
757#else
758 return Or(o, And(a1, a2));
759#endif
760}
761
762// ------------------------------ IfVecThenElse
763template <typename T>
764HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
765#if !HWY_IS_MSAN
766 const DFromV<decltype(yes)> d;
767 const RebindToUnsigned<decltype(d)> du;
768 using VU = VFromD<decltype(du)>;
769 return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
770 BitCast(du, yes).raw,
771 BitCast(du, no).raw, 0xCA)});
772#else
773 return IfThenElse(MaskFromVec(mask), yes, no);
774#endif
775}
776
777// ------------------------------ Operator overloads (internal-only if float)
778
779template <typename T>
780HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
781 return And(a, b);
782}
783
784template <typename T>
785HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
786 return Or(a, b);
787}
788
789template <typename T>
790HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
791 return Xor(a, b);
792}
793
794// ------------------------------ PopulationCount
795
796// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
797#if HWY_TARGET <= HWY_AVX3_DL
798
799#ifdef HWY_NATIVE_POPCNT
800#undef HWY_NATIVE_POPCNT
801#else
802#define HWY_NATIVE_POPCNT
803#endif
804
805namespace detail {
806
807template <typename T>
808HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
809 return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
810}
811template <typename T>
812HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
813 return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
814}
815template <typename T>
816HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
817 return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
818}
819template <typename T>
820HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
821 return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
822}
823
824} // namespace detail
825
826template <typename T>
827HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
828 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
829}
830
831#endif // HWY_TARGET <= HWY_AVX3_DL
832
833// ================================================== MASK
834
835// ------------------------------ FirstN
836
837// Possibilities for constructing a bitmask of N ones:
838// - kshift* only consider the lowest byte of the shift count, so they would
839// not correctly handle large n.
840// - Scalar shifts >= 64 are UB.
841// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
842// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
843
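// A minimal sketch of the BZHI semantics relied on below: for a count n already
// clamped to the lane count,
//   _bzhi_u64(~uint64_t{0}, n) == (n < 64) ? ((1ull << n) - 1) : ~uint64_t{0},
// e.g. _bzhi_u64(~uint64_t{0}, 3) == 0b111, a mask of the first three lanes.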
844#if HWY_ARCH_X86_32
845namespace detail {
846
847// 32 bit mask is sufficient for lane size >= 2.
848template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
849HWY_INLINE Mask512<T> FirstN(size_t n) {
850 Mask512<T> m;
851 const uint32_t all = ~uint32_t{0};
852 // BZHI only looks at the lower 8 bits of n, but it has been clamped to
853 // MaxLanes, which is at most 32.
854 m.raw = static_cast<decltype(m.raw)>(_bzhi_u32(all, n));
855 return m;
856}
857
858#if HWY_COMPILER_MSVC >= 1920 || HWY_COMPILER_GCC_ACTUAL >= 900 || \
859 HWY_COMPILER_CLANG || HWY_COMPILER_ICC
860template <typename T, HWY_IF_T_SIZE(T, 1)>
861HWY_INLINE Mask512<T> FirstN(size_t n) {
862 uint32_t lo_mask;
863 uint32_t hi_mask;
864 uint32_t hi_mask_len;
865#if HWY_COMPILER_GCC
866 if (__builtin_constant_p(n >= 32) && n >= 32) {
867 if (__builtin_constant_p(n >= 64) && n >= 64) {
868 hi_mask_len = 32u;
869 } else {
870 hi_mask_len = static_cast<uint32_t>(n) - 32u;
871 }
872 lo_mask = hi_mask = 0xFFFFFFFFu;
873 } else // NOLINT(readability/braces)
874#endif
875 {
876 const uint32_t lo_mask_len = static_cast<uint32_t>(n);
877 lo_mask = _bzhi_u32(0xFFFFFFFFu, lo_mask_len);
878
879#if HWY_COMPILER_GCC
880 if (__builtin_constant_p(lo_mask_len <= 32) && lo_mask_len <= 32) {
881 return Mask512<T>{static_cast<__mmask64>(lo_mask)};
882 }
883#endif
884
885 _addcarry_u32(_subborrow_u32(0, lo_mask_len, 32u, &hi_mask_len),
886 0xFFFFFFFFu, 0u, &hi_mask);
887 }
888 hi_mask = _bzhi_u32(hi_mask, hi_mask_len);
889#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC
890 if (__builtin_constant_p((static_cast<uint64_t>(hi_mask) << 32) | lo_mask))
891#endif
892 return Mask512<T>{static_cast<__mmask64>(
893 (static_cast<uint64_t>(hi_mask) << 32) | lo_mask)};
894#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC
895 else
896 return Mask512<T>{_mm512_kunpackd(static_cast<__mmask64>(hi_mask),
897 static_cast<__mmask64>(lo_mask))};
898#endif
899}
900#else // HWY_COMPILER..
901template <typename T, HWY_IF_T_SIZE(T, 1)>
902HWY_INLINE Mask512<T> FirstN(size_t n) {
903 const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
904 return Mask512<T>{static_cast<__mmask64>(bits)};
905}
906#endif // HWY_COMPILER..
907} // namespace detail
908#endif // HWY_ARCH_X86_32
909
910template <class D, HWY_IF_V_SIZE_D(D, 64)>
911HWY_API MFromD<D> FirstN(D d, size_t n) {
912 // This ensures `num` <= 255 as required by bzhi, which only looks
913 // at the lower 8 bits.
914 n = HWY_MIN(n, MaxLanes(d));
915
916#if HWY_ARCH_X86_64
917 MFromD<D> m;
918 const uint64_t all = ~uint64_t{0};
919 m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(all, n));
920 return m;
921#else
922 return detail::FirstN<TFromD<D>>(n);
923#endif // HWY_ARCH_X86_64
924}
925
926// ------------------------------ IfThenElse
927
928// Returns mask ? b : a.
929
930namespace detail {
931
932// Templates for signed/unsigned integer of a particular size.
933template <typename T>
935 const Mask512<T> mask, const Vec512<T> yes,
936 const Vec512<T> no) {
937 return Vec512<T>{_mm512_mask_blend_epi8(mask.raw, no.raw, yes.raw)};
938}
939template <typename T>
941 const Mask512<T> mask, const Vec512<T> yes,
942 const Vec512<T> no) {
943 return Vec512<T>{_mm512_mask_blend_epi16(mask.raw, no.raw, yes.raw)};
944}
945template <typename T>
947 const Mask512<T> mask, const Vec512<T> yes,
948 const Vec512<T> no) {
949 return Vec512<T>{_mm512_mask_blend_epi32(mask.raw, no.raw, yes.raw)};
950}
951template <typename T>
953 const Mask512<T> mask, const Vec512<T> yes,
954 const Vec512<T> no) {
955 return Vec512<T>{_mm512_mask_blend_epi64(mask.raw, no.raw, yes.raw)};
956}
957
958} // namespace detail
959
960template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
962 const Vec512<T> no) {
963 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
964}
965#if HWY_HAVE_FLOAT16
966HWY_API Vec512<float16_t> IfThenElse(Mask512<float16_t> mask,
967 Vec512<float16_t> yes,
968 Vec512<float16_t> no) {
969 return Vec512<float16_t>{_mm512_mask_blend_ph(mask.raw, no.raw, yes.raw)};
970}
971#endif // HWY_HAVE_FLOAT16
973 Vec512<float> no) {
974 return Vec512<float>{_mm512_mask_blend_ps(mask.raw, no.raw, yes.raw)};
975}
977 Vec512<double> no) {
978 return Vec512<double>{_mm512_mask_blend_pd(mask.raw, no.raw, yes.raw)};
979}
980
981namespace detail {
982
983template <typename T>
985 const Mask512<T> mask,
986 const Vec512<T> yes) {
987 return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
988}
989template <typename T>
991 const Mask512<T> mask,
992 const Vec512<T> yes) {
993 return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
994}
995template <typename T>
997 const Mask512<T> mask,
998 const Vec512<T> yes) {
999 return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
1000}
1001template <typename T>
1003 const Mask512<T> mask,
1004 const Vec512<T> yes) {
1005 return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
1006}
1007
1008} // namespace detail
1009
1010template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1012 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
1013}
1015 return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
1016}
1018 Vec512<double> yes) {
1019 return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
1020}
1021
1022namespace detail {
1023
1024template <typename T>
1026 const Mask512<T> mask, const Vec512<T> no) {
1027 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
1028 return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
1029}
1030template <typename T>
1032 const Mask512<T> mask, const Vec512<T> no) {
1033 return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
1034}
1035template <typename T>
1037 const Mask512<T> mask, const Vec512<T> no) {
1038 return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
1039}
1040template <typename T>
1042 const Mask512<T> mask, const Vec512<T> no) {
1043 return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
1044}
1045
1046} // namespace detail
1047
1048template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1050 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
1051}
1053 return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
1054}
1056 return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
1057}
1058
1059template <typename T>
1061 static_assert(IsSigned<T>(), "Only works for signed/float");
1062 // AVX3 MaskFromVec only looks at the MSB
1063 return IfThenElse(MaskFromVec(v), yes, no);
1064}
1065
1066template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1067 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
1069 // AVX3 MaskFromVec only looks at the MSB
1070 const DFromV<decltype(v)> d;
1071 return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
1072}
1073
1074// ================================================== ARITHMETIC
1075
1076// ------------------------------ Addition
1077
1078// Unsigned
1079HWY_API Vec512<uint8_t> operator+(Vec512<uint8_t> a, Vec512<uint8_t> b) {
1080 return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
1081}
1082HWY_API Vec512<uint16_t> operator+(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1083 return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
1084}
1085HWY_API Vec512<uint32_t> operator+(Vec512<uint32_t> a, Vec512<uint32_t> b) {
1086 return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
1087}
1088HWY_API Vec512<uint64_t> operator+(Vec512<uint64_t> a, Vec512<uint64_t> b) {
1089 return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
1090}
1091
1092// Signed
1093HWY_API Vec512<int8_t> operator+(Vec512<int8_t> a, Vec512<int8_t> b) {
1094 return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
1095}
1096HWY_API Vec512<int16_t> operator+(Vec512<int16_t> a, Vec512<int16_t> b) {
1097 return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
1098}
1099HWY_API Vec512<int32_t> operator+(Vec512<int32_t> a, Vec512<int32_t> b) {
1100 return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
1101}
1102HWY_API Vec512<int64_t> operator+(Vec512<int64_t> a, Vec512<int64_t> b) {
1103 return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
1104}
1105
1106// Float
1107#if HWY_HAVE_FLOAT16
1108HWY_API Vec512<float16_t> operator+(Vec512<float16_t> a, Vec512<float16_t> b) {
1109 return Vec512<float16_t>{_mm512_add_ph(a.raw, b.raw)};
1110}
1111#endif // HWY_HAVE_FLOAT16
1112HWY_API Vec512<float> operator+(Vec512<float> a, Vec512<float> b) {
1113 return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
1114}
1115HWY_API Vec512<double> operator+(Vec512<double> a, Vec512<double> b) {
1116 return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
1117}
1118
1119// ------------------------------ Subtraction
1120
1121// Unsigned
1122HWY_API Vec512<uint8_t> operator-(Vec512<uint8_t> a, Vec512<uint8_t> b) {
1123 return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
1124}
1125HWY_API Vec512<uint16_t> operator-(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1126 return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
1127}
1128HWY_API Vec512<uint32_t> operator-(Vec512<uint32_t> a, Vec512<uint32_t> b) {
1129 return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
1130}
1131HWY_API Vec512<uint64_t> operator-(Vec512<uint64_t> a, Vec512<uint64_t> b) {
1132 return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
1133}
1134
1135// Signed
1136HWY_API Vec512<int8_t> operator-(Vec512<int8_t> a, Vec512<int8_t> b) {
1137 return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
1138}
1139HWY_API Vec512<int16_t> operator-(Vec512<int16_t> a, Vec512<int16_t> b) {
1140 return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
1141}
1142HWY_API Vec512<int32_t> operator-(Vec512<int32_t> a, Vec512<int32_t> b) {
1143 return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
1144}
1145HWY_API Vec512<int64_t> operator-(Vec512<int64_t> a, Vec512<int64_t> b) {
1146 return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
1147}
1148
1149// Float
1150#if HWY_HAVE_FLOAT16
1151HWY_API Vec512<float16_t> operator-(Vec512<float16_t> a, Vec512<float16_t> b) {
1152 return Vec512<float16_t>{_mm512_sub_ph(a.raw, b.raw)};
1153}
1154#endif // HWY_HAVE_FLOAT16
1155HWY_API Vec512<float> operator-(Vec512<float> a, Vec512<float> b) {
1156 return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
1157}
1158HWY_API Vec512<double> operator-(Vec512<double> a, Vec512<double> b) {
1159 return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
1160}
1161
1162// ------------------------------ SumsOf8
1163HWY_API Vec512<uint64_t> SumsOf8(Vec512<uint8_t> v) {
1164 const Full512<uint8_t> d;
1165 return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, Zero(d).raw)};
1166}
1167
1168HWY_API Vec512<uint64_t> SumsOf8AbsDiff(Vec512<uint8_t> a, Vec512<uint8_t> b) {
1169 return Vec512<uint64_t>{_mm512_sad_epu8(a.raw, b.raw)};
1170}
1171
1172// ------------------------------ SumsOf4
1173namespace detail {
1174
1175HWY_INLINE Vec512<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
1176 hwy::SizeTag<1> /*lane_size_tag*/,
1177 Vec512<uint8_t> v) {
1178 const DFromV<decltype(v)> d;
1179
1180 // _mm512_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
1181 // zeroed out and the sums of the 4 consecutive lanes are already in the
1182 // even uint16_t lanes of the _mm512_maskz_dbsad_epu8 result.
1183 return Vec512<uint32_t>{_mm512_maskz_dbsad_epu8(
1184 static_cast<__mmask32>(0x55555555), v.raw, Zero(d).raw, 0)};
1185}
1186
1187// I8->I32 SumsOf4
1188// Generic for all vector lengths
1189template <class V>
1191 hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1192 const DFromV<decltype(v)> d;
1193 const RebindToUnsigned<decltype(d)> du;
1194 const RepartitionToWideX2<decltype(d)> di32;
1195
1196 // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
1197 // of v (which is the same as a bitwise XOR of each i8 lane by 128) and then
1198 // bitcasting the Xor result to a u8 vector.
1199 const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
1200
1201 // Need to add -512 to each i32 lane of the result of the
1202 // SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj) operation to account
1203 // for the adjustment made above.
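 // (Each of the four summed lanes was biased by +128, so the unsigned sum is
 // SumsOf4(v) + 4 * 128 = SumsOf4(v) + 512 in every i32 lane; adding -512
 // removes that bias again.)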
1204 return BitCast(di32, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj)) +
1205 Set(di32, int32_t{-512});
1206}
1207
1208} // namespace detail
1209
1210// ------------------------------ SumsOfShuffledQuadAbsDiff
1211
1212#if HWY_TARGET <= HWY_AVX3
1213template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
1214HWY_API Vec512<uint16_t> SumsOfShuffledQuadAbsDiff(Vec512<uint8_t> a,
1215 Vec512<uint8_t> b) {
1216 static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
1217 static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
1218 static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
1219 static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
1220 return Vec512<uint16_t>{
1221 _mm512_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
1222}
1223#endif
1224
1225// ------------------------------ SaturatedAdd
1226
1227// Returns a + b clamped to the destination range.
1228
1229// Unsigned
1230HWY_API Vec512<uint8_t> SaturatedAdd(Vec512<uint8_t> a, Vec512<uint8_t> b) {
1231 return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
1232}
1233HWY_API Vec512<uint16_t> SaturatedAdd(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1234 return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
1235}
1236
1237// Signed
1238HWY_API Vec512<int8_t> SaturatedAdd(Vec512<int8_t> a, Vec512<int8_t> b) {
1239 return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
1240}
1241HWY_API Vec512<int16_t> SaturatedAdd(Vec512<int16_t> a, Vec512<int16_t> b) {
1242 return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
1243}
1244
1245// ------------------------------ SaturatedSub
1246
1247// Returns a - b clamped to the destination range.
1248
1249// Unsigned
1250HWY_API Vec512<uint8_t> SaturatedSub(Vec512<uint8_t> a, Vec512<uint8_t> b) {
1251 return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
1252}
1253HWY_API Vec512<uint16_t> SaturatedSub(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1254 return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
1255}
1256
1257// Signed
1258HWY_API Vec512<int8_t> SaturatedSub(Vec512<int8_t> a, Vec512<int8_t> b) {
1259 return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
1260}
1261HWY_API Vec512<int16_t> SaturatedSub(Vec512<int16_t> a, Vec512<int16_t> b) {
1262 return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
1263}
1264
1265// ------------------------------ Average
1266
1267// Returns (a + b + 1) / 2
1268
1269// Unsigned
1270HWY_API Vec512<uint8_t> AverageRound(Vec512<uint8_t> a, Vec512<uint8_t> b) {
1271 return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
1272}
1273HWY_API Vec512<uint16_t> AverageRound(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1274 return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
1275}
1276
1277// ------------------------------ Abs (Sub)
1278
1279// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1280HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
1281#if HWY_COMPILER_MSVC
1282 // Workaround for incorrect codegen? (untested due to internal compiler error)
1283 const DFromV<decltype(v)> d;
1284 const auto zero = Zero(d);
1285 return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
1286#else
1287 return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
1288#endif
1289}
1290HWY_API Vec512<int16_t> Abs(const Vec512<int16_t> v) {
1291 return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
1292}
1293HWY_API Vec512<int32_t> Abs(const Vec512<int32_t> v) {
1294 return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
1295}
1296HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
1297 return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
1298}
1299
1300// ------------------------------ ShiftLeft
1301
1302#if HWY_TARGET <= HWY_AVX3_DL
1303namespace detail {
1304template <typename T>
1305HWY_INLINE Vec512<T> GaloisAffine(Vec512<T> v, Vec512<uint64_t> matrix) {
1306 return Vec512<T>{_mm512_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
1307}
1308} // namespace detail
1309#endif // HWY_TARGET <= HWY_AVX3_DL
1310
1311template <int kBits>
1313 return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
1314}
1315
1316template <int kBits>
1318 return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
1319}
1320
1321template <int kBits>
1323 return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
1324}
1325
1326template <int kBits>
1328 return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
1329}
1330
1331template <int kBits>
1333 return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
1334}
1335
1336template <int kBits>
1338 return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
1339}
1340
1341#if HWY_TARGET <= HWY_AVX3_DL
1342
1343// Generic for all vector lengths. Must be defined after all GaloisAffine.
1344template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
1345HWY_API V ShiftLeft(const V v) {
1346 const Repartition<uint64_t, DFromV<V>> du64;
1347 if (kBits == 0) return v;
1348 if (kBits == 1) return v + v;
1349 constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
1350 (0x0101010101010101ULL * (0xFF >> kBits));
1351 return detail::GaloisAffine(v, Set(du64, kMatrix));
1352}
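// For example, with kBits == 2 the constant evaluates to
//   kMatrix = (0x0102040810204080ULL >> 2) & (0x0101010101010101ULL * 0x3F)
//           = 0x0000010204081020ULL,
// which GaloisAffine applies per byte and which amounts to a left shift by 2
// within each byte.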
1353
1354#else // HWY_TARGET > HWY_AVX3_DL
1355
1356template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)>
1357HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
1358 const DFromV<decltype(v)> d8;
1359 const RepartitionToWide<decltype(d8)> d16;
1360 const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
1361 return kBits == 1
1362 ? (v + v)
1363 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1364}
1365
1366#endif // HWY_TARGET > HWY_AVX3_DL
1367
1368// ------------------------------ ShiftRight
1369
1370template <int kBits>
1372 return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
1373}
1374
1375template <int kBits>
1377 return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
1378}
1379
1380template <int kBits>
1382 return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
1383}
1384
1385template <int kBits>
1387 return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
1388}
1389
1390template <int kBits>
1392 return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
1393}
1394
1395template <int kBits>
1397 return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
1398}
1399
1400#if HWY_TARGET <= HWY_AVX3_DL
1401
1402// Generic for all vector lengths. Must be defined after all GaloisAffine.
1403template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
1404HWY_API V ShiftRight(const V v) {
1405 const Repartition<uint64_t, DFromV<V>> du64;
1406 if (kBits == 0) return v;
1407 constexpr uint64_t kMatrix =
1408 (0x0102040810204080ULL << kBits) &
1409 (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1410 return detail::GaloisAffine(v, Set(du64, kMatrix));
1411}
1412
1413// Generic for all vector lengths. Must be defined after all GaloisAffine.
1414template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
1415HWY_API V ShiftRight(const V v) {
1416 const Repartition<uint64_t, DFromV<V>> du64;
1417 if (kBits == 0) return v;
1418 constexpr uint64_t kShift =
1419 (0x0102040810204080ULL << kBits) &
1420 (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1421 constexpr uint64_t kSign =
1422 kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
1423 return detail::GaloisAffine(v, Set(du64, kShift | kSign));
1424}
1425
1426#else // HWY_TARGET > HWY_AVX3_DL
1427
1428template <int kBits>
1429HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
1430 const DFromV<decltype(v)> d8;
1431 // Use raw instead of BitCast to support N=1.
1432 const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
1433 return shifted & Set(d8, 0xFF >> kBits);
1434}
1435
1436template <int kBits>
1437HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
1438 const DFromV<decltype(v)> di;
1439 const RebindToUnsigned<decltype(di)> du;
1440 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1441 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1442 return (shifted ^ shifted_sign) - shifted_sign;
1443}
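// The xor/subtract pair restores the sign after the logical shift. For example,
// with kBits == 2 and an i8 lane of -128 (0x80): the logical shift yields 0x20,
// shifted_sign is also 0x20, and (0x20 ^ 0x20) - 0x20 == -32, matching the
// arithmetic shift -128 >> 2.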
1444
1445#endif // HWY_TARGET > HWY_AVX3_DL
1446
1447// ------------------------------ RotateRight
1448
1449#if HWY_TARGET <= HWY_AVX3_DL
1450// U8 RotateRight is generic for all vector lengths on AVX3_DL
1451template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
1452HWY_API V RotateRight(V v) {
1453 static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
1454
1455 const Repartition<uint64_t, DFromV<V>> du64;
1456 if (kBits == 0) return v;
1457
1458 constexpr uint64_t kShrMatrix =
1459 (0x0102040810204080ULL << kBits) &
1460 (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1461 constexpr int kShlBits = (-kBits) & 7;
1462 constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
1463 (0x0101010101010101ULL * (0xFF >> kShlBits));
1464 constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
1465
1466 return detail::GaloisAffine(v, Set(du64, kMatrix));
1467}
1468#else // HWY_TARGET > HWY_AVX3_DL
1469template <int kBits>
1470HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
1471 static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
1472 if (kBits == 0) return v;
1473 // AVX3 does not support 8-bit.
1474 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
1475}
1476#endif // HWY_TARGET <= HWY_AVX3_DL
1477
1478template <int kBits>
1480 static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
1481 if (kBits == 0) return v;
1482 // AVX3 does not support 16-bit.
1483 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
1484}
1485
1486template <int kBits>
1488 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1489 if (kBits == 0) return v;
1490 return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)};
1491}
1492
1493template <int kBits>
1495 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1496 if (kBits == 0) return v;
1497 return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
1498}
1499
1500// ------------------------------ Rol/Ror
1501
1502#if HWY_TARGET <= HWY_AVX3
1503
1504template <class T, HWY_IF_UI32(T)>
1506 return Vec512<T>{_mm512_rolv_epi32(a.raw, b.raw)};
1507}
1508
1509template <class T, HWY_IF_UI32(T)>
1511 return Vec512<T>{_mm512_rorv_epi32(a.raw, b.raw)};
1512}
1513
1514template <class T, HWY_IF_UI64(T)>
1515HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
1516 return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
1517}
1518
1519template <class T, HWY_IF_UI64(T)>
1520HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
1521 return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
1522}
1523
1524#endif
1525
1526// ------------------------------ ShiftLeftSame
1527
1528// GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
1529// shift-with-immediate: the counts should all be unsigned int.
1530#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100
1531using Shift16Count = int;
1532using Shift3264Count = int;
1533#elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
1534// GCC 11.0 requires these, prior versions used a macro+cast and don't care.
1535using Shift16Count = int;
1536using Shift3264Count = unsigned int;
1537#else
1538// Assume documented behavior. Clang 11, GCC 14 and MSVC 14.28.29910 match this.
1539using Shift16Count = unsigned int;
1540using Shift3264Count = unsigned int;
1541#endif
1542
1544 const int bits) {
1545#if HWY_COMPILER_GCC
1546 if (__builtin_constant_p(bits)) {
1547 return Vec512<uint16_t>{
1548 _mm512_slli_epi16(v.raw, static_cast<Shift16Count>(bits))};
1549 }
1550#endif
1551 return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1552}
1554 const int bits) {
1555#if HWY_COMPILER_GCC
1556 if (__builtin_constant_p(bits)) {
1557 return Vec512<uint32_t>{
1558 _mm512_slli_epi32(v.raw, static_cast<Shift3264Count>(bits))};
1559 }
1560#endif
1561 return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1562}
1564 const int bits) {
1565#if HWY_COMPILER_GCC
1566 if (__builtin_constant_p(bits)) {
1567 return Vec512<uint64_t>{
1568 _mm512_slli_epi64(v.raw, static_cast<Shift3264Count>(bits))};
1569 }
1570#endif
1571 return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1572}
1573
1575#if HWY_COMPILER_GCC
1576 if (__builtin_constant_p(bits)) {
1577 return Vec512<int16_t>{
1578 _mm512_slli_epi16(v.raw, static_cast<Shift16Count>(bits))};
1579 }
1580#endif
1581 return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1582}
1583
1585#if HWY_COMPILER_GCC
1586 if (__builtin_constant_p(bits)) {
1587 return Vec512<int32_t>{
1588 _mm512_slli_epi32(v.raw, static_cast<Shift3264Count>(bits))};
1589 }
1590#endif
1591 return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1592}
1593
1595#if HWY_COMPILER_GCC
1596 if (__builtin_constant_p(bits)) {
1597 return Vec512<int64_t>{
1598 _mm512_slli_epi64(v.raw, static_cast<Shift3264Count>(bits))};
1599 }
1600#endif
1601 return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1602}
1603
1604template <typename T, HWY_IF_T_SIZE(T, 1)>
1605HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
1606 const DFromV<decltype(v)> d8;
1607 const RepartitionToWide<decltype(d8)> d16;
1608 const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
1609 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
1610}
1611
1612// ------------------------------ ShiftRightSame
1613
1615 const int bits) {
1616#if HWY_COMPILER_GCC
1617 if (__builtin_constant_p(bits)) {
1618 return Vec512<uint16_t>{
1619 _mm512_srli_epi16(v.raw, static_cast<Shift16Count>(bits))};
1620 }
1621#endif
1622 return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1623}
1625 const int bits) {
1626#if HWY_COMPILER_GCC
1627 if (__builtin_constant_p(bits)) {
1628 return Vec512<uint32_t>{
1629 _mm512_srli_epi32(v.raw, static_cast<Shift3264Count>(bits))};
1630 }
1631#endif
1632 return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1633}
1635 const int bits) {
1636#if HWY_COMPILER_GCC
1637 if (__builtin_constant_p(bits)) {
1638 return Vec512<uint64_t>{
1639 _mm512_srli_epi64(v.raw, static_cast<Shift3264Count>(bits))};
1640 }
1641#endif
1642 return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1643}
1644
1646 const DFromV<decltype(v)> d8;
1647 const RepartitionToWide<decltype(d8)> d16;
1648 const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1649 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1650}
1651
1653 const int bits) {
1654#if HWY_COMPILER_GCC
1655 if (__builtin_constant_p(bits)) {
1656 return Vec512<int16_t>{
1657 _mm512_srai_epi16(v.raw, static_cast<Shift16Count>(bits))};
1658 }
1659#endif
1660 return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1661}
1662
1664 const int bits) {
1665#if HWY_COMPILER_GCC
1666 if (__builtin_constant_p(bits)) {
1667 return Vec512<int32_t>{
1668 _mm512_srai_epi32(v.raw, static_cast<Shift3264Count>(bits))};
1669 }
1670#endif
1671 return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1672}
1674 const int bits) {
1675#if HWY_COMPILER_GCC
1676 if (__builtin_constant_p(bits)) {
1677 return Vec512<int64_t>{
1678 _mm512_srai_epi64(v.raw, static_cast<Shift3264Count>(bits))};
1679 }
1680#endif
1681 return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1682}
1683
1685 const DFromV<decltype(v)> di;
1686 const RebindToUnsigned<decltype(di)> du;
1687 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1688 const auto shifted_sign =
1689 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1690 return (shifted ^ shifted_sign) - shifted_sign;
1691}
1692
1693// ------------------------------ Minimum
1694
1695// Unsigned
1697 return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
1698}
1700 return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
1701}
1703 return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
1704}
1706 return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
1707}
1708
1709// Signed
1711 return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
1712}
1714 return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
1715}
1717 return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
1718}
1720 return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
1721}
1722
1723// Float
1724#if HWY_HAVE_FLOAT16
1725HWY_API Vec512<float16_t> Min(Vec512<float16_t> a, Vec512<float16_t> b) {
1726 return Vec512<float16_t>{_mm512_min_ph(a.raw, b.raw)};
1727}
1728#endif // HWY_HAVE_FLOAT16
1730 return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
1731}
1733 return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
1734}
1735
1736// ------------------------------ Maximum
1737
1738// Unsigned
1740 return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
1741}
1743 return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
1744}
1746 return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
1747}
1749 return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
1750}
1751
1752// Signed
1754 return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
1755}
1757 return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
1758}
1760 return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
1761}
1763 return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
1764}
1765
1766// Float
1767#if HWY_HAVE_FLOAT16
1768HWY_API Vec512<float16_t> Max(Vec512<float16_t> a, Vec512<float16_t> b) {
1769 return Vec512<float16_t>{_mm512_max_ph(a.raw, b.raw)};
1770}
1771#endif // HWY_HAVE_FLOAT16
1773 return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
1774}
1776 return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1777}
1778
1779// ------------------------------ Integer multiplication
1780
1781// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*.
1782#ifdef HWY_NATIVE_MUL_64
1783#undef HWY_NATIVE_MUL_64
1784#else
1785#define HWY_NATIVE_MUL_64
1786#endif
1787
1788// Unsigned
1790 return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1791}
1793 return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1794}
1796 return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
1797}
1799 return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
1800}
1801template <size_t N>
1806
1807// Signed
1809 return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1810}
1812 return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1813}
1815 return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
1816}
1818 return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
1819}
1820template <size_t N>
1825// Returns the upper 16 bits of a * b in each lane.
1827 return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
1828}
1830 return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
1831}
1832
1834 return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)};
1835}
1836
1837// Multiplies even lanes (0, 2 ..) and places the double-wide result into
1838// even and the upper half into its odd neighbor lane.
1839HWY_API Vec512<int64_t> MulEven(Vec512<int32_t> a, Vec512<int32_t> b) {
1840 return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
1841}
1842HWY_API Vec512<uint64_t> MulEven(Vec512<uint32_t> a, Vec512<uint32_t> b) {
1843 return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
1844}
1845
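// For example, for u32 inputs, u64 lane 0 of the result is
// uint64_t{a[0]} * b[0], u64 lane 1 is uint64_t{a[2]} * b[2], and so on: only
// the even input lanes are multiplied, each product filling one lane pair.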
1846// ------------------------------ Neg (Sub)
1847
1848template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
1850 const DFromV<decltype(v)> d;
1851 return Xor(v, SignBit(d));
1852}
1853
1854template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1855HWY_API Vec512<T> Neg(const Vec512<T> v) {
1856 const DFromV<decltype(v)> d;
1857 return Zero(d) - v;
1858}
1859
1860// ------------------------------ Floating-point mul / div
1861
1862#if HWY_HAVE_FLOAT16
1863HWY_API Vec512<float16_t> operator*(Vec512<float16_t> a, Vec512<float16_t> b) {
1864 return Vec512<float16_t>{_mm512_mul_ph(a.raw, b.raw)};
1865}
1866#endif // HWY_HAVE_FLOAT16
1868 return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
1869}
1871 return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1872}
1873
1874#if HWY_HAVE_FLOAT16
1875HWY_API Vec512<float16_t> operator/(Vec512<float16_t> a, Vec512<float16_t> b) {
1876 return Vec512<float16_t>{_mm512_div_ph(a.raw, b.raw)};
1877}
1878#endif // HWY_HAVE_FLOAT16
1880 return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
1881}
1883 return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
1884}
1885
1886// Approximate reciprocal
1887#if HWY_HAVE_FLOAT16
1888HWY_API Vec512<float16_t> ApproximateReciprocal(const Vec512<float16_t> v) {
1889 return Vec512<float16_t>{_mm512_rcp_ph(v.raw)};
1890}
1891#endif // HWY_HAVE_FLOAT16
1892HWY_API Vec512<float> ApproximateReciprocal(const Vec512<float> v) {
1893 return Vec512<float>{_mm512_rcp14_ps(v.raw)};
1894}
1895
1896HWY_API Vec512<double> ApproximateReciprocal(const Vec512<double> v) {
1897 return Vec512<double>{_mm512_rcp14_pd(v.raw)};
1898}
1899
1900// ------------------------------ MaskedMinOr
1901
1902template <typename T, HWY_IF_U8(T)>
1903HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1904 Vec512<T> b) {
1905 return Vec512<T>{_mm512_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
1906}
1907template <typename T, HWY_IF_I8(T)>
1908HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1909 Vec512<T> b) {
1910 return Vec512<T>{_mm512_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
1911}
1912
1913template <typename T, HWY_IF_U16(T)>
1914HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1915 Vec512<T> b) {
1916 return Vec512<T>{_mm512_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
1917}
1918template <typename T, HWY_IF_I16(T)>
1919HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1920 Vec512<T> b) {
1921 return Vec512<T>{_mm512_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
1922}
1923
1924template <typename T, HWY_IF_U32(T)>
1925HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1926 Vec512<T> b) {
1927 return Vec512<T>{_mm512_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
1928}
1929template <typename T, HWY_IF_I32(T)>
1930HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1931 Vec512<T> b) {
1932 return Vec512<T>{_mm512_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
1933}
1934
1935template <typename T, HWY_IF_U64(T)>
1936HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1937 Vec512<T> b) {
1938 return Vec512<T>{_mm512_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
1939}
1940template <typename T, HWY_IF_I64(T)>
1941HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1942 Vec512<T> b) {
1943 return Vec512<T>{_mm512_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
1944}
1945
1946template <typename T, HWY_IF_F32(T)>
1947HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1948 Vec512<T> b) {
1949 return Vec512<T>{_mm512_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
1950}
1951
1952template <typename T, HWY_IF_F64(T)>
1953HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1954 Vec512<T> b) {
1955 return Vec512<T>{_mm512_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
1956}
1957
1958#if HWY_HAVE_FLOAT16
1959template <typename T, HWY_IF_F16(T)>
1960HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1961 Vec512<T> b) {
1962 return Vec512<T>{_mm512_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
1963}
1964#endif // HWY_HAVE_FLOAT16
1965
1966// ------------------------------ MaskedMaxOr
1967
1968template <typename T, HWY_IF_U8(T)>
1969HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1970 Vec512<T> b) {
1971 return Vec512<T>{_mm512_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
1972}
1973template <typename T, HWY_IF_I8(T)>
1974HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1975 Vec512<T> b) {
1976 return Vec512<T>{_mm512_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
1977}
1978
1979template <typename T, HWY_IF_U16(T)>
1980HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1981 Vec512<T> b) {
1982 return Vec512<T>{_mm512_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
1983}
1984template <typename T, HWY_IF_I16(T)>
1985HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1986 Vec512<T> b) {
1987 return Vec512<T>{_mm512_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
1988}
1989
1990template <typename T, HWY_IF_U32(T)>
1991HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1992 Vec512<T> b) {
1993 return Vec512<T>{_mm512_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
1994}
1995template <typename T, HWY_IF_I32(T)>
1996HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
1997 Vec512<T> b) {
1998 return Vec512<T>{_mm512_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
1999}
2000
2001template <typename T, HWY_IF_U64(T)>
2002HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2003 Vec512<T> b) {
2004 return Vec512<T>{_mm512_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
2005}
2006template <typename T, HWY_IF_I64(T)>
2007HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2008 Vec512<T> b) {
2009 return Vec512<T>{_mm512_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
2010}
2011
2012template <typename T, HWY_IF_F32(T)>
2013HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2014 Vec512<T> b) {
2015 return Vec512<T>{_mm512_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
2016}
2017
2018template <typename T, HWY_IF_F64(T)>
2019HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2020 Vec512<T> b) {
2021 return Vec512<T>{_mm512_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
2022}
2023
2024#if HWY_HAVE_FLOAT16
2025template <typename T, HWY_IF_F16(T)>
2026HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2027 Vec512<T> b) {
2028 return Vec512<T>{_mm512_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
2029}
2030#endif // HWY_HAVE_FLOAT16
2031
2032// ------------------------------ MaskedAddOr
2033
2034template <typename T, HWY_IF_UI8(T)>
2035HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2036 Vec512<T> b) {
2037 return Vec512<T>{_mm512_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
2038}
2039
2040template <typename T, HWY_IF_UI16(T)>
2041HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2042 Vec512<T> b) {
2043 return Vec512<T>{_mm512_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
2044}
2045
2046template <typename T, HWY_IF_UI32(T)>
2047HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2048 Vec512<T> b) {
2049 return Vec512<T>{_mm512_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
2050}
2051
2052template <typename T, HWY_IF_UI64(T)>
2053HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2054 Vec512<T> b) {
2055 return Vec512<T>{_mm512_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
2056}
2057
2058template <typename T, HWY_IF_F32(T)>
2059HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2060 Vec512<T> b) {
2061 return Vec512<T>{_mm512_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
2062}
2063
2064template <typename T, HWY_IF_F64(T)>
2065HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2066 Vec512<T> b) {
2067 return Vec512<T>{_mm512_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
2068}
2069
2070#if HWY_HAVE_FLOAT16
2071template <typename T, HWY_IF_F16(T)>
2072HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2073 Vec512<T> b) {
2074 return Vec512<T>{_mm512_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
2075}
2076#endif // HWY_HAVE_FLOAT16
2077
2078// ------------------------------ MaskedSubOr
2079
2080template <typename T, HWY_IF_UI8(T)>
2081HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2082 Vec512<T> b) {
2083 return Vec512<T>{_mm512_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
2084}
2085
2086template <typename T, HWY_IF_UI16(T)>
2087HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2088 Vec512<T> b) {
2089 return Vec512<T>{_mm512_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
2090}
2091
2092template <typename T, HWY_IF_UI32(T)>
2093HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2094 Vec512<T> b) {
2095 return Vec512<T>{_mm512_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
2096}
2097
2098template <typename T, HWY_IF_UI64(T)>
2099HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2100 Vec512<T> b) {
2101 return Vec512<T>{_mm512_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
2102}
2103
2104template <typename T, HWY_IF_F32(T)>
2105HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2106 Vec512<T> b) {
2107 return Vec512<T>{_mm512_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
2108}
2109
2110template <typename T, HWY_IF_F64(T)>
2111HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2112 Vec512<T> b) {
2113 return Vec512<T>{_mm512_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
2114}
2115
2116#if HWY_HAVE_FLOAT16
2117template <typename T, HWY_IF_F16(T)>
2118HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2119 Vec512<T> b) {
2120 return Vec512<T>{_mm512_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
2121}
2122#endif // HWY_HAVE_FLOAT16
2123
2124// ------------------------------ MaskedMulOr
2125
2126HWY_API Vec512<float> MaskedMulOr(Vec512<float> no, Mask512<float> m,
2127 Vec512<float> a, Vec512<float> b) {
2128 return Vec512<float>{_mm512_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
2129}
2130
2131HWY_API Vec512<double> MaskedMulOr(Vec512<double> no, Mask512<double> m,
2132 Vec512<double> a, Vec512<double> b) {
2133 return Vec512<double>{_mm512_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
2134}
2135
2136#if HWY_HAVE_FLOAT16
2137HWY_API Vec512<float16_t> MaskedMulOr(Vec512<float16_t> no,
2138 Mask512<float16_t> m, Vec512<float16_t> a,
2139 Vec512<float16_t> b) {
2140 return Vec512<float16_t>{_mm512_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
2141}
2142#endif // HWY_HAVE_FLOAT16
2143
2144// ------------------------------ MaskedDivOr
2145
2146HWY_API Vec512<float> MaskedDivOr(Vec512<float> no, Mask512<float> m,
2147 Vec512<float> a, Vec512<float> b) {
2148 return Vec512<float>{_mm512_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
2149}
2150
2151HWY_API Vec512<double> MaskedDivOr(Vec512<double> no, Mask512<double> m,
2152 Vec512<double> a, Vec512<double> b) {
2153 return Vec512<double>{_mm512_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
2154}
2155
2156#if HWY_HAVE_FLOAT16
2157HWY_API Vec512<float16_t> MaskedDivOr(Vec512<float16_t> no,
2158 Mask512<float16_t> m, Vec512<float16_t> a,
2159 Vec512<float16_t> b) {
2160 return Vec512<float16_t>{_mm512_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
2161}
2162#endif // HWY_HAVE_FLOAT16
2163
2164// ------------------------------ MaskedSatAddOr
2165
2166template <typename T, HWY_IF_I8(T)>
2167HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2168 Vec512<T> b) {
2169 return Vec512<T>{_mm512_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
2170}
2171
2172template <typename T, HWY_IF_U8(T)>
2173HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2174 Vec512<T> b) {
2175 return Vec512<T>{_mm512_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
2176}
2177
2178template <typename T, HWY_IF_I16(T)>
2179HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2180 Vec512<T> b) {
2181 return Vec512<T>{_mm512_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
2182}
2183
2184template <typename T, HWY_IF_U16(T)>
2185HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2186 Vec512<T> b) {
2187 return Vec512<T>{_mm512_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
2188}
2189
2190// ------------------------------ MaskedSatSubOr
2191
2192template <typename T, HWY_IF_I8(T)>
2193HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2194 Vec512<T> b) {
2195 return Vec512<T>{_mm512_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
2196}
2197
2198template <typename T, HWY_IF_U8(T)>
2199HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2200 Vec512<T> b) {
2201 return Vec512<T>{_mm512_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
2202}
2203
2204template <typename T, HWY_IF_I16(T)>
2205HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2206 Vec512<T> b) {
2207 return Vec512<T>{_mm512_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
2208}
2209
2210template <typename T, HWY_IF_U16(T)>
2211HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
2212 Vec512<T> b) {
2213 return Vec512<T>{_mm512_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
2214}
2215
2216// ------------------------------ Floating-point multiply-add variants
2217
2218#if HWY_HAVE_FLOAT16
2219
2220HWY_API Vec512<float16_t> MulAdd(Vec512<float16_t> mul, Vec512<float16_t> x,
2221 Vec512<float16_t> add) {
2222 return Vec512<float16_t>{_mm512_fmadd_ph(mul.raw, x.raw, add.raw)};
2223}
2224
2225HWY_API Vec512<float16_t> NegMulAdd(Vec512<float16_t> mul, Vec512<float16_t> x,
2226 Vec512<float16_t> add) {
2227 return Vec512<float16_t>{_mm512_fnmadd_ph(mul.raw, x.raw, add.raw)};
2228}
2229
2230HWY_API Vec512<float16_t> MulSub(Vec512<float16_t> mul, Vec512<float16_t> x,
2231 Vec512<float16_t> sub) {
2232 return Vec512<float16_t>{_mm512_fmsub_ph(mul.raw, x.raw, sub.raw)};
2233}
2234
2235HWY_API Vec512<float16_t> NegMulSub(Vec512<float16_t> mul, Vec512<float16_t> x,
2236 Vec512<float16_t> sub) {
2237 return Vec512<float16_t>{_mm512_fnmsub_ph(mul.raw, x.raw, sub.raw)};
2238}
2239
2240#endif // HWY_HAVE_FLOAT16
2241
2242// Returns mul * x + add
2243HWY_API Vec512<float> MulAdd(Vec512<float> mul, Vec512<float> x,
2244 Vec512<float> add) {
2245 return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
2246}
2247HWY_API Vec512<double> MulAdd(Vec512<double> mul, Vec512<double> x,
2248 Vec512<double> add) {
2249 return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
2250}
2251
2252// Returns add - mul * x
2253HWY_API Vec512<float> NegMulAdd(Vec512<float> mul, Vec512<float> x,
2254 Vec512<float> add) {
2255 return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
2256}
2257HWY_API Vec512<double> NegMulAdd(Vec512<double> mul, Vec512<double> x,
2258 Vec512<double> add) {
2259 return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
2260}
2261
2262// Returns mul * x - sub
2263HWY_API Vec512<float> MulSub(Vec512<float> mul, Vec512<float> x,
2264 Vec512<float> sub) {
2265 return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
2266}
2267HWY_API Vec512<double> MulSub(Vec512<double> mul, Vec512<double> x,
2268 Vec512<double> sub) {
2269 return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
2270}
2271
2272// Returns -mul * x - sub
2273HWY_API Vec512<float> NegMulSub(Vec512<float> mul, Vec512<float> x,
2274 Vec512<float> sub) {
2275 return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2276}
2277HWY_API Vec512<double> NegMulSub(Vec512<double> mul, Vec512<double> x,
2278 Vec512<double> sub) {
2279 return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2280}
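// Example (editorial sketch, not part of the original source): two nested
// MulAdd calls evaluate a quadratic in Horner form, y = (c2*x + c1)*x + c0:
//   const Vec512<float> y = MulAdd(MulAdd(c2, x, c1), x, c0);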
2281
2282#if HWY_HAVE_FLOAT16
2283HWY_API Vec512<float16_t> MulAddSub(Vec512<float16_t> mul, Vec512<float16_t> x,
2284 Vec512<float16_t> sub_or_add) {
2285 return Vec512<float16_t>{_mm512_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
2286}
2287#endif // HWY_HAVE_FLOAT16
2288
2289HWY_API Vec512<float> MulAddSub(Vec512<float> mul, Vec512<float> x,
2290 Vec512<float> sub_or_add) {
2291 return Vec512<float>{_mm512_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
2292}
2293
2294HWY_API Vec512<double> MulAddSub(Vec512<double> mul, Vec512<double> x,
2295 Vec512<double> sub_or_add) {
2296 return Vec512<double>{_mm512_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
2297}
2298
2299// ------------------------------ Floating-point square root
2300
2301// Full precision square root
2302#if HWY_HAVE_FLOAT16
2303HWY_API Vec512<float16_t> Sqrt(const Vec512<float16_t> v) {
2304 return Vec512<float16_t>{_mm512_sqrt_ph(v.raw)};
2305}
2306#endif // HWY_HAVE_FLOAT16
2307HWY_API Vec512<float> Sqrt(const Vec512<float> v) {
2308 return Vec512<float>{_mm512_sqrt_ps(v.raw)};
2309}
2310HWY_API Vec512<double> Sqrt(const Vec512<double> v) {
2311 return Vec512<double>{_mm512_sqrt_pd(v.raw)};
2312}
2313
2314// Approximate reciprocal square root
2315#if HWY_HAVE_FLOAT16
2316HWY_API Vec512<float16_t> ApproximateReciprocalSqrt(Vec512<float16_t> v) {
2317 return Vec512<float16_t>{_mm512_rsqrt_ph(v.raw)};
2318}
2319#endif // HWY_HAVE_FLOAT16
2320HWY_API Vec512<float> ApproximateReciprocalSqrt(Vec512<float> v) {
2321 return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
2322}
2323
2324HWY_API Vec512<double> ApproximateReciprocalSqrt(Vec512<double> v) {
2325 return Vec512<double>{_mm512_rsqrt14_pd(v.raw)};
2326}
2327
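// Example (editorial sketch, not part of the original source): the ~14-bit
// estimate can be refined with one Newton-Raphson step,
// r' = r * (1.5 - 0.5 * x * r * r), assuming d is a 512-bit float tag:
//   const Vec512<float> r = ApproximateReciprocalSqrt(x);
//   const Vec512<float> r1 = r * NegMulAdd(Set(d, 0.5f) * x * r, r, Set(d, 1.5f));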
2328// ------------------------------ Floating-point rounding
2329
2330// Work around warnings in the intrinsic definitions (passing -1 as a mask).
2331HWY_DIAGNOSTICS(push)
2332HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2333
2334// Toward nearest integer, tie to even
2335#if HWY_HAVE_FLOAT16
2336HWY_API Vec512<float16_t> Round(Vec512<float16_t> v) {
2337 return Vec512<float16_t>{_mm512_roundscale_ph(
2338 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2339}
2340#endif // HWY_HAVE_FLOAT16
2341HWY_API Vec512<float> Round(Vec512<float> v) {
2342 return Vec512<float>{_mm512_roundscale_ps(
2343 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2344}
2345HWY_API Vec512<double> Round(Vec512<double> v) {
2346 return Vec512<double>{_mm512_roundscale_pd(
2347 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2348}
2349
2350// Toward zero, aka truncate
2351#if HWY_HAVE_FLOAT16
2352HWY_API Vec512<float16_t> Trunc(Vec512<float16_t> v) {
2353 return Vec512<float16_t>{
2354 _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2355}
2356#endif // HWY_HAVE_FLOAT16
2357HWY_API Vec512<float> Trunc(Vec512<float> v) {
2358 return Vec512<float>{
2359 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2360}
2361HWY_API Vec512<double> Trunc(Vec512<double> v) {
2362 return Vec512<double>{
2363 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2364}
2365
2366// Toward +infinity, aka ceiling
2367#if HWY_HAVE_FLOAT16
2368HWY_API Vec512<float16_t> Ceil(Vec512<float16_t> v) {
2369 return Vec512<float16_t>{
2370 _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2371}
2372#endif // HWY_HAVE_FLOAT16
2373HWY_API Vec512<float> Ceil(Vec512<float> v) {
2374 return Vec512<float>{
2375 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2376}
2377HWY_API Vec512<double> Ceil(Vec512<double> v) {
2378 return Vec512<double>{
2379 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2380}
2381
2382// Toward -infinity, aka floor
2383#if HWY_HAVE_FLOAT16
2384HWY_API Vec512<float16_t> Floor(Vec512<float16_t> v) {
2385 return Vec512<float16_t>{
2386 _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2387}
2388#endif // HWY_HAVE_FLOAT16
2389HWY_API Vec512<float> Floor(Vec512<float> v) {
2390 return Vec512<float>{
2391 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2392}
2393HWY_API Vec512<double> Floor(Vec512<double> v) {
2394 return Vec512<double>{
2395 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2396}
2397
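// Example (editorial sketch, not part of the original source): for lanes equal
// to -1.5f, Round gives -2 (ties to even), Trunc gives -1, Ceil gives -1 and
// Floor gives -2, e.g. assuming d is a 512-bit float tag:
//   const auto r = Round(Set(d, -1.5f));  // -2.0f in every lane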
2398HWY_DIAGNOSTICS(pop)
2399
2400// ================================================== COMPARE
2401
2402// Comparisons set a mask bit to 1 if the condition is true, else 0.
2403
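// Example (editorial sketch, not part of the original source): the result of a
// comparison is a compact __mmask* rather than a vector, so it can feed masked
// ops directly:
//   const Mask512<int32_t> m = a > b;                   // one bit per lane
//   const Vec512<int32_t> r = MaskedAddOr(a, m, a, b);  // a+b where a > b, else a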
2404template <class DTo, typename TFrom>
2405HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask512<TFrom> m) {
2406 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
2407 return MFromD<DTo>{m.raw};
2408}
2409
2410namespace detail {
2411
2412template <typename T>
2413HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<1> /*tag*/, Vec512<T> v,
2414 Vec512<T> bit) {
2415 return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
2416}
2417template <typename T>
2418HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<2> /*tag*/, Vec512<T> v,
2419 Vec512<T> bit) {
2420 return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
2421}
2422template <typename T>
2423HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<4> /*tag*/, Vec512<T> v,
2424 Vec512<T> bit) {
2425 return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
2426}
2427template <typename T>
2428HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<8> /*tag*/, Vec512<T> v,
2429 Vec512<T> bit) {
2430 return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
2431}
2432
2433} // namespace detail
2434
2435template <typename T>
2436HWY_API Mask512<T> TestBit(Vec512<T> v, Vec512<T> bit) {
2437 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
2438 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
2439}
2440
2441// ------------------------------ Equality
2442
2443template <typename T, HWY_IF_T_SIZE(T, 1)>
2444HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
2445 return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
2446}
2447template <typename T, HWY_IF_T_SIZE(T, 2)>
2448HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
2449 return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
2450}
2451template <typename T, HWY_IF_UI32(T)>
2452HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
2453 return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
2454}
2455template <typename T, HWY_IF_UI64(T)>
2456HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
2457 return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
2458}
2459
2460#if HWY_HAVE_FLOAT16
2461HWY_API Mask512<float16_t> operator==(Vec512<float16_t> a,
2462 Vec512<float16_t> b) {
2463 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2464 HWY_DIAGNOSTICS(push)
2465 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2466 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2467 HWY_DIAGNOSTICS(pop)
2468}
2469#endif // HWY_HAVE_FLOAT16
2470
2471HWY_API Mask512<float> operator==(Vec512<float> a, Vec512<float> b) {
2472 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2473}
2474
2475HWY_API Mask512<double> operator==(Vec512<double> a, Vec512<double> b) {
2476 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2477}
2478
2479// ------------------------------ Inequality
2480
2481template <typename T, HWY_IF_T_SIZE(T, 1)>
2482HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
2483 return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
2484}
2485template <typename T, HWY_IF_T_SIZE(T, 2)>
2486HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
2487 return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
2488}
2489template <typename T, HWY_IF_UI32(T)>
2490HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
2491 return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
2492}
2493template <typename T, HWY_IF_UI64(T)>
2494HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
2495 return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
2496}
2497
2498#if HWY_HAVE_FLOAT16
2499HWY_API Mask512<float16_t> operator!=(Vec512<float16_t> a,
2500 Vec512<float16_t> b) {
2501 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2502 HWY_DIAGNOSTICS(push)
2503 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2504 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2505 HWY_DIAGNOSTICS(pop)
2506}
2507#endif // HWY_HAVE_FLOAT16
2508
2509HWY_API Mask512<float> operator!=(Vec512<float> a, Vec512<float> b) {
2510 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2511}
2512
2513HWY_API Mask512<double> operator!=(Vec512<double> a, Vec512<double> b) {
2514 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2515}
2516
2517// ------------------------------ Strict inequality
2518
2519HWY_API Mask512<uint8_t> operator>(Vec512<uint8_t> a, Vec512<uint8_t> b) {
2520 return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)};
2521}
2522HWY_API Mask512<uint16_t> operator>(Vec512<uint16_t> a, Vec512<uint16_t> b) {
2523 return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)};
2524}
2525HWY_API Mask512<uint32_t> operator>(Vec512<uint32_t> a, Vec512<uint32_t> b) {
2526 return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)};
2527}
2528HWY_API Mask512<uint64_t> operator>(Vec512<uint64_t> a, Vec512<uint64_t> b) {
2529 return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)};
2530}
2531
2532HWY_API Mask512<int8_t> operator>(Vec512<int8_t> a, Vec512<int8_t> b) {
2533 return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
2534}
2535HWY_API Mask512<int16_t> operator>(Vec512<int16_t> a, Vec512<int16_t> b) {
2536 return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
2537}
2538HWY_API Mask512<int32_t> operator>(Vec512<int32_t> a, Vec512<int32_t> b) {
2539 return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
2540}
2541HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) {
2542 return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
2543}
2544
2545#if HWY_HAVE_FLOAT16
2546HWY_API Mask512<float16_t> operator>(Vec512<float16_t> a, Vec512<float16_t> b) {
2547 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2548 HWY_DIAGNOSTICS(push)
2549 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2550 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
2551 HWY_DIAGNOSTICS(pop)
2552}
2553#endif // HWY_HAVE_FLOAT16
2554
2555HWY_API Mask512<float> operator>(Vec512<float> a, Vec512<float> b) {
2556 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
2557}
2558HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) {
2559 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
2560}
2561
2562// ------------------------------ Weak inequality
2563
2564#if HWY_HAVE_FLOAT16
2565HWY_API Mask512<float16_t> operator>=(Vec512<float16_t> a,
2566 Vec512<float16_t> b) {
2567 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2568 HWY_DIAGNOSTICS(push)
2569 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2570 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
2571 HWY_DIAGNOSTICS(pop)
2572}
2573#endif // HWY_HAVE_FLOAT16
2574
2575HWY_API Mask512<float> operator>=(Vec512<float> a, Vec512<float> b) {
2576 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
2577}
2578HWY_API Mask512<double> operator>=(Vec512<double> a, Vec512<double> b) {
2579 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
2580}
2581
2582HWY_API Mask512<uint8_t> operator>=(Vec512<uint8_t> a, Vec512<uint8_t> b) {
2583 return Mask512<uint8_t>{_mm512_cmpge_epu8_mask(a.raw, b.raw)};
2584}
2585HWY_API Mask512<uint16_t> operator>=(Vec512<uint16_t> a, Vec512<uint16_t> b) {
2586 return Mask512<uint16_t>{_mm512_cmpge_epu16_mask(a.raw, b.raw)};
2587}
2588HWY_API Mask512<uint32_t> operator>=(Vec512<uint32_t> a, Vec512<uint32_t> b) {
2589 return Mask512<uint32_t>{_mm512_cmpge_epu32_mask(a.raw, b.raw)};
2590}
2591HWY_API Mask512<uint64_t> operator>=(Vec512<uint64_t> a, Vec512<uint64_t> b) {
2592 return Mask512<uint64_t>{_mm512_cmpge_epu64_mask(a.raw, b.raw)};
2593}
2594
2595HWY_API Mask512<int8_t> operator>=(Vec512<int8_t> a, Vec512<int8_t> b) {
2596 return Mask512<int8_t>{_mm512_cmpge_epi8_mask(a.raw, b.raw)};
2597}
2598HWY_API Mask512<int16_t> operator>=(Vec512<int16_t> a, Vec512<int16_t> b) {
2599 return Mask512<int16_t>{_mm512_cmpge_epi16_mask(a.raw, b.raw)};
2600}
2601HWY_API Mask512<int32_t> operator>=(Vec512<int32_t> a, Vec512<int32_t> b) {
2602 return Mask512<int32_t>{_mm512_cmpge_epi32_mask(a.raw, b.raw)};
2603}
2604HWY_API Mask512<int64_t> operator>=(Vec512<int64_t> a, Vec512<int64_t> b) {
2605 return Mask512<int64_t>{_mm512_cmpge_epi64_mask(a.raw, b.raw)};
2606}
2607
2608// ------------------------------ Reversed comparisons
2609
2610template <typename T>
2611HWY_API Mask512<T> operator<(Vec512<T> a, Vec512<T> b) {
2612 return b > a;
2613}
2614
2615template <typename T>
2616HWY_API Mask512<T> operator<=(Vec512<T> a, Vec512<T> b) {
2617 return b >= a;
2618}
2619
2620// ------------------------------ Mask
2621
2622namespace detail {
2623
2624template <typename T>
2625HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512<T> v) {
2626 return Mask512<T>{_mm512_movepi8_mask(v.raw)};
2627}
2628template <typename T>
2629HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512<T> v) {
2630 return Mask512<T>{_mm512_movepi16_mask(v.raw)};
2631}
2632template <typename T>
2633HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512<T> v) {
2634 return Mask512<T>{_mm512_movepi32_mask(v.raw)};
2635}
2636template <typename T>
2637HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512<T> v) {
2638 return Mask512<T>{_mm512_movepi64_mask(v.raw)};
2639}
2640
2641} // namespace detail
2642
2643template <typename T, HWY_IF_NOT_FLOAT(T)>
2644HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2645 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
2646}
2647template <typename T, HWY_IF_FLOAT(T)>
2648HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2649 const RebindToSigned<DFromV<decltype(v)>> di;
2650 return Mask512<T>{MaskFromVec(BitCast(di, v)).raw};
2651}
2652
2653HWY_API Vec512<uint8_t> VecFromMask(Mask512<uint8_t> v) {
2654 return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
2655}
2656HWY_API Vec512<int8_t> VecFromMask(Mask512<int8_t> v) {
2657 return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
2658}
2659
2660HWY_API Vec512<uint16_t> VecFromMask(Mask512<uint16_t> v) {
2661 return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
2662}
2663HWY_API Vec512<int16_t> VecFromMask(Mask512<int16_t> v) {
2664 return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
2665}
2666#if HWY_HAVE_FLOAT16
2667HWY_API Vec512<float16_t> VecFromMask(Mask512<float16_t> v) {
2668 return Vec512<float16_t>{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))};
2669}
2670#endif // HWY_HAVE_FLOAT16
2671
2672HWY_API Vec512<uint32_t> VecFromMask(Mask512<uint32_t> v) {
2673 return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
2674}
2675HWY_API Vec512<int32_t> VecFromMask(Mask512<int32_t> v) {
2676 return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
2677}
2678HWY_API Vec512<float> VecFromMask(Mask512<float> v) {
2679 return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
2680}
2681
2682HWY_API Vec512<uint64_t> VecFromMask(Mask512<uint64_t> v) {
2683 return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
2684}
2685HWY_API Vec512<int64_t> VecFromMask(Mask512<int64_t> v) {
2686 return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
2687}
2688HWY_API Vec512<double> VecFromMask(Mask512<double> v) {
2689 return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
2690}
2691
2692// ------------------------------ Mask logical
2693
2694namespace detail {
2695
2696template <typename T>
2697HWY_INLINE Mask512<T> Not(hwy::SizeTag<1> /*tag*/, Mask512<T> m) {
2698#if HWY_COMPILER_HAS_MASK_INTRINSICS
2699 return Mask512<T>{_knot_mask64(m.raw)};
2700#else
2701 return Mask512<T>{~m.raw};
2702#endif
2703}
2704template <typename T>
2705HWY_INLINE Mask512<T> Not(hwy::SizeTag<2> /*tag*/, Mask512<T> m) {
2706#if HWY_COMPILER_HAS_MASK_INTRINSICS
2707 return Mask512<T>{_knot_mask32(m.raw)};
2708#else
2709 return Mask512<T>{~m.raw};
2710#endif
2711}
2712template <typename T>
2713HWY_INLINE Mask512<T> Not(hwy::SizeTag<4> /*tag*/, Mask512<T> m) {
2714#if HWY_COMPILER_HAS_MASK_INTRINSICS
2715 return Mask512<T>{_knot_mask16(m.raw)};
2716#else
2717 return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
2718#endif
2719}
2720template <typename T>
2721HWY_INLINE Mask512<T> Not(hwy::SizeTag<8> /*tag*/, Mask512<T> m) {
2722#if HWY_COMPILER_HAS_MASK_INTRINSICS
2723 return Mask512<T>{_knot_mask8(m.raw)};
2724#else
2725 return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
2726#endif
2727}
2728
2729template <typename T>
2730HWY_INLINE Mask512<T> And(hwy::SizeTag<1> /*tag*/, Mask512<T> a, Mask512<T> b) {
2731#if HWY_COMPILER_HAS_MASK_INTRINSICS
2732 return Mask512<T>{_kand_mask64(a.raw, b.raw)};
2733#else
2734 return Mask512<T>{a.raw & b.raw};
2735#endif
2736}
2737template <typename T>
2738HWY_INLINE Mask512<T> And(hwy::SizeTag<2> /*tag*/, Mask512<T> a, Mask512<T> b) {
2739#if HWY_COMPILER_HAS_MASK_INTRINSICS
2740 return Mask512<T>{_kand_mask32(a.raw, b.raw)};
2741#else
2742 return Mask512<T>{a.raw & b.raw};
2743#endif
2744}
2745template <typename T>
2746HWY_INLINE Mask512<T> And(hwy::SizeTag<4> /*tag*/, Mask512<T> a, Mask512<T> b) {
2747#if HWY_COMPILER_HAS_MASK_INTRINSICS
2748 return Mask512<T>{_kand_mask16(a.raw, b.raw)};
2749#else
2750 return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
2751#endif
2752}
2753template <typename T>
2754HWY_INLINE Mask512<T> And(hwy::SizeTag<8> /*tag*/, Mask512<T> a, Mask512<T> b) {
2755#if HWY_COMPILER_HAS_MASK_INTRINSICS
2756 return Mask512<T>{_kand_mask8(a.raw, b.raw)};
2757#else
2758 return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
2759#endif
2760}
2761
2762template <typename T>
2763HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<1> /*tag*/, Mask512<T> a,
2764 Mask512<T> b) {
2765#if HWY_COMPILER_HAS_MASK_INTRINSICS
2766 return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
2767#else
2768 return Mask512<T>{~a.raw & b.raw};
2769#endif
2770}
2771template <typename T>
2772HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<2> /*tag*/, Mask512<T> a,
2773 Mask512<T> b) {
2774#if HWY_COMPILER_HAS_MASK_INTRINSICS
2775 return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
2776#else
2777 return Mask512<T>{~a.raw & b.raw};
2778#endif
2779}
2780template <typename T>
2781HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<4> /*tag*/, Mask512<T> a,
2782 Mask512<T> b) {
2783#if HWY_COMPILER_HAS_MASK_INTRINSICS
2784 return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
2785#else
2786 return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
2787#endif
2788}
2789template <typename T>
2790HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<8> /*tag*/, Mask512<T> a,
2791 Mask512<T> b) {
2792#if HWY_COMPILER_HAS_MASK_INTRINSICS
2793 return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
2794#else
2795 return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
2796#endif
2797}
2798
2799template <typename T>
2800HWY_INLINE Mask512<T> Or(hwy::SizeTag<1> /*tag*/, Mask512<T> a, Mask512<T> b) {
2801#if HWY_COMPILER_HAS_MASK_INTRINSICS
2802 return Mask512<T>{_kor_mask64(a.raw, b.raw)};
2803#else
2804 return Mask512<T>{a.raw | b.raw};
2805#endif
2806}
2807template <typename T>
2808HWY_INLINE Mask512<T> Or(hwy::SizeTag<2> /*tag*/, Mask512<T> a, Mask512<T> b) {
2809#if HWY_COMPILER_HAS_MASK_INTRINSICS
2810 return Mask512<T>{_kor_mask32(a.raw, b.raw)};
2811#else
2812 return Mask512<T>{a.raw | b.raw};
2813#endif
2814}
2815template <typename T>
2816HWY_INLINE Mask512<T> Or(hwy::SizeTag<4> /*tag*/, Mask512<T> a, Mask512<T> b) {
2817#if HWY_COMPILER_HAS_MASK_INTRINSICS
2818 return Mask512<T>{_kor_mask16(a.raw, b.raw)};
2819#else
2820 return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
2821#endif
2822}
2823template <typename T>
2824HWY_INLINE Mask512<T> Or(hwy::SizeTag<8> /*tag*/, Mask512<T> a, Mask512<T> b) {
2825#if HWY_COMPILER_HAS_MASK_INTRINSICS
2826 return Mask512<T>{_kor_mask8(a.raw, b.raw)};
2827#else
2828 return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
2829#endif
2830}
2831
2832template <typename T>
2833HWY_INLINE Mask512<T> Xor(hwy::SizeTag<1> /*tag*/, Mask512<T> a, Mask512<T> b) {
2834#if HWY_COMPILER_HAS_MASK_INTRINSICS
2835 return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
2836#else
2837 return Mask512<T>{a.raw ^ b.raw};
2838#endif
2839}
2840template <typename T>
2841HWY_INLINE Mask512<T> Xor(hwy::SizeTag<2> /*tag*/, Mask512<T> a, Mask512<T> b) {
2842#if HWY_COMPILER_HAS_MASK_INTRINSICS
2843 return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
2844#else
2845 return Mask512<T>{a.raw ^ b.raw};
2846#endif
2847}
2848template <typename T>
2849HWY_INLINE Mask512<T> Xor(hwy::SizeTag<4> /*tag*/, Mask512<T> a, Mask512<T> b) {
2850#if HWY_COMPILER_HAS_MASK_INTRINSICS
2851 return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
2852#else
2853 return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
2854#endif
2855}
2856template <typename T>
2857HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, Mask512<T> a, Mask512<T> b) {
2858#if HWY_COMPILER_HAS_MASK_INTRINSICS
2859 return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
2860#else
2861 return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
2862#endif
2863}
2864
2865template <typename T>
2866HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/, Mask512<T> a,
2867 Mask512<T> b) {
2868#if HWY_COMPILER_HAS_MASK_INTRINSICS
2869 return Mask512<T>{_kxnor_mask64(a.raw, b.raw)};
2870#else
2871 return Mask512<T>{~(a.raw ^ b.raw)};
2872#endif
2873}
2874template <typename T>
2875HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/, Mask512<T> a,
2876 Mask512<T> b) {
2877#if HWY_COMPILER_HAS_MASK_INTRINSICS
2878 return Mask512<T>{_kxnor_mask32(a.raw, b.raw)};
2879#else
2880 return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
2881#endif
2882}
2883template <typename T>
2884HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/, Mask512<T> a,
2885 Mask512<T> b) {
2886#if HWY_COMPILER_HAS_MASK_INTRINSICS
2887 return Mask512<T>{_kxnor_mask16(a.raw, b.raw)};
2888#else
2889 return Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
2890#endif
2891}
2892template <typename T>
2893HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/, Mask512<T> a,
2894 Mask512<T> b) {
2895#if HWY_COMPILER_HAS_MASK_INTRINSICS
2896 return Mask512<T>{_kxnor_mask8(a.raw, b.raw)};
2897#else
2898 return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
2899#endif
2900}
2901
2902} // namespace detail
2903
2904template <typename T>
2905HWY_API Mask512<T> Not(Mask512<T> m) {
2906 return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
2907}
2908
2909template <typename T>
2910HWY_API Mask512<T> And(Mask512<T> a, Mask512<T> b) {
2911 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
2912}
2913
2914template <typename T>
2915HWY_API Mask512<T> AndNot(Mask512<T> a, Mask512<T> b) {
2916 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
2917}
2918
2919template <typename T>
2920HWY_API Mask512<T> Or(Mask512<T> a, Mask512<T> b) {
2921 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
2922}
2923
2924template <typename T>
2925HWY_API Mask512<T> Xor(Mask512<T> a, Mask512<T> b) {
2926 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
2927}
2928
2929template <typename T>
2930HWY_API Mask512<T> ExclusiveNeither(Mask512<T> a, Mask512<T> b) {
2931 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
2932}
2933
2934template <class D, HWY_IF_LANES_D(D, 64)>
2935HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
2936 MFromD<Half<D>> lo) {
2937#if HWY_COMPILER_HAS_MASK_INTRINSICS
2938 const __mmask64 combined_mask = _mm512_kunpackd(
2939 static_cast<__mmask64>(hi.raw), static_cast<__mmask64>(lo.raw));
2940#else
2941 const __mmask64 combined_mask = static_cast<__mmask64>(
2942 ((static_cast<uint64_t>(hi.raw) << 32) | (lo.raw & 0xFFFFFFFFULL)));
2943#endif
2944
2945 return MFromD<D>{combined_mask};
2946}
2947
2948template <class D, HWY_IF_LANES_D(D, 32)>
2949HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
2950#if HWY_COMPILER_HAS_MASK_INTRINSICS
2951 const auto shifted_mask = _kshiftri_mask64(static_cast<__mmask64>(m.raw), 32);
2952#else
2953 const auto shifted_mask = static_cast<uint64_t>(m.raw) >> 32;
2954#endif
2955
2956 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
2957}
2958
2959template <class D, HWY_IF_LANES_D(D, 64)>
2960HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
2961 using RawM = decltype(MFromD<D>().raw);
2962#if HWY_COMPILER_HAS_MASK_INTRINSICS
2963 return MFromD<D>{
2964 static_cast<RawM>(_kshiftli_mask64(static_cast<__mmask64>(m.raw), 1))};
2965#else
2966 return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) << 1)};
2967#endif
2968}
2969
2970template <class D, HWY_IF_LANES_D(D, 64)>
2971HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
2972 using RawM = decltype(MFromD<D>().raw);
2973#if HWY_COMPILER_HAS_MASK_INTRINSICS
2974 return MFromD<D>{
2975 static_cast<RawM>(_kshiftri_mask64(static_cast<__mmask64>(m.raw), 1))};
2976#else
2977 return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) >> 1)};
2978#endif
2979}
2980
2981// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2982
2983HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
2984#if HWY_TARGET <= HWY_AVX3_DL
2985 const Repartition<uint64_t, DFromV<decltype(v)>> du64;
2986 return detail::GaloisAffine(v, Set(du64, 0x8080808080808080ull));
2987#else
2988 const DFromV<decltype(v)> d;
2989 return VecFromMask(v < Zero(d));
2990#endif
2991}
2992
2993HWY_API Vec512<int16_t> BroadcastSignBit(const Vec512<int16_t> v) {
2994 return ShiftRight<15>(v);
2995}
2996
2997HWY_API Vec512<int32_t> BroadcastSignBit(const Vec512<int32_t> v) {
2998 return ShiftRight<31>(v);
2999}
3000
3001HWY_API Vec512<int64_t> BroadcastSignBit(const Vec512<int64_t> v) {
3002 return ShiftRight<63>(v);
3003}
3004
3005// ------------------------------ Floating-point classification (Not)
3006
3007#if HWY_HAVE_FLOAT16 || HWY_IDE
3008
3009HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
3010 return Mask512<float16_t>{_mm512_fpclass_ph_mask(
3011 v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3012}
3013
3014HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
3015 Vec512<float16_t> b) {
3016 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3017 HWY_DIAGNOSTICS(push)
3018 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3019 return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3020 HWY_DIAGNOSTICS(pop)
3021}
3022
3023HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
3024 return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
3025}
3026
3027// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
3028// positive, so we have to check for inf/NaN and negate.
3029HWY_API Mask512<float16_t> IsFinite(Vec512<float16_t> v) {
3030 return Not(Mask512<float16_t>{_mm512_fpclass_ph_mask(
3033}
3034
3035#endif // HWY_HAVE_FLOAT16
3036
3037HWY_API Mask512<float> IsNaN(Vec512<float> v) {
3038 return Mask512<float>{_mm512_fpclass_ps_mask(
3039 v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3040}
3045
3046HWY_API Mask512<float> IsEitherNaN(Vec512<float> a, Vec512<float> b) {
3047 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3048}
3049
3050HWY_API Mask512<double> IsEitherNaN(Vec512<double> a, Vec512<double> b) {
3051 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3052}
3053
3062
3063// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
3064// positive, so we have to check for inf/NaN and negate.
3075
3076// ================================================== MEMORY
3077
3078// ------------------------------ Load
3079
3080template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3081HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
3082 return VFromD<D>{_mm512_load_si512(aligned)};
3083}
3084// bfloat16_t is handled by x86_128-inl.h.
3085#if HWY_HAVE_FLOAT16
3086template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3087HWY_API Vec512<float16_t> Load(D /* tag */,
3088 const float16_t* HWY_RESTRICT aligned) {
3089 return Vec512<float16_t>{_mm512_load_ph(aligned)};
3090}
3091#endif // HWY_HAVE_FLOAT16
3092template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3093HWY_API Vec512<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
3094 return Vec512<float>{_mm512_load_ps(aligned)};
3095}
3096template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3097HWY_API VFromD<D> Load(D /* tag */, const double* HWY_RESTRICT aligned) {
3098 return VFromD<D>{_mm512_load_pd(aligned)};
3099}
3100
3101template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3102HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
3103 return VFromD<D>{_mm512_loadu_si512(p)};
3104}
3105
3106// bfloat16_t is handled by x86_128-inl.h.
3107#if HWY_HAVE_FLOAT16
3108template <class D, HWY_IF_V_SIZE_D(D, 64)>
3109HWY_API Vec512<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
3110 return Vec512<float16_t>{_mm512_loadu_ph(p)};
3111}
3112#endif // HWY_HAVE_FLOAT16
3113template <class D, HWY_IF_V_SIZE_D(D, 64)>
3114HWY_API Vec512<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
3115 return Vec512<float>{_mm512_loadu_ps(p)};
3116}
3117template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3118HWY_API VFromD<D> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
3119 return VFromD<D>{_mm512_loadu_pd(p)};
3120}
3121
3122// ------------------------------ MaskedLoad
3123
3124template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
3125HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3126 const TFromD<D>* HWY_RESTRICT p) {
3127 return VFromD<D>{_mm512_maskz_loadu_epi8(m.raw, p)};
3128}
3129
3130template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3131HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
3132 const TFromD<D>* HWY_RESTRICT p) {
3133 const RebindToUnsigned<D> du; // for float16_t
3134 return BitCast(d, VFromD<decltype(du)>{_mm512_maskz_loadu_epi16(
3135 m.raw, reinterpret_cast<const uint16_t*>(p))});
3136}
3137
3138template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3139HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3140 const TFromD<D>* HWY_RESTRICT p) {
3141 return VFromD<D>{_mm512_maskz_loadu_epi32(m.raw, p)};
3142}
3143
3144template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3145HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3146 const TFromD<D>* HWY_RESTRICT p) {
3147 return VFromD<D>{_mm512_maskz_loadu_epi64(m.raw, p)};
3148}
3149
3150template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3151HWY_API Vec512<float> MaskedLoad(Mask512<float> m, D /* tag */,
3152 const float* HWY_RESTRICT p) {
3153 return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
3154}
3155
3156template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3157HWY_API Vec512<double> MaskedLoad(Mask512<double> m, D /* tag */,
3158 const double* HWY_RESTRICT p) {
3159 return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
3160}
3161
3162// ------------------------------ MaskedLoadOr
3163
3164template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
3165HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3166 const TFromD<D>* HWY_RESTRICT p) {
3167 return VFromD<D>{_mm512_mask_loadu_epi8(v.raw, m.raw, p)};
3168}
3169
3170template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3171HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
3172 const TFromD<D>* HWY_RESTRICT p) {
3173 const RebindToUnsigned<decltype(d)> du; // for float16_t
3174 return BitCast(
3175 d, VFromD<decltype(du)>{_mm512_mask_loadu_epi16(
3176 BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))});
3177}
3178
3179template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3180HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3181 const TFromD<D>* HWY_RESTRICT p) {
3182 return VFromD<D>{_mm512_mask_loadu_epi32(v.raw, m.raw, p)};
3183}
3184
3185template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3186HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3187 const TFromD<D>* HWY_RESTRICT p) {
3188 return VFromD<D>{_mm512_mask_loadu_epi64(v.raw, m.raw, p)};
3189}
3190
3191template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3192HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3193 const float* HWY_RESTRICT p) {
3194 return VFromD<D>{_mm512_mask_loadu_ps(v.raw, m.raw, p)};
3195}
3196
3197template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3198HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3199 const double* HWY_RESTRICT p) {
3200 return VFromD<D>{_mm512_mask_loadu_pd(v.raw, m.raw, p)};
3201}
3202
3203// ------------------------------ LoadDup128
3204
3205// Loads 128 bit and duplicates into both 128-bit halves. This avoids the
3206// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
3207template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3208HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
3209 const RebindToUnsigned<decltype(d)> du;
3210 const Full128<TFromD<D>> d128;
3211 const RebindToUnsigned<decltype(d128)> du128;
3212 return BitCast(d, VFromD<decltype(du)>{_mm512_broadcast_i32x4(
3213 BitCast(du128, LoadU(d128, p)).raw)});
3214}
3215template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3216HWY_API VFromD<D> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
3217 const __m128 x4 = _mm_loadu_ps(p);
3218 return VFromD<D>{_mm512_broadcast_f32x4(x4)};
3219}
3220
3221template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3222HWY_API VFromD<D> LoadDup128(D /* tag */, const double* HWY_RESTRICT p) {
3223 const __m128d x2 = _mm_loadu_pd(p);
3224 return VFromD<D>{_mm512_broadcast_f64x2(x2)};
3225}
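// Example (editorial sketch, not part of the original source): LoadDup128 pairs
// naturally with per-128-bit-block ops such as TableLookupBytes, because every
// block then sees the same 16-byte table. Assuming d is a 512-bit uint8_t tag:
//   alignas(16) static constexpr uint8_t kTable[16] = {0, 1, 2,  3,  4,  5,  6,  7,
//                                                      8, 9, 10, 11, 12, 13, 14, 15};
//   const VFromD<D> table = LoadDup128(d, kTable);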
3226
3227// ------------------------------ Store
3228
3229template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3230HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
3231 _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
3232}
3233// bfloat16_t is handled by x86_128-inl.h.
3234#if HWY_HAVE_FLOAT16
3235template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3236HWY_API void Store(Vec512<float16_t> v, D /* tag */,
3237 float16_t* HWY_RESTRICT aligned) {
3238 _mm512_store_ph(aligned, v.raw);
3239}
3240#endif
3241template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3242HWY_API void Store(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
3243 _mm512_store_ps(aligned, v.raw);
3244}
3245template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3246HWY_API void Store(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) {
3247 _mm512_store_pd(aligned, v.raw);
3248}
3249
3250template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3251HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
3252 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
3253}
3254// bfloat16_t is handled by x86_128-inl.h.
3255#if HWY_HAVE_FLOAT16
3256template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3257HWY_API void StoreU(Vec512<float16_t> v, D /* tag */,
3258 float16_t* HWY_RESTRICT p) {
3259 _mm512_storeu_ph(p, v.raw);
3260}
3261#endif // HWY_HAVE_FLOAT16
3262
3263template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3264HWY_API void StoreU(Vec512<float> v, D /* tag */, float* HWY_RESTRICT p) {
3265 _mm512_storeu_ps(p, v.raw);
3266}
3267template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3268HWY_API void StoreU(Vec512<double> v, D /* tag */, double* HWY_RESTRICT p) {
3269 _mm512_storeu_pd(p, v.raw);
3270}
3271
3272// ------------------------------ BlendedStore
3273
3274template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
3275HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3276 TFromD<D>* HWY_RESTRICT p) {
3277 _mm512_mask_storeu_epi8(p, m.raw, v.raw);
3278}
3279
3280template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3281HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3282 TFromD<D>* HWY_RESTRICT p) {
3283 const RebindToUnsigned<decltype(d)> du; // for float16_t
3284 _mm512_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p), m.raw,
3285 BitCast(du, v).raw);
3286}
3287
3288template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3289HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3290 TFromD<D>* HWY_RESTRICT p) {
3291 _mm512_mask_storeu_epi32(p, m.raw, v.raw);
3292}
3293
3294template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3295HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3296 TFromD<D>* HWY_RESTRICT p) {
3297 _mm512_mask_storeu_epi64(p, m.raw, v.raw);
3298}
3299
3300template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3301HWY_API void BlendedStore(Vec512<float> v, Mask512<float> m, D /* tag */,
3302 float* HWY_RESTRICT p) {
3303 _mm512_mask_storeu_ps(p, m.raw, v.raw);
3304}
3305
3306template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3307HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m, D /* tag */,
3308 double* HWY_RESTRICT p) {
3309 _mm512_mask_storeu_pd(p, m.raw, v.raw);
3310}
3311
3312// ------------------------------ Non-temporal stores
3313
3314template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3315HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
3316 const RebindToUnsigned<decltype(d)> du; // for float16_t
3317 _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), BitCast(du, v).raw);
3318}
3319template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3320HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) {
3321 _mm512_stream_ps(aligned, v.raw);
3322}
3323template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3324HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) {
3325 _mm512_stream_pd(aligned, v.raw);
3326}
3327
3328// ------------------------------ ScatterOffset
3329
3330// Work around warnings in the intrinsic definitions (passing -1 as a mask).
3331HWY_DIAGNOSTICS(push)
3332HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3333
3334template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3335HWY_API void ScatterOffset(VFromD<D> v, D /* tag */,
3336 TFromD<D>* HWY_RESTRICT base,
3337 VFromD<RebindToSigned<D>> offset) {
3338 _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
3339}
3340
3341template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3342HWY_API void ScatterOffset(VFromD<D> v, D /* tag */,
3343 TFromD<D>* HWY_RESTRICT base,
3344 VFromD<RebindToSigned<D>> offset) {
3345 _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
3346}
3347
3348template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3349HWY_API void ScatterOffset(VFromD<D> v, D /* tag */, float* HWY_RESTRICT base,
3350 Vec512<int32_t> offset) {
3351 _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
3352}
3353
3354template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3355HWY_API void ScatterOffset(VFromD<D> v, D /* tag */, double* HWY_RESTRICT base,
3356 Vec512<int64_t> offset) {
3357 _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
3358}
3359
3360// ------------------------------ ScatterIndex
3361
3362template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3363HWY_API void ScatterIndex(VFromD<D> v, D /* tag */,
3364 TFromD<D>* HWY_RESTRICT base,
3365 VFromD<RebindToSigned<D>> index) {
3366 _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
3367}
3368
3369template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3370HWY_API void ScatterIndex(VFromD<D> v, D /* tag */,
3371 TFromD<D>* HWY_RESTRICT base,
3372 VFromD<RebindToSigned<D>> index) {
3373 _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
3374}
3375
3376template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3377HWY_API void ScatterIndex(VFromD<D> v, D /* tag */, float* HWY_RESTRICT base,
3378 Vec512<int32_t> index) {
3379 _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
3380}
3381
3382template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3383HWY_API void ScatterIndex(VFromD<D> v, D /* tag */, double* HWY_RESTRICT base,
3384 Vec512<int64_t> index) {
3385 _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
3386}
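// Example (editorial sketch, not part of the original source): ScatterOffset
// takes byte offsets (scale 1), whereas ScatterIndex takes lane indices that
// are scaled by sizeof(T), i.e. base[index[i]] = v[i] for each lane i.
// Assuming di is the matching RebindToSigned tag:
//   ScatterIndex(v, d, base, Iota(di, 0));  // contiguous: same effect as StoreU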
3387
3388// ------------------------------ MaskedScatterIndex
3389
3390template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3391HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3392 TFromD<D>* HWY_RESTRICT base,
3393 VFromD<RebindToSigned<D>> index) {
3394 _mm512_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, 4);
3395}
3396
3397template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3398HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3399 TFromD<D>* HWY_RESTRICT base,
3400 VFromD<RebindToSigned<D>> index) {
3401 _mm512_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, 8);
3402}
3403
3404template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3405HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3406 float* HWY_RESTRICT base,
3407 Vec512<int32_t> index) {
3408 _mm512_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, 4);
3409}
3410
3411template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3412HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3413 double* HWY_RESTRICT base,
3414 Vec512<int64_t> index) {
3415 _mm512_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, 8);
3416}
3417
3418// ------------------------------ Gather
3419
3420namespace detail {
3421
3422template <int kScale, typename T, HWY_IF_UI32(T)>
3425 return Vec512<T>{_mm512_i32gather_epi32(indices.raw, base, kScale)};
3426}
3427
3428template <int kScale, typename T, HWY_IF_UI64(T)>
3431 return Vec512<T>{_mm512_i64gather_epi64(indices.raw, base, kScale)};
3432}
3433
3434template <int kScale>
3437 return Vec512<float>{_mm512_i32gather_ps(indices.raw, base, kScale)};
3438}
3439
3440template <int kScale>
3443 return Vec512<double>{_mm512_i64gather_pd(indices.raw, base, kScale)};
3444}
3445
3446template <int kScale, typename T, HWY_IF_UI32(T)>
3448 const T* HWY_RESTRICT base,
3450 return Vec512<T>{
3451 _mm512_mask_i32gather_epi32(no.raw, m.raw, indices.raw, base, kScale)};
3452}
3453
3454template <int kScale, typename T, HWY_IF_UI64(T)>
3456 const T* HWY_RESTRICT base,
3458 return Vec512<T>{
3459 _mm512_mask_i64gather_epi64(no.raw, m.raw, indices.raw, base, kScale)};
3460}
3461
3462template <int kScale>
3465 const float* HWY_RESTRICT base,
3467 return Vec512<float>{
3468 _mm512_mask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
3469}
3470
3471template <int kScale>
3473 Vec512<double> no, Mask512<double> m, const double* HWY_RESTRICT base,
3475 return Vec512<double>{
3476 _mm512_mask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
3477}
3478} // namespace detail
3479
3480template <class D, HWY_IF_V_SIZE_D(D, 64)>
3481HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
3482 VFromD<RebindToSigned<D>> offsets) {
3483 const RebindToSigned<decltype(d)> di;
3484 (void)di; // for HWY_DASSERT
3485 HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
3486 return detail::NativeGather512<1>(base, offsets);
3487}
3488
3489template <class D, HWY_IF_V_SIZE_D(D, 64)>
3490HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
3491 VFromD<RebindToSigned<D>> indices) {
3492 const RebindToSigned<decltype(d)> di;
3493 (void)di; // for HWY_DASSERT
3494 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3495 return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
3496}
3497
3498template <class D, HWY_IF_V_SIZE_D(D, 64)>
3499HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
3500 const TFromD<D>* HWY_RESTRICT base,
3501 VFromD<RebindToSigned<D>> indices) {
3502 const RebindToSigned<decltype(d)> di;
3503 (void)di; // for HWY_DASSERT
3504 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3505 return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
3506 indices);
3507}
3508
3509HWY_DIAGNOSTICS(pop)
3510
3511// ================================================== SWIZZLE
3512
3513// ------------------------------ LowerHalf
3514
3515template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3516HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
3517 return VFromD<D>{_mm512_castsi512_si256(v.raw)};
3518}
3519template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
3520HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
3521 return VFromD<D>{_mm512_castsi512_si256(v.raw)};
3522}
3523template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3524HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
3525#if HWY_HAVE_FLOAT16
3526 return VFromD<D>{_mm512_castph512_ph256(v.raw)};
3527#else
3528 return VFromD<D>{_mm512_castsi512_si256(v.raw)};
3529#endif // HWY_HAVE_FLOAT16
3530}
3531template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3532HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
3533 return VFromD<D>{_mm512_castps512_ps256(v.raw)};
3534}
3535template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3536HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
3537 return VFromD<D>{_mm512_castpd512_pd256(v.raw)};
3538}
3539
3540template <typename T>
3541HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
3542 const Half<DFromV<decltype(v)>> dh;
3543 return LowerHalf(dh, v);
3544}
3545
3546// ------------------------------ UpperHalf
3547
3548template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
3549HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
3550 const RebindToUnsigned<decltype(d)> du; // for float16_t
3551 const Twice<decltype(du)> dut;
3552 return BitCast(d, VFromD<decltype(du)>{
3553 _mm512_extracti32x8_epi32(BitCast(dut, v).raw, 1)});
3554}
3555template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3556HWY_API VFromD<D> UpperHalf(D /* tag */, VFromD<Twice<D>> v) {
3557 return VFromD<D>{_mm512_extractf32x8_ps(v.raw, 1)};
3558}
3559template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3560HWY_API VFromD<D> UpperHalf(D /* tag */, VFromD<Twice<D>> v) {
3561 return VFromD<D>{_mm512_extractf64x4_pd(v.raw, 1)};
3562}
3563
3564// ------------------------------ ExtractLane (Store)
3565template <typename T>
3566HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
3567 const DFromV<decltype(v)> d;
3568 HWY_DASSERT(i < Lanes(d));
3569
3570#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3571 constexpr size_t kLanesPerBlock = 16 / sizeof(T);
3572 if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
3573 return ExtractLane(ResizeBitCast(Full128<T>(), v), i);
3574 }
3575#endif
3576
3577 alignas(64) T lanes[Lanes(d)];
3578 Store(v, d, lanes);
3579 return lanes[i];
3580}
3581
3582// ------------------------------ ExtractBlock
3583template <int kBlockIdx, class T, hwy::EnableIf<(kBlockIdx <= 1)>* = nullptr>
3584HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
3585 const DFromV<decltype(v)> d;
3586 const Half<decltype(d)> dh;
3587 return ExtractBlock<kBlockIdx>(LowerHalf(dh, v));
3588}
3589
3590template <int kBlockIdx, class T, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
3591HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
3592 static_assert(kBlockIdx <= 3, "Invalid block index");
3593 const DFromV<decltype(v)> d;
3594 const RebindToUnsigned<decltype(d)> du; // for float16_t
3595 return BitCast(Full128<T>(),
3597 _mm512_extracti32x4_epi32(BitCast(du, v).raw, kBlockIdx)});
3598}
3599
3600template <int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
3601HWY_API Vec128<float> ExtractBlock(Vec512<float> v) {
3602 static_assert(kBlockIdx <= 3, "Invalid block index");
3603 return Vec128<float>{_mm512_extractf32x4_ps(v.raw, kBlockIdx)};
3604}
3605
3606template <int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
3607HWY_API Vec128<double> ExtractBlock(Vec512<double> v) {
3608 static_assert(kBlockIdx <= 3, "Invalid block index");
3609 return Vec128<double>{_mm512_extractf64x2_pd(v.raw, kBlockIdx)};
3610}
3611
3612// ------------------------------ InsertLane (Store)
3613template <typename T>
3614HWY_API Vec512<T> InsertLane(const Vec512<T> v, size_t i, T t) {
3616}
3617
3618// ------------------------------ InsertBlock
3619namespace detail {
3620
3621template <typename T>
3623 Vec128<T> blk_to_insert) {
3624 const DFromV<decltype(v)> d;
3625 const auto insert_mask = FirstN(d, 16 / sizeof(T));
3626 return IfThenElse(insert_mask, ResizeBitCast(d, blk_to_insert), v);
3627}
3628
3629template <size_t kBlockIdx, typename T>
3631 Vec512<T> v, Vec128<T> blk_to_insert) {
3632 const DFromV<decltype(v)> d;
3633 const RebindToUnsigned<decltype(d)> du; // for float16_t
3634 const Full128<MakeUnsigned<T>> du_blk_to_insert;
3635 return BitCast(
3636 d, VFromD<decltype(du)>{_mm512_inserti32x4(
3637 BitCast(du, v).raw, BitCast(du_blk_to_insert, blk_to_insert).raw,
3638 static_cast<int>(kBlockIdx & 3))});
3639}
3640
3641template <size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* = nullptr>
3643 Vec512<float> v,
3644 Vec128<float> blk_to_insert) {
3645 return Vec512<float>{_mm512_insertf32x4(v.raw, blk_to_insert.raw,
3646 static_cast<int>(kBlockIdx & 3))};
3647}
3648
3649template <size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* = nullptr>
3652 Vec128<double> blk_to_insert) {
3653 return Vec512<double>{_mm512_insertf64x2(v.raw, blk_to_insert.raw,
3654 static_cast<int>(kBlockIdx & 3))};
3655}
3656
3657} // namespace detail
3658
3659template <int kBlockIdx, class T>
3661 static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
3662 return detail::InsertBlock(hwy::SizeTag<static_cast<size_t>(kBlockIdx)>(), v,
3663 blk_to_insert);
3664}
3665
3666// ------------------------------ GetLane (LowerHalf)
3667template <typename T>
3668HWY_API T GetLane(const Vec512<T> v) {
3669 return GetLane(LowerHalf(v));
3670}
3671
3672// ------------------------------ ZeroExtendVector
3673
3674template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3675HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
3676#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h.
3677 (void)d;
3678 return VFromD<D>{_mm512_zextsi256_si512(lo.raw)};
3679#else
3680 return VFromD<D>{_mm512_inserti32x8(Zero(d).raw, lo.raw, 0)};
3681#endif
3682}
3683#if HWY_HAVE_FLOAT16
3684template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
3685HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
3686#if HWY_HAVE_ZEXT
3687 (void)d;
3688 return VFromD<D>{_mm512_zextph256_ph512(lo.raw)};
3689#else
3690 const RebindToUnsigned<D> du;
3691 return BitCast(d, ZeroExtendVector(du, BitCast(du, lo)));
3692#endif
3693}
3694#endif // HWY_HAVE_FLOAT16
3695template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3696HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
3697#if HWY_HAVE_ZEXT
3698 (void)d;
3699 return VFromD<D>{_mm512_zextps256_ps512(lo.raw)};
3700#else
3701 return VFromD<D>{_mm512_insertf32x8(Zero(d).raw, lo.raw, 0)};
3702#endif
3703}
3704template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3705HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
3706#if HWY_HAVE_ZEXT
3707 (void)d;
3708 return VFromD<D>{_mm512_zextpd256_pd512(lo.raw)};
3709#else
3710 return VFromD<D>{_mm512_insertf64x4(Zero(d).raw, lo.raw, 0)};
3711#endif
3712}
3713
3714// ------------------------------ ZeroExtendResizeBitCast
3715
3716namespace detail {
3717
3718template <class DTo, class DFrom, HWY_IF_NOT_FLOAT3264_D(DTo)>
3719HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
3720 hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */,
3721 DTo d_to, DFrom d_from, VFromD<DFrom> v) {
3722 const Repartition<uint8_t, decltype(d_from)> du8_from;
3723 const auto vu8 = BitCast(du8_from, v);
3724 const RebindToUnsigned<decltype(d_to)> du_to;
3725#if HWY_HAVE_ZEXT
3726 return BitCast(d_to,
3727 VFromD<decltype(du_to)>{_mm512_zextsi128_si512(vu8.raw)});
3728#else
3729 return BitCast(d_to, VFromD<decltype(du_to)>{
3730 _mm512_inserti32x4(Zero(du_to).raw, vu8.raw, 0)});
3731#endif
3732}
3733
3734template <class DTo, class DFrom, HWY_IF_F32_D(DTo)>
3735 HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
3736 hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */,
3737 DTo d_to, DFrom d_from, VFromD<DFrom> v) {
3738 const Repartition<float, decltype(d_from)> df32_from;
3739 const auto vf32 = BitCast(df32_from, v);
3740#if HWY_HAVE_ZEXT
3741 (void)d_to;
3742 return Vec512<float>{_mm512_zextps128_ps512(vf32.raw)};
3743#else
3744 return Vec512<float>{_mm512_insertf32x4(Zero(d_to).raw, vf32.raw, 0)};
3745#endif
3746}
3747
3748template <class DTo, class DFrom, HWY_IF_F64_D(DTo)>
3749 HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
3750 hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */,
3751 DTo d_to, DFrom d_from, VFromD<DFrom> v) {
3752 const Repartition<double, decltype(d_from)> df64_from;
3753 const auto vf64 = BitCast(df64_from, v);
3754#if HWY_HAVE_ZEXT
3755 (void)d_to;
3756 return Vec512<double>{_mm512_zextpd128_pd512(vf64.raw)};
3757#else
3758 return Vec512<double>{_mm512_insertf64x2(Zero(d_to).raw, vf64.raw, 0)};
3759#endif
3760}
3761
3762template <class DTo, class DFrom>
3763 HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
3764 hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */,
3765 DTo d_to, DFrom d_from, VFromD<DFrom> v) {
3766 const Twice<decltype(d_from)> dt_from;
3767 return ZeroExtendResizeBitCast(hwy::SizeTag<16>(), hwy::SizeTag<64>(), d_to,
3768 dt_from, ZeroExtendVector(dt_from, v));
3769}
3770
3771} // namespace detail
3772
3773// ------------------------------ Combine
3774
3775template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3776HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
3777 const RebindToUnsigned<decltype(d)> du; // for float16_t
3778 const Half<decltype(du)> duh;
3779 const __m512i lo512 = ZeroExtendVector(du, BitCast(duh, lo)).raw;
3780 return BitCast(d, VFromD<decltype(du)>{
3781 _mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)});
3782}
3783template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3784HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
3785 return VFromD<D>{_mm512_insertf32x8(ZeroExtendVector(d, lo).raw, hi.raw, 1)};
3786}
3787template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3788HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
3789 return VFromD<D>{_mm512_insertf64x4(ZeroExtendVector(d, lo).raw, hi.raw, 1)};
3790}
3791
3792// ------------------------------ ShiftLeftBytes
3793template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 64)>
3794HWY_API VFromD<D> ShiftLeftBytes(D /* tag */, const VFromD<D> v) {
3795 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3796 return VFromD<D>{_mm512_bslli_epi128(v.raw, kBytes)};
3797}
3798
3799// ------------------------------ ShiftRightBytes
3800template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 64)>
3801HWY_API VFromD<D> ShiftRightBytes(D /* tag */, const VFromD<D> v) {
3802 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3803 return VFromD<D>{_mm512_bsrli_epi128(v.raw, kBytes)};
3804}
3805
3806// ------------------------------ CombineShiftRightBytes
3807
3808template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 64)>
3809 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
3810 const Repartition<uint8_t, decltype(d)> d8;
3811 return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
3812 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3813}
3814
3815// ------------------------------ Broadcast/splat any lane
3816
3817template <int kLane, typename T, HWY_IF_T_SIZE(T, 2)>
3818 HWY_API Vec512<T> Broadcast(const Vec512<T> v) {
3819 const DFromV<decltype(v)> d;
3820 const RebindToUnsigned<decltype(d)> du;
3821 using VU = VFromD<decltype(du)>;
3822 const VU vu = BitCast(du, v); // for float16_t
3823 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3824 if (kLane < 4) {
3825 const __m512i lo = _mm512_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
3826 return BitCast(d, VU{_mm512_unpacklo_epi64(lo, lo)});
3827 } else {
3828 const __m512i hi =
3829 _mm512_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
3830 return BitCast(d, VU{_mm512_unpackhi_epi64(hi, hi)});
3831 }
3832}
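// How the immediate is built (explanatory note, not upstream text): 0x55 *
// kLane (or kLane - 4 for the upper half) replicates the 2-bit lane index
// into all four fields of the shufflelo/shufflehi immediate, and the unpack
// then copies the resulting 64-bit half across the whole 128-bit block. E.g.:
//
//   const Full512<uint16_t> d;
//   const Vec512<uint16_t> b = Broadcast<5>(Iota(d, 0));
//   // Block 0 lanes all become 5, block 1 lanes 13, block 2 lanes 21, etc.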
3833
3834template <int kLane, typename T, HWY_IF_UI32(T)>
3835HWY_API Vec512<T> Broadcast(const Vec512<T> v) {
3836 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3837 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
3838 return Vec512<T>{_mm512_shuffle_epi32(v.raw, perm)};
3839}
3840
3841template <int kLane, typename T, HWY_IF_UI64(T)>
3842HWY_API Vec512<T> Broadcast(const Vec512<T> v) {
3843 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3844 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
3845 return Vec512<T>{_mm512_shuffle_epi32(v.raw, perm)};
3846}
3847
3848template <int kLane>
3849 HWY_API Vec512<float> Broadcast(const Vec512<float> v) {
3850 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3851 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
3852 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
3853}
3854
3855template <int kLane>
3856 HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
3857 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3858 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
3859 return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
3860}
3861
3862// ------------------------------ BroadcastBlock
3863template <int kBlockIdx, class T>
3864 HWY_API Vec512<T> BroadcastBlock(Vec512<T> v) {
3865 static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
3866 const DFromV<decltype(v)> d;
3867 const RebindToUnsigned<decltype(d)> du; // for float16_t
3868 return BitCast(
3869 d, VFromD<decltype(du)>{_mm512_shuffle_i32x4(
3870 BitCast(du, v).raw, BitCast(du, v).raw, 0x55 * kBlockIdx)});
3871}
3872
3873template <int kBlockIdx>
3874 HWY_API Vec512<float> BroadcastBlock(Vec512<float> v) {
3875 static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
3876 return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, 0x55 * kBlockIdx)};
3877}
3878
3879template <int kBlockIdx>
3880 HWY_API Vec512<double> BroadcastBlock(Vec512<double> v) {
3881 static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
3882 return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, 0x55 * kBlockIdx)};
3883}
3884
3885// ------------------------------ BroadcastLane
3886
3887namespace detail {
3888
3889template <class T, HWY_IF_T_SIZE(T, 1)>
3890 HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3891 Vec512<T> v) {
3892 return Vec512<T>{_mm512_broadcastb_epi8(ResizeBitCast(Full128<T>(), v).raw)};
3893}
3894
3895template <class T, HWY_IF_T_SIZE(T, 2)>
3896 HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3897 Vec512<T> v) {
3898 const DFromV<decltype(v)> d;
3899 const RebindToUnsigned<decltype(d)> du; // for float16_t
3900 return BitCast(d, VFromD<decltype(du)>{_mm512_broadcastw_epi16(
3901 ResizeBitCast(Full128<uint16_t>(), v).raw)});
3902}
3903
3904template <class T, HWY_IF_UI32(T)>
3905HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3906 Vec512<T> v) {
3907 return Vec512<T>{_mm512_broadcastd_epi32(ResizeBitCast(Full128<T>(), v).raw)};
3908}
3909
3910template <class T, HWY_IF_UI64(T)>
3911HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3912 Vec512<T> v) {
3913 return Vec512<T>{_mm512_broadcastq_epi64(ResizeBitCast(Full128<T>(), v).raw)};
3914}
3915
3916 HWY_INLINE Vec512<float> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3917 Vec512<float> v) {
3918 return Vec512<float>{
3919 _mm512_broadcastss_ps(ResizeBitCast(Full128<float>(), v).raw)};
3920}
3921
3922 HWY_INLINE Vec512<double> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3923 Vec512<double> v) {
3924 return Vec512<double>{
3925 _mm512_broadcastsd_pd(ResizeBitCast(Full128<double>(), v).raw)};
3926}
3927
3928template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr>
3929 HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<kLaneIdx> /* lane_idx_tag */,
3930 Vec512<T> v) {
3931 constexpr size_t kLanesPerBlock = 16 / sizeof(T);
3932 constexpr int kBlockIdx = static_cast<int>(kLaneIdx / kLanesPerBlock);
3933 constexpr int kLaneInBlkIdx =
3934 static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
3935 return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));
3936}
3937
3938} // namespace detail
3939
3940template <int kLaneIdx, class T>
3941 HWY_API Vec512<T> BroadcastLane(Vec512<T> v) {
3942 static_assert(0 <= kLaneIdx, "Invalid lane");
3943 return detail::BroadcastLane(hwy::SizeTag<static_cast<size_t>(kLaneIdx)>(),
3944 v);
3945}
3946
3947// ------------------------------ Hard-coded shuffles
3948
3949 // Notation: let the lower eight int32 lanes be 7,6,5,4,3,2,1,0 (0 is least-
3950 // significant); the pattern repeats in the upper blocks. Shuffle0321 rotates
3951 // each four-lane block one lane to the right (the previous least-significant
3952 // lane is now most-significant => 47650321). These could also be implemented
3953 // via CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
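// For example, with one 128-bit block holding int32 lanes 3,2,1,0:
//   Shuffle2301 -> 2,3,0,1 (swap 32-bit halves within 64-bit halves)
//   Shuffle1032 -> 1,0,3,2 (swap 64-bit halves)
//   Shuffle0321 -> 0,3,2,1 (rotate right by one lane)
//   Shuffle2103 -> 2,1,0,3 (rotate left by one lane)
//   Shuffle0123 -> 0,1,2,3 (reverse)
// The same permutation is applied independently to each of the four blocks.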
3954
3955// Swap 32-bit halves in 64-bit halves.
3956template <typename T, HWY_IF_UI32(T)>
3957 HWY_API Vec512<T> Shuffle2301(const Vec512<T> v) {
3958 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
3959}
3960 HWY_API Vec512<float> Shuffle2301(const Vec512<float> v) {
3961 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
3962}
3963
3964namespace detail {
3965
3966template <typename T, HWY_IF_T_SIZE(T, 4)>
3967 HWY_API Vec512<T> ShuffleTwo2301(const Vec512<T> a, const Vec512<T> b) {
3968 const DFromV<decltype(a)> d;
3969 const RebindToFloat<decltype(d)> df;
3970 return BitCast(
3971 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
3972 _MM_PERM_CDAB)});
3973}
3974template <typename T, HWY_IF_T_SIZE(T, 4)>
3975 HWY_API Vec512<T> ShuffleTwo1230(const Vec512<T> a, const Vec512<T> b) {
3976 const DFromV<decltype(a)> d;
3977 const RebindToFloat<decltype(d)> df;
3978 return BitCast(
3979 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
3980 _MM_PERM_BCDA)});
3981}
3982template <typename T, HWY_IF_T_SIZE(T, 4)>
3983 HWY_API Vec512<T> ShuffleTwo3012(const Vec512<T> a, const Vec512<T> b) {
3984 const DFromV<decltype(a)> d;
3985 const RebindToFloat<decltype(d)> df;
3986 return BitCast(
3987 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
3988 _MM_PERM_DABC)});
3989}
3990
3991} // namespace detail
3992
3993// Swap 64-bit halves
3994 HWY_API Vec512<uint32_t> Shuffle1032(const Vec512<uint32_t> v) {
3995 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
3996}
3997 HWY_API Vec512<int32_t> Shuffle1032(const Vec512<int32_t> v) {
3998 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
3999}
4000 HWY_API Vec512<float> Shuffle1032(const Vec512<float> v) {
4001 // Shorter encoding than _mm512_permute_ps.
4002 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
4003}
4004 HWY_API Vec512<uint64_t> Shuffle01(const Vec512<uint64_t> v) {
4005 return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
4006}
4007 HWY_API Vec512<int64_t> Shuffle01(const Vec512<int64_t> v) {
4008 return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
4009}
4010 HWY_API Vec512<double> Shuffle01(const Vec512<double> v) {
4011 // Shorter encoding than _mm512_permute_pd.
4012 return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
4013}
4014
4015// Rotate right 32 bits
4016 HWY_API Vec512<uint32_t> Shuffle0321(const Vec512<uint32_t> v) {
4017 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
4018}
4019 HWY_API Vec512<int32_t> Shuffle0321(const Vec512<int32_t> v) {
4020 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
4021}
4022 HWY_API Vec512<float> Shuffle0321(const Vec512<float> v) {
4023 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
4024}
4025// Rotate left 32 bits
4026 HWY_API Vec512<uint32_t> Shuffle2103(const Vec512<uint32_t> v) {
4027 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
4028}
4029 HWY_API Vec512<int32_t> Shuffle2103(const Vec512<int32_t> v) {
4030 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
4031}
4032 HWY_API Vec512<float> Shuffle2103(const Vec512<float> v) {
4033 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
4034}
4035
4036// Reverse
4037 HWY_API Vec512<uint32_t> Shuffle0123(const Vec512<uint32_t> v) {
4038 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
4039}
4040 HWY_API Vec512<int32_t> Shuffle0123(const Vec512<int32_t> v) {
4041 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
4042}
4043 HWY_API Vec512<float> Shuffle0123(const Vec512<float> v) {
4044 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
4045}
4046
4047// ------------------------------ TableLookupLanes
4048
4049// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
4050template <typename T>
4051 struct Indices512 {
4052 __m512i raw;
4053};
4054
4055template <class D, typename T = TFromD<D>, typename TI>
4056 HWY_API Indices512<T> IndicesFromVec(D /* tag */, Vec512<TI> vec) {
4057 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4058#if HWY_IS_DEBUG_BUILD
4059 const DFromV<decltype(vec)> di;
4060 const RebindToUnsigned<decltype(di)> du;
4061 using TU = MakeUnsigned<T>;
4062 const auto vec_u = BitCast(du, vec);
4063 HWY_DASSERT(
4064 AllTrue(du, Lt(vec_u, Set(du, static_cast<TU>(128 / sizeof(T))))));
4065#endif
4066 return Indices512<T>{vec.raw};
4067}
4068
4069template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
4070 HWY_API Indices512<TFromD<D>> SetTableIndices(D d, const TI* idx) {
4071 const Rebind<TI, decltype(d)> di;
4072 return IndicesFromVec(d, LoadU(di, idx));
4073}
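// Usage sketch (illustrative): indices are per-lane and, unlike
// TableLookupBytes, may cross 128-bit blocks; each index selects one of the
// input lanes. Reversing 16 x int32:
//
//   const Full512<int32_t> d;
//   alignas(64) static constexpr int32_t kRev[16] = {
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   const Vec512<int32_t> r =
//       TableLookupLanes(Iota(d, 0), SetTableIndices(d, kRev));
//   // r = 15,14,...,1,0, the same approach Reverse() uses below.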
4074
4075template <typename T, HWY_IF_T_SIZE(T, 1)>
4076 HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
4077 #if HWY_TARGET <= HWY_AVX3_DL
4078 return Vec512<T>{_mm512_permutexvar_epi8(idx.raw, v.raw)};
4079#else
4080 const DFromV<decltype(v)> d;
4081 const Repartition<uint16_t, decltype(d)> du16;
4082 const Vec512<T> idx_vec{idx.raw};
4083
4084 const auto bd_sel_mask =
4085 MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec))));
4086 const auto cd_sel_mask =
4087 MaskFromVec(BitCast(d, ShiftLeft<2>(BitCast(du16, idx_vec))));
4088
4089 const Vec512<T> v_a{_mm512_shuffle_i32x4(v.raw, v.raw, 0x00)};
4090 const Vec512<T> v_b{_mm512_shuffle_i32x4(v.raw, v.raw, 0x55)};
4091 const Vec512<T> v_c{_mm512_shuffle_i32x4(v.raw, v.raw, 0xAA)};
4092 const Vec512<T> v_d{_mm512_shuffle_i32x4(v.raw, v.raw, 0xFF)};
4093
4094 const auto shuf_a = TableLookupBytes(v_a, idx_vec);
4095 const auto shuf_c = TableLookupBytes(v_c, idx_vec);
4096 const Vec512<T> shuf_ab{_mm512_mask_shuffle_epi8(shuf_a.raw, bd_sel_mask.raw,
4097 v_b.raw, idx_vec.raw)};
4098 const Vec512<T> shuf_cd{_mm512_mask_shuffle_epi8(shuf_c.raw, bd_sel_mask.raw,
4099 v_d.raw, idx_vec.raw)};
4100 return IfThenElse(cd_sel_mask, shuf_cd, shuf_ab);
4101#endif
4102}
4103
4104template <typename T, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_SPECIAL_FLOAT(T)>
4105 HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
4106 return Vec512<T>{_mm512_permutexvar_epi16(idx.raw, v.raw)};
4107}
4108#if HWY_HAVE_FLOAT16
4109HWY_API Vec512<float16_t> TableLookupLanes(Vec512<float16_t> v,
4110 Indices512<float16_t> idx) {
4111 return Vec512<float16_t>{_mm512_permutexvar_ph(idx.raw, v.raw)};
4112}
4113#endif // HWY_HAVE_FLOAT16
4114template <typename T, HWY_IF_T_SIZE(T, 4)>
4115HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
4116 return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
4117}
4118
4119template <typename T, HWY_IF_T_SIZE(T, 8)>
4120HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
4121 return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
4122}
4123
4124 HWY_API Vec512<float> TableLookupLanes(Vec512<float> v, Indices512<float> idx) {
4125 return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
4126}
4127
4128 HWY_API Vec512<double> TableLookupLanes(Vec512<double> v,
4129 Indices512<double> idx) {
4130 return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)};
4131}
4132
4133template <typename T, HWY_IF_T_SIZE(T, 1)>
4134 HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b,
4135 Indices512<T> idx) {
4136#if HWY_TARGET <= HWY_AVX3_DL
4137 return Vec512<T>{_mm512_permutex2var_epi8(a.raw, idx.raw, b.raw)};
4138#else
4139 const DFromV<decltype(a)> d;
4140 const auto b_sel_mask =
4141 MaskFromVec(BitCast(d, ShiftLeft<1>(Vec512<uint16_t>{idx.raw})));
4142 return IfThenElse(b_sel_mask, TableLookupLanes(b, idx),
4143 TableLookupLanes(a, idx));
4144#endif
4145}
4146
4147template <typename T, HWY_IF_T_SIZE(T, 2)>
4148HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b,
4149 Indices512<T> idx) {
4150 return Vec512<T>{_mm512_permutex2var_epi16(a.raw, idx.raw, b.raw)};
4151}
4152
4153template <typename T, HWY_IF_UI32(T)>
4154HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b,
4155 Indices512<T> idx) {
4156 return Vec512<T>{_mm512_permutex2var_epi32(a.raw, idx.raw, b.raw)};
4157}
4158
4159#if HWY_HAVE_FLOAT16
4160HWY_API Vec512<float16_t> TwoTablesLookupLanes(Vec512<float16_t> a,
4161 Vec512<float16_t> b,
4162 Indices512<float16_t> idx) {
4163 return Vec512<float16_t>{_mm512_permutex2var_ph(a.raw, idx.raw, b.raw)};
4164}
4165#endif // HWY_HAVE_FLOAT16
4166 HWY_API Vec512<float> TwoTablesLookupLanes(Vec512<float> a, Vec512<float> b,
4167 Indices512<float> idx) {
4168 return Vec512<float>{_mm512_permutex2var_ps(a.raw, idx.raw, b.raw)};
4169}
4170
4171template <typename T, HWY_IF_UI64(T)>
4172HWY_API Vec512<T> TwoTablesLookupLanes(Vec512<T> a, Vec512<T> b,
4173 Indices512<T> idx) {
4174 return Vec512<T>{_mm512_permutex2var_epi64(a.raw, idx.raw, b.raw)};
4175}
4176
4177 HWY_API Vec512<double> TwoTablesLookupLanes(Vec512<double> a, Vec512<double> b,
4178 Indices512<double> idx) {
4179 return Vec512<double>{_mm512_permutex2var_pd(a.raw, idx.raw, b.raw)};
4180}
4181
4182// ------------------------------ Reverse
4183
4184template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4185HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4186#if HWY_TARGET <= HWY_AVX3_DL
4187 const RebindToSigned<decltype(d)> di;
4188 alignas(64) static constexpr int8_t kReverse[64] = {
4189 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
4190 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
4191 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
4192 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4193 const Vec512<int8_t> idx = Load(di, kReverse);
4194 return BitCast(
4195 d, Vec512<int8_t>{_mm512_permutexvar_epi8(idx.raw, BitCast(di, v).raw)});
4196#else
4197 const RepartitionToWide<decltype(d)> d16;
4198 return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v))));
4199#endif
4200}
4201
4202template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4203HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4204 const RebindToSigned<decltype(d)> di;
4205 alignas(64) static constexpr int16_t kReverse[32] = {
4206 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
4207 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4208 const Vec512<int16_t> idx = Load(di, kReverse);
4209 return BitCast(d, Vec512<int16_t>{
4210 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4211}
4212
4213template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
4214HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4215 alignas(64) static constexpr int32_t kReverse[16] = {
4216 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4217 return TableLookupLanes(v, SetTableIndices(d, kReverse));
4218}
4219
4220template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
4221HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4222 alignas(64) static constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4223 return TableLookupLanes(v, SetTableIndices(d, kReverse));
4224}
4225
4226// ------------------------------ Reverse2 (in x86_128)
4227
4228// ------------------------------ Reverse4
4229
4230template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4231HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
4232 const RebindToSigned<decltype(d)> di;
4233 alignas(64) static constexpr int16_t kReverse4[32] = {
4234 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
4235 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
4236 const Vec512<int16_t> idx = Load(di, kReverse4);
4237 return BitCast(d, Vec512<int16_t>{
4238 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4239}
4240
4241// 32 bit Reverse4 defined in x86_128.
4242
4243template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4244HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
4245 return VFromD<D>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
4246}
4247template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4248HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> v) {
4249 return VFromD<D>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
4250}
4251
4252// ------------------------------ Reverse8
4253
4254template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4255HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
4256 const RebindToSigned<decltype(d)> di;
4257 alignas(64) static constexpr int16_t kReverse8[32] = {
4258 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
4259 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
4260 const Vec512<int16_t> idx = Load(di, kReverse8);
4261 return BitCast(d, Vec512<int16_t>{
4262 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4263}
4264
4265template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
4266HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
4267 const RebindToSigned<decltype(d)> di;
4268 alignas(64) static constexpr int32_t kReverse8[16] = {
4269 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
4270 const Vec512<int32_t> idx = Load(di, kReverse8);
4271 return BitCast(d, Vec512<int32_t>{
4272 _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
4273}
4274
4275template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
4276HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
4277 return Reverse(d, v);
4278}
4279
4280// ------------------------------ ReverseBits (GaloisAffine)
4281
4282#if HWY_TARGET <= HWY_AVX3_DL
4283
4284#ifdef HWY_NATIVE_REVERSE_BITS_UI8
4285#undef HWY_NATIVE_REVERSE_BITS_UI8
4286#else
4287#define HWY_NATIVE_REVERSE_BITS_UI8
4288#endif
4289
4290// Generic for all vector lengths. Must be defined after all GaloisAffine.
4291template <class V, HWY_IF_T_SIZE_V(V, 1)>
4292HWY_API V ReverseBits(V v) {
4293 const Repartition<uint64_t, DFromV<V>> du64;
4294 return detail::GaloisAffine(v, Set(du64, 0x8040201008040201u));
4295}
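// Informal note on the constant above (added for clarity): GF2P8AFFINEQB
// multiplies each input byte, viewed as a vector of 8 bits, by the 8x8 bit
// matrix packed into the corresponding u64 of the second operand. With
// 0x8040201008040201, output bit i of every byte equals input bit 7-i, i.e.
// the bits of each byte are reversed. A scalar reference for one byte:
//
//   uint8_t ReverseBits8(uint8_t x) {
//     uint8_t r = 0;
//     for (int i = 0; i < 8; ++i) {
//       r = static_cast<uint8_t>((r << 1) | ((x >> i) & 1));
//     }
//     return r;
//   }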
4296
4297#endif // HWY_TARGET <= HWY_AVX3_DL
4298
4299// ------------------------------ InterleaveLower
4300
4301template <typename T, HWY_IF_T_SIZE(T, 1)>
4302 HWY_API Vec512<T> InterleaveLower(Vec512<T> a, Vec512<T> b) {
4303 return Vec512<T>{_mm512_unpacklo_epi8(a.raw, b.raw)};
4304}
4305template <typename T, HWY_IF_T_SIZE(T, 2)>
4306HWY_API Vec512<T> InterleaveLower(Vec512<T> a, Vec512<T> b) {
4307 const DFromV<decltype(a)> d;
4308 const RebindToUnsigned<decltype(d)> du;
4309 using VU = VFromD<decltype(du)>; // for float16_t
4310 return BitCast(
4311 d, VU{_mm512_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
4312}
4313template <typename T, HWY_IF_T_SIZE(T, 4)>
4314HWY_API Vec512<T> InterleaveLower(Vec512<T> a, Vec512<T> b) {
4315 return Vec512<T>{_mm512_unpacklo_epi32(a.raw, b.raw)};
4316}
4317template <typename T, HWY_IF_T_SIZE(T, 8)>
4318HWY_API Vec512<T> InterleaveLower(Vec512<T> a, Vec512<T> b) {
4319 return Vec512<T>{_mm512_unpacklo_epi64(a.raw, b.raw)};
4320}
4321 HWY_API Vec512<float> InterleaveLower(Vec512<float> a, Vec512<float> b) {
4322 return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
4323}
4324 HWY_API Vec512<double> InterleaveLower(Vec512<double> a, Vec512<double> b) {
4325 return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
4326}
4327
4328// ------------------------------ InterleaveUpper
4329
4330template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4331 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4332 return VFromD<D>{_mm512_unpackhi_epi8(a.raw, b.raw)};
4333}
4334template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4335 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
4336 const RebindToUnsigned<decltype(d)> du;
4337 using VU = VFromD<decltype(du)>; // for float16_t
4338 return BitCast(
4339 d, VU{_mm512_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
4340}
4341template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4342 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4343 return VFromD<D>{_mm512_unpackhi_epi32(a.raw, b.raw)};
4344}
4345template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4346 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4347 return VFromD<D>{_mm512_unpackhi_epi64(a.raw, b.raw)};
4348}
4349
4350template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4351 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4352 return VFromD<D>{_mm512_unpackhi_ps(a.raw, b.raw)};
4353}
4354template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4355 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4356 return VFromD<D>{_mm512_unpackhi_pd(a.raw, b.raw)};
4357}
4358
4359// ------------------------------ Concat* halves
4360
4361// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
4362template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4363 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
4364 const RebindToUnsigned<decltype(d)> du; // for float16_t
4365 return BitCast(d,
4366 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4367 BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BABA)});
4368}
4369template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4370 HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4371 return VFromD<D>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
4372}
4373template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4374 HWY_API Vec512<double> ConcatLowerLower(D /* tag */, Vec512<double> hi,
4375 Vec512<double> lo) {
4376 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
4377}
4378
4379// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
4380template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4381 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4382 const RebindToUnsigned<decltype(d)> du; // for float16_t
4383 return BitCast(d,
4384 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4385 BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_DCDC)});
4386}
4387template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4388 HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4389 return VFromD<D>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
4390}
4391template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4392 HWY_API Vec512<double> ConcatUpperUpper(D /* tag */, Vec512<double> hi,
4393 Vec512<double> lo) {
4394 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
4395}
4396
4397// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
4398template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4399 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4400 const RebindToUnsigned<decltype(d)> du; // for float16_t
4401 return BitCast(d,
4402 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4403 BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BADC)});
4404}
4405template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4406 HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4407 return VFromD<D>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
4408}
4409template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4410 HWY_API Vec512<double> ConcatLowerUpper(D /* tag */, Vec512<double> hi,
4411 Vec512<double> lo) {
4412 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
4413}
4414
4415// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4416template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4417 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
4418 // There is no imm8 blend in AVX512. Use blend16 because 32-bit masks
4419 // are efficiently loaded from 32-bit regs.
4420 const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
4421 const RebindToUnsigned<decltype(d)> du; // for float16_t
4422 return BitCast(d, VFromD<decltype(du)>{_mm512_mask_blend_epi16(
4423 mask, BitCast(du, hi).raw, BitCast(du, lo).raw)});
4424}
4425template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4426 HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4427 const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
4428 return VFromD<D>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
4429}
4430template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4431 HWY_API Vec512<double> ConcatUpperLower(D /* tag */, Vec512<double> hi,
4432 Vec512<double> lo) {
4433 const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
4434 return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
4435}
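// Semantics sketch (illustrative): writing a 512-bit vector as upper:lower
// 256-bit halves,
//   ConcatLowerLower(d, hi, lo) = hiL:loL
//   ConcatUpperUpper(d, hi, lo) = hiH:loH
//   ConcatLowerUpper(d, hi, lo) = hiL:loH
//   ConcatUpperLower(d, hi, lo) = hiH:loL
// Only ConcatUpperLower keeps every lane in place, which is why a mask blend
// (0x0000FFFF / 0x00FF / 0x0F above) suffices instead of a block shuffle.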
4436
4437// ------------------------------ ConcatOdd
4438
4439template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4440 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4441 const RebindToUnsigned<decltype(d)> du;
4442#if HWY_TARGET <= HWY_AVX3_DL
4443 alignas(64) static constexpr uint8_t kIdx[64] = {
4444 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
4445 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
4446 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
4447 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
4448 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
4449 return BitCast(
4450 d, Vec512<uint8_t>{_mm512_permutex2var_epi8(
4451 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4452#else
4453 const RepartitionToWide<decltype(du)> dw;
4454 // Right-shift 8 bits per u16 so we can pack.
4455 const Vec512<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
4456 const Vec512<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
4457 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
4458 // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
4459 const Full512<uint64_t> du64;
4460 alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
4461 return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
4462#endif
4463}
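// Illustrative semantics (not upstream text): the lower 256 bits of the
// result are the odd-indexed lanes of lo, the upper 256 bits those of hi.
// Without AVX3_DL, the pack operates within 128-bit blocks, hence the final
// u64 permute to restore ascending order.
//
//   const Full512<uint8_t> d;
//   const Vec512<uint8_t> lo = Iota(d, 0);    // 0,1,...,63
//   const Vec512<uint8_t> hi = Iota(d, 64);   // 64,65,...,127
//   const Vec512<uint8_t> odd = ConcatOdd(d, hi, lo);
//   // odd = 1,3,5,...,63, 65,67,...,127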
4464
4465template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4467 const RebindToUnsigned<decltype(d)> du;
4468 alignas(64) static constexpr uint16_t kIdx[32] = {
4469 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
4470 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
4471 return BitCast(
4472 d, Vec512<uint16_t>{_mm512_permutex2var_epi16(
4473 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4474}
4475
4476template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4478 const RebindToUnsigned<decltype(d)> du;
4479 alignas(64) static constexpr uint32_t kIdx[16] = {
4480 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
4481 return BitCast(
4482 d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
4483 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4484}
4485
4486template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4488 const RebindToUnsigned<decltype(d)> du;
4489 alignas(64) static constexpr uint32_t kIdx[16] = {
4490 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
4491 return VFromD<D>{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
4492}
4493
4494template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4496 const RebindToUnsigned<decltype(d)> du;
4497 alignas(64) static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4498 return BitCast(
4499 d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
4500 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4501}
4502
4503template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4505 const RebindToUnsigned<decltype(d)> du;
4506 alignas(64) static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4507 return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
4508}
4509
4510// ------------------------------ ConcatEven
4511
4512template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4513 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4514 const RebindToUnsigned<decltype(d)> du;
4515#if HWY_TARGET <= HWY_AVX3_DL
4516 alignas(64) static constexpr uint8_t kIdx[64] = {
4517 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
4518 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
4519 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
4520 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
4521 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
4522 return BitCast(
4523 d, Vec512<uint32_t>{_mm512_permutex2var_epi8(
4524 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4525#else
4526 const RepartitionToWide<decltype(du)> dw;
4527 // Isolate lower 8 bits per u16 so we can pack.
4528 const Vec512<uint16_t> mask = Set(dw, 0x00FF);
4529 const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
4530 const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
4531 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
4532 // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
4533 const Full512<uint64_t> du64;
4534 alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
4535 return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
4536#endif
4537}
4538
4539template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4541 const RebindToUnsigned<decltype(d)> du;
4542 alignas(64) static constexpr uint16_t kIdx[32] = {
4543 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
4544 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
4545 return BitCast(
4546 d, Vec512<uint32_t>{_mm512_permutex2var_epi16(
4547 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4548}
4549
4550template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4552 const RebindToUnsigned<decltype(d)> du;
4553 alignas(64) static constexpr uint32_t kIdx[16] = {
4554 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
4555 return BitCast(
4556 d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
4557 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4558}
4559
4560template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4562 const RebindToUnsigned<decltype(d)> du;
4563 alignas(64) static constexpr uint32_t kIdx[16] = {
4564 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
4565 return VFromD<D>{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
4566}
4567
4568template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4570 const RebindToUnsigned<decltype(d)> du;
4571 alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4572 return BitCast(
4573 d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
4574 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4575}
4576
4577template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4579 const RebindToUnsigned<decltype(d)> du;
4580 alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4581 return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
4582}
4583
4584// ------------------------------ InterleaveWholeLower
4585
4586template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4587 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4588 #if HWY_TARGET <= HWY_AVX3_DL
4589 const RebindToUnsigned<decltype(d)> du;
4590 alignas(64) static constexpr uint8_t kIdx[64] = {
4591 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71,
4592 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
4593 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
4594 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95};
4595 return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
4596#else
4597 alignas(64) static constexpr uint64_t kIdx2[8] = {0, 1, 8, 9, 2, 3, 10, 11};
4598 const Repartition<uint64_t, decltype(d)> du64;
4599 return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
4600 Load(du64, kIdx2).raw,
4601 InterleaveUpper(d, a, b).raw)};
4602#endif
4603}
4604
4605template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4607 const RebindToUnsigned<decltype(d)> du;
4608 alignas(64) static constexpr uint16_t kIdx[32] = {
4609 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
4610 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
4611 return BitCast(
4612 d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
4613 BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
4614}
4615
4616template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4618 const RebindToUnsigned<decltype(d)> du;
4619 alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
4620 4, 20, 5, 21, 6, 22, 7, 23};
4621 return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
4622}
4623
4624template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4626 const RebindToUnsigned<decltype(d)> du;
4627 alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
4628 4, 20, 5, 21, 6, 22, 7, 23};
4629 return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
4630}
4631
4632template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4634 const RebindToUnsigned<decltype(d)> du;
4635 alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4636 return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
4637}
4638
4639template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4641 const RebindToUnsigned<decltype(d)> du;
4642 alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4643 return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
4644}
4645
4646// ------------------------------ InterleaveWholeUpper
4647
4648template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4649 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4650 #if HWY_TARGET <= HWY_AVX3_DL
4651 const RebindToUnsigned<decltype(d)> du;
4652 alignas(64) static constexpr uint8_t kIdx[64] = {
4653 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103,
4654 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
4655 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
4656 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127};
4657 return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
4658#else
4659 alignas(64) static constexpr uint64_t kIdx2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
4660 const Repartition<uint64_t, decltype(d)> du64;
4661 return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
4662 Load(du64, kIdx2).raw,
4663 InterleaveUpper(d, a, b).raw)};
4664#endif
4665}
4666
4667template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4669 const RebindToUnsigned<decltype(d)> du;
4670 alignas(64) static constexpr uint16_t kIdx[32] = {
4671 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
4672 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
4673 return BitCast(
4674 d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
4675 BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
4676}
4677
4678template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4680 const RebindToUnsigned<decltype(d)> du;
4681 alignas(64) static constexpr uint32_t kIdx[16] = {
4682 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4683 return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
4684}
4685
4686template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4688 const RebindToUnsigned<decltype(d)> du;
4689 alignas(64) static constexpr uint32_t kIdx[16] = {
4690 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4691 return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
4692}
4693
4694template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4696 const RebindToUnsigned<decltype(d)> du;
4697 alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
4698 return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
4699}
4700
4701template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4703 const RebindToUnsigned<decltype(d)> du;
4704 alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
4705 return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
4706}
4707
4708// ------------------------------ DupEven (InterleaveLower)
4709
4710template <typename T, HWY_IF_T_SIZE(T, 4)>
4711 HWY_API Vec512<T> DupEven(Vec512<T> v) {
4712 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
4713}
4715 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
4716}
4717
4718template <typename T, HWY_IF_T_SIZE(T, 8)>
4719HWY_API Vec512<T> DupEven(const Vec512<T> v) {
4720 const DFromV<decltype(v)> d;
4721 return InterleaveLower(d, v, v);
4722}
4723
4724// ------------------------------ DupOdd (InterleaveUpper)
4725
4726template <typename T, HWY_IF_T_SIZE(T, 4)>
4727 HWY_API Vec512<T> DupOdd(Vec512<T> v) {
4728 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
4729}
4731 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
4732}
4733
4734template <typename T, HWY_IF_T_SIZE(T, 8)>
4735HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
4736 const DFromV<decltype(v)> d;
4737 return InterleaveUpper(d, v, v);
4738}
4739
4740// ------------------------------ OddEven (IfThenElse)
4741
4742template <typename T>
4743 HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
4744 constexpr size_t s = sizeof(T);
4745 constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
4746 return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
4747}
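// Worked example of the mask arithmetic above (added for clarity): the 64-bit
// constant has every even bit set. For sizeof(T) == 4 there are 16 lanes, so
// shifting right by 48 leaves 0x5555, i.e. lanes 0,2,...,14 selected;
// IfThenElse then takes those even lanes from b and the remaining odd lanes
// from a. The shifts 0/32/48/56 produce 64-, 32-, 16- and 8-bit masks for
// lane sizes 1, 2, 4 and 8 respectively.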
4748
4749// -------------------------- InterleaveEven
4750
4751template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4752 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
4753 return VFromD<D>{_mm512_mask_shuffle_epi32(
4754 a.raw, static_cast<__mmask16>(0xAAAA), b.raw,
4755 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
4756}
4757template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4759 return VFromD<D>{_mm512_mask_shuffle_ps(a.raw, static_cast<__mmask16>(0xAAAA),
4760 b.raw, b.raw,
4761 _MM_SHUFFLE(2, 2, 0, 0))};
4762}
4763// -------------------------- InterleaveOdd
4764
4765template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4766 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
4767 return VFromD<D>{_mm512_mask_shuffle_epi32(
4768 b.raw, static_cast<__mmask16>(0x5555), a.raw,
4769 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
4770}
4771template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4773 return VFromD<D>{_mm512_mask_shuffle_ps(b.raw, static_cast<__mmask16>(0x5555),
4774 a.raw, a.raw,
4775 _MM_SHUFFLE(3, 3, 1, 1))};
4776}
4777
4778// ------------------------------ OddEvenBlocks
4779
4780template <typename T>
4781 HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
4782 const DFromV<decltype(odd)> d;
4783 const RebindToUnsigned<decltype(d)> du; // for float16_t
4784 return BitCast(
4785 d, VFromD<decltype(du)>{_mm512_mask_blend_epi64(
4786 __mmask8{0x33u}, BitCast(du, odd).raw, BitCast(du, even).raw)});
4787}
4788
4790 return Vec512<float>{
4791 _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
4792}
4793
4795 return Vec512<double>{
4796 _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
4797}
4798
4799// ------------------------------ SwapAdjacentBlocks
4800
4801template <typename T>
4802 HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
4803 const DFromV<decltype(v)> d;
4804 const RebindToUnsigned<decltype(d)> du; // for float16_t
4805 return BitCast(d,
4806 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4807 BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_CDAB)});
4808}
4809
4811 return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
4812}
4813
4815 return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
4816}
4817
4818// ------------------------------ ReverseBlocks
4819
4820template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
4821 HWY_API VFromD<D> ReverseBlocks(D d, VFromD<D> v) {
4822 const RebindToUnsigned<decltype(d)> du; // for float16_t
4823 return BitCast(d,
4824 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4825 BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_ABCD)});
4826}
4827template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4828HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
4829 return VFromD<D>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
4830}
4831template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4832HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
4833 return VFromD<D>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
4834}
4835
4836// ------------------------------ TableLookupBytes (ZeroExtendVector)
4837
4838// Both full
4839template <typename T, typename TI>
4840 HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
4841 const DFromV<decltype(indices)> d;
4842 return BitCast(d, Vec512<uint8_t>{_mm512_shuffle_epi8(
4843 BitCast(Full512<uint8_t>(), bytes).raw,
4844 BitCast(Full512<uint8_t>(), indices).raw)});
4845}
4846
4847// Partial index vector
4848template <typename T, typename TI, size_t NI>
4850 const Full512<TI> d512;
4851 const Half<decltype(d512)> d256;
4852 const Half<decltype(d256)> d128;
4853 // First expand to full 128, then 256, then 512.
4854 const Vec128<TI> from_full{from.raw};
4855 const auto from_512 =
4856 ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
4857 const auto tbl_full = TableLookupBytes(bytes, from_512);
4858 // Shrink to 256, then 128, then partial.
4859 return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
4860}
4861template <typename T, typename TI>
4863 const DFromV<decltype(from)> dih;
4864 const Twice<decltype(dih)> di;
4865 const auto from_512 = ZeroExtendVector(di, from);
4866 return LowerHalf(dih, TableLookupBytes(bytes, from_512));
4867}
4868
4869// Partial table vector
4870template <typename T, size_t N, typename TI>
4872 const DFromV<decltype(from)> d512;
4873 const Half<decltype(d512)> d256;
4874 const Half<decltype(d256)> d128;
4875 // First expand to full 128, then 256, then 512.
4876 const Vec128<T> bytes_full{bytes.raw};
4877 const auto bytes_512 =
4878 ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
4879 return TableLookupBytes(bytes_512, from);
4880}
4881template <typename T, typename TI>
4883 const Full512<T> d;
4884 return TableLookupBytes(ZeroExtendVector(d, bytes), from);
4885}
4886
4887// Partial both are handled by x86_128/256.
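// Usage sketch (illustrative): TableLookupBytes is a per-block byte shuffle:
// each index selects a byte from the corresponding 128-bit block of `bytes`,
// and indices are expected to be in [0, 16). Reversing the bytes within
// every block:
//
//   const Full512<uint8_t> d;
//   alignas(64) static constexpr uint8_t kRev16[64] = {
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   const Vec512<uint8_t> bytes = Iota(d, 0);
//   const Vec512<uint8_t> r = TableLookupBytes(bytes, Load(d, kRev16));
//   // Each 16-byte block of r holds the bytes of that block in reverse.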
4888
4889// ------------------------------ I8/U8 Broadcast (TableLookupBytes)
4890
4891template <int kLane, class T, HWY_IF_T_SIZE(T, 1)>
4892HWY_API Vec512<T> Broadcast(const Vec512<T> v) {
4893 static_assert(0 <= kLane && kLane < 16, "Invalid lane");
4894 return TableLookupBytes(v, Set(Full512<T>(), static_cast<T>(kLane)));
4895}
4896
4897// ------------------------------ Per4LaneBlockShuffle
4898
4899namespace detail {
4900
4901template <class D, HWY_IF_V_SIZE_D(D, 64)>
4902 HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
4903 const uint32_t x2,
4904 const uint32_t x1,
4905 const uint32_t x0) {
4906 return BitCast(d, Vec512<uint32_t>{_mm512_set_epi32(
4907 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
4908 static_cast<int32_t>(x1), static_cast<int32_t>(x0),
4909 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
4910 static_cast<int32_t>(x1), static_cast<int32_t>(x0),
4911 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
4912 static_cast<int32_t>(x1), static_cast<int32_t>(x0),
4913 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
4914 static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
4915}
4916
4917template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
4918 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
4919 hwy::SizeTag<4> /*lane_size_tag*/,
4920 hwy::SizeTag<64> /*vect_size_tag*/, V v) {
4921 return V{
4922 _mm512_shuffle_epi32(v.raw, static_cast<_MM_PERM_ENUM>(kIdx3210 & 0xFF))};
4923}
4924
4925template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
4926 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
4927 hwy::SizeTag<4> /*lane_size_tag*/,
4928 hwy::SizeTag<64> /*vect_size_tag*/, V v) {
4929 return V{_mm512_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))};
4930}
4931
4932template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
4933 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
4934 hwy::SizeTag<8> /*lane_size_tag*/,
4935 hwy::SizeTag<64> /*vect_size_tag*/, V v) {
4936 return V{_mm512_permutex_epi64(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
4937}
4938
4939template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
4940 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
4941 hwy::SizeTag<8> /*lane_size_tag*/,
4942 hwy::SizeTag<64> /*vect_size_tag*/, V v) {
4943 return V{_mm512_permutex_pd(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
4944}
4945
4946} // namespace detail
4947
4948// ------------------------------ SlideUpLanes
4949
4950namespace detail {
4951
4952template <int kI32Lanes, class V, HWY_IF_V_SIZE_V(V, 64)>
4954 const DFromV<decltype(hi)> d;
4955 const Repartition<uint32_t, decltype(d)> du32;
4956 return BitCast(d,
4957 Vec512<uint32_t>{_mm512_alignr_epi32(
4958 BitCast(du32, hi).raw, BitCast(du32, lo).raw, kI32Lanes)});
4959}
4960
4961template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 64)>
4963 const DFromV<decltype(hi)> d;
4964 const Repartition<uint64_t, decltype(d)> du64;
4965 return BitCast(d,
4966 Vec512<uint64_t>{_mm512_alignr_epi64(
4967 BitCast(du64, hi).raw, BitCast(du64, lo).raw, kI64Lanes)});
4968}
4969
4970template <int kI32Lanes, class V, HWY_IF_V_SIZE_V(V, 64)>
4972 static_assert(0 <= kI32Lanes && kI32Lanes <= 15,
4973 "kI32Lanes must be between 0 and 15");
4974 const DFromV<decltype(v)> d;
4975 return CombineShiftRightI32Lanes<16 - kI32Lanes>(v, Zero(d));
4976}
4977
4978template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 64)>
4980 static_assert(0 <= kI64Lanes && kI64Lanes <= 7,
4981 "kI64Lanes must be between 0 and 7");
4982 const DFromV<decltype(v)> d;
4983 return CombineShiftRightI64Lanes<8 - kI64Lanes>(v, Zero(d));
4984}
4985
4986template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4987 HWY_INLINE VFromD<D> TableLookupSlideUpLanes(D d, VFromD<D> v, size_t amt) {
4988 const Repartition<uint8_t, decltype(d)> du8;
4989
4990#if HWY_TARGET <= HWY_AVX3_DL
4991 const auto byte_idx = Iota(du8, static_cast<uint8_t>(size_t{0} - amt));
4992 return TwoTablesLookupLanes(v, Zero(d), Indices512<TFromD<D>>{byte_idx.raw});
4993#else
4994 const Repartition<uint16_t, decltype(d)> du16;
4995 const Repartition<uint64_t, decltype(d)> du64;
4996 const auto byte_idx = Iota(du8, static_cast<uint8_t>(size_t{0} - (amt & 15)));
4997 const auto blk_u64_idx =
4998 Iota(du64, static_cast<uint64_t>(uint64_t{0} - ((amt >> 4) << 1)));
4999
5000 const VFromD<D> even_blocks{
5001 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
5002 const VFromD<D> odd_blocks{
5003 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(3, 1, 1, 3))};
5004 const auto odd_sel_mask =
5005 MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, byte_idx))));
5006 const auto even_blk_lookup_result =
5007 BitCast(d, TableLookupBytes(even_blocks, byte_idx));
5008 const VFromD<D> blockwise_slide_up_result{
5009 _mm512_mask_shuffle_epi8(even_blk_lookup_result.raw, odd_sel_mask.raw,
5010 odd_blocks.raw, byte_idx.raw)};
5011 return BitCast(d, TwoTablesLookupLanes(
5012 BitCast(du64, blockwise_slide_up_result), Zero(du64),
5013 Indices512<uint64_t>{blk_u64_idx.raw}));
5014#endif
5015}
5016
5017} // namespace detail
5018
5019template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 64)>
5020 HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
5021 static_assert(0 <= kBlocks && kBlocks <= 3,
5022 "kBlocks must be between 0 and 3");
5023 switch (kBlocks) {
5024 case 0:
5025 return v;
5026 case 1:
5027 return detail::SlideUpI64Lanes<2>(v);
5028 case 2:
5029 return ConcatLowerLower(d, v, Zero(d));
5030 case 3:
5031 return detail::SlideUpI64Lanes<6>(v);
5032 }
5033
5034 return v;
5035}
5036
5037template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5038HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
5039#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5040 if (__builtin_constant_p(amt)) {
5041 switch (amt) {
5042 case 0:
5043 return v;
5044 case 1:
5045 return detail::SlideUpI32Lanes<1>(v);
5046 case 2:
5047 return detail::SlideUpI64Lanes<1>(v);
5048 case 3:
5049 return detail::SlideUpI32Lanes<3>(v);
5050 case 4:
5051 return detail::SlideUpI64Lanes<2>(v);
5052 case 5:
5053 return detail::SlideUpI32Lanes<5>(v);
5054 case 6:
5055 return detail::SlideUpI64Lanes<3>(v);
5056 case 7:
5057 return detail::SlideUpI32Lanes<7>(v);
5058 case 8:
5059 return ConcatLowerLower(d, v, Zero(d));
5060 case 9:
5061 return detail::SlideUpI32Lanes<9>(v);
5062 case 10:
5063 return detail::SlideUpI64Lanes<5>(v);
5064 case 11:
5065 return detail::SlideUpI32Lanes<11>(v);
5066 case 12:
5067 return detail::SlideUpI64Lanes<6>(v);
5068 case 13:
5069 return detail::SlideUpI32Lanes<13>(v);
5070 case 14:
5071 return detail::SlideUpI64Lanes<7>(v);
5072 case 15:
5073 return detail::SlideUpI32Lanes<15>(v);
5074 }
5075 }
5076#endif
5077
5078 return detail::TableLookupSlideUpLanes(d, v, amt);
5079}
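// Note (added for clarity): when `amt` is a compile-time constant, the switch
// above lets the compiler use a single valignd/valignq (or a cheaper concat)
// instead of the generic table lookup. Usage sketch:
//
//   const Full512<uint32_t> d;
//   const Vec512<uint32_t> s = SlideUpLanes(d, Iota(d, 0), 3);
//   // s = 0,0,0, 0,1,2,...,12 (three zero lanes shifted in at the bottom)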
5080
5081template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5082HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
5083#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5084 if (__builtin_constant_p(amt)) {
5085 switch (amt) {
5086 case 0:
5087 return v;
5088 case 1:
5089 return detail::SlideUpI64Lanes<1>(v);
5090 case 2:
5091 return detail::SlideUpI64Lanes<2>(v);
5092 case 3:
5093 return detail::SlideUpI64Lanes<3>(v);
5094 case 4:
5095 return ConcatLowerLower(d, v, Zero(d));
5096 case 5:
5097 return detail::SlideUpI64Lanes<5>(v);
5098 case 6:
5099 return detail::SlideUpI64Lanes<6>(v);
5100 case 7:
5101 return detail::SlideUpI64Lanes<7>(v);
5102 }
5103 }
5104#endif
5105
5106 return detail::TableLookupSlideUpLanes(d, v, amt);
5107}
5108
5109template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5110HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
5111#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5112 if (__builtin_constant_p(amt)) {
5113 if ((amt & 3) == 0) {
5114 const Repartition<uint32_t, decltype(d)> du32;
5115 return BitCast(d, SlideUpLanes(du32, BitCast(du32, v), amt >> 2));
5116 } else if ((amt & 1) == 0) {
5117 const Repartition<uint16_t, decltype(d)> du16;
5118 return BitCast(
5119 d, detail::TableLookupSlideUpLanes(du16, BitCast(du16, v), amt >> 1));
5120 }
5121#if HWY_TARGET > HWY_AVX3_DL
5122 else if (amt <= 63) { // NOLINT(readability/braces)
5123 const Repartition<uint64_t, decltype(d)> du64;
5124 const size_t blk_u64_slideup_amt = (amt >> 4) << 1;
5125 const auto vu64 = BitCast(du64, v);
5126 const auto v_hi =
5127 BitCast(d, SlideUpLanes(du64, vu64, blk_u64_slideup_amt));
5128 const auto v_lo =
5129 (blk_u64_slideup_amt <= 4)
5130 ? BitCast(d, SlideUpLanes(du64, vu64, blk_u64_slideup_amt + 2))
5131 : Zero(d);
5132 switch (amt & 15) {
5133 case 1:
5134 return CombineShiftRightBytes<15>(d, v_hi, v_lo);
5135 case 3:
5136 return CombineShiftRightBytes<13>(d, v_hi, v_lo);
5137 case 5:
5138 return CombineShiftRightBytes<11>(d, v_hi, v_lo);
5139 case 7:
5140 return CombineShiftRightBytes<9>(d, v_hi, v_lo);
5141 case 9:
5142 return CombineShiftRightBytes<7>(d, v_hi, v_lo);
5143 case 11:
5144 return CombineShiftRightBytes<5>(d, v_hi, v_lo);
5145 case 13:
5146 return CombineShiftRightBytes<3>(d, v_hi, v_lo);
5147 case 15:
5148 return CombineShiftRightBytes<1>(d, v_hi, v_lo);
5149 }
5150 }
5151#endif // HWY_TARGET > HWY_AVX3_DL
5152 }
5153#endif
5154
5155 return detail::TableLookupSlideUpLanes(d, v, amt);
5156}
5157
5158template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5159HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
5160#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5161 if (__builtin_constant_p(amt) && (amt & 1) == 0) {
5162 const Repartition<uint32_t, decltype(d)> du32;
5163 return BitCast(d, SlideUpLanes(du32, BitCast(du32, v), amt >> 1));
5164 }
5165#endif
5166
5167 return detail::TableLookupSlideUpLanes(d, v, amt);
5168}
5169
5170// ------------------------------ Slide1Up
5171
5172template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5173 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
5174 #if HWY_TARGET <= HWY_AVX3_DL
5175 return detail::TableLookupSlideUpLanes(d, v, 1);
5176#else
5177 const auto v_lo = detail::SlideUpI64Lanes<2>(v);
5178 return CombineShiftRightBytes<15>(d, v, v_lo);
5179#endif
5180}
5181
5182template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5183 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
5184 return detail::TableLookupSlideUpLanes(d, v, 1);
5185}
5186
5187template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5188HWY_API VFromD<D> Slide1Up(D /*d*/, VFromD<D> v) {
5189 return detail::SlideUpI32Lanes<1>(v);
5190}
5191
5192template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5193HWY_API VFromD<D> Slide1Up(D /*d*/, VFromD<D> v) {
5194 return detail::SlideUpI64Lanes<1>(v);
5195}
5196
5197// ------------------------------ SlideDownLanes
5198
5199namespace detail {
5200
5201template <int kI32Lanes, class V, HWY_IF_V_SIZE_V(V, 64)>
5203 static_assert(0 <= kI32Lanes && kI32Lanes <= 15,
5204 "kI32Lanes must be between 0 and 15");
5205 const DFromV<decltype(v)> d;
5206 return CombineShiftRightI32Lanes<kI32Lanes>(Zero(d), v);
5207}
5208
5209template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 64)>
5211 static_assert(0 <= kI64Lanes && kI64Lanes <= 7,
5212 "kI64Lanes must be between 0 and 7");
5213 const DFromV<decltype(v)> d;
5214 return CombineShiftRightI64Lanes<kI64Lanes>(Zero(d), v);
5215}
5216
5217template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5218 HWY_INLINE VFromD<D> TableLookupSlideDownLanes(D d, VFromD<D> v, size_t amt) {
5219 const Repartition<uint8_t, decltype(d)> du8;
5220
5221#if HWY_TARGET <= HWY_AVX3_DL
5222 auto byte_idx = Iota(du8, static_cast<uint8_t>(amt));
5223 return TwoTablesLookupLanes(v, Zero(d), Indices512<TFromD<D>>{byte_idx.raw});
5224#else
5225 const Repartition<uint16_t, decltype(d)> du16;
5226 const Repartition<uint64_t, decltype(d)> du64;
5227 const auto byte_idx = Iota(du8, static_cast<uint8_t>(amt & 15));
5228 const auto blk_u64_idx = Iota(du64, static_cast<uint64_t>(((amt >> 4) << 1)));
5229
5230 const VFromD<D> even_blocks{
5231 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(0, 2, 2, 0))};
5232 const VFromD<D> odd_blocks{
5233 _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
5234 const auto odd_sel_mask =
5235 MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, byte_idx))));
5236 const VFromD<D> even_blk_lookup_result{
5237 _mm512_maskz_shuffle_epi8(static_cast<__mmask64>(0x0000FFFFFFFFFFFFULL),
5238 even_blocks.raw, byte_idx.raw)};
5239 const VFromD<D> blockwise_slide_up_result{
5240 _mm512_mask_shuffle_epi8(even_blk_lookup_result.raw, odd_sel_mask.raw,
5241 odd_blocks.raw, byte_idx.raw)};
5242 return BitCast(d, TwoTablesLookupLanes(
5243 BitCast(du64, blockwise_slide_up_result), Zero(du64),
5244 Indices512<uint64_t>{blk_u64_idx.raw}));
5245#endif
5246}
5247
5248} // namespace detail
5249
5250template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 64)>
5251 HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
5252 static_assert(0 <= kBlocks && kBlocks <= 3,
5253 "kBlocks must be between 0 and 3");
5254 const Half<decltype(d)> dh;
5255 switch (kBlocks) {
5256 case 0:
5257 return v;
5258 case 1:
5259 return detail::SlideDownI64Lanes<2>(v);
5260 case 2:
5261 return ZeroExtendVector(d, UpperHalf(dh, v));
5262 case 3:
5263 return detail::SlideDownI64Lanes<6>(v);
5264 }
5265
5266 return v;
5267}
5268
5269template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5270HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
5271#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5272 if (__builtin_constant_p(amt)) {
5273 const Half<decltype(d)> dh;
5274 switch (amt) {
5275 case 1:
5276 return detail::SlideDownI32Lanes<1>(v);
5277 case 2:
5278 return detail::SlideDownI64Lanes<1>(v);
5279 case 3:
5280 return detail::SlideDownI32Lanes<3>(v);
5281 case 4:
5282 return detail::SlideDownI64Lanes<2>(v);
5283 case 5:
5284 return detail::SlideDownI32Lanes<5>(v);
5285 case 6:
5286 return detail::SlideDownI64Lanes<3>(v);
5287 case 7:
5288 return detail::SlideDownI32Lanes<7>(v);
5289 case 8:
5290 return ZeroExtendVector(d, UpperHalf(dh, v));
5291 case 9:
5292 return detail::SlideDownI32Lanes<9>(v);
5293 case 10:
5294 return detail::SlideDownI64Lanes<5>(v);
5295 case 11:
5296 return detail::SlideDownI32Lanes<11>(v);
5297 case 12:
5298 return detail::SlideDownI64Lanes<6>(v);
5299 case 13:
5300 return detail::SlideDownI32Lanes<13>(v);
5301 case 14:
5302 return detail::SlideDownI64Lanes<7>(v);
5303 case 15:
5304 return detail::SlideDownI32Lanes<15>(v);
5305 }
5306 }
5307#endif
5308
5309 return detail::TableLookupSlideDownLanes(d, v, amt);
5310}
5311
5312template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5313HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
5314#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5315 if (__builtin_constant_p(amt)) {
5316 const Half<decltype(d)> dh;
5317 switch (amt) {
5318 case 0:
5319 return v;
5320 case 1:
5321 return detail::SlideDownI64Lanes<1>(v);
5322 case 2:
5323 return detail::SlideDownI64Lanes<2>(v);
5324 case 3:
5325 return detail::SlideDownI64Lanes<3>(v);
5326 case 4:
5327 return ZeroExtendVector(d, UpperHalf(dh, v));
5328 case 5:
5329 return detail::SlideDownI64Lanes<5>(v);
5330 case 6:
5331 return detail::SlideDownI64Lanes<6>(v);
5332 case 7:
5333 return detail::SlideDownI64Lanes<7>(v);
5334 }
5335 }
5336#endif
5337
5338 return detail::TableLookupSlideDownLanes(d, v, amt);
5339}
5340
5341template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5342HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
5343#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5344 if (__builtin_constant_p(amt)) {
5345 if ((amt & 3) == 0) {
5346 const Repartition<uint32_t, decltype(d)> du32;
5347 return BitCast(d, SlideDownLanes(du32, BitCast(du32, v), amt >> 2));
5348 } else if ((amt & 1) == 0) {
5349 const Repartition<uint16_t, decltype(d)> du16;
5350 return BitCast(d, SlideDownLanes(
5351 du16, BitCast(du16, v), amt >> 1));
5352 }
5353#if HWY_TARGET > HWY_AVX3_DL
5354 else if (amt <= 63) { // NOLINT(readability/braces)
5355 const Repartition<uint64_t, decltype(d)> du64;
5356 const size_t blk_u64_slidedown_amt = (amt >> 4) << 1;
5357 const auto vu64 = BitCast(du64, v);
5358 const auto v_lo =
5359 BitCast(d, SlideDownLanes(du64, vu64, blk_u64_slidedown_amt));
5360 const auto v_hi =
5361 (blk_u64_slidedown_amt <= 4)
5362 ? BitCast(d,
5363 SlideDownLanes(du64, vu64, blk_u64_slidedown_amt + 2))
5364 : Zero(d);
5365 switch (amt & 15) {
5366 case 1:
5367 return CombineShiftRightBytes<1>(d, v_hi, v_lo);
5368 case 3:
5369 return CombineShiftRightBytes<3>(d, v_hi, v_lo);
5370 case 5:
5371 return CombineShiftRightBytes<5>(d, v_hi, v_lo);
5372 case 7:
5373 return CombineShiftRightBytes<7>(d, v_hi, v_lo);
5374 case 9:
5375 return CombineShiftRightBytes<9>(d, v_hi, v_lo);
5376 case 11:
5377 return CombineShiftRightBytes<11>(d, v_hi, v_lo);
5378 case 13:
5379 return CombineShiftRightBytes<13>(d, v_hi, v_lo);
5380 case 15:
5381 return CombineShiftRightBytes<15>(d, v_hi, v_lo);
5382 }
5383 }
5384#endif
5385 }
5386#endif
5387
5388 return detail::TableLookupSlideDownLanes(d, v, amt);
5389}
5390
5391template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5392HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
5393#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5394 if (__builtin_constant_p(amt) && (amt & 1) == 0) {
5395 const Repartition<uint32_t, decltype(d)> du32;
5396 return BitCast(d, SlideDownLanes(du32, BitCast(du32, v), amt >> 1));
5397 }
5398#endif
5399
5400 return detail::TableLookupSlideDownLanes(d, v, amt);
5401}
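// Example (illustrative sketch, not part of the original source; assumes the
// usual Highway setup with namespace hn = hwy::HWY_NAMESPACE on a 512-bit
// target):
//   const hn::ScalableTag<uint32_t> d;           // 16 lanes
//   const auto v = hn::Iota(d, 0);               // 0,1,2,...,15
//   const auto r = hn::SlideDownLanes(d, v, 3);  // 3,4,...,15,0,0,0
// If `amt` is a compile-time constant, the switches above dispatch to the
// cheaper fixed-amount helpers; otherwise the table-lookup fallback is used.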
5402
5403// ------------------------------ Slide1Down
5404
5405template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
5406HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
5407#if HWY_TARGET <= HWY_AVX3_DL
5408 return detail::TableLookupSlideDownLanes(d, v, 1);
5409#else
5410 const auto v_hi = detail::SlideDownI64Lanes<2>(v);
5411 return CombineShiftRightBytes<1>(d, v_hi, v);
5412#endif
5413}
5414
5415template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
5416HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
5417 return detail::TableLookupSlideDownLanes(d, v, 1);
5418}
5419
5420template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 4)>
5421HWY_API VFromD<D> Slide1Down(D /*d*/, VFromD<D> v) {
5422 return detail::SlideDownI32Lanes<1>(v);
5423}
5424
5425template <typename D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 8)>
5426HWY_API VFromD<D> Slide1Down(D /*d*/, VFromD<D> v) {
5427 return detail::SlideDownI64Lanes<1>(v);
5428}
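// Example (illustrative sketch): Slide1Down is the single-lane case of
// SlideDownLanes, so each lane receives its upper neighbor and the last lane
// becomes zero:
//   hn::Slide1Down(d, hn::Iota(d, 0));  // 1,2,...,N-1,0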
5429
5430// ================================================== CONVERT
5431
5432// ------------------------------ Promotions (part w/ narrow lanes -> full)
5433
5434// Unsigned: zero-extend.
5435// Note: these have 3 cycle latency; if inputs are already split across the
5436// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
5437template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5438HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint8_t> v) {
5439 return VFromD<D>{_mm512_cvtepu8_epi16(v.raw)};
5440}
5441template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5442HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t> v) {
5443 return VFromD<D>{_mm512_cvtepu8_epi32(v.raw)};
5444}
5445template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5446HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint16_t> v) {
5447 return VFromD<D>{_mm512_cvtepu16_epi32(v.raw)};
5448}
5449template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5450HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
5451 return VFromD<D>{_mm512_cvtepu32_epi64(v.raw)};
5452}
5453template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5454HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint16_t> v) {
5455 return VFromD<D>{_mm512_cvtepu16_epi64(v.raw)};
5456}
5457template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5458HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<uint8_t> v) {
5459 return VFromD<D>{_mm512_cvtepu8_epi64(v.raw)};
5460}
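// Example (illustrative sketch): the destination tag selects the output width
// and each lane of the half-width input is zero-extended:
//   const hn::ScalableTag<uint16_t> d16;          // 32 x u16
//   const hn::Rebind<uint8_t, decltype(d16)> d8;  // 32 x u8 (half vector)
//   const auto wide = hn::PromoteTo(d16, hn::Iota(d8, 0));  // 0,1,...,31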
5461
5462// Signed: replicate sign bit.
5463// Note: these have 3 cycle latency; if inputs are already split across the
5464// 128 bit blocks (in their upper/lower halves), then ZipUpper/ZipLower
5465// followed by a signed shift would be faster.
5466template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5467HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<int8_t> v) {
5468 return VFromD<D>{_mm512_cvtepi8_epi16(v.raw)};
5469}
5470template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
5471HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t> v) {
5472 return VFromD<D>{_mm512_cvtepi8_epi32(v.raw)};
5473}
5474template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
5475HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<int16_t> v) {
5476 return VFromD<D>{_mm512_cvtepi16_epi32(v.raw)};
5477}
5478template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5479HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<int32_t> v) {
5480 return VFromD<D>{_mm512_cvtepi32_epi64(v.raw)};
5481}
5482template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5483HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int16_t> v) {
5484 return VFromD<D>{_mm512_cvtepi16_epi64(v.raw)};
5485}
5486template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5487HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<int8_t> v) {
5488 return VFromD<D>{_mm512_cvtepi8_epi64(v.raw)};
5489}
5490
5491// Float
5492template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5493HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<float16_t> v) {
5494#if HWY_HAVE_FLOAT16
5495 const RebindToUnsigned<DFromV<decltype(v)>> du16;
5496 return VFromD<D>{_mm512_cvtph_ps(BitCast(du16, v).raw)};
5497#else
5498 return VFromD<D>{_mm512_cvtph_ps(v.raw)};
5499#endif // HWY_HAVE_FLOAT16
5500}
5501
5502#if HWY_HAVE_FLOAT16
5503
5504template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5505HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec128<float16_t> v) {
5506 return VFromD<D>{_mm512_cvtph_pd(v.raw)};
5507}
5508
5509#endif // HWY_HAVE_FLOAT16
5510
5511template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5512HWY_API VFromD<D> PromoteTo(D df32, Vec256<bfloat16_t> v) {
5513 const Rebind<uint16_t, decltype(df32)> du16;
5514 const RebindToSigned<decltype(df32)> di32;
5515 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
5516}
5517
5518template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5519HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<float> v) {
5520 return VFromD<D>{_mm512_cvtps_pd(v.raw)};
5521}
5522
5523template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5524HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<int32_t> v) {
5525 return VFromD<D>{_mm512_cvtepi32_pd(v.raw)};
5526}
5527
5528template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5529HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
5530 return VFromD<D>{_mm512_cvtepu32_pd(v.raw)};
5531}
5532
5533template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5534HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
5535 return VFromD<D>{_mm512_cvttps_epi64(v.raw)};
5536}
5537template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5538HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
5539 return VFromD<D>{_mm512_cvttps_epu64(v.raw)};
5540}
5541template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5542HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
5543 return VFromD<D>{_mm512_maskz_cvttps_epu64(Not(MaskFromVec(v)).raw, v.raw)};
5544}
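// Example (illustrative sketch): PromoteInRangeTo truncates toward zero and
// leaves out-of-range results target-specific, whereas the unsigned PromoteTo
// above additionally zeroes lanes whose sign bit is set (via the masked
// intrinsic):
//   const hn::ScalableTag<uint64_t> du64;           // 8 lanes
//   const hn::Rebind<float, decltype(du64)> df;     // 8 x f32
//   hn::PromoteInRangeTo(du64, hn::Set(df, 3.9f));  // all lanes 3
//   hn::PromoteTo(du64, hn::Set(df, -1.0f));        // all lanes 0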
5545
5546// ------------------------------ Demotions (full -> part w/ narrow lanes)
5547
5548template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
5549HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
5550 const Full512<uint64_t> du64;
5551 const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
5552
5553 // Compress even u64 lanes into 256 bit.
5554 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5555 const auto idx64 = Load(du64, kLanes);
5556 const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
5557 return LowerHalf(even);
5558}
5559
5560template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
5561HWY_API VFromD<D> DemoteTo(D dn, Vec512<uint32_t> v) {
5562 const DFromV<decltype(v)> d;
5563 const RebindToSigned<decltype(d)> di;
5564 return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu))));
5565}
5566
5567template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
5568HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
5569 const Full512<uint64_t> du64;
5570 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
5571
5572 // Compress even u64 lanes into 256 bit.
5573 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5574 const auto idx64 = Load(du64, kLanes);
5575 const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
5576 return LowerHalf(even);
5577}
5578
5579template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5580HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
5581 const Full512<uint32_t> du32;
5582 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
5583 const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
5584
5585 const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
5586 const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
5587 return LowerHalf(LowerHalf(fixed));
5588}
5589
5590template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5591HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint32_t> v) {
5592 return VFromD<D>{_mm512_cvtusepi32_epi8(v.raw)};
5593}
5594
5595template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
5596HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int16_t> v) {
5597 const Full512<uint64_t> du64;
5598 const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
5599
5600 // Compress even u64 lanes into 256 bit.
5601 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5602 const auto idx64 = Load(du64, kLanes);
5603 const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
5604 return LowerHalf(even);
5605}
5606
5607template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
5608HWY_API VFromD<D> DemoteTo(D dn, Vec512<uint16_t> v) {
5609 const DFromV<decltype(v)> d;
5610 const RebindToSigned<decltype(d)> di;
5611 return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu))));
5612}
5613
5614template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
5615HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
5616 const Full512<uint32_t> du32;
5617 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
5618 const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
5619
5620 const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
5621 const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
5622 return LowerHalf(LowerHalf(fixed));
5623}
5624
5625template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
5626HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int16_t> v) {
5627 const Full512<uint64_t> du64;
5628 const Vec512<int8_t> i8{_mm512_packs_epi16(v.raw, v.raw)};
5629
5630 // Compress even u64 lanes into 256 bit.
5631 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
5632 const auto idx64 = Load(du64, kLanes);
5633 const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, i8.raw)};
5634 return LowerHalf(even);
5635}
5636
5637template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5638HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
5639 return VFromD<D>{_mm512_cvtsepi64_epi32(v.raw)};
5640}
5641template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
5642HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
5643 return VFromD<D>{_mm512_cvtsepi64_epi16(v.raw)};
5644}
5645template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
5646HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
5647 return VFromD<D>{_mm512_cvtsepi64_epi8(v.raw)};
5648}
5649
5650template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5651HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
5652 const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
5653 return VFromD<D>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
5654}
5655template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
5656HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
5657 const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
5658 return VFromD<D>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
5659}
5660template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
5661HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
5662 const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
5663 return VFromD<D>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
5664}
5665
5666template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5667HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
5668 return VFromD<D>{_mm512_cvtusepi64_epi32(v.raw)};
5669}
5670template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
5671HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
5672 return VFromD<D>{_mm512_cvtusepi64_epi16(v.raw)};
5673}
5674template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
5675HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
5676 return VFromD<D>{_mm512_cvtusepi64_epi8(v.raw)};
5677}
5678
5679template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
5680HWY_API VFromD<D> DemoteTo(D df16, Vec512<float> v) {
5681 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
5682 HWY_DIAGNOSTICS(push)
5683 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
5684 const RebindToUnsigned<decltype(df16)> du16;
5685 return BitCast(
5686 df16, VFromD<decltype(du16)>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
5687 HWY_DIAGNOSTICS(pop)
5688}
5689
5690#if HWY_HAVE_FLOAT16
5691template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
5692HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
5693 return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
5694}
5695#endif // HWY_HAVE_FLOAT16
5696
5697#if HWY_AVX3_HAVE_F32_TO_BF16C
5698template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
5699HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec512<float> v) {
5700#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5701 // Inline assembly workaround for LLVM codegen bug
5702 __m256i raw_result;
5703 __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
5704 return VFromD<D>{raw_result};
5705#else
5706 // The _mm512_cvtneps_pbh intrinsic returns a __m256bh vector that needs to be
5707 // bit-cast to a __m256i vector.
5708 return VFromD<D>{detail::BitCastToInteger(_mm512_cvtneps_pbh(v.raw))};
5709#endif
5710}
5711
5712template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
5713HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec512<float> a,
5714 Vec512<float> b) {
5715#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5716 // Inline assembly workaround for LLVM codegen bug
5717 __m512i raw_result;
5718 __asm__("vcvtne2ps2bf16 %2, %1, %0"
5719 : "=v"(raw_result)
5720 : "v"(b.raw), "v"(a.raw));
5721 return VFromD<D>{raw_result};
5722#else
5723 // The _mm512_cvtne2ps_pbh intrinsic returns a __m512bh vector that needs to
5724 // be bit-cast to a __m512i vector.
5725 return VFromD<D>{detail::BitCastToInteger(_mm512_cvtne2ps_pbh(b.raw, a.raw))};
5726#endif
5727}
5728#endif // HWY_AVX3_HAVE_F32_TO_BF16C
5729
5730template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5731HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int32_t> a,
5732 Vec512<int32_t> b) {
5733 return VFromD<D>{_mm512_packs_epi32(a.raw, b.raw)};
5734}
5735
5736template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5737HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int32_t> a,
5738 Vec512<int32_t> b) {
5739 return VFromD<D>{_mm512_packus_epi32(a.raw, b.raw)};
5740}
5741
5742template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5743HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint32_t> a,
5744 Vec512<uint32_t> b) {
5745 const DFromV<decltype(a)> du32;
5746 const RebindToSigned<decltype(du32)> di32;
5747 const auto max_i32 = Set(du32, 0x7FFFFFFFu);
5748
5749 return ReorderDemote2To(dn, BitCast(di32, Min(a, max_i32)),
5750 BitCast(di32, Min(b, max_i32)));
5751}
5752
5753template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I8_D(D)>
5754HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int16_t> a,
5755 Vec512<int16_t> b) {
5756 return VFromD<D>{_mm512_packs_epi16(a.raw, b.raw)};
5757}
5758
5759template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
5760HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int16_t> a,
5761 Vec512<int16_t> b) {
5762 return VFromD<D>{_mm512_packus_epi16(a.raw, b.raw)};
5763}
5764
5765template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
5766HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint16_t> a,
5767 Vec512<uint16_t> b) {
5768 const DFromV<decltype(a)> du16;
5769 const RebindToSigned<decltype(du16)> di16;
5770 const auto max_i16 = Set(du16, 0x7FFFu);
5771
5772 return ReorderDemote2To(dn, BitCast(di16, Min(a, max_i16)),
5773 BitCast(di16, Min(b, max_i16)));
5774}
5775
5776template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
5777HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<int64_t> a, Vec512<int64_t> b) {
5778 const Half<decltype(dn)> dnh;
5779 return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
5780}
5781
5782template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5783HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint64_t> a,
5784 Vec512<uint64_t> b) {
5785 const Half<decltype(dn)> dnh;
5786 return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
5787}
5788
5789template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
5790 HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5791 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
5792 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
5793 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
5794HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
5795 const Full512<uint64_t> du64;
5796 alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
5797 return BitCast(d, TableLookupLanes(BitCast(du64, ReorderDemote2To(d, a, b)),
5798 SetTableIndices(du64, kIdx)));
5799}
5800
5801template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
5802 HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5803 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
5804 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
5805 HWY_IF_T_SIZE_V(V, 8)>
5806HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
5807 return ReorderDemote2To(d, a, b);
5808}
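// Example (illustrative sketch): ReorderDemote2To packs a and b in a
// block-interleaved order (a consequence of the per-128-bit-block pack
// instructions); OrderedDemote2To additionally permutes so that all lanes of a
// precede all lanes of b:
//   const hn::ScalableTag<int32_t> d32;                 // 16 x i32
//   const hn::Repartition<int16_t, decltype(d32)> d16;  // 32 x i16
//   const auto a = hn::Iota(d32, 0), b = hn::Iota(d32, 16);
//   hn::OrderedDemote2To(d16, a, b);                    // 0,1,...,31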
5809
5810template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5811HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5812 return VFromD<D>{_mm512_cvtpd_ps(v.raw)};
5813}
5814
5815template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5816HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5817 return VFromD<D>{_mm512_cvttpd_epi32(v.raw)};
5818}
5819
5820template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5821HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5822 return VFromD<D>{_mm512_cvttpd_epu32(v.raw)};
5823}
5824
5825template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5826HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5827 return VFromD<D>{_mm512_maskz_cvttpd_epu32(Not(MaskFromVec(v)).raw, v.raw)};
5828}
5829
5830template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5831HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
5832 return VFromD<D>{_mm512_cvtepi64_ps(v.raw)};
5833}
5834
5835template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5836HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
5837 return VFromD<D>{_mm512_cvtepu64_ps(v.raw)};
5838}
5839
5840// For already range-limited input [0, 255].
5841HWY_API Vec128<uint8_t> U8FromU32(const Vec512<uint32_t> v) {
5842 const DFromV<decltype(v)> d32;
5843 // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
5844 // lowest 4 bytes.
5845 const VFromD<decltype(d32)> v8From32 =
5846 Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
5847 const auto quads = TableLookupBytes(v, v8From32);
5848 // Gather the lowest 4 bytes of 4 128-bit blocks.
5849 const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
5850 const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
5851 return LowerHalf(LowerHalf(bytes));
5852}
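// Example (illustrative sketch): the caller guarantees each u32 lane already
// fits in a byte, so no saturation is needed:
//   const hn::ScalableTag<uint32_t> d32;   // 16 lanes
//   hn::U8FromU32(hn::Iota(d32, 0));       // 16 x u8: 0,1,...,15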
5853
5854// ------------------------------ Truncations
5855
5856template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
5857HWY_API VFromD<D> TruncateTo(D d, const Vec512<uint64_t> v) {
5858#if HWY_TARGET <= HWY_AVX3_DL
5859 (void)d;
5860 const Full512<uint8_t> d8;
5861 const VFromD<decltype(d8)> v8From64 = Dup128VecFromValues(
5862 d8, 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56);
5863 const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From64.raw, v.raw)};
5864 return LowerHalf(LowerHalf(LowerHalf(bytes)));
5865#else
5866 const Full512<uint32_t> d32;
5867 alignas(64) static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
5868 0, 2, 4, 6, 8, 10, 12, 14};
5869 const Vec512<uint32_t> even{
5870 _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
5871 return TruncateTo(d, LowerHalf(even));
5872#endif
5873}
5874
5875template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
5876HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint64_t> v) {
5877 const Full512<uint16_t> d16;
5878 alignas(16) static constexpr uint16_t k16From64[8] = {0, 4, 8, 12,
5879 16, 20, 24, 28};
5880 const Vec512<uint16_t> bytes{
5881 _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)};
5882 return LowerHalf(LowerHalf(bytes));
5883}
5884
5885template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5886HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint64_t> v) {
5887 const Full512<uint32_t> d32;
5888 alignas(64) static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
5889 0, 2, 4, 6, 8, 10, 12, 14};
5890 const Vec512<uint32_t> even{
5891 _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
5892 return LowerHalf(even);
5893}
5894
5895template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5896HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
5897#if HWY_TARGET <= HWY_AVX3_DL
5898 const Full512<uint8_t> d8;
5899 const VFromD<decltype(d8)> v8From32 = Dup128VecFromValues(
5900 d8, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
5901 const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From32.raw, v.raw)};
5902#else
5903 const Full512<uint32_t> d32;
5904 // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
5905 // lowest 4 bytes.
5906 const VFromD<decltype(d32)> v8From32 =
5907 Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
5908 const auto quads = TableLookupBytes(v, v8From32);
5909 // Gather the lowest 4 bytes of 4 128-bit blocks.
5910 const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
5911 const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
5912#endif
5913 return LowerHalf(LowerHalf(bytes));
5914}
5915
5916template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
5917HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
5918 const Full512<uint16_t> d16;
5919 alignas(64) static constexpr uint16_t k16From32[32] = {
5920 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5921 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
5922 const Vec512<uint16_t> bytes{
5923 _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)};
5924 return LowerHalf(bytes);
5925}
5926
5927template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
5928HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint16_t> v) {
5929#if HWY_TARGET <= HWY_AVX3_DL
5930 const Full512<uint8_t> d8;
5931 alignas(64) static constexpr uint8_t k8From16[64] = {
5932 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5933 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
5934 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5935 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
5936 const Vec512<uint8_t> bytes{
5937 _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
5938#else
5939 const Full512<uint32_t> d32;
5940 const VFromD<decltype(d32)> v16From32 = Dup128VecFromValues(
5941 d32, 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u);
5942 const auto quads = TableLookupBytes(v, v16From32);
5943 alignas(64) static constexpr uint32_t kIndex32[16] = {
5944 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
5945 const Vec512<uint8_t> bytes{
5946 _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)};
5947#endif
5948 return LowerHalf(bytes);
5949}
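// Example (illustrative sketch): TruncateTo keeps only the low bits of each
// lane (reduction modulo the narrower type), unlike the saturating DemoteTo
// above:
//   const hn::ScalableTag<uint32_t> d32;           // 16 lanes
//   const hn::Rebind<uint8_t, decltype(d32)> d8;
//   hn::TruncateTo(d8, hn::Set(d32, 0x123u));      // 0x23 in every lane
//   hn::DemoteTo(d8, hn::Set(d32, 0x123u));        // 0xFF (saturated)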
5950
5951// ------------------------------ Convert integer <=> floating point
5952
5953#if HWY_HAVE_FLOAT16
5954template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
5955HWY_API VFromD<D> ConvertTo(D /* tag */, Vec512<uint16_t> v) {
5956 return VFromD<D>{_mm512_cvtepu16_ph(v.raw)};
5957}
5958template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
5959HWY_API VFromD<D> ConvertTo(D /* tag */, Vec512<int16_t> v) {
5960 return VFromD<D>{_mm512_cvtepi16_ph(v.raw)};
5961}
5962#endif // HWY_HAVE_FLOAT16
5963
5964template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5965HWY_API VFromD<D> ConvertTo(D /* tag */, Vec512<int32_t> v) {
5966 return VFromD<D>{_mm512_cvtepi32_ps(v.raw)};
5967}
5968
5969template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5970HWY_API VFromD<D> ConvertTo(D /* tag */, Vec512<int64_t> v) {
5971 return VFromD<D>{_mm512_cvtepi64_pd(v.raw)};
5972}
5973
5974template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
5975HWY_API VFromD<D> ConvertTo(D /* tag */, Vec512<uint32_t> v) {
5976 return VFromD<D>{_mm512_cvtepu32_ps(v.raw)};
5977}
5978
5979template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5980HWY_API VFromD<D> ConvertTo(D /* tag */, Vec512<uint64_t> v) {
5981 return VFromD<D>{_mm512_cvtepu64_pd(v.raw)};
5982}
5983
5984// Truncates (rounds toward zero).
5985#if HWY_HAVE_FLOAT16
5986template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5987HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float16_t> v) {
5988 return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
5989}
5990template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5991HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
5992 return VFromD<D>{_mm512_cvttph_epu16(v.raw)};
5993}
5994template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5995HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
5996 return VFromD<D>{_mm512_maskz_cvttph_epu16(Not(MaskFromVec(v)).raw, v.raw)};
5997}
5998#endif // HWY_HAVE_FLOAT16
5999template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
6000HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
6001 return VFromD<D>{_mm512_cvttps_epi32(v.raw)};
6002}
6003template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
6004HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<double> v) {
6005 return VFromD<D>{_mm512_cvttpd_epi64(v.raw)};
6006}
6007template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
6008HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6009 return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
6010}
6011template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
6012HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6013 return VFromD<DU>{_mm512_maskz_cvttps_epu32(Not(MaskFromVec(v)).raw, v.raw)};
6014}
6015template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
6016HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6017 return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
6018}
6019template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
6020HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6021 return VFromD<DU>{_mm512_maskz_cvttpd_epu64(Not(MaskFromVec(v)).raw, v.raw)};
6022}
6023
6024HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
6025 const Full512<int32_t> di;
6026 return detail::FixConversionOverflow(
6027 di, v, Vec512<int32_t>{_mm512_cvtps_epi32(v.raw)});
6028}
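// Example (illustrative sketch): ConvertTo truncates toward zero, while
// NearestInt uses the default rounding mode (round to nearest, ties to even):
//   const hn::ScalableTag<float> df;              // 16 lanes
//   const hn::RebindToSigned<decltype(df)> di;
//   hn::ConvertTo(di, hn::Set(df, -1.7f));        // -1 in every lane
//   hn::NearestInt(hn::Set(df, 2.5f));            //  2 in every lane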
6029
6030// ================================================== CRYPTO
6031
6032#if !defined(HWY_DISABLE_PCLMUL_AES)
6033
6034HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
6035 Vec512<uint8_t> round_key) {
6036#if HWY_TARGET <= HWY_AVX3_DL
6037 return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
6038#else
6039 const DFromV<decltype(state)> d;
6040 const Half<decltype(d)> d2;
6041 return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6042 AESRound(LowerHalf(state), LowerHalf(round_key)));
6043#endif
6044}
6045
6046HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
6047 Vec512<uint8_t> round_key) {
6048#if HWY_TARGET <= HWY_AVX3_DL
6049 return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
6050#else
6051 const DFromV<decltype(state)> d;
6052 const Half<decltype(d)> d2;
6053 return Combine(d,
6054 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6055 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
6056#endif
6057}
6058
6059HWY_API Vec512<uint8_t> AESRoundInv(Vec512<uint8_t> state,
6060 Vec512<uint8_t> round_key) {
6061#if HWY_TARGET <= HWY_AVX3_DL
6062 return Vec512<uint8_t>{_mm512_aesdec_epi128(state.raw, round_key.raw)};
6063#else
6064 const Full512<uint8_t> d;
6065 const Half<decltype(d)> d2;
6066 return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6067 AESRoundInv(LowerHalf(state), LowerHalf(round_key)));
6068#endif
6069}
6070
6071HWY_API Vec512<uint8_t> AESLastRoundInv(Vec512<uint8_t> state,
6072 Vec512<uint8_t> round_key) {
6073#if HWY_TARGET <= HWY_AVX3_DL
6074 return Vec512<uint8_t>{_mm512_aesdeclast_epi128(state.raw, round_key.raw)};
6075#else
6076 const Full512<uint8_t> d;
6077 const Half<decltype(d)> d2;
6078 return Combine(
6079 d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6080 AESLastRoundInv(LowerHalf(state), LowerHalf(round_key)));
6081#endif
6082}
6083
6084template <uint8_t kRcon>
6085HWY_API Vec512<uint8_t> AESKeyGenAssist(Vec512<uint8_t> v) {
6086 const Full512<uint8_t> d;
6087#if HWY_TARGET <= HWY_AVX3_DL
6088 const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
6089 d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
6090 const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
6091 d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
6092 const Repartition<uint32_t, decltype(d)> du32;
6093 const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
6094 const auto sub_word_result = AESLastRound(w13, rconXorMask);
6095 return TableLookupBytes(sub_word_result, rotWordShuffle);
6096#else
6097 const Half<decltype(d)> d2;
6098 return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
6099 AESKeyGenAssist<kRcon>(LowerHalf(v)));
6100#endif
6101}
6102
6103HWY_API Vec512<uint64_t> CLMulLower(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
6104#if HWY_TARGET <= HWY_AVX3_DL
6105 return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
6106#else
6107 alignas(64) uint64_t a[8];
6108 alignas(64) uint64_t b[8];
6109 const DFromV<decltype(va)> d;
6110 const Half<Half<decltype(d)>> d128;
6111 Store(va, d, a);
6112 Store(vb, d, b);
6113 for (size_t i = 0; i < 8; i += 2) {
6114 const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
6115 Store(mul, d128, a + i);
6116 }
6117 return Load(d, a);
6118#endif
6119}
6120
6121HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
6122#if HWY_TARGET <= HWY_AVX3_DL
6123 return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
6124#else
6125 alignas(64) uint64_t a[8];
6126 alignas(64) uint64_t b[8];
6127 const DFromV<decltype(va)> d;
6128 const Half<Half<decltype(d)>> d128;
6129 Store(va, d, a);
6130 Store(vb, d, b);
6131 for (size_t i = 0; i < 8; i += 2) {
6132 const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
6133 Store(mul, d128, a + i);
6134 }
6135 return Load(d, a);
6136#endif
6137}
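// Example (illustrative sketch): CLMulLower/CLMulUpper perform a carry-less
// (GF(2) polynomial) 64x64 -> 128-bit multiply within each 128-bit block:
//   const hn::ScalableTag<uint64_t> d;   // 8 lanes = four 128-bit blocks
//   const auto p = hn::Set(d, 3u);       // polynomial x + 1 in every lane
//   hn::CLMulLower(p, p);                // per block: low u64 = 5 (x^2 + 1),
//                                        // high u64 = 0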
6138
6139#endif // HWY_DISABLE_PCLMUL_AES
6140
6141// ================================================== MISC
6142
6143// ------------------------------ SumsOfAdjQuadAbsDiff (Broadcast,
6144// SumsOfAdjShufQuadAbsDiff)
6145
6146template <int kAOffset, int kBOffset>
6147HWY_API Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
6148 Vec512<uint8_t> b) {
6149 static_assert(0 <= kAOffset && kAOffset <= 1,
6150 "kAOffset must be between 0 and 1");
6151 static_assert(0 <= kBOffset && kBOffset <= 3,
6152 "kBOffset must be between 0 and 3");
6153
6154 const DFromV<decltype(a)> d;
6155 const RepartitionToWideX2<decltype(d)> du32;
6156
6157 // While AVX3 does not have a _mm512_mpsadbw_epu8 intrinsic, the
6158 // SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on
6159 // AVX3 using SumsOfShuffledQuadAbsDiff and U32 Broadcast.
6160 return SumsOfShuffledQuadAbsDiff<kAOffset + 2, kAOffset + 1, kAOffset + 1,
6161 kAOffset>(
6162 a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
6163}
6164
6165#if !HWY_IS_MSAN
6166// ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)
6167
6168HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
6169 const DFromV<decltype(a)> d;
6170 const auto sum = a + b;
6171 const auto overflow_mask = MaskFromVec(
6172 Vec512<int32_t>{_mm512_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)});
6173 const auto i32_max = Set(d, LimitsMax<int32_t>());
6174 const Vec512<int32_t> overflow_result{_mm512_mask_ternarylogic_epi32(
6175 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
6176 return IfThenElse(overflow_mask, overflow_result, sum);
6177}
6178
6179HWY_API Vec512<int64_t> SaturatedAdd(Vec512<int64_t> a, Vec512<int64_t> b) {
6180 const DFromV<decltype(a)> d;
6181 const auto sum = a + b;
6182 const auto overflow_mask = MaskFromVec(
6183 Vec512<int64_t>{_mm512_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)});
6184 const auto i64_max = Set(d, LimitsMax<int64_t>());
6185 const Vec512<int64_t> overflow_result{_mm512_mask_ternarylogic_epi64(
6186 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
6187 return IfThenElse(overflow_mask, overflow_result, sum);
6188}
6189
6190// ------------------------------ I32/I64 SaturatedSub (MaskFromVec)
6191
6192HWY_API Vec512<int32_t> SaturatedSub(Vec512<int32_t> a, Vec512<int32_t> b) {
6193 const DFromV<decltype(a)> d;
6194 const auto diff = a - b;
6195 const auto overflow_mask = MaskFromVec(
6196 Vec512<int32_t>{_mm512_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)});
6197 const auto i32_max = Set(d, LimitsMax<int32_t>());
6198 const Vec512<int32_t> overflow_result{_mm512_mask_ternarylogic_epi32(
6199 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
6200 return IfThenElse(overflow_mask, overflow_result, diff);
6201}
6202
6203HWY_API Vec512<int64_t> SaturatedSub(Vec512<int64_t> a, Vec512<int64_t> b) {
6204 const DFromV<decltype(a)> d;
6205 const auto diff = a - b;
6206 const auto overflow_mask = MaskFromVec(
6207 Vec512<int64_t>{_mm512_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)});
6208 const auto i64_max = Set(d, LimitsMax<int64_t>());
6209 const Vec512<int64_t> overflow_result{_mm512_mask_ternarylogic_epi64(
6210 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
6211 return IfThenElse(overflow_mask, overflow_result, diff);
6212}
6213#endif // !HWY_IS_MSAN
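// Example (illustrative sketch): unlike plain Add/Sub, these clamp to the
// representable range instead of wrapping on overflow:
//   const hn::ScalableTag<int32_t> d;
//   hn::SaturatedAdd(hn::Set(d, hwy::LimitsMax<int32_t>()), hn::Set(d, 1));
//   // == LimitsMax<int32_t>() in every lane
//   hn::SaturatedSub(hn::Set(d, hwy::LimitsMin<int32_t>()), hn::Set(d, 1));
//   // == LimitsMin<int32_t>() in every lane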
6214
6215// ------------------------------ Mask testing
6216
6217// Beware: the suffix indicates the number of mask bits, not lane size!
6218
6219namespace detail {
6220
6221template <typename T>
6222HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
6223#if HWY_COMPILER_HAS_MASK_INTRINSICS
6224 return _kortestz_mask64_u8(mask.raw, mask.raw);
6225#else
6226 return mask.raw == 0;
6227#endif
6228}
6229template <typename T>
6230HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
6231#if HWY_COMPILER_HAS_MASK_INTRINSICS
6232 return _kortestz_mask32_u8(mask.raw, mask.raw);
6233#else
6234 return mask.raw == 0;
6235#endif
6236}
6237template <typename T>
6238HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
6239#if HWY_COMPILER_HAS_MASK_INTRINSICS
6240 return _kortestz_mask16_u8(mask.raw, mask.raw);
6241#else
6242 return mask.raw == 0;
6243#endif
6244}
6245template <typename T>
6246HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
6247#if HWY_COMPILER_HAS_MASK_INTRINSICS
6248 return _kortestz_mask8_u8(mask.raw, mask.raw);
6249#else
6250 return mask.raw == 0;
6251#endif
6252}
6253
6254} // namespace detail
6255
6256template <class D, HWY_IF_V_SIZE_D(D, 64)>
6257HWY_API bool AllFalse(D /* tag */, const MFromD<D> mask) {
6258 return detail::AllFalse(hwy::SizeTag<sizeof(TFromD<D>)>(), mask);
6259}
6260
6261namespace detail {
6262
6263template <typename T>
6264HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
6265#if HWY_COMPILER_HAS_MASK_INTRINSICS
6266 return _kortestc_mask64_u8(mask.raw, mask.raw);
6267#else
6268 return mask.raw == 0xFFFFFFFFFFFFFFFFull;
6269#endif
6270}
6271template <typename T>
6272HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
6273#if HWY_COMPILER_HAS_MASK_INTRINSICS
6274 return _kortestc_mask32_u8(mask.raw, mask.raw);
6275#else
6276 return mask.raw == 0xFFFFFFFFull;
6277#endif
6278}
6279template <typename T>
6280HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
6281#if HWY_COMPILER_HAS_MASK_INTRINSICS
6282 return _kortestc_mask16_u8(mask.raw, mask.raw);
6283#else
6284 return mask.raw == 0xFFFFull;
6285#endif
6286}
6287template <typename T>
6288HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
6289#if HWY_COMPILER_HAS_MASK_INTRINSICS
6290 return _kortestc_mask8_u8(mask.raw, mask.raw);
6291#else
6292 return mask.raw == 0xFFull;
6293#endif
6294}
6295
6296} // namespace detail
6297
6298template <class D, HWY_IF_V_SIZE_D(D, 64)>
6299HWY_API bool AllTrue(D /* tag */, const MFromD<D> mask) {
6300 return detail::AllTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), mask);
6301}
6302
6303// `p` points to at least 8 readable bytes, not all of which need be valid.
6304template <class D, HWY_IF_V_SIZE_D(D, 64)>
6305HWY_API MFromD<D> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) {
6306 MFromD<D> mask;
6307 CopyBytes<8 / sizeof(TFromD<D>)>(bits, &mask.raw);
6308 // N >= 8 (= 512 / 64), so no need to mask invalid bits.
6309 return mask;
6310}
6311
6312// `p` points to at least 8 writable bytes.
6313template <class D, HWY_IF_V_SIZE_D(D, 64)>
6314HWY_API size_t StoreMaskBits(D /* tag */, MFromD<D> mask, uint8_t* bits) {
6315 const size_t kNumBytes = 8 / sizeof(TFromD<D>);
6316 CopyBytes<kNumBytes>(&mask.raw, bits);
6317 // N >= 8 (= 512 / 64), so no need to mask invalid bits.
6318 return kNumBytes;
6319}
6320
6321template <class D, HWY_IF_V_SIZE_D(D, 64)>
6322HWY_API size_t CountTrue(D /* tag */, const MFromD<D> mask) {
6323 return PopCount(static_cast<uint64_t>(mask.raw));
6324}
6325
6326template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_T_SIZE_D(D, 1)>
6327HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
6328 return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
6329}
6330
6331template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
6332HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
6333 return Num0BitsBelowLS1Bit_Nonzero64(mask.raw);
6334}
6335
6336template <class D, HWY_IF_V_SIZE_D(D, 64)>
6337HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
6338 return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
6339 : intptr_t{-1};
6340}
6341
6342template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_T_SIZE_D(D, 1)>
6343HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
6344 return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw);
6345}
6346
6347template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
6348HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
6349 return 63 - Num0BitsAboveMS1Bit_Nonzero64(mask.raw);
6350}
6351
6352template <class D, HWY_IF_V_SIZE_D(D, 64)>
6353HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
6354 return mask.raw ? static_cast<intptr_t>(FindKnownLastTrue(d, mask))
6355 : intptr_t{-1};
6356}
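// Example (illustrative sketch): AVX-512 masks are bit masks, so these reduce
// to popcount/bit-scan on the mask register:
//   const hn::ScalableTag<int32_t> d;                       // 16 lanes
//   const auto m = hn::Lt(hn::Iota(d, 0), hn::Set(d, 3));   // lanes 0..2 true
//   hn::CountTrue(d, m);      // 3
//   hn::FindFirstTrue(d, m);  // 0
//   hn::AllTrue(d, m);        // false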
6357
6358// ------------------------------ Compress
6359
6360// Always implement 8-bit here even if we lack VBMI2 because we can do better
6361// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
6362#ifdef HWY_NATIVE_COMPRESS8
6363#undef HWY_NATIVE_COMPRESS8
6364#else
6365#define HWY_NATIVE_COMPRESS8
6366#endif
6367
6368namespace detail {
6369
6370#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
6371template <size_t N>
6372HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
6373 const Mask128<uint8_t, N> mask) {
6374 return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
6375}
6376HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
6377 const Mask256<uint8_t> mask) {
6378 return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
6379}
6380HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
6381 const Mask512<uint8_t> mask) {
6382 return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
6383}
6384
6385template <size_t N>
6386HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
6387 const Mask128<uint16_t, N> mask) {
6388 return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
6389}
6390HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
6391 const Mask256<uint16_t> mask) {
6392 return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
6393}
6394HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
6395 const Mask512<uint16_t> mask) {
6396 return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
6397}
6398
6399// Slow on Zen4, do not even define these to prevent accidental usage.
6400#if HWY_TARGET != HWY_AVX3_ZEN4
6401
6402template <size_t N>
6403HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
6404 Mask128<uint8_t, N> mask,
6405 uint8_t* HWY_RESTRICT unaligned) {
6406 _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6407}
6408HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
6409 uint8_t* HWY_RESTRICT unaligned) {
6410 _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6411}
6412HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
6413 uint8_t* HWY_RESTRICT unaligned) {
6414 _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6415}
6416
6417template <size_t N>
6418HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
6419 Mask128<uint16_t, N> mask,
6420 uint16_t* HWY_RESTRICT unaligned) {
6421 _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6422}
6423HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
6424 uint16_t* HWY_RESTRICT unaligned) {
6425 _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6426}
6427HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
6428 uint16_t* HWY_RESTRICT unaligned) {
6429 _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6430}
6431
6432#endif // HWY_TARGET != HWY_AVX3_ZEN4
6433
6434HWY_INLINE Vec512<uint8_t> NativeExpand(Vec512<uint8_t> v,
6435 Mask512<uint8_t> mask) {
6436 return Vec512<uint8_t>{_mm512_maskz_expand_epi8(mask.raw, v.raw)};
6437}
6438
6439HWY_INLINE Vec512<uint16_t> NativeExpand(Vec512<uint16_t> v,
6440 Mask512<uint16_t> mask) {
6441 return Vec512<uint16_t>{_mm512_maskz_expand_epi16(mask.raw, v.raw)};
6442}
6443
6444template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
6445HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
6446 const uint8_t* HWY_RESTRICT unaligned) {
6447 return VFromD<D>{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)};
6448}
6449
6450template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
6451HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
6452 const uint16_t* HWY_RESTRICT unaligned) {
6453 return VFromD<D>{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)};
6454}
6455
6456#endif // HWY_TARGET <= HWY_AVX3_DL
6457
6458template <size_t N>
6459HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
6460 Mask128<uint32_t, N> mask) {
6461 return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
6462}
6463HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
6464 Mask256<uint32_t> mask) {
6465 return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
6466}
6467HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
6468 Mask512<uint32_t> mask) {
6469 return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
6470}
6471// We use table-based compress for 64-bit lanes, see CompressIsPartition.
6472
6473// Slow on Zen4, do not even define these to prevent accidental usage.
6474#if HWY_TARGET != HWY_AVX3_ZEN4
6475
6476template <size_t N>
6477HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
6478 Mask128<uint32_t, N> mask,
6479 uint32_t* HWY_RESTRICT unaligned) {
6480 _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6481}
6482HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
6483 uint32_t* HWY_RESTRICT unaligned) {
6484 _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6485}
6486HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
6487 uint32_t* HWY_RESTRICT unaligned) {
6488 _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6489}
6490
6491template <size_t N>
6492HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
6493 Mask128<uint64_t, N> mask,
6494 uint64_t* HWY_RESTRICT unaligned) {
6495 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6496}
6497HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
6498 uint64_t* HWY_RESTRICT unaligned) {
6499 _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6500}
6501HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
6502 uint64_t* HWY_RESTRICT unaligned) {
6503 _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6504}
6505
6506template <size_t N>
6507HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
6508 float* HWY_RESTRICT unaligned) {
6509 _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6510}
6511HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
6512 float* HWY_RESTRICT unaligned) {
6513 _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6514}
6515HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
6516 float* HWY_RESTRICT unaligned) {
6517 _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6518}
6519
6520template <size_t N>
6521HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
6522 Mask128<double, N> mask,
6523 double* HWY_RESTRICT unaligned) {
6524 _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6525}
6526HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
6527 double* HWY_RESTRICT unaligned) {
6528 _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6529}
6530HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
6531 double* HWY_RESTRICT unaligned) {
6532 _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6533}
6534
6535#endif // HWY_TARGET != HWY_AVX3_ZEN4
6536
6537HWY_INLINE Vec512<uint32_t> NativeExpand(Vec512<uint32_t> v,
6538 Mask512<uint32_t> mask) {
6539 return Vec512<uint32_t>{_mm512_maskz_expand_epi32(mask.raw, v.raw)};
6540}
6541
6542HWY_INLINE Vec512<uint64_t> NativeExpand(Vec512<uint64_t> v,
6543 Mask512<uint64_t> mask) {
6544 return Vec512<uint64_t>{_mm512_maskz_expand_epi64(mask.raw, v.raw)};
6545}
6546
6547template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
6548HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
6549 const uint32_t* HWY_RESTRICT unaligned) {
6550 return VFromD<D>{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)};
6551}
6552
6553template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
6554HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
6555 const uint64_t* HWY_RESTRICT unaligned) {
6556 return VFromD<D>{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)};
6557}
6558
6559// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is
6560// only a single compressed vector (u32x16). Other EmuCompress are implemented
6561// after the EmuCompressStore they build upon.
6562template <size_t N>
6563HWY_INLINE Vec128<uint8_t, N> EmuCompress(Vec128<uint8_t, N> v,
6564 Mask128<uint8_t, N> mask) {
6565 const DFromV<decltype(v)> d;
6566 const Rebind<uint32_t, decltype(d)> d32;
6567 const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);
6568
6569 const uint64_t mask_bits{mask.raw};
6570 // Mask type is __mmask16 if v is full 128, else __mmask8.
6571 using M32 = MFromD<decltype(d32)>;
6572 const M32 m0{static_cast<typename M32::Raw>(mask_bits)};
6573 return TruncateTo(d, Compress(v0, m0));
6574}
6575
6576template <size_t N>
6577HWY_INLINE Vec128<uint16_t, N> EmuCompress(Vec128<uint16_t, N> v,
6578 Mask128<uint16_t, N> mask) {
6579 const DFromV<decltype(v)> d;
6580 const Rebind<int32_t, decltype(d)> di32;
6581 const RebindToUnsigned<decltype(di32)> du32;
6582 const MFromD<decltype(du32)> mask32{static_cast<__mmask8>(mask.raw)};
6583 // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
6584 // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
6585 const VFromD<decltype(du32)> v32 = BitCast(du32, PromoteTo(di32, v));
6586 return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
6587}
6588
6589HWY_INLINE Vec256<uint16_t> EmuCompress(Vec256<uint16_t> v,
6590 Mask256<uint16_t> mask) {
6591 const DFromV<decltype(v)> d;
6592 const Rebind<int32_t, decltype(d)> di32;
6593 const RebindToUnsigned<decltype(di32)> du32;
6594 const Mask512<uint32_t> mask32{static_cast<__mmask16>(mask.raw)};
6595 const Vec512<uint32_t> v32 = BitCast(du32, PromoteTo(di32, v));
6596 return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
6597}
6598
6599// See above - small-vector EmuCompressStore are implemented via EmuCompress.
6600template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6601HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
6602 TFromD<D>* HWY_RESTRICT unaligned) {
6603 StoreU(EmuCompress(v, mask), d, unaligned);
6604}
6605
6606template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6607HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
6608 TFromD<D>* HWY_RESTRICT unaligned) {
6609 StoreU(EmuCompress(v, mask), d, unaligned);
6610}
6611
6612// Main emulation logic for wider vectors, starting with EmuCompressStore because
6613// it is most convenient to merge pieces using memory (concatenating vectors at
6614// byte offsets is difficult).
6615template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
6616HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
6617 TFromD<D>* HWY_RESTRICT unaligned) {
6618 const uint64_t mask_bits{mask.raw};
6619 const Half<decltype(d)> dh;
6620 const Rebind<uint32_t, decltype(dh)> d32;
6621 const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(v));
6622 const Vec512<uint32_t> v1 = PromoteTo(d32, UpperHalf(dh, v));
6623 const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
6624 const Mask512<uint32_t> m1{static_cast<__mmask16>(mask_bits >> 16)};
6625 const Vec128<uint8_t> c0 = TruncateTo(dh, NativeCompress(v0, m0));
6626 const Vec128<uint8_t> c1 = TruncateTo(dh, NativeCompress(v1, m1));
6627 uint8_t* HWY_RESTRICT pos = unaligned;
6628 StoreU(c0, dh, pos);
6629 StoreU(c1, dh, pos + CountTrue(d32, m0));
6630}
6631
6632template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
6633HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
6634 TFromD<D>* HWY_RESTRICT unaligned) {
6635 const uint64_t mask_bits{mask.raw};
6636 const Half<Half<decltype(d)>> dq;
6637 const Rebind<uint32_t, decltype(dq)> d32;
6638 alignas(64) uint8_t lanes[64];
6639 Store(v, d, lanes);
6640 const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(LowerHalf(v)));
6641 const Vec512<uint32_t> v1 = PromoteTo(d32, Load(dq, lanes + 16));
6642 const Vec512<uint32_t> v2 = PromoteTo(d32, Load(dq, lanes + 32));
6643 const Vec512<uint32_t> v3 = PromoteTo(d32, Load(dq, lanes + 48));
6644 const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
6645 const Mask512<uint32_t> m1{
6646 static_cast<uint16_t>((mask_bits >> 16) & 0xFFFFu)};
6647 const Mask512<uint32_t> m2{
6648 static_cast<uint16_t>((mask_bits >> 32) & 0xFFFFu)};
6649 const Mask512<uint32_t> m3{static_cast<__mmask16>(mask_bits >> 48)};
6650 const Vec128<uint8_t> c0 = TruncateTo(dq, NativeCompress(v0, m0));
6651 const Vec128<uint8_t> c1 = TruncateTo(dq, NativeCompress(v1, m1));
6652 const Vec128<uint8_t> c2 = TruncateTo(dq, NativeCompress(v2, m2));
6653 const Vec128<uint8_t> c3 = TruncateTo(dq, NativeCompress(v3, m3));
6654 uint8_t* HWY_RESTRICT pos = unaligned;
6655 StoreU(c0, dq, pos);
6656 pos += CountTrue(d32, m0);
6657 StoreU(c1, dq, pos);
6658 pos += CountTrue(d32, m1);
6659 StoreU(c2, dq, pos);
6660 pos += CountTrue(d32, m2);
6661 StoreU(c3, dq, pos);
6662}
6663
6664template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
6665HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
6666 TFromD<D>* HWY_RESTRICT unaligned) {
6667 const Repartition<int32_t, decltype(d)> di32;
6668 const RebindToUnsigned<decltype(di32)> du32;
6669 const Half<decltype(d)> dh;
6670 const Vec512<uint32_t> promoted0 =
6671 BitCast(du32, PromoteTo(di32, LowerHalf(dh, v)));
6672 const Vec512<uint32_t> promoted1 =
6673 BitCast(du32, PromoteTo(di32, UpperHalf(dh, v)));
6674
6675 const uint64_t mask_bits{mask.raw};
6676 const uint64_t maskL = mask_bits & 0xFFFF;
6677 const uint64_t maskH = mask_bits >> 16;
6678 const Mask512<uint32_t> mask0{static_cast<__mmask16>(maskL)};
6679 const Mask512<uint32_t> mask1{static_cast<__mmask16>(maskH)};
6680 const Vec512<uint32_t> compressed0 = NativeCompress(promoted0, mask0);
6681 const Vec512<uint32_t> compressed1 = NativeCompress(promoted1, mask1);
6682
6683 const Vec256<uint16_t> demoted0 = DemoteTo(dh, BitCast(di32, compressed0));
6684 const Vec256<uint16_t> demoted1 = DemoteTo(dh, BitCast(di32, compressed1));
6685
6686 // Store 256-bit halves
6687 StoreU(demoted0, dh, unaligned);
6688 StoreU(demoted1, dh, unaligned + PopCount(maskL));
6689}
6690
6691// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
6692template <typename T> // 1 or 2 bytes
6693HWY_INLINE Vec512<T> EmuCompress(Vec512<T> v, Mask512<T> mask) {
6694 const DFromV<decltype(v)> d;
6695 alignas(64) T buf[2 * Lanes(d)];
6696 EmuCompressStore(v, mask, d, buf);
6697 return Load(d, buf);
6698}
6699
6700HWY_INLINE Vec256<uint8_t> EmuCompress(Vec256<uint8_t> v,
6701 const Mask256<uint8_t> mask) {
6702 const DFromV<decltype(v)> d;
6703 alignas(32) uint8_t buf[2 * 32 / sizeof(uint8_t)];
6704 EmuCompressStore(v, mask, d, buf);
6705 return Load(d, buf);
6706}
6707
6708} // namespace detail
6709
6710template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
6711HWY_API V Compress(V v, const M mask) {
6712 const DFromV<decltype(v)> d;
6713 const RebindToUnsigned<decltype(d)> du;
6714 const auto mu = RebindMask(du, mask);
6715#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
6716 return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
6717#else
6718 return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
6719#endif
6720}
6721
6722template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
6723HWY_API V Compress(V v, const M mask) {
6724 const DFromV<decltype(v)> d;
6725 const RebindToUnsigned<decltype(d)> du;
6726 const auto mu = RebindMask(du, mask);
6727 return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
6728}
6729
6730template <typename T, HWY_IF_T_SIZE(T, 8)>
6731HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
6732 // See CompressIsPartition. u64 is faster than u32.
6733 alignas(16) static constexpr uint64_t packed_array[256] = {
6734 // From PrintCompress32x8Tables, without the FirstN extension (there is
6735 // no benefit to including them because 64-bit CompressStore is anyway
6736 // masked, but also no harm because TableLookupLanes ignores the MSB).
6737 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
6738 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
6739 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
6740 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
6741 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
6742 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
6743 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
6744 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
6745 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
6746 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
6747 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
6748 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
6749 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
6750 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
6751 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
6752 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
6753 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
6754 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
6755 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
6756 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
6757 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
6758 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
6759 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
6760 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
6761 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
6762 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
6763 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
6764 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
6765 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
6766 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
6767 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
6768 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
6769 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
6770 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
6771 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
6772 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
6773 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
6774 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
6775 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
6776 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
6777 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
6778 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
6779 0x10765432, 0x17654320, 0x07654321, 0x76543210};
6780
6781 // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
6782 // _mm512_permutexvar_epi64 will ignore the upper bits.
6783 const DFromV<decltype(v)> d;
6784 const RebindToUnsigned<decltype(d)> du64;
6785 const auto packed = Set(du64, packed_array[mask.raw]);
6786 alignas(64) static constexpr uint64_t shifts[8] = {0, 4, 8, 12,
6787 16, 20, 24, 28};
6788 const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
6789 return TableLookupLanes(v, indices);
6790}
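// Example (illustrative sketch): Compress moves the lanes selected by the mask
// to the front, preserving their order. With v = 0..7 (u64 lanes) and a mask
// that is true for lanes 0, 2, 4 and 6:
//   hn::Compress(v, m);  // first 4 lanes: 0,2,4,6. For 8-byte lanes the
//                        // remaining lanes are 1,3,5,7 (CompressIsPartition);
//                        // for other lane sizes they should not be relied on.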
6791
6792// ------------------------------ Expand
6793
6794template <typename T, HWY_IF_T_SIZE(T, 1)>
6795HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
6796 const Full512<T> d;
6797#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
6798 const RebindToUnsigned<decltype(d)> du;
6799 const auto mu = RebindMask(du, mask);
6800 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
6801#else
6802 // LUTs are infeasible for 2^64 possible masks, so splice together two
6803 // half-vector Expand.
6804 const Full256<T> dh;
6805 constexpr size_t N = Lanes(d);
6806 // We have to shift the input by a variable number of u8. Shuffling requires
6807 // VBMI2, in which case we would already have NativeExpand. We instead
6808 // load at an offset, which may incur a store to load forwarding stall.
6809 alignas(64) T lanes[N];
6810 Store(v, d, lanes);
6811 using Bits = typename Mask256<T>::Raw;
6812 const Mask256<T> maskL{
6813 static_cast<Bits>(mask.raw & Bits{(1ULL << (N / 2)) - 1})};
6814 const Mask256<T> maskH{static_cast<Bits>(mask.raw >> (N / 2))};
6815 const size_t countL = CountTrue(dh, maskL);
6816 const Vec256<T> expandL = Expand(LowerHalf(v), maskL);
6817 const Vec256<T> expandH = Expand(LoadU(dh, lanes + countL), maskH);
6818 return Combine(d, expandH, expandL);
6819#endif
6820}
6821
6822template <typename T, HWY_IF_T_SIZE(T, 2)>
6823HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
6824 const Full512<T> d;
6825 const RebindToUnsigned<decltype(d)> du;
6826 const Vec512<uint16_t> vu = BitCast(du, v);
6827#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
6828 return BitCast(d, detail::NativeExpand(vu, RebindMask(du, mask)));
6829#else // AVX3
6830 // LUTs are infeasible for 2^32 possible masks, so splice together two
6831 // half-vector Expand.
6832 const Full256<T> dh;
6833 constexpr size_t N = Lanes(d);
6834 using Bits = typename Mask256<T>::Raw;
6835 const Mask256<T> maskL{
6836 static_cast<Bits>(mask.raw & Bits{(1ULL << (N / 2)) - 1})};
6837 const Mask256<T> maskH{static_cast<Bits>(mask.raw >> (N / 2))};
6838 // In AVX3 we can permutevar, which avoids a potential store to load
6839 // forwarding stall vs. reloading the input.
6840 alignas(64) uint16_t iota[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
6841 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
6842 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
6843 const Vec512<uint16_t> indices = LoadU(du, iota + CountTrue(dh, maskL));
6844 const Vec512<uint16_t> shifted{_mm512_permutexvar_epi16(indices.raw, vu.raw)};
6845 const Vec256<T> expandL = Expand(LowerHalf(v), maskL);
6846 const Vec256<T> expandH = Expand(LowerHalf(BitCast(d, shifted)), maskH);
6847 return Combine(d, expandH, expandL);
6848#endif // AVX3
6849}
6850
6851template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
6852HWY_API V Expand(V v, const M mask) {
6853 const DFromV<decltype(v)> d;
6854 const RebindToUnsigned<decltype(d)> du;
6855 const auto mu = RebindMask(du, mask);
6856 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
6857}
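// Usage sketch (editorial, not part of the original header). Expand is the
// inverse of Compress: the j-th true lane of the mask receives v[j] and false
// lanes become zero (the native path is a zero-masking VPEXPAND*). Names are
// illustrative; assumes the usual Highway setup on an AVX-512 target.
HWY_ATTR void DemoExpandU32(const uint32_t* HWY_RESTRICT packed,
                            uint32_t* HWY_RESTRICT out) {
  const ScalableTag<uint32_t> d;  // 16 lanes on AVX-512
  const auto v = LoadU(d, packed);
  // Mask of the even-indexed lanes: 1 0 1 0 ...
  const auto even = Eq(And(Iota(d, uint32_t{0}), Set(d, uint32_t{1})), Zero(d));
  // out = packed[0], 0, packed[1], 0, packed[2], 0, ...
  StoreU(Expand(v, even), d, out);
}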
6858
6859// For smaller vectors, it is likely more efficient to promote to 32-bit.
6860// This works for u8x16, u16x8, u16x16 (can be promoted to u32x16), but is
6861// unnecessary if HWY_AVX3_DL, which provides native instructions.
6862#if HWY_TARGET > HWY_AVX3_DL // no VBMI2
6863
6864template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
6865 HWY_IF_LANES_LE_D(DFromV<V>, 16)>
6866HWY_API V Expand(V v, M mask) {
6867 const DFromV<V> d;
6868 const RebindToUnsigned<decltype(d)> du;
6869 const Rebind<uint32_t, decltype(d)> du32;
6870 const VFromD<decltype(du)> vu = BitCast(du, v);
6871 using M32 = MFromD<decltype(du32)>;
6872 const M32 m32{static_cast<typename M32::Raw>(mask.raw)};
6873 return BitCast(d, TruncateTo(du, Expand(PromoteTo(du32, vu), m32)));
6874}
6875
6876#endif // HWY_TARGET > HWY_AVX3_DL
6877
6878// ------------------------------ LoadExpand
6879
6880template <class D, HWY_IF_V_SIZE_D(D, 64),
6881 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
6882HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
6883 const TFromD<D>* HWY_RESTRICT unaligned) {
6884#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
6885 const RebindToUnsigned<decltype(d)> du;
6886 using TU = TFromD<decltype(du)>;
6887 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
6888 const MFromD<decltype(du)> mu = RebindMask(du, mask);
6889 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
6890#else
6891 return Expand(LoadU(d, unaligned), mask);
6892#endif
6893}
6894
6895template <class D, HWY_IF_V_SIZE_D(D, 64),
6896 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
6897HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
6898 const TFromD<D>* HWY_RESTRICT unaligned) {
6899 const RebindToUnsigned<decltype(d)> du;
6900 using TU = TFromD<decltype(du)>;
6901 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
6902 const MFromD<decltype(du)> mu = RebindMask(du, mask);
6903 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
6904}
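// Usage sketch (editorial, not part of the original header): LoadExpand(m, d,
// p) is equivalent to Expand(LoadU(d, p), m); on AVX3_DL it maps to a single
// expand-load instruction. Identifiers are illustrative.
HWY_ATTR void DemoLoadExpand(MFromD<ScalableTag<uint16_t>> mask,
                             const uint16_t* HWY_RESTRICT packed,
                             uint16_t* HWY_RESTRICT out) {
  const ScalableTag<uint16_t> d;
  StoreU(LoadExpand(mask, d, packed), d, out);
}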
6905
6906// ------------------------------ CompressNot
6907
6908template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
6909HWY_API V CompressNot(V v, const M mask) {
6910 return Compress(v, Not(mask));
6911}
6912
6913template <typename T, HWY_IF_T_SIZE(T, 8)>
6914HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
6915 // See CompressIsPartition. u64 is faster than u32.
6916 alignas(16) static constexpr uint64_t packed_array[256] = {
6917 // From PrintCompressNot32x8Tables, without the FirstN extension (there is
6918 // no benefit to including them because 64-bit CompressStore is anyway
6919 // masked, but also no harm because TableLookupLanes ignores the MSB).
6920 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
6921 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
6922 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
6923 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
6924 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
6925 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
6926 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
6927 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
6928 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
6929 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
6930 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
6931 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
6932 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
6933 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
6934 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
6935 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
6936 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
6937 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
6938 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
6939 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
6940 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
6941 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
6942 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
6943 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
6944 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
6945 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
6946 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
6947 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
6948 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
6949 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
6950 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
6951 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
6952 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
6953 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
6954 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
6955 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
6956 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
6957 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
6958 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
6959 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
6960 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
6961 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
6962 0x76543210, 0x76543201, 0x76543210, 0x76543210};
6963
6964 // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
6965 // _mm512_permutexvar_epi64 will ignore the upper bits.
6966 const DFromV<decltype(v)> d;
6967 const RebindToUnsigned<decltype(d)> du64;
6968 const auto packed = Set(du64, packed_array[mask.raw]);
6969 alignas(64) static constexpr uint64_t shifts[8] = {0, 4, 8, 12,
6970 16, 20, 24, 28};
6971 const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
6972 return TableLookupLanes(v, indices);
6973}
6974
6975// uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
6976// no-op for 128-bit.
6977template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
6978HWY_API V CompressBlocksNot(V v, M mask) {
6979 return CompressNot(v, mask);
6980}
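// Usage sketch (editorial, not part of the original header): CompressNot
// gathers the lanes whose mask is false to the front, i.e. it behaves like
// Compress(v, Not(mask)); for 64-bit lanes the dedicated table above avoids
// inverting the mask. Identifiers are illustrative.
HWY_ATTR void DemoCompressNot(uint64_t* HWY_RESTRICT out) {
  const ScalableTag<uint64_t> d;
  const auto v = Iota(d, uint64_t{0});  // 0, 1, ..., 7
  const auto m = FirstN(d, 3);          // lanes 0..2 true
  // Front of the result: 3, 4, 5, 6, 7; the true lanes 0, 1, 2 follow.
  StoreU(CompressNot(v, m), d, out);
}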
6981
6982// ------------------------------ CompressBits
6983template <class V>
6984HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
6985 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
6986}
6987
6988// ------------------------------ CompressStore
6989
6990// Generic for all vector lengths.
6991
6992template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
6993HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
6994 TFromD<D>* HWY_RESTRICT unaligned) {
6995#if HWY_TARGET == HWY_AVX3_ZEN4
6996 StoreU(Compress(v, mask), d, unaligned);
6997#else
6998 const RebindToUnsigned<decltype(d)> du;
6999 const auto mu = RebindMask(du, mask);
7000 auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);
7001
7002#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
7003 detail::NativeCompressStore(BitCast(du, v), mu, pu);
7004#else
7005 detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
7006#endif
7007#endif // HWY_TARGET != HWY_AVX3_ZEN4
7008 const size_t count = CountTrue(d, mask);
7009 detail::MaybeUnpoison(unaligned, count);
7010 return count;
7011}
7012
7013template <class D, HWY_IF_NOT_FLOAT_D(D),
7014 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
7015HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
7016 TFromD<D>* HWY_RESTRICT unaligned) {
7017#if HWY_TARGET == HWY_AVX3_ZEN4
7018 StoreU(Compress(v, mask), d, unaligned);
7019#else
7020 const RebindToUnsigned<decltype(d)> du;
7021 const auto mu = RebindMask(du, mask);
7022 using TU = TFromD<decltype(du)>;
7023 TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
7024 detail::NativeCompressStore(BitCast(du, v), mu, pu);
7025#endif // HWY_TARGET != HWY_AVX3_ZEN4
7026 const size_t count = CountTrue(d, mask);
7027 detail::MaybeUnpoison(unaligned, count);
7028 return count;
7029}
7030
7031// Additional overloads to avoid casting to uint32_t (delay?).
7032template <class D, HWY_IF_FLOAT3264_D(D)>
7033HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
7034 TFromD<D>* HWY_RESTRICT unaligned) {
7035#if HWY_TARGET == HWY_AVX3_ZEN4
7036 StoreU(Compress(v, mask), d, unaligned);
7037#else
7038 (void)d;
7039 detail::NativeCompressStore(v, mask, unaligned);
7040#endif // HWY_TARGET != HWY_AVX3_ZEN4
7041 const size_t count = PopCount(uint64_t{mask.raw});
7042 detail::MaybeUnpoison(unaligned, count);
7043 return count;
7044}
7045
7046// ------------------------------ CompressBlendedStore
7047template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
7048HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
7049 TFromD<D>* HWY_RESTRICT unaligned) {
7050 // Native CompressStore already does the blending at no extra cost (latency
7051 // 11, rthroughput 2 - same as compress plus store).
7052 if (HWY_TARGET == HWY_AVX3_DL ||
7053 (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
7054 return CompressStore(v, m, d, unaligned);
7055 } else {
7056 const size_t count = CountTrue(d, m);
7057 BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
7058 detail::MaybeUnpoison(unaligned, count);
7059 return count;
7060 }
7061}
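// Usage sketch (editorial, not part of the original header): CompressStore
// may write up to a full vector (lanes past the returned count are
// unspecified), whereas CompressBlendedStore modifies only the first `count`
// elements - the usual choice for stream filtering. Names are illustrative.
HWY_ATTR size_t KeepNonZero(const int32_t* HWY_RESTRICT in, size_t num,
                            int32_t* HWY_RESTRICT out) {
  const ScalableTag<int32_t> d;
  size_t written = 0;
  for (size_t i = 0; i + Lanes(d) <= num; i += Lanes(d)) {
    const auto v = LoadU(d, in + i);
    written += CompressBlendedStore(v, Ne(v, Zero(d)), d, out + written);
  }
  return written;  // a scalar/partial-vector tail would handle the remainder
}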
7062
7063// ------------------------------ CompressBitsStore
7064// Generic for all vector lengths.
7065template <class D>
7066HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
7067 D d, TFromD<D>* HWY_RESTRICT unaligned) {
7068 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
7069}
7070
7071// ------------------------------ LoadInterleaved4
7072
7073// Actually implemented in generic_ops, we just overload LoadTransposedBlocks4.
7074namespace detail {
7075
7076// Type-safe wrapper.
7077template <_MM_PERM_ENUM kPerm, typename T>
7078Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
7079 const DFromV<decltype(lo)> d;
7080 const RebindToUnsigned<decltype(d)> du;
7081 return BitCast(d, VFromD<decltype(du)>{_mm512_shuffle_i64x2(
7082 BitCast(du, lo).raw, BitCast(du, hi).raw, kPerm)});
7083}
7084template <_MM_PERM_ENUM kPerm>
7085Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
7086 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)};
7087}
7088template <_MM_PERM_ENUM kPerm>
7089Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) {
7090 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)};
7091}
7092
7093// Input (128-bit blocks):
7094// 3 2 1 0 (<- first block in unaligned)
7095// 7 6 5 4
7096// b a 9 8
7097// Output:
7098// 9 6 3 0 (LSB of A)
7099// a 7 4 1
7100// b 8 5 2
7101template <class D, HWY_IF_V_SIZE_D(D, 64)>
7102HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
7103 VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
7104 constexpr size_t N = Lanes(d);
7105 const VFromD<D> v3210 = LoadU(d, unaligned + 0 * N);
7106 const VFromD<D> v7654 = LoadU(d, unaligned + 1 * N);
7107 const VFromD<D> vba98 = LoadU(d, unaligned + 2 * N);
7108
7109 const VFromD<D> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
7110 const VFromD<D> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
7111
7112 A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
7113 B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
7114 C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
7115}
7116
7117// Input (128-bit blocks):
7118// 3 2 1 0 (<- first block in unaligned)
7119// 7 6 5 4
7120// b a 9 8
7121// f e d c
7122// Output:
7123// c 8 4 0 (LSB of A)
7124// d 9 5 1
7125// e a 6 2
7126// f b 7 3
7127template <class D, HWY_IF_V_SIZE_D(D, 64)>
7128HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
7129 VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
7130 VFromD<D>& vD) {
7131 constexpr size_t N = Lanes(d);
7132 const VFromD<D> v3210 = LoadU(d, unaligned + 0 * N);
7133 const VFromD<D> v7654 = LoadU(d, unaligned + 1 * N);
7134 const VFromD<D> vba98 = LoadU(d, unaligned + 2 * N);
7135 const VFromD<D> vfedc = LoadU(d, unaligned + 3 * N);
7136
7137 const VFromD<D> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
7138 const VFromD<D> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
7139 const VFromD<D> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
7140 const VFromD<D> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
7141 vA = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
7142 vB = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
7143 vC = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
7144 vD = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
7145}
7146
7147} // namespace detail
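// Usage sketch (editorial, not part of the original header): the overloads
// above only provide the cross-block (128-bit) transpose; the public entry
// points are LoadInterleaved2/3/4 in generic_ops-inl.h. Names are
// illustrative.
HWY_ATTR void DeinterleaveRGBA(const uint8_t* HWY_RESTRICT rgba,
                               uint8_t* HWY_RESTRICT red_out) {
  const ScalableTag<uint8_t> d;  // 64 lanes on an AVX-512 target
  VFromD<decltype(d)> r, g, b, a;
  LoadInterleaved4(d, rgba, r, g, b, a);  // reads 4 * Lanes(d) bytes
  StoreU(r, d, red_out);
}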
7148
7149// ------------------------------ StoreInterleaved2
7150
7151// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
7152
7153namespace detail {
7154
7155// Input (128-bit blocks):
7156// 6 4 2 0 (LSB of i)
7157// 7 5 3 1
7158// Output:
7159// 3 2 1 0
7160// 7 6 5 4
7161template <class D, HWY_IF_V_SIZE_D(D, 64)>
7162HWY_API void StoreTransposedBlocks2(const VFromD<D> i, const VFromD<D> j, D d,
7163 TFromD<D>* HWY_RESTRICT unaligned) {
7164 constexpr size_t N = Lanes(d);
7165 const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
7166 const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
7167 const auto j1_i1_j0_i0 =
7168 detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
7169 const auto j3_i3_j2_i2 =
7170 detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
7171 StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
7172 StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);
7173}
7174
7175// Input (128-bit blocks):
7176// 9 6 3 0 (LSB of i)
7177// a 7 4 1
7178// b 8 5 2
7179// Output:
7180// 3 2 1 0
7181// 7 6 5 4
7182// b a 9 8
7183template <class D, HWY_IF_V_SIZE_D(D, 64)>
7184HWY_API void StoreTransposedBlocks3(const VFromD<D> i, const VFromD<D> j,
7185 const VFromD<D> k, D d,
7186 TFromD<D>* HWY_RESTRICT unaligned) {
7187 constexpr size_t N = Lanes(d);
7188 const VFromD<D> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
7189 const VFromD<D> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
7190 const VFromD<D> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
7191
7192 const VFromD<D> out0 = // i1 k0 j0 i0
7193 detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
7194 const VFromD<D> out1 = // j2 i2 k1 j1
7195 detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
7196 const VFromD<D> out2 = // k3 j3 i3 k2
7197 detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
7198
7199 StoreU(out0, d, unaligned + 0 * N);
7200 StoreU(out1, d, unaligned + 1 * N);
7201 StoreU(out2, d, unaligned + 2 * N);
7202}
7203
7204// Input (128-bit blocks):
7205// c 8 4 0 (LSB of i)
7206// d 9 5 1
7207// e a 6 2
7208// f b 7 3
7209// Output:
7210// 3 2 1 0
7211// 7 6 5 4
7212// b a 9 8
7213// f e d c
7214template <class D, HWY_IF_V_SIZE_D(D, 64)>
7215HWY_API void StoreTransposedBlocks4(const VFromD<D> i, const VFromD<D> j,
7216 const VFromD<D> k, const VFromD<D> l, D d,
7217 TFromD<D>* HWY_RESTRICT unaligned) {
7218 constexpr size_t N = Lanes(d);
7219 const VFromD<D> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
7220 const VFromD<D> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
7221 const VFromD<D> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
7222 const VFromD<D> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
7223 const VFromD<D> out0 =
7224 detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
7225 const VFromD<D> out1 =
7226 detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
7227 const VFromD<D> out2 =
7228 detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
7229 const VFromD<D> out3 =
7230 detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
7231 StoreU(out0, d, unaligned + 0 * N);
7232 StoreU(out1, d, unaligned + 1 * N);
7233 StoreU(out2, d, unaligned + 2 * N);
7234 StoreU(out3, d, unaligned + 3 * N);
7235}
7236
7237} // namespace detail
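// Usage sketch (editorial, not part of the original header): these overloads
// feed StoreInterleaved2/3/4 in generic_ops-inl.h, which perform the
// within-block interleaving. Names are illustrative.
HWY_ATTR void InterleaveRGB(const uint8_t* HWY_RESTRICT r,
                            const uint8_t* HWY_RESTRICT g,
                            const uint8_t* HWY_RESTRICT b,
                            uint8_t* HWY_RESTRICT rgb) {
  const ScalableTag<uint8_t> d;
  // Writes 3 * Lanes(d) bytes: r0 g0 b0 r1 g1 b1 ...
  StoreInterleaved3(LoadU(d, r), LoadU(d, g), LoadU(d, b), d, rgb);
}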
7238
7239// ------------------------------ Additional mask logical operations
7240
7241template <class T>
7242HWY_API Mask512<T> SetAtOrAfterFirst(Mask512<T> mask) {
7243 return Mask512<T>{
7244 static_cast<typename Mask512<T>::Raw>(0u - detail::AVX3Blsi(mask.raw))};
7245}
7246template <class T>
7247HWY_API Mask512<T> SetBeforeFirst(Mask512<T> mask) {
7248 return Mask512<T>{
7249 static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsi(mask.raw) - 1u)};
7250}
7251template <class T>
7252HWY_API Mask512<T> SetAtOrBeforeFirst(Mask512<T> mask) {
7253 return Mask512<T>{
7254 static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsmsk(mask.raw))};
7255}
7256template <class T>
7257HWY_API Mask512<T> SetOnlyFirst(Mask512<T> mask) {
7258 return Mask512<T>{
7259 static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsi(mask.raw))};
7260}
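// Usage sketch (editorial, not part of the original header). Assuming these
// are the usual SetAtOrAfterFirst / SetBeforeFirst / SetAtOrBeforeFirst /
// SetOnlyFirst overloads (the BLSI/BLSMSK bodies match those semantics), the
// sketch shows their effect when the first true lane is lane 5 of 16.
HWY_ATTR void DemoFirstTrueMasks() {
  const ScalableTag<uint32_t> d;  // 16 lanes on an AVX-512 target
  const auto m = Eq(Iota(d, uint32_t{0}), Set(d, uint32_t{5}));  // only lane 5
  const auto only = SetOnlyFirst(m);         // lane 5
  const auto before = SetBeforeFirst(m);     // lanes 0..4 (== FirstN(d, 5))
  const auto up_to = SetAtOrBeforeFirst(m);  // lanes 0..5
  const auto from = SetAtOrAfterFirst(m);    // lanes 5..15
  (void)only; (void)before; (void)up_to; (void)from;
}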
7261
7262// ------------------------------ Shl (Dup128VecFromValues)
7263
7264HWY_API Vec512<uint16_t> operator<<(Vec512<uint16_t> v, Vec512<uint16_t> bits) {
7265 return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
7266}
7267
7268// 8-bit: may use the << overload for uint16_t.
7269HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
7270 const DFromV<decltype(v)> d;
7271#if HWY_TARGET <= HWY_AVX3_DL
7272 // kMask[i] = 0xFF >> i
7273 const VFromD<decltype(d)> masks =
7274 Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
7275 0, 0, 0, 0, 0, 0, 0);
7276 // kShl[i] = 1 << i
7277 const VFromD<decltype(d)> shl =
7278 Dup128VecFromValues(d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0,
7279 0, 0, 0, 0, 0, 0, 0);
7280 v = And(v, TableLookupBytes(masks, bits));
7281 const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
7282 return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)};
7283#else
7284 const Repartition<uint16_t, decltype(d)> dw;
7285 using VW = VFromD<decltype(dw)>;
7286 const VW even_mask = Set(dw, 0x00FF);
7287 const VW odd_mask = Set(dw, 0xFF00);
7288 const VW vw = BitCast(dw, v);
7289 const VW bits16 = BitCast(dw, bits);
7290 // Shift even lanes in-place
7291 const VW evens = vw << And(bits16, even_mask);
7292 const VW odds = And(vw, odd_mask) << ShiftRight<8>(bits16);
7293 return OddEven(BitCast(d, odds), BitCast(d, evens));
7294#endif
7295}
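// Editorial note (not part of the original header) on the AVX3_DL path above:
// _mm512_gf2p8mul_epi8 multiplies in GF(2^8) and only reduces modulo the AES
// polynomial when the carry-less product exceeds 8 bits. Masking each byte
// with 0xFF >> bits[i] first guarantees the product fits in 8 bits, so
// multiplying by 1 << bits[i] is exactly a left shift. Scalar model of one
// lane (valid for shift counts 0..7):
static inline uint8_t ShiftLeftOneLane(uint8_t a, uint8_t k) {
  const uint8_t masked = static_cast<uint8_t>(a & (0xFF >> k));  // drop bits that would overflow
  return static_cast<uint8_t>(masked << k);  // == gf2p8mul(masked, 1 << k)
}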
7296
7297HWY_API Vec512<uint32_t> operator<<(const Vec512<uint32_t> v,
7298 const Vec512<uint32_t> bits) {
7299 return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
7300}
7301
7302HWY_API Vec512<uint64_t> operator<<(const Vec512<uint64_t> v,
7303 const Vec512<uint64_t> bits) {
7304 return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
7305}
7306
7307// Signed left shift is the same as unsigned.
7308template <typename T, HWY_IF_SIGNED(T)>
7309HWY_API Vec512<T> operator<<(Vec512<T> v, Vec512<T> bits) {
7310 const DFromV<decltype(v)> di;
7311 const RebindToUnsigned<decltype(di)> du;
7312 return BitCast(di, BitCast(du, v) << BitCast(du, bits));
7313}
7314
7315// ------------------------------ Shr (IfVecThenElse)
7316
7317HWY_API Vec512<uint16_t> operator>>(const Vec512<uint16_t> v,
7318 const Vec512<uint16_t> bits) {
7319 return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
7320}
7321
7322// 8-bit uses 16-bit shifts.
7323HWY_API Vec512<uint8_t> operator>>(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
7324 const DFromV<decltype(v)> d;
7325 const RepartitionToWide<decltype(d)> dw;
7326 using VW = VFromD<decltype(dw)>;
7327 const VW mask = Set(dw, 0x00FF);
7328 const VW vw = BitCast(dw, v);
7329 const VW bits16 = BitCast(dw, bits);
7330 const VW evens = And(vw, mask) >> And(bits16, mask);
7331 // Shift odd lanes in-place
7332 const VW odds = vw >> ShiftRight<8>(bits16);
7333 return OddEven(BitCast(d, odds), BitCast(d, evens));
7334}
7335
7336HWY_API Vec512<uint32_t> operator>>(const Vec512<uint32_t> v,
7337 const Vec512<uint32_t> bits) {
7338 return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
7339}
7340
7341HWY_API Vec512<uint64_t> operator>>(const Vec512<uint64_t> v,
7342 const Vec512<uint64_t> bits) {
7343 return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
7344}
7345
7346HWY_API Vec512<int16_t> operator>>(const Vec512<int16_t> v,
7347 const Vec512<int16_t> bits) {
7348 return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
7349}
7350
7351// 8-bit uses 16-bit shifts.
7352HWY_API Vec512<int8_t> operator>>(Vec512<int8_t> v, Vec512<int8_t> bits) {
7353 const DFromV<decltype(v)> d;
7354 const RepartitionToWide<decltype(d)> dw;
7355 const RebindToUnsigned<decltype(dw)> dw_u;
7356 using VW = VFromD<decltype(dw)>;
7357 const VW mask = Set(dw, 0x00FF);
7358 const VW vw = BitCast(dw, v);
7359 const VW bits16 = BitCast(dw, bits);
7360 const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >> And(bits16, mask);
7361 // Shift odd lanes in-place
7362 const VW odds = vw >> BitCast(dw, ShiftRight<8>(BitCast(dw_u, bits16)));
7363 return OddEven(BitCast(d, odds), BitCast(d, evens));
7364}
7365
7366HWY_API Vec512<int32_t> operator>>(const Vec512<int32_t> v,
7367 const Vec512<int32_t> bits) {
7368 return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
7369}
7370
7371HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
7372 const Vec512<int64_t> bits) {
7373 return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
7374}
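// Usage sketch (editorial, not part of the original header): these operator
// overloads are per-lane (variable) shifts; as elsewhere in Highway, every
// shift count must be less than the lane width. Names are illustrative.
HWY_ATTR void DemoVariableShift(int32_t* HWY_RESTRICT out) {
  const ScalableTag<int32_t> d;             // 16 lanes on an AVX-512 target
  const auto v = Set(d, int32_t{-16});
  const auto counts = Iota(d, int32_t{0});  // per-lane shift counts 0, 1, 2, ...
  // Arithmetic shift keeps the sign: -16, -8, -4, -2, -1, -1, ...
  StoreU(v >> counts, d, out);
}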
7375
7376// ------------------------------ WidenMulPairwiseAdd
7377template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
7378HWY_API VFromD<D> WidenMulPairwiseAdd(D /* tag */, Vec512<int16_t> a,
7379 Vec512<int16_t> b) {
7380 return VFromD<D>{_mm512_madd_epi16(a.raw, b.raw)};
7381}
7382
7383// ------------------------------ SatWidenMulPairwiseAdd
7384
7385template <class DI16, HWY_IF_V_SIZE_D(DI16, 64), HWY_IF_I16_D(DI16)>
7386HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
7387 DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
7388 VFromD<Repartition<int8_t, DI16>> b) {
7389 return VFromD<DI16>{_mm512_maddubs_epi16(a.raw, b.raw)};
7390}
7391
7392// ------------------------------ SatWidenMulPairwiseAccumulate
7393#if HWY_TARGET <= HWY_AVX3_DL
7394template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)>
7395HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
7396 DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
7397 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
7398 return VFromD<DI32>{_mm512_dpwssds_epi32(sum.raw, a.raw, b.raw)};
7399}
7400#endif // HWY_TARGET <= HWY_AVX3_DL
7401
7402// ------------------------------ ReorderWidenMulAccumulate
7403template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
7404HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a,
7405                                            Vec512<int16_t> b,
7406 const VFromD<D> sum0,
7407 VFromD<D>& /*sum1*/) {
7408 (void)d;
7409#if HWY_TARGET <= HWY_AVX3_DL
7410 return VFromD<D>{_mm512_dpwssd_epi32(sum0.raw, a.raw, b.raw)};
7411#else
7412 return sum0 + WidenMulPairwiseAdd(d, a, b);
7413#endif
7414}
7415
7416HWY_API Vec512<int32_t> RearrangeToOddPlusEven(const Vec512<int32_t> sum0,
7417 Vec512<int32_t> /*sum1*/) {
7418 return sum0; // invariant already holds
7419}
7420
7421HWY_API Vec512<uint32_t> RearrangeToOddPlusEven(const Vec512<uint32_t> sum0,
7422 Vec512<uint32_t> /*sum1*/) {
7423 return sum0; // invariant already holds
7424}
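// Usage sketch (editorial, not part of the original header). On this target
// the i16 dot product accumulates into sum0 and never touches sum1, so
// RearrangeToOddPlusEven simply returns sum0; portable code still calls it
// because other targets do use sum1. Names are illustrative.
HWY_ATTR int32_t DotI16(const int16_t* HWY_RESTRICT a,
                        const int16_t* HWY_RESTRICT b) {
  const ScalableTag<int32_t> d32;
  const Repartition<int16_t, decltype(d32)> d16;
  auto sum0 = Zero(d32);
  auto sum1 = Zero(d32);
  sum0 = ReorderWidenMulAccumulate(d32, LoadU(d16, a), LoadU(d16, b), sum0,
                                   sum1);
  return GetLane(SumOfLanes(d32, RearrangeToOddPlusEven(sum0, sum1)));
}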
7425
7426// ------------------------------ SumOfMulQuadAccumulate
7427
7428#if HWY_TARGET <= HWY_AVX3_DL
7429
7430template <class DI32, HWY_IF_V_SIZE_D(DI32, 64)>
7431HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
7432 DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
7433 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
7434 return VFromD<DI32>{_mm512_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
7435}
7436
7437#endif
7438
7439// ------------------------------ Reductions
7440
7441namespace detail {
7442
7443// Used by generic_ops-inl
7444template <class D, class Func, HWY_IF_V_SIZE_D(D, 64)>
7445HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
7446 v = f(v, SwapAdjacentBlocks(v));
7447 return f(v, ReverseBlocks(d, v));
7448}
7449
7450} // namespace detail
7451
7452// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
7453
7454template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
7455HWY_API V LeadingZeroCount(V v) {
7456 return V{_mm512_lzcnt_epi32(v.raw)};
7457}
7458
7459template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
7460HWY_API V LeadingZeroCount(V v) {
7461 return V{_mm512_lzcnt_epi64(v.raw)};
7462}
7463
7464namespace detail {
7465
7466template <class V, HWY_IF_UNSIGNED_V(V),
7467 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
7468 HWY_IF_LANES_LE_D(DFromV<V>, 16)>
7469HWY_INLINE V Lzcnt32ForU8OrU16(V v) {
7470 const DFromV<decltype(v)> d;
7471 const Rebind<int32_t, decltype(d)> di32;
7472 const Rebind<uint32_t, decltype(d)> du32;
7473
7474 const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v));
7475 return DemoteTo(d, BitCast(di32, v_lz_count));
7476}
7477
7478template <class V, HWY_IF_UNSIGNED_V(V),
7479 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
7480          HWY_IF_LANES_D(DFromV<V>, 32)>
7481HWY_INLINE VFromD<Rebind<uint16_t, DFromV<V>>> Lzcnt32ForU8OrU16AsU16(V v) {
7482 const DFromV<decltype(v)> d;
7483 const Half<decltype(d)> dh;
7484 const Rebind<int32_t, decltype(dh)> di32;
7485 const Rebind<uint32_t, decltype(dh)> du32;
7486 const Rebind<uint16_t, decltype(d)> du16;
7487
7488 const auto lo_v_lz_count =
7489 LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v)));
7490 const auto hi_v_lz_count =
7491 LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v)));
7492 return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count),
7493 BitCast(di32, hi_v_lz_count));
7494}
7495
7496HWY_INLINE Vec256<uint8_t> Lzcnt32ForU8OrU16(Vec256<uint8_t> v) {
7497 const DFromV<decltype(v)> d;
7498 const Rebind<int16_t, decltype(d)> di16;
7499 return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v)));
7500}
7501
7502HWY_INLINE Vec512<uint8_t> Lzcnt32ForU8OrU16(Vec512<uint8_t> v) {
7503 const DFromV<decltype(v)> d;
7504 const Half<decltype(d)> dh;
7505 const Rebind<int16_t, decltype(dh)> di16;
7506
7507 const auto lo_half = LowerHalf(dh, v);
7508 const auto hi_half = UpperHalf(dh, v);
7509
7510 const auto lo_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(lo_half));
7511 const auto hi_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(hi_half));
7512 return OrderedDemote2To(d, lo_v_lz_count, hi_v_lz_count);
7513}
7514
7515HWY_INLINE Vec512<uint16_t> Lzcnt32ForU8OrU16(Vec512<uint16_t> v) {
7516 return Lzcnt32ForU8OrU16AsU16(v);
7517}
7518
7519} // namespace detail
7520
7521template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
7522 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
7523HWY_API V LeadingZeroCount(V v) {
7524 const DFromV<decltype(v)> d;
7525 const RebindToUnsigned<decltype(d)> du;
7526 using TU = TFromD<decltype(du)>;
7527
7528 constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
7529 const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v));
7530 return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}),
7531 Set(du, TU{kNumOfBitsInT})));
7532}
7533
7534template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
7535 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
7536HWY_API V HighestSetBitIndex(V v) {
7537 const DFromV<decltype(v)> d;
7538 const RebindToUnsigned<decltype(d)> du;
7539 using TU = TFromD<decltype(du)>;
7540 return BitCast(d,
7541 Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16(BitCast(du, v)));
7542}
7543
7544template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
7545 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
7546HWY_API V HighestSetBitIndex(V v) {
7547 const DFromV<decltype(v)> d;
7548 using T = TFromD<decltype(d)>;
7549 return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
7550}
7551
7552template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
7553HWY_API V TrailingZeroCount(V v) {
7554 const DFromV<decltype(v)> d;
7555 const RebindToSigned<decltype(d)> di;
7556 using T = TFromD<decltype(d)>;
7557
7558 const auto vi = BitCast(di, v);
7559 const auto lowest_bit = BitCast(d, And(vi, Neg(vi)));
7560 constexpr T kNumOfBitsInT{sizeof(T) * 8};
7561 const auto bit_idx = HighestSetBitIndex(lowest_bit);
7562 return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx);
7563}
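// Usage sketch (editorial, not part of the original header). Per lane:
// LeadingZeroCount counts zero bits above the highest set bit,
// HighestSetBitIndex returns that bit's index, and TrailingZeroCount counts
// zero bits below the lowest set bit (zero lanes yield the lane width, per
// the IfThenElse above). Names are illustrative.
HWY_ATTR void DemoBitScan(uint32_t* HWY_RESTRICT out) {
  const ScalableTag<uint32_t> d;
  const auto v = Set(d, uint32_t{8});   // 0b1000 in every lane
  StoreU(LeadingZeroCount(v), d, out);  // 28 in every lane
  // HighestSetBitIndex(v) == 3 and TrailingZeroCount(v) == 3 in every lane.
}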
7564
7565// NOLINTNEXTLINE(google-readability-namespace-comments)
7566} // namespace HWY_NAMESPACE
7567} // namespace hwy
7568HWY_AFTER_NAMESPACE();
7569
7570// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
7571// the warning seems to be issued at the call site of intrinsics, i.e. our code.
7572HWY_DIAGNOSTICS(pop)
#define HWY_RESTRICT
Definition base.h:95
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:109
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:110
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array)
Definition base.h:645
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)
Definition base.h:635
Definition arm_neon-inl.h:865
Raw raw
Definition arm_neon-inl.h:878
Definition arm_neon-inl.h:813
Raw raw
Definition arm_neon-inl.h:851
Definition wasm_256-inl.h:27
Raw raw
Definition x86_256-inl.h:117
Definition x86_512-inl.h:134
HWY_INLINE Vec512 & operator/=(const Vec512 other)
Definition x86_512-inl.h:146
typename detail::Raw512< T >::type Raw
Definition x86_512-inl.h:135
Raw raw
Definition x86_512-inl.h:168
HWY_INLINE Vec512 & operator|=(const Vec512 other)
Definition x86_512-inl.h:161
HWY_INLINE Vec512 & operator%=(const Vec512 other)
Definition x86_512-inl.h:155
T PrivateT
Definition x86_512-inl.h:138
HWY_INLINE Vec512 & operator+=(const Vec512 other)
Definition x86_512-inl.h:149
HWY_INLINE Vec512 & operator^=(const Vec512 other)
Definition x86_512-inl.h:164
HWY_INLINE Vec512 & operator-=(const Vec512 other)
Definition x86_512-inl.h:152
HWY_INLINE Vec512 & operator*=(const Vec512 other)
Definition x86_512-inl.h:143
static constexpr size_t kPrivateN
Definition x86_512-inl.h:139
HWY_INLINE Vec512 & operator&=(const Vec512 other)
Definition x86_512-inl.h:158
#define HWY_COMPILER_CLANGCL
Definition detect_compiler_arch.h:45
#define HWY_AVX3_DL
Definition detect_targets.h:73
#define HWY_AVX3_SPR
Definition detect_targets.h:63
#define HWY_TARGET
Definition detect_targets.h:543
#define HWY_AVX3_ZEN4
Definition detect_targets.h:68
HWY_INLINE V SlideUpI32Lanes(V v)
Definition x86_512-inl.h:4971
HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV< V > t)
Definition x86_128-inl.h:6289
HWY_API Vec32< T > ShuffleTwo1230(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:927
HWY_INLINE Vec256< T > BroadcastLane(hwy::SizeTag< 0 >, Vec256< T > v)
Definition x86_256-inl.h:4186
HWY_INLINE void MaybeUnpoison(T *HWY_RESTRICT unaligned, size_t count)
Definition ops/shared-inl.h:151
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition x86_128-inl.h:1334
HWY_INLINE VFromD< D > TableLookupSlideDownLanes(D d, VFromD< D > v, size_t amt)
Definition x86_256-inl.h:5786
HWY_API Vec128< T, N > GaloisAffine(Vec128< T, N > v, VFromD< Repartition< uint64_t, Simd< T, N, 0 > > > matrix)
Definition x86_128-inl.h:1870
HWY_INLINE VFromD< DI > FixConversionOverflow(DI di, VFromD< RebindToFloat< DI > > original, VFromD< DI > converted)
Definition x86_128-inl.h:10061
HWY_INLINE Vec128< uint8_t, N > EmuCompress(Vec128< uint8_t, N > v, Mask128< uint8_t, N > mask)
Definition x86_512-inl.h:6563
static HWY_INLINE uint32_t AVX3Blsi(T x)
Definition x86_128-inl.h:12517
HWY_INLINE V CombineShiftRightI64Lanes(V hi, V lo)
Definition x86_256-inl.h:5526
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition x86_128-inl.h:839
HWY_INLINE Mask128< T > Not(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition x86_128-inl.h:1653
HWY_INLINE Mask128< T, N > ExclusiveNeither(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1593
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition wasm_128-inl.h:5084
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1445
HWY_INLINE void StoreTransposedBlocks3(VFromD< D > A, VFromD< D > B, VFromD< D > C, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1652
HWY_INLINE V SlideDownI64Lanes(V v)
Definition x86_256-inl.h:5740
static HWY_INLINE uint32_t AVX3Blsmsk(T x)
Definition x86_128-inl.h:12537
HWY_INLINE void StoreTransposedBlocks2(VFromD< D > A, VFromD< D > B, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:1616
Vec512< T > Shuffle128(const Vec512< T > lo, const Vec512< T > hi)
Definition x86_512-inl.h:7078
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:164
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition x86_256-inl.h:7076
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1519
HWY_INLINE VFromD< DTo > ZeroExtendResizeBitCast(FromSizeTag, ToSizeTag, DTo d_to, DFrom, VFromD< DFrom > v)
Definition emu128-inl.h:140
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1482
HWY_INLINE VFromD< D > Iota0(D d)
Definition arm_neon-inl.h:1239
HWY_INLINE Vec512< T > NativeGather512(const T *HWY_RESTRICT base, Vec512< int32_t > indices)
Definition x86_512-inl.h:3423
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:1402
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition arm_neon-inl.h:2845
HWY_INLINE void StoreTransposedBlocks4(VFromD< D > vA, VFromD< D > vB, VFromD< D > vC, VFromD< D > vD, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:2003
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:1269
HWY_API Vec32< T > ShuffleTwo3012(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:944
HWY_INLINE VFromD< D > NativeLoadExpand(MFromD< D > mask, D, const uint8_t *HWY_RESTRICT unaligned)
Definition x86_128-inl.h:12412
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition x86_128-inl.h:1383
HWY_INLINE V Lzcnt32ForU8OrU16(V v)
Definition x86_512-inl.h:7469
HWY_INLINE Vec128< uint8_t, N > NativeCompress(const Vec128< uint8_t, N > v, const Mask128< uint8_t, N > mask)
Definition x86_512-inl.h:6372
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag, hwy::SizeTag< 1 >, svint8_t v)
Definition arm_sve-inl.h:982
HWY_INLINE VFromD< Rebind< uint16_t, DFromV< V > > > Lzcnt32ForU8OrU16AsU16(V v)
Definition x86_512-inl.h:7481
HWY_INLINE Vec512< T > InsertBlock(hwy::SizeTag< 0 >, Vec512< T > v, Vec128< T > blk_to_insert)
Definition x86_512-inl.h:3622
HWY_INLINE VFromD< D > TableLookupSlideUpLanes(D d, VFromD< D > v, size_t amt)
Definition x86_256-inl.h:5582
HWY_INLINE Vec512< T > NativeMaskedGatherOr512(Vec512< T > no, Mask512< T > m, const T *HWY_RESTRICT base, Vec512< int32_t > indices)
Definition x86_512-inl.h:3447
HWY_INLINE VFromD< D > BitCastFromByte(D, VFromD< D > v)
Definition arm_neon-inl.h:1441
HWY_INLINE V SlideDownI32Lanes(V v)
Definition x86_512-inl.h:5202
HWY_API Vec32< T > ShuffleTwo2301(Vec32< T > a, Vec32< T > b)
Definition ppc_vsx-inl.h:910
HWY_INLINE VFromD< D > ReduceAcrossBlocks(D, Func, VFromD< D > v)
Definition generic_ops-inl.h:998
HWY_INLINE void EmuCompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition x86_512-inl.h:6601
HWY_INLINE Vec128< uint8_t, N > NativeExpand(Vec128< uint8_t, N > v, Mask128< uint8_t, N > mask)
Definition x86_128-inl.h:12400
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:1556
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:2478
HWY_INLINE V CombineShiftRightI32Lanes(V hi, V lo)
Definition x86_256-inl.h:5517
HWY_INLINE V SlideUpI64Lanes(V v)
Definition x86_256-inl.h:5535
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag< 0x88 >, hwy::SizeTag< kLaneSize >, hwy::SizeTag< kVectSize >, V v)
Definition arm_neon-inl.h:6160
HWY_INLINE VFromD< D > Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0)
Definition ppc_vsx-inl.h:2712
HWY_API void ScatterOffset(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2624
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
Simd< T, 16/sizeof(T), 0 > Full128
Definition emu128-inl.h:31
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3221
HWY_API V MaskedMaxOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1489
HWY_API Vec128< uint8_t > operator>>(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2245
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API V MaskedDivOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1512
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API VFromD< DI32 > SatWidenMulPairwiseAccumulate(DI32 di32, VFromD< Repartition< int16_t, DI32 > > a, VFromD< Repartition< int16_t, DI32 > > b, VFromD< DI32 > sum)
Definition generic_ops-inl.h:5179
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T > Shuffle2103(Vec128< T > v)
Definition arm_neon-inl.h:6024
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API intptr_t FindLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8392
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API Mask< D > SlideMask1Up(D d, Mask< D > m)
Definition generic_ops-inl.h:7071
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API Vec128< uint8_t > operator<<(Vec128< uint8_t > v, Vec128< uint8_t > bits)
Definition arm_neon-inl.h:2175
HWY_API Vec128< uint8_t > AESLastRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7447
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2902
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > operator==(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1173
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API V Rol(V a, V b)
Definition generic_ops-inl.h:445
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API void ScatterIndex(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2643
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API Vec128< uint16_t,(N+1)/2 > SumsOfAdjQuadAbsDiff(Vec128< uint8_t, N > a, Vec128< uint8_t, N > b)
Definition x86_128-inl.h:3901
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2806
HWY_API VFromD< DI > ConvertInRangeTo(DI di, VFromD< RebindToFloat< DI > > v)
Definition emu128-inl.h:1900
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2811
HWY_API VFromD< D > SlideDownBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7046
HWY_API Mask128< T, N > operator<=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1214
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8896
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< uint16_t,(N+1)/2 > SumsOfShuffledQuadAbsDiff(Vec128< uint8_t, N > a, Vec128< uint8_t, N > b)
Definition x86_128-inl.h:3943
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API size_t StoreMaskBits(D d, MFromD< D > mask, uint8_t *bits)
Definition arm_neon-inl.h:8402
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2816
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v)
Definition generic_ops-inl.h:869
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API VFromD< D > PromoteInRangeTo(D d64, VFromD< Rebind< float, D > > v)
Definition arm_neon-inl.h:4497
HWY_API V LeadingZeroCount(V v)
Definition arm_neon-inl.h:9506
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API Vec128< uint64_t > CLMulUpper(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7456
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API MFromD< D > LoadMaskBits(D d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8094
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec< DI16 > SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b)
Definition generic_ops-inl.h:5153
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Mask< D > SlideMask1Down(D d, Mask< D > m)
Definition generic_ops-inl.h:7076
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API Vec< RepartitionToWideX3< DFromV< V > > > SumsOf8AbsDiff(V a, V b)
Definition generic_ops-inl.h:2820
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API V MulAddSub(V mul, V x, V sub_or_add)
Definition arm_sve-inl.h:4285
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
HWY_API Vec128< uint8_t > AESRoundInv(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7437
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > ConcatEven(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7047
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API V Ror(V a, V b)
Definition generic_ops-inl.h:459
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< uint8_t > AESRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7418
unsigned int Shift16Count
Definition x86_512-inl.h:1539
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > ShiftRightBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5280
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API Vec128< T, N > operator/(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2511
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API VFromD< D32 > DemoteInRangeTo(D32 d32, VFromD< Rebind< double, D32 > > v)
Definition emu128-inl.h:1845
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API VFromD< D > InterleaveUpper(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:6095
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API VFromD< DI32 > SumOfMulQuadAccumulate(DI32, svint8_t a, svint8_t b, svint32_t sum)
Definition arm_sve-inl.h:5894
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:3225
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_API MFromD< DFromV< V > > IsEitherNaN(V a, V b)
Definition generic_ops-inl.h:1177
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API V HighestSetBitIndex(V v)
Definition arm_neon-inl.h:9523
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API Mask128< T, N > operator<(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1197
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API Vec128< T, N > operator*(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:816
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Mask128< T, 1 > SetAtOrAfterFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9320
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
unsigned int Shift3264Count
Definition x86_512-inl.h:1540
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API V TrailingZeroCount(V v)
Definition arm_neon-inl.h:9530
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API MFromD< D > UpperHalfOfMask(D, MFromD< Twice< D > > m)
Definition x86_128-inl.h:1051
HWY_API VFromD< D > ConcatOdd(D, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:7020
HWY_API V ReverseBits(V v)
Definition generic_ops-inl.h:6464
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_API MFromD< D > CombineMasks(D, MFromD< Half< D > > hi, MFromD< Half< D > > lo)
Definition x86_128-inl.h:959
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
HWY_API VFromD< D > SlideUpBlocks(D, VFromD< D > v)
Definition generic_ops-inl.h:7028
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Mask128< T, N > operator!=(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:1182
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API Vec128< uint8_t > AESKeyGenAssist(Vec128< uint8_t > v)
Definition arm_neon-inl.h:7814
HWY_API Vec128< uint8_t > AESLastRound(Vec128< uint8_t > state, Vec128< uint8_t > round_key)
Definition arm_neon-inl.h:7428
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:8872
HWY_API V BroadcastLane(const V v)
Definition arm_sve-inl.h:4146
HWY_API Vec128< uint64_t > CLMulLower(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:7452
hwy
Definition abort.h:8
FuncOutput(*)(const void *, FuncInput) Func
Definition nanobenchmark.h:87
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2551
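Not part of this header: a minimal sketch of this scalar helper from hwy/base.h; the wrapper function name is illustrative.
#include <cstdint>
#include "hwy/base.h"
// Illustration only: index of the lowest set bit (trailing zero count); the
// argument must be nonzero.
inline size_t LowestSetBitIndex() {
  const uint64_t x = 0x28;  // binary 101000, lowest set bit is bit 3
  return hwy::Num0BitsBelowLS1Bit_Nonzero64(x);  // 3
}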
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2577
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2540
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2588
HWY_API size_t PopCount(T x)
Definition base.h:2615
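Not part of this header: a minimal sketch of the scalar PopCount helper from hwy/base.h; the wrapper function name is illustrative.
#include <cstdint>
#include "hwy/base.h"
// Illustration only: PopCount returns the number of set bits in x.
inline size_t CountSetBits() {
  return hwy::PopCount(uint64_t{0xF0F0});  // 8
}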
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)
Definition ops/shared-inl.h:546
#define HWY_IF_UI32_D(D)
Definition ops/shared-inl.h:591
#define HWY_IF_V_SIZE_LE_V(V, bytes)
Definition ops/shared-inl.h:634
#define HWY_IF_LANES_LE_D(D, lanes)
Definition ops/shared-inl.h:561
#define HWY_IF_V_SIZE_D(D, bytes)
Definition ops/shared-inl.h:605
#define HWY_IF_LANES_D(D, lanes)
Definition ops/shared-inl.h:560
#define HWY_IF_V_SIZE_V(V, bytes)
Definition ops/shared-inl.h:632
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
Definition ops/shared-inl.h:621
#define HWY_HAVE_FLOAT16
Definition set_macros-inl.h:173
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition x86_512-inl.h:4051
__m512i raw
Definition x86_512-inl.h:4052
Definition wasm_256-inl.h:64
typename detail::RawMask256< sizeof(T)>::type Raw
Definition x86_256-inl.h:148
Raw raw
Definition x86_256-inl.h:154
Mask512
Definition x86_512-inl.h:173
typename detail::RawMask512< sizeof(T)>::type Raw
Definition x86_512-inl.h:174
Raw raw
Definition x86_512-inl.h:175
Definition ops/shared-inl.h:198
HWY_INLINE __m512d operator()(__m512i v)
Definition x86_512-inl.h:237
HWY_INLINE __m512 operator()(__m512i v)
Definition x86_512-inl.h:233
HWY_INLINE __m512i operator()(__m512i v)
Definition x86_512-inl.h:223
__m512d type
Definition x86_512-inl.h:108
__m512 type
Definition x86_512-inl.h:104
Raw512
Definition x86_512-inl.h:93
__m512i type
Definition x86_512-inl.h:94
__mmask64 type
Definition x86_512-inl.h:116
__mmask32 type
Definition x86_512-inl.h:120
__mmask16 type
Definition x86_512-inl.h:124
__mmask8 type
Definition x86_512-inl.h:128
RawMask512
Definition x86_512-inl.h:113
Definition base.h:694
Definition base.h:1117
int VFromD
Definition tuple-inl.h:25
#define HWY_X86_FPCLASS_NEG_INF
Definition x86_128-inl.h:11266
#define HWY_X86_FPCLASS_SNAN
Definition x86_128-inl.h:11269
#define HWY_X86_FPCLASS_POS_INF
Definition x86_128-inl.h:11265
#define HWY_X86_FPCLASS_QNAN
Definition x86_128-inl.h:11262
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()