x86_256-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
17// compiling for that target.
18// External include guard in highway.h - see comment there.
19
20// WARNING: most operations do not cross 128-bit block boundaries. In
21// particular, "Broadcast", pack and zip behavior may be surprising.
22
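// [Editorial illustration, not part of the original header] For example, with
// Full256<uint32_t> d and v = Iota(d, 0) holding {0,1,2,3,4,5,6,7},
// Broadcast<0>(v) yields {0,0,0,0, 4,4,4,4} rather than all zeros, because the
// broadcast is applied independently within each 128-bit block.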
23// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
24#include "hwy/base.h"
25
26// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
27// https://github.com/google/highway/issues/710
29#if HWY_COMPILER_GCC_ACTUAL
30HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
31HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
32 ignored "-Wmaybe-uninitialized")
33#endif
34
35// Must come before HWY_COMPILER_CLANGCL
36#include <immintrin.h> // AVX2+
37
38#if HWY_COMPILER_CLANGCL
39// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
40// including these headers when _MSC_VER is defined, like when using clang-cl.
41// Include these directly here.
42#include <avxintrin.h>
43// avxintrin defines __m256i and must come before avx2intrin.
44#include <avx2intrin.h>
45#include <bmi2intrin.h> // _pext_u64
46#include <f16cintrin.h>
47#include <fmaintrin.h>
48#include <smmintrin.h>
49#endif // HWY_COMPILER_CLANGCL
50
51// For half-width vectors. Already includes base.h.
52#include "hwy/ops/shared-inl.h"
53// Already included by shared-inl, but do it again to avoid IDE warnings.
54#include "hwy/ops/x86_128-inl.h"
55
56HWY_BEFORE_NAMESPACE();
55
57namespace hwy {
58namespace HWY_NAMESPACE {
59namespace detail {
60
61template <typename T>
62struct Raw256 {
63 using type = __m256i;
64};
65#if HWY_HAVE_FLOAT16
66template <>
67struct Raw256<float16_t> {
68 using type = __m256h;
69};
70#endif // HWY_HAVE_FLOAT16
71template <>
72struct Raw256<float> {
73 using type = __m256;
74};
75template <>
76struct Raw256<double> {
77 using type = __m256d;
78};
79
80} // namespace detail
81
82template <typename T>
83class Vec256 {
84 using Raw = typename detail::Raw256<T>::type;
85
86 public:
87 using PrivateT = T; // only for DFromV
88 static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
89
90 // Compound assignment. Only usable if there is a corresponding non-member
91 // binary operator overload. For example, only f32 and f64 support division.
92 HWY_INLINE Vec256& operator*=(const Vec256 other) {
93 return *this = (*this * other);
94 }
95 HWY_INLINE Vec256& operator/=(const Vec256 other) {
96 return *this = (*this / other);
97 }
98 HWY_INLINE Vec256& operator+=(const Vec256 other) {
99 return *this = (*this + other);
100 }
101 HWY_INLINE Vec256& operator-=(const Vec256 other) {
102 return *this = (*this - other);
103 }
104 HWY_INLINE Vec256& operator%=(const Vec256 other) {
105 return *this = (*this % other);
106 }
107 HWY_INLINE Vec256& operator&=(const Vec256 other) {
108 return *this = (*this & other);
109 }
110 HWY_INLINE Vec256& operator|=(const Vec256 other) {
111 return *this = (*this | other);
112 }
113 HWY_INLINE Vec256& operator^=(const Vec256 other) {
114 return *this = (*this ^ other);
115 }
116
117 Raw raw;
118};
119
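// [Editorial usage sketch, not part of the original header] The compound
// operators simply forward to the non-member overloads, e.g.:
//   const Full256<float> d;
//   Vec256<float> v = Set(d, 2.0f);
//   v += Set(d, 1.0f);  // same as v = v + Set(d, 1.0f); all lanes become 3.0f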
120#if HWY_TARGET <= HWY_AVX3
121
122namespace detail {
123
124// Template arg: sizeof(lane type)
125template <size_t size>
126struct RawMask256 {};
127template <>
128struct RawMask256<1> {
129 using type = __mmask32;
130};
131template <>
132struct RawMask256<2> {
133 using type = __mmask16;
134};
135template <>
136struct RawMask256<4> {
137 using type = __mmask8;
138};
139template <>
140struct RawMask256<8> {
141 using type = __mmask8;
142};
143
144} // namespace detail
145
146template <typename T>
147struct Mask256 {
148 using Raw = typename detail::RawMask256<sizeof(T)>::type;
149
150 static Mask256<T> FromBits(uint64_t mask_bits) {
151 return Mask256<T>{static_cast<Raw>(mask_bits)};
152 }
153
154 Raw raw;
155};
156
157#else // AVX2
158
159// FF..FF or 0.
160template <typename T>
161struct Mask256 {
162 typename detail::Raw256<T>::type raw;
163};
164
165#endif // AVX2
166
167#if HWY_TARGET <= HWY_AVX3
168namespace detail {
169
170// Used by Expand() emulation, which is required for both AVX3 and AVX2.
171template <typename T>
172HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
173 return mask.raw;
174}
175
176} // namespace detail
177#endif // HWY_TARGET <= HWY_AVX3
178
179template <typename T>
180using Full256 = Simd<T, 32 / sizeof(T), 0>;
181
182// ------------------------------ BitCast
183
184namespace detail {
185
186HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
187#if HWY_HAVE_FLOAT16
188HWY_INLINE __m256i BitCastToInteger(__m256h v) {
189 return _mm256_castph_si256(v);
190}
191#endif // HWY_HAVE_FLOAT16
192HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
193HWY_INLINE __m256i BitCastToInteger(__m256d v) {
194 return _mm256_castpd_si256(v);
195}
196
197#if HWY_AVX3_HAVE_F32_TO_BF16C
198HWY_INLINE __m256i BitCastToInteger(__m256bh v) {
199 // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
200 // bit cast a __m256bh to a __m256i as there is currently no intrinsic
201 // available (as of GCC 13 and Clang 17) that can bit cast a __m256bh vector
202 // to a __m256i vector
203
204#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
205 // On GCC or Clang, use reinterpret_cast to bit cast a __m256bh to a __m256i
206 return reinterpret_cast<__m256i>(v);
207#else
208 // On MSVC, use BitCastScalar to bit cast a __m256bh to a __m256i as MSVC does
209 // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
210 // bit cast from one AVX vector type to a different AVX vector type
211 return BitCastScalar<__m256i>(v);
212#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
213}
214#endif // HWY_AVX3_HAVE_F32_TO_BF16C
215
216template <typename T>
217HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
218 return Vec256<uint8_t>{BitCastToInteger(v.raw)};
219}
220
221// Cannot rely on function overloading because return types differ.
222template <typename T>
223struct BitCastFromInteger256 {
224 HWY_INLINE __m256i operator()(__m256i v) { return v; }
225};
226#if HWY_HAVE_FLOAT16
227template <>
228struct BitCastFromInteger256<float16_t> {
229 HWY_INLINE __m256h operator()(__m256i v) { return _mm256_castsi256_ph(v); }
230};
231#endif // HWY_HAVE_FLOAT16
232template <>
233struct BitCastFromInteger256<float> {
234 HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
235};
236template <>
237struct BitCastFromInteger256<double> {
238 HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
239};
240
241template <class D, HWY_IF_V_SIZE_D(D, 32)>
242HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, Vec256<uint8_t> v) {
243 return VFromD<D>{BitCastFromInteger256<TFromD<D>>()(v.raw)};
244}
245
246} // namespace detail
247
248template <class D, HWY_IF_V_SIZE_D(D, 32), typename FromT>
249HWY_API VFromD<D> BitCast(D d, Vec256<FromT> v) {
250 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
251}
252
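// [Editorial example, not part of the original header] BitCast reinterprets
// the same 256 bits as another lane type without converting values:
//   const Full256<float> df;
//   const Full256<uint32_t> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each lane = 0x3F800000u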
253// ------------------------------ Zero
254
255// Cannot use VFromD here because it is defined in terms of Zero.
256template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
257HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
258 return Vec256<TFromD<D>>{_mm256_setzero_si256()};
259}
260template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
261HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
262 return Vec256<bfloat16_t>{_mm256_setzero_si256()};
263}
264template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
265HWY_API Vec256<float16_t> Zero(D /* tag */) {
266#if HWY_HAVE_FLOAT16
267 return Vec256<float16_t>{_mm256_setzero_ph()};
268#else
269 return Vec256<float16_t>{_mm256_setzero_si256()};
270#endif // HWY_HAVE_FLOAT16
271}
272template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
273HWY_API Vec256<float> Zero(D /* tag */) {
274 return Vec256<float>{_mm256_setzero_ps()};
275}
276template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
277HWY_API Vec256<double> Zero(D /* tag */) {
278 return Vec256<double>{_mm256_setzero_pd()};
279}
280
281// ------------------------------ Set
282
283template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
284HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
285 return VFromD<D>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
286}
287template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
288HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
289 return VFromD<D>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
290}
291template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
292HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
293 return VFromD<D>{_mm256_set1_epi32(static_cast<int>(t))};
294}
295template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
296HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
297 return VFromD<D>{_mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
298}
299// bfloat16_t is handled by x86_128-inl.h.
300#if HWY_HAVE_FLOAT16
301template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
302HWY_API Vec256<float16_t> Set(D /* tag */, float16_t t) {
303 return Vec256<float16_t>{_mm256_set1_ph(t)};
304}
305#endif // HWY_HAVE_FLOAT16
306template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
307HWY_API Vec256<float> Set(D /* tag */, float t) {
308 return Vec256<float>{_mm256_set1_ps(t)};
309}
310template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
311HWY_API Vec256<double> Set(D /* tag */, double t) {
312 return Vec256<double>{_mm256_set1_pd(t)};
313}
314
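// [Editorial note, not part of the original header] Set splats one value into
// every lane, e.g. Set(Full256<int32_t>(), 7) has eight lanes equal to 7;
// Zero(d) is equivalent to Set(d, 0) but typically compiles to a single
// register-zeroing XOR.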
315HWY_DIAGNOSTICS(push)
316HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
317
318// Returns a vector with uninitialized elements.
319template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
320HWY_API VFromD<D> Undefined(D /* tag */) {
321 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
322 // generate an XOR instruction.
323 return VFromD<D>{_mm256_undefined_si256()};
324}
325template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
326HWY_API Vec256<bfloat16_t> Undefined(D /* tag */) {
327 return Vec256<bfloat16_t>{_mm256_undefined_si256()};
328}
329template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
330HWY_API Vec256<float16_t> Undefined(D /* tag */) {
331#if HWY_HAVE_FLOAT16
332 return Vec256<float16_t>{_mm256_undefined_ph()};
333#else
334 return Vec256<float16_t>{_mm256_undefined_si256()};
335#endif
336}
337template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
338HWY_API Vec256<float> Undefined(D /* tag */) {
339 return Vec256<float>{_mm256_undefined_ps()};
340}
341template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
342HWY_API Vec256<double> Undefined(D /* tag */) {
343 return Vec256<double>{_mm256_undefined_pd()};
344}
345
346HWY_DIAGNOSTICS(pop)
347
348// ------------------------------ ResizeBitCast
349
350// 32-byte vector to 32-byte vector (or 64-byte vector to 64-byte vector on
351// AVX3)
352template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
353 HWY_IF_V_SIZE_D(D, HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>))>
354HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
355 return BitCast(d, v);
356}
357
358// 32-byte vector to 16-byte vector (or 64-byte vector to 32-byte vector on
359// AVX3)
360template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
361 HWY_IF_V_SIZE_D(D,
362 (HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>)) / 2)>
363HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
364 const DFromV<decltype(v)> d_from;
365 const Half<decltype(d_from)> dh_from;
366 return BitCast(d, LowerHalf(dh_from, v));
367}
368
369// 32-byte vector (or 64-byte vector on AVX3) to <= 8-byte vector
370template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
371 HWY_IF_V_SIZE_LE_D(D, 8)>
372HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
373 return VFromD<D>{ResizeBitCast(Full128<TFromD<D>>(), v).raw};
374}
375
376// <= 16-byte vector to 32-byte vector
377template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
378 HWY_IF_V_SIZE_D(D, 32)>
379HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
380 return BitCast(d, Vec256<uint8_t>{_mm256_castsi128_si256(
381 ResizeBitCast(Full128<uint8_t>(), v).raw)});
382}
383
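// [Editorial example, not part of the original header] ResizeBitCast may also
// change the vector size: shrinking keeps the lower lanes and growing leaves
// the added lanes unspecified, e.g.
//   ResizeBitCast(Full128<uint32_t>(), Iota(Full256<uint32_t>(), 0))
// yields lanes {0,1,2,3}.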
384// ------------------------------ Dup128VecFromValues
385
386template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
387HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
388 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
389 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
390 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
391 TFromD<D> t11, TFromD<D> t12,
392 TFromD<D> t13, TFromD<D> t14,
393 TFromD<D> t15) {
394 return VFromD<D>{_mm256_setr_epi8(
395 static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
396 static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
397 static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
398 static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
399 static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
400 static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
401 static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
402 static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
403 static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
404 static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
405 static_cast<char>(t14), static_cast<char>(t15))};
406}
407
408template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
409HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
410 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
411 TFromD<D> t5, TFromD<D> t6,
412 TFromD<D> t7) {
413 return VFromD<D>{
414 _mm256_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
415 static_cast<int16_t>(t2), static_cast<int16_t>(t3),
416 static_cast<int16_t>(t4), static_cast<int16_t>(t5),
417 static_cast<int16_t>(t6), static_cast<int16_t>(t7),
418 static_cast<int16_t>(t0), static_cast<int16_t>(t1),
419 static_cast<int16_t>(t2), static_cast<int16_t>(t3),
420 static_cast<int16_t>(t4), static_cast<int16_t>(t5),
421 static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
422}
423
424#if HWY_HAVE_FLOAT16
425template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 32)>
426HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
427 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
428 TFromD<D> t5, TFromD<D> t6,
429 TFromD<D> t7) {
430 return VFromD<D>{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
431 t3, t4, t5, t6, t7)};
432}
433#endif
434
435template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
436HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
437 TFromD<D> t2, TFromD<D> t3) {
438 return VFromD<D>{
439 _mm256_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
440 static_cast<int32_t>(t2), static_cast<int32_t>(t3),
441 static_cast<int32_t>(t0), static_cast<int32_t>(t1),
442 static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
443}
444
445template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
446HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
447 TFromD<D> t2, TFromD<D> t3) {
448 return VFromD<D>{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)};
449}
450
451template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
452HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
453 return VFromD<D>{
454 _mm256_setr_epi64x(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
455 static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
456}
457
458template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
459HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
460 return VFromD<D>{_mm256_setr_pd(t0, t1, t0, t1)};
461}
462
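// [Editorial example, not part of the original header] The same 128-bit
// pattern is repeated in both blocks, e.g.
//   Dup128VecFromValues(Full256<uint32_t>(), 1, 2, 3, 4)
// yields {1,2,3,4, 1,2,3,4}.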
463// ================================================== LOGICAL
464
465// ------------------------------ And
466
467template <typename T>
468HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
469 const DFromV<decltype(a)> d; // for float16_t
470 const RebindToUnsigned<decltype(d)> du;
471 return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(BitCast(du, a).raw,
472 BitCast(du, b).raw)});
473}
474
475HWY_API Vec256<float> And(Vec256<float> a, Vec256<float> b) {
476 return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
477}
478HWY_API Vec256<double> And(Vec256<double> a, Vec256<double> b) {
479 return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
480}
481
482// ------------------------------ AndNot
483
484// Returns ~not_mask & mask.
485template <typename T>
486HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
487 const DFromV<decltype(mask)> d; // for float16_t
488 const RebindToUnsigned<decltype(d)> du;
489 return BitCast(d, VFromD<decltype(du)>{_mm256_andnot_si256(
490 BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
491}
492HWY_API Vec256<float> AndNot(Vec256<float> not_mask, Vec256<float> mask) {
493 return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
494}
495HWY_API Vec256<double> AndNot(Vec256<double> not_mask, Vec256<double> mask) {
496 return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
497}
498
499// ------------------------------ Or
500
501template <typename T>
502HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
503 const DFromV<decltype(a)> d; // for float16_t
504 const RebindToUnsigned<decltype(d)> du;
505 return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(BitCast(du, a).raw,
506 BitCast(du, b).raw)});
507}
508
509HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) {
510 return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
511}
512HWY_API Vec256<double> Or(Vec256<double> a, Vec256<double> b) {
513 return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
514}
515
516// ------------------------------ Xor
517
518template <typename T>
519HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
520 const DFromV<decltype(a)> d; // for float16_t
521 const RebindToUnsigned<decltype(d)> du;
522 return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(BitCast(du, a).raw,
523 BitCast(du, b).raw)});
524}
525
526HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) {
527 return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
528}
529HWY_API Vec256<double> Xor(Vec256<double> a, Vec256<double> b) {
530 return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
531}
532
533// ------------------------------ Not
534template <typename T>
535HWY_API Vec256<T> Not(const Vec256<T> v) {
536 const DFromV<decltype(v)> d;
537 using TU = MakeUnsigned<T>;
538#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
539 const __m256i vu = BitCast(RebindToUnsigned<decltype(d)>(), v).raw;
540 return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
541#else
542 return Xor(v, BitCast(d, Vec256<TU>{_mm256_set1_epi32(-1)}));
543#endif
544}
545
546// ------------------------------ Xor3
547template <typename T>
548HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
549#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
550 const DFromV<decltype(x1)> d;
551 const RebindToUnsigned<decltype(d)> du;
552 using VU = VFromD<decltype(du)>;
553 const __m256i ret = _mm256_ternarylogic_epi64(
554 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
555 return BitCast(d, VU{ret});
556#else
557 return Xor(x1, Xor(x2, x3));
558#endif
559}
560
561// ------------------------------ Or3
562template <typename T>
563HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
564#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
565 const DFromV<decltype(o1)> d;
566 const RebindToUnsigned<decltype(d)> du;
567 using VU = VFromD<decltype(du)>;
568 const __m256i ret = _mm256_ternarylogic_epi64(
569 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
570 return BitCast(d, VU{ret});
571#else
572 return Or(o1, Or(o2, o3));
573#endif
574}
575
576// ------------------------------ OrAnd
577template <typename T>
578HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
579#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
580 const DFromV<decltype(o)> d;
581 const RebindToUnsigned<decltype(d)> du;
582 using VU = VFromD<decltype(du)>;
583 const __m256i ret = _mm256_ternarylogic_epi64(
584 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
585 return BitCast(d, VU{ret});
586#else
587 return Or(o, And(a1, a2));
588#endif
589}
590
591// ------------------------------ IfVecThenElse
592template <typename T>
593HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
594#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
595 const DFromV<decltype(yes)> d;
596 const RebindToUnsigned<decltype(d)> du;
597 using VU = VFromD<decltype(du)>;
598 return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
599 BitCast(du, yes).raw,
600 BitCast(du, no).raw, 0xCA)});
601#else
602 return IfThenElse(MaskFromVec(mask), yes, no);
603#endif
604}
605
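// [Editorial note, not part of the original header] The 0xCA immediate above
// is a truth table indexed by the bit triple (mask, yes, no): bit (4*m+2*y+n)
// is set exactly when m ? y : n is 1, so one vpternlog computes
// (mask & yes) | (~mask & no) for every bit. Likewise 0x96 is XOR of three
// inputs, 0xFE is OR of three, and 0xF8 is o | (a1 & a2).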
606// ------------------------------ Operator overloads (internal-only if float)
607
608template <typename T>
609HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
610 return And(a, b);
611}
612
613template <typename T>
614HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
615 return Or(a, b);
616}
617
618template <typename T>
619HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
620 return Xor(a, b);
621}
622
623// ------------------------------ PopulationCount
624
625// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
626#if HWY_TARGET <= HWY_AVX3_DL
627
628#ifdef HWY_NATIVE_POPCNT
629#undef HWY_NATIVE_POPCNT
630#else
631#define HWY_NATIVE_POPCNT
632#endif
633
634namespace detail {
635
636template <typename T>
637HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
638 return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
639}
640template <typename T>
641HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
642 return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
643}
644template <typename T>
645HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
646 return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
647}
648template <typename T>
649HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
650 return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
651}
652
653} // namespace detail
654
655template <typename T>
656HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
657 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
658}
659
660#endif // HWY_TARGET <= HWY_AVX3_DL
661
662// ================================================== MASK
663
664#if HWY_TARGET <= HWY_AVX3
665
666// ------------------------------ IfThenElse
667
668// Returns mask ? b : a.
669
670namespace detail {
671
672// Templates for signed/unsigned integer of a particular size.
673template <typename T>
674HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
675 Vec256<T> yes, Vec256<T> no) {
676 return Vec256<T>{_mm256_mask_blend_epi8(mask.raw, no.raw, yes.raw)};
677}
678template <typename T>
679HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
680 Vec256<T> yes, Vec256<T> no) {
681 return Vec256<T>{_mm256_mask_blend_epi16(mask.raw, no.raw, yes.raw)};
682}
683template <typename T>
684HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
685 Vec256<T> yes, Vec256<T> no) {
686 return Vec256<T>{_mm256_mask_blend_epi32(mask.raw, no.raw, yes.raw)};
687}
688template <typename T>
689HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
690 Vec256<T> yes, Vec256<T> no) {
691 return Vec256<T>{_mm256_mask_blend_epi64(mask.raw, no.raw, yes.raw)};
692}
693
694} // namespace detail
695
696template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
697HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
698 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
699}
700#if HWY_HAVE_FLOAT16
701HWY_API Vec256<float16_t> IfThenElse(Mask256<float16_t> mask,
702 Vec256<float16_t> yes,
703 Vec256<float16_t> no) {
704 return Vec256<float16_t>{_mm256_mask_blend_ph(mask.raw, no.raw, yes.raw)};
705}
706#endif // HWY_HAVE_FLOAT16
707HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
708 Vec256<float> no) {
709 return Vec256<float>{_mm256_mask_blend_ps(mask.raw, no.raw, yes.raw)};
710}
711HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
712 Vec256<double> no) {
713 return Vec256<double>{_mm256_mask_blend_pd(mask.raw, no.raw, yes.raw)};
714}
715
716namespace detail {
717
718template <typename T>
719HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
720 Vec256<T> yes) {
721 return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
722}
723template <typename T>
724HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
725 Vec256<T> yes) {
726 return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
727}
728template <typename T>
729HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
730 Vec256<T> yes) {
731 return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
732}
733template <typename T>
734HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
735 Vec256<T> yes) {
736 return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
737}
738
739} // namespace detail
740
741template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
742HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
743 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
744}
745HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) {
746 return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
747}
748HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask,
749 Vec256<double> yes) {
750 return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
751}
752
753namespace detail {
754
755template <typename T>
756HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
757 Vec256<T> no) {
758 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
759 return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
760}
761template <typename T>
762HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
763 Vec256<T> no) {
764 return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
765}
766template <typename T>
767HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
768 Vec256<T> no) {
769 return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
770}
771template <typename T>
772HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
773 Vec256<T> no) {
774 return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
775}
776
777} // namespace detail
778
779template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
780HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
781 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
782}
783HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) {
784 return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
785}
786HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
787 return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
788}
789
790// ------------------------------ Mask logical
791
792namespace detail {
793
794template <typename T>
795HWY_INLINE Mask256<T> And(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
796 const Mask256<T> b) {
797#if HWY_COMPILER_HAS_MASK_INTRINSICS
798 return Mask256<T>{_kand_mask32(a.raw, b.raw)};
799#else
800 return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)};
801#endif
802}
803template <typename T>
804HWY_INLINE Mask256<T> And(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
805 const Mask256<T> b) {
806#if HWY_COMPILER_HAS_MASK_INTRINSICS
807 return Mask256<T>{_kand_mask16(a.raw, b.raw)};
808#else
809 return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)};
810#endif
811}
812template <typename T>
813HWY_INLINE Mask256<T> And(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
814 const Mask256<T> b) {
815#if HWY_COMPILER_HAS_MASK_INTRINSICS
816 return Mask256<T>{_kand_mask8(a.raw, b.raw)};
817#else
818 return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
819#endif
820}
821template <typename T>
822HWY_INLINE Mask256<T> And(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
823 const Mask256<T> b) {
824#if HWY_COMPILER_HAS_MASK_INTRINSICS
825 return Mask256<T>{_kand_mask8(a.raw, b.raw)};
826#else
827 return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
828#endif
829}
830
831template <typename T>
832HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
833 const Mask256<T> b) {
834#if HWY_COMPILER_HAS_MASK_INTRINSICS
835 return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
836#else
837 return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)};
838#endif
839}
840template <typename T>
841HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
842 const Mask256<T> b) {
843#if HWY_COMPILER_HAS_MASK_INTRINSICS
844 return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
845#else
846 return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)};
847#endif
848}
849template <typename T>
850HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
851 const Mask256<T> b) {
852#if HWY_COMPILER_HAS_MASK_INTRINSICS
853 return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
854#else
855 return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
856#endif
857}
858template <typename T>
859HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
860 const Mask256<T> b) {
861#if HWY_COMPILER_HAS_MASK_INTRINSICS
862 return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
863#else
864 return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
865#endif
866}
867
868template <typename T>
869HWY_INLINE Mask256<T> Or(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
870 const Mask256<T> b) {
871#if HWY_COMPILER_HAS_MASK_INTRINSICS
872 return Mask256<T>{_kor_mask32(a.raw, b.raw)};
873#else
874 return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)};
875#endif
876}
877template <typename T>
878HWY_INLINE Mask256<T> Or(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
879 const Mask256<T> b) {
880#if HWY_COMPILER_HAS_MASK_INTRINSICS
881 return Mask256<T>{_kor_mask16(a.raw, b.raw)};
882#else
883 return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)};
884#endif
885}
886template <typename T>
887HWY_INLINE Mask256<T> Or(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
888 const Mask256<T> b) {
889#if HWY_COMPILER_HAS_MASK_INTRINSICS
890 return Mask256<T>{_kor_mask8(a.raw, b.raw)};
891#else
892 return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
893#endif
894}
895template <typename T>
896HWY_INLINE Mask256<T> Or(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
897 const Mask256<T> b) {
898#if HWY_COMPILER_HAS_MASK_INTRINSICS
899 return Mask256<T>{_kor_mask8(a.raw, b.raw)};
900#else
901 return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
902#endif
903}
904
905template <typename T>
906HWY_INLINE Mask256<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
907 const Mask256<T> b) {
908#if HWY_COMPILER_HAS_MASK_INTRINSICS
909 return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
910#else
911 return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)};
912#endif
913}
914template <typename T>
915HWY_INLINE Mask256<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
916 const Mask256<T> b) {
917#if HWY_COMPILER_HAS_MASK_INTRINSICS
918 return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
919#else
920 return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)};
921#endif
922}
923template <typename T>
924HWY_INLINE Mask256<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
925 const Mask256<T> b) {
926#if HWY_COMPILER_HAS_MASK_INTRINSICS
927 return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
928#else
929 return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
930#endif
931}
932template <typename T>
933HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
934 const Mask256<T> b) {
935#if HWY_COMPILER_HAS_MASK_INTRINSICS
936 return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
937#else
938 return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
939#endif
940}
941
942template <typename T>
943HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
944 const Mask256<T> a, const Mask256<T> b) {
945#if HWY_COMPILER_HAS_MASK_INTRINSICS
946 return Mask256<T>{_kxnor_mask32(a.raw, b.raw)};
947#else
948 return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
949#endif
950}
951template <typename T>
952HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
953 const Mask256<T> a, const Mask256<T> b) {
954#if HWY_COMPILER_HAS_MASK_INTRINSICS
955 return Mask256<T>{_kxnor_mask16(a.raw, b.raw)};
956#else
957 return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
958#endif
959}
960template <typename T>
961HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
962 const Mask256<T> a, const Mask256<T> b) {
963#if HWY_COMPILER_HAS_MASK_INTRINSICS
964 return Mask256<T>{_kxnor_mask8(a.raw, b.raw)};
965#else
966 return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
967#endif
968}
969template <typename T>
970HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
971 const Mask256<T> a, const Mask256<T> b) {
972#if HWY_COMPILER_HAS_MASK_INTRINSICS
973 return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
974#else
975 return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
976#endif
977}
978
979// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
980template <typename T, HWY_IF_T_SIZE(T, 1)>
981HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
982#if HWY_COMPILER_HAS_MASK_INTRINSICS
983 return Mask256<T>{static_cast<__mmask32>(_knot_mask32(m.raw))};
984#else
985 return Mask256<T>{static_cast<__mmask32>(~m.raw)};
986#endif
987}
988
989template <typename T, HWY_IF_T_SIZE(T, 2)>
990HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
991#if HWY_COMPILER_HAS_MASK_INTRINSICS
992 return Mask256<T>{static_cast<__mmask16>(_knot_mask16(m.raw))};
993#else
994 return Mask256<T>{static_cast<__mmask16>(~m.raw)};
995#endif
996}
997
998template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
999HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
1000#if HWY_COMPILER_HAS_MASK_INTRINSICS
1001 return Mask256<T>{static_cast<__mmask8>(_knot_mask8(m.raw))};
1002#else
1003 return Mask256<T>{static_cast<__mmask8>(~m.raw)};
1004#endif
1005}
1006
1007template <typename T>
1008HWY_INLINE Mask256<T> Not(hwy::SizeTag<1> /*tag*/, const Mask256<T> m) {
1009 // sizeof(T) == 1: simply return ~m as all 32 bits of m are valid
1010 return UnmaskedNot(m);
1011}
1012template <typename T>
1013HWY_INLINE Mask256<T> Not(hwy::SizeTag<2> /*tag*/, const Mask256<T> m) {
1014 // sizeof(T) == 2: simply return ~m as all 16 bits of m are valid
1015 return UnmaskedNot(m);
1016}
1017template <typename T>
1018HWY_INLINE Mask256<T> Not(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
1019 // sizeof(T) == 4: simply return ~m as all 8 bits of m are valid
1020 return UnmaskedNot(m);
1021}
1022template <typename T>
1023HWY_INLINE Mask256<T> Not(hwy::SizeTag<8> /*tag*/, const Mask256<T> m) {
1024 // sizeof(T) == 8: need to zero out the upper 4 bits of ~m as only the lower
1025 // 4 bits of m are valid
1026
1027 // Return (~m) & 0x0F
1028 return AndNot(hwy::SizeTag<8>(), m, Mask256<T>::FromBits(uint64_t{0x0F}));
1029}
1030
1031} // namespace detail
1032
1033template <typename T>
1034HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
1035 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1036}
1037
1038template <typename T>
1039HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
1040 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1041}
1042
1043template <typename T>
1044HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
1045 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1046}
1047
1048template <typename T>
1049HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
1050 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1051}
1052
1053template <typename T>
1054HWY_API Mask256<T> Not(const Mask256<T> m) {
1055 // Flip only the valid bits.
1056 return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1057}
1058
1059template <typename T>
1060HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
1061 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
1062}
1063
1064template <class D, HWY_IF_LANES_D(D, 32)>
1065HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
1066 MFromD<Half<D>> lo) {
1067#if HWY_COMPILER_HAS_MASK_INTRINSICS
1068 const __mmask32 combined_mask = _mm512_kunpackw(
1069 static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw));
1070#else
1071 const auto combined_mask =
1072 ((static_cast<uint32_t>(hi.raw) << 16) | (lo.raw & 0xFFFFu));
1073#endif
1074
1075 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
1076}
1077
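// [Editorial example, not part of the original header] With 8-bit lanes (32
// per 256-bit vector), CombineMasks places lo in bits [15:0] and hi in bits
// [31:16] of the result, e.g. hi = 0x0001 and lo = 0x8000 give 0x00018000.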
1078template <class D, HWY_IF_LANES_D(D, 16)>
1079HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
1080#if HWY_COMPILER_HAS_MASK_INTRINSICS
1081 const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16);
1082#else
1083 const auto shifted_mask = static_cast<uint32_t>(m.raw) >> 16;
1084#endif
1085
1086 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
1087}
1088
1089template <class D, HWY_IF_LANES_D(D, 32)>
1090HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
1091 using RawM = decltype(MFromD<D>().raw);
1092#if HWY_COMPILER_HAS_MASK_INTRINSICS
1093 return MFromD<D>{
1094 static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
1095#else
1096 return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};
1097#endif
1098}
1099
1100template <class D, HWY_IF_LANES_D(D, 32)>
1101HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
1102 using RawM = decltype(MFromD<D>().raw);
1103#if HWY_COMPILER_HAS_MASK_INTRINSICS
1104 return MFromD<D>{
1105 static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
1106#else
1107 return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
1108#endif
1109}
1110
1111#else // AVX2
1112
1113// ------------------------------ Mask
1114
1115// Mask and Vec are the same (true = FF..FF).
1116template <typename T>
1117HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1118 return Mask256<T>{v.raw};
1119}
1120
1121template <typename T>
1122HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1123 return Vec256<T>{v.raw};
1124}
1125
1126// ------------------------------ IfThenElse
1127
1128// mask ? yes : no
1129template <typename T, HWY_IF_NOT_FLOAT3264(T)>
1130HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
1131 return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
1132}
1133HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
1134 Vec256<float> no) {
1135 return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
1136}
1137HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
1138 Vec256<double> no) {
1139 return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
1140}
1141
1142// mask ? yes : 0
1143template <typename T>
1144HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
1145 const DFromV<decltype(yes)> d;
1146 return yes & VecFromMask(d, mask);
1147}
1148
1149// mask ? 0 : no
1150template <typename T>
1151HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
1152 const DFromV<decltype(no)> d;
1153 return AndNot(VecFromMask(d, mask), no);
1154}
1155
1156template <typename T>
1157HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
1158 static_assert(IsSigned<T>(), "Only for float");
1159 const DFromV<decltype(v)> d;
1160 const auto zero = Zero(d);
1161 // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
1162 return IfThenElse(MaskFromVec(v), zero, v);
1163}
1164
1165// ------------------------------ Mask logical
1166
1167template <typename T>
1168HWY_API Mask256<T> Not(const Mask256<T> m) {
1169 const Full256<T> d;
1170 return MaskFromVec(Not(VecFromMask(d, m)));
1171}
1172
1173template <typename T>
1174HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
1175 const Full256<T> d;
1176 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1177}
1178
1179template <typename T>
1180HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
1181 const Full256<T> d;
1182 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1183}
1184
1185template <typename T>
1186HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
1187 const Full256<T> d;
1188 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1189}
1190
1191template <typename T>
1192HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
1193 const Full256<T> d;
1194 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1195}
1196
1197template <typename T>
1198HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
1199 const Full256<T> d;
1200 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
1201}
1202
1203#endif // HWY_TARGET <= HWY_AVX3
1204
1205// ================================================== COMPARE
1206
1207#if HWY_TARGET <= HWY_AVX3
1208
1209// Comparisons set a mask bit to 1 if the condition is true, else 0.
1210
1211template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
1212HWY_API MFromD<DTo> RebindMask(DTo /*tag*/, Mask256<TFrom> m) {
1213 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
1214 return MFromD<DTo>{m.raw};
1215}
1216
1217namespace detail {
1218
1219template <typename T>
1220HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec256<T> v,
1221 const Vec256<T> bit) {
1222 return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
1223}
1224template <typename T>
1225HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec256<T> v,
1226 const Vec256<T> bit) {
1227 return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
1228}
1229template <typename T>
1230HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec256<T> v,
1231 const Vec256<T> bit) {
1232 return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
1233}
1234template <typename T>
1235HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec256<T> v,
1236 const Vec256<T> bit) {
1237 return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
1238}
1239
1240} // namespace detail
1241
1242template <typename T>
1243HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1244 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1245 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1246}
1247
1248// ------------------------------ Equality
1249
1250template <typename T, HWY_IF_T_SIZE(T, 1)>
1251HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1252 return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
1253}
1254template <typename T, HWY_IF_UI16(T)>
1255HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1256 return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
1257}
1258template <typename T, HWY_IF_UI32(T)>
1259HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1260 return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
1261}
1262template <typename T, HWY_IF_UI64(T)>
1263HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1264 return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
1265}
1266
1267#if HWY_HAVE_FLOAT16
1268HWY_API Mask256<float16_t> operator==(Vec256<float16_t> a,
1269 Vec256<float16_t> b) {
1270 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1271 HWY_DIAGNOSTICS(push)
1272 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1273 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1274 HWY_DIAGNOSTICS(pop)
1275}
1276#endif // HWY_HAVE_FLOAT16
1277HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
1278 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1279}
1280
1281HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
1282 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1283}
1284
1285// ------------------------------ Inequality
1286
1287template <typename T, HWY_IF_T_SIZE(T, 1)>
1288HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1289 return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
1290}
1291template <typename T, HWY_IF_UI16(T)>
1292HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1293 return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
1294}
1295template <typename T, HWY_IF_UI32(T)>
1296HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1297 return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
1298}
1299template <typename T, HWY_IF_UI64(T)>
1300HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1301 return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
1302}
1303
1304#if HWY_HAVE_FLOAT16
1305HWY_API Mask256<float16_t> operator!=(Vec256<float16_t> a,
1306 Vec256<float16_t> b) {
1307 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1308 HWY_DIAGNOSTICS(push)
1309 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1310 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1311 HWY_DIAGNOSTICS(pop)
1312}
1313#endif // HWY_HAVE_FLOAT16
1314HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
1315 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1316}
1317
1318HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
1319 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1320}
1321
1322// ------------------------------ Strict inequality
1323
1324HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
1325 return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
1326}
1327HWY_API Mask256<int16_t> operator>(Vec256<int16_t> a, Vec256<int16_t> b) {
1328 return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
1329}
1330HWY_API Mask256<int32_t> operator>(Vec256<int32_t> a, Vec256<int32_t> b) {
1331 return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
1332}
1333HWY_API Mask256<int64_t> operator>(Vec256<int64_t> a, Vec256<int64_t> b) {
1334 return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
1335}
1336
1337HWY_API Mask256<uint8_t> operator>(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1338 return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
1339}
1340HWY_API Mask256<uint16_t> operator>(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1341 return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
1342}
1343HWY_API Mask256<uint32_t> operator>(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1344 return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
1345}
1346HWY_API Mask256<uint64_t> operator>(Vec256<uint64_t> a, Vec256<uint64_t> b) {
1347 return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
1348}
1349
1350#if HWY_HAVE_FLOAT16
1351HWY_API Mask256<float16_t> operator>(Vec256<float16_t> a, Vec256<float16_t> b) {
1352 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1353 HWY_DIAGNOSTICS(push)
1354 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1355 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
1356 HWY_DIAGNOSTICS(pop)
1357}
1358#endif // HWY_HAVE_FLOAT16
1359HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
1360 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1361}
1362HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1363 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1364}
1365
1366// ------------------------------ Weak inequality
1367
1368#if HWY_HAVE_FLOAT16
1369HWY_API Mask256<float16_t> operator>=(Vec256<float16_t> a,
1370 Vec256<float16_t> b) {
1371 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1372 HWY_DIAGNOSTICS(push)
1373 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1374 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
1375 HWY_DIAGNOSTICS(pop)
1376}
1377#endif // HWY_HAVE_FLOAT16
1378
1379HWY_API Mask256<float> operator>=(Vec256<float> a, Vec256<float> b) {
1380 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1381}
1382HWY_API Mask256<double> operator>=(Vec256<double> a, Vec256<double> b) {
1383 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1384}
1385
1386HWY_API Mask256<int8_t> operator>=(Vec256<int8_t> a, Vec256<int8_t> b) {
1387 return Mask256<int8_t>{_mm256_cmpge_epi8_mask(a.raw, b.raw)};
1388}
1389HWY_API Mask256<int16_t> operator>=(Vec256<int16_t> a, Vec256<int16_t> b) {
1390 return Mask256<int16_t>{_mm256_cmpge_epi16_mask(a.raw, b.raw)};
1391}
1392HWY_API Mask256<int32_t> operator>=(Vec256<int32_t> a, Vec256<int32_t> b) {
1393 return Mask256<int32_t>{_mm256_cmpge_epi32_mask(a.raw, b.raw)};
1394}
1395HWY_API Mask256<int64_t> operator>=(Vec256<int64_t> a, Vec256<int64_t> b) {
1396 return Mask256<int64_t>{_mm256_cmpge_epi64_mask(a.raw, b.raw)};
1397}
1398
1399HWY_API Mask256<uint8_t> operator>=(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1400 return Mask256<uint8_t>{_mm256_cmpge_epu8_mask(a.raw, b.raw)};
1401}
1402HWY_API Mask256<uint16_t> operator>=(const Vec256<uint16_t> a,
1403 const Vec256<uint16_t> b) {
1404 return Mask256<uint16_t>{_mm256_cmpge_epu16_mask(a.raw, b.raw)};
1405}
1406HWY_API Mask256<uint32_t> operator>=(const Vec256<uint32_t> a,
1407 const Vec256<uint32_t> b) {
1408 return Mask256<uint32_t>{_mm256_cmpge_epu32_mask(a.raw, b.raw)};
1409}
1410HWY_API Mask256<uint64_t> operator>=(const Vec256<uint64_t> a,
1411 const Vec256<uint64_t> b) {
1412 return Mask256<uint64_t>{_mm256_cmpge_epu64_mask(a.raw, b.raw)};
1413}
1414
1415// ------------------------------ Mask
1416
1417namespace detail {
1418
1419template <typename T>
1420HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256<T> v) {
1421 return Mask256<T>{_mm256_movepi8_mask(v.raw)};
1422}
1423template <typename T>
1424HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256<T> v) {
1425 return Mask256<T>{_mm256_movepi16_mask(v.raw)};
1426}
1427template <typename T>
1428HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256<T> v) {
1429 return Mask256<T>{_mm256_movepi32_mask(v.raw)};
1430}
1431template <typename T>
1432HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256<T> v) {
1433 return Mask256<T>{_mm256_movepi64_mask(v.raw)};
1434}
1435
1436} // namespace detail
1437
1438template <typename T, HWY_IF_NOT_FLOAT(T)>
1439HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1440 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1441}
1442// There do not seem to be native floating-point versions of these instructions.
1443template <typename T, HWY_IF_FLOAT(T)>
1444HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1445 const RebindToSigned<DFromV<decltype(v)>> di;
1446 return Mask256<T>{MaskFromVec(BitCast(di, v)).raw};
1447}
1448
1449template <typename T, HWY_IF_T_SIZE(T, 1)>
1450HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1451 return Vec256<T>{_mm256_movm_epi8(v.raw)};
1452}
1453
1454template <typename T, HWY_IF_UI16(T)>
1455HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1456 return Vec256<T>{_mm256_movm_epi16(v.raw)};
1457}
1458
1459template <typename T, HWY_IF_UI32(T)>
1460HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1461 return Vec256<T>{_mm256_movm_epi32(v.raw)};
1462}
1463
1464template <typename T, HWY_IF_UI64(T)>
1465HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1466 return Vec256<T>{_mm256_movm_epi64(v.raw)};
1467}
1468
1469#if HWY_HAVE_FLOAT16
1470HWY_API Vec256<float16_t> VecFromMask(const Mask256<float16_t> v) {
1471 return Vec256<float16_t>{_mm256_castsi256_ph(_mm256_movm_epi16(v.raw))};
1472}
1473#endif // HWY_HAVE_FLOAT16
1474
1475HWY_API Vec256<float> VecFromMask(const Mask256<float> v) {
1476 return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
1477}
1478
1479HWY_API Vec256<double> VecFromMask(const Mask256<double> v) {
1480 return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
1481}
1482
1483#else // AVX2
1484
1485// Comparisons fill a lane with 1-bits if the condition is true, else 0.
1486
1487template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
1488HWY_API MFromD<DTo> RebindMask(DTo d_to, Mask256<TFrom> m) {
1489 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
1490 const Full256<TFrom> dfrom;
1491 return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m)));
1492}
1493
1494template <typename T>
1495HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1496 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1497 return (v & bit) == bit;
1498}
1499
1500// ------------------------------ Equality
1501
1502template <typename T, HWY_IF_T_SIZE(T, 1)>
1503HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
1504 return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1505}
1506
1507template <typename T, HWY_IF_UI16(T)>
1508HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
1509 return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1510}
1511
1512template <typename T, HWY_IF_UI32(T)>
1513HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
1514 return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1515}
1516
1517template <typename T, HWY_IF_UI64(T)>
1518HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
1519 return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1520}
1521
1522HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
1523 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1524}
1525
1526HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
1527 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1528}
1529
1530// ------------------------------ Inequality
1531
1532template <typename T, HWY_IF_NOT_FLOAT3264(T)>
1533HWY_API Mask256<T> operator!=(Vec256<T> a, Vec256<T> b) {
1534 return Not(a == b);
1535}
1536HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
1537 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1538}
1539HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
1540 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1541}
1542
1543// ------------------------------ Strict inequality
1544
1545// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1546namespace detail {
1547
1548// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
1549// to perform an unsigned comparison instead of the intended signed. Workaround
1550// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
1551#if HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 903
1552#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1553#else
1554#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1555#endif
1556
1557HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
1558 Vec256<int8_t> b) {
1559#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1560 using i8x32 = signed char __attribute__((__vector_size__(32)));
1561 return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
1562 reinterpret_cast<i8x32>(b.raw))};
1563#else
1564 return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1565#endif
1566}
1567HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
1568 Vec256<int16_t> b) {
1569 return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1570}
1571HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
1572 Vec256<int32_t> b) {
1573 return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1574}
1575HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
1576 Vec256<int64_t> b) {
1577 return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1578}
1579
1580template <typename T>
1581HWY_INLINE Mask256<T> Gt(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
1582 const Full256<T> du;
1583 const RebindToSigned<decltype(du)> di;
1584 const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1585 return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1586}
1587
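// [Editorial note, not part of the original header] AVX2 has no unsigned
// compare instruction, so the unsigned Gt above flips the sign bit of both
// operands and reuses the signed compare: for uint8_t, 200 > 100 becomes
// 72 > -28 after the Xor with 0x80, which the signed compare answers
// correctly.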
1588HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
1589 Vec256<float> b) {
1590 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1591}
1592HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
1593 Vec256<double> b) {
1594 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1595}
1596
1597} // namespace detail
1598
1599template <typename T>
1600HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
1601 return detail::Gt(hwy::TypeTag<T>(), a, b);
1602}
1603
1604// ------------------------------ Weak inequality
1605
1606namespace detail {
1607
1608template <typename T>
1609HWY_INLINE Mask256<T> Ge(hwy::SignedTag tag, Vec256<T> a, Vec256<T> b) {
1610 return Not(Gt(tag, b, a));
1611}
1612
1613template <typename T>
1614HWY_INLINE Mask256<T> Ge(hwy::UnsignedTag tag, Vec256<T> a, Vec256<T> b) {
1615 return Not(Gt(tag, b, a));
1616}
1617
1618HWY_INLINE Mask256<float> Ge(hwy::FloatTag /*tag*/, Vec256<float> a,
1619 Vec256<float> b) {
1620 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1621}
1622HWY_INLINE Mask256<double> Ge(hwy::FloatTag /*tag*/, Vec256<double> a,
1623 Vec256<double> b) {
1624 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1625}
1626
1627} // namespace detail
1628
1629template <typename T>
1630HWY_API Mask256<T> operator>=(Vec256<T> a, Vec256<T> b) {
1631 return detail::Ge(hwy::TypeTag<T>(), a, b);
1632}
1633
1634#endif // HWY_TARGET <= HWY_AVX3
1635
1636// ------------------------------ Reversed comparisons
1637
1638template <typename T>
1639HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
1640 return b > a;
1641}
1642
1643template <typename T>
1644HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
1645 return b >= a;
1646}
1647
1648// ------------------------------ Min (Gt, IfThenElse)
1649
1650// Unsigned
1651HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1652 return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1653}
1654HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
1655 const Vec256<uint16_t> b) {
1656 return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1657}
1658HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
1659 const Vec256<uint32_t> b) {
1660 return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1661}
1662HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
1663 const Vec256<uint64_t> b) {
1664#if HWY_TARGET <= HWY_AVX3
1665 return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1666#else
1667 const Full256<uint64_t> du;
1668 const Full256<int64_t> di;
1669 const auto msb = Set(du, 1ull << 63);
1670 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1671 return IfThenElse(gt, b, a);
1672#endif
1673}
1674
1675// Signed
1676HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1677 return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1678}
1679HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1680 return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1681}
1682HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1683 return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1684}
1685HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1686#if HWY_TARGET <= HWY_AVX3
1687 return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1688#else
1689 return IfThenElse(a < b, a, b);
1690#endif
1691}
1692
1693// Float
1694#if HWY_HAVE_FLOAT16
1695HWY_API Vec256<float16_t> Min(Vec256<float16_t> a, Vec256<float16_t> b) {
1696 return Vec256<float16_t>{_mm256_min_ph(a.raw, b.raw)};
1697}
1698#endif // HWY_HAVE_FLOAT16
1699HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
1700 return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1701}
1702HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
1703 return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
1704}
1705
1706// ------------------------------ Max (Gt, IfThenElse)
1707
1708// Unsigned
1709HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1710 return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1711}
1712HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
1713 const Vec256<uint16_t> b) {
1714 return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1715}
1716HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
1717 const Vec256<uint32_t> b) {
1718 return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1719}
1720HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
1721 const Vec256<uint64_t> b) {
1722#if HWY_TARGET <= HWY_AVX3
1723 return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1724#else
1725 const Full256<uint64_t> du;
1726 const Full256<int64_t> di;
1727 const auto msb = Set(du, 1ull << 63);
1728 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1729 return IfThenElse(gt, a, b);
1730#endif
1731}
1732
1733// Signed
1734HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1735 return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1736}
1737HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1738 return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1739}
1740HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1741 return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1742}
1743HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1744#if HWY_TARGET <= HWY_AVX3
1745 return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1746#else
1747 return IfThenElse(a < b, b, a);
1748#endif
1749}
1750
1751// Float
1752#if HWY_HAVE_FLOAT16
1753HWY_API Vec256<float16_t> Max(Vec256<float16_t> a, Vec256<float16_t> b) {
1754 return Vec256<float16_t>{_mm256_max_ph(a.raw, b.raw)};
1755}
1756#endif // HWY_HAVE_FLOAT16
1757HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
1758 return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1759}
1760HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
1761 return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
1762}
1763
1764// ------------------------------ Iota
1765
1766namespace detail {
1767
1768template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
1769HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1770 return VFromD<D>{_mm256_set_epi8(
1771 static_cast<char>(31), static_cast<char>(30), static_cast<char>(29),
1772 static_cast<char>(28), static_cast<char>(27), static_cast<char>(26),
1773 static_cast<char>(25), static_cast<char>(24), static_cast<char>(23),
1774 static_cast<char>(22), static_cast<char>(21), static_cast<char>(20),
1775 static_cast<char>(19), static_cast<char>(18), static_cast<char>(17),
1776 static_cast<char>(16), static_cast<char>(15), static_cast<char>(14),
1777 static_cast<char>(13), static_cast<char>(12), static_cast<char>(11),
1778 static_cast<char>(10), static_cast<char>(9), static_cast<char>(8),
1779 static_cast<char>(7), static_cast<char>(6), static_cast<char>(5),
1780 static_cast<char>(4), static_cast<char>(3), static_cast<char>(2),
1781 static_cast<char>(1), static_cast<char>(0))};
1782}
1783
1784template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
1785HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1786 return VFromD<D>{_mm256_set_epi16(
1787 int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, int16_t{11},
1788 int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, int16_t{5},
1789 int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})};
1790}
1791
1792#if HWY_HAVE_FLOAT16
1793template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
1794HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1795 return VFromD<D>{
1796 _mm256_set_ph(float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12},
1797 float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8},
1798 float16_t{7}, float16_t{6}, float16_t{5}, float16_t{4},
1799 float16_t{3}, float16_t{2}, float16_t{1}, float16_t{0})};
1800}
1801#endif // HWY_HAVE_FLOAT16
1802
1803template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
1804HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1805 return VFromD<D>{_mm256_set_epi32(int32_t{7}, int32_t{6}, int32_t{5},
1806 int32_t{4}, int32_t{3}, int32_t{2},
1807 int32_t{1}, int32_t{0})};
1808}
1809
1810template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
1811HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1812 return VFromD<D>{
1813 _mm256_set_epi64x(int64_t{3}, int64_t{2}, int64_t{1}, int64_t{0})};
1814}
1815
1816template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
1817HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1818 return VFromD<D>{
1819 _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)};
1820}
1821
1822template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
1823HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1824 return VFromD<D>{_mm256_set_pd(3.0, 2.0, 1.0, 0.0)};
1825}
1826
1827} // namespace detail
1828
1829template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
1830HWY_API VFromD<D> Iota(D d, const T2 first) {
1831 return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
1832}
1833
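// [Editorial example, not part of the original header] Iota(Full256<int32_t>(),
// 10) yields lanes {10,11,12,13,14,15,16,17}: lane 0 holds `first` and each
// subsequent lane is one larger.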
1834// ------------------------------ FirstN (Iota, Lt)
1835
1836template <class D, HWY_IF_V_SIZE_D(D, 32), class M = MFromD<D>>
1837HWY_API M FirstN(const D d, size_t n) {
1838 constexpr size_t kN = MaxLanes(d);
1839 // For AVX3, this ensures `n` <= 255 as required by bzhi, which only looks
1840 // at the lower 8 bits; for AVX2 and below, this ensures `n` fits in TI.
1841 n = HWY_MIN(n, kN);
1842
1843#if HWY_TARGET <= HWY_AVX3
1844#if HWY_ARCH_X86_64
1845 const uint64_t all = (1ull << kN) - 1;
1846 return M::FromBits(_bzhi_u64(all, n));
1847#else
1848 const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1);
1849 return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(n)));
1850#endif // HWY_ARCH_X86_64
1851#else
1852 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1853 using TI = TFromD<decltype(di)>;
1854 return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(n)));
1855#endif
1856}
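// Editorial sketch (hypothetical helper, not part of the original header):
// FirstN is typically combined with IfThenElseZero to ignore lanes at or
// beyond a runtime count `n`.
HWY_INLINE Vec256<int32_t> ExampleZeroTail256(Vec256<int32_t> v, size_t n) {
  const Full256<int32_t> d;
  return IfThenElseZero(FirstN(d, n), v);  // lanes >= n become zero
}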
1857
1858// ================================================== ARITHMETIC
1859
1860// ------------------------------ Addition
1861
1862// Unsigned
1863HWY_API Vec256<uint8_t> operator+(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1864  return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1865}
1875
1876// Signed
1877HWY_API Vec256<int8_t> operator+(Vec256<int8_t> a, Vec256<int8_t> b) {
1878  return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1879}
1880HWY_API Vec256<int16_t> operator+(Vec256<int16_t> a, Vec256<int16_t> b) {
1881  return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1882}
1883HWY_API Vec256<int32_t> operator+(Vec256<int32_t> a, Vec256<int32_t> b) {
1884  return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1885}
1886HWY_API Vec256<int64_t> operator+(Vec256<int64_t> a, Vec256<int64_t> b) {
1887  return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
1888}
1889
1890// Float
1891#if HWY_HAVE_FLOAT16
1892HWY_API Vec256<float16_t> operator+(Vec256<float16_t> a, Vec256<float16_t> b) {
1893 return Vec256<float16_t>{_mm256_add_ph(a.raw, b.raw)};
1894}
1895#endif // HWY_HAVE_FLOAT16
1896HWY_API Vec256<float> operator+(Vec256<float> a, Vec256<float> b) {
1897  return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1898}
1899HWY_API Vec256<double> operator+(Vec256<double> a, Vec256<double> b) {
1900  return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
1901}
1902
1903// ------------------------------ Subtraction
1904
1905// Unsigned
1906HWY_API Vec256<uint8_t> operator-(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1907  return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1908}
1918
1919// Signed
1920HWY_API Vec256<int8_t> operator-(Vec256<int8_t> a, Vec256<int8_t> b) {
1921  return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1922}
1923HWY_API Vec256<int16_t> operator-(Vec256<int16_t> a, Vec256<int16_t> b) {
1924  return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1925}
1926HWY_API Vec256<int32_t> operator-(Vec256<int32_t> a, Vec256<int32_t> b) {
1927  return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1928}
1929HWY_API Vec256<int64_t> operator-(Vec256<int64_t> a, Vec256<int64_t> b) {
1930  return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1931}
1932
1933// Float
1934#if HWY_HAVE_FLOAT16
1935HWY_API Vec256<float16_t> operator-(Vec256<float16_t> a, Vec256<float16_t> b) {
1936 return Vec256<float16_t>{_mm256_sub_ph(a.raw, b.raw)};
1937}
1938#endif // HWY_HAVE_FLOAT16
1939HWY_API Vec256<float> operator-(Vec256<float> a, Vec256<float> b) {
1940  return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1941}
1942HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
1943  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
1944}
1945
1946// ------------------------------ AddSub
1947
1948HWY_API Vec256<float> AddSub(Vec256<float> a, Vec256<float> b) {
1949  return Vec256<float>{_mm256_addsub_ps(a.raw, b.raw)};
1950}
1951HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
1952  return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
1953}
1954
1955// ------------------------------ SumsOf8
1956HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
1957 return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
1958}
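// Editorial sketch (not part of the original header): psadbw against zero sums
// each group of eight consecutive u8 lanes into the matching u64 lane, so a
// vector of all-ones bytes yields 8 in every u64 lane.
HWY_INLINE Vec256<uint64_t> ExampleSumsOf8AllOnes() {
  const Full256<uint8_t> d8;
  return SumsOf8(Set(d8, uint8_t{1}));  // every u64 lane equals 8
}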
1959
1963
1964// ------------------------------ SumsOf4
1965#if HWY_TARGET <= HWY_AVX3
1966namespace detail {
1967
1968HWY_INLINE Vec256<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
1969                                    hwy::SizeTag<1> /*lane_size_tag*/,
1970                                    Vec256<uint8_t> v) {
1971 const DFromV<decltype(v)> d;
1972
1973 // _mm256_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
1974 // zeroed out and the sums of the 4 consecutive lanes are already in the
1975 // even uint16_t lanes of the _mm256_maskz_dbsad_epu8 result.
1976 return Vec256<uint32_t>{_mm256_maskz_dbsad_epu8(
1977 static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
1978}
1979
1980// detail::SumsOf4 for Vec256<int8_t> on AVX3 is implemented in x86_512-inl.h
1981
1982} // namespace detail
1983#endif // HWY_TARGET <= HWY_AVX3
1984
1985// ------------------------------ SumsOfAdjQuadAbsDiff
1986
1987template <int kAOffset, int kBOffset>
1988HWY_API Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a,
1989                                              Vec256<uint8_t> b) {
1990 static_assert(0 <= kAOffset && kAOffset <= 1,
1991 "kAOffset must be between 0 and 1");
1992 static_assert(0 <= kBOffset && kBOffset <= 3,
1993 "kBOffset must be between 0 and 3");
1994 return Vec256<uint16_t>{_mm256_mpsadbw_epu8(
1995 a.raw, b.raw,
1996 (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};
1997}
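// Editorial note and sketch (not part of the original header): in the mpsadbw
// immediate, bit 2 and bits 1..0 select the a/b block offsets for the lower
// 128-bit half, while bit 5 and bits 4..3 do the same for the upper half, so
// both halves use the same kAOffset/kBOffset. For example, kAOffset=1,
// kBOffset=0 compares the 4 bytes of `a` starting at byte 4 (per half) with
// eight overlapping 4-byte windows of `b` starting at byte 0:
HWY_INLINE Vec256<uint16_t> ExampleSumsOfAdjQuadAbsDiff(Vec256<uint8_t> a,
                                                        Vec256<uint8_t> b) {
  return SumsOfAdjQuadAbsDiff<1, 0>(a, b);
}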
1998
1999// ------------------------------ SumsOfShuffledQuadAbsDiff
2000
2001#if HWY_TARGET <= HWY_AVX3
2002template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
2003HWY_API Vec256<uint16_t> SumsOfShuffledQuadAbsDiff(Vec256<uint8_t> a,
2004                                                   Vec256<uint8_t> b) {
2005 static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
2006 static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
2007 static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
2008 static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
2009 return Vec256<uint16_t>{
2010 _mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
2011}
2012#endif
2013
2014// ------------------------------ SaturatedAdd
2015
2016// Returns a + b clamped to the destination range.
2017
2018// Unsigned
2025
2026// Signed
2027HWY_API Vec256<int8_t> SaturatedAdd(Vec256<int8_t> a, Vec256<int8_t> b) {
2028  return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
2029}
2030HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
2031  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
2032}
2033
2034#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2035HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
2036  const DFromV<decltype(a)> d;
2037 const auto sum = a + b;
2038 const auto overflow_mask = MaskFromVec(
2039 Vec256<int32_t>{_mm256_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)});
2040 const auto i32_max = Set(d, LimitsMax<int32_t>());
2041 const Vec256<int32_t> overflow_result{_mm256_mask_ternarylogic_epi32(
2042 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
2043 return IfThenElse(overflow_mask, overflow_result, sum);
2044}
2045
2046HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
2047  const DFromV<decltype(a)> d;
2048 const auto sum = a + b;
2049 const auto overflow_mask = MaskFromVec(
2050 Vec256<int64_t>{_mm256_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)});
2051 const auto i64_max = Set(d, LimitsMax<int64_t>());
2052 const Vec256<int64_t> overflow_result{_mm256_mask_ternarylogic_epi64(
2053 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
2054 return IfThenElse(overflow_mask, overflow_result, sum);
2055}
2056#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2057
2058// ------------------------------ SaturatedSub
2059
2060// Returns a - b clamped to the destination range.
2061
2062// Unsigned
2069
2070// Signed
2071HWY_API Vec256<int8_t> SaturatedSub(Vec256<int8_t> a, Vec256<int8_t> b) {
2072  return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
2073}
2074HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
2075  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
2076}
2077
2078#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2079HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
2080  const DFromV<decltype(a)> d;
2081 const auto diff = a - b;
2082 const auto overflow_mask = MaskFromVec(
2083 Vec256<int32_t>{_mm256_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)});
2084 const auto i32_max = Set(d, LimitsMax<int32_t>());
2085 const Vec256<int32_t> overflow_result{_mm256_mask_ternarylogic_epi32(
2086 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
2087 return IfThenElse(overflow_mask, overflow_result, diff);
2088}
2089
2090HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
2091  const DFromV<decltype(a)> d;
2092 const auto diff = a - b;
2093 const auto overflow_mask = MaskFromVec(
2094 Vec256<int64_t>{_mm256_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)});
2095 const auto i64_max = Set(d, LimitsMax<int64_t>());
2096 const Vec256<int64_t> overflow_result{_mm256_mask_ternarylogic_epi64(
2097 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
2098 return IfThenElse(overflow_mask, overflow_result, diff);
2099}
2100#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2101
2102// ------------------------------ Average
2103
2104// Returns (a + b + 1) / 2
2105
2106// Unsigned
2113
2114// ------------------------------ Abs (Sub)
2115
2116// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
2117HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
2118#if HWY_COMPILER_MSVC
2119  // Workaround for apparently incorrect MSVC codegen (wrong result).
2120 const DFromV<decltype(v)> d;
2121 const auto zero = Zero(d);
2122 return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
2123#else
2124 return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
2125#endif
2126}
2127HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
2128  return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
2129}
2130HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
2131  return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
2132}
2133
2134#if HWY_TARGET <= HWY_AVX3
2135HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
2136  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
2137}
2138#endif
2139
2140// ------------------------------ Integer multiplication
2141
2142// Unsigned
2143HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
2144  return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
2145}
2146HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
2147  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
2148}
2149
2150// Signed
2151HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
2152  return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
2153}
2154HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
2155  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
2156}
2157
2158// Returns the upper 16 bits of a * b in each lane.
2159HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
2160  return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
2161}
2162HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
2163  return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
2164}
2165
2166HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
2167  return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
2168}
2169
2170// Multiplies even lanes (0, 2, ..) and places the lower half of the double-
2171// wide result in the even lane and the upper half in its odd neighbor lane.
2172HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
2173  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
2174}
2178
2179// ------------------------------ ShiftLeft
2180
2181#if HWY_TARGET <= HWY_AVX3_DL
2182namespace detail {
2183template <typename T>
2184HWY_INLINE Vec256<T> GaloisAffine(Vec256<T> v, Vec256<uint64_t> matrix) {
2185  return Vec256<T>{_mm256_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
2186}
2187} // namespace detail
2188#endif // HWY_TARGET <= HWY_AVX3_DL
2189
2190template <int kBits>
2191HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
2192  return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
2193}
2194
2195template <int kBits>
2196HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
2197  return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
2198}
2199
2200template <int kBits>
2201HWY_API Vec256<uint64_t> ShiftLeft(const Vec256<uint64_t> v) {
2202  return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
2203}
2204
2205template <int kBits>
2206HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
2207  return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
2208}
2209
2210template <int kBits>
2211HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
2212  return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
2213}
2214
2215template <int kBits>
2216HWY_API Vec256<int64_t> ShiftLeft(const Vec256<int64_t> v) {
2217  return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
2218}
2219
2220#if HWY_TARGET > HWY_AVX3_DL
2221
2222template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)>
2223HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
2224 const Full256<T> d8;
2225 const RepartitionToWide<decltype(d8)> d16;
2226 const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
2227 return kBits == 1
2228 ? (v + v)
2229 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
2230}
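// Editorial sketch (hypothetical helper): the 16-bit shift lets bits spill
// into the neighboring byte, and the mask removes them. For kBits == 3 the
// mask is (0xFF << 3) & 0xFF == 0xF8.
HWY_INLINE Vec256<uint8_t> ExampleShiftLeftU8By3(Vec256<uint8_t> v) {
  return ShiftLeft<3>(v);  // each byte becomes (byte << 3) & 0xF8
}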
2231
2232#endif // HWY_TARGET > HWY_AVX3_DL
2233
2234// ------------------------------ ShiftRight
2235
2236template <int kBits>
2237HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
2238  return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
2239}
2240
2241template <int kBits>
2242HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
2243  return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
2244}
2245
2246template <int kBits>
2247HWY_API Vec256<uint64_t> ShiftRight(const Vec256<uint64_t> v) {
2248  return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
2249}
2250
2251template <int kBits>
2252HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
2253  return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
2254}
2255
2256template <int kBits>
2257HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
2258  return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
2259}
2260
2261#if HWY_TARGET > HWY_AVX3_DL
2262
2263template <int kBits>
2264HWY_API Vec256<uint8_t> ShiftRight(Vec256<uint8_t> v) {
2265 const Full256<uint8_t> d8;
2266 // Use raw instead of BitCast to support N=1.
2267 const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
2268 return shifted & Set(d8, 0xFF >> kBits);
2269}
2270
2271template <int kBits>
2272HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
2273 const Full256<int8_t> di;
2274 const Full256<uint8_t> du;
2275 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2276 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
2277 return (shifted ^ shifted_sign) - shifted_sign;
2278}
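// Editorial sketch (not part of the original header): after the unsigned
// shift, the shifted-down sign bit sits at position 7 - kBits; XOR with
// 0x80 >> kBits followed by the subtraction propagates it into the upper kBits
// (sign extension). E.g. int8_t{-128} shifted right by 1 yields -64.
HWY_INLINE Vec256<int8_t> ExampleShiftRightI8By1(Vec256<int8_t> v) {
  return ShiftRight<1>(v);
}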
2279
2280#endif // HWY_TARGET > HWY_AVX3_DL
2281
2282// i64 is implemented after BroadcastSignBit.
2283
2284// ------------------------------ RotateRight
2285
2286// The U8 RotateRight implementation for AVX3_DL lives in x86_512-inl.h,
2287// because on AVX3_DL it is implemented via detail::GaloisAffine.
2288
2289#if HWY_TARGET > HWY_AVX3_DL
2290template <int kBits>
2291HWY_API Vec256<uint8_t> RotateRight(const Vec256<uint8_t> v) {
2292 static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
2293 if (kBits == 0) return v;
2294 // AVX3 does not support 8-bit.
2295 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
2296}
2297#endif
2298
2299template <int kBits>
2300HWY_API Vec256<uint16_t> RotateRight(const Vec256<uint16_t> v) {
2301  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
2302 if (kBits == 0) return v;
2303 // AVX3 does not support 16-bit.
2304 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
2305}
2306
2307template <int kBits>
2308HWY_API Vec256<uint32_t> RotateRight(const Vec256<uint32_t> v) {
2309  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
2310#if HWY_TARGET <= HWY_AVX3
2311 return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)};
2312#else
2313 if (kBits == 0) return v;
2314 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
2315#endif
2316}
2317
2318template <int kBits>
2319HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
2320  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
2321#if HWY_TARGET <= HWY_AVX3
2322 return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)};
2323#else
2324 if (kBits == 0) return v;
2325 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
2326#endif
2327}
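// Editorial sketch (hypothetical helper): without native rotate support,
// RotateRight<k> lowers to Or(ShiftRight<k>(v), ShiftLeft<width - k>(v)).
HWY_INLINE Vec256<uint32_t> ExampleRotateRightU32By8(Vec256<uint32_t> v) {
  return RotateRight<8>(v);  // 0xAABBCCDD in a lane becomes 0xDDAABBCC
}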
2328
2329// ------------------------------ Rol/Ror
2330#if HWY_TARGET <= HWY_AVX3
2331
2332template <class T, HWY_IF_UI32(T)>
2333HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
2334  return Vec256<T>{_mm256_rolv_epi32(a.raw, b.raw)};
2335}
2336
2337template <class T, HWY_IF_UI32(T)>
2338HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
2339  return Vec256<T>{_mm256_rorv_epi32(a.raw, b.raw)};
2340}
2341
2342template <class T, HWY_IF_UI64(T)>
2343HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
2344 return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};
2345}
2346
2347template <class T, HWY_IF_UI64(T)>
2348HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
2349 return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};
2350}
2351
2352#endif
2353
2354// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2355
2356HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
2357 const DFromV<decltype(v)> d;
2358 return VecFromMask(v < Zero(d));
2359}
2360
2361HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
2362  return ShiftRight<15>(v);
2363}
2364
2365HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
2366  return ShiftRight<31>(v);
2367}
2368
2369HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
2370#if HWY_TARGET == HWY_AVX2
2371 const DFromV<decltype(v)> d;
2372 return VecFromMask(v < Zero(d));
2373#else
2374 return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
2375#endif
2376}
2377
2378template <int kBits>
2379HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
2380#if HWY_TARGET <= HWY_AVX3
2381 return Vec256<int64_t>{
2382 _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
2383#else
2384 const Full256<int64_t> di;
2385 const Full256<uint64_t> du;
2386 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2387 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2388 return right | sign;
2389#endif
2390}
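// Editorial sketch (not part of the original header): the AVX2 fallback ORs
// the logically shifted value with the sign bits, pre-shifted into the top
// kBits, which matches an arithmetic shift.
HWY_INLINE Vec256<int64_t> ExampleShiftRightI64By4(Vec256<int64_t> v) {
  return ShiftRight<4>(v);  // e.g. -16 -> -1, 32 -> 2
}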
2391
2392// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
2393HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
2394                                          Vec256<int8_t> no) {
2395 // int8: AVX2 IfThenElse only looks at the MSB.
2396 return IfThenElse(MaskFromVec(v), yes, no);
2397}
2398
2399template <typename T, HWY_IF_T_SIZE(T, 2)>
2400HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
2401  static_assert(IsSigned<T>(), "Only works for signed/float");
2402
2403#if HWY_TARGET <= HWY_AVX3
2404 const auto mask = MaskFromVec(v);
2405#else
2406 // 16-bit: no native blendv on AVX2, so copy sign to lower byte's MSB.
2407 const DFromV<decltype(v)> d;
2408 const RebindToSigned<decltype(d)> di;
2409 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2410#endif
2411
2412 return IfThenElse(mask, yes, no);
2413}
2414
2415template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
2416HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
2417 static_assert(IsSigned<T>(), "Only works for signed/float");
2418
2419#if HWY_TARGET <= HWY_AVX3
2420  // No need to cast to float on AVX3: IfThenElse (via MaskFromVec) only
2421  // looks at the MSB of each lane there.
2422 return IfThenElse(MaskFromVec(v), yes, no);
2423#else
2424 const DFromV<decltype(v)> d;
2425 const RebindToFloat<decltype(d)> df;
2426 // 32/64-bit: use float IfThenElse, which only looks at the MSB.
2427 const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
2428 return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
2429#endif
2430}
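// Editorial sketch (hypothetical helper): selecting on the sign bit avoids a
// separate compare; e.g. an absolute value (except at LimitsMin) is:
HWY_INLINE Vec256<int32_t> ExampleAbsViaIfNegative(Vec256<int32_t> v) {
  const Full256<int32_t> d;
  return IfNegativeThenElse(v, Zero(d) - v, v);
}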
2431
2432// ------------------------------ IfNegativeThenNegOrUndefIfZero
2433
2434HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
2435 Vec256<int8_t> v) {
2436 return Vec256<int8_t>{_mm256_sign_epi8(v.raw, mask.raw)};
2437}
2438
2439HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
2440 Vec256<int16_t> v) {
2441 return Vec256<int16_t>{_mm256_sign_epi16(v.raw, mask.raw)};
2442}
2443
2444HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
2445 Vec256<int32_t> v) {
2446 return Vec256<int32_t>{_mm256_sign_epi32(v.raw, mask.raw)};
2447}
2448
2449// ------------------------------ ShiftLeftSame
2450
2451HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
2452 const int bits) {
2453#if HWY_COMPILER_GCC
2454 if (__builtin_constant_p(bits)) {
2455 return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, bits)};
2456 }
2457#endif
2458 return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2459}
2460HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
2461 const int bits) {
2462#if HWY_COMPILER_GCC
2463 if (__builtin_constant_p(bits)) {
2464 return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, bits)};
2465 }
2466#endif
2467 return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2468}
2469HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v,
2470 const int bits) {
2471#if HWY_COMPILER_GCC
2472 if (__builtin_constant_p(bits)) {
2473 return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, bits)};
2474 }
2475#endif
2476 return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2477}
2478
2479HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
2480#if HWY_COMPILER_GCC
2481 if (__builtin_constant_p(bits)) {
2482 return Vec256<int16_t>{_mm256_slli_epi16(v.raw, bits)};
2483 }
2484#endif
2485 return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2486}
2487
2488HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
2489#if HWY_COMPILER_GCC
2490 if (__builtin_constant_p(bits)) {
2491 return Vec256<int32_t>{_mm256_slli_epi32(v.raw, bits)};
2492 }
2493#endif
2494 return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2495}
2496
2497HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) {
2498#if HWY_COMPILER_GCC
2499 if (__builtin_constant_p(bits)) {
2500 return Vec256<int64_t>{_mm256_slli_epi64(v.raw, bits)};
2501 }
2502#endif
2503 return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2504}
2505
2506template <typename T, HWY_IF_T_SIZE(T, 1)>
2507HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
2508 const Full256<T> d8;
2509 const RepartitionToWide<decltype(d8)> d16;
2510 const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
2511 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2512}
2513
2514// ------------------------------ ShiftRightSame (BroadcastSignBit)
2515
2516HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
2517                                        const int bits) {
2518#if HWY_COMPILER_GCC
2519 if (__builtin_constant_p(bits)) {
2520 return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, bits)};
2521 }
2522#endif
2523 return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2524}
2525HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
2526                                        const int bits) {
2527#if HWY_COMPILER_GCC
2528 if (__builtin_constant_p(bits)) {
2529 return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, bits)};
2530 }
2531#endif
2532 return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2533}
2534HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
2535                                        const int bits) {
2536#if HWY_COMPILER_GCC
2537 if (__builtin_constant_p(bits)) {
2538 return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, bits)};
2539 }
2540#endif
2541 return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2542}
2543
2544HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
2545  const Full256<uint8_t> d8;
2546 const RepartitionToWide<decltype(d8)> d16;
2547 const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
2548 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2549}
2550
2551HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
2552                                       const int bits) {
2553#if HWY_COMPILER_GCC
2554 if (__builtin_constant_p(bits)) {
2555 return Vec256<int16_t>{_mm256_srai_epi16(v.raw, bits)};
2556 }
2557#endif
2558 return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2559}
2560
2561HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
2562                                       const int bits) {
2563#if HWY_COMPILER_GCC
2564 if (__builtin_constant_p(bits)) {
2565 return Vec256<int32_t>{_mm256_srai_epi32(v.raw, bits)};
2566 }
2567#endif
2568 return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2569}
2570HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
2571                                       const int bits) {
2572#if HWY_TARGET <= HWY_AVX3
2573#if HWY_COMPILER_GCC
2574 if (__builtin_constant_p(bits)) {
2575 return Vec256<int64_t>{
2576 _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(bits))};
2577 }
2578#endif
2579 return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2580#else
2581 const Full256<int64_t> di;
2582 const Full256<uint64_t> du;
2583 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2584 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2585 return right | sign;
2586#endif
2587}
2588
2589HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
2590  const Full256<int8_t> di;
2591 const Full256<uint8_t> du;
2592 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2593 const auto shifted_sign =
2594 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2595 return (shifted ^ shifted_sign) - shifted_sign;
2596}
2597
2598// ------------------------------ Neg (Xor, Sub)
2599
2600// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
2601namespace detail {
2602
2603template <typename T>
2604HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
2605  const DFromV<decltype(v)> d;
2606 return Xor(v, SignBit(d));
2607}
2608
2609template <typename T>
2610HWY_INLINE Vec256<T> Neg(hwy::SpecialTag /*tag*/, const Vec256<T> v) {
2611  const DFromV<decltype(v)> d;
2612 return Xor(v, SignBit(d));
2613}
2614
2615// Not floating-point
2616template <typename T>
2617HWY_INLINE Vec256<T> Neg(hwy::NonFloatTag /*tag*/, const Vec256<T> v) {
2618  const DFromV<decltype(v)> d;
2619 return Zero(d) - v;
2620}
2621
2622} // namespace detail
2623
2624template <typename T>
2625HWY_API Vec256<T> Neg(const Vec256<T> v) {
2626 return detail::Neg(hwy::TypeTag<T>(), v);
2627}
2628
2629// ------------------------------ Floating-point mul / div
2630
2631#if HWY_HAVE_FLOAT16
2632HWY_API Vec256<float16_t> operator*(Vec256<float16_t> a, Vec256<float16_t> b) {
2633 return Vec256<float16_t>{_mm256_mul_ph(a.raw, b.raw)};
2634}
2635#endif // HWY_HAVE_FLOAT16
2636HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
2637  return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
2638}
2639HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) {
2640  return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
2641}
2642
2643#if HWY_HAVE_FLOAT16
2644HWY_API Vec256<float16_t> operator/(Vec256<float16_t> a, Vec256<float16_t> b) {
2645 return Vec256<float16_t>{_mm256_div_ph(a.raw, b.raw)};
2646}
2647#endif // HWY_HAVE_FLOAT16
2648HWY_API Vec256<float> operator/(Vec256<float> a, Vec256<float> b) {
2649  return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
2650}
2651HWY_API Vec256<double> operator/(Vec256<double> a, Vec256<double> b) {
2652  return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
2653}
2654
2655// Approximate reciprocal
2656#if HWY_HAVE_FLOAT16
2657HWY_API Vec256<float16_t> ApproximateReciprocal(Vec256<float16_t> v) {
2658 return Vec256<float16_t>{_mm256_rcp_ph(v.raw)};
2659}
2660#endif // HWY_HAVE_FLOAT16
2661
2662HWY_API Vec256<float> ApproximateReciprocal(Vec256<float> v) {
2663 return Vec256<float>{_mm256_rcp_ps(v.raw)};
2664}
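// Editorial sketch (not part of the original header): the approximate
// reciprocal estimate is commonly refined with one Newton-Raphson step,
// x1 = x0 * (2 - v * x0).
HWY_INLINE Vec256<float> ExampleReciprocalNewton(Vec256<float> v) {
  const Full256<float> d;
  const Vec256<float> x0 = ApproximateReciprocal(v);
  return x0 * (Set(d, 2.0f) - v * x0);
}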
2665
2666#if HWY_TARGET <= HWY_AVX3
2670#endif
2671
2672// ------------------------------ MaskedMinOr
2673
2674#if HWY_TARGET <= HWY_AVX3
2675
2676template <typename T, HWY_IF_U8(T)>
2677HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2678                              Vec256<T> b) {
2679 return Vec256<T>{_mm256_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
2680}
2681template <typename T, HWY_IF_I8(T)>
2682HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2683 Vec256<T> b) {
2684 return Vec256<T>{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
2685}
2686
2687template <typename T, HWY_IF_U16(T)>
2688HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2689 Vec256<T> b) {
2690 return Vec256<T>{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
2691}
2692template <typename T, HWY_IF_I16(T)>
2693HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2694 Vec256<T> b) {
2695 return Vec256<T>{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
2696}
2697
2698template <typename T, HWY_IF_U32(T)>
2699HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2700 Vec256<T> b) {
2701 return Vec256<T>{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
2702}
2703template <typename T, HWY_IF_I32(T)>
2704HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2705 Vec256<T> b) {
2706 return Vec256<T>{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
2707}
2708
2709template <typename T, HWY_IF_U64(T)>
2710HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2711 Vec256<T> b) {
2712 return Vec256<T>{_mm256_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
2713}
2714template <typename T, HWY_IF_I64(T)>
2715HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2716 Vec256<T> b) {
2717 return Vec256<T>{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
2718}
2719
2720template <typename T, HWY_IF_F32(T)>
2721HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2722 Vec256<T> b) {
2723 return Vec256<T>{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
2724}
2725
2726template <typename T, HWY_IF_F64(T)>
2727HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2728 Vec256<T> b) {
2729 return Vec256<T>{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
2730}
2731
2732#if HWY_HAVE_FLOAT16
2733template <typename T, HWY_IF_F16(T)>
2734HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2735 Vec256<T> b) {
2736 return Vec256<T>{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
2737}
2738#endif // HWY_HAVE_FLOAT16
2739
2740// ------------------------------ MaskedMaxOr
2741
2742template <typename T, HWY_IF_U8(T)>
2743HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2744                              Vec256<T> b) {
2745 return Vec256<T>{_mm256_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
2746}
2747template <typename T, HWY_IF_I8(T)>
2748HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2749 Vec256<T> b) {
2750 return Vec256<T>{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
2751}
2752
2753template <typename T, HWY_IF_U16(T)>
2754HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2755 Vec256<T> b) {
2756 return Vec256<T>{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
2757}
2758template <typename T, HWY_IF_I16(T)>
2759HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2760 Vec256<T> b) {
2761 return Vec256<T>{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
2762}
2763
2764template <typename T, HWY_IF_U32(T)>
2765HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2766 Vec256<T> b) {
2767 return Vec256<T>{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
2768}
2769template <typename T, HWY_IF_I32(T)>
2770HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2771 Vec256<T> b) {
2772 return Vec256<T>{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
2773}
2774
2775template <typename T, HWY_IF_U64(T)>
2776HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2777 Vec256<T> b) {
2778 return Vec256<T>{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
2779}
2780template <typename T, HWY_IF_I64(T)>
2781HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2782 Vec256<T> b) {
2783 return Vec256<T>{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
2784}
2785
2786template <typename T, HWY_IF_F32(T)>
2787HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2788 Vec256<T> b) {
2789 return Vec256<T>{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
2790}
2791
2792template <typename T, HWY_IF_F64(T)>
2793HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2794 Vec256<T> b) {
2795 return Vec256<T>{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
2796}
2797
2798#if HWY_HAVE_FLOAT16
2799template <typename T, HWY_IF_F16(T)>
2800HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2801 Vec256<T> b) {
2802 return Vec256<T>{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
2803}
2804#endif // HWY_HAVE_FLOAT16
2805
2806// ------------------------------ MaskedAddOr
2807
2808template <typename T, HWY_IF_UI8(T)>
2809HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2810                              Vec256<T> b) {
2811 return Vec256<T>{_mm256_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
2812}
2813
2814template <typename T, HWY_IF_UI16(T)>
2815HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2816 Vec256<T> b) {
2817 return Vec256<T>{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
2818}
2819
2820template <typename T, HWY_IF_UI32(T)>
2821HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2822 Vec256<T> b) {
2823 return Vec256<T>{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
2824}
2825
2826template <typename T, HWY_IF_UI64(T)>
2827HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2828 Vec256<T> b) {
2829 return Vec256<T>{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
2830}
2831
2832template <typename T, HWY_IF_F32(T)>
2833HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2834 Vec256<T> b) {
2835 return Vec256<T>{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
2836}
2837
2838template <typename T, HWY_IF_F64(T)>
2839HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2840 Vec256<T> b) {
2841 return Vec256<T>{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
2842}
2843
2844#if HWY_HAVE_FLOAT16
2845template <typename T, HWY_IF_F16(T)>
2846HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2847 Vec256<T> b) {
2848 return Vec256<T>{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
2849}
2850#endif // HWY_HAVE_FLOAT16
2851
2852// ------------------------------ MaskedSubOr
2853
2854template <typename T, HWY_IF_UI8(T)>
2855HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2856                              Vec256<T> b) {
2857 return Vec256<T>{_mm256_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
2858}
2859
2860template <typename T, HWY_IF_UI16(T)>
2861HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2862 Vec256<T> b) {
2863 return Vec256<T>{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
2864}
2865
2866template <typename T, HWY_IF_UI32(T)>
2867HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2868 Vec256<T> b) {
2869 return Vec256<T>{_mm256_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
2870}
2871
2872template <typename T, HWY_IF_UI64(T)>
2873HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2874 Vec256<T> b) {
2875 return Vec256<T>{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
2876}
2877
2878template <typename T, HWY_IF_F32(T)>
2879HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2880 Vec256<T> b) {
2881 return Vec256<T>{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
2882}
2883
2884template <typename T, HWY_IF_F64(T)>
2885HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2886 Vec256<T> b) {
2887 return Vec256<T>{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
2888}
2889
2890#if HWY_HAVE_FLOAT16
2891template <typename T, HWY_IF_F16(T)>
2892HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2893 Vec256<T> b) {
2894 return Vec256<T>{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
2895}
2896#endif // HWY_HAVE_FLOAT16
2897
2898// ------------------------------ MaskedMulOr
2899
2900HWY_API Vec256<float> MaskedMulOr(Vec256<float> no, Mask256<float> m,
2901                                  Vec256<float> a, Vec256<float> b) {
2902  return Vec256<float>{_mm256_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
2903}
2904
2905HWY_API Vec256<double> MaskedMulOr(Vec256<double> no, Mask256<double> m,
2906                                   Vec256<double> a, Vec256<double> b) {
2907  return Vec256<double>{_mm256_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
2908}
2909
2910#if HWY_HAVE_FLOAT16
2911HWY_API Vec256<float16_t> MaskedMulOr(Vec256<float16_t> no,
2912 Mask256<float16_t> m, Vec256<float16_t> a,
2913 Vec256<float16_t> b) {
2914 return Vec256<float16_t>{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
2915}
2916#endif // HWY_HAVE_FLOAT16
2917
2918// ------------------------------ MaskedDivOr
2919
2920HWY_API Vec256<float> MaskedDivOr(Vec256<float> no, Mask256<float> m,
2921                                  Vec256<float> a, Vec256<float> b) {
2922  return Vec256<float>{_mm256_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
2923}
2924
2925HWY_API Vec256<double> MaskedDivOr(Vec256<double> no, Mask256<double> m,
2926                                   Vec256<double> a, Vec256<double> b) {
2927  return Vec256<double>{_mm256_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
2928}
2929
2930#if HWY_HAVE_FLOAT16
2931HWY_API Vec256<float16_t> MaskedDivOr(Vec256<float16_t> no,
2932 Mask256<float16_t> m, Vec256<float16_t> a,
2933 Vec256<float16_t> b) {
2934 return Vec256<float16_t>{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
2935}
2936#endif // HWY_HAVE_FLOAT16
2937
2938// ------------------------------ MaskedSatAddOr
2939
2940template <typename T, HWY_IF_I8(T)>
2941HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2942                                 Vec256<T> b) {
2943 return Vec256<T>{_mm256_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
2944}
2945
2946template <typename T, HWY_IF_U8(T)>
2947HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2948 Vec256<T> b) {
2949 return Vec256<T>{_mm256_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
2950}
2951
2952template <typename T, HWY_IF_I16(T)>
2953HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2954 Vec256<T> b) {
2955 return Vec256<T>{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
2956}
2957
2958template <typename T, HWY_IF_U16(T)>
2959HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2960 Vec256<T> b) {
2961 return Vec256<T>{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
2962}
2963
2964// ------------------------------ MaskedSatSubOr
2965
2966template <typename T, HWY_IF_I8(T)>
2967HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2968                                 Vec256<T> b) {
2969 return Vec256<T>{_mm256_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
2970}
2971
2972template <typename T, HWY_IF_U8(T)>
2973HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2974 Vec256<T> b) {
2975 return Vec256<T>{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
2976}
2977
2978template <typename T, HWY_IF_I16(T)>
2979HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2980 Vec256<T> b) {
2981 return Vec256<T>{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
2982}
2983
2984template <typename T, HWY_IF_U16(T)>
2985HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2986 Vec256<T> b) {
2987 return Vec256<T>{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
2988}
2989
2990#endif // HWY_TARGET <= HWY_AVX3
2991
2992// ------------------------------ Floating-point multiply-add variants
2993
2994#if HWY_HAVE_FLOAT16
2995
2996HWY_API Vec256<float16_t> MulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
2997 Vec256<float16_t> add) {
2998 return Vec256<float16_t>{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)};
2999}
3000
3001HWY_API Vec256<float16_t> NegMulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
3002 Vec256<float16_t> add) {
3003 return Vec256<float16_t>{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)};
3004}
3005
3006HWY_API Vec256<float16_t> MulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
3007 Vec256<float16_t> sub) {
3008 return Vec256<float16_t>{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)};
3009}
3010
3011HWY_API Vec256<float16_t> NegMulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
3012 Vec256<float16_t> sub) {
3013 return Vec256<float16_t>{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)};
3014}
3015
3016#endif // HWY_HAVE_FLOAT16
3017
3018HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
3019 Vec256<float> add) {
3020#ifdef HWY_DISABLE_BMI2_FMA
3021 return mul * x + add;
3022#else
3023 return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
3024#endif
3025}
3026HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
3027                              Vec256<double> add) {
3028#ifdef HWY_DISABLE_BMI2_FMA
3029 return mul * x + add;
3030#else
3031 return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
3032#endif
3033}
3034
3035HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
3036 Vec256<float> add) {
3037#ifdef HWY_DISABLE_BMI2_FMA
3038 return add - mul * x;
3039#else
3040 return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
3041#endif
3042}
3043HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
3044                                 Vec256<double> add) {
3045#ifdef HWY_DISABLE_BMI2_FMA
3046 return add - mul * x;
3047#else
3048 return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
3049#endif
3050}
3051
3052HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
3053 Vec256<float> sub) {
3054#ifdef HWY_DISABLE_BMI2_FMA
3055 return mul * x - sub;
3056#else
3057 return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
3058#endif
3059}
3060HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
3061                              Vec256<double> sub) {
3062#ifdef HWY_DISABLE_BMI2_FMA
3063 return mul * x - sub;
3064#else
3065 return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
3066#endif
3067}
3068
3069HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
3070 Vec256<float> sub) {
3071#ifdef HWY_DISABLE_BMI2_FMA
3072 return Neg(mul * x) - sub;
3073#else
3074 return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
3075#endif
3076}
3077HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
3078                                 Vec256<double> sub) {
3079#ifdef HWY_DISABLE_BMI2_FMA
3080 return Neg(mul * x) - sub;
3081#else
3082 return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
3083#endif
3084}
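// Editorial sketch (hypothetical helper): MulAdd is the building block for
// Horner-style polynomial evaluation, here (c2 * x + c1) * x + c0.
HWY_INLINE Vec256<float> ExampleHorner(Vec256<float> x, Vec256<float> c2,
                                       Vec256<float> c1, Vec256<float> c0) {
  return MulAdd(MulAdd(c2, x, c1), x, c0);
}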
3085
3086#if HWY_HAVE_FLOAT16
3087HWY_API Vec256<float16_t> MulAddSub(Vec256<float16_t> mul, Vec256<float16_t> x,
3088 Vec256<float16_t> sub_or_add) {
3089 return Vec256<float16_t>{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
3090}
3091#endif // HWY_HAVE_FLOAT16
3092
3093HWY_API Vec256<float> MulAddSub(Vec256<float> mul, Vec256<float> x,
3094                                Vec256<float> sub_or_add) {
3095#ifdef HWY_DISABLE_BMI2_FMA
3096 return AddSub(mul * x, sub_or_add);
3097#else
3098 return Vec256<float>{_mm256_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
3099#endif
3100}
3101
3102HWY_API Vec256<double> MulAddSub(Vec256<double> mul, Vec256<double> x,
3103                                 Vec256<double> sub_or_add) {
3104#ifdef HWY_DISABLE_BMI2_FMA
3105 return AddSub(mul * x, sub_or_add);
3106#else
3107 return Vec256<double>{_mm256_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
3108#endif
3109}
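// Editorial note (not in the original header): MulAddSub subtracts sub_or_add
// in even lanes (0, 2, ...) and adds it in odd lanes, matching vfmaddsub; this
// alternation is the usual core of interleaved complex multiplication.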
3110
3111// ------------------------------ Floating-point square root
3112
3113// Full precision square root
3114#if HWY_HAVE_FLOAT16
3115HWY_API Vec256<float16_t> Sqrt(Vec256<float16_t> v) {
3116 return Vec256<float16_t>{_mm256_sqrt_ph(v.raw)};
3117}
3118#endif // HWY_HAVE_FLOAT16
3119HWY_API Vec256<float> Sqrt(Vec256<float> v) {
3120  return Vec256<float>{_mm256_sqrt_ps(v.raw)};
3121}
3122HWY_API Vec256<double> Sqrt(Vec256<double> v) {
3123  return Vec256<double>{_mm256_sqrt_pd(v.raw)};
3124}
3125
3126// Approximate reciprocal square root
3127#if HWY_HAVE_FLOAT16
3128HWY_API Vec256<float16_t> ApproximateReciprocalSqrt(Vec256<float16_t> v) {
3129 return Vec256<float16_t>{_mm256_rsqrt_ph(v.raw)};
3130}
3131#endif
3132HWY_API Vec256<float> ApproximateReciprocalSqrt(Vec256<float> v) {
3133 return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
3134}
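// Editorial sketch (not part of the original header): as with the reciprocal,
// one Newton-Raphson step refines the rsqrt estimate:
// x1 = x0 * (1.5 - 0.5 * v * x0 * x0).
HWY_INLINE Vec256<float> ExampleReciprocalSqrtNewton(Vec256<float> v) {
  const Full256<float> d;
  const Vec256<float> x0 = ApproximateReciprocalSqrt(v);
  return x0 * (Set(d, 1.5f) - Set(d, 0.5f) * v * x0 * x0);
}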
3135
3136#if HWY_TARGET <= HWY_AVX3
3137HWY_API Vec256<double> ApproximateReciprocalSqrt(Vec256<double> v) {
3138#if HWY_COMPILER_MSVC
3139 const DFromV<decltype(v)> d;
3140 return Vec256<double>{_mm256_mask_rsqrt14_pd(
3141 Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)};
3142#else
3143 return Vec256<double>{_mm256_rsqrt14_pd(v.raw)};
3144#endif
3145}
3146#endif
3147
3148// ------------------------------ Floating-point rounding
3149
3150// Toward nearest integer, tie to even
3151#if HWY_HAVE_FLOAT16
3152HWY_API Vec256<float16_t> Round(Vec256<float16_t> v) {
3153 return Vec256<float16_t>{_mm256_roundscale_ph(
3154 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
3155}
3156#endif // HWY_HAVE_FLOAT16
3157HWY_API Vec256<float> Round(Vec256<float> v) {
3158 return Vec256<float>{
3159 _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
3160}
3161HWY_API Vec256<double> Round(Vec256<double> v) {
3162  return Vec256<double>{
3163 _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
3164}
3165
3166// Toward zero, aka truncate
3167#if HWY_HAVE_FLOAT16
3168HWY_API Vec256<float16_t> Trunc(Vec256<float16_t> v) {
3169 return Vec256<float16_t>{
3170 _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
3171}
3172#endif // HWY_HAVE_FLOAT16
3173HWY_API Vec256<float> Trunc(Vec256<float> v) {
3174 return Vec256<float>{
3175 _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
3176}
3177HWY_API Vec256<double> Trunc(Vec256<double> v) {
3178  return Vec256<double>{
3179 _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
3180}
3181
3182// Toward +infinity, aka ceiling
3183#if HWY_HAVE_FLOAT16
3184HWY_API Vec256<float16_t> Ceil(Vec256<float16_t> v) {
3185 return Vec256<float16_t>{
3186 _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
3187}
3188#endif // HWY_HAVE_FLOAT16
3189HWY_API Vec256<float> Ceil(Vec256<float> v) {
3190 return Vec256<float>{
3191 _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
3192}
3193HWY_API Vec256<double> Ceil(Vec256<double> v) {
3194  return Vec256<double>{
3195 _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
3196}
3197
3198// Toward -infinity, aka floor
3199#if HWY_HAVE_FLOAT16
3200HWY_API Vec256<float16_t> Floor(Vec256<float16_t> v) {
3201 return Vec256<float16_t>{
3202 _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
3203}
3204#endif // HWY_HAVE_FLOAT16
3205HWY_API Vec256<float> Floor(Vec256<float> v) {
3206 return Vec256<float>{
3207 _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
3208}
3209HWY_API Vec256<double> Floor(Vec256<double> v) {
3210  return Vec256<double>{
3211 _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
3212}
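// Editorial note (not in the original header): Round uses the ties-to-even
// rule, so 2.5 rounds to 2.0 and 3.5 rounds to 4.0, whereas Trunc always moves
// toward zero and Ceil/Floor toward +/- infinity.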
3213
3214// ------------------------------ Floating-point classification
3215
3216#if HWY_HAVE_FLOAT16 || HWY_IDE
3217
3218HWY_API Mask256<float16_t> IsNaN(Vec256<float16_t> v) {
3219  return Mask256<float16_t>{_mm256_fpclass_ph_mask(
3220      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3221}
3222
3223HWY_API Mask256<float16_t> IsEitherNaN(Vec256<float16_t> a,
3224 Vec256<float16_t> b) {
3225 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3226 HWY_DIAGNOSTICS(push)
3227 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3228 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3229 HWY_DIAGNOSTICS(pop)
3230}
3231
3232HWY_API Mask256<float16_t> IsInf(Vec256<float16_t> v) {
3233  return Mask256<float16_t>{_mm256_fpclass_ph_mask(
3234      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
3235}
3236
3237HWY_API Mask256<float16_t> IsFinite(Vec256<float16_t> v) {
3238 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
3239 // and negate the mask.
3240  return Not(Mask256<float16_t>{_mm256_fpclass_ph_mask(
3241      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
3242                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
3243}
3244
3245#endif // HWY_HAVE_FLOAT16
3246
3247HWY_API Mask256<float> IsNaN(Vec256<float> v) {
3248#if HWY_TARGET <= HWY_AVX3
3249  return Mask256<float>{_mm256_fpclass_ps_mask(
3250      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3251#else
3252 return Mask256<float>{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)};
3253#endif
3254}
3255HWY_API Mask256<double> IsNaN(Vec256<double> v) {
3256#if HWY_TARGET <= HWY_AVX3
3257  return Mask256<double>{_mm256_fpclass_pd_mask(
3258      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3259#else
3260 return Mask256<double>{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)};
3261#endif
3262}
3263
3264HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
3265#if HWY_TARGET <= HWY_AVX3
3266 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3267#else
3268 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_UNORD_Q)};
3269#endif
3270}
3271
3272HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
3273#if HWY_TARGET <= HWY_AVX3
3274 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3275#else
3276 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_UNORD_Q)};
3277#endif
3278}
3279
3280#if HWY_TARGET <= HWY_AVX3
3281
3290
3291HWY_API Mask256<float> IsFinite(Vec256<float> v) {
3292  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
3293  // and negate the mask.
3294  return Not(Mask256<float>{_mm256_fpclass_ps_mask(
3295      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
3296                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
3297}
3303
3304#endif // HWY_TARGET <= HWY_AVX3
3305
3306// ================================================== MEMORY
3307
3308// ------------------------------ Load
3309
3310template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3311HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
3312 return VFromD<D>{
3313 _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
3314}
3315// bfloat16_t is handled by x86_128-inl.h.
3316#if HWY_HAVE_FLOAT16
3317template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3318HWY_API Vec256<float16_t> Load(D /* tag */,
3319 const float16_t* HWY_RESTRICT aligned) {
3320 return Vec256<float16_t>{_mm256_load_ph(aligned)};
3321}
3322#endif
3323template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3324HWY_API Vec256<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
3325 return Vec256<float>{_mm256_load_ps(aligned)};
3326}
3327template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3328HWY_API Vec256<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) {
3329 return Vec256<double>{_mm256_load_pd(aligned)};
3330}
3331
3332template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3333HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
3334 return VFromD<D>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
3335}
3336// bfloat16_t is handled by x86_128-inl.h.
3337#if HWY_HAVE_FLOAT16
3338template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3339HWY_API Vec256<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
3340 return Vec256<float16_t>{_mm256_loadu_ph(p)};
3341}
3342#endif
3343template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3344HWY_API Vec256<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
3345 return Vec256<float>{_mm256_loadu_ps(p)};
3346}
3347template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3348HWY_API Vec256<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
3349 return Vec256<double>{_mm256_loadu_pd(p)};
3350}
3351
3352// ------------------------------ MaskedLoad
3353
3354#if HWY_TARGET <= HWY_AVX3
3355
3356template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
3357HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3358 const TFromD<D>* HWY_RESTRICT p) {
3359 return VFromD<D>{_mm256_maskz_loadu_epi8(m.raw, p)};
3360}
3361
3362template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
3363HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
3364 const TFromD<D>* HWY_RESTRICT p) {
3365 const RebindToUnsigned<decltype(d)> du; // for float16_t
3366 return BitCast(d, VFromD<decltype(du)>{_mm256_maskz_loadu_epi16(m.raw, p)});
3367}
3368
3369template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3370HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3371 const TFromD<D>* HWY_RESTRICT p) {
3372 return VFromD<D>{_mm256_maskz_loadu_epi32(m.raw, p)};
3373}
3374
3375template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3376HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3377 const TFromD<D>* HWY_RESTRICT p) {
3378 return VFromD<D>{_mm256_maskz_loadu_epi64(m.raw, p)};
3379}
3380
3381template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3382HWY_API Vec256<float> MaskedLoad(MFromD<D> m, D /* tag */,
3383                                 const float* HWY_RESTRICT p) {
3384 return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)};
3385}
3386
3387template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3388HWY_API Vec256<double> MaskedLoad(MFromD<D> m, D /* tag */,
3389                                  const double* HWY_RESTRICT p) {
3390 return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)};
3391}
3392
3393template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
3394HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3395 const TFromD<D>* HWY_RESTRICT p) {
3396 return VFromD<D>{_mm256_mask_loadu_epi8(v.raw, m.raw, p)};
3397}
3398
3399template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
3400HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
3401 const TFromD<D>* HWY_RESTRICT p) {
3402 const RebindToUnsigned<decltype(d)> du; // for float16_t
3403 return BitCast(d, VFromD<decltype(du)>{
3404 _mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
3405}
3406
3407template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3408HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3409 const TFromD<D>* HWY_RESTRICT p) {
3410 return VFromD<D>{_mm256_mask_loadu_epi32(v.raw, m.raw, p)};
3411}
3412
3413template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3414HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3415 const TFromD<D>* HWY_RESTRICT p) {
3416 return VFromD<D>{_mm256_mask_loadu_epi64(v.raw, m.raw, p)};
3417}
3418
3419template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3420HWY_API Vec256<float> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3421                                   const float* HWY_RESTRICT p) {
3422 return Vec256<float>{_mm256_mask_loadu_ps(v.raw, m.raw, p)};
3423}
3424
3425template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3426HWY_API Vec256<double> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3427                                    const double* HWY_RESTRICT p) {
3428 return Vec256<double>{_mm256_mask_loadu_pd(v.raw, m.raw, p)};
3429}
3430
3431#else // AVX2
3432
3433// There is no maskload_epi8/16, so blend instead.
3434template <class D, HWY_IF_V_SIZE_D(D, 32),
3435 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3436HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
3437 const TFromD<D>* HWY_RESTRICT p) {
3438 return IfThenElseZero(m, LoadU(d, p));
3439}
3440
3441template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3442HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3443 const TFromD<D>* HWY_RESTRICT p) {
3444 auto pi = reinterpret_cast<const int*>(p); // NOLINT
3445 return VFromD<D>{_mm256_maskload_epi32(pi, m.raw)};
3446}
3447
3448template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3449HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3450 const TFromD<D>* HWY_RESTRICT p) {
3451 auto pi = reinterpret_cast<const long long*>(p); // NOLINT
3452 return VFromD<D>{_mm256_maskload_epi64(pi, m.raw)};
3453}
3454
3455template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3456HWY_API Vec256<float> MaskedLoad(Mask256<float> m, D d,
3457 const float* HWY_RESTRICT p) {
3458 const Vec256<int32_t> mi =
3459 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
3460 return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
3461}
3462
3463template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3464HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D d,
3465 const double* HWY_RESTRICT p) {
3466 const Vec256<int64_t> mi =
3467 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
3468 return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
3469}
3470
3471#endif
3472
3473// ------------------------------ LoadDup128
3474
3475// Loads 128 bits and duplicates them into both 128-bit halves. This avoids
3476// the 3-cycle cost of moving data between halves and also avoids port 5.
3477template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
3478HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
3479  const RebindToUnsigned<decltype(d)> du;
3480 const Full128<TFromD<D>> d128;
3481 const RebindToUnsigned<decltype(d128)> du128;
3482 const __m128i v128 = BitCast(du128, LoadU(d128, p)).raw;
3483#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
3484 // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
3485 // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
3486 // upper half undefined) is fine because we're overwriting that anyway.
3487 // This workaround seems in turn to generate incorrect code in MSVC 2022
3488 // (19.31), so use broadcastsi128 there.
3489 return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
3490 _mm256_castsi128_si256(v128), v128, 1)});
3491#else
3492 // The preferred path. This is perhaps surprising, because vbroadcasti128
3493 // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to
3494 // pattern-match this to vbroadcastf128 with a memory operand as desired.
3495 return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastsi128_si256(v128)});
3496#endif
3497}
3498template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3499HWY_API Vec256<float> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
3500#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
3501 const Full128<float> d128;
3502 const __m128 v128 = LoadU(d128, p).raw;
3503 return Vec256<float>{
3504 _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
3505#else
3506 return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
3507#endif
3508}
3509template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3510HWY_API Vec256<double> LoadDup128(D /* tag */, const double* HWY_RESTRICT p) {
3511#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
3512 const Full128<double> d128;
3513 const __m128d v128 = LoadU(d128, p).raw;
3514 return Vec256<double>{
3515 _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
3516#else
3517 return Vec256<double>{
3518 _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
3519#endif
3520}
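// Editorial sketch (hypothetical helper): LoadDup128 is typically used to
// splat a 16-byte constant, e.g. a shuffle table, so that both 128-bit blocks
// see the same data; lanes 0..15 then equal lanes 16..31.
HWY_INLINE Vec256<uint8_t> ExampleLoadDup128(
    const uint8_t* HWY_RESTRICT table16) {
  const Full256<uint8_t> d;
  return LoadDup128(d, table16);
}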
3521
3522// ------------------------------ Store
3523
3524template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3525HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
3526 _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
3527}
3528#if HWY_HAVE_FLOAT16
3529template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3530HWY_API void Store(Vec256<float16_t> v, D /* tag */,
3531 float16_t* HWY_RESTRICT aligned) {
3532 _mm256_store_ph(aligned, v.raw);
3533}
3534#endif // HWY_HAVE_FLOAT16
3535template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3536HWY_API void Store(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
3537 _mm256_store_ps(aligned, v.raw);
3538}
3539template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3540HWY_API void Store(Vec256<double> v, D /* tag */,
3541 double* HWY_RESTRICT aligned) {
3542 _mm256_store_pd(aligned, v.raw);
3543}
3544
3545template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3546HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
3547 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
3548}
3549#if HWY_HAVE_FLOAT16
3550template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3551HWY_API void StoreU(Vec256<float16_t> v, D /* tag */,
3552 float16_t* HWY_RESTRICT p) {
3553 _mm256_storeu_ph(p, v.raw);
3554}
3555#endif
3556template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3557HWY_API void StoreU(Vec256<float> v, D /* tag */, float* HWY_RESTRICT p) {
3558 _mm256_storeu_ps(p, v.raw);
3559}
3560template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3561HWY_API void StoreU(Vec256<double> v, D /* tag */, double* HWY_RESTRICT p) {
3562 _mm256_storeu_pd(p, v.raw);
3563}
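// Editorial sketch (hypothetical helper, not part of the original header): a
// typical whole-vector copy loop; `count` is assumed to be the array length.
HWY_INLINE void ExampleCopy256(const float* HWY_RESTRICT from,
                               float* HWY_RESTRICT to, size_t count) {
  const Full256<float> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    StoreU(LoadU(d, from + i), d, to + i);
  }
  // The remaining (count - i) lanes would be handled with FirstN plus
  // BlendedStore; see the sketch after the BlendedStore section below.
}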
3564
3565// ------------------------------ BlendedStore
3566
3567#if HWY_TARGET <= HWY_AVX3
3568
3569template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
3570HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3571 TFromD<D>* HWY_RESTRICT p) {
3572 _mm256_mask_storeu_epi8(p, m.raw, v.raw);
3573}
3574
3575template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
3576HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3577 TFromD<D>* HWY_RESTRICT p) {
3578 const RebindToUnsigned<decltype(d)> du; // for float16_t
3579 _mm256_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p),
3580 RebindMask(du, m).raw, BitCast(du, v).raw);
3581}
3582
3583template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3584HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3585 TFromD<D>* HWY_RESTRICT p) {
3586 _mm256_mask_storeu_epi32(p, m.raw, v.raw);
3587}
3588
3589template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3590HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3591 TFromD<D>* HWY_RESTRICT p) {
3592 _mm256_mask_storeu_epi64(p, m.raw, v.raw);
3593}
3594
3595template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3596HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, D /* tag */,
3597                          float* HWY_RESTRICT p) {
3598 _mm256_mask_storeu_ps(p, m.raw, v.raw);
3599}
3600
3601template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3602HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D /* tag */,
3603                          double* HWY_RESTRICT p) {
3604 _mm256_mask_storeu_pd(p, m.raw, v.raw);
3605}
3606
3607#else // AVX2
3608
3609// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
3610// allows AC# if "Alignment checking enabled and: 256-bit memory operand not
3611// 32-byte aligned". Fortunately AC# is not enabled by default and requires both
3612// OS support (CR0) and the application to set rflags.AC. We assume these remain
3613// disabled because x86/x64 code and compiler output often contain misaligned
3614// scalar accesses, which would also fault.
3615//
3616// Caveat: these are slow on AMD Jaguar/Bulldozer.
3617
3618template <class D, HWY_IF_V_SIZE_D(D, 32),
3619 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3620HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3621 TFromD<D>* HWY_RESTRICT p) {
3622 // There is no maskload_epi8/16. Blending is also unsafe because loading a
3623 // full vector that crosses the array end causes asan faults. Resort to scalar
3624  // code; if m is known to be FirstN(d, n), the caller should use memcpy instead.
3625 const RebindToUnsigned<decltype(d)> du;
3626 using TU = TFromD<decltype(du)>;
3627 alignas(32) TU buf[MaxLanes(d)];
3628 alignas(32) TU mask[MaxLanes(d)];
3629 Store(BitCast(du, v), du, buf);
3630 Store(BitCast(du, VecFromMask(d, m)), du, mask);
3631 for (size_t i = 0; i < MaxLanes(d); ++i) {
3632 if (mask[i]) {
3633 CopySameSize(buf + i, p + i);
3634 }
3635 }
3636}
3637
3638template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3639HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3640 TFromD<D>* HWY_RESTRICT p) {
3641 auto pi = reinterpret_cast<int*>(p); // NOLINT
3642 _mm256_maskstore_epi32(pi, m.raw, v.raw);
3643}
3644
3645template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3646HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3647 TFromD<D>* HWY_RESTRICT p) {
3648 auto pi = reinterpret_cast<long long*>(p); // NOLINT
3649 _mm256_maskstore_epi64(pi, m.raw, v.raw);
3650}
3651
3652template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3653HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, D d,
3654 float* HWY_RESTRICT p) {
3655 const Vec256<int32_t> mi =
3656 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
3657 _mm256_maskstore_ps(p, mi.raw, v.raw);
3658}
3659
3660template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3661HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D d,
3662 double* HWY_RESTRICT p) {
3663 const Vec256<int64_t> mi =
3664 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
3665 _mm256_maskstore_pd(p, mi.raw, v.raw);
3666}
3667
3668#endif
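// --- Editorial example (not part of the original header) ---
// Illustrative usage sketch of BlendedStore, assuming the caller builds the
// mask with FirstN as suggested in the comment above. The function name is
// hypothetical and `count` must not exceed Lanes(d).
template <class D>
HWY_INLINE void StoreFirstLanes(D d, VFromD<D> v, size_t count,
                                TFromD<D>* HWY_RESTRICT out) {
  const MFromD<D> m = FirstN(d, count);
  BlendedStore(v, m, d, out);
}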
3669
3670// ------------------------------ Non-temporal stores
3671
3672template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
3673HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
3674 const RebindToUnsigned<decltype(d)> du; // for float16_t
3675 _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), BitCast(du, v).raw);
3676}
3677template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3678HWY_API void Stream(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
3679 _mm256_stream_ps(aligned, v.raw);
3680}
3681template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3682HWY_API void Stream(Vec256<double> v, D /* tag */,
3683 double* HWY_RESTRICT aligned) {
3684 _mm256_stream_pd(aligned, v.raw);
3685}
3686
3687// ------------------------------ ScatterOffset
3688
3689// Work around warnings in the intrinsic definitions (passing -1 as a mask).
3690HWY_DIAGNOSTICS(push)
3691HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3692
3693#if HWY_TARGET <= HWY_AVX3
3694
3695template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3696HWY_API void ScatterOffset(VFromD<D> v, D /* tag */,
3697 TFromD<D>* HWY_RESTRICT base,
3698 Vec256<int32_t> offset) {
3699 _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
3700}
3701
3702template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3703HWY_API void ScatterOffset(VFromD<D> v, D /* tag */,
3704 TFromD<D>* HWY_RESTRICT base,
3705 Vec256<int64_t> offset) {
3706 _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
3707}
3708
3709template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3710HWY_API void ScatterOffset(VFromD<D> v, D /* tag */, float* HWY_RESTRICT base,
3711 const Vec256<int32_t> offset) {
3712 _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
3713}
3714
3715template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3716HWY_API void ScatterOffset(VFromD<D> v, D /* tag */, double* HWY_RESTRICT base,
3717 const Vec256<int64_t> offset) {
3718 _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
3719}
3720
3721// ------------------------------ ScatterIndex
3722
3723template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3724HWY_API void ScatterIndex(VFromD<D> v, D /* tag */,
3725 TFromD<D>* HWY_RESTRICT base,
3726 VFromD<RebindToSigned<D>> index) {
3727 _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
3728}
3729
3730template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3731HWY_API void ScatterIndex(VFromD<D> v, D /* tag */,
3732 TFromD<D>* HWY_RESTRICT base,
3733 VFromD<RebindToSigned<D>> index) {
3734 _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
3735}
3736
3737template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3738HWY_API void ScatterIndex(VFromD<D> v, D /* tag */, float* HWY_RESTRICT base,
3739 VFromD<RebindToSigned<D>> index) {
3740 _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
3741}
3742
3743template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3744HWY_API void ScatterIndex(VFromD<D> v, D /* tag */, double* HWY_RESTRICT base,
3745 VFromD<RebindToSigned<D>> index) {
3746 _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
3747}
3748
3749// ------------------------------ MaskedScatterIndex
3750
3751template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3752HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3753                                TFromD<D>* HWY_RESTRICT base,
3754 VFromD<RebindToSigned<D>> index) {
3755 _mm256_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, 4);
3756}
3757
3758template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3759HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3760 TFromD<D>* HWY_RESTRICT base,
3761 VFromD<RebindToSigned<D>> index) {
3762 _mm256_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, 8);
3763}
3764
3765template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3766HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3767                                float* HWY_RESTRICT base,
3768 VFromD<RebindToSigned<D>> index) {
3769 _mm256_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, 4);
3770}
3771
3772template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3773HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3774                                double* HWY_RESTRICT base,
3775 VFromD<RebindToSigned<D>> index) {
3776 _mm256_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, 8);
3777}
3778
3779#endif // HWY_TARGET <= HWY_AVX3
3780
3781// ------------------------------ Gather
3782
3783namespace detail {
3784
3785template <int kScale, typename T, HWY_IF_UI32(T)>
3786HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
3787                                     Vec256<int32_t> indices) {
3788  return Vec256<T>{_mm256_i32gather_epi32(
3789 reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
3790}
3791
3792template <int kScale, typename T, HWY_IF_UI64(T)>
3793HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
3794                                     Vec256<int64_t> indices) {
3795  return Vec256<T>{_mm256_i64gather_epi64(
3796 reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
3797}
3798
3799template <int kScale>
3800HWY_INLINE Vec256<float> NativeGather256(const float* HWY_RESTRICT base,
3801                                         Vec256<int32_t> indices) {
3802  return Vec256<float>{_mm256_i32gather_ps(base, indices.raw, kScale)};
3803}
3804
3805template <int kScale>
3806HWY_INLINE Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
3807                                          Vec256<int64_t> indices) {
3808  return Vec256<double>{_mm256_i64gather_pd(base, indices.raw, kScale)};
3809}
3810
3811} // namespace detail
3812
3813template <class D, HWY_IF_V_SIZE_D(D, 32)>
3814HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
3815 VFromD<RebindToSigned<D>> offsets) {
3816 const RebindToSigned<decltype(d)> di;
3817 (void)di; // for HWY_DASSERT
3818 HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
3819 return detail::NativeGather256<1>(base, offsets);
3820}
3821
3822template <class D, HWY_IF_V_SIZE_D(D, 32)>
3823HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
3824                              VFromD<RebindToSigned<D>> indices) {
3825  const RebindToSigned<decltype(d)> di;
3826 (void)di; // for HWY_DASSERT
3827 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3828 return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
3829}
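// --- Editorial example (not part of the original header) ---
// Illustrative usage sketch of GatherIndex: loads base[idx[i]] into each of
// the eight float lanes. Indices must be non-negative, per the HWY_DASSERT
// above. The function name is hypothetical.
HWY_INLINE Vec256<float> GatherF32Example(const float* HWY_RESTRICT base,
                                          const int32_t* HWY_RESTRICT idx) {
  const Full256<float> d;
  const RebindToSigned<decltype(d)> di;
  return GatherIndex(d, base, LoadU(di, idx));
}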
3830
3831// ------------------------------ MaskedGatherIndexOr
3832
3833namespace detail {
3834
3835template <int kScale, typename T, HWY_IF_UI32(T)>
3836HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
3837                                             const T* HWY_RESTRICT base,
3838                                             Vec256<int32_t> indices) {
3839#if HWY_TARGET <= HWY_AVX3
3840 return Vec256<T>{_mm256_mmask_i32gather_epi32(
3841 no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
3842 kScale)};
3843#else
3844 return Vec256<T>{_mm256_mask_i32gather_epi32(
3845 no.raw, reinterpret_cast<const int32_t*>(base), indices.raw, m.raw,
3846 kScale)};
3847#endif
3848}
3849
3850template <int kScale, typename T, HWY_IF_UI64(T)>
3851HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
3852                                             const T* HWY_RESTRICT base,
3853                                             Vec256<int64_t> indices) {
3854#if HWY_TARGET <= HWY_AVX3
3855 return Vec256<T>{_mm256_mmask_i64gather_epi64(
3856 no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
3857 kScale)};
3858#else
3859 // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros.
3860 const Full256<T> d;
3861 const Full256<double> dd;
3862 return BitCast(d,
3863 Vec256<double>{_mm256_mask_i64gather_pd(
3864 BitCast(dd, no).raw, reinterpret_cast<const double*>(base),
3865 indices.raw, RebindMask(dd, m).raw, kScale)});
3866#endif
3867}
3868
3869template <int kScale>
3870HWY_INLINE Vec256<float> NativeMaskedGatherOr256(Vec256<float> no,
3871                                                 Mask256<float> m,
3872                                                 const float* HWY_RESTRICT base,
3873                                                 Vec256<int32_t> indices) {
3874#if HWY_TARGET <= HWY_AVX3
3875 return Vec256<float>{
3876 _mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
3877#else
3878 return Vec256<float>{
3879 _mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
3880#endif
3881}
3882
3883template <int kScale>
3884HWY_INLINE Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
3885                                                  Mask256<double> m,
3886                                                  const double* HWY_RESTRICT base,
3887                                                  Vec256<int64_t> indices) {
3888#if HWY_TARGET <= HWY_AVX3
3889 return Vec256<double>{
3890 _mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
3891#else
3892 return Vec256<double>{
3893 _mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
3894#endif
3895}
3896
3897} // namespace detail
3898
3899template <class D, HWY_IF_V_SIZE_D(D, 32)>
3900HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
3901                                      const TFromD<D>* HWY_RESTRICT base,
3902                                      VFromD<RebindToSigned<D>> indices) {
3903 const RebindToSigned<decltype(d)> di;
3904 (void)di; // for HWY_DASSERT
3905 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3906 return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
3907 indices);
3908}
3909
3910HWY_DIAGNOSTICS(pop)
3911
3912// ================================================== SWIZZLE
3913
3914// ------------------------------ LowerHalf
3915
3916template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3917HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
3918  return VFromD<D>{_mm256_castsi256_si128(v.raw)};
3919}
3920template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
3921HWY_API Vec128<bfloat16_t> LowerHalf(D /* tag */, Vec256<bfloat16_t> v) {
3922  return Vec128<bfloat16_t>{_mm256_castsi256_si128(v.raw)};
3923}
3924template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
3925HWY_API Vec128<float16_t> LowerHalf(D /* tag */, Vec256<float16_t> v) {
3926#if HWY_HAVE_FLOAT16
3927 return Vec128<float16_t>{_mm256_castph256_ph128(v.raw)};
3928#else
3929 return Vec128<float16_t>{_mm256_castsi256_si128(v.raw)};
3930#endif // HWY_HAVE_FLOAT16
3931}
3932template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3933HWY_API Vec128<float> LowerHalf(D /* tag */, Vec256<float> v) {
3934  return Vec128<float>{_mm256_castps256_ps128(v.raw)};
3935}
3936template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3937HWY_API Vec128<double> LowerHalf(D /* tag */, Vec256<double> v) {
3938  return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
3939}
3940
3941template <typename T>
3942HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
3943 const Full128<T> dh;
3944 return LowerHalf(dh, v);
3945}
3946
3947// ------------------------------ UpperHalf
3948
3949template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
3950HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
3951 const RebindToUnsigned<decltype(d)> du; // for float16_t
3952 const Twice<decltype(du)> dut;
3953 return BitCast(d, VFromD<decltype(du)>{
3954 _mm256_extracti128_si256(BitCast(dut, v).raw, 1)});
3955}
3956template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3957HWY_API VFromD<D> UpperHalf(D /* tag */, VFromD<Twice<D>> v) {
3958  return VFromD<D>{_mm256_extractf128_ps(v.raw, 1)};
3959}
3960template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3961HWY_API VFromD<D> UpperHalf(D /* tag */, VFromD<Twice<D>> v) {
3962  return VFromD<D>{_mm256_extractf128_pd(v.raw, 1)};
3963}
3964
3965// ------------------------------ ExtractLane (Store)
3966template <typename T>
3967HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
3968 const DFromV<decltype(v)> d;
3969 HWY_DASSERT(i < Lanes(d));
3970
3971#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3972 constexpr size_t kLanesPerBlock = 16 / sizeof(T);
3973 if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
3974 return ExtractLane(LowerHalf(Half<decltype(d)>(), v), i);
3975 }
3976#endif
3977
3978 alignas(32) T lanes[32 / sizeof(T)];
3979 Store(v, d, lanes);
3980 return lanes[i];
3981}
3982
3983// ------------------------------ InsertLane (Store)
3984template <typename T>
3985HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
3986  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
3987}
3988
3989// ------------------------------ GetLane (LowerHalf)
3990template <typename T>
3991HWY_API T GetLane(const Vec256<T> v) {
3992 return GetLane(LowerHalf(v));
3993}
3994
3995// ------------------------------ ExtractBlock (LowerHalf, UpperHalf)
3996
3997template <int kBlockIdx, class T>
3998HWY_API Vec128<T> ExtractBlock(Vec256<T> v) {
3999 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
4000 const Half<DFromV<decltype(v)>> dh;
4001 return (kBlockIdx == 0) ? LowerHalf(dh, v) : UpperHalf(dh, v);
4002}
4003
4004// ------------------------------ ZeroExtendVector
4005
4006// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
4007// bits undefined. Although it makes sense for them to be zero (VEX encoded
4008// 128-bit instructions zero the upper lanes to avoid large penalties), a
4009// compiler could decide to optimize out code that relies on this.
4010//
4011// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
4012// zeroing, but it is not available on MSVC until 1920 nor GCC until 10.1.
4013// Unfortunately as of 2023-08 it still seems to cause internal compiler errors
4014// on MSVC, so we consider it unavailable there.
4015//
4016// Without zext we can still possibly obtain the desired code thanks to pattern
4017// recognition; note that the expensive insert instruction might not actually be
4018// generated, see https://gcc.godbolt.org/z/1MKGaP.
4019
4020#if !defined(HWY_HAVE_ZEXT)
4021#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
4022 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
4023#define HWY_HAVE_ZEXT 1
4024#else
4025#define HWY_HAVE_ZEXT 0
4026#endif
4027#endif // defined(HWY_HAVE_ZEXT)
4028
4029template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
4030HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
4031#if HWY_HAVE_ZEXT
4032 return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
4033#elif HWY_COMPILER_MSVC
4034 // Workaround: _mm256_inserti128_si256 does not actually zero the hi part.
4035 return VFromD<D>{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)};
4036#else
4037 return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
4038#endif
4039}
4040#if HWY_HAVE_FLOAT16
4041template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
4042HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
4043#if HWY_HAVE_ZEXT
4044 (void)d;
4045 return Vec256<float16_t>{_mm256_zextph128_ph256(lo.raw)};
4046#else
4047 const RebindToUnsigned<D> du;
4048 return BitCast(d, ZeroExtendVector(du, BitCast(du, lo)));
4049#endif // HWY_HAVE_ZEXT
4050}
4051#endif // HWY_HAVE_FLOAT16
4052template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4053HWY_API Vec256<float> ZeroExtendVector(D /* tag */, Vec128<float> lo) {
4054#if HWY_HAVE_ZEXT
4055 return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
4056#else
4057 return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
4058#endif
4059}
4060template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4061HWY_API Vec256<double> ZeroExtendVector(D /* tag */, Vec128<double> lo) {
4062#if HWY_HAVE_ZEXT
4063 return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
4064#else
4065 return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
4066#endif
4067}
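// --- Editorial example (not part of the original header) ---
// Illustrative usage sketch: ZeroExtendVector widens a 128-bit vector to
// 256 bits with the upper lanes guaranteed to be zero, so the result is safe
// to feed into full-width reductions. The function name is hypothetical.
HWY_INLINE Vec256<float> WidenWithZeroUpper(Vec128<float> lo) {
  const Full256<float> d;
  return ZeroExtendVector(d, lo);
}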
4068
4069// ------------------------------ ZeroExtendResizeBitCast
4070
4071namespace detail {
4072
4073template <class DTo, class DFrom>
4074HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
4075    hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */,
4076 DTo d_to, DFrom d_from, VFromD<DFrom> v) {
4077 const Twice<decltype(d_from)> dt_from;
4078 const Twice<decltype(dt_from)> dq_from;
4079 return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v)));
4080}
4081
4082} // namespace detail
4083
4084// ------------------------------ Combine
4085
4086template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4087HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
4088  const RebindToUnsigned<decltype(d)> du;  // for float16_t
4089 const Half<decltype(du)> dh_u;
4090 const auto lo256 = ZeroExtendVector(du, BitCast(dh_u, lo));
4091 return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
4092 lo256.raw, BitCast(dh_u, hi).raw, 1)});
4093}
4094template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4095HWY_API Vec256<float> Combine(D d, Vec128<float> hi, Vec128<float> lo) {
4096  const auto lo256 = ZeroExtendVector(d, lo);
4097 return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
4098}
4099template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4100HWY_API Vec256<double> Combine(D d, Vec128<double> hi, Vec128<double> lo) {
4101  const auto lo256 = ZeroExtendVector(d, lo);
4102 return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
4103}
4104
4105// ------------------------------ ShiftLeftBytes
4106template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
4107HWY_API VFromD<D> ShiftLeftBytes(D /* tag */, VFromD<D> v) {
4108 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
4109 // This is the same operation as _mm256_bslli_epi128.
4110 return VFromD<D>{_mm256_slli_si256(v.raw, kBytes)};
4111}
4112
4113// ------------------------------ ShiftRightBytes
4114template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
4115HWY_API VFromD<D> ShiftRightBytes(D /* tag */, VFromD<D> v) {
4116  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
4117 // This is the same operation as _mm256_bsrli_epi128.
4118 return VFromD<D>{_mm256_srli_si256(v.raw, kBytes)};
4119}
4120
4121// ------------------------------ CombineShiftRightBytes
4122template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
4123HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
4124  const Repartition<uint8_t, decltype(d)> d8;
4125 return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
4126 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
4127}
4128
4129// ------------------------------ Broadcast
4130
4131template <int kLane, typename T, HWY_IF_T_SIZE(T, 2)>
4132HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
4133  const DFromV<decltype(v)> d;
4134 const RebindToUnsigned<decltype(d)> du;
4135 using VU = VFromD<decltype(du)>;
4136 const VU vu = BitCast(du, v); // for float16_t
4137 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
4138 if (kLane < 4) {
4139 const __m256i lo = _mm256_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
4140 return BitCast(d, VU{_mm256_unpacklo_epi64(lo, lo)});
4141 } else {
4142 const __m256i hi =
4143 _mm256_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
4144 return BitCast(d, VU{_mm256_unpackhi_epi64(hi, hi)});
4145 }
4146}
4147template <int kLane, typename T, HWY_IF_UI32(T)>
4148HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
4149 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
4150 return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
4151}
4152
4153template <int kLane, typename T, HWY_IF_UI64(T)>
4154HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
4155 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
4156 return Vec256<T>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4157}
4158
4159template <int kLane>
4160HWY_API Vec256<float> Broadcast(const Vec256<float> v) {
4161  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
4162 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
4163}
4164
4165template <int kLane>
4166HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
4167  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
4168 return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
4169}
4170
4171// ------------------------------ BroadcastBlock
4172
4173template <int kBlockIdx, class T>
4174HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
4175 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
4176 const DFromV<decltype(v)> d;
4177 return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
4178 : ConcatUpperUpper(d, v, v);
4179}
4180
4181// ------------------------------ BroadcastLane
4182
4183namespace detail {
4184
4185template <class T, HWY_IF_T_SIZE(T, 1)>
4186HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
4187                                   Vec256<T> v) {
4188 const Half<DFromV<decltype(v)>> dh;
4189 return Vec256<T>{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)};
4190}
4191
4192template <class T, HWY_IF_T_SIZE(T, 2)>
4193HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
4194                                   Vec256<T> v) {
4195 const DFromV<decltype(v)> d;
4196 const RebindToUnsigned<decltype(d)> du; // for float16_t
4197 const Half<decltype(d)> dh;
4198 const RebindToUnsigned<decltype(dh)> dh_u;
4199 return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
4200 BitCast(dh_u, LowerHalf(dh, v)).raw)});
4201}
4202
4203template <class T, HWY_IF_UI32(T)>
4204HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
4205 Vec256<T> v) {
4206 const Half<DFromV<decltype(v)>> dh;
4207 return Vec256<T>{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)};
4208}
4209
4210template <class T, HWY_IF_UI64(T)>
4211HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
4212 Vec256<T> v) {
4213 const Half<DFromV<decltype(v)>> dh;
4214 return Vec256<T>{_mm256_broadcastq_epi64(LowerHalf(dh, v).raw)};
4215}
4216
4217HWY_INLINE Vec256<float> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
4218                                       Vec256<float> v) {
4219 const Half<DFromV<decltype(v)>> dh;
4220 return Vec256<float>{_mm256_broadcastss_ps(LowerHalf(dh, v).raw)};
4221}
4222
4223HWY_INLINE Vec256<double> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
4224                                        Vec256<double> v) {
4225 const Half<DFromV<decltype(v)>> dh;
4226 return Vec256<double>{_mm256_broadcastsd_pd(LowerHalf(dh, v).raw)};
4227}
4228
4229template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr,
4230 HWY_IF_NOT_T_SIZE(T, 8)>
4231HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<kLaneIdx> /* lane_idx_tag */,
4232                                   Vec256<T> v) {
4233 constexpr size_t kLanesPerBlock = 16 / sizeof(T);
4234 constexpr int kBlockIdx = static_cast<int>(kLaneIdx / kLanesPerBlock);
4235 constexpr int kLaneInBlkIdx =
4236 static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
4237 return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));
4238}
4239
4240template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr,
4241 HWY_IF_UI64(T)>
4242HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<kLaneIdx> /* lane_idx_tag */,
4243                                   Vec256<T> v) {
4244 static_assert(kLaneIdx <= 3, "Invalid lane");
4245 return Vec256<T>{
4246 _mm256_permute4x64_epi64(v.raw, static_cast<int>(0x55 * kLaneIdx))};
4247}
4248
4249template <size_t kLaneIdx, hwy::EnableIf<kLaneIdx != 0>* = nullptr>
4250HWY_INLINE Vec256<double> BroadcastLane(
4251    hwy::SizeTag<kLaneIdx> /* lane_idx_tag */, Vec256<double> v) {
4252 static_assert(kLaneIdx <= 3, "Invalid lane");
4253 return Vec256<double>{
4254 _mm256_permute4x64_pd(v.raw, static_cast<int>(0x55 * kLaneIdx))};
4255}
4256
4257} // namespace detail
4258
4259template <int kLaneIdx, class T>
4260HWY_API Vec256<T> BroadcastLane(Vec256<T> v) {
4261 static_assert(kLaneIdx >= 0, "Invalid lane");
4262 return detail::BroadcastLane(hwy::SizeTag<static_cast<size_t>(kLaneIdx)>(),
4263 v);
4264}
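// Editorial note: unlike Broadcast (above), which replicates a lane within
// each 128-bit block, BroadcastLane<kLaneIdx>(v) replicates the single lane
// kLaneIdx of the full vector to all lanes.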
4265
4266// ------------------------------ Hard-coded shuffles
4267
4268// Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
4269// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
4270// right (the previous least-significant lane is now most-significant =>
4271// 47650321). These could also be implemented via CombineShiftRightBytes but
4272// the shuffle_abcd notation is more convenient.
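// Editorial worked example: with lanes written as 7,6,5,4,3,2,1,0 (0 is
// least-significant), Shuffle0321 yields 4,7,6,5,0,3,2,1 and Shuffle2103
// yields 6,5,4,7,2,1,0,3; each 128-bit block rotates independently.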
4273
4274// Swap 32-bit halves in 64-bit halves.
4275template <typename T, HWY_IF_UI32(T)>
4276HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
4277  return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};
4278}
4279HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
4280  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
4281}
4282
4283// Used by generic_ops-inl.h
4284namespace detail {
4285
4286template <typename T, HWY_IF_T_SIZE(T, 4)>
4287HWY_API Vec256<T> ShuffleTwo2301(const Vec256<T> a, const Vec256<T> b) {
4288 const DFromV<decltype(a)> d;
4289 const RebindToFloat<decltype(d)> df;
4290 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
4291 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
4292 BitCast(df, b).raw, m)});
4293}
4294template <typename T, HWY_IF_T_SIZE(T, 4)>
4295HWY_API Vec256<T> ShuffleTwo1230(const Vec256<T> a, const Vec256<T> b) {
4296 const DFromV<decltype(a)> d;
4297 const RebindToFloat<decltype(d)> df;
4298 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
4299 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
4300 BitCast(df, b).raw, m)});
4301}
4302template <typename T, HWY_IF_T_SIZE(T, 4)>
4303HWY_API Vec256<T> ShuffleTwo3012(const Vec256<T> a, const Vec256<T> b) {
4304 const DFromV<decltype(a)> d;
4305 const RebindToFloat<decltype(d)> df;
4306 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
4307 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
4308 BitCast(df, b).raw, m)});
4309}
4310
4311} // namespace detail
4312
4313// Swap 64-bit halves
4314HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
4315  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
4316}
4317HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
4318  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
4319}
4320HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
4321  // Shorter encoding than _mm256_permute_ps.
4322 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
4323}
4324HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
4325  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
4326}
4327HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
4328  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
4329}
4330HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
4331  // Shorter encoding than _mm256_permute_pd.
4332 return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
4333}
4334
4335// Rotate right 32 bits
4336HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
4337  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
4338}
4339HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
4340  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
4341}
4342HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
4343  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
4344}
4345// Rotate left 32 bits
4346HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
4347  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
4348}
4349HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
4350  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
4351}
4352HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
4353  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
4354}
4355
4356// Reverse
4357HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
4358  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
4359}
4360HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
4361  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
4362}
4363HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
4364  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
4365}
4366
4367// ------------------------------ TableLookupLanes
4368
4369// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
4370template <typename T>
4371struct Indices256 {
4372 __m256i raw;
4373};
4374
4375// 8-bit lanes: indices remain unchanged
4376template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1), typename TI>
4377HWY_API Indices256<TFromD<D>> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
4378  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
4379#if HWY_IS_DEBUG_BUILD
4380 const Full256<TI> di;
4381 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4382 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(2 * Lanes(di))))));
4383#endif
4384 return Indices256<TFromD<D>>{vec.raw};
4385}
4386
4387// 16-bit lanes: convert indices to 32x8 unless AVX3 is available
4388template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2), typename TI>
4389HWY_API Indices256<TFromD<D>> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
4390 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
4391 const Full256<TI> di;
4392#if HWY_IS_DEBUG_BUILD
4393 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4394 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(2 * Lanes(di))))));
4395#endif
4396
4397#if HWY_TARGET <= HWY_AVX3
4398 (void)di;
4399 return Indices256<TFromD<D>>{vec.raw};
4400#else
4401 const Repartition<uint8_t, decltype(di)> d8;
4402 using V8 = VFromD<decltype(d8)>;
4403 alignas(32) static constexpr uint8_t kByteOffsets[32] = {
4404 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
4405 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
4406
4407 // Broadcast each lane index to all 2 bytes of T
4408 alignas(32) static constexpr uint8_t kBroadcastLaneBytes[32] = {
4409 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
4410 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
4411 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
4412
4413 // Shift to bytes
4414 const Repartition<uint16_t, decltype(di)> d16;
4415 const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices)));
4416
4417 return Indices256<TFromD<D>>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
4418#endif // HWY_TARGET <= HWY_AVX3
4419}
4420
4421// Native 8x32 instruction: indices remain unchanged
4422template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4), typename TI>
4423HWY_API Indices256<TFromD<D>> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
4424 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
4425#if HWY_IS_DEBUG_BUILD
4426 const Full256<TI> di;
4427 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4428 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(2 * Lanes(di))))));
4429#endif
4430 return Indices256<TFromD<D>>{vec.raw};
4431}
4432
4433// 64-bit lanes: convert indices to 8x32 unless AVX3 is available
4434template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8), typename TI>
4435HWY_API Indices256<TFromD<D>> IndicesFromVec(D d, Vec256<TI> idx64) {
4436 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
4437 const Rebind<TI, decltype(d)> di;
4438 (void)di; // potentially unused
4439#if HWY_IS_DEBUG_BUILD
4440 HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) &&
4441 AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(2 * Lanes(di))))));
4442#endif
4443
4444#if HWY_TARGET <= HWY_AVX3
4445 (void)d;
4446 return Indices256<TFromD<D>>{idx64.raw};
4447#else
4448 const Repartition<float, decltype(d)> df; // 32-bit!
4449 // Replicate 64-bit index into upper 32 bits
4450 const Vec256<TI> dup =
4451 BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
4452 // For each idx64 i, idx32 are 2*i and 2*i+1.
4453 const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
4454 return Indices256<TFromD<D>>{idx32.raw};
4455#endif
4456}
4457
4458template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
4459HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) {
4460 const Rebind<TI, decltype(d)> di;
4461 return IndicesFromVec(d, LoadU(di, idx));
4462}
4463
4464template <typename T, HWY_IF_T_SIZE(T, 1)>
4465HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
4466#if HWY_TARGET <= HWY_AVX3_DL
4467 return Vec256<T>{_mm256_permutexvar_epi8(idx.raw, v.raw)};
4468#else
4469 const Vec256<T> idx_vec{idx.raw};
4470 const DFromV<decltype(v)> d;
4471 const Repartition<uint16_t, decltype(d)> du16;
4472 const auto sel_hi_mask =
4473 MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec))));
4474
4475 const auto a = ConcatLowerLower(d, v, v);
4476 const auto b = ConcatUpperUpper(d, v, v);
4477 const auto lo_lookup_result = TableLookupBytes(a, idx_vec);
4478
4479#if HWY_TARGET <= HWY_AVX3
4480 return Vec256<T>{_mm256_mask_shuffle_epi8(
4481 lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)};
4482#else
4483 const auto hi_lookup_result = TableLookupBytes(b, idx_vec);
4484 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
4485#endif // HWY_TARGET <= HWY_AVX3
4486#endif // HWY_TARGET <= HWY_AVX3_DL
4487}
4488
4489template <typename T, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_SPECIAL_FLOAT(T)>
4490HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
4491#if HWY_TARGET <= HWY_AVX3
4492 return Vec256<T>{_mm256_permutexvar_epi16(idx.raw, v.raw)};
4493#else
4494 const DFromV<decltype(v)> d;
4495 const Repartition<uint8_t, decltype(d)> du8;
4496 return BitCast(
4497 d, TableLookupLanes(BitCast(du8, v), Indices256<uint8_t>{idx.raw}));
4498#endif
4499}
4500
4501#if HWY_HAVE_FLOAT16
4502HWY_API Vec256<float16_t> TableLookupLanes(Vec256<float16_t> v,
4503 Indices256<float16_t> idx) {
4504 return Vec256<float16_t>{_mm256_permutexvar_ph(idx.raw, v.raw)};
4505}
4506#endif // HWY_HAVE_FLOAT16
4507
4508template <typename T, HWY_IF_T_SIZE(T, 4)>
4509HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
4510 return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
4511}
4512
4513template <typename T, HWY_IF_T_SIZE(T, 8)>
4514HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
4515#if HWY_TARGET <= HWY_AVX3
4516 return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
4517#else
4518 return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
4519#endif
4520}
4521
4522HWY_API Vec256<float> TableLookupLanes(const Vec256<float> v,
4523                                       const Indices256<float> idx) {
4524 return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
4525}
4526
4527HWY_API Vec256<double> TableLookupLanes(const Vec256<double> v,
4528                                        const Indices256<double> idx) {
4529#if HWY_TARGET <= HWY_AVX3
4530 return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)};
4531#else
4532 const Full256<double> df;
4533 const Full256<uint64_t> du;
4534 return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32(
4535 BitCast(du, v).raw, idx.raw)});
4536#endif
4537}
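// --- Editorial example (not part of the original header) ---
// Illustrative usage sketch: SetTableIndices + TableLookupLanes reverse the
// four double lanes across the whole vector, mirroring the Reverse
// implementation below. The function name is hypothetical.
HWY_INLINE Vec256<double> ReverseViaTableExample(Vec256<double> v) {
  const Full256<double> d;
  alignas(32) static constexpr int64_t kRev[4] = {3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kRev));
}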
4538
4539template <typename T, HWY_IF_T_SIZE(T, 1)>
4540HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
4541                                       Indices256<T> idx) {
4542#if HWY_TARGET <= HWY_AVX3_DL
4543 return Vec256<T>{_mm256_permutex2var_epi8(a.raw, idx.raw, b.raw)};
4544#else
4545 const DFromV<decltype(a)> d;
4546 const auto sel_hi_mask =
4547 MaskFromVec(BitCast(d, ShiftLeft<2>(Vec256<uint16_t>{idx.raw})));
4548 const auto lo_lookup_result = TableLookupLanes(a, idx);
4549 const auto hi_lookup_result = TableLookupLanes(b, idx);
4550 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
4551#endif
4552}
4553
4554template <typename T, HWY_IF_T_SIZE(T, 2)>
4555HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
4556 Indices256<T> idx) {
4557#if HWY_TARGET <= HWY_AVX3
4558 return Vec256<T>{_mm256_permutex2var_epi16(a.raw, idx.raw, b.raw)};
4559#else
4560 const DFromV<decltype(a)> d;
4561 const Repartition<uint8_t, decltype(d)> du8;
4562 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
4563 Indices256<uint8_t>{idx.raw}));
4564#endif
4565}
4566
4567template <typename T, HWY_IF_UI32(T)>
4568HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
4569 Indices256<T> idx) {
4570#if HWY_TARGET <= HWY_AVX3
4571 return Vec256<T>{_mm256_permutex2var_epi32(a.raw, idx.raw, b.raw)};
4572#else
4573 const DFromV<decltype(a)> d;
4574 const RebindToFloat<decltype(d)> df;
4575 const Vec256<T> idx_vec{idx.raw};
4576
4577 const auto sel_hi_mask = MaskFromVec(BitCast(df, ShiftLeft<28>(idx_vec)));
4578 const auto lo_lookup_result = BitCast(df, TableLookupLanes(a, idx));
4579 const auto hi_lookup_result = BitCast(df, TableLookupLanes(b, idx));
4580 return BitCast(d,
4581 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
4582#endif
4583}
4584
4585#if HWY_HAVE_FLOAT16
4586HWY_API Vec256<float16_t> TwoTablesLookupLanes(Vec256<float16_t> a,
4587 Vec256<float16_t> b,
4588 Indices256<float16_t> idx) {
4589 return Vec256<float16_t>{_mm256_permutex2var_ph(a.raw, idx.raw, b.raw)};
4590}
4591#endif // HWY_HAVE_FLOAT16
4592HWY_API Vec256<float> TwoTablesLookupLanes(Vec256<float> a, Vec256<float> b,
4593                                           Indices256<float> idx) {
4594#if HWY_TARGET <= HWY_AVX3
4595 return Vec256<float>{_mm256_permutex2var_ps(a.raw, idx.raw, b.raw)};
4596#else
4597 const DFromV<decltype(a)> d;
4598 const auto sel_hi_mask =
4599 MaskFromVec(BitCast(d, ShiftLeft<28>(Vec256<uint32_t>{idx.raw})));
4600 const auto lo_lookup_result = TableLookupLanes(a, idx);
4601 const auto hi_lookup_result = TableLookupLanes(b, idx);
4602 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
4603#endif
4604}
4605
4606template <typename T, HWY_IF_UI64(T)>
4607HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
4608 Indices256<T> idx) {
4609#if HWY_TARGET <= HWY_AVX3
4610 return Vec256<T>{_mm256_permutex2var_epi64(a.raw, idx.raw, b.raw)};
4611#else
4612 const DFromV<decltype(a)> d;
4613 const Repartition<uint32_t, decltype(d)> du32;
4614 return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b),
4615 Indices256<uint32_t>{idx.raw}));
4616#endif
4617}
4618
4619HWY_API Vec256<double> TwoTablesLookupLanes(Vec256<double> a, Vec256<double> b,
4620                                            Indices256<double> idx) {
4621#if HWY_TARGET <= HWY_AVX3
4622 return Vec256<double>{_mm256_permutex2var_pd(a.raw, idx.raw, b.raw)};
4623#else
4624 const DFromV<decltype(a)> d;
4625 const Repartition<uint32_t, decltype(d)> du32;
4626 return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b),
4627 Indices256<uint32_t>{idx.raw}));
4628#endif
4629}
4630
4631// ------------------------------ SwapAdjacentBlocks
4632
4633template <typename T>
4634HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
4635 const DFromV<decltype(v)> d;
4636 const RebindToUnsigned<decltype(d)> du; // for float16_t
4637 return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
4638 BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
4639}
4640
4641HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
4642  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
4643}
4644
4645HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
4646  // Assume no domain-crossing penalty between float/double (true on SKX).
4647 const DFromV<decltype(v)> d;
4648 const RepartitionToWide<decltype(d)> dw;
4649 return BitCast(d, SwapAdjacentBlocks(BitCast(dw, v)));
4650}
4651
4652// ------------------------------ Reverse (RotateRight)
4653
4654template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
4655HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4656 alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4657 return TableLookupLanes(v, SetTableIndices(d, kReverse));
4658}
4659
4660template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
4661HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4662 alignas(32) static constexpr int64_t kReverse[4] = {3, 2, 1, 0};
4663 return TableLookupLanes(v, SetTableIndices(d, kReverse));
4664}
4665
4666template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4667HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4668#if HWY_TARGET <= HWY_AVX3
4669 const RebindToSigned<decltype(d)> di;
4670 alignas(32) static constexpr int16_t kReverse[16] = {
4671 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4672 const Vec256<int16_t> idx = Load(di, kReverse);
4673 return BitCast(d, Vec256<int16_t>{
4674 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4675#else
4676 const RebindToSigned<decltype(d)> di;
4677 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
4678 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
4679 const auto rev128 = TableLookupBytes(v, shuffle);
4680 return VFromD<D>{
4681 _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))};
4682#endif
4683}
4684
4685template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
4686HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4687#if HWY_TARGET <= HWY_AVX3_DL
4688 alignas(32) static constexpr TFromD<D> kReverse[32] = {
4689 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
4690 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4691 return TableLookupLanes(v, SetTableIndices(d, kReverse));
4692#else
4693 // First reverse bytes within blocks via PSHUFB, then swap blocks.
4694 alignas(32) static constexpr TFromD<D> kReverse[32] = {
4695 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
4696 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
4697 return SwapAdjacentBlocks(TableLookupBytes(v, Load(d, kReverse)));
4698#endif
4699}
4700
4701// ------------------------------ Reverse2 (in x86_128)
4702
4703// ------------------------------ Reverse4 (SwapAdjacentBlocks)
4704
4705template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4706HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
4707 const RebindToSigned<decltype(d)> di;
4708 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
4709 di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
4710 return BitCast(d, TableLookupBytes(v, shuffle));
4711}
4712
4713// 32 bit Reverse4 defined in x86_128.
4714
4715template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
4716HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
4717 // Could also use _mm256_permute4x64_epi64.
4718 return SwapAdjacentBlocks(Shuffle01(v));
4719}
4720
4721// ------------------------------ Reverse8
4722
4723template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4724HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
4725 const RebindToSigned<decltype(d)> di;
4726 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
4727 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
4728 return BitCast(d, TableLookupBytes(v, shuffle));
4729}
4730
4731template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
4732HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
4733 return Reverse(d, v);
4734}
4735
4736template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
4737HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D> /* v */) {
4738 HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes
4739}
4740
4741// ------------------------------ ReverseBits in x86_512
4742
4743// ------------------------------ InterleaveLower
4744
4745// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4746// the least-significant lane) and "b". To concatenate two half-width integers
4747// into one, use ZipLower/Upper instead (also works with scalar).
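// Editorial worked example: for u32 vectors a = 7..0 and b = 7'..0' (lane 0
// least-significant), InterleaveLower yields 5',5,4',4,1',1,0',0 and
// InterleaveUpper yields 7',7,6',6,3',3,2',2.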
4748
4749template <typename T, HWY_IF_T_SIZE(T, 1)>
4750HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
4751  return Vec256<T>{_mm256_unpacklo_epi8(a.raw, b.raw)};
4752}
4753template <typename T, HWY_IF_T_SIZE(T, 2)>
4754HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
4755 const DFromV<decltype(a)> d;
4756 const RebindToUnsigned<decltype(d)> du;
4757 using VU = VFromD<decltype(du)>; // for float16_t
4758 return BitCast(
4759 d, VU{_mm256_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
4760}
4761template <typename T, HWY_IF_UI32(T)>
4762HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
4763 return Vec256<T>{_mm256_unpacklo_epi32(a.raw, b.raw)};
4764}
4765template <typename T, HWY_IF_UI64(T)>
4766HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
4767 return Vec256<T>{_mm256_unpacklo_epi64(a.raw, b.raw)};
4768}
4769
4770HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
4771  return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
4772}
4773HWY_API Vec256<double> InterleaveLower(Vec256<double> a, Vec256<double> b) {
4774  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
4775}
4776
4777// ------------------------------ InterleaveUpper
4778
4779template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
4780HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4781  return VFromD<D>{_mm256_unpackhi_epi8(a.raw, b.raw)};
4782}
4783template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4784HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
4785  const RebindToUnsigned<decltype(d)> du;
4786 using VU = VFromD<decltype(du)>; // for float16_t
4787 return BitCast(
4788 d, VU{_mm256_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
4789}
4790template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
4791HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4792  return VFromD<D>{_mm256_unpackhi_epi32(a.raw, b.raw)};
4793}
4794template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
4795HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4796  return VFromD<D>{_mm256_unpackhi_epi64(a.raw, b.raw)};
4797}
4798
4799template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4800HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4801  return VFromD<D>{_mm256_unpackhi_ps(a.raw, b.raw)};
4802}
4803template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4804HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4805  return VFromD<D>{_mm256_unpackhi_pd(a.raw, b.raw)};
4806}
4807
4808// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
4809
4810// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
4811// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
4812// extra cost) for LowerLower and UpperLower.
4813
4814// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
4815template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4816HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
4817  const RebindToUnsigned<decltype(d)> du;  // for float16_t
4818 const Half<decltype(d)> d2;
4819 const RebindToUnsigned<decltype(d2)> du2; // for float16_t
4820 return BitCast(
4821 d, VFromD<decltype(du)>{_mm256_inserti128_si256(
4822 BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
4823}
4824template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4825HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
4826                                       Vec256<float> lo) {
4827 const Half<decltype(d)> d2;
4828 return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
4829}
4830template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4831HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,
4832                                        Vec256<double> lo) {
4833 const Half<decltype(d)> d2;
4834 return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
4835}
4836
4837// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
4838template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4839HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4840  const RebindToUnsigned<decltype(d)> du;
4841 return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
4842 BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
4843}
4844template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4845HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
4846                                       Vec256<float> lo) {
4847 return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
4848}
4849template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4850HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,
4851                                        Vec256<double> lo) {
4852 return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
4853}
4854
4855// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4856template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4857HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
4858  const RebindToUnsigned<decltype(d)> du;  // for float16_t
4859 return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
4860 BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
4861}
4862template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4863HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
4864                                       Vec256<float> lo) {
4865 return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
4866}
4867template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4868HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,
4869                                        Vec256<double> lo) {
4870 return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
4871}
4872
4873// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
4874template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4875HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4876  const RebindToUnsigned<decltype(d)> du;  // for float16_t
4877 return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
4878 BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
4879}
4880template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4881HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
4882                                       Vec256<float> lo) {
4883 return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
4884}
4885template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4886HWY_API Vec256<double> ConcatUpperUpper(D /* tag */, Vec256<double> hi,
4887                                        Vec256<double> lo) {
4888 return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
4889}
4890
4891// ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
4892template <int kBlockIdx, class T>
4893HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
4894 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
4895
4896 const DFromV<decltype(v)> d;
4897 const auto vec_to_insert = ResizeBitCast(d, blk_to_insert);
4898 return (kBlockIdx == 0) ? ConcatUpperLower(d, v, vec_to_insert)
4899 : ConcatLowerLower(d, vec_to_insert, v);
4900}
4901
4902// ------------------------------ ConcatOdd
4903
4904template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
4905HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4906  const RebindToUnsigned<decltype(d)> du;
4907#if HWY_TARGET <= HWY_AVX3_DL
4908 alignas(32) static constexpr uint8_t kIdx[32] = {
4909 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
4910 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
4911 return BitCast(
4912 d, Vec256<uint16_t>{_mm256_permutex2var_epi8(
4913 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4914#else
4915 const RepartitionToWide<decltype(du)> dw;
4916 // Unsigned 8-bit shift so we can pack.
4917 const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
4918 const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
4919 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
4920 return VFromD<D>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
4921#endif
4922}
4923
4924template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4925HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4926  const RebindToUnsigned<decltype(d)> du;
4927#if HWY_TARGET <= HWY_AVX3
4928 alignas(32) static constexpr uint16_t kIdx[16] = {
4929 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
4930 return BitCast(
4931 d, Vec256<uint16_t>{_mm256_permutex2var_epi16(
4932 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4933#else
4934 const RepartitionToWide<decltype(du)> dw;
4935 // Unsigned 16-bit shift so we can pack.
4936 const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
4937 const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
4938 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
4939 return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
4940 u16, _MM_SHUFFLE(3, 1, 2, 0))});
4941#endif
4942}
4943
4944template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
4945HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4946  const RebindToUnsigned<decltype(d)> du;
4947#if HWY_TARGET <= HWY_AVX3
4948 alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4949 return BitCast(
4950 d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
4951 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4952#else
4953 const RebindToFloat<decltype(d)> df;
4954 const Vec256<float> v3131{_mm256_shuffle_ps(
4955 BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
4956 return VFromD<D>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
4957 _MM_SHUFFLE(3, 1, 2, 0))};
4958#endif
4959}
4960
4961template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4962HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4963  const RebindToUnsigned<decltype(d)> du;
4964#if HWY_TARGET <= HWY_AVX3
4965 alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
4966 return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
4967#else
4968 const VFromD<D> v3131{
4969 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
4970 return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
4971 BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
4972#endif
4973}
4974
4975template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
4976HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4977  const RebindToUnsigned<decltype(d)> du;
4978#if HWY_TARGET <= HWY_AVX3
4979 alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
4980 return BitCast(
4981 d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
4982 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4983#else
4984 const RebindToFloat<decltype(d)> df;
4985 const Vec256<double> v31{
4986 _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
4987 return VFromD<D>{
4988 _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
4989#endif
4990}
4991
4992template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4993HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4994#if HWY_TARGET <= HWY_AVX3
4995 const RebindToUnsigned<decltype(d)> du;
4996 alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
4997 return Vec256<double>{
4998 _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
4999#else
5000 (void)d;
5001 const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
5002 return Vec256<double>{
5003 _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
5004#endif
5005}
5006
5007// ------------------------------ ConcatEven
5008
5009template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5010HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5011  const RebindToUnsigned<decltype(d)> du;
5012#if HWY_TARGET <= HWY_AVX3_DL
5013 alignas(64) static constexpr uint8_t kIdx[32] = {
5014 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
5015 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
5016 return BitCast(
5017 d, Vec256<uint32_t>{_mm256_permutex2var_epi8(
5018 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5019#else
5020 const RepartitionToWide<decltype(du)> dw;
5021 // Isolate lower 8 bits per u16 so we can pack.
5022 const Vec256<uint16_t> mask = Set(dw, 0x00FF);
5023 const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask);
5024 const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask);
5025 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
5026 return VFromD<D>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
5027#endif
5028}
5029
5030template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5031HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5032  const RebindToUnsigned<decltype(d)> du;
5033#if HWY_TARGET <= HWY_AVX3
5034 alignas(64) static constexpr uint16_t kIdx[16] = {
5035 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
5036 return BitCast(
5037 d, Vec256<uint32_t>{_mm256_permutex2var_epi16(
5038 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5039#else
5040 const RepartitionToWide<decltype(du)> dw;
5041 // Isolate lower 16 bits per u32 so we can pack.
5042 const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
5043 const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
5044 const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
5045 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
5046 return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
5047 u16, _MM_SHUFFLE(3, 1, 2, 0))});
5048#endif
5049}
5050
5051template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5052HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5053  const RebindToUnsigned<decltype(d)> du;
5054#if HWY_TARGET <= HWY_AVX3
5055 alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
5056 return BitCast(
5057 d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
5058 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5059#else
5060 const RebindToFloat<decltype(d)> df;
5061 const Vec256<float> v2020{_mm256_shuffle_ps(
5062 BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5063 return VFromD<D>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
5064 _MM_SHUFFLE(3, 1, 2, 0))};
5065
5066#endif
5067}
5068
5069template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5070HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5071  const RebindToUnsigned<decltype(d)> du;
5072#if HWY_TARGET <= HWY_AVX3
5073 alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
5074 return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
5075#else
5076 const VFromD<D> v2020{
5077 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
5078 return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
5079 BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
5080
5081#endif
5082}
5083
5084template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5085HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5086  const RebindToUnsigned<decltype(d)> du;
5087#if HWY_TARGET <= HWY_AVX3
5088 alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
5089 return BitCast(
5090 d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
5091 BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5092#else
5093 const RebindToFloat<decltype(d)> df;
5094 const Vec256<double> v20{
5095 _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
5096 return VFromD<D>{
5097 _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
5098
5099#endif
5100}
5101
5102template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5103HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5104#if HWY_TARGET <= HWY_AVX3
5105 const RebindToUnsigned<decltype(d)> du;
5106 alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
5107 return Vec256<double>{
5108 _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
5109#else
5110 (void)d;
5111 const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
5112 return Vec256<double>{
5113 _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
5114#endif
5115}
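// --- Editorial example (not part of the original header) ---
// Illustrative usage sketch: ConcatEven/ConcatOdd deinterleave re,im pairs
// that were loaded into two consecutive vectors (v0 holds the first eight
// floats). The function name is hypothetical.
HWY_INLINE void DeinterleaveComplexExample(Vec256<float> v0, Vec256<float> v1,
                                           Vec256<float>& re,
                                           Vec256<float>& im) {
  const Full256<float> d;
  re = ConcatEven(d, v1, v0);  // even lanes: lower half from v0, upper from v1
  im = ConcatOdd(d, v1, v0);   // odd lanes: lower half from v0, upper from v1
}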
5116
5117// ------------------------------ InterleaveWholeLower
5118
5119#if HWY_TARGET <= HWY_AVX3
5120template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5121HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5122#if HWY_TARGET <= HWY_AVX3_DL
5123 const RebindToUnsigned<decltype(d)> du;
5124 alignas(32) static constexpr uint8_t kIdx[32] = {
5125 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
5126 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
5127 return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
5128#else
5129 return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5130#endif
5131}
5132
5133template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5134HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5135  const RebindToUnsigned<decltype(d)> du;
5136 alignas(32) static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
5137 4, 20, 5, 21, 6, 22, 7, 23};
5138 return BitCast(
5139 d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
5140 BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
5141}
5142
5143template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5144HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5145  const RebindToUnsigned<decltype(d)> du;
5146 alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5147 return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
5148}
5149
5150template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5151HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5152  const RebindToUnsigned<decltype(d)> du;
5153 alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5154 return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
5155}
5156
5157template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5158HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5159  const RebindToUnsigned<decltype(d)> du;
5160 alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
5161 return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
5162}
5163
5164template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5165HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5166  const RebindToUnsigned<decltype(d)> du;
5167 alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
5168 return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
5169}
5170#else // AVX2
5171template <class D, HWY_IF_V_SIZE_D(D, 32)>
5172HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5173  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5174}
5175#endif
5176
5177// ------------------------------ InterleaveWholeUpper
5178
5179#if HWY_TARGET <= HWY_AVX3
5180template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5181HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5182#if HWY_TARGET <= HWY_AVX3_DL
5183 const RebindToUnsigned<decltype(d)> du;
5184 alignas(32) static constexpr uint8_t kIdx[32] = {
5185 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
5186 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
5187 return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
5188#else
5189 return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5190#endif
5191}
5192
5193template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5194HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5195  const RebindToUnsigned<decltype(d)> du;
5196 alignas(32) static constexpr uint16_t kIdx[16] = {
5197 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
5198 return BitCast(
5199 d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
5200 BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
5201}
5202
5203template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5204HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5205  const RebindToUnsigned<decltype(d)> du;
5206 alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
5207 return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
5208}
5209
5210template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5211HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5212  const RebindToUnsigned<decltype(d)> du;
5213 alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
5214 return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
5215}
5216
5217template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5218HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5219  const RebindToUnsigned<decltype(d)> du;
5220 alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
5221 return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
5222}
5223
5224template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5225HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5226  const RebindToUnsigned<decltype(d)> du;
5227 alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
5228 return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
5229}
5230#else // AVX2
5231template <class D, HWY_IF_V_SIZE_D(D, 32)>
5232HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5233  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5234}
5235#endif
5236
5237// ------------------------------ DupEven (InterleaveLower)
5238
5239template <typename T, HWY_IF_UI32(T)>
5240HWY_API Vec256<T> DupEven(const Vec256<T> v) {
5241  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
5242}
5243HWY_API Vec256<float> DupEven(Vec256<float> v) {
5244 return Vec256<float>{
5245 _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
5246}
5247
5248template <typename T, HWY_IF_T_SIZE(T, 8)>
5249HWY_API Vec256<T> DupEven(const Vec256<T> v) {
5250 const DFromV<decltype(v)> d;
5251 return InterleaveLower(d, v, v);
5252}
5253
5254// ------------------------------ DupOdd (InterleaveUpper)
5255
5256template <typename T, HWY_IF_UI32(T)>
5257HWY_API Vec256<T> DupOdd(Vec256<T> v) {
5258 return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
5259}
5260HWY_API Vec256<float> DupOdd(Vec256<float> v) {
5261 return Vec256<float>{
5262 _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
5263}
5264
5265template <typename T, HWY_IF_T_SIZE(T, 8)>
5266HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
5267 const DFromV<decltype(v)> d;
5268 return InterleaveUpper(d, v, v);
5269}
5270
5271// ------------------------------ OddEven
5272
5273template <typename T, HWY_IF_T_SIZE(T, 1)>
5274HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
5275 const DFromV<decltype(a)> d;
5276 const Full256<uint8_t> d8;
5277 const VFromD<decltype(d8)> mask =
5278 Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
5279 0, 0xFF, 0, 0xFF, 0);
5280 return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
5281}
5282
5283template <typename T, HWY_IF_UI16(T)>
5284HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
5285 const DFromV<decltype(a)> d;
5286 const RebindToUnsigned<decltype(d)> du; // for float16_t
5287 return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi16(
5288 BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
5289}
5290
5291#if HWY_HAVE_FLOAT16
5292HWY_INLINE Vec256<float16_t> OddEven(Vec256<float16_t> a, Vec256<float16_t> b) {
5293 return Vec256<float16_t>{
5294 _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
5295}
5296#endif // HWY_HAVE_FLOAT16
5297
5298template <typename T, HWY_IF_UI32(T)>
5299HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
5300 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
5301}
5302
5303template <typename T, HWY_IF_UI64(T)>
5304HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
5305 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
5306}
5307
5308HWY_INLINE Vec256<float> OddEven(Vec256<float> a, Vec256<float> b) {
5309 return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
5310}
5311
5313 return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
5314}
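// --- Editor's example (illustrative sketch, not part of the original header).
// OddEven keeps the odd-indexed lanes of its first argument and the
// even-indexed lanes of its second. The function name below is hypothetical
// and assumes the usual hwy/highway.h + HWY_NAMESPACE setup used by this file.
HWY_API Vec256<int32_t> ExampleOddEvenUsage() {
  const Full256<int32_t> d;                // 8 lanes of int32_t
  const Vec256<int32_t> a = Iota(d, 100);  // 100, 101, ..., 107
  const Vec256<int32_t> b = Iota(d, 0);    //   0,   1, ...,   7
  // Result: {0, 101, 2, 103, 4, 105, 6, 107}.
  return OddEven(a, b);
}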
5315
5316// ------------------------------ InterleaveEven
5317
5318#if HWY_TARGET <= HWY_AVX3
5319template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5321 return VFromD<D>{_mm256_mask_shuffle_epi32(
5322 a.raw, static_cast<__mmask8>(0xAA), b.raw,
5323 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
5324}
5325template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5327 return VFromD<D>{_mm256_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0xAA),
5328 b.raw, b.raw,
5329 _MM_SHUFFLE(2, 2, 0, 0))};
5330}
5331#else
5332template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5333HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
5334 const RebindToFloat<decltype(d)> df;
5335 const VFromD<decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
5336 BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5337 return BitCast(
5338 d, VFromD<decltype(df)>{_mm256_shuffle_ps(
5339 b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5340}
5341#endif
5342
5343// I64/U64/F64 InterleaveEven is generic for vector lengths >= 32 bytes
5344template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5345HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5346 return InterleaveLower(a, b);
5347}
5348
5349// ------------------------------ InterleaveOdd
5350
5351#if HWY_TARGET <= HWY_AVX3
5352template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5354 return VFromD<D>{_mm256_mask_shuffle_epi32(
5355 b.raw, static_cast<__mmask8>(0x55), a.raw,
5356 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
5357}
5358template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5360 return VFromD<D>{_mm256_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x55),
5361 a.raw, a.raw,
5362 _MM_SHUFFLE(3, 3, 1, 1))};
5363}
5364#else
5365template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5366HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
5367 const RebindToFloat<decltype(d)> df;
5368 const VFromD<decltype(df)> b3_b1_a3_a3{_mm256_shuffle_ps(
5369 BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
5370 return BitCast(
5371 d, VFromD<decltype(df)>{_mm256_shuffle_ps(
5372 b3_b1_a3_a3.raw, b3_b1_a3_a3.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5373}
5374#endif
5375
5376// I64/U64/F64 InterleaveOdd is generic for vector lengths >= 32 bytes
5377template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5378HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
5379 return InterleaveUpper(d, a, b);
5380}
5381
5382// ------------------------------ OddEvenBlocks
5383
5384template <typename T, HWY_IF_NOT_FLOAT3264(T)>
5385HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
5386 const DFromV<decltype(odd)> d;
5387 const RebindToUnsigned<decltype(d)> du;
5388 return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
5389 BitCast(du, odd).raw, BitCast(du, even).raw, 0xFu)});
5390}
5391
5392HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
5393 return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
5394}
5395
5397 return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
5398}
5399
5400// ------------------------------ ReverseBlocks (SwapAdjacentBlocks)
5401
5402template <class D, HWY_IF_V_SIZE_D(D, 32)>
5403HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
5404 return SwapAdjacentBlocks(v);
5405}
5406
5407// ------------------------------ TableLookupBytes (ZeroExtendVector)
5408
5409// Both full
5410template <typename T, typename TI>
5411HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
5412 const DFromV<decltype(from)> d;
5413 return BitCast(d, Vec256<uint8_t>{_mm256_shuffle_epi8(
5414 BitCast(Full256<uint8_t>(), bytes).raw,
5415 BitCast(Full256<uint8_t>(), from).raw)});
5416}
5417
5418// Partial index vector
5419template <typename T, typename TI, size_t NI>
5420HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes, Vec128<TI, NI> from) {
5421 const Full256<TI> di;
5422 const Half<decltype(di)> dih;
5423 // First expand to full 128, then 256.
5424 const auto from_256 = ZeroExtendVector(di, Vec128<TI>{from.raw});
5425 const auto tbl_full = TableLookupBytes(bytes, from_256);
5426 // Shrink to 128, then partial.
5427 return Vec128<TI, NI>{LowerHalf(dih, tbl_full).raw};
5428}
5429
5430// Partial table vector
5431template <typename T, size_t N, typename TI>
5432HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, Vec256<TI> from) {
5433 const Full256<T> d;
5434 // First expand to full 128, then 256.
5435 const auto bytes_256 = ZeroExtendVector(d, Vec128<T>{bytes.raw});
5436 return TableLookupBytes(bytes_256, from);
5437}
5438
5439// Partial both are handled by x86_128.
5440
5441// ------------------------------ I8/U8 Broadcast (TableLookupBytes)
5442
5443template <int kLane, class T, HWY_IF_T_SIZE(T, 1)>
5444HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
5445 static_assert(0 <= kLane && kLane < 16, "Invalid lane");
5446 return TableLookupBytes(v, Set(Full256<T>(), static_cast<T>(kLane)));
5447}
5448
5449// ------------------------------ Per4LaneBlockShuffle
5450
5451namespace detail {
5452
5453template <class D, HWY_IF_V_SIZE_D(D, 32)>
5455 const uint32_t x2,
5456 const uint32_t x1,
5457 const uint32_t x0) {
5458 return BitCast(d, Vec256<uint32_t>{_mm256_set_epi32(
5459 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
5460 static_cast<int32_t>(x1), static_cast<int32_t>(x0),
5461 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
5462 static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
5463}
5464
5465template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
5467 hwy::SizeTag<4> /*lane_size_tag*/,
5468 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
5469 return V{_mm256_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
5470}
5471
5472template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
5474 hwy::SizeTag<4> /*lane_size_tag*/,
5475 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
5476 return V{_mm256_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))};
5477}
5478
5479template <class V>
5481 hwy::SizeTag<8> /*lane_size_tag*/,
5482 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
5483 const DFromV<decltype(v)> d;
5484 return ConcatLowerLower(d, v, v);
5485}
5486
5487template <class V>
5489 hwy::SizeTag<8> /*lane_size_tag*/,
5490 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
5491 const DFromV<decltype(v)> d;
5492 return ConcatUpperUpper(d, v, v);
5493}
5494
5495template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
5497 hwy::SizeTag<8> /*lane_size_tag*/,
5498 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
5499 return V{_mm256_permute4x64_epi64(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
5500}
5501
5502template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
5504 hwy::SizeTag<8> /*lane_size_tag*/,
5505 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
5506 return V{_mm256_permute4x64_pd(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
5507}
5508
5509} // namespace detail
5510
5511// ------------------------------ SlideUpLanes
5512
5513namespace detail {
5514
5515#if HWY_TARGET <= HWY_AVX3
5516template <int kI32Lanes, class V, HWY_IF_V_SIZE_V(V, 32)>
5518 const DFromV<decltype(hi)> d;
5519 const Repartition<uint32_t, decltype(d)> du32;
5520 return BitCast(d,
5521 Vec256<uint32_t>{_mm256_alignr_epi32(
5522 BitCast(du32, hi).raw, BitCast(du32, lo).raw, kI32Lanes)});
5523}
5524
5525template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32)>
5527 const DFromV<decltype(hi)> d;
5528 const Repartition<uint64_t, decltype(d)> du64;
5529 return BitCast(d,
5530 Vec256<uint64_t>{_mm256_alignr_epi64(
5531 BitCast(du64, hi).raw, BitCast(du64, lo).raw, kI64Lanes)});
5532}
5533
5534template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32)>
5536 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5537 "kI64Lanes must be between 0 and 3");
5538 const DFromV<decltype(v)> d;
5539 return CombineShiftRightI64Lanes<4 - kI64Lanes>(v, Zero(d));
5540}
5541#else // AVX2
5542template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32),
5545 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5546 "kI64Lanes must be between 0 and 3");
5547 constexpr int kIdx0 = (-kI64Lanes) & 3;
5548 constexpr int kIdx1 = (-kI64Lanes + 1) & 3;
5549 constexpr int kIdx2 = (-kI64Lanes + 2) & 3;
5550 constexpr int kIdx3 = (-kI64Lanes + 3) & 3;
5551 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0);
5552 constexpr int kBlendMask = (1 << (kI64Lanes * 2)) - 1;
5553
5554 const DFromV<decltype(v)> d;
5555 return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210),
5556 Zero(d).raw, kBlendMask)};
5557}
5558
5559template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32),
5560 HWY_IF_FLOAT_D(DFromV<V>)>
5562 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5563 "kI64Lanes must be between 0 and 3");
5564 constexpr int kIdx0 = (-kI64Lanes) & 3;
5565 constexpr int kIdx1 = (-kI64Lanes + 1) & 3;
5566 constexpr int kIdx2 = (-kI64Lanes + 2) & 3;
5567 constexpr int kIdx3 = (-kI64Lanes + 3) & 3;
5568 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0);
5569 constexpr int kBlendMask = (1 << kI64Lanes) - 1;
5570
5571 const DFromV<decltype(v)> d;
5572 const Repartition<double, decltype(d)> dd;
5573 return BitCast(d, Vec256<double>{_mm256_blend_pd(
5574 _mm256_permute4x64_pd(BitCast(dd, v).raw, kIdx3210),
5575 Zero(dd).raw, kBlendMask)});
5576}
5577#endif // HWY_TARGET <= HWY_AVX3
5578
5579template <class D, HWY_IF_V_SIZE_D(D, 32),
5581 D, (1 << 1) | ((HWY_TARGET > HWY_AVX3) ? (1 << 2) : 0))>
5583 const Repartition<uint8_t, decltype(d)> du8;
5584
5585 const auto idx_vec =
5586 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromD<D>)));
5587 const Indices256<TFromD<D>> idx{idx_vec.raw};
5588
5589#if HWY_TARGET <= HWY_AVX3_DL
5590 return TwoTablesLookupLanes(v, Zero(d), idx);
5591#else
5592 return TableLookupLanes(v, idx);
5593#endif
5594}
5595
5596template <class D, HWY_IF_V_SIZE_GT_D(D, 16),
5597 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | ((HWY_TARGET <= HWY_AVX3)
5598 ? ((1 << 2) | (1 << 8))
5599 : 0))>
5600HWY_INLINE VFromD<D> TableLookupSlideUpLanes(D d, VFromD<D> v, size_t amt) {
5601 const RebindToUnsigned<decltype(d)> du;
5602 using TU = TFromD<decltype(du)>;
5603
5604 const auto idx = Iota(du, static_cast<TU>(size_t{0} - amt));
5605#if HWY_TARGET <= HWY_AVX3
5606 const auto masked_idx =
5607 And(idx, Set(du, static_cast<TU>(MaxLanes(d) * 2 - 1)));
5608 return TwoTablesLookupLanes(v, Zero(d), IndicesFromVec(d, masked_idx));
5609#else
5610 const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
5611 return IfThenElseZero(RebindMask(d, idx == masked_idx),
5612 TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
5613#endif
5614}
5615
5616#if HWY_TARGET > HWY_AVX3
5617template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5619 const RepartitionToNarrow<D> dn;
5620 return BitCast(d, TableLookupSlideUpLanes(dn, BitCast(dn, v), amt * 2));
5621}
5622#endif // HWY_TARGET > HWY_AVX3
5623
5624} // namespace detail
5625
5626template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
5627HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
5628 static_assert(0 <= kBlocks && kBlocks <= 1,
5629 "kBlocks must be between 0 and 1");
5630 return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
5631}
5632
5633template <class D, HWY_IF_V_SIZE_D(D, 32)>
5634HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
5635#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5636 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
5637 if (__builtin_constant_p(amt)) {
5638 const auto v_lo = ConcatLowerLower(d, v, Zero(d));
5639 switch (amt * sizeof(TFromD<D>)) {
5640 case 0:
5641 return v;
5642 case 1:
5643 return CombineShiftRightBytes<15>(d, v, v_lo);
5644 case 2:
5645 return CombineShiftRightBytes<14>(d, v, v_lo);
5646 case 3:
5647 return CombineShiftRightBytes<13>(d, v, v_lo);
5648 case 4:
5649#if HWY_TARGET <= HWY_AVX3
5650 return detail::CombineShiftRightI32Lanes<7>(v, Zero(d));
5651#else
5652 return CombineShiftRightBytes<12>(d, v, v_lo);
5653#endif
5654 case 5:
5655 return CombineShiftRightBytes<11>(d, v, v_lo);
5656 case 6:
5657 return CombineShiftRightBytes<10>(d, v, v_lo);
5658 case 7:
5659 return CombineShiftRightBytes<9>(d, v, v_lo);
5660 case 8:
5661 return detail::SlideUpI64Lanes<1>(v);
5662 case 9:
5663 return CombineShiftRightBytes<7>(d, v, v_lo);
5664 case 10:
5665 return CombineShiftRightBytes<6>(d, v, v_lo);
5666 case 11:
5667 return CombineShiftRightBytes<5>(d, v, v_lo);
5668 case 12:
5669#if HWY_TARGET <= HWY_AVX3
5670 return detail::CombineShiftRightI32Lanes<5>(v, Zero(d));
5671#else
5672 return CombineShiftRightBytes<4>(d, v, v_lo);
5673#endif
5674 case 13:
5675 return CombineShiftRightBytes<3>(d, v, v_lo);
5676 case 14:
5677 return CombineShiftRightBytes<2>(d, v, v_lo);
5678 case 15:
5679 return CombineShiftRightBytes<1>(d, v, v_lo);
5680 case 16:
5681 return ConcatLowerLower(d, v, Zero(d));
5682#if HWY_TARGET <= HWY_AVX3
5683 case 20:
5684 return detail::CombineShiftRightI32Lanes<3>(v, Zero(d));
5685#endif
5686 case 24:
5687 return detail::SlideUpI64Lanes<3>(v);
5688#if HWY_TARGET <= HWY_AVX3
5689 case 28:
5690 return detail::CombineShiftRightI32Lanes<1>(v, Zero(d));
5691#endif
5692 }
5693 }
5694
5695 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
5696 const Half<decltype(d)> dh;
5697 return Combine(d, SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock),
5698 Zero(dh));
5699 }
5700#endif
5701
5702 return detail::TableLookupSlideUpLanes(d, v, amt);
5703}
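// --- Editor's example (illustrative sketch, not part of the original header).
// SlideUpLanes shifts lanes toward higher indices by a runtime amount and
// fills the vacated low lanes with zero; Slide1Up below is the single-lane
// special case. The function name is hypothetical.
HWY_API Vec256<uint32_t> ExampleSlideUp(size_t amt) {
  const Full256<uint32_t> d;
  const Vec256<uint32_t> v = Iota(d, 1);  // 1, 2, ..., 8
  // For amt == 3 the result is {0, 0, 0, 1, 2, 3, 4, 5}.
  return SlideUpLanes(d, v, amt);
}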
5704
5705// ------------------------------ Slide1Up
5706
5707template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5708HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
5709 const auto v_lo = ConcatLowerLower(d, v, Zero(d));
5710 return CombineShiftRightBytes<15>(d, v, v_lo);
5711}
5712
5713template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5714HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
5715 const auto v_lo = ConcatLowerLower(d, v, Zero(d));
5716 return CombineShiftRightBytes<14>(d, v, v_lo);
5717}
5718
5719template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
5720HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
5721#if HWY_TARGET <= HWY_AVX3
5722 return detail::CombineShiftRightI32Lanes<7>(v, Zero(d));
5723#else
5724 const auto v_lo = ConcatLowerLower(d, v, Zero(d));
5725 return CombineShiftRightBytes<12>(d, v, v_lo);
5726#endif
5727}
5728
5729template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5730HWY_API VFromD<D> Slide1Up(D /*d*/, VFromD<D> v) {
5731 return detail::SlideUpI64Lanes<1>(v);
5732}
5733
5734// ------------------------------ SlideDownLanes
5735
5736namespace detail {
5737
5738#if HWY_TARGET <= HWY_AVX3
5739template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32)>
5741 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5742 "kI64Lanes must be between 0 and 3");
5743 const DFromV<decltype(v)> d;
5744 return CombineShiftRightI64Lanes<kI64Lanes>(Zero(d), v);
5745}
5746#else // AVX2
5747template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32),
5749HWY_INLINE V SlideDownI64Lanes(V v) {
5750 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5751 "kI64Lanes must be between 0 and 3");
5752 constexpr int kIdx1 = (kI64Lanes + 1) & 3;
5753 constexpr int kIdx2 = (kI64Lanes + 2) & 3;
5754 constexpr int kIdx3 = (kI64Lanes + 3) & 3;
5755 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes);
5756 constexpr int kBlendMask =
5757 static_cast<int>((0xFFu << ((4 - kI64Lanes) * 2)) & 0xFFu);
5758
5759 const DFromV<decltype(v)> d;
5760 return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210),
5761 Zero(d).raw, kBlendMask)};
5762}
5763
5764template <int kI64Lanes, class V, HWY_IF_V_SIZE_V(V, 32),
5765 HWY_IF_FLOAT_D(DFromV<V>)>
5766HWY_INLINE V SlideDownI64Lanes(V v) {
5767 static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
5768 "kI64Lanes must be between 0 and 3");
5769 constexpr int kIdx1 = (kI64Lanes + 1) & 3;
5770 constexpr int kIdx2 = (kI64Lanes + 2) & 3;
5771 constexpr int kIdx3 = (kI64Lanes + 3) & 3;
5772 constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes);
5773 constexpr int kBlendMask = (0x0F << (4 - kI64Lanes)) & 0x0F;
5774
5775 const DFromV<decltype(v)> d;
5776 const Repartition<double, decltype(d)> dd;
5777 return BitCast(d, Vec256<double>{_mm256_blend_pd(
5778 _mm256_permute4x64_pd(BitCast(dd, v).raw, kIdx3210),
5779 Zero(dd).raw, kBlendMask)});
5780}
5781#endif // HWY_TARGET <= HWY_AVX3
5782
5783template <class D, HWY_IF_V_SIZE_D(D, 32),
5785 D, (1 << 1) | ((HWY_TARGET > HWY_AVX3) ? (1 << 2) : 0))>
5787 const Repartition<uint8_t, decltype(d)> du8;
5788
5789 auto idx_vec = Iota(du8, static_cast<uint8_t>(amt * sizeof(TFromD<D>)));
5790
5791#if HWY_TARGET <= HWY_AVX3_DL
5792 const auto result_mask = idx_vec < Set(du8, uint8_t{32});
5793 return VFromD<D>{
5794 _mm256_maskz_permutexvar_epi8(result_mask.raw, idx_vec.raw, v.raw)};
5795#else
5796 const RebindToSigned<decltype(du8)> di8;
5797 idx_vec =
5798 Or(idx_vec, BitCast(du8, VecFromMask(di8, BitCast(di8, idx_vec) >
5799 Set(di8, int8_t{31}))));
5800 return TableLookupLanes(v, Indices256<TFromD<D>>{idx_vec.raw});
5801#endif
5802}
5803
5804template <class D, HWY_IF_V_SIZE_GT_D(D, 16),
5805 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | ((HWY_TARGET <= HWY_AVX3)
5806 ? ((1 << 2) | (1 << 8))
5807 : 0))>
5808HWY_INLINE VFromD<D> TableLookupSlideDownLanes(D d, VFromD<D> v, size_t amt) {
5809 const RebindToUnsigned<decltype(d)> du;
5810 using TU = TFromD<decltype(du)>;
5811
5812 const auto idx = Iota(du, static_cast<TU>(amt));
5813 const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
5814
5815 return IfThenElseZero(RebindMask(d, idx == masked_idx),
5816 TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
5817}
5818
5819#if HWY_TARGET > HWY_AVX3
5820template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5821HWY_INLINE VFromD<D> TableLookupSlideDownLanes(D d, VFromD<D> v, size_t amt) {
5822 const RepartitionToNarrow<D> dn;
5823 return BitCast(d, TableLookupSlideDownLanes(dn, BitCast(dn, v), amt * 2));
5824}
5825#endif // HWY_TARGET > HWY_AVX3
5826
5827} // namespace detail
5828
5829template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
5830HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
5831 static_assert(0 <= kBlocks && kBlocks <= 1,
5832 "kBlocks must be between 0 and 1");
5833 const Half<decltype(d)> dh;
5834 return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
5835}
5836
5837template <class D, HWY_IF_V_SIZE_D(D, 32)>
5838HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
5839#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
5840 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
5841 const Half<decltype(d)> dh;
5842 if (__builtin_constant_p(amt)) {
5843 const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
5844 switch (amt * sizeof(TFromD<D>)) {
5845 case 0:
5846 return v;
5847 case 1:
5848 return CombineShiftRightBytes<1>(d, v_hi, v);
5849 case 2:
5850 return CombineShiftRightBytes<2>(d, v_hi, v);
5851 case 3:
5852 return CombineShiftRightBytes<3>(d, v_hi, v);
5853 case 4:
5854#if HWY_TARGET <= HWY_AVX3
5855 return detail::CombineShiftRightI32Lanes<1>(Zero(d), v);
5856#else
5857 return CombineShiftRightBytes<4>(d, v_hi, v);
5858#endif
5859 case 5:
5860 return CombineShiftRightBytes<5>(d, v_hi, v);
5861 case 6:
5862 return CombineShiftRightBytes<6>(d, v_hi, v);
5863 case 7:
5864 return CombineShiftRightBytes<7>(d, v_hi, v);
5865 case 8:
5866 return detail::SlideDownI64Lanes<1>(v);
5867 case 9:
5868 return CombineShiftRightBytes<9>(d, v_hi, v);
5869 case 10:
5870 return CombineShiftRightBytes<10>(d, v_hi, v);
5871 case 11:
5872 return CombineShiftRightBytes<11>(d, v_hi, v);
5873 case 12:
5874#if HWY_TARGET <= HWY_AVX3
5875 return detail::CombineShiftRightI32Lanes<3>(Zero(d), v);
5876#else
5877 return CombineShiftRightBytes<12>(d, v_hi, v);
5878#endif
5879 case 13:
5880 return CombineShiftRightBytes<13>(d, v_hi, v);
5881 case 14:
5882 return CombineShiftRightBytes<14>(d, v_hi, v);
5883 case 15:
5884 return CombineShiftRightBytes<15>(d, v_hi, v);
5885 case 16:
5886 return v_hi;
5887#if HWY_TARGET <= HWY_AVX3
5888 case 20:
5889 return detail::CombineShiftRightI32Lanes<5>(Zero(d), v);
5890#endif
5891 case 24:
5892 return detail::SlideDownI64Lanes<3>(v);
5893#if HWY_TARGET <= HWY_AVX3
5894 case 28:
5895 return detail::CombineShiftRightI32Lanes<7>(Zero(d), v);
5896#endif
5897 }
5898 }
5899
5900 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
5901 return ZeroExtendVector(
5902 d, SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock));
5903 }
5904#endif
5905
5906 return detail::TableLookupSlideDownLanes(d, v, amt);
5907}
5908
5909// ------------------------------ Slide1Down
5910
5911template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5912HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
5913 const Half<decltype(d)> dh;
5914 const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
5915 return CombineShiftRightBytes<1>(d, v_hi, v);
5916}
5917
5918template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5919HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
5920 const Half<decltype(d)> dh;
5921 const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
5922 return CombineShiftRightBytes<2>(d, v_hi, v);
5923}
5924
5925template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
5926HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
5927#if HWY_TARGET <= HWY_AVX3
5928 return detail::CombineShiftRightI32Lanes<1>(Zero(d), v);
5929#else
5930 const Half<decltype(d)> dh;
5931 const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
5932 return CombineShiftRightBytes<4>(d, v_hi, v);
5933#endif
5934}
5935
5936template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
5937HWY_API VFromD<D> Slide1Down(D /*d*/, VFromD<D> v) {
5938 return detail::SlideDownI64Lanes<1>(v);
5939}
5940
5941// ------------------------------ Shl (Mul, ZipLower)
5942
5943namespace detail {
5944
5945#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older
5946template <class V>
5947HWY_INLINE V AVX2ShlU16Vec256(V v, V bits) {
5948 const DFromV<decltype(v)> d;
5949 const Half<decltype(d)> dh;
5950 const Rebind<uint32_t, decltype(dh)> du32;
5951
5952 const auto lo_shl_result = PromoteTo(du32, LowerHalf(dh, v))
5953 << PromoteTo(du32, LowerHalf(dh, bits));
5954 const auto hi_shl_result = PromoteTo(du32, UpperHalf(dh, v))
5955 << PromoteTo(du32, UpperHalf(dh, bits));
5956 return ConcatEven(d, BitCast(d, hi_shl_result), BitCast(d, lo_shl_result));
5957}
5958#endif
5959
5961 Vec256<uint16_t> bits) {
5962#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
5963 return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
5964#else
5965 return AVX2ShlU16Vec256(v, bits);
5966#endif
5967}
5968
5969// 8-bit: may use the Shl overload for uint16_t.
5971 Vec256<uint8_t> bits) {
5972 const DFromV<decltype(v)> d;
5973#if HWY_TARGET <= HWY_AVX3_DL
5974 (void)tag;
5975 // masks[i] = 0xFF >> i
5976 const VFromD<decltype(d)> masks =
5977 Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
5978 0, 0, 0, 0, 0, 0, 0);
5979 // kShl[i] = 1 << i
5980 const VFromD<decltype(d)> shl = Dup128VecFromValues(
5981 d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
5982 v = And(v, TableLookupBytes(masks, bits));
5983 const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
5984 return VFromD<decltype(d)>{_mm256_gf2p8mul_epi8(v.raw, mul.raw)};
5985#else
5986 const Repartition<uint16_t, decltype(d)> dw;
5987 using VW = VFromD<decltype(dw)>;
5988 const VW even_mask = Set(dw, 0x00FF);
5989 const VW odd_mask = Set(dw, 0xFF00);
5990 const VW vw = BitCast(dw, v);
5991 const VW bits16 = BitCast(dw, bits);
5992 // Shift even lanes in-place
5993 const VW evens = Shl(tag, vw, And(bits16, even_mask));
5994 const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16));
5995 return OddEven(BitCast(d, odds), BitCast(d, evens));
5996#endif
5997}
5998
6000 Vec256<uint32_t> bits) {
6001 return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
6002}
6003
6005 Vec256<uint64_t> bits) {
6006 return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
6007}
6008
6009template <typename T>
6011 // Signed left shifts are the same as unsigned.
6012 const Full256<T> di;
6013 const Full256<MakeUnsigned<T>> du;
6014 return BitCast(di,
6015 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
6016}
6017
6018} // namespace detail
6019
6020template <typename T>
6021HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
6022 return detail::Shl(hwy::TypeTag<T>(), v, bits);
6023}
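// --- Editor's example (illustrative sketch, not part of the original header).
// Because the shift counts are themselves a vector, each lane may be shifted
// by a different amount. The function name is hypothetical.
HWY_API Vec256<uint32_t> ExamplePerLaneShiftLeft() {
  const Full256<uint32_t> d;
  const Vec256<uint32_t> ones = Set(d, 1u);
  const Vec256<uint32_t> bits = Iota(d, 0);  // lane i is shifted by i
  // Result: {1, 2, 4, 8, 16, 32, 64, 128}.
  return ones << bits;
}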
6024
6025// ------------------------------ Shr (MulHigh, IfThenElse, Not)
6026
6027#if HWY_TARGET > HWY_AVX3 // AVX2
6028namespace detail {
6029
6030template <class V>
6031HWY_INLINE V AVX2ShrU16Vec256(V v, V bits) {
6032 const DFromV<decltype(v)> d;
6033 const Half<decltype(d)> dh;
6034 const Rebind<int32_t, decltype(dh)> di32;
6035 const Rebind<uint32_t, decltype(dh)> du32;
6036
6037 const auto lo_shr_result =
6038 PromoteTo(du32, LowerHalf(dh, v)) >> PromoteTo(du32, LowerHalf(dh, bits));
6039 const auto hi_shr_result =
6040 PromoteTo(du32, UpperHalf(dh, v)) >> PromoteTo(du32, UpperHalf(dh, bits));
6041 return OrderedDemote2To(d, BitCast(di32, lo_shr_result),
6042 BitCast(di32, hi_shr_result));
6043}
6044
6045} // namespace detail
6046#endif
6047
6048HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
6049#if HWY_TARGET <= HWY_AVX3
6050 return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
6051#else
6052 return detail::AVX2ShrU16Vec256(v, bits);
6053#endif
6054}
6055
6056// 8-bit uses 16-bit shifts.
6057HWY_API Vec256<uint8_t> operator>>(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
6058 const DFromV<decltype(v)> d;
6059 const RepartitionToWide<decltype(d)> dw;
6060 using VW = VFromD<decltype(dw)>;
6061 const VW mask = Set(dw, 0x00FF);
6062 const VW vw = BitCast(dw, v);
6063 const VW bits16 = BitCast(dw, bits);
6064 const VW evens = And(vw, mask) >> And(bits16, mask);
6065 // Shift odd lanes in-place
6066 const VW odds = vw >> ShiftRight<8>(bits16);
6067 return OddEven(BitCast(d, odds), BitCast(d, evens));
6068}
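// --- Editor's example (illustrative sketch, not part of the original header).
// AVX2 has no 8-bit variable shift, so the overload above performs two 16-bit
// shifts (even bytes masked, odd bytes shifted in place) and recombines them
// with OddEven; callers still see an ordinary per-lane shift. The function
// name is hypothetical.
HWY_API Vec256<uint8_t> ExamplePerLaneShiftRightU8() {
  const Full256<uint8_t> d;
  const Vec256<uint8_t> v = Set(d, uint8_t{0x80});                   // 128
  const Vec256<uint8_t> bits = And(Iota(d, 0), Set(d, uint8_t{7}));  // 0..7
  // Lane i becomes 128 >> (i & 7): {128, 64, 32, 16, 8, 4, 2, 1, ...}.
  return v >> bits;
}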
6069
6071 return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
6072}
6073
6075 return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
6076}
6077
6078#if HWY_TARGET > HWY_AVX3 // AVX2
6079namespace detail {
6080
6081template <class V>
6082HWY_INLINE V AVX2ShrI16Vec256(V v, V bits) {
6083 const DFromV<decltype(v)> d;
6084 const Half<decltype(d)> dh;
6085 const Rebind<int32_t, decltype(dh)> di32;
6086
6087 const auto lo_shr_result =
6088 PromoteTo(di32, LowerHalf(dh, v)) >> PromoteTo(di32, LowerHalf(dh, bits));
6089 const auto hi_shr_result =
6090 PromoteTo(di32, UpperHalf(dh, v)) >> PromoteTo(di32, UpperHalf(dh, bits));
6091 return OrderedDemote2To(d, lo_shr_result, hi_shr_result);
6092}
6093
6094} // namespace detail
6095#endif
6096
6097HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
6098#if HWY_TARGET <= HWY_AVX3
6099 return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
6100#else
6101 return detail::AVX2ShrI16Vec256(v, bits);
6102#endif
6103}
6104
6105// 8-bit uses 16-bit shifts.
6107 const DFromV<decltype(v)> d;
6108 const RepartitionToWide<decltype(d)> dw;
6109 const RebindToUnsigned<decltype(dw)> dw_u;
6110 using VW = VFromD<decltype(dw)>;
6111 const VW mask = Set(dw, 0x00FF);
6112 const VW vw = BitCast(dw, v);
6113 const VW bits16 = BitCast(dw, bits);
6114 const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >> And(bits16, mask);
6115 // Shift odd lanes in-place
6116 const VW odds = vw >> BitCast(dw, ShiftRight<8>(BitCast(dw_u, bits16)));
6117 return OddEven(BitCast(d, odds), BitCast(d, evens));
6118}
6119
6121 return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
6122}
6123
6125#if HWY_TARGET <= HWY_AVX3
6126 return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
6127#else
6128 const DFromV<decltype(v)> d;
6129 return detail::SignedShr(d, v, bits);
6130#endif
6131}
6132
6133// ------------------------------ WidenMulPairwiseAdd
6134template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6135HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
6136 Vec256<int16_t> b) {
6137 return VFromD<D>{_mm256_madd_epi16(a.raw, b.raw)};
6138}
6139
6140// ------------------------------ SatWidenMulPairwiseAdd
6141
6142template <class DI16, HWY_IF_V_SIZE_D(DI16, 32), HWY_IF_I16_D(DI16)>
6144 DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
6145 VFromD<Repartition<int8_t, DI16>> b) {
6146 return VFromD<DI16>{_mm256_maddubs_epi16(a.raw, b.raw)};
6147}
6148
6149// ------------------------------ SatWidenMulPairwiseAccumulate
6150
6151#if HWY_TARGET <= HWY_AVX3_DL
6152template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)>
6154 DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
6155 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
6156 return VFromD<DI32>{_mm256_dpwssds_epi32(sum.raw, a.raw, b.raw)};
6157}
6158#endif // HWY_TARGET <= HWY_AVX3_DL
6159
6160// ------------------------------ ReorderWidenMulAccumulate
6161template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6162HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec256<int16_t> a,
6163 Vec256<int16_t> b,
6164 const VFromD<D> sum0,
6165 VFromD<D>& /*sum1*/) {
6166 (void)d;
6167#if HWY_TARGET <= HWY_AVX3_DL
6168 return VFromD<D>{_mm256_dpwssd_epi32(sum0.raw, a.raw, b.raw)};
6169#else
6170 return sum0 + WidenMulPairwiseAdd(d, a, b);
6171#endif
6172}
6173
6174// ------------------------------ RearrangeToOddPlusEven
6175HWY_API Vec256<int32_t> RearrangeToOddPlusEven(const Vec256<int32_t> sum0,
6176 Vec256<int32_t> /*sum1*/) {
6177 return sum0; // invariant already holds
6178}
6179
6180HWY_API Vec256<uint32_t> RearrangeToOddPlusEven(const Vec256<uint32_t> sum0,
6181 Vec256<uint32_t> /*sum1*/) {
6182 return sum0; // invariant already holds
6183}
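// --- Editor's example (illustrative sketch, not part of the original header).
// The intended use of the two functions above is an int16 dot product: widen
// and accumulate pairwise products into sum0/sum1 inside a loop, then call
// RearrangeToOddPlusEven once before reducing. On this target sum1 is unused,
// but portable callers must still pass and combine it. The function name is
// hypothetical.
HWY_API Vec256<int32_t> ExampleDotProductStep(Vec256<int16_t> a,
                                              Vec256<int16_t> b,
                                              Vec256<int32_t> sum0,
                                              Vec256<int32_t> sum1) {
  const Full256<int32_t> d32;
  sum0 = ReorderWidenMulAccumulate(d32, a, b, sum0, sum1);
  // After the last accumulation step:
  return RearrangeToOddPlusEven(sum0, sum1);
}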
6184
6185// ------------------------------ SumOfMulQuadAccumulate
6186
6187#if HWY_TARGET <= HWY_AVX3_DL
6188
6189template <class DI32, HWY_IF_V_SIZE_D(DI32, 32)>
6191 DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
6192 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
6193 return VFromD<DI32>{_mm256_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
6194}
6195
6196#endif
6197
6198// ================================================== CONVERT
6199
6200// ------------------------------ Promotions (part w/ narrow lanes -> full)
6201
6202template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6204 return VFromD<D>{_mm256_cvtps_pd(v.raw)};
6205}
6206
6207template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6209 return VFromD<D>{_mm256_cvtepi32_pd(v.raw)};
6210}
6211
6212#if HWY_TARGET <= HWY_AVX3
6213template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6215 return Vec256<double>{_mm256_cvtepu32_pd(v.raw)};
6216}
6217#endif
6218
6219// Unsigned: zero-extend.
6220// Note: these have 3 cycle latency; if inputs are already split across the
6221// 128-bit blocks (in their upper/lower halves), then Zip* would be faster.
6222template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6224 return VFromD<D>{_mm256_cvtepu8_epi16(v.raw)};
6225}
6226template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6228 return VFromD<D>{_mm256_cvtepu8_epi32(v.raw)};
6229}
6230template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6232 return VFromD<D>{_mm256_cvtepu16_epi32(v.raw)};
6233}
6234template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6236 return VFromD<D>{_mm256_cvtepu32_epi64(v.raw)};
6237}
6238template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6240 return VFromD<D>{_mm256_cvtepu16_epi64(v.raw)};
6241}
6242template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6244 return VFromD<D>{_mm256_cvtepu8_epi64(v.raw)};
6245}
6246
6247// Signed: replicate sign bit.
6248// Note: these have 3 cycle latency; if inputs are already split across the
6250// 128-bit blocks (in their upper/lower halves), then ZipUpper/ZipLower followed by
6250// signed shift would be faster.
6251template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6253 return VFromD<D>{_mm256_cvtepi8_epi16(v.raw)};
6254}
6255template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6257 return VFromD<D>{_mm256_cvtepi8_epi32(v.raw)};
6258}
6259template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6261 return VFromD<D>{_mm256_cvtepi16_epi32(v.raw)};
6262}
6263template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6264HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
6265 return VFromD<D>{_mm256_cvtepi32_epi64(v.raw)};
6266}
6267template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6269 return VFromD<D>{_mm256_cvtepi16_epi64(v.raw)};
6270}
6271template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6273 return VFromD<D>{_mm256_cvtepi8_epi64(v.raw)};
6274}
6275
6276#if HWY_TARGET <= HWY_AVX3
6277template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_I64_D(D)>
6278HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
6279 const Rebind<float, decltype(di64)> df32;
6280 const RebindToSigned<decltype(df32)> di32;
6281 const RebindToFloat<decltype(di64)> df64;
6282
6284 di64, BitCast(df64, PromoteTo(di64, BitCast(di32, v))),
6285 PromoteInRangeTo(di64, v));
6286}
6287template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6288HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
6289 return VFromD<D>{_mm256_cvttps_epi64(v.raw)};
6290}
6291template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6292HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
6293 return VFromD<D>{_mm256_cvttps_epu64(v.raw)};
6294}
6295template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6296HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
6297 return VFromD<D>{_mm256_maskz_cvttps_epu64(
6298 detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6299}
6300#endif // HWY_TARGET <= HWY_AVX3
6301
6302// ------------------------------ PromoteEvenTo/PromoteOddTo
6303#if HWY_TARGET > HWY_AVX3
6304namespace detail {
6305
6306// I32->I64 PromoteEvenTo/PromoteOddTo
6307
6308template <class D, HWY_IF_LANES_D(D, 4)>
6310 hwy::SizeTag<8> /*to_lane_size_tag*/,
6311 hwy::SignedTag /*from_type_tag*/, D d_to,
6312 Vec256<int32_t> v) {
6313 return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
6314}
6315
6316template <class D, HWY_IF_LANES_D(D, 4)>
6318 hwy::SizeTag<8> /*to_lane_size_tag*/,
6319 hwy::SignedTag /*from_type_tag*/, D d_to,
6320 Vec256<int32_t> v) {
6321 return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
6322}
6323
6324} // namespace detail
6325#endif
6326
6327// ------------------------------ Demotions (full -> part w/ narrow lanes)
6328
6329template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6331 const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
6332 // Concatenating lower halves of both 128-bit blocks afterward is more
6333 // efficient than an extra input with low block = high block of v.
6334 return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
6335}
6336
6337template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6339 const DFromV<decltype(v)> d;
6340 const RebindToSigned<decltype(d)> di;
6341 return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu))));
6342}
6343
6344template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
6345HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int32_t> v) {
6346 const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
6347 return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
6348}
6349
6350template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6351HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int32_t> v) {
6352 const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
6353 // Concatenate lower 64 bits of each 128-bit block
6354 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
6355 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
6356 return VFromD<D>{_mm_packus_epi16(i16, i16)};
6357}
6358
6359template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6360HWY_API VFromD<D> DemoteTo(D dn, Vec256<uint32_t> v) {
6361#if HWY_TARGET <= HWY_AVX3
6362 (void)dn;
6363 return VFromD<D>{_mm256_cvtusepi32_epi8(v.raw)};
6364#else
6365 const DFromV<decltype(v)> d;
6366 const RebindToSigned<decltype(d)> di;
6367 return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu))));
6368#endif
6369}
6370
6371template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6373 const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
6374 return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
6375}
6376
6377template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6379 const DFromV<decltype(v)> d;
6380 const RebindToSigned<decltype(d)> di;
6381 return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu))));
6382}
6383
6384template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
6385HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int32_t> v) {
6386 const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
6387 // Concatenate lower 64 bits of each 128-bit block
6388 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
6389 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
6390 return VFromD<D>{_mm_packs_epi16(i16, i16)};
6391}
6392
6393template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
6394HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int16_t> v) {
6395 const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
6396 return VFromD<D>{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
6397}
6398
6399#if HWY_TARGET <= HWY_AVX3
6400template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
6402 return VFromD<D>{_mm256_cvtsepi64_epi32(v.raw)};
6403}
6404template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
6405HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
6406 return VFromD<D>{_mm256_cvtsepi64_epi16(v.raw)};
6407}
6408template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
6409HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
6410 return VFromD<D>{_mm256_cvtsepi64_epi8(v.raw)};
6411}
6412
6413template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6414HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
6415 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
6416 return VFromD<D>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
6417}
6418template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6419HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
6420 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
6421 return VFromD<D>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
6422}
6423template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6424HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
6425 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
6426 return VFromD<D>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
6427}
6428
6429template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6431 return VFromD<D>{_mm256_cvtusepi64_epi32(v.raw)};
6432}
6433template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6434HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<uint64_t> v) {
6435 return VFromD<D>{_mm256_cvtusepi64_epi16(v.raw)};
6436}
6437template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6438HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<uint64_t> v) {
6439 return VFromD<D>{_mm256_cvtusepi64_epi8(v.raw)};
6440}
6441#endif // HWY_TARGET <= HWY_AVX3
6442
6443#ifndef HWY_DISABLE_F16C
6444
6445// Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
6446// 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
6447HWY_DIAGNOSTICS(push)
6448HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
6449
6450template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
6451HWY_API VFromD<D> DemoteTo(D df16, Vec256<float> v) {
6452 const RebindToUnsigned<decltype(df16)> du16;
6453 return BitCast(
6454 df16, VFromD<decltype(du16)>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
6455}
6456
6457HWY_DIAGNOSTICS(pop)
6458
6459#endif // HWY_DISABLE_F16C
6460
6461#if HWY_HAVE_FLOAT16
6462template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
6463HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
6464 return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
6465}
6466#endif // HWY_HAVE_FLOAT16
6467
6468#if HWY_AVX3_HAVE_F32_TO_BF16C
6469template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
6470HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec256<float> v) {
6471#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
6472 // Inline assembly workaround for LLVM codegen bug
6473 __m128i raw_result;
6474 __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
6475 return VFromD<D>{raw_result};
6476#else
6477 // The _mm256_cvtneps_pbh intrinsic returns a __m128bh vector that needs
6478 // to be bit-cast to a __m128i vector.
6479 return VFromD<D>{detail::BitCastToInteger(_mm256_cvtneps_pbh(v.raw))};
6480#endif
6481}
6482
6483template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
6484HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec256<float> a,
6485 Vec256<float> b) {
6486#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
6487 // Inline assembly workaround for LLVM codegen bug
6488 __m256i raw_result;
6489 __asm__("vcvtne2ps2bf16 %2, %1, %0"
6490 : "=v"(raw_result)
6491 : "v"(b.raw), "v"(a.raw));
6492 return VFromD<D>{raw_result};
6493#else
6494 // The _mm256_cvtne2ps_pbh intrinsic returns a __m256bh vector that needs
6495 // to be bit-cast to a __m256i vector.
6496 return VFromD<D>{detail::BitCastToInteger(_mm256_cvtne2ps_pbh(b.raw, a.raw))};
6497#endif
6498}
6499#endif // HWY_AVX3_HAVE_F32_TO_BF16C
6500
6501template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6503 Vec256<int32_t> b) {
6504 return VFromD<D>{_mm256_packs_epi32(a.raw, b.raw)};
6505}
6506
6507template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6508HWY_API VFromD<D> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a,
6509 Vec256<int32_t> b) {
6510 return VFromD<D>{_mm256_packus_epi32(a.raw, b.raw)};
6511}
6512
6513template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6515 Vec256<uint32_t> b) {
6516 const DFromV<decltype(a)> d;
6517 const RebindToSigned<decltype(d)> di;
6518 const auto max_i32 = Set(d, 0x7FFFFFFFu);
6519 return ReorderDemote2To(dn, BitCast(di, Min(a, max_i32)),
6520 BitCast(di, Min(b, max_i32)));
6521}
6522
6523template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
6525 Vec256<int16_t> b) {
6526 return VFromD<D>{_mm256_packs_epi16(a.raw, b.raw)};
6527}
6528
6529template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
6530HWY_API VFromD<D> ReorderDemote2To(D /*d16*/, Vec256<int16_t> a,
6531 Vec256<int16_t> b) {
6532 return VFromD<D>{_mm256_packus_epi16(a.raw, b.raw)};
6533}
6534
6535template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
6537 Vec256<uint16_t> b) {
6538 const DFromV<decltype(a)> d;
6539 const RebindToSigned<decltype(d)> di;
6540 const auto max_i16 = Set(d, 0x7FFFu);
6541 return ReorderDemote2To(dn, BitCast(di, Min(a, max_i16)),
6542 BitCast(di, Min(b, max_i16)));
6543}
6544
6545#if HWY_TARGET > HWY_AVX3
6546template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6547HWY_API Vec256<int32_t> ReorderDemote2To(D dn, Vec256<int64_t> a,
6548 Vec256<int64_t> b) {
6549 const DFromV<decltype(a)> di64;
6550 const RebindToUnsigned<decltype(di64)> du64;
6551 const Half<decltype(dn)> dnh;
6552 const Repartition<float, decltype(dn)> dn_f;
6553
6554 // Negative values are saturated by first saturating their bitwise inverse
6555 // and then inverting the saturation result
6556 const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a));
6557 const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b));
6558 const auto saturated_a = Xor(
6559 invert_mask_a,
6560 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
6561 const auto saturated_b = Xor(
6562 invert_mask_b,
6563 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));
6564
6565 return BitCast(dn,
6566 Vec256<float>{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw,
6567 BitCast(dn_f, saturated_b).raw,
6568 _MM_SHUFFLE(2, 0, 2, 0))});
6569}
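// --- Editor's note (illustrative sketch, not part of the original header).
// The invert-saturate-invert trick above can be checked with scalar
// arithmetic: for negative x, ~x is non-negative, so an unsigned saturation
// to the int32 maximum is safe, and inverting the result again restores the
// sign. The helper name is hypothetical and models a single lane.
HWY_API int32_t ScalarI64ToI32SaturateModel(int64_t x) {
  const uint64_t invert_mask = (x < 0) ? ~uint64_t{0} : uint64_t{0};
  const uint64_t flipped = invert_mask ^ static_cast<uint64_t>(x);
  // Unsigned saturation of the flipped value (here the limit is int32 max).
  const uint64_t sat = (flipped > 0x7FFFFFFFu) ? 0x7FFFFFFFu : flipped;
  return static_cast<int32_t>(static_cast<uint32_t>(invert_mask ^ sat));
}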
6570
6571template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6572HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<int64_t> a,
6573 Vec256<int64_t> b) {
6574 const DFromV<decltype(a)> di64;
6575 const RebindToUnsigned<decltype(di64)> du64;
6576 const Half<decltype(dn)> dnh;
6577 const Repartition<float, decltype(dn)> dn_f;
6578
6579 const auto saturated_a = detail::DemoteFromU64Saturate(
6580 dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a)));
6581 const auto saturated_b = detail::DemoteFromU64Saturate(
6582 dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b)));
6583
6584 return BitCast(dn,
6585 Vec256<float>{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw,
6586 BitCast(dn_f, saturated_b).raw,
6587 _MM_SHUFFLE(2, 0, 2, 0))});
6588}
6589
6590template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
6591HWY_API VFromD<D> ReorderDemote2To(D dn, Vec256<uint64_t> a,
6592 Vec256<uint64_t> b) {
6593 const Half<decltype(dn)> dnh;
6594 const Repartition<float, decltype(dn)> dn_f;
6595
6596 const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
6597 const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);
6598
6599 return BitCast(dn,
6600 Vec256<float>{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw,
6601 BitCast(dn_f, saturated_b).raw,
6602 _MM_SHUFFLE(2, 0, 2, 0))});
6603}
6604#endif // HWY_TARGET > HWY_AVX3
6605
6606template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
6607 HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
6608 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
6609 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
6610 HWY_IF_T_SIZE_ONE_OF_V(V,
6611 (1 << 1) | (1 << 2) | (1 << 4) |
6612 ((HWY_TARGET > HWY_AVX3) ? (1 << 8) : 0))>
6613HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
6614 return VFromD<D>{_mm256_permute4x64_epi64(ReorderDemote2To(d, a, b).raw,
6615 _MM_SHUFFLE(3, 1, 2, 0))};
6616}
6617
6618template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6620 return VFromD<D>{_mm256_cvtpd_ps(v.raw)};
6621}
6622
6623template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
6625 return VFromD<D>{_mm256_cvttpd_epi32(v.raw)};
6626}
6627
6628#if HWY_TARGET <= HWY_AVX3
6629template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6630HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
6631 return VFromD<D>{_mm256_cvttpd_epu32(v.raw)};
6632}
6633template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6634HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
6635 return VFromD<D>{_mm256_maskz_cvttpd_epu32(
6636 detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6637}
6638
6639template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6640HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
6641 return VFromD<D>{_mm256_cvtepi64_ps(v.raw)};
6642}
6643template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6644HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
6645 return VFromD<D>{_mm256_cvtepu64_ps(v.raw)};
6646}
6647#endif
6648
6649// For already range-limited input [0, 255].
6650HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
6651 const Full256<uint32_t> d32;
6652 const Full64<uint8_t> d8;
6653 alignas(32) static constexpr uint32_t k8From32[8] = {
6654 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
6655 // Place first four bytes in lo[0], remaining 4 in hi[1].
6656 const auto quad = TableLookupBytes(v, Load(d32, k8From32));
6657 // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
6658 const auto lo = LowerHalf(quad);
6659 const auto hi = UpperHalf(Half<decltype(d32)>(), quad);
6660 return BitCast(d8, LowerHalf(lo | hi));
6661}
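// --- Editor's example (illustrative sketch, not part of the original header).
// U8FromU32 packs eight u32 lanes that are already within [0, 255] into eight
// bytes; unlike DemoteTo it does not saturate out-of-range values. The
// function name is hypothetical.
HWY_API Vec128<uint8_t, 8> ExampleU8FromU32() {
  const Full256<uint32_t> d32;
  const Vec256<uint32_t> v = Iota(d32, 0);  // 0..7, all within [0, 255]
  return U8FromU32(v);                      // bytes 0, 1, ..., 7
}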
6662
6663// ------------------------------ Truncations
6664
6665namespace detail {
6666
6667// LO and HI each hold four indices of bytes within a 128-bit block.
6668template <uint32_t LO, uint32_t HI, typename T>
6670 const Full256<uint32_t> d32;
6671
6672#if HWY_TARGET <= HWY_AVX3_DL
6673 alignas(32) static constexpr uint32_t kMap[8] = {
6674 LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
6675 const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
6676#else
6677 alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
6678 ~0u, ~0u, LO, HI};
6679 const auto quad = TableLookupBytes(v, Load(d32, kMap));
6680 const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
6681 // Possible alternative:
6682 // const auto lo = LowerHalf(quad);
6683 // const auto hi = UpperHalf(Half<decltype(d32)>(), quad);
6684 // const auto result = lo | hi;
6685#endif
6686
6687 return Vec128<uint32_t>{_mm256_castsi256_si128(result)};
6688}
6689
6690// LO and HI each hold two indices of bytes within a 128-bit block.
6691template <uint16_t LO, uint16_t HI, typename T>
6693 const Full256<uint16_t> d16;
6694
6695#if HWY_TARGET <= HWY_AVX3_DL
6696 alignas(32) static constexpr uint16_t kMap[16] = {
6697 LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
6698 const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw);
6699 return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)});
6700#else
6701 constexpr uint16_t ff = static_cast<uint16_t>(~0u);
6702 alignas(32) static constexpr uint16_t kMap[16] = {
6703 LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
6704 const auto quad = TableLookupBytes(v, Load(d16, kMap));
6705 const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
6706 const auto half = _mm256_castsi256_si128(mixed);
6707 return LowerHalf(Vec128<uint32_t>{_mm_packus_epi32(half, half)});
6708#endif
6709}
6710
6711} // namespace detail
6712
6713template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6715 const Full256<uint32_t> d32;
6716#if HWY_TARGET <= HWY_AVX3_DL
6717 alignas(32) static constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0,
6718 0, 0, 0, 0};
6719 const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
6720 return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result})));
6721#else
6722 alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
6723 0x0800FFFFu, ~0u, ~0u, ~0u};
6724 const auto quad = TableLookupBytes(v, Load(d32, kMap));
6725 const auto lo = LowerHalf(quad);
6726 const auto hi = UpperHalf(Half<decltype(d32)>(), quad);
6727 const auto result = lo | hi;
6728 return LowerHalf(LowerHalf(Vec128<uint8_t>{result.raw}));
6729#endif
6730}
6731
6732template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6733HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
6734 const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);
6735 return VFromD<D>{result.raw};
6736}
6737
6738template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6739HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
6740 const Full256<uint32_t> d32;
6741 alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
6742 const auto v32 =
6743 TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
6744 return LowerHalf(Vec256<uint32_t>{v32.raw});
6745}
6746
6747template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6749 const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);
6750 return VFromD<D>{full.raw};
6751}
6752
6753template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6754HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
6755 const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);
6756 return VFromD<D>{full.raw};
6757}
6758
6759template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6761 const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);
6762 return VFromD<D>{full.raw};
6763}
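// --- Editor's example (illustrative sketch, not part of the original header).
// TruncateTo keeps only the low bits of each lane, whereas DemoteTo saturates
// to the narrower range. The function name is hypothetical.
HWY_API Vec128<uint8_t> ExampleTruncateU16ToU8() {
  const Full256<uint16_t> d16;
  const Full128<uint8_t> d8;
  const Vec256<uint16_t> v = Set(d16, uint16_t{0x0102});
  // Each 0x0102 lane truncates to its low byte, 0x02 (DemoteTo would clamp
  // 0x0102 to 0xFF instead).
  return TruncateTo(d8, v);
}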
6764
6765// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
6766
6767#if HWY_HAVE_FLOAT16
6768template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
6769HWY_API VFromD<D> ConvertTo(D /* tag */, Vec256<uint16_t> v) {
6770 return VFromD<D>{_mm256_cvtepu16_ph(v.raw)};
6771}
6772template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
6773HWY_API VFromD<D> ConvertTo(D /* tag */, Vec256<int16_t> v) {
6774 return VFromD<D>{_mm256_cvtepi16_ph(v.raw)};
6775}
6776#endif // HWY_HAVE_FLOAT16
6777
6778template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6780 return VFromD<D>{_mm256_cvtepi32_ps(v.raw)};
6781}
6782
6783#if HWY_TARGET <= HWY_AVX3
6784template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6786 return VFromD<D>{_mm256_cvtepu32_ps(v.raw)};
6787}
6788
6789template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6791 return VFromD<D>{_mm256_cvtepi64_pd(v.raw)};
6792}
6793
6794template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6796 return VFromD<D>{_mm256_cvtepu64_pd(v.raw)};
6797}
6798#endif // HWY_TARGET <= HWY_AVX3
6799
6800// Truncates (rounds toward zero).
6801
6802#if HWY_HAVE_FLOAT16
6803template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6804HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float16_t> v) {
6805 return VFromD<D>{_mm256_cvttph_epi16(v.raw)};
6806}
6807template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6808HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
6809 return VFromD<D>{_mm256_cvttph_epu16(v.raw)};
6810}
6811template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6812HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
6813 return VFromD<D>{_mm256_maskz_cvttph_epu16(
6814 detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6815}
6816#endif // HWY_HAVE_FLOAT16
6817
6818template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6820 return VFromD<D>{_mm256_cvttps_epi32(v.raw)};
6821}
6822
6823#if HWY_TARGET <= HWY_AVX3
6824template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6826 return VFromD<D>{_mm256_cvttpd_epi64(v.raw)};
6827}
6828template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
6829HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6830 return VFromD<DU>{_mm256_cvttps_epu32(v.raw)};
6831}
6832template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
6833HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6834 return VFromD<DU>{_mm256_maskz_cvttps_epu32(
6835 detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6836}
6837template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
6838HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6839 return VFromD<DU>{_mm256_cvttpd_epu64(v.raw)};
6840}
6841template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
6842HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6843 return VFromD<DU>{_mm256_maskz_cvttpd_epu64(
6844 detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6845}
6846#endif // HWY_TARGET <= HWY_AVX3
6847
6848HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
6849 const Full256<int32_t> di;
6851 di, v, Vec256<int32_t>{_mm256_cvtps_epi32(v.raw)});
6852}
6853
6854#ifndef HWY_DISABLE_F16C
6855
6856template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6858 (void)df32;
6859#if HWY_HAVE_FLOAT16
6860 const RebindToUnsigned<DFromV<decltype(v)>> du16;
6861 return VFromD<D>{_mm256_cvtph_ps(BitCast(du16, v).raw)};
6862#else
6863 return VFromD<D>{_mm256_cvtph_ps(v.raw)};
6864#endif // HWY_HAVE_FLOAT16
6865}
6866
6867#endif // HWY_DISABLE_F16C
6868
6869#if HWY_HAVE_FLOAT16
6870
6871template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
6872HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec64<float16_t> v) {
6873 return VFromD<D>{_mm256_cvtph_pd(v.raw)};
6874}
6875
6876#endif // HWY_HAVE_FLOAT16
6877
6878template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
6880 const Rebind<uint16_t, decltype(df32)> du16;
6881 const RebindToSigned<decltype(df32)> di32;
6882 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
6883}
6884
6885// ================================================== CRYPTO
6886
6887#if !defined(HWY_DISABLE_PCLMUL_AES)
6888
6889HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
6890 Vec256<uint8_t> round_key) {
6891#if HWY_TARGET <= HWY_AVX3_DL
6892 return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
6893#else
6894 const Full256<uint8_t> d;
6895 const Half<decltype(d)> d2;
6896 return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6897 AESRound(LowerHalf(state), LowerHalf(round_key)));
6898#endif
6899}
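// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE and a hypothetical round_keys[11]
// array holding the expanded AES-128 key, duplicated into both 128-bit
// halves): encrypts two independent blocks in parallel, one per half.
inline Vec256<uint8_t> DemoAes128Encrypt2Blocks(
    Vec256<uint8_t> blocks, const Vec256<uint8_t> round_keys[11]) {
  Vec256<uint8_t> state = Xor(blocks, round_keys[0]);  // initial AddRoundKey
  for (int r = 1; r < 10; ++r) {
    state = AESRound(state, round_keys[r]);  // SubBytes+ShiftRows+MixColumns+AddRoundKey
  }
  return AESLastRound(state, round_keys[10]);  // final round omits MixColumns
}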
6900
6901HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
6902 Vec256<uint8_t> round_key) {
6903#if HWY_TARGET <= HWY_AVX3_DL
6904 return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
6905#else
6906 const Full256<uint8_t> d;
6907 const Half<decltype(d)> d2;
6908 return Combine(d,
6909 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6910 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
6911#endif
6912}
6913
6914HWY_API Vec256<uint8_t> AESRoundInv(Vec256<uint8_t> state,
6915 Vec256<uint8_t> round_key) {
6916#if HWY_TARGET <= HWY_AVX3_DL
6917 return Vec256<uint8_t>{_mm256_aesdec_epi128(state.raw, round_key.raw)};
6918#else
6919 const Full256<uint8_t> d;
6920 const Half<decltype(d)> d2;
6921 return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6922 AESRoundInv(LowerHalf(state), LowerHalf(round_key)));
6923#endif
6924}
6925
6926HWY_API Vec256<uint8_t> AESLastRoundInv(Vec256<uint8_t> state,
6927 Vec256<uint8_t> round_key) {
6928#if HWY_TARGET <= HWY_AVX3_DL
6929 return Vec256<uint8_t>{_mm256_aesdeclast_epi128(state.raw, round_key.raw)};
6930#else
6931 const Full256<uint8_t> d;
6932 const Half<decltype(d)> d2;
6933 return Combine(
6934 d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)),
6935 AESLastRoundInv(LowerHalf(state), LowerHalf(round_key)));
6936#endif
6937}
6938
6939template <class V, HWY_IF_V_SIZE_GT_V(V, 16), HWY_IF_U8_D(DFromV<V>)>
6940HWY_API V AESInvMixColumns(V state) {
6941 const DFromV<decltype(state)> d;
6942#if HWY_TARGET <= HWY_AVX3_DL
6943 // On AVX3_DL, it is more efficient to compute InvMixColumns for a 256-bit
6944 // or 512-bit vector by doing an AESLastRound operation
6945 // (_mm256_aesenclast_epi128/_mm512_aesenclast_epi128) followed by an
6946 // AESRoundInv operation (_mm256_aesdec_epi128/_mm512_aesdec_epi128) than to
6947 // split the vector into 128-bit vectors, carry out multiple
6948 // _mm_aesimc_si128 operations, and then combine the results back into a
6949 // 256-bit or 512-bit vector.
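 // Why the composition below works: AESLastRound applies ShiftRows and
 // SubBytes (plus AddRoundKey), while AESRoundInv applies InvShiftRows,
 // InvSubBytes, InvMixColumns and AddRoundKey. With a zero round key, the
 // byte-wise SubBytes commutes with the ShiftRows permutation, so the first
 // two steps of AESRoundInv cancel AESLastRound, leaving only InvMixColumns.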
6950 const auto zero = Zero(d);
6951 return AESRoundInv(AESLastRound(state, zero), zero);
6952#else
6953 const Half<decltype(d)> dh;
6954 return Combine(d, AESInvMixColumns(UpperHalf(dh, state)),
6955 AESInvMixColumns(LowerHalf(dh, state)));
6956#endif
6957}
6958
6959template <uint8_t kRcon>
6960HWY_API Vec256<uint8_t> AESKeyGenAssist(Vec256<uint8_t> v) {
6961 const Full256<uint8_t> d;
6962#if HWY_TARGET <= HWY_AVX3_DL
6963 const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
6964 d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
6965 const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
6966 d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
6967 const Repartition<uint32_t, decltype(d)> du32;
6968 const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
6969 const auto sub_word_result = AESLastRound(w13, rconXorMask);
6970 return TableLookupBytes(sub_word_result, rotWordShuffle);
6971#else
6972 const Half<decltype(d)> d2;
6973 return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
6974 AESKeyGenAssist<kRcon>(LowerHalf(v)));
6975#endif
6976}
6977
6978HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
6979#if HWY_TARGET <= HWY_AVX3_DL
6980 return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
6981#else
6982 const Full256<uint64_t> d;
6983 const Half<decltype(d)> d2;
6984 return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
6985 CLMulLower(LowerHalf(a), LowerHalf(b)));
6986#endif
6987}
6988
6989HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
6990#if HWY_TARGET <= HWY_AVX3_DL
6991 return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
6992#else
6993 const Full256<uint64_t> d;
6994 const Half<decltype(d)> d2;
6995 return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
6996 CLMulUpper(LowerHalf(a), LowerHalf(b)));
6997#endif
6998}
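// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): CLMulLower carry-lessly
// multiplies the even (lower) u64 lane of each 128-bit block, producing a
// 128-bit product per block. In GF(2)[x], 3 * 5 = (x+1)(x^2+1) = x^3+x^2+x+1,
// i.e. 15 with no carries.
inline void DemoClMul() {
  const Full256<uint64_t> du64;
  const Vec256<uint64_t> product =
      CLMulLower(Set(du64, uint64_t{3}), Set(du64, uint64_t{5}));
  HWY_DASSERT(ExtractLane(product, 0) == 15 && ExtractLane(product, 1) == 0);
  (void)product;
}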
6999
7000#endif // HWY_DISABLE_PCLMUL_AES
7001
7002// ================================================== MISC
7003
7004#if HWY_TARGET <= HWY_AVX3
7005
7006// ------------------------------ LoadMaskBits
7007
7008// `p` points to at least 8 readable bytes, not all of which need be valid.
7009template <class D, HWY_IF_V_SIZE_D(D, 32)>
7010HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
7011 constexpr size_t kN = MaxLanes(d);
7012 constexpr size_t kNumBytes = (kN + 7) / 8;
7013
7014 uint64_t mask_bits = 0;
7015 CopyBytes<kNumBytes>(bits, &mask_bits);
7016
7017 if (kN < 8) {
7018 mask_bits &= (1ull << kN) - 1;
7019 }
7020
7021 return MFromD<D>::FromBits(mask_bits);
7022}
7023
7024// ------------------------------ StoreMaskBits
7025
7026// `p` points to at least 8 writable bytes.
7027template <class D, HWY_IF_V_SIZE_D(D, 32)>
7028HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
7029 constexpr size_t kN = MaxLanes(d);
7030 constexpr size_t kNumBytes = (kN + 7) / 8;
7031
7032 CopyBytes<kNumBytes>(&mask.raw, bits);
7033
7034 // Non-full byte, need to clear the undefined upper bits.
7035 if (kN < 8) {
7036 const int mask_bits = static_cast<int>((1ull << kN) - 1);
7037 bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
7038 }
7039 return kNumBytes;
7040}
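// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): a Full256<float> mask has 8
// lanes, so its bits round-trip through a single byte.
inline void DemoMaskBitsRoundTrip() {
  const Full256<float> df;
  const auto m = FirstN(df, 3);  // lanes 0..2 true -> bits 0b111
  uint8_t bits[8] = {0};         // contract: at least 8 writable bytes
  const size_t num_bytes = StoreMaskBits(df, m, bits);
  HWY_DASSERT(num_bytes == 1 && bits[0] == 0x07);
  const auto m2 = LoadMaskBits(df, bits);
  HWY_DASSERT(CountTrue(df, m2) == 3 && FindKnownLastTrue(df, m2) == 2);
  (void)num_bytes;
  (void)m2;
}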
7041
7042// ------------------------------ Mask testing
7043
7044template <class D, HWY_IF_V_SIZE_D(D, 32)>
7045HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
7046 return PopCount(static_cast<uint64_t>(mask.raw));
7047}
7048
7049template <class D, HWY_IF_V_SIZE_D(D, 32)>
7050HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
7051 return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
7052}
7053
7054template <class D, HWY_IF_V_SIZE_D(D, 32)>
7055HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
7056 return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
7057 : intptr_t{-1};
7058}
7059
7060template <class D, HWY_IF_V_SIZE_D(D, 32)>
7061HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
7062 return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw);
7063}
7064
7065template <class D, HWY_IF_V_SIZE_D(D, 32)>
7066HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
7067 return mask.raw ? static_cast<intptr_t>(FindKnownLastTrue(d, mask))
7068 : intptr_t{-1};
7069}
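// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): returns the index of the first
// of 8 floats exceeding a threshold, or -1 if none does.
inline intptr_t DemoFirstAboveThreshold(const float* HWY_RESTRICT p,
                                        float threshold) {
  const Full256<float> df;
  const Vec256<float> v = LoadU(df, p);  // reads 8 floats
  return FindFirstTrue(df, v > Set(df, threshold));
}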
7070
7071// Beware: the suffix indicates the number of mask bits, not lane size!
7072
7073namespace detail {
7074
7075template <typename T>
7076HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
7077#if HWY_COMPILER_HAS_MASK_INTRINSICS
7078 return _kortestz_mask32_u8(mask.raw, mask.raw);
7079#else
7080 return mask.raw == 0;
7081#endif
7082}
7083template <typename T>
7084HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
7085#if HWY_COMPILER_HAS_MASK_INTRINSICS
7086 return _kortestz_mask16_u8(mask.raw, mask.raw);
7087#else
7088 return mask.raw == 0;
7089#endif
7090}
7091template <typename T>
7092HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
7093#if HWY_COMPILER_HAS_MASK_INTRINSICS
7094 return _kortestz_mask8_u8(mask.raw, mask.raw);
7095#else
7096 return mask.raw == 0;
7097#endif
7098}
7099template <typename T>
7100HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
7101 return (uint64_t{mask.raw} & 0xF) == 0;
7102}
7103
7104} // namespace detail
7105
7106template <class D, HWY_IF_V_SIZE_D(D, 32)>
7107HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
7108 return detail::AllFalse(hwy::SizeTag<sizeof(TFromD<D>)>(), mask);
7109}
7110
7111namespace detail {
7112
7113template <typename T>
7114HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
7115#if HWY_COMPILER_HAS_MASK_INTRINSICS
7116 return _kortestc_mask32_u8(mask.raw, mask.raw);
7117#else
7118 return mask.raw == 0xFFFFFFFFu;
7119#endif
7120}
7121template <typename T>
7122HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
7123#if HWY_COMPILER_HAS_MASK_INTRINSICS
7124 return _kortestc_mask16_u8(mask.raw, mask.raw);
7125#else
7126 return mask.raw == 0xFFFFu;
7127#endif
7128}
7129template <typename T>
7130HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
7131#if HWY_COMPILER_HAS_MASK_INTRINSICS
7132 return _kortestc_mask8_u8(mask.raw, mask.raw);
7133#else
7134 return mask.raw == 0xFFu;
7135#endif
7136}
7137template <typename T>
7138HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
7139 // Cannot use _kortestc because we have less than 8 mask bits.
7140 return mask.raw == 0xFu;
7141}
7142
7143} // namespace detail
7144
7145template <class D, HWY_IF_V_SIZE_D(D, 32)>
7146HWY_API bool AllTrue(D /* tag */, const MFromD<D> mask) {
7147 return detail::AllTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), mask);
7148}
7149
7150// ------------------------------ Compress
7151
7152// 16-bit is defined in x86_512 so we can use 512-bit vectors.
7153
7154template <typename T, HWY_IF_T_SIZE(T, 4)>
7155HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
7156 return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
7157}
7158
7159HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) {
7160 return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
7161}
7162
7163template <typename T, HWY_IF_T_SIZE(T, 8)>
7164HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
7165 // See CompressIsPartition.
7166 alignas(16) static constexpr uint64_t packed_array[16] = {
7167 // PrintCompress64x4NibbleTables
7168 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
7169 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
7170 0x00001032, 0x00001320, 0x00000321, 0x00003210};
7171
7172 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
7173 // _mm256_permutexvar_epi64 will ignore the upper bits.
7174 const DFromV<decltype(v)> d;
7175 const RebindToUnsigned<decltype(d)> du64;
7176 const auto packed = Set(du64, packed_array[mask.raw]);
7177 alignas(64) static constexpr uint64_t shifts[4] = {0, 4, 8, 12};
7178 const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
7179 return TableLookupLanes(v, indices);
7180}
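// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): keeps only the strictly positive
// lanes, packed toward lane 0. `out` must have room for all 4 doubles even
// though only the first `count` are meaningful.
inline size_t DemoCompressPositive(Vec256<double> v,
                                   double* HWY_RESTRICT out) {
  const Full256<double> dd;
  const Mask256<double> keep = v > Zero(dd);
  StoreU(Compress(v, keep), dd, out);  // lanes [0, count) hold kept values
  return CountTrue(dd, keep);
}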
7181
7182// ------------------------------ CompressNot (Compress)
7183
7184// Implemented in x86_512 for lane size != 8.
7185
7186template <typename T, HWY_IF_T_SIZE(T, 8)>
7187HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
7188 // See CompressIsPartition.
7189 alignas(16) static constexpr uint64_t packed_array[16] = {
7190 // PrintCompressNot64x4NibbleTables
7191 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
7192 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
7193 0x00003210, 0x00003201, 0x00003210, 0x00003210};
7194
7195 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
7196 // _mm256_permutexvar_epi64 will ignore the upper bits.
7197 const DFromV<decltype(v)> d;
7198 const RebindToUnsigned<decltype(d)> du64;
7199 const auto packed = Set(du64, packed_array[mask.raw]);
7200 alignas(32) static constexpr uint64_t shifts[4] = {0, 4, 8, 12};
7201 const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
7202 return TableLookupLanes(v, indices);
7203}
7204
7205// ------------------------------ CompressStore (defined in x86_512)
7206// ------------------------------ CompressBlendedStore (defined in x86_512)
7207// ------------------------------ CompressBitsStore (defined in x86_512)
7208
7209#else // AVX2
7210
7211// ------------------------------ LoadMaskBits (TestBit)
7212
7213namespace detail {
7214
7215// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_V_SIZE.
7216template <typename T, HWY_IF_T_SIZE(T, 1)>
7217HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7218 const Full256<T> d;
7219 const RebindToUnsigned<decltype(d)> du;
7220 const Repartition<uint32_t, decltype(d)> du32;
7221 const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
7222
7223 // Replicate bytes 8x such that each byte contains the bit that governs it.
7224 const Repartition<uint64_t, decltype(d)> du64;
7225 alignas(32) static constexpr uint64_t kRep8[4] = {
7226 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
7227 0x0303030303030303ull};
7228 const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
7229
7230 const VFromD<decltype(du)> bit = Dup128VecFromValues(
7231 du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
7232 return RebindMask(d, TestBit(rep8, bit));
7233}
7234
7235template <typename T, HWY_IF_T_SIZE(T, 2)>
7236HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7237 const Full256<T> d;
7238 const RebindToUnsigned<decltype(d)> du;
7239 alignas(32) static constexpr uint16_t kBit[16] = {
7240 1, 2, 4, 8, 16, 32, 64, 128,
7241 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
7242 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
7243 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
7244}
7245
7246template <typename T, HWY_IF_T_SIZE(T, 4)>
7247HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7248 const Full256<T> d;
7249 const RebindToUnsigned<decltype(d)> du;
7250 alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
7251 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
7252 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
7253}
7254
7255template <typename T, HWY_IF_T_SIZE(T, 8)>
7256HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
7257 const Full256<T> d;
7258 const RebindToUnsigned<decltype(d)> du;
7259 alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8};
7260 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
7261}
7262
7263} // namespace detail
7264
7265// `p` points to at least 8 readable bytes, not all of which need be valid.
7266template <class D, HWY_IF_V_SIZE_D(D, 32)>
7267HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
7268 constexpr size_t kN = MaxLanes(d);
7269 constexpr size_t kNumBytes = (kN + 7) / 8;
7270
7271 uint64_t mask_bits = 0;
7272 CopyBytes<kNumBytes>(bits, &mask_bits);
7273
7274 if (kN < 8) {
7275 mask_bits &= (1ull << kN) - 1;
7276 }
7277
7278 return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
7279}
7280
7281// ------------------------------ StoreMaskBits
7282
7283namespace detail {
7284
7285template <typename T, HWY_IF_T_SIZE(T, 1)>
7286HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
7287 const Full256<T> d;
7288 const Full256<uint8_t> d8;
7289 const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
7290 // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
7291 return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
7292}
7293
7294template <typename T, HWY_IF_T_SIZE(T, 2)>
7295HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
7296#if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2)
7297 const Full256<T> d;
7298 const Full256<uint8_t> d8;
7299 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
7300 const uint64_t sign_bits8 = BitsFromMask(mask8);
7301 // Skip the bits from the lower byte of each u16 (better not to use the
7302 // same packs_epi16 as SSE4, because that requires an extra swizzle here).
7303 return _pext_u32(static_cast<uint32_t>(sign_bits8), 0xAAAAAAAAu);
7304#else
7305 // Slow workaround for when BMI2 is disabled
7306 // Remove useless lower half of each u16 while preserving the sign bit.
7307 // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
7308 const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
7309 // Move odd qwords (value zero) to top so they don't affect the mask value.
7310 const auto compressed = _mm256_castsi256_si128(
7311 _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)));
7312 return static_cast<unsigned>(_mm_movemask_epi8(compressed));
7313#endif // !HWY_DISABLE_BMI2_FMA && !HWY_DISABLE_PEXT_ON_AVX2
7314}
7315
7316template <typename T, HWY_IF_T_SIZE(T, 4)>
7317HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
7318 const Full256<T> d;
7319 const Full256<float> df;
7320 const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
7321 return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
7322}
7323
7324template <typename T, HWY_IF_T_SIZE(T, 8)>
7325HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
7326 const Full256<T> d;
7327 const Full256<double> df;
7328 const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
7329 return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
7330}
7331
7332} // namespace detail
7333
7334// `p` points to at least 8 writable bytes.
7335template <class D, HWY_IF_V_SIZE_D(D, 32)>
7336HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
7337 constexpr size_t N = Lanes(d);
7338 constexpr size_t kNumBytes = (N + 7) / 8;
7339
7340 const uint64_t mask_bits = detail::BitsFromMask(mask);
7341 CopyBytes<kNumBytes>(&mask_bits, bits);
7342 return kNumBytes;
7343}
7344
7345// ------------------------------ Mask testing
7346
7347// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
7348// lane is 0 or ~0.
7349template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7350HWY_API bool AllFalse(D d, MFromD<D> mask) {
7351 const Repartition<uint8_t, decltype(d)> d8;
7352 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
7353 return detail::BitsFromMask(mask8) == 0;
7354}
7355
7356template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
7357HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
7358 // Cheaper than PTEST, which is 2 uop / 3L.
7359 return detail::BitsFromMask(mask) == 0;
7360}
7361
7362template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7363HWY_API bool AllTrue(D d, MFromD<D> mask) {
7364 const Repartition<uint8_t, decltype(d)> d8;
7365 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
7366 return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
7367}
7368template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
7369HWY_API bool AllTrue(D d, MFromD<D> mask) {
7370 constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1;
7371 return detail::BitsFromMask(mask) == kAllBits;
7372}
7373
7374template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7375HWY_API size_t CountTrue(D d, MFromD<D> mask) {
7376 const Repartition<uint8_t, decltype(d)> d8;
7377 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
7378 return PopCount(detail::BitsFromMask(mask8)) >> 1;
7379}
7380template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
7381HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
7382 return PopCount(detail::BitsFromMask(mask));
7383}
7384
7385template <class D, HWY_IF_V_SIZE_D(D, 32)>
7386HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
7387 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
7388 return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
7389}
7390
7391template <class D, HWY_IF_V_SIZE_D(D, 32)>
7392HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) {
7393 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
7394 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
7395}
7396
7397template <class D, HWY_IF_V_SIZE_D(D, 32)>
7398HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
7399 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
7400 return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
7401}
7402
7403template <class D, HWY_IF_V_SIZE_D(D, 32)>
7404HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) {
7405 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
7406 return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
7407 : -1;
7408}
7409
7410// ------------------------------ Compress, CompressBits
7411
7412namespace detail {
7413
7414template <typename T, HWY_IF_T_SIZE(T, 4)>
7415HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
7416 const Full256<uint32_t> d32;
7417 // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
7418 // of SetTableIndices would require 8 KiB, a large part of L1D. The other
7419 // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
7420 // and unavailable in 32-bit builds. We instead compress each index into 4
7421 // bits, for a total of 1 KiB.
7422 alignas(16) static constexpr uint32_t packed_array[256] = {
7423 // PrintCompress32x8Tables
7424 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
7425 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
7426 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
7427 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
7428 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
7429 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
7430 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
7431 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
7432 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
7433 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
7434 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
7435 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
7436 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
7437 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
7438 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
7439 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
7440 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
7441 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
7442 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
7443 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
7444 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
7445 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
7446 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
7447 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
7448 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
7449 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
7450 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
7451 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
7452 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
7453 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
7454 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
7455 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
7456 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
7457 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
7458 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
7459 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
7460 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
7461 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
7462 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
7463 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
7464 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
7465 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
7466 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
7467
7468 // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
7469 // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
7470 // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
7471 // latency, it may be faster to use LoadDup128 and PSHUFB.
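 // Worked example: for mask_bits == 0, packed_array[0] == 0x76543210; after
 // broadcasting and shifting lane i right by 4*i bits, lane i holds nibble i
 // in its low bits, and since the permute reads only bits [0, 3), the result
 // is the identity permutation {0, ..., 7}. Nonzero masks instead move the
 // selected lanes toward lane 0.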
7472 const auto packed = Set(d32, packed_array[mask_bits]);
7473 alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12,
7474 16, 20, 24, 28};
7475 return packed >> Load(d32, shifts);
7476}
7477
7478template <typename T, HWY_IF_T_SIZE(T, 8)>
7479HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
7480 const Full256<uint32_t> d32;
7481
7482 // For 64-bit, we still need 32-bit indices because there is no 64-bit
7483 // permutevar, but there are only 4 lanes, so we can afford to skip the
7484 // unpacking and load the entire index vector directly.
7485 alignas(32) static constexpr uint32_t u32_indices[128] = {
7486 // PrintCompress64x4PairTables
7487 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
7488 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
7489 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
7490 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
7491 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
7492 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
7493 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
7494 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
7495 return Load(d32, u32_indices + 8 * mask_bits);
7496}
7497
7498template <typename T, HWY_IF_T_SIZE(T, 4)>
7499HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
7500 const Full256<uint32_t> d32;
7501 // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
7502 // of SetTableIndices would require 8 KiB, a large part of L1D. The other
7503 // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
7504 // and unavailable in 32-bit builds. We instead compress each index into 4
7505 // bits, for a total of 1 KiB.
7506 alignas(16) static constexpr uint32_t packed_array[256] = {
7507 // PrintCompressNot32x8Tables
7508 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
7509 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
7510 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
7511 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
7512 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
7513 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
7514 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
7515 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
7516 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
7517 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
7518 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
7519 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
7520 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
7521 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
7522 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
7523 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
7524 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
7525 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
7526 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
7527 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
7528 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
7529 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
7530 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
7531 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
7532 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
7533 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
7534 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
7535 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
7536 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
7537 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
7538 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
7539 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
7540 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
7541 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
7542 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
7543 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
7544 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
7545 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
7546 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
7547 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
7548 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
7549 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
7550 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
7551
7552 // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
7553 // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
7554 // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
7555 // latency, it may be faster to use LoadDup128 and PSHUFB.
7556 const Vec256<uint32_t> packed = Set(d32, packed_array[mask_bits]);
7557 alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12,
7558 16, 20, 24, 28};
7559 return packed >> Load(d32, shifts);
7560}
7561
7562template <typename T, HWY_IF_T_SIZE(T, 8)>
7563HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
7564 const Full256<uint32_t> d32;
7565
7566 // For 64-bit, we still need 32-bit indices because there is no 64-bit
7567 // permutevar, but there are only 4 lanes, so we can afford to skip the
7568 // unpacking and load the entire index vector directly.
7569 alignas(32) static constexpr uint32_t u32_indices[128] = {
7570 // PrintCompressNot64x4PairTables
7571 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
7572 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
7573 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
7574 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
7575 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
7576 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
7577 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
7578 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
7579 return Load(d32, u32_indices + 8 * mask_bits);
7580}
7581
7582template <typename T, HWY_IF_NOT_T_SIZE(T, 2)>
7583HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
7584 const DFromV<decltype(v)> d;
7585 const Repartition<uint32_t, decltype(d)> du32;
7586
7587 HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
7588 // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
7589 // no instruction for 4x64).
7590 const Indices256<uint32_t> indices{IndicesFromBits256<T>(mask_bits).raw};
7591 return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
7592}
7593
7594// LUTs are infeasible for 2^16 possible masks, so splice together two
7595// half-vector Compress.
7596template <typename T, HWY_IF_T_SIZE(T, 2)>
7597HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
7598 const DFromV<decltype(v)> d;
7599 const RebindToUnsigned<decltype(d)> du;
7600 const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
7601 const Half<decltype(du)> duh;
7602 const auto half0 = LowerHalf(duh, vu16);
7603 const auto half1 = UpperHalf(duh, vu16);
7604
7605 const uint64_t mask_bits0 = mask_bits & 0xFF;
7606 const uint64_t mask_bits1 = mask_bits >> 8;
7607 const auto compressed0 = detail::CompressBits(half0, mask_bits0);
7608 const auto compressed1 = detail::CompressBits(half1, mask_bits1);
7609
7610 alignas(32) uint16_t all_true[16] = {};
7611 // Store mask=true lanes, left to right.
7612 const size_t num_true0 = PopCount(mask_bits0);
7613 Store(compressed0, duh, all_true);
7614 StoreU(compressed1, duh, all_true + num_true0);
7615
7616 if (CompressIsPartition<T>::value) {
7617 // Store mask=false lanes, right to left. The second vector fills the upper
7618 // half with right-aligned false lanes. The first vector is shifted
7619 // rightwards to overwrite the true lanes of the second.
7620 alignas(32) uint16_t all_false[16] = {};
7621 const size_t num_true1 = PopCount(mask_bits1);
7622 Store(compressed1, duh, all_false + 8);
7623 StoreU(compressed0, duh, all_false + num_true1);
7624
7625 const auto mask = FirstN(du, num_true0 + num_true1);
7626 return BitCast(d,
7627 IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
7628 } else {
7629 // Only care about the mask=true lanes.
7630 return BitCast(d, Load(du, all_true));
7631 }
7632}
7633
7634template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
7635HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
7636 const DFromV<decltype(v)> d;
7637 const Repartition<uint32_t, decltype(d)> du32;
7638
7639 HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
7640 // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
7641 // no instruction for 4x64).
7642 const Indices256<uint32_t> indices{IndicesFromNotBits256<T>(mask_bits).raw};
7643 return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
7644}
7645
7646// LUTs are infeasible for 2^16 possible masks, so splice together two
7647// half-vector Compress.
7648template <typename T, HWY_IF_T_SIZE(T, 2)>
7649HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
7650 // Compress ensures only the lower 16 bits are set, so flip those.
7651 return Compress(v, mask_bits ^ 0xFFFF);
7652}
7653
7654} // namespace detail
7655
7656template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
7657HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
7658 return detail::Compress(v, detail::BitsFromMask(m));
7659}
7660
7661template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
7662HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
7663 return detail::CompressNot(v, detail::BitsFromMask(m));
7664}
7665
7666HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
7667 Mask256<uint64_t> mask) {
7668 return CompressNot(v, mask);
7669}
7670
7671template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
7672HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
7673 constexpr size_t N = 32 / sizeof(T);
7674 constexpr size_t kNumBytes = (N + 7) / 8;
7675
7676 uint64_t mask_bits = 0;
7677 CopyBytes<kNumBytes>(bits, &mask_bits);
7678
7679 if (N < 8) {
7680 mask_bits &= (1ull << N) - 1;
7681 }
7682
7683 return detail::Compress(v, mask_bits);
7684}
7685
7686// ------------------------------ CompressStore, CompressBitsStore
7687
7688template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
7689HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
7690 TFromD<D>* HWY_RESTRICT unaligned) {
7691 const uint64_t mask_bits = detail::BitsFromMask(m);
7692 const size_t count = PopCount(mask_bits);
7693 StoreU(detail::Compress(v, mask_bits), d, unaligned);
7694 detail::MaybeUnpoison(unaligned, count);
7695 return count;
7696}
7697
7698template <class D, HWY_IF_V_SIZE_D(D, 32),
7699 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
7700HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
7701 TFromD<D>* HWY_RESTRICT unaligned) {
7702 const uint64_t mask_bits = detail::BitsFromMask(m);
7703 const size_t count = PopCount(mask_bits);
7704
7705 const RebindToUnsigned<decltype(d)> du;
7706 const Repartition<uint32_t, decltype(d)> du32;
7707 HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
7708 // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
7709 // no instruction for 4x64). Nibble MSB encodes FirstN.
7710 const Vec256<uint32_t> idx_mask =
7711 detail::IndicesFromBits256<TFromD<D>>(mask_bits);
7712 // Shift nibble MSB into MSB
7713 const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_mask));
7714 // First cast to unsigned (RebindMask cannot change lane size)
7715 const MFromD<decltype(du)> mask_u{mask32.raw};
7716 const MFromD<D> mask = RebindMask(d, mask_u);
7717 const VFromD<D> compressed = BitCast(
7718 d,
7719 TableLookupLanes(BitCast(du32, v), Indices256<uint32_t>{idx_mask.raw}));
7720
7721 BlendedStore(compressed, mask, d, unaligned);
7722 detail::MaybeUnpoison(unaligned, count);
7723 return count;
7724}
7725
7726template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
7727HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
7728 TFromD<D>* HWY_RESTRICT unaligned) {
7729 const uint64_t mask_bits = detail::BitsFromMask(m);
7730 const size_t count = PopCount(mask_bits);
7731 const VFromD<D> compressed = detail::Compress(v, mask_bits);
7732
7733#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN
7734 // BlendedStore tests mask for each lane, but we know that the mask is
7735 // FirstN, so we can just copy.
7736 alignas(32) TFromD<D> buf[16];
7737 Store(compressed, d, buf);
7738 CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
7739#else
7740 BlendedStore(compressed, FirstN(d, count), d, unaligned);
7741#endif
7742 return count;
7743}
7744
7745template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
7746HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
7747 D d, TFromD<D>* HWY_RESTRICT unaligned) {
7748 constexpr size_t N = Lanes(d);
7749 constexpr size_t kNumBytes = (N + 7) / 8;
7750
7751 uint64_t mask_bits = 0;
7752 CopyBytes<kNumBytes>(bits, &mask_bits);
7753
7754 if (N < 8) {
7755 mask_bits &= (1ull << N) - 1;
7756 }
7757 const size_t count = PopCount(mask_bits);
7758
7759 StoreU(detail::Compress(v, mask_bits), d, unaligned);
7760 detail::MaybeUnpoison(unaligned, count);
7761 return count;
7762}
7763
7764#endif // HWY_TARGET <= HWY_AVX3
7765
7766// ------------------------------ Dup128MaskFromMaskBits
7767
7768// Generic for all vector lengths >= 32 bytes
7769template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
7770HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
7771 const Half<decltype(d)> dh;
7772 const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
7773 return CombineMasks(d, mh, mh);
7774}
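// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): bit i of mask_bits controls lane
// i within *each* 128-bit block, so 0b0101 selects lanes {0, 2, 4, 6} of a
// Full256<uint32_t> vector.
inline void DemoDup128Mask() {
  const Full256<uint32_t> du;
  const auto m = Dup128MaskFromMaskBits(du, 0b0101u);
  HWY_DASSERT(CountTrue(du, m) == 4 && FindKnownFirstTrue(du, m) == 0);
  (void)m;
}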
7775
7776// ------------------------------ Expand
7777
7778// Always define Expand/LoadExpand because generic_ops only does so for Vec128.
7779
7780namespace detail {
7781
7782#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2
7783
7784HWY_INLINE Vec256<uint8_t> NativeExpand(Vec256<uint8_t> v,
7785 Mask256<uint8_t> mask) {
7786 return Vec256<uint8_t>{_mm256_maskz_expand_epi8(mask.raw, v.raw)};
7787}
7788
7789HWY_INLINE Vec256<uint16_t> NativeExpand(Vec256<uint16_t> v,
7790 Mask256<uint16_t> mask) {
7791 return Vec256<uint16_t>{_mm256_maskz_expand_epi16(mask.raw, v.raw)};
7792}
7793
7794template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
7795HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
7796 const uint8_t* HWY_RESTRICT unaligned) {
7797 return VFromD<D>{_mm256_maskz_expandloadu_epi8(mask.raw, unaligned)};
7798}
7799
7800template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
7801HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
7802 const uint16_t* HWY_RESTRICT unaligned) {
7803 return VFromD<D>{_mm256_maskz_expandloadu_epi16(mask.raw, unaligned)};
7804}
7805
7806#endif // HWY_TARGET <= HWY_AVX3_DL
7807#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
7808
7809HWY_INLINE Vec256<uint32_t> NativeExpand(Vec256<uint32_t> v,
7810 Mask256<uint32_t> mask) {
7811 return Vec256<uint32_t>{_mm256_maskz_expand_epi32(mask.raw, v.raw)};
7812}
7813
7814HWY_INLINE Vec256<uint64_t> NativeExpand(Vec256<uint64_t> v,
7815 Mask256<uint64_t> mask) {
7816 return Vec256<uint64_t>{_mm256_maskz_expand_epi64(mask.raw, v.raw)};
7817}
7818
7819template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
7820HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
7821 const uint32_t* HWY_RESTRICT unaligned) {
7822 return VFromD<D>{_mm256_maskz_expandloadu_epi32(mask.raw, unaligned)};
7823}
7824
7825template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
7826HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
7827 const uint64_t* HWY_RESTRICT unaligned) {
7828 return VFromD<D>{_mm256_maskz_expandloadu_epi64(mask.raw, unaligned)};
7829}
7830
7831#endif // HWY_TARGET <= HWY_AVX3
7832
7833} // namespace detail
7834
7835template <typename T, HWY_IF_T_SIZE(T, 1)>
7836HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
7837 const DFromV<decltype(v)> d;
7838#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
7839 const RebindToUnsigned<decltype(d)> du;
7840 const MFromD<decltype(du)> mu = RebindMask(du, mask);
7841 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
7842#else
7843 // LUTs are infeasible for so many mask combinations, so Combine two
7844 // half-vector Expand.
7845 const Half<decltype(d)> dh;
7846 const uint64_t mask_bits = detail::BitsFromMask(mask);
7847 constexpr size_t N = 32 / sizeof(T);
7848 const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
7849 const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
7850 const Vec128<T> expandL = Expand(LowerHalf(v), maskL);
7851 // We have to shift the input by a variable number of bytes, but there isn't
7852 // a table-driven option for that until VBMI, and CPUs with that likely also
7853 // have VBMI2 and thus native Expand.
7854 alignas(32) T lanes[N];
7855 Store(v, d, lanes);
7856 const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
7857 const Vec128<T> expandH = Expand(LoadU(dh, lanes + countL), maskH);
7858 return Combine(d, expandH, expandL);
7859#endif
7860}
7861
7862// If AVX3, this is already implemented by x86_512.
7863#if HWY_TARGET != HWY_AVX3
7864
7865template <typename T, HWY_IF_T_SIZE(T, 2)>
7866HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
7867 const Full256<T> d;
7868#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
7869 const RebindToUnsigned<decltype(d)> du;
7870 return BitCast(d, detail::NativeExpand(BitCast(du, v), RebindMask(du, mask)));
7871#else // AVX2
7872 // LUTs are infeasible for 2^16 possible masks, so splice together two
7873 // half-vector Expand.
7874 const Half<decltype(d)> dh;
7875 const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
7876 const Vec128<T> expandL = Expand(LowerHalf(v), maskL);
7877 // We have to shift the input by a variable number of u16. permutevar_epi16
7878 // requires AVX3 and if we had that, we'd use native u32 Expand. The only
7879 // alternative is re-loading, which incurs a store to load forwarding stall.
7880 alignas(32) T lanes[32 / sizeof(T)];
7881 Store(v, d, lanes);
7882 const Vec128<T> vH = LoadU(dh, lanes + CountTrue(dh, maskL));
7883 const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
7884 const Vec128<T> expandH = Expand(vH, maskH);
7885 return Combine(d, expandH, expandL);
7886#endif // AVX2
7887}
7888
7889#endif // HWY_TARGET != HWY_AVX3
7890
7891template <typename T, HWY_IF_T_SIZE(T, 4)>
7892HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
7893 const Full256<T> d;
7894#if HWY_TARGET <= HWY_AVX3
7895 const RebindToUnsigned<decltype(d)> du;
7896 const MFromD<decltype(du)> mu = RebindMask(du, mask);
7897 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
7898#else
7899 const RebindToUnsigned<decltype(d)> du;
7900 const uint64_t mask_bits = detail::BitsFromMask(mask);
7901
7902 alignas(16) constexpr uint32_t packed_array[256] = {
7903 // PrintExpand32x8Nibble.
7904 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
7905 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
7906 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
7907 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
7908 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
7909 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
7910 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
7911 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
7912 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
7913 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
7914 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
7915 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
7916 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
7917 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
7918 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
7919 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
7920 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
7921 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
7922 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
7923 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
7924 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
7925 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
7926 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
7927 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
7928 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
7929 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
7930 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
7931 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
7932 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
7933 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
7934 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
7935 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
7936 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
7937 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
7938 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
7939 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
7940 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
7941 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
7942 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
7943 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
7944 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
7945 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
7946 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210,
7947 };
7948
7949 // For lane i, shift the i-th 4-bit index down to bits [0, 3).
7950 const Vec256<uint32_t> packed = Set(du, packed_array[mask_bits]);
7951 alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
7952 // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
7953 const Indices256<uint32_t> indices{(packed >> Load(du, shifts)).raw};
7954 const Vec256<uint32_t> expand = TableLookupLanes(BitCast(du, v), indices);
7955 // TableLookupLanes cannot also zero masked-off lanes, so do that now.
7956 return IfThenElseZero(mask, BitCast(d, expand));
7957#endif
7958}
7959
7960template <typename T, HWY_IF_T_SIZE(T, 8)>
7961HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
7962 const Full256<T> d;
7963#if HWY_TARGET <= HWY_AVX3
7964 const RebindToUnsigned<decltype(d)> du;
7965 const MFromD<decltype(du)> mu = RebindMask(du, mask);
7966 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
7967#else
7968 const RebindToUnsigned<decltype(d)> du;
7969 const uint64_t mask_bits = detail::BitsFromMask(mask);
7970
7971 alignas(16) constexpr uint64_t packed_array[16] = {
7972 // PrintExpand64x4Nibble.
7973 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
7974 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
7975 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
7976
7977 // For lane i, shift the i-th 4-bit index down to bits [0, 2).
7978 const Vec256<uint64_t> packed = Set(du, packed_array[mask_bits]);
7979 alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
7980#if HWY_TARGET <= HWY_AVX3 // native 64-bit TableLookupLanes
7981 // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
7982 const Indices256<uint64_t> indices{(packed >> Load(du, shifts)).raw};
7983#else
7984 // 64-bit TableLookupLanes on AVX2 requires IndicesFromVec, which checks
7985 // bounds, so clear the upper bits.
7986 const Vec256<uint64_t> masked = And(packed >> Load(du, shifts), Set(du, 3));
7987 const Indices256<uint64_t> indices = IndicesFromVec(du, masked);
7988#endif
7989 const Vec256<uint64_t> expand = TableLookupLanes(BitCast(du, v), indices);
7990 // TableLookupLanes cannot also zero masked-off lanes, so do that now.
7991 return IfThenElseZero(mask, BitCast(d, expand));
7992#endif
7993}
7994
7995// ------------------------------ LoadExpand
7996
7997template <class D, HWY_IF_V_SIZE_D(D, 32),
7998 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
7999HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
8000 const TFromD<D>* HWY_RESTRICT unaligned) {
8001#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
8002 const RebindToUnsigned<decltype(d)> du;
8003 using TU = TFromD<decltype(du)>;
8004 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
8005 const MFromD<decltype(du)> mu = RebindMask(du, mask);
8006 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
8007#else
8008 return Expand(LoadU(d, unaligned), mask);
8009#endif
8010}
8011
8012template <class D, HWY_IF_V_SIZE_D(D, 32),
8013 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
8014HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
8015 const TFromD<D>* HWY_RESTRICT unaligned) {
8016#if HWY_TARGET <= HWY_AVX3
8017 const RebindToUnsigned<decltype(d)> du;
8018 using TU = TFromD<decltype(du)>;
8019 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
8020 const MFromD<decltype(du)> mu = RebindMask(du, mask);
8021 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
8022#else
8023 return Expand(LoadU(d, unaligned), mask);
8024#endif
8025}
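// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): LoadExpand reads one source
// element per true mask lane, in order, and places it into that lane; false
// lanes are zero. It is the counterpart of CompressStore for re-inflating
// packed data.
inline Vec256<float> DemoLoadExpand(const float* HWY_RESTRICT packed) {
  const Full256<float> df;
  const auto odd_lanes = Dup128MaskFromMaskBits(df, 0b1010u);
  // Lanes 1, 3, 5, 7 receive packed[0..3]; even lanes become zero.
  return LoadExpand(odd_lanes, df, packed);
}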
8026
8027// ------------------------------ LoadInterleaved3/4
8028
8029// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
8030
8031namespace detail {
8032// Input:
8033// 1 0 (<- first block of unaligned)
8034// 3 2
8035// 5 4
8036// Output:
8037// 3 0
8038// 4 1
8039// 5 2
8040template <class D, HWY_IF_V_SIZE_D(D, 32)>
8041HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
8042 VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
8043 constexpr size_t N = Lanes(d);
8044 const VFromD<D> v10 = LoadU(d, unaligned + 0 * N); // 1 0
8045 const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
8046 const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
8047
8048 A = ConcatUpperLower(d, v32, v10);
8049 B = ConcatLowerUpper(d, v54, v10);
8050 C = ConcatUpperLower(d, v54, v32);
8051}
8052
8053// Input (128-bit blocks):
8054// 1 0 (first block of unaligned)
8055// 3 2
8056// 5 4
8057// 7 6
8058// Output:
8059// 4 0 (LSB of vA)
8060// 5 1
8061// 6 2
8062// 7 3
8063template <class D, HWY_IF_V_SIZE_D(D, 32)>
8064HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
8065 VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
8066 VFromD<D>& vD) {
8067 constexpr size_t N = Lanes(d);
8068 const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);
8069 const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
8070 const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
8071 const VFromD<D> v76 = LoadU(d, unaligned + 3 * N);
8072
8073 vA = ConcatLowerLower(d, v54, v10);
8074 vB = ConcatUpperUpper(d, v54, v10);
8075 vC = ConcatLowerLower(d, v76, v32);
8076 vD = ConcatUpperUpper(d, v76, v32);
8077}
8078} // namespace detail
8079
8080// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
8081
8082// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
8083
8084namespace detail {
8085// Input (128-bit blocks):
8086// 2 0 (LSB of i)
8087// 3 1
8088// Output:
8089// 1 0
8090// 3 2
8091template <class D, HWY_IF_V_SIZE_D(D, 32)>
8092HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
8093 TFromD<D>* HWY_RESTRICT unaligned) {
8094 constexpr size_t N = Lanes(d);
8095 const auto out0 = ConcatLowerLower(d, j, i);
8096 const auto out1 = ConcatUpperUpper(d, j, i);
8097 StoreU(out0, d, unaligned + 0 * N);
8098 StoreU(out1, d, unaligned + 1 * N);
8099}
8100
8101// Input (128-bit blocks):
8102// 3 0 (LSB of i)
8103// 4 1
8104// 5 2
8105// Output:
8106// 1 0
8107// 3 2
8108// 5 4
8109template <class D, HWY_IF_V_SIZE_D(D, 32)>
8110HWY_API void StoreTransposedBlocks3(VFromD<D> i, VFromD<D> j, VFromD<D> k, D d,
8111 TFromD<D>* HWY_RESTRICT unaligned) {
8112 constexpr size_t N = Lanes(d);
8113 const auto out0 = ConcatLowerLower(d, j, i);
8114 const auto out1 = ConcatUpperLower(d, i, k);
8115 const auto out2 = ConcatUpperUpper(d, k, j);
8116 StoreU(out0, d, unaligned + 0 * N);
8117 StoreU(out1, d, unaligned + 1 * N);
8118 StoreU(out2, d, unaligned + 2 * N);
8119}
8120
8121// Input (128-bit blocks):
8122// 4 0 (LSB of i)
8123// 5 1
8124// 6 2
8125// 7 3
8126// Output:
8127// 1 0
8128// 3 2
8129// 5 4
8130// 7 6
8131template <class D, HWY_IF_V_SIZE_D(D, 32)>
8132HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
8133 VFromD<D> l, D d,
8134 TFromD<D>* HWY_RESTRICT unaligned) {
8135 constexpr size_t N = Lanes(d);
8136 // Write lower halves, then upper.
8137 const auto out0 = ConcatLowerLower(d, j, i);
8138 const auto out1 = ConcatLowerLower(d, l, k);
8139 StoreU(out0, d, unaligned + 0 * N);
8140 StoreU(out1, d, unaligned + 1 * N);
8141 const auto out2 = ConcatUpperUpper(d, j, i);
8142 const auto out3 = ConcatUpperUpper(d, l, k);
8143 StoreU(out2, d, unaligned + 2 * N);
8144 StoreU(out3, d, unaligned + 3 * N);
8145}
8146} // namespace detail
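// Usage sketch (illustrative only, not part of this header; assumes a demo
// file inside namespace hwy::HWY_NAMESPACE): the transposed-block helpers
// above back LoadInterleaved3/StoreInterleaved3, e.g. to de-interleave RGB
// pixels, adjust one channel and re-interleave.
inline void DemoScaleGreen(const uint8_t* HWY_RESTRICT rgb_in,
                           uint8_t* HWY_RESTRICT rgb_out) {
  const Full256<uint8_t> d8;
  Vec256<uint8_t> r, g, b;
  LoadInterleaved3(d8, rgb_in, r, g, b);  // reads 3 * 32 bytes
  g = ShiftRight<1>(g);                   // halve the green channel
  StoreInterleaved3(r, g, b, d8, rgb_out);
}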
8147
8148// ------------------------------ Additional mask logical operations
8149
8150#if HWY_TARGET <= HWY_AVX3
8151template <class T>
8152HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
8153 constexpr size_t N = Lanes(Full256<T>());
8154 constexpr uint32_t kActiveElemMask =
8155 static_cast<uint32_t>((uint64_t{1} << N) - 1);
8156 return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
8157 (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)};
8158}
8159template <class T>
8160HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
8161 constexpr size_t N = Lanes(Full256<T>());
8162 constexpr uint32_t kActiveElemMask =
8163 static_cast<uint32_t>((uint64_t{1} << N) - 1);
8164 return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
8165 (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)};
8166}
8167template <class T>
8168HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
8169 constexpr size_t N = Lanes(Full256<T>());
8170 constexpr uint32_t kActiveElemMask =
8171 static_cast<uint32_t>((uint64_t{1} << N) - 1);
8172 return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
8173 detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)};
8174}
8175template <class T>
8176HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
8177 return Mask256<T>{
8178 static_cast<typename Mask256<T>::Raw>(detail::AVX3Blsi(mask.raw))};
8179}
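// Worked example of the BLSI/BLSMSK identities above, on plain bits
// (illustrative only, not part of this header): blsi = x & -x isolates the
// lowest set bit; blsmsk = x ^ (x - 1) covers everything up to and including
// it.
inline void DemoMaskBitMath() {
  const uint32_t x = 0x68u;  // 0b0110'1000
  HWY_DASSERT((x & (0u - x)) == 0x08u);                   // SetOnlyFirst
  HWY_DASSERT(((x & (0u - x)) - 1u) == 0x07u);            // SetBeforeFirst
  HWY_DASSERT((x ^ (x - 1u)) == 0x0Fu);                   // SetAtOrBeforeFirst
  HWY_DASSERT(((0u - (x & (0u - x))) & 0xFFu) == 0xF8u);  // SetAtOrAfterFirst
  (void)x;
}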
8180#else // AVX2
8181template <class T>
8182HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
8183 const Full256<T> d;
8184 const Repartition<int64_t, decltype(d)> di64;
8185 const Repartition<float, decltype(d)> df32;
8186 const Repartition<int32_t, decltype(d)> di32;
8187 const Half<decltype(di64)> dh_i64;
8188 const Half<decltype(di32)> dh_i32;
8189 using VF32 = VFromD<decltype(df32)>;
8190
8191 auto vmask = BitCast(di64, VecFromMask(d, mask));
8192 vmask = Or(vmask, Neg(vmask));
8193
8194 // Copy the sign bit of the even int64_t lanes to the odd int64_t lanes
8195 const auto vmask2 = BitCast(
8196 di32, VF32{_mm256_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw,
8197 _MM_SHUFFLE(1, 1, 0, 0))});
8198 vmask = Or(vmask, BitCast(di64, BroadcastSignBit(vmask2)));
8199
8200 // Copy the sign bit of the lower 128-bit half to the upper 128-bit half
8201 const auto vmask3 =
8202 BroadcastSignBit(Broadcast<3>(BitCast(dh_i32, LowerHalf(dh_i64, vmask))));
8203 vmask = Or(vmask, BitCast(di64, Combine(di32, vmask3, Zero(dh_i32))));
8204 return MaskFromVec(BitCast(d, vmask));
8205}
8206
8207template <class T>
8208HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
8209 return Not(SetAtOrAfterFirst(mask));
8210}
8211
8212template <class T>
8213HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
8214 const Full256<T> d;
8215 const RebindToSigned<decltype(d)> di;
8216 const Repartition<int64_t, decltype(d)> di64;
8217 const Half<decltype(di64)> dh_i64;
8218
8219 const auto zero = Zero(di64);
8220 const auto vmask = BitCast(di64, VecFromMask(d, mask));
8221
8222 const auto vmask_eq_0 = VecFromMask(di64, vmask == zero);
8223 auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0);
8224 auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0);
8225
8226 vmask2_lo = And(vmask2_lo, InterleaveLower(vmask2_lo, vmask2_lo));
8227 vmask2_hi = And(ConcatLowerUpper(dh_i64, vmask2_hi, vmask2_lo),
8228 InterleaveUpper(dh_i64, vmask2_lo, vmask2_lo));
8229 vmask2_lo = InterleaveLower(Set(dh_i64, int64_t{-1}), vmask2_lo);
8230
8231 const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo);
8232 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
8233 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
8234}
8235
8236template <class T>
8237HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
8238 const Full256<T> d;
8239 constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;
8240
8241 const auto vmask = VecFromMask(d, mask);
8242 const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
8243 return SetBeforeFirst(
8244 MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
8245 d, vmask, vmask_lo)));
8246}
8247#endif // HWY_TARGET <= HWY_AVX3
8248
8249// ------------------------------ Reductions in generic_ops
8250
8251// ------------------------------ LeadingZeroCount
8252
8253#if HWY_TARGET <= HWY_AVX3
8254template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
8255HWY_API V LeadingZeroCount(V v) {
8256 return V{_mm256_lzcnt_epi32(v.raw)};
8257}
8258
8259template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
8260HWY_API V LeadingZeroCount(V v) {
8261 return V{_mm256_lzcnt_epi64(v.raw)};
8262}
8263#endif // HWY_TARGET <= HWY_AVX3
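
Under the AVX3 target, LeadingZeroCount maps directly onto the per-lane lzcnt instructions exposed as _mm256_lzcnt_epi32/_mm256_lzcnt_epi64; following lzcnt semantics, an all-zero lane reports the full lane width. A scalar sketch of the expected per-lane values, assuming a C++20 compiler for std::countl_zero (illustration only, not part of this header):

#include <bit>      // std::countl_zero (C++20)
#include <cassert>
#include <cstdint>

int main() {
  assert(std::countl_zero(uint32_t{1}) == 31);
  assert(std::countl_zero(uint32_t{0x80000000u}) == 0);
  assert(std::countl_zero(uint32_t{0}) == 32);  // zero lane -> lane width
  assert(std::countl_zero(uint64_t{1}) == 63);
  return 0;
}
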
8264
8265// NOLINTNEXTLINE(google-readability-namespace-comments)
8266} // namespace HWY_NAMESPACE
8267} // namespace hwy
8268HWY_AFTER_NAMESPACE();
8269
8270// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
8271// the warning seems to be issued at the call site of intrinsics, i.e. our code.
8272HWY_DIAGNOSTICS(pop)