x86_128-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
17// operations when compiling for those targets.
18// External include guard in highway.h - see comment there.
19
20// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
21#include "hwy/base.h"
22
23// Avoid uninitialized warnings in GCC's emmintrin.h - see
24// https://github.com/google/highway/issues/710 and pull/902
25HWY_DIAGNOSTICS(push)
26#if HWY_COMPILER_GCC_ACTUAL
27HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
28HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
29 ignored "-Wmaybe-uninitialized")
30#endif
31
32#include <emmintrin.h>
33#include <stdio.h>
34#if HWY_TARGET == HWY_SSSE3
35#include <tmmintrin.h>  // SSSE3
36#elif HWY_TARGET <= HWY_SSE4
37#include <smmintrin.h> // SSE4
38#ifndef HWY_DISABLE_PCLMUL_AES
39#include <wmmintrin.h> // CLMUL
40#endif
41#endif
42
43#include "hwy/ops/shared-inl.h"
44
45HWY_BEFORE_NAMESPACE();
46namespace hwy {
47namespace HWY_NAMESPACE {
48namespace detail {
49
50// Enable generic functions for whichever of (f16, bf16) are not supported.
51#if !HWY_HAVE_FLOAT16
52#define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
53#else
54#define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
55#endif
56
57#undef HWY_AVX3_HAVE_F32_TO_BF16C
58#if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \
59 (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \
60 !defined(HWY_AVX3_DISABLE_AVX512BF16)
61#define HWY_AVX3_HAVE_F32_TO_BF16C 1
62#else
63#define HWY_AVX3_HAVE_F32_TO_BF16C 0
64#endif
65
66template <typename T>
67struct Raw128 {
68 using type = __m128i;
69};
70#if HWY_HAVE_FLOAT16
71template <>
72struct Raw128<float16_t> {
73 using type = __m128h;
74};
75#endif // HWY_HAVE_FLOAT16
76template <>
77struct Raw128<float> {
78 using type = __m128;
79};
80template <>
81struct Raw128<double> {
82 using type = __m128d;
83};
84
85} // namespace detail
86
87template <typename T, size_t N = 16 / sizeof(T)>
88class Vec128 {
89 using Raw = typename detail::Raw128<T>::type;
90
91 public:
92 using PrivateT = T; // only for DFromV
93 static constexpr size_t kPrivateN = N; // only for DFromV
94
95 // Compound assignment. Only usable if there is a corresponding non-member
96 // binary operator overload. For example, only f32 and f64 support division.
97 HWY_INLINE Vec128& operator*=(const Vec128 other) {
98 return *this = (*this * other);
99 }
100 HWY_INLINE Vec128& operator/=(const Vec128 other) {
101 return *this = (*this / other);
102 }
103 HWY_INLINE Vec128& operator+=(const Vec128 other) {
104 return *this = (*this + other);
105 }
106 HWY_INLINE Vec128& operator-=(const Vec128 other) {
107 return *this = (*this - other);
108 }
109 HWY_INLINE Vec128& operator%=(const Vec128 other) {
110 return *this = (*this % other);
111 }
112 HWY_INLINE Vec128& operator&=(const Vec128 other) {
113 return *this = (*this & other);
114 }
115 HWY_INLINE Vec128& operator|=(const Vec128 other) {
116 return *this = (*this | other);
117 }
118 HWY_INLINE Vec128& operator^=(const Vec128 other) {
119 return *this = (*this ^ other);
120 }
121
122 Raw raw;
123};
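// Illustrative usage (not part of the original Highway source; names d, x, y
// are hypothetical): the compound assignments above forward to the non-member
// operators defined later in this header, so both forms are interchangeable.
//   const Full128<float> d;
//   auto x = Set(d, 2.0f);
//   const auto y = Set(d, 3.0f);
//   x += y;  // same as x = x + y
//   x *= x;  // same as x = x * x; f32/f64 additionally support /=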
124
125template <typename T>
126using Vec64 = Vec128<T, 8 / sizeof(T)>;
127
128template <typename T>
129using Vec32 = Vec128<T, 4 / sizeof(T)>;
130
131template <typename T>
132using Vec16 = Vec128<T, 2 / sizeof(T)>;
133
134#if HWY_TARGET <= HWY_AVX3
135
136namespace detail {
137
138// Template arg: sizeof(lane type)
139template <size_t size>
140struct RawMask128 {};
141template <>
142struct RawMask128<1> {
143 using type = __mmask16;
144};
145template <>
146struct RawMask128<2> {
147 using type = __mmask8;
148};
149template <>
150struct RawMask128<4> {
151 using type = __mmask8;
152};
153template <>
154struct RawMask128<8> {
155 using type = __mmask8;
156};
157
158} // namespace detail
159
160template <typename T, size_t N = 16 / sizeof(T)>
161struct Mask128 {
162 using Raw = typename detail::RawMask128<sizeof(T)>::type;
163
164 static Mask128<T, N> FromBits(uint64_t mask_bits) {
165 return Mask128<T, N>{static_cast<Raw>(mask_bits)};
166 }
167
168 Raw raw;
169};
170
171#else // AVX2 or below
172
173// FF..FF or 0.
174template <typename T, size_t N = 16 / sizeof(T)>
175struct Mask128 {
176 typename detail::Raw128<T>::type raw;
177};
178
179#endif // AVX2 or below
180
181namespace detail {
182
183// Returns the lowest N of the _mm_movemask* bits.
184template <typename T, size_t N>
185constexpr uint64_t OnlyActive(uint64_t mask_bits) {
186 return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
187}
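// Illustrative example (not from the original comments): for Vec32<uint8_t>
// (N = 4), _mm_movemask_epi8 still produces 16 bits, so OnlyActive<uint8_t, 4>
// keeps only mask_bits & 0b1111; a full 16-byte vector is returned unchanged.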
188
189} // namespace detail
190
191#if HWY_TARGET <= HWY_AVX3
192namespace detail {
193
194// Used by Expand() emulation, which is required for both AVX3 and AVX2.
195template <typename T, size_t N>
196HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
197 return OnlyActive<T, N>(mask.raw);
198}
199
200} // namespace detail
201#endif // HWY_TARGET <= HWY_AVX3
202
203template <class V>
204using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
205
206template <class V>
207using TFromV = typename V::PrivateT;
208
209// ------------------------------ Zero
210
211// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
212template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
213HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
214 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
215}
216#if HWY_HAVE_FLOAT16
217template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
218HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
219 return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()};
220}
221#endif // HWY_HAVE_FLOAT16
222template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
223HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
224 return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
225}
226template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
227HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
228 return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
229}
230template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
231HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
232 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
233}
234
235// Using the existing Zero function instead of a dedicated function for
236// deduction avoids having to forward-declare Vec256 here.
237template <class D>
238using VFromD = decltype(Zero(D()));
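// Illustrative example (not from the original source; d64 is a hypothetical
// tag name): VFromD maps a tag to the vector type returned by Zero, e.g.
//   const Full64<int32_t> d64;            // 8 bytes = 2 lanes
//   VFromD<decltype(d64)> v = Zero(d64);  // Vec64<int32_t>, all lanes zero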
239
240// ------------------------------ Tuple (VFromD)
241#include "hwy/ops/tuple-inl.h"
242
243// ------------------------------ BitCast
244
245namespace detail {
246
247HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
248#if HWY_HAVE_FLOAT16
249HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); }
250#endif // HWY_HAVE_FLOAT16
251HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
252HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
253
254#if HWY_AVX3_HAVE_F32_TO_BF16C
255HWY_INLINE __m128i BitCastToInteger(__m128bh v) {
256 // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
257 // bit cast a __m128bh to a __m128i as there is currently no intrinsic
258 // available (as of GCC 13 and Clang 17) that can bit cast a __m128bh vector
259 // to a __m128i vector
260
261#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
262 // On GCC or Clang, use reinterpret_cast to bit cast a __m128bh to a __m128i
263 return reinterpret_cast<__m128i>(v);
264#else
265 // On MSVC, use BitCastScalar to bit cast a __m128bh to a __m128i as MSVC does
266 // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
267 // bit cast from one SSE/AVX vector type to a different SSE/AVX vector type
268 return BitCastScalar<__m128i>(v);
269#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
270}
271#endif // HWY_AVX3_HAVE_F32_TO_BF16C
272
273template <typename T, size_t N>
274HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
275 return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
276}
277
278// Cannot rely on function overloading because return types differ.
279template <typename T>
280struct BitCastFromInteger128 {
281 HWY_INLINE __m128i operator()(__m128i v) { return v; }
282};
283#if HWY_HAVE_FLOAT16
284template <>
285struct BitCastFromInteger128<float16_t> {
286 HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); }
287};
288#endif // HWY_HAVE_FLOAT16
289template <>
290struct BitCastFromInteger128<float> {
291 HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
292};
293template <>
294struct BitCastFromInteger128<double> {
295 HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
296};
297
298template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
299HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
300 Vec128<uint8_t, D().MaxBytes()> v) {
301 return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
302}
303
304} // namespace detail
305
306template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
307HWY_API VFromD<D> BitCast(D d,
308 Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
309 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
310}
311
312// ------------------------------ Set
313
314template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
315HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
316 return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
317}
318template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
319HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
320 return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
321}
322template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
323HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
324 return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))};
325}
326template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
327HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
328 return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
329}
330#if HWY_HAVE_FLOAT16
331template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
332HWY_API VFromD<D> Set(D /* tag */, float16_t t) {
333 return VFromD<D>{_mm_set1_ph(t)};
334}
335#endif // HWY_HAVE_FLOAT16
336template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
337HWY_API VFromD<D> Set(D /* tag */, float t) {
338 return VFromD<D>{_mm_set1_ps(t)};
339}
340template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
341HWY_API VFromD<D> Set(D /* tag */, double t) {
342 return VFromD<D>{_mm_set1_pd(t)};
343}
344
345// Generic for all vector lengths.
346template <class D, HWY_X86_IF_EMULATED_D(D)>
347HWY_API VFromD<D> Set(D df, TFromD<D> t) {
348 const RebindToUnsigned<decltype(df)> du;
349 static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
350 uint16_t bits;
351 CopyBytes<2>(&t, &bits);
352 return BitCast(df, Set(du, bits));
353}
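// Illustrative example (not from the original source; dbf is a hypothetical
// tag name and BF16FromF32 is assumed to be available from hwy/base.h): when
// bf16 is emulated, Set broadcasts the 16-bit pattern via the unsigned type.
//   const Full128<bfloat16_t> dbf;
//   const auto v = Set(dbf, BF16FromF32(1.0f));  // CopyBytes + u16 Set + BitCast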
354
355// ------------------------------ Undefined
356
357HWY_DIAGNOSTICS(push)
358HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
359
360// Returns a vector with uninitialized elements.
361template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
362HWY_API VFromD<D> Undefined(D /* tag */) {
363 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
364 // generate an XOR instruction.
365 return VFromD<D>{_mm_undefined_si128()};
366}
367#if HWY_HAVE_FLOAT16
368template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
369HWY_API VFromD<D> Undefined(D /* tag */) {
370 return VFromD<D>{_mm_undefined_ph()};
371}
372#endif // HWY_HAVE_FLOAT16
373template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
374HWY_API VFromD<D> Undefined(D /* tag */) {
375 return VFromD<D>{_mm_undefined_ps()};
376}
377template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
378HWY_API VFromD<D> Undefined(D /* tag */) {
379 return VFromD<D>{_mm_undefined_pd()};
380}
381template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
382HWY_API VFromD<D> Undefined(D /* tag */) {
383 return VFromD<D>{_mm_undefined_si128()};
384}
385
386HWY_DIAGNOSTICS(pop)
387
388// ------------------------------ GetLane
389
390template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
391HWY_API T GetLane(const Vec128<T, N> v) {
392 return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
393}
394template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
395HWY_API T GetLane(const Vec128<T, N> v) {
396 const DFromV<decltype(v)> d;
397 const RebindToUnsigned<decltype(d)> du;
398 const uint16_t bits =
399 static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF);
400 return BitCastScalar<T>(bits);
401}
402template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
403HWY_API T GetLane(const Vec128<T, N> v) {
404 return static_cast<T>(_mm_cvtsi128_si32(v.raw));
405}
406template <size_t N>
407HWY_API float GetLane(const Vec128<float, N> v) {
408 return _mm_cvtss_f32(v.raw);
409}
410template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
411HWY_API T GetLane(const Vec128<T, N> v) {
412#if HWY_ARCH_X86_32
413 const DFromV<decltype(v)> d;
414 alignas(16) T lanes[2];
415 Store(v, d, lanes);
416 return lanes[0];
417#else
418 return static_cast<T>(_mm_cvtsi128_si64(v.raw));
419#endif
420}
421template <size_t N>
422HWY_API double GetLane(const Vec128<double, N> v) {
423 return _mm_cvtsd_f64(v.raw);
424}
425
426// ------------------------------ ResizeBitCast
427
428template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
429 HWY_IF_V_SIZE_LE_D(D, 16)>
430HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
431 const Repartition<uint8_t, decltype(d)> du8;
432 return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
433}
434
435// ------------------------------ Dup128VecFromValues
436
437template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
438HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
439 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
440 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
441 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
442 TFromD<D> t11, TFromD<D> t12,
443 TFromD<D> t13, TFromD<D> t14,
444 TFromD<D> t15) {
445 return VFromD<D>{_mm_setr_epi8(
446 static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
447 static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
448 static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
449 static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
450 static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
451 static_cast<char>(t15))};
452}
453
454template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
455HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
456 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
457 TFromD<D> t5, TFromD<D> t6,
458 TFromD<D> t7) {
459 return VFromD<D>{
460 _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
461 static_cast<int16_t>(t2), static_cast<int16_t>(t3),
462 static_cast<int16_t>(t4), static_cast<int16_t>(t5),
463 static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
464}
465
466// Generic for all vector lengths
467template <class D, HWY_IF_BF16_D(D)>
468HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
469 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
470 TFromD<D> t5, TFromD<D> t6,
471 TFromD<D> t7) {
472 const RebindToSigned<decltype(d)> di;
473 return BitCast(d,
474 Dup128VecFromValues(
475 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
476 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
477 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
478 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
479}
480
481#if HWY_HAVE_FLOAT16
482template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
483HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
484 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
485 TFromD<D> t5, TFromD<D> t6,
486 TFromD<D> t7) {
487 return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
488}
489#else
490// Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true
491template <class D, HWY_IF_F16_D(D)>
492HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
493 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
494 TFromD<D> t5, TFromD<D> t6,
495 TFromD<D> t7) {
496 const RebindToSigned<decltype(d)> di;
497 return BitCast(d,
498 Dup128VecFromValues(
499 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
500 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
501 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
502 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
503}
504#endif // HWY_HAVE_FLOAT16
505
506template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
507HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
508 TFromD<D> t2, TFromD<D> t3) {
509 return VFromD<D>{
510 _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
511 static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
512}
513
514template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
515HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
516 TFromD<D> t2, TFromD<D> t3) {
517 return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
518}
519
520template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
521HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
522 // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic
523 // available
524 return VFromD<D>{
525 _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))};
526}
527
528template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
529HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
530 return VFromD<D>{_mm_setr_pd(t0, t1)};
531}
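// Illustrative example (not from the original source; du32 is a hypothetical
// tag name): Dup128VecFromValues fills each 128-bit block with the listed
// lanes; for these <=128-bit vectors there is exactly one block.
//   const Full128<uint32_t> du32;
//   const auto v = Dup128VecFromValues(du32, 0u, 1u, 2u, 3u);  // lanes 0..3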
532
533// ================================================== LOGICAL
534
535// ------------------------------ And
536
537template <typename T, size_t N>
538HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
539 const DFromV<decltype(a)> d; // for float16_t
540 const RebindToUnsigned<decltype(d)> du;
541 return BitCast(d, VFromD<decltype(du)>{
542 _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
543}
544template <size_t N>
545HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) {
546 return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
547}
548template <size_t N>
549HWY_API Vec128<double, N> And(Vec128<double, N> a, Vec128<double, N> b) {
550 return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
551}
552
553// ------------------------------ AndNot
554
555// Returns ~not_mask & mask.
556template <typename T, size_t N>
557HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
558 const DFromV<decltype(mask)> d; // for float16_t
559 const RebindToUnsigned<decltype(d)> du;
560 return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128(
561 BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
562}
563template <size_t N>
564HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask,
565 Vec128<float, N> mask) {
566 return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
567}
568template <size_t N>
569HWY_API Vec128<double, N> AndNot(Vec128<double, N> not_mask,
570 Vec128<double, N> mask) {
571 return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
572}
573
574// ------------------------------ Or
575
576template <typename T, size_t N>
577HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
578 const DFromV<decltype(a)> d; // for float16_t
579 const RebindToUnsigned<decltype(d)> du;
580 return BitCast(d, VFromD<decltype(du)>{
581 _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
582}
583
584template <size_t N>
585HWY_API Vec128<float, N> Or(Vec128<float, N> a, Vec128<float, N> b) {
586 return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
587}
588template <size_t N>
589HWY_API Vec128<double, N> Or(Vec128<double, N> a, Vec128<double, N> b) {
590 return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
591}
592
593// ------------------------------ Xor
594
595template <typename T, size_t N>
596HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
597 const DFromV<decltype(a)> d; // for float16_t
598 const RebindToUnsigned<decltype(d)> du;
599 return BitCast(d, VFromD<decltype(du)>{
600 _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
601}
602
603template <size_t N>
604HWY_API Vec128<float, N> Xor(Vec128<float, N> a, Vec128<float, N> b) {
605 return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
606}
607template <size_t N>
608HWY_API Vec128<double, N> Xor(Vec128<double, N> a, Vec128<double, N> b) {
609 return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
610}
611
612// ------------------------------ Not
613template <typename T, size_t N>
614HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
615 const DFromV<decltype(v)> d;
616 const RebindToUnsigned<decltype(d)> du;
617 using VU = VFromD<decltype(du)>;
618#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
619 const __m128i vu = BitCast(du, v).raw;
620 return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
621#else
622 return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
623#endif
624}
625
626// ------------------------------ Xor3
627template <typename T, size_t N>
628HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
629#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
630 const DFromV<decltype(x1)> d;
631 const RebindToUnsigned<decltype(d)> du;
632 using VU = VFromD<decltype(du)>;
633 const __m128i ret = _mm_ternarylogic_epi64(
634 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
635 return BitCast(d, VU{ret});
636#else
637 return Xor(x1, Xor(x2, x3));
638#endif
639}
640
641// ------------------------------ Or3
642template <typename T, size_t N>
643HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
644#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
645 const DFromV<decltype(o1)> d;
646 const RebindToUnsigned<decltype(d)> du;
647 using VU = VFromD<decltype(du)>;
648 const __m128i ret = _mm_ternarylogic_epi64(
649 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
650 return BitCast(d, VU{ret});
651#else
652 return Or(o1, Or(o2, o3));
653#endif
654}
655
656// ------------------------------ OrAnd
657template <typename T, size_t N>
658HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
659#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
660 const DFromV<decltype(o)> d;
661 const RebindToUnsigned<decltype(d)> du;
662 using VU = VFromD<decltype(du)>;
663 const __m128i ret = _mm_ternarylogic_epi64(
664 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
665 return BitCast(d, VU{ret});
666#else
667 return Or(o, And(a1, a2));
668#endif
669}
670
671// ------------------------------ IfVecThenElse
672template <typename T, size_t N>
673HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
674 Vec128<T, N> no) {
675#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
676 const DFromV<decltype(no)> d;
677 const RebindToUnsigned<decltype(d)> du;
678 using VU = VFromD<decltype(du)>;
679 return BitCast(
680 d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
681 BitCast(du, no).raw, 0xCA)});
682#else
683 return IfThenElse(MaskFromVec(mask), yes, no);
684#endif
685}
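// Note on the _mm_ternarylogic_epi* immediates used above (summary, not from
// the original comments): bit i of the immediate is the output for the input
// combination (a, b, c) = ((i >> 2) & 1, (i >> 1) & 1, i & 1). Hence:
//   0x55 = ~c (Not; all three operands are the same vector)
//   0x96 = a ^ b ^ c (Xor3)
//   0xFE = a | b | c (Or3)
//   0xF8 = a | (b & c) (OrAnd)
//   0xCA = (a & b) | (~a & c), i.e. bitwise a ? b : c (IfVecThenElse)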
686
687// ------------------------------ BitwiseIfThenElse
688#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
689
690#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
691#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
692#else
693#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
694#endif
695
696template <class V>
697HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
698 return IfVecThenElse(mask, yes, no);
699}
700
701#endif
702
703// ------------------------------ Operator overloads (internal-only if float)
704
705template <typename T, size_t N>
706HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
707 return And(a, b);
708}
709
710template <typename T, size_t N>
711HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
712 return Or(a, b);
713}
714
715template <typename T, size_t N>
716HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
717 return Xor(a, b);
718}
719
720// ------------------------------ PopulationCount
721
722// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
723#if HWY_TARGET <= HWY_AVX3_DL
724
725#ifdef HWY_NATIVE_POPCNT
726#undef HWY_NATIVE_POPCNT
727#else
728#define HWY_NATIVE_POPCNT
729#endif
730
731namespace detail {
732
733template <typename T, size_t N>
734HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
735 Vec128<T, N> v) {
736 return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
737}
738template <typename T, size_t N>
739HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
740 Vec128<T, N> v) {
741 return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
742}
743template <typename T, size_t N>
744HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
745 Vec128<T, N> v) {
746 return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
747}
748template <typename T, size_t N>
749HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
750 Vec128<T, N> v) {
751 return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
752}
753
754} // namespace detail
755
756template <typename T, size_t N>
757HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
758 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
759}
760
761#endif // HWY_TARGET <= HWY_AVX3_DL
762
763// ================================================== SIGN
764
765// ------------------------------ Neg
766
767// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
768namespace detail {
769
770template <typename T, size_t N>
771HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
772 return Xor(v, SignBit(DFromV<decltype(v)>()));
773}
774
775template <typename T, size_t N>
776HWY_INLINE Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, const Vec128<T, N> v) {
777 return Xor(v, SignBit(DFromV<decltype(v)>()));
778}
779
780template <typename T, size_t N>
781HWY_INLINE Vec128<T, N> Neg(hwy::SignedTag /*tag*/, const Vec128<T, N> v) {
782 return Zero(DFromV<decltype(v)>()) - v;
783}
784
785} // namespace detail
786
787template <typename T, size_t N>
788HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
789 return detail::Neg(hwy::TypeTag<T>(), v);
790}
791
792// ------------------------------ Floating-point Abs
793// Generic for all vector lengths
794template <class V, HWY_IF_FLOAT(TFromV<V>)>
795HWY_API V Abs(V v) {
796 const DFromV<decltype(v)> d;
797 const RebindToSigned<decltype(d)> di;
798 using TI = TFromD<decltype(di)>;
799 return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
800}
801
802// ------------------------------ CopySign
803// Generic for all vector lengths.
804template <class V>
805HWY_API V CopySign(const V magn, const V sign) {
806 static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
807
808 const DFromV<decltype(magn)> d;
809 const auto msb = SignBit(d);
810
811 // Truth table for msb, magn, sign | bitwise msb ? sign : mag
812 // 0 0 0 | 0
813 // 0 0 1 | 0
814 // 0 1 0 | 1
815 // 0 1 1 | 1
816 // 1 0 0 | 0
817 // 1 0 1 | 1
818 // 1 1 0 | 0
819 // 1 1 1 | 1
820 return BitwiseIfThenElse(msb, sign, magn);
821}
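// Illustrative example (not from the original comments): for f32 lanes,
// CopySign(Set(d, -1.5f), Set(d, +0.0f)) keeps the exponent/mantissa of -1.5f
// and takes only the sign bit (MSB) from +0.0f, yielding +1.5f in every lane.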
822
823// ------------------------------ CopySignToAbs
824// Generic for all vector lengths.
825template <class V>
826HWY_API V CopySignToAbs(const V abs, const V sign) {
827 const DFromV<decltype(abs)> d;
828 return OrAnd(abs, SignBit(d), sign);
829}
830
831// ================================================== MASK
832
833#if HWY_TARGET <= HWY_AVX3
834// ------------------------------ MaskFromVec
835
836namespace detail {
837
838template <typename T, size_t N>
839HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
840 const Vec128<T, N> v) {
841 return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
842}
843template <typename T, size_t N>
844HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
845 const Vec128<T, N> v) {
846 return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
847}
848template <typename T, size_t N>
849HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
850 const Vec128<T, N> v) {
851 return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
852}
853template <typename T, size_t N>
854HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
855 const Vec128<T, N> v) {
856 return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
857}
858
859} // namespace detail
860
861template <typename T, size_t N>
862HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
863 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
864}
865// There do not seem to be native floating-point versions of these instructions.
866#if HWY_HAVE_FLOAT16
867template <size_t N>
868HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) {
869 const RebindToSigned<DFromV<decltype(v)>> di;
870 return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw};
871}
872#endif
873template <size_t N>
874HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
875 const RebindToSigned<DFromV<decltype(v)>> di;
876 return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
877}
878template <size_t N>
879HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
880 const RebindToSigned<DFromV<decltype(v)>> di;
881 return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
882}
883
884template <class D>
885using MFromD = decltype(MaskFromVec(VFromD<D>()));
886
887// ------------------------------ MaskFalse (MFromD)
888
889#ifdef HWY_NATIVE_MASK_FALSE
890#undef HWY_NATIVE_MASK_FALSE
891#else
892#define HWY_NATIVE_MASK_FALSE
893#endif
894
895// Generic for all vector lengths
896template <class D>
897HWY_API MFromD<D> MaskFalse(D /*d*/) {
898 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
899}
900
901// ------------------------------ IsNegative (MFromD)
902#ifdef HWY_NATIVE_IS_NEGATIVE
903#undef HWY_NATIVE_IS_NEGATIVE
904#else
905#define HWY_NATIVE_IS_NEGATIVE
906#endif
907
908// Generic for all vector lengths
909template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
910HWY_API MFromD<DFromV<V>> IsNegative(V v) {
911 return MaskFromVec(v);
912}
913
914// ------------------------------ PromoteMaskTo (MFromD)
915
916#ifdef HWY_NATIVE_PROMOTE_MASK_TO
917#undef HWY_NATIVE_PROMOTE_MASK_TO
918#else
919#define HWY_NATIVE_PROMOTE_MASK_TO
920#endif
921
922// AVX3 PromoteMaskTo is generic for all vector lengths
923template <class DTo, class DFrom,
924 HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
925 class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
926 hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
927HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
928 MFromD<DFrom> m) {
929 return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
930}
931
932// ------------------------------ DemoteMaskTo (MFromD)
933
934#ifdef HWY_NATIVE_DEMOTE_MASK_TO
935#undef HWY_NATIVE_DEMOTE_MASK_TO
936#else
937#define HWY_NATIVE_DEMOTE_MASK_TO
938#endif
939
940// AVX3 DemoteMaskTo is generic for all vector lengths
941template <class DTo, class DFrom,
942 HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
943 class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
944 hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
945HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
946 MFromD<DFrom> m) {
947 return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
948}
949
950// ------------------------------ CombineMasks (MFromD)
951
952#ifdef HWY_NATIVE_COMBINE_MASKS
953#undef HWY_NATIVE_COMBINE_MASKS
954#else
955#define HWY_NATIVE_COMBINE_MASKS
956#endif
957
958template <class D, HWY_IF_LANES_D(D, 2)>
959HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
960 MFromD<Half<D>> lo) {
961#if HWY_COMPILER_HAS_MASK_INTRINSICS
962 const __mmask8 combined_mask = _kor_mask8(
963 _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1),
964 _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1)));
965#else
966 const auto combined_mask =
967 (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
968#endif
969
970 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
971}
972
973template <class D, HWY_IF_LANES_D(D, 4)>
974HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
975 MFromD<Half<D>> lo) {
976#if HWY_COMPILER_HAS_MASK_INTRINSICS
977 const __mmask8 combined_mask = _kor_mask8(
978 _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2),
979 _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3)));
980#else
981 const auto combined_mask =
982 (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
983#endif
984
985 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
986}
987
988template <class D, HWY_IF_LANES_D(D, 8)>
989HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
990 MFromD<Half<D>> lo) {
991#if HWY_COMPILER_HAS_MASK_INTRINSICS
992 const __mmask8 combined_mask = _kor_mask8(
993 _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4),
994 _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15)));
995#else
996 const auto combined_mask =
997 (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
998#endif
999
1000 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
1001}
1002
1003template <class D, HWY_IF_LANES_D(D, 16)>
1004HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
1005 MFromD<Half<D>> lo) {
1006#if HWY_COMPILER_HAS_MASK_INTRINSICS
1007 const __mmask16 combined_mask = _mm512_kunpackb(
1008 static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw));
1009#else
1010 const auto combined_mask =
1011 ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
1012#endif
1013
1014 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
1015}
1016
1017// ------------------------------ LowerHalfOfMask (MFromD)
1018
1019#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
1020#undef HWY_NATIVE_LOWER_HALF_OF_MASK
1021#else
1022#define HWY_NATIVE_LOWER_HALF_OF_MASK
1023#endif
1024
1025// Generic for all vector lengths
1026template <class D>
1027HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
1028 using RawM = decltype(MFromD<D>().raw);
1029 constexpr size_t kN = MaxLanes(d);
1030 constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8;
1031
1032 MFromD<D> result_mask{static_cast<RawM>(m.raw)};
1033
1034 if (kN < kNumOfBitsInRawMask) {
1035 result_mask =
1036 And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)});
1037 }
1038
1039 return result_mask;
1040}
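// Illustrative example (not from the original comments): with d = Full64<int32_t>
// (kN = 2) and an incoming 4-lane mask whose raw bits are 0b1011,
// LowerHalfOfMask returns raw bits 0b11, i.e. only the lower two lanes remain.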
1041
1042// ------------------------------ UpperHalfOfMask (MFromD)
1043
1044#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
1045#undef HWY_NATIVE_UPPER_HALF_OF_MASK
1046#else
1047#define HWY_NATIVE_UPPER_HALF_OF_MASK
1048#endif
1049
1050template <class D, HWY_IF_LANES_D(D, 1)>
1051HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
1052#if HWY_COMPILER_HAS_MASK_INTRINSICS
1053 const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1);
1054#else
1055 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1;
1056#endif
1057
1058 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
1059}
1060
1061template <class D, HWY_IF_LANES_D(D, 2)>
1062HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
1063#if HWY_COMPILER_HAS_MASK_INTRINSICS
1064 const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2);
1065#else
1066 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2;
1067#endif
1068
1069 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
1070}
1071
1072template <class D, HWY_IF_LANES_D(D, 4)>
1073HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
1074#if HWY_COMPILER_HAS_MASK_INTRINSICS
1075 const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4);
1076#else
1077 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4;
1078#endif
1079
1080 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
1081}
1082
1083template <class D, HWY_IF_LANES_D(D, 8)>
1084HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
1085#if HWY_COMPILER_HAS_MASK_INTRINSICS
1086 const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8);
1087#else
1088 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8;
1089#endif
1090
1091 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
1092}
1093
1094// ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks)
1095
1096#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
1097#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
1098#else
1099#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
1100#endif
1101
1102// Generic for all vector lengths
1103template <class DTo, class DFrom,
1104 HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
1105 class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
1106 hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
1107HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
1108 MFromD<DFrom> a, MFromD<DFrom> b) {
1109 using MH = MFromD<Half<DTo>>;
1110 using RawMH = decltype(MH().raw);
1111
1112 return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)},
1113 MH{static_cast<RawMH>(a.raw)});
1114}
1115
1116// ------------------------------ Slide mask up/down
1117#ifdef HWY_NATIVE_SLIDE_MASK
1118#undef HWY_NATIVE_SLIDE_MASK
1119#else
1120#define HWY_NATIVE_SLIDE_MASK
1121#endif
1122
1123template <class D, HWY_IF_LANES_LE_D(D, 8)>
1124HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) {
1125 using RawM = decltype(MFromD<D>().raw);
1126 constexpr size_t kN = MaxLanes(d);
1127 constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
1128
1129#if HWY_COMPILER_HAS_MASK_INTRINSICS
1130 MFromD<D> result_mask{
1131 static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))};
1132
1133 if (kN < 8) {
1134 result_mask =
1135 And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
1136 }
1137#else
1138 MFromD<D> result_mask{
1139 static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)};
1140#endif
1141
1142 return result_mask;
1143}
1144
1145template <class D, HWY_IF_LANES_D(D, 16)>
1146HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
1147 using RawM = decltype(MFromD<D>().raw);
1148#if HWY_COMPILER_HAS_MASK_INTRINSICS
1149 return MFromD<D>{
1150 static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))};
1151#else
1152 return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)};
1153#endif
1154}
1155
1156template <class D, HWY_IF_LANES_LE_D(D, 8)>
1157HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) {
1158 using RawM = decltype(MFromD<D>().raw);
1159 constexpr size_t kN = MaxLanes(d);
1160 constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
1161
1162#if HWY_COMPILER_HAS_MASK_INTRINSICS
1163 if (kN < 8) {
1164 m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
1165 }
1166
1167 return MFromD<D>{
1168 static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))};
1169#else
1170 return MFromD<D>{
1171 static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)};
1172#endif
1173}
1174
1175template <class D, HWY_IF_LANES_D(D, 16)>
1176HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
1177 using RawM = decltype(MFromD<D>().raw);
1178#if HWY_COMPILER_HAS_MASK_INTRINSICS
1179 return MFromD<D>{
1180 static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))};
1181#else
1182 return MFromD<D>{
1183 static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)};
1184#endif
1185}
1186
1187// Generic for all vector lengths
1188template <class D>
1189HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) {
1190 using RawM = decltype(MFromD<D>().raw);
1191 constexpr size_t kN = MaxLanes(d);
1192 constexpr uint64_t kValidLanesMask =
1193 static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
1194
1195 return MFromD<D>{static_cast<RawM>(
1196 (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)};
1197}
1198
1199// Generic for all vector lengths
1200template <class D>
1201HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) {
1202 using RawM = decltype(MFromD<D>().raw);
1203 constexpr size_t kN = MaxLanes(d);
1204 constexpr uint64_t kValidLanesMask =
1205 static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
1206
1207 return MFromD<D>{static_cast<RawM>(
1208 (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))};
1209}
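// Illustrative example (not from the original comments): for an 8-lane mask
// with raw bits 0b01101001, SlideMaskUpLanes(d, m, 2) yields 0b10100100 and
// SlideMaskDownLanes(d, m, 2) yields 0b00011010; vacated lanes become false.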
1210
1211// ------------------------------ VecFromMask
1212
1213template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
1214HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1215 return Vec128<T, N>{_mm_movm_epi8(v.raw)};
1216}
1217
1218template <typename T, size_t N, HWY_IF_UI16(T)>
1219HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1220 return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1221}
1222
1223template <typename T, size_t N, HWY_IF_UI32(T)>
1224HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1225 return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1226}
1227
1228template <typename T, size_t N, HWY_IF_UI64(T)>
1229HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1230 return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1231}
1232
1233#if HWY_HAVE_FLOAT16
1234template <size_t N>
1235HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
1236 return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
1237}
1238#endif // HWY_HAVE_FLOAT16
1239
1240template <size_t N>
1241HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
1242 return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
1243}
1244
1245template <size_t N>
1246HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
1247 return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
1248}
1249
1250// Generic for all vector lengths.
1251template <class D>
1252HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
1253 return VecFromMask(v);
1254}
1255
1256// ------------------------------ RebindMask (MaskFromVec)
1257
1258template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
1259HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
1260 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
1261 return MFromD<DTo>{m.raw};
1262}
1263
1264// ------------------------------ IfThenElse
1265
1266namespace detail {
1267
1268template <typename T, size_t N>
1269HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
1270 Mask128<T, N> mask, Vec128<T, N> yes,
1271 Vec128<T, N> no) {
1272 return Vec128<T, N>{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)};
1273}
1274template <typename T, size_t N>
1275HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
1276 Mask128<T, N> mask, Vec128<T, N> yes,
1277 Vec128<T, N> no) {
1278 return Vec128<T, N>{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)};
1279}
1280template <typename T, size_t N>
1281HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
1282 Mask128<T, N> mask, Vec128<T, N> yes,
1283 Vec128<T, N> no) {
1284 return Vec128<T, N>{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)};
1285}
1286template <typename T, size_t N>
1287HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
1288 Mask128<T, N> mask, Vec128<T, N> yes,
1289 Vec128<T, N> no) {
1290 return Vec128<T, N>{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)};
1291}
1292
1293} // namespace detail
1294
1295template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1296HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1297 Vec128<T, N> no) {
1298 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
1299}
1300
1301#if HWY_HAVE_FLOAT16
1302template <size_t N>
1303HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask,
1304 Vec128<float16_t, N> yes,
1305 Vec128<float16_t, N> no) {
1306 return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)};
1307}
1308#endif // HWY_HAVE_FLOAT16
1309
1310// Generic for all vector lengths.
1311template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
1312HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
1313 const RebindToUnsigned<D> du;
1314 return BitCast(
1315 D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
1316}
1317
1318template <size_t N>
1319HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
1320 Vec128<float, N> yes, Vec128<float, N> no) {
1321 return Vec128<float, N>{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)};
1322}
1323
1324template <size_t N>
1325HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
1326 Vec128<double, N> yes,
1327 Vec128<double, N> no) {
1328 return Vec128<double, N>{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)};
1329}
1330
1331namespace detail {
1332
1333template <typename T, size_t N>
1334HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
1335 Mask128<T, N> mask, Vec128<T, N> yes) {
1336 return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
1337}
1338template <typename T, size_t N>
1339HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
1340 Mask128<T, N> mask, Vec128<T, N> yes) {
1341 return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
1342}
1343template <typename T, size_t N>
1344HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
1345 Mask128<T, N> mask, Vec128<T, N> yes) {
1346 return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
1347}
1348template <typename T, size_t N>
1349HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
1350 Mask128<T, N> mask, Vec128<T, N> yes) {
1351 return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
1352}
1353
1354} // namespace detail
1355
1356template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1357HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1358 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
1359}
1360
1361template <size_t N>
1362HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
1363 Vec128<float, N> yes) {
1364 return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
1365}
1366
1367template <size_t N>
1368HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
1369 Vec128<double, N> yes) {
1370 return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
1371}
1372
1373// Generic for all vector lengths.
1374template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
1375HWY_API V IfThenElseZero(MFromD<D> mask, V yes) {
1376 const RebindToUnsigned<D> du;
1377 return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
1378}
1379
1380namespace detail {
1381
1382template <typename T, size_t N>
1383HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
1384 Mask128<T, N> mask, Vec128<T, N> no) {
1385 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
1386 return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
1387}
1388template <typename T, size_t N>
1389HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
1390 Mask128<T, N> mask, Vec128<T, N> no) {
1391 return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
1392}
1393template <typename T, size_t N>
1394HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
1395 Mask128<T, N> mask, Vec128<T, N> no) {
1396 return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
1397}
1398template <typename T, size_t N>
1399HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
1400 Mask128<T, N> mask, Vec128<T, N> no) {
1401 return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
1402}
1403
1404} // namespace detail
1405
1406template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1407HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1408 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
1409}
1410
1411template <size_t N>
1412HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
1413 Vec128<float, N> no) {
1414 return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
1415}
1416
1417template <size_t N>
1418HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
1419 Vec128<double, N> no) {
1420 return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
1421}
1422
1423// Generic for all vector lengths.
1424template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
1425HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
1426 const RebindToUnsigned<D> du;
1427 return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
1428}
1429
1430// ------------------------------ Mask logical
1431
1432// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
1433#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
1434#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
1435 HWY_COMPILER_CLANG >= 800
1436#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
1437#else
1438#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
1439#endif
1440#endif // HWY_COMPILER_HAS_MASK_INTRINSICS
1441
1442namespace detail {
1443
1444template <typename T, size_t N>
1445HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
1446 const Mask128<T, N> b) {
1447#if HWY_COMPILER_HAS_MASK_INTRINSICS
1448 return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
1449#else
1450 return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
1451#endif
1452}
1453template <typename T, size_t N>
1454HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
1455 const Mask128<T, N> b) {
1456#if HWY_COMPILER_HAS_MASK_INTRINSICS
1457 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
1458#else
1459 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
1460#endif
1461}
1462template <typename T, size_t N>
1463HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
1464 const Mask128<T, N> b) {
1465#if HWY_COMPILER_HAS_MASK_INTRINSICS
1466 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
1467#else
1468 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
1469#endif
1470}
1471template <typename T, size_t N>
1472HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
1473 const Mask128<T, N> b) {
1474#if HWY_COMPILER_HAS_MASK_INTRINSICS
1475 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
1476#else
1477 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
1478#endif
1479}
1480
1481template <typename T, size_t N>
1482HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
1483 const Mask128<T, N> b) {
1484#if HWY_COMPILER_HAS_MASK_INTRINSICS
1485 return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
1486#else
1487 return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
1488#endif
1489}
1490template <typename T, size_t N>
1491HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
1492 const Mask128<T, N> b) {
1493#if HWY_COMPILER_HAS_MASK_INTRINSICS
1494 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
1495#else
1496 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
1497#endif
1498}
1499template <typename T, size_t N>
1500HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
1501 const Mask128<T, N> b) {
1502#if HWY_COMPILER_HAS_MASK_INTRINSICS
1503 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
1504#else
1505 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
1506#endif
1507}
1508template <typename T, size_t N>
1509HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
1510 const Mask128<T, N> b) {
1511#if HWY_COMPILER_HAS_MASK_INTRINSICS
1512 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
1513#else
1514 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
1515#endif
1516}
1517
1518template <typename T, size_t N>
1519HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
1520 const Mask128<T, N> b) {
1521#if HWY_COMPILER_HAS_MASK_INTRINSICS
1522 return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
1523#else
1524 return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
1525#endif
1526}
1527template <typename T, size_t N>
1528HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
1529 const Mask128<T, N> b) {
1530#if HWY_COMPILER_HAS_MASK_INTRINSICS
1531 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
1532#else
1533 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
1534#endif
1535}
1536template <typename T, size_t N>
1537HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
1538 const Mask128<T, N> b) {
1539#if HWY_COMPILER_HAS_MASK_INTRINSICS
1540 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
1541#else
1542 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
1543#endif
1544}
1545template <typename T, size_t N>
1546HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
1547 const Mask128<T, N> b) {
1548#if HWY_COMPILER_HAS_MASK_INTRINSICS
1549 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
1550#else
1551 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
1552#endif
1553}
1554
1555template <typename T, size_t N>
1556HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
1557 const Mask128<T, N> b) {
1558#if HWY_COMPILER_HAS_MASK_INTRINSICS
1559 return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
1560#else
1561 return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
1562#endif
1563}
1564template <typename T, size_t N>
1565HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
1566 const Mask128<T, N> b) {
1567#if HWY_COMPILER_HAS_MASK_INTRINSICS
1568 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
1569#else
1570 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
1571#endif
1572}
1573template <typename T, size_t N>
1574HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
1575 const Mask128<T, N> b) {
1576#if HWY_COMPILER_HAS_MASK_INTRINSICS
1577 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
1578#else
1579 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
1580#endif
1581}
1582template <typename T, size_t N>
1583HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
1584 const Mask128<T, N> b) {
1585#if HWY_COMPILER_HAS_MASK_INTRINSICS
1586 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
1587#else
1588 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
1589#endif
1590}
1591
1592template <typename T, size_t N>
1593HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
1594 const Mask128<T, N> a,
1595 const Mask128<T, N> b) {
1596#if HWY_COMPILER_HAS_MASK_INTRINSICS
1597 return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
1598#else
1599 return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
1600#endif
1601}
1602template <typename T, size_t N>
1603HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
1604 const Mask128<T, N> a,
1605 const Mask128<T, N> b) {
1606#if HWY_COMPILER_HAS_MASK_INTRINSICS
1607 return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
1608#else
1609 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
1610#endif
1611}
1612template <typename T, size_t N>
1613HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
1614 const Mask128<T, N> a,
1615 const Mask128<T, N> b) {
1616#if HWY_COMPILER_HAS_MASK_INTRINSICS
1617 return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
1618#else
1619 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
1620#endif
1621}
1622template <typename T, size_t N>
1623HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
1624 const Mask128<T, N> a,
1625 const Mask128<T, N> b) {
1626#if HWY_COMPILER_HAS_MASK_INTRINSICS
1627 return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
1628#else
1629 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
1630#endif
1631}
1632
1633// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
1634template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
1635HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
1636#if HWY_COMPILER_HAS_MASK_INTRINSICS
1637 return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))};
1638#else
1639 return Mask128<T, N>{static_cast<__mmask16>(~m.raw)};
1640#endif
1641}
1642
1643template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
1644HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
1645#if HWY_COMPILER_HAS_MASK_INTRINSICS
1646 return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))};
1647#else
1648 return Mask128<T, N>{static_cast<__mmask8>(~m.raw)};
1649#endif
1650}
1651
1652template <typename T>
1653HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
1654 // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid
1655 return UnmaskedNot(m);
1656}
1657template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)>
1658HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) {
1659 // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there
1660 // are fewer than 16 valid bits in m
1661
1662 // Return (~m) & ((1ull << N) - 1)
1663 return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
1664}
1665template <typename T>
1666HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
1667 // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid
1668 return UnmaskedNot(m);
1669}
1670template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)>
1671HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) {
1672 // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there
1673 // are fewer than 8 valid bits in m
1674
1675 // Return (~m) & ((1ull << N) - 1)
1676 return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
1677}
1678template <typename T, size_t N>
1679HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) {
1680 // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most
1681 // 4 valid bits in m
1682
1683 // Return (~m) & ((1ull << N) - 1)
1684 return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
1685}
1686template <typename T, size_t N>
1687HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) {
1688 // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most
1689 // 2 valid bits in m
1690
1691 // Return (~m) & ((1ull << N) - 1)
1692 return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
1693}
1694
1695} // namespace detail
1696
1697template <typename T, size_t N>
1698HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1699 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1700}
1701
1702template <typename T, size_t N>
1703HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1704 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1705}
1706
1707template <typename T, size_t N>
1708HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1709 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1710}
1711
1712template <typename T, size_t N>
1713HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1714 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1715}
1716
1717template <typename T, size_t N>
1718HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1719 // Flip only the valid bits
1720 return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1721}
1722
1723template <typename T, size_t N>
1724HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
1725 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
1726}
1727
1728#else // AVX2 or below
1729
1730// ------------------------------ Mask
1731
1732// Mask and Vec are the same (true = FF..FF).
1733template <typename T, size_t N>
1734HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1735 return Mask128<T, N>{v.raw};
1736}
1737
1738template <class D>
1739using MFromD = decltype(MaskFromVec(VFromD<D>()));
1740
1741template <typename T, size_t N>
1742HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1743 return Vec128<T, N>{v.raw};
1744}
1745
1746// Generic for all vector lengths.
1747template <class D>
1748HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
1749 return VecFromMask(v);
1750}
1751
1752#if HWY_TARGET >= HWY_SSSE3
1753
1754// mask ? yes : no
1755template <typename T, size_t N>
1756HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1757 Vec128<T, N> no) {
1758 const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
1759 return Or(And(vmask, yes), AndNot(vmask, no));
1760}
1761
1762#else // HWY_TARGET < HWY_SSSE3
1763
1764// mask ? yes : no
1765template <typename T, size_t N>
1766HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1767 Vec128<T, N> no) {
1768 return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
1769}
1770template <size_t N>
1771HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
1772 Vec128<float, N> yes, Vec128<float, N> no) {
1773 return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
1774}
1775template <size_t N>
1776HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
1777 Vec128<double, N> yes,
1778 Vec128<double, N> no) {
1779 return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
1780}
1781
1782#endif // HWY_TARGET >= HWY_SSSE3
1783
1784// mask ? yes : 0
1785template <typename T, size_t N>
1786HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1787 return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
1788}
1789
1790// mask ? 0 : no
1791template <typename T, size_t N>
1792HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1793 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
1794}
1795
1796// ------------------------------ Mask logical
1797
1798template <typename T, size_t N>
1799HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1800 const Simd<T, N, 0> d;
1801 return MaskFromVec(Not(VecFromMask(d, m)));
1802}
1803
1804template <typename T, size_t N>
1805HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1806 const Simd<T, N, 0> d;
1807 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1808}
1809
1810template <typename T, size_t N>
1811HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1812 const Simd<T, N, 0> d;
1813 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1814}
1815
1816template <typename T, size_t N>
1817HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1818 const Simd<T, N, 0> d;
1819 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1820}
1821
1822template <typename T, size_t N>
1823HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1824 const Simd<T, N, 0> d;
1825 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1826}
1827
1828template <typename T, size_t N>
1829HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
1830 const Simd<T, N, 0> d;
1831 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
1832}
1833
1834#endif // HWY_TARGET <= HWY_AVX3
1835
1836// ------------------------------ ShiftLeft
1837
1838template <int kBits, size_t N>
1839HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
1840 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1841}
1842
1843template <int kBits, size_t N>
1844HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
1845 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1846}
1847
1848template <int kBits, size_t N>
1849HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
1850 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1851}
1852
1853template <int kBits, size_t N>
1854HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
1855 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1856}
1857template <int kBits, size_t N>
1858HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
1859 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1860}
1861template <int kBits, size_t N>
1862HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
1863 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1864}
1865
1866#if HWY_TARGET <= HWY_AVX3_DL
1867
1868namespace detail {
1869template <typename T, size_t N>
1870HWY_INLINE Vec128<T, N> GaloisAffine(
1871 Vec128<T, N> v, VFromD<Repartition<uint64_t, Simd<T, N, 0>>> matrix) {
1872 return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
1873}
1874} // namespace detail
1875
1876#else // HWY_TARGET > HWY_AVX3_DL
1877
1878template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
1879HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
1880 const DFromV<decltype(v)> d8;
1881 // Use raw instead of BitCast to support N=1.
1882 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
1883 return kBits == 1
1884 ? (v + v)
1885 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1886}
1887
1888#endif // HWY_TARGET > HWY_AVX3_DL
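// Editorial note (not part of the upstream header): there is no 8-bit shift
// instruction before AVX3_DL, so the fallback above shifts 16-bit lanes and
// then clears the bits that leaked in from the lower neighbor byte. Worked
// example for kBits = 3 and a byte 0b10110101:
//   true 8-bit shift : 0b10101000
//   16-bit shift     : 0b10101xyz, where xyz are the top 3 bits of the
//                      neighboring byte
//   & (0xFF << 3)    : 0b10101000, the neighbor bits are masked off.
// kBits == 1 is special-cased as v + v, a single 8-bit addition.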
1889
1890// ------------------------------ ShiftRight
1891
1892template <int kBits, size_t N>
1893HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
1894 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
1895}
1896template <int kBits, size_t N>
1897HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
1898 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
1899}
1900template <int kBits, size_t N>
1901HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
1902 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
1903}
1904
1905template <int kBits, size_t N>
1906HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
1907 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
1908}
1909template <int kBits, size_t N>
1910HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
1911 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
1912}
1913
1914#if HWY_TARGET > HWY_AVX3_DL
1915
1916template <int kBits, size_t N>
1917HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
1918 const DFromV<decltype(v)> d8;
1919 // Use raw instead of BitCast to support N=1.
1920 const Vec128<uint8_t, N> shifted{
1921 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
1922 return shifted & Set(d8, 0xFF >> kBits);
1923}
1924
1925template <int kBits, size_t N>
1926HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
1927 const DFromV<decltype(v)> di;
1928 const RebindToUnsigned<decltype(di)> du;
1929 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1930 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1931 return (shifted ^ shifted_sign) - shifted_sign;
1932}
1933
1934#endif // HWY_TARGET > HWY_AVX3_DL
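// Editorial note (not part of the upstream header): the signed i8 ShiftRight
// above relies on the identity  (x >>a k) == ((x >>l k) ^ s) - s  with
// s = 0x80 >> k, where >>a is the arithmetic and >>l the logical shift.
// Example for k = 2 and x = 0x84 (-124): the logical shift gives 0x21,
// s = 0x20, and (0x21 ^ 0x20) - 0x20 = 0xE1 = -31, matching the arithmetic
// shift.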
1935
1936// i64 is implemented after BroadcastSignBit.
1937
1938// ================================================== MEMORY (1)
1939
1940// Clang static analysis claims the memory immediately after a partial vector
1941// store is uninitialized, and also flags the input to partial loads (at least
1942// for loadl_pd) as "garbage". This is a false alarm because msan does not
1943// raise errors. We work around this by using CopyBytes instead of intrinsics,
1944// but only for the analyzer to avoid potentially bad code generation.
1945// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1946#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1947#if defined(__clang_analyzer__) || \
1948 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1949#define HWY_SAFE_PARTIAL_LOAD_STORE 1
1950#else
1951#define HWY_SAFE_PARTIAL_LOAD_STORE 0
1952#endif
1953#endif // HWY_SAFE_PARTIAL_LOAD_STORE
1954
1955// ------------------------------ Load
1956
1957template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
1958HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
1959 return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1960}
1961#if HWY_HAVE_FLOAT16
1962template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
1963HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) {
1964 return Vec128<float16_t>{_mm_load_ph(aligned)};
1965}
1966#endif // HWY_HAVE_FLOAT16
1967// Generic for all vector lengths greater than or equal to 16 bytes.
1968template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
1969HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
1970 const RebindToUnsigned<decltype(d)> du;
1971 return BitCast(d, Load(du, detail::U16LanePointer(aligned)));
1972}
1973template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
1974HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
1975 return Vec128<float>{_mm_load_ps(aligned)};
1976}
1977template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
1978HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) {
1979 return Vec128<double>{_mm_load_pd(aligned)};
1980}
1981
1982template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
1983HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
1984 return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1985}
1986#if HWY_HAVE_FLOAT16
1987template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
1988HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) {
1989 return Vec128<float16_t>{_mm_loadu_ph(p)};
1990}
1991#endif // HWY_HAVE_FLOAT16
1992// Generic for all vector lengths greater than or equal to 16 bytes.
1993template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
1994HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1995 const RebindToUnsigned<decltype(d)> du;
1996 return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
1997}
1998template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
1999HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
2000 return Vec128<float>{_mm_loadu_ps(p)};
2001}
2002template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
2003HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
2004 return Vec128<double>{_mm_loadu_pd(p)};
2005}
2006
2007template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
2008HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
2009 const RebindToUnsigned<decltype(d)> du; // for float16_t
2010#if HWY_SAFE_PARTIAL_LOAD_STORE
2011 __m128i v = _mm_setzero_si128();
2012 CopyBytes<8>(p, &v); // not same size
2013#else
2014 const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
2015#endif
2016 return BitCast(d, VFromD<decltype(du)>{v});
2017}
2018
2019template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
2020HWY_API Vec64<float> Load(D /* tag */, const float* HWY_RESTRICT p) {
2021#if HWY_SAFE_PARTIAL_LOAD_STORE
2022 __m128 v = _mm_setzero_ps();
2023 CopyBytes<8>(p, &v); // not same size
2024 return Vec64<float>{v};
2025#else
2026 const __m128 hi = _mm_setzero_ps();
2027 return Vec64<float>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
2028#endif
2029}
2030
2031template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
2032HWY_API Vec64<double> Load(D /* tag */, const double* HWY_RESTRICT p) {
2033#if HWY_SAFE_PARTIAL_LOAD_STORE
2034 __m128d v = _mm_setzero_pd();
2035 CopyBytes<8>(p, &v); // not same size
2036 return Vec64<double>{v};
2037#else
2038 return Vec64<double>{_mm_load_sd(p)};
2039#endif
2040}
2041
2042template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
2043HWY_API Vec32<float> Load(D /* tag */, const float* HWY_RESTRICT p) {
2044#if HWY_SAFE_PARTIAL_LOAD_STORE
2045 __m128 v = _mm_setzero_ps();
2046 CopyBytes<4>(p, &v); // not same size
2047 return Vec32<float>{v};
2048#else
2049 return Vec32<float>{_mm_load_ss(p)};
2050#endif
2051}
2052
2053// Any <= 32 bit except <float, 1>
2054template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
2055HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
2056 const RebindToUnsigned<decltype(d)> du; // for float16_t
2057 // Clang ArgumentPromotionPass seems to break this code. We can unpoison
2058 // before SetTableIndices -> LoadU -> Load and the memory is poisoned again.
2060 detail::MaybeUnpoison(p, Lanes(d));
2061#if HWY_SAFE_PARTIAL_LOAD_STORE
2062 __m128i v = Zero(Full128<TFromD<decltype(du)>>()).raw;
2063 CopyBytes<d.MaxBytes()>(p, &v); // not same size as VFromD
2064#else
2065 int32_t bits = 0;
2066 CopyBytes<d.MaxBytes()>(p, &bits); // not same size as VFromD
2067 const __m128i v = _mm_cvtsi32_si128(bits);
2068#endif
2069 return BitCast(d, VFromD<decltype(du)>{v});
2070}
2071
2072// For < 128 bit, LoadU == Load.
2073template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2074HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
2075 return Load(d, p);
2076}
2077
2078// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2079template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
2080HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
2081 return LoadU(d, p);
2082}
2083
2084// ------------------------------ Store
2085
2086template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2087HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
2088 _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
2089}
2090#if HWY_HAVE_FLOAT16
2091template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
2092HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) {
2093 _mm_store_ph(aligned, v.raw);
2094}
2095#endif // HWY_HAVE_FLOAT16
2096// Generic for all vector lengths greater than or equal to 16 bytes.
2097template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
2098HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
2099 const RebindToUnsigned<decltype(d)> du;
2100 Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
2101}
2102template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
2103HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
2104 _mm_store_ps(aligned, v.raw);
2105}
2106template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
2107HWY_API void Store(Vec128<double> v, D /* tag */,
2108 double* HWY_RESTRICT aligned) {
2109 _mm_store_pd(aligned, v.raw);
2110}
2111
2112template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2113HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
2114 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
2115}
2116#if HWY_HAVE_FLOAT16
2117template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
2118HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) {
2119 _mm_storeu_ph(p, v.raw);
2120}
2121#endif // HWY_HAVE_FLOAT16
2122// Generic for all vector lengths greater than or equal to 16 bytes.
2123template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
2124HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
2125 const RebindToUnsigned<decltype(d)> du;
2126 StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
2127}
2128template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
2129HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) {
2130 _mm_storeu_ps(p, v.raw);
2131}
2132template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
2133HWY_API void StoreU(Vec128<double> v, D /* tag */, double* HWY_RESTRICT p) {
2134 _mm_storeu_pd(p, v.raw);
2135}
2136
2137template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
2138HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
2139#if HWY_SAFE_PARTIAL_LOAD_STORE
2140 (void)d;
2141 CopyBytes<8>(&v, p); // not same size
2142#else
2143 const RebindToUnsigned<decltype(d)> du; // for float16_t
2144 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw);
2145#endif
2146}
2147template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
2148HWY_API void Store(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
2149#if HWY_SAFE_PARTIAL_LOAD_STORE
2150 CopyBytes<8>(&v, p); // not same size
2151#else
2152 _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
2153#endif
2154}
2155template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
2156HWY_API void Store(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
2157#if HWY_SAFE_PARTIAL_LOAD_STORE
2158 CopyBytes<8>(&v, p); // not same size
2159#else
2160 _mm_storel_pd(p, v.raw);
2161#endif
2162}
2163
2164// Any <= 32 bit except <float, 1>
2165template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
2166HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
2167 CopyBytes<d.MaxBytes()>(&v, p); // not same size
2168}
2169template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
2170HWY_API void Store(Vec32<float> v, D /* tag */, float* HWY_RESTRICT p) {
2171#if HWY_SAFE_PARTIAL_LOAD_STORE
2172 CopyBytes<4>(&v, p); // not same size
2173#else
2174 _mm_store_ss(p, v.raw);
2175#endif
2176}
2177
2178// For < 128 bit, StoreU == Store.
2179template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2180HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
2181 Store(v, d, p);
2182}
2183
2184// ================================================== SWIZZLE (1)
2185
2186// ------------------------------ TableLookupBytes
2187template <typename T, size_t N, typename TI, size_t NI>
2188HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
2189 const Vec128<TI, NI> from) {
2190 const DFromV<decltype(from)> d;
2191 const Repartition<uint8_t, decltype(d)> du8;
2192
2193 const DFromV<decltype(bytes)> d_bytes;
2194 const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
2195#if HWY_TARGET == HWY_SSE2
2196#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
2197 typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
2198 (void)d;
2199 (void)du8;
2200 (void)d_bytes;
2201 (void)du8_bytes;
2202 return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
2203 __builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw),
2204 reinterpret_cast<GccU8RawVectType>(from.raw)))};
2205#else
2206 const Full128<uint8_t> du8_full;
2207
2208 alignas(16) uint8_t result_bytes[16];
2209 alignas(16) uint8_t u8_bytes[16];
2210 alignas(16) uint8_t from_bytes[16];
2211
2212 Store(Vec128<uint8_t>{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes);
2213 Store(Vec128<uint8_t>{BitCast(du8, from).raw}, du8_full, from_bytes);
2214
2215 for (int i = 0; i < 16; i++) {
2216 result_bytes[i] = u8_bytes[from_bytes[i] & 15];
2217 }
2218
2219 return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw});
2220#endif
2221#else // SSSE3 or newer
2222 return BitCast(
2223 d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw,
2224 BitCast(du8, from).raw)});
2225#endif
2226}
2227
2228// ------------------------------ TableLookupBytesOr0
2229// For all vector widths; on SSSE3/SSE4/AVX2/AVX3, x86 already zeroes the output byte when the index is >= 0x80.
2230template <class V, class VI>
2231HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
2232#if HWY_TARGET == HWY_SSE2
2233 const DFromV<decltype(from)> d;
2234 const Repartition<int8_t, decltype(d)> di8;
2235
2236 const auto di8_from = BitCast(di8, from);
2237 return BitCast(d, IfThenZeroElse(di8_from < Zero(di8),
2238 TableLookupBytes(bytes, di8_from)));
2239#else
2240 return TableLookupBytes(bytes, from);
2241#endif
2242}
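// Editorial usage sketch (not part of the upstream header): an index byte
// with the MSB set selects zero, which makes "permute or clear" tables easy:
//   const Full128<uint8_t> d;
//   const auto tbl = Iota(d, 1);  // {1, 2, ..., 16}
//   const auto idx = Dup128VecFromValues(d, 0, 2, 4, 6, 0x80, 0x80, 0x80, 0x80,
//                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
//                                        0x80, 0x80);
//   const auto r = TableLookupBytesOr0(tbl, idx);  // {1, 3, 5, 7, 0, ..., 0}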
2243
2244// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)
2245
2246// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
2247// Shuffle0321 rotates one lane to the right (the previous least-significant
2248// lane is now most-significant). These could also be implemented via
2249// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
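// Editorial example (not part of the upstream header): writing the input as
// {3, 2, 1, 0} (most-significant lane first), Shuffle0321 produces
// {0, 3, 2, 1}: lane 0 wraps around to the most-significant position, i.e. a
// rotation right by one lane. Shuffle2103 is the corresponding left rotation
// and Shuffle0123 reverses all four lanes.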
2250
2251// Swap 32-bit halves in 64-bit halves.
2252template <typename T, size_t N>
2253HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
2254 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2255 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2256 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
2257}
2258template <size_t N>
2259HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
2260 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2261 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
2262}
2263
2264// These are used by generic_ops-inl to implement LoadInterleaved3. As with
2265// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
2266// comes from the first argument.
2267namespace detail {
2268
2269template <typename T, HWY_IF_T_SIZE(T, 1)>
2270HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
2271 const DFromV<decltype(a)> d;
2272 const Twice<decltype(d)> d2;
2273 const auto ba = Combine(d2, b, a);
2274#if HWY_TARGET == HWY_SSE2
2275 Vec32<uint16_t> ba_shuffled{
2276 _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2277 return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled)));
2278#else
2279 const RebindToUnsigned<decltype(d2)> d2_u;
2280 const auto shuffle_idx =
2281 BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
2282 0, 0, 0, 0));
2283 return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
2284#endif
2285}
2286template <typename T, HWY_IF_T_SIZE(T, 2)>
2287HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
2288 const DFromV<decltype(a)> d;
2289 const Twice<decltype(d)> d2;
2290 const auto ba = Combine(d2, b, a);
2291#if HWY_TARGET == HWY_SSE2
2292 Vec64<uint32_t> ba_shuffled{
2293 _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2294 return Vec64<T>{
2295 _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))};
2296#else
2297 const RebindToUnsigned<decltype(d2)> d2_u;
2298 const auto shuffle_idx = BitCast(
2299 d2,
2300 Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0));
2301 return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
2302#endif
2303}
2304template <typename T, HWY_IF_T_SIZE(T, 4)>
2305HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) {
2306 const DFromV<decltype(a)> d;
2307 const RebindToFloat<decltype(d)> df;
2308 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
2309 return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
2310 BitCast(df, b).raw, m)});
2311}
2312
2313template <typename T, HWY_IF_T_SIZE(T, 1)>
2314HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
2315 const DFromV<decltype(a)> d;
2316#if HWY_TARGET == HWY_SSE2
2317 const auto zero = Zero(d);
2318 const Rebind<int16_t, decltype(d)> di16;
2319 const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
2320 _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
2321 const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
2322 _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
2323 const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled);
2324 return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
2325#else
2326 const Twice<decltype(d)> d2;
2327 const auto ba = Combine(d2, b, a);
2328 const RebindToUnsigned<decltype(d2)> d2_u;
2329 const auto shuffle_idx =
2330 BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0,
2331 0, 0, 0, 0));
2332 return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
2333#endif
2334}
2335template <typename T, HWY_IF_T_SIZE(T, 2)>
2336HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
2337 const DFromV<decltype(a)> d;
2338#if HWY_TARGET == HWY_SSE2
2339 const Vec32<T> a_shuffled{
2340 _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2341 const Vec32<T> b_shuffled{
2342 _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))};
2343 return Combine(d, b_shuffled, a_shuffled);
2344#else
2345 const Twice<decltype(d)> d2;
2346 const auto ba = Combine(d2, b, a);
2347 const RebindToUnsigned<decltype(d2)> d2_u;
2348 const auto shuffle_idx = BitCast(
2349 d2,
2350 Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0));
2351 return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
2352#endif
2353}
2354template <typename T, HWY_IF_T_SIZE(T, 4)>
2355HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) {
2356 const DFromV<decltype(a)> d;
2357 const RebindToFloat<decltype(d)> df;
2358 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
2359 return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
2360 BitCast(df, b).raw, m)});
2361}
2362
2363template <typename T, HWY_IF_T_SIZE(T, 1)>
2364HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
2365 const DFromV<decltype(a)> d;
2366#if HWY_TARGET == HWY_SSE2
2367 const auto zero = Zero(d);
2368 const Rebind<int16_t, decltype(d)> di16;
2369 const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
2370 _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
2371 const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
2372 _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
2373 const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled);
2374 return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
2375#else
2376 const Twice<decltype(d)> d2;
2377 const auto ba = Combine(d2, b, a);
2378 const RebindToUnsigned<decltype(d2)> d2_u;
2379 const auto shuffle_idx =
2380 BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0,
2381 0, 0, 0, 0));
2382 return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
2383#endif
2384}
2385template <typename T, HWY_IF_T_SIZE(T, 2)>
2386HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
2387 const DFromV<decltype(a)> d;
2388#if HWY_TARGET == HWY_SSE2
2389 const Vec32<T> a_shuffled{
2390 _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))};
2391 const Vec32<T> b_shuffled{
2392 _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))};
2393 return Combine(d, b_shuffled, a_shuffled);
2394#else
2395 const Twice<decltype(d)> d2;
2396 const auto ba = Combine(d2, b, a);
2397 const RebindToUnsigned<decltype(d2)> d2_u;
2398 const auto shuffle_idx = BitCast(
2399 d2,
2400 Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0));
2401 return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
2402#endif
2403}
2404template <typename T, HWY_IF_T_SIZE(T, 4)>
2405HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) {
2406 const DFromV<decltype(a)> d;
2407 const RebindToFloat<decltype(d)> df;
2408 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
2409 return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
2410 BitCast(df, b).raw, m)});
2411}
2412
2413} // namespace detail
2414
2415// Swap 64-bit halves
2416HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
2417 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
2418}
2419HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
2420 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
2421}
2422HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
2423 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
2424}
2425HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
2426 return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
2427}
2428HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
2429 return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
2430}
2431HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
2432 return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
2433}
2434
2435// Rotate right 32 bits
2436HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
2437 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
2438}
2439HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
2440 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
2441}
2442HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
2443 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
2444}
2445// Rotate left 32 bits
2446HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
2447 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
2448}
2449HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
2450 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
2451}
2452HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
2453 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
2454}
2455
2456// Reverse
2457HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
2458 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
2459}
2460HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
2461 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
2462}
2463HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
2464 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
2465}
2466
2467// ================================================== COMPARE
2468
2469#if HWY_TARGET <= HWY_AVX3
2470
2471// Comparisons set a mask bit to 1 if the condition is true, else 0.
2472
2473// ------------------------------ TestBit
2474
2475namespace detail {
2476
2477template <typename T, size_t N>
2478HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
2479 const Vec128<T, N> bit) {
2480 return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
2481}
2482template <typename T, size_t N>
2483HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
2484 const Vec128<T, N> bit) {
2485 return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
2486}
2487template <typename T, size_t N>
2488HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
2489 const Vec128<T, N> bit) {
2490 return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
2491}
2492template <typename T, size_t N>
2493HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
2494 const Vec128<T, N> bit) {
2495 return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
2496}
2497
2498} // namespace detail
2499
2500template <typename T, size_t N>
2501HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
2502 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
2503 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
2504}
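// Editorial usage sketch (not part of the upstream header): TestBit requires
// exactly one bit set per lane of `bit` and reports whether that bit is set
// in v, e.g.:
//   const Full128<uint32_t> d;
//   const auto m = TestBit(Set(d, 10u), Set(d, 2u));  // bit 1 of 0b1010: true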
2505
2506// ------------------------------ Equality
2507
2508template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2509HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
2510 return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
2511}
2512
2513template <typename T, size_t N, HWY_IF_UI16(T)>
2514HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
2515 return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
2516}
2517
2518template <typename T, size_t N, HWY_IF_UI32(T)>
2519HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
2520 return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
2521}
2522
2523template <typename T, size_t N, HWY_IF_UI64(T)>
2524HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
2525 return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
2526}
2527
2528#if HWY_HAVE_FLOAT16
2529template <size_t N>
2530HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a,
2531 Vec128<float16_t, N> b) {
2532 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2533 HWY_DIAGNOSTICS(push)
2534 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2535 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2536 HWY_DIAGNOSTICS(pop)
2537}
2538#endif // HWY_HAVE_FLOAT16
2539template <size_t N>
2540HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
2541 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2542}
2543
2544template <size_t N>
2545HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
2546 Vec128<double, N> b) {
2547 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
2548}
2549
2550// ------------------------------ Inequality
2551
2552template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
2553HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
2554 return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
2555}
2556
2557template <typename T, size_t N, HWY_IF_UI16(T)>
2558HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
2559 return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
2560}
2561
2562template <typename T, size_t N, HWY_IF_UI32(T)>
2563HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
2564 return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
2565}
2566
2567template <typename T, size_t N, HWY_IF_UI64(T)>
2568HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
2569 return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
2570}
2571
2572#if HWY_HAVE_FLOAT16
2573template <size_t N>
2574HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a,
2575 Vec128<float16_t, N> b) {
2576 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2577 HWY_DIAGNOSTICS(push)
2578 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2579 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2580 HWY_DIAGNOSTICS(pop)
2581}
2582#endif // HWY_HAVE_FLOAT16
2583template <size_t N>
2584HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
2585 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2586}
2587
2588template <size_t N>
2589HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
2590 Vec128<double, N> b) {
2591 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
2592}
2593
2594// ------------------------------ Strict inequality
2595
2596// Signed/float <
2597template <size_t N>
2598HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
2599 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
2600}
2601template <size_t N>
2602HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
2603 Vec128<int16_t, N> b) {
2604 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
2605}
2606template <size_t N>
2607HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
2608 Vec128<int32_t, N> b) {
2609 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
2610}
2611template <size_t N>
2612HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
2613 Vec128<int64_t, N> b) {
2614 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
2615}
2616
2617template <size_t N>
2618HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
2619 Vec128<uint8_t, N> b) {
2620 return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
2621}
2622template <size_t N>
2623HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
2624 Vec128<uint16_t, N> b) {
2625 return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
2626}
2627template <size_t N>
2628HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
2629 Vec128<uint32_t, N> b) {
2630 return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
2631}
2632template <size_t N>
2633HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
2634 Vec128<uint64_t, N> b) {
2635 return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
2636}
2637
2638#if HWY_HAVE_FLOAT16
2639template <size_t N>
2640HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a,
2641 Vec128<float16_t, N> b) {
2642 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2643 HWY_DIAGNOSTICS(push)
2644 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2645 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
2646 HWY_DIAGNOSTICS(pop)
2647}
2648#endif // HWY_HAVE_FLOAT16
2649template <size_t N>
2650HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
2651 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
2652}
2653template <size_t N>
2654HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
2655 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
2656}
2657
2658// ------------------------------ Weak inequality
2659
2660#if HWY_HAVE_FLOAT16
2661template <size_t N>
2662HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a,
2663 Vec128<float16_t, N> b) {
2664 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2665 HWY_DIAGNOSTICS(push)
2666 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2667 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
2668 HWY_DIAGNOSTICS(pop)
2669}
2670#endif // HWY_HAVE_FLOAT16
2671template <size_t N>
2672HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
2673 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
2674}
2675template <size_t N>
2676HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
2677 Vec128<double, N> b) {
2678 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
2679}
2680
2681template <size_t N>
2682HWY_API Mask128<int8_t, N> operator>=(Vec128<int8_t, N> a,
2683 Vec128<int8_t, N> b) {
2684 return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)};
2685}
2686template <size_t N>
2687HWY_API Mask128<int16_t, N> operator>=(Vec128<int16_t, N> a,
2688 Vec128<int16_t, N> b) {
2689 return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)};
2690}
2691template <size_t N>
2692HWY_API Mask128<int32_t, N> operator>=(Vec128<int32_t, N> a,
2693 Vec128<int32_t, N> b) {
2694 return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)};
2695}
2696template <size_t N>
2697HWY_API Mask128<int64_t, N> operator>=(Vec128<int64_t, N> a,
2698 Vec128<int64_t, N> b) {
2699 return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)};
2700}
2701
2702template <size_t N>
2703HWY_API Mask128<uint8_t, N> operator>=(Vec128<uint8_t, N> a,
2704 Vec128<uint8_t, N> b) {
2705 return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)};
2706}
2707template <size_t N>
2708HWY_API Mask128<uint16_t, N> operator>=(Vec128<uint16_t, N> a,
2709 Vec128<uint16_t, N> b) {
2710 return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)};
2711}
2712template <size_t N>
2713HWY_API Mask128<uint32_t, N> operator>=(Vec128<uint32_t, N> a,
2714 Vec128<uint32_t, N> b) {
2715 return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)};
2716}
2717template <size_t N>
2718HWY_API Mask128<uint64_t, N> operator>=(Vec128<uint64_t, N> a,
2719 Vec128<uint64_t, N> b) {
2720 return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)};
2721}
2722
2723#else // AVX2 or below
2724
2725// Comparisons fill a lane with 1-bits if the condition is true, else 0.
2726
2727template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)>
2728HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) {
2729 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
2730 const Simd<TFrom, NFrom, 0> d;
2731 return MaskFromVec(BitCast(dto, VecFromMask(d, m)));
2732}
2733
2734template <typename T, size_t N>
2735HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
2736 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
2737 return (v & bit) == bit;
2738}
2739
2740// ------------------------------ Equality
2741
2742// Unsigned
2743template <size_t N>
2744HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a,
2745 Vec128<uint8_t, N> b) {
2746 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
2747}
2748template <size_t N>
2749HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a,
2750 Vec128<uint16_t, N> b) {
2751 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
2752}
2753template <size_t N>
2754HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a,
2755 Vec128<uint32_t, N> b) {
2756 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
2757}
2758template <size_t N>
2759HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
2760 const Vec128<uint64_t, N> b) {
2761#if HWY_TARGET >= HWY_SSSE3
2762 const DFromV<decltype(a)> d64;
2763 const RepartitionToNarrow<decltype(d64)> d32;
2764 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2765 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2766 return MaskFromVec(BitCast(d64, cmp64));
2767#else
2768 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
2769#endif
2770}
2771
2772// Signed
2773template <size_t N>
2774HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a,
2775 Vec128<int8_t, N> b) {
2776 return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
2777}
2778template <size_t N>
2779HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
2780 Vec128<int16_t, N> b) {
2781 return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
2782}
2783template <size_t N>
2784HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a,
2785 Vec128<int32_t, N> b) {
2786 return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
2787}
2788template <size_t N>
2789HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
2790 const Vec128<int64_t, N> b) {
2791 // Equality is identical for signed and unsigned lanes; reuse the unsigned == to avoid duplicating the SSSE3 workaround.
2792 const DFromV<decltype(a)> d;
2793 RebindToUnsigned<decltype(d)> du;
2794 return RebindMask(d, BitCast(du, a) == BitCast(du, b));
2795}
2796
2797// Float
2798template <size_t N>
2799HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
2800 return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
2801}
2802template <size_t N>
2803HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
2804 Vec128<double, N> b) {
2805 return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
2806}
2807
2808// ------------------------------ Inequality
2809
2810// This cannot have T as a template argument, otherwise it is not more
2811// specialized than rewritten operator== in C++20, leading to compile
2812// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
2813template <size_t N>
2814HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
2815 Vec128<uint8_t, N> b) {
2816 return Not(a == b);
2817}
2818template <size_t N>
2819HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
2820 Vec128<uint16_t, N> b) {
2821 return Not(a == b);
2822}
2823template <size_t N>
2824HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
2825 Vec128<uint32_t, N> b) {
2826 return Not(a == b);
2827}
2828template <size_t N>
2829HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
2830 Vec128<uint64_t, N> b) {
2831 return Not(a == b);
2832}
2833template <size_t N>
2834HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
2835 Vec128<int8_t, N> b) {
2836 return Not(a == b);
2837}
2838template <size_t N>
2839HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
2840 Vec128<int16_t, N> b) {
2841 return Not(a == b);
2842}
2843template <size_t N>
2844HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
2845 Vec128<int32_t, N> b) {
2846 return Not(a == b);
2847}
2848template <size_t N>
2849HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
2850 Vec128<int64_t, N> b) {
2851 return Not(a == b);
2852}
2853
2854template <size_t N>
2855HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
2856 return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
2857}
2858template <size_t N>
2859HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
2860 Vec128<double, N> b) {
2861 return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
2862}
2863
2864// ------------------------------ Strict inequality
2865
2866namespace detail {
2867
2868template <size_t N>
2869HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
2870 Vec128<int8_t, N> b) {
2871 return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
2872}
2873template <size_t N>
2874HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
2875 Vec128<int16_t, N> b) {
2876 return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
2877}
2878template <size_t N>
2879HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
2880 Vec128<int32_t, N> b) {
2881 return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
2882}
2883
2884template <size_t N>
2885HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
2886 const Vec128<int64_t, N> a,
2887 const Vec128<int64_t, N> b) {
2888#if HWY_TARGET >= HWY_SSSE3
2889 // See https://stackoverflow.com/questions/65166174/:
2890 const DFromV<decltype(a)> d;
2891 const RepartitionToNarrow<decltype(d)> d32;
2892 const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
2893 const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
2894 // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
2895 // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
2896 const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
2897 // Duplicate upper to lower half.
2898 return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
2899#else
2900 return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
2901#endif
2902}
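// Editorial note (not part of the upstream header): in the SSSE3 path above,
// when the upper 32-bit halves are equal, the upper half of the 64-bit
// difference b - a is 0 minus the borrow from the lower half, i.e. all-ones
// exactly when a.lower > b.lower (unsigned) and all-zeros otherwise. Thus
// OrAnd(m_gt32, m_eq32, Sub(b, a)) yields an all-zeros/all-ones upper half in
// every case, and the final shuffle broadcasts it to the lower half to form a
// proper 64-bit mask.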
2903
2904template <typename T, size_t N>
2905HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
2906 Vec128<T, N> b) {
2907 const DFromV<decltype(a)> du;
2908 const RebindToSigned<decltype(du)> di;
2909 const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
2910 const auto sa = BitCast(di, Xor(a, msb));
2911 const auto sb = BitCast(di, Xor(b, msb));
2912 return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
2913}
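// Editorial note (not part of the upstream header): SSE2..SSE4 only provide
// signed integer comparisons, so the unsigned Gt above flips the sign bit of
// both operands (Xor with the per-lane MSB constant) and compares as signed.
// The bias preserves ordering, e.g. for uint8_t: 0xFF ^ 0x80 = 0x7F (+127)
// still compares greater than 0x01 ^ 0x80 = 0x81 (-127).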
2914
2915template <size_t N>
2916HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
2917 Vec128<float, N> b) {
2918 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
2919}
2920template <size_t N>
2921HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
2922 Vec128<double, N> b) {
2923 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
2924}
2925
2926} // namespace detail
2927
2928template <typename T, size_t N>
2929HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
2930 return detail::Gt(hwy::TypeTag<T>(), a, b);
2931}
2932
2933// ------------------------------ Weak inequality
2934
2935namespace detail {
2936template <typename T, size_t N>
2937HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a,
2938 Vec128<T, N> b) {
2939 return Not(Gt(tag, b, a));
2940}
2941
2942template <typename T, size_t N>
2943HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a,
2944 Vec128<T, N> b) {
2945 return Not(Gt(tag, b, a));
2946}
2947
2948template <size_t N>
2949HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a,
2950 Vec128<float, N> b) {
2951 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
2952}
2953template <size_t N>
2954HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a,
2955 Vec128<double, N> b) {
2956 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
2957}
2958
2959} // namespace detail
2960
2961template <typename T, size_t N>
2962HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
2963 return detail::Ge(hwy::TypeTag<T>(), a, b);
2964}
2965
2966#endif // HWY_TARGET <= HWY_AVX3
2967
2968// ------------------------------ Reversed comparisons
2969
2970template <typename T, size_t N>
2971HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
2972 return b > a;
2973}
2974
2975template <typename T, size_t N>
2976HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
2977 return b >= a;
2978}
2979
2980// ------------------------------ Iota (Load)
2981
2982namespace detail {
2983
2984template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
2985HWY_INLINE VFromD<D> Iota0(D /*d*/) {
2986 return VFromD<D>{_mm_set_epi8(
2987 static_cast<char>(15), static_cast<char>(14), static_cast<char>(13),
2988 static_cast<char>(12), static_cast<char>(11), static_cast<char>(10),
2989 static_cast<char>(9), static_cast<char>(8), static_cast<char>(7),
2990 static_cast<char>(6), static_cast<char>(5), static_cast<char>(4),
2991 static_cast<char>(3), static_cast<char>(2), static_cast<char>(1),
2992 static_cast<char>(0))};
2993}
2994
2995template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
2996HWY_INLINE VFromD<D> Iota0(D /*d*/) {
2997 return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4},
2998 int16_t{3}, int16_t{2}, int16_t{1},
2999 int16_t{0})};
3000}
3001
3002#if HWY_HAVE_FLOAT16
3003template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
3004HWY_INLINE VFromD<D> Iota0(D /*d*/) {
3005 return VFromD<D>{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5},
3006 float16_t{4}, float16_t{3}, float16_t{2},
3007 float16_t{1}, float16_t{0})};
3008}
3009#endif // HWY_HAVE_FLOAT16
3010
3011template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3012HWY_INLINE VFromD<D> Iota0(D /*d*/) {
3013 return VFromD<D>{
3014 _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})};
3015}
3016
3017template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3018HWY_INLINE VFromD<D> Iota0(D /*d*/) {
3019 return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})};
3020}
3021
3022template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3023HWY_INLINE VFromD<D> Iota0(D /*d*/) {
3024 return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)};
3025}
3026
3027template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3028HWY_INLINE VFromD<D> Iota0(D /*d*/) {
3029 return VFromD<D>{_mm_set_pd(1.0, 0.0)};
3030}
3031
3032#if HWY_COMPILER_MSVC
3033template <class V, HWY_IF_V_SIZE_V(V, 1)>
3034static HWY_INLINE V MaskOutVec128Iota(V v) {
3035 const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)};
3036 return v & mask_out_mask;
3037}
3038template <class V, HWY_IF_V_SIZE_V(V, 2)>
3039static HWY_INLINE V MaskOutVec128Iota(V v) {
3040#if HWY_TARGET <= HWY_SSE4
3041 return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)};
3042#else
3043 const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)};
3044 return v & mask_out_mask;
3045#endif
3046}
3047template <class V, HWY_IF_V_SIZE_V(V, 4)>
3048static HWY_INLINE V MaskOutVec128Iota(V v) {
3049 const DFromV<decltype(v)> d;
3050 const Repartition<float, decltype(d)> df;
3051 using VF = VFromD<decltype(df)>;
3052 return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)});
3053}
3054template <class V, HWY_IF_V_SIZE_V(V, 8)>
3055static HWY_INLINE V MaskOutVec128Iota(V v) {
3056 const DFromV<decltype(v)> d;
3057 const RebindToUnsigned<decltype(d)> du;
3058 using VU = VFromD<decltype(du)>;
3059 return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)});
3060}
3061template <class V, HWY_IF_V_SIZE_GT_V(V, 8)>
3062static HWY_INLINE V MaskOutVec128Iota(V v) {
3063 return v;
3064}
3065#endif
3066
3067} // namespace detail
3068
3069template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
3070HWY_API VFromD<D> Iota(D d, const T2 first) {
3071 const auto result_iota =
3072 detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
3073#if HWY_COMPILER_MSVC
3074 return detail::MaskOutVec128Iota(result_iota);
3075#else
3076 return result_iota;
3077#endif
3078}
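// Editorial usage sketch (not part of the upstream header):
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 10);  // lanes are {10, 11, 12, 13}
// The MSVC-only MaskOutVec128Iota above appears to work around a codegen
// issue by forcing the lanes beyond a partial vector's width to zero.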
3079
3080// ------------------------------ FirstN (Iota, Lt)
3081
3082template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
3083HWY_API M FirstN(D d, size_t num) {
3084 constexpr size_t kN = MaxLanes(d);
3085 // For AVX3, this ensures `num` <= 255 as required by bzhi, which only looks
3086 // at the lower 8 bits; for AVX2 and below, this ensures `num` fits in TI.
3087 num = HWY_MIN(num, kN);
3088#if HWY_TARGET <= HWY_AVX3
3089#if HWY_ARCH_X86_64
3090 const uint64_t all = (1ull << kN) - 1;
3091 return M::FromBits(_bzhi_u64(all, num));
3092#else
3093 const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1);
3094 return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(num)));
3095#endif // HWY_ARCH_X86_64
3096#else // HWY_TARGET > HWY_AVX3
3097 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
3098 using TI = TFromD<decltype(di)>;
3099 return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
3100#endif // HWY_TARGET <= HWY_AVX3
3101}
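// Editorial usage sketch (not part of the upstream header): FirstN builds the
// remainder mask for loop tails, e.g. with 4 x int32:
//   const Full128<int32_t> d;
//   const auto m = FirstN(d, 3);  // {true, true, true, false}
// On AVX3 this is a single bzhi on the all-ones lane bitmask; otherwise it
// compares detail::Iota0 against num using cheaper signed comparisons.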
3102
3103// ------------------------------ InterleaveLower
3104
3105// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3106// the least-significant lane) and "b". To concatenate two half-width integers
3107// into one, use ZipLower/Upper instead (also works with scalar).
3108
3109template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
3110HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3111 return Vec128<T, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3112}
3113template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3114HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3115 const DFromV<decltype(a)> d;
3116 const RebindToUnsigned<decltype(d)> du;
3117 using VU = VFromD<decltype(du)>; // for float16_t
3118 return BitCast(
3119 d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
3120}
3121template <typename T, size_t N, HWY_IF_UI32(T)>
3122HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3123 return Vec128<T, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3124}
3125template <typename T, size_t N, HWY_IF_UI64(T)>
3126HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3127 return Vec128<T, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3128}
3129
3130template <size_t N>
3131HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
3132 Vec128<float, N> b) {
3133 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3134}
3135template <size_t N>
3136HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
3137 Vec128<double, N> b) {
3138 return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
3139}
3140
3141// Generic for all vector lengths.
3142template <class D>
3143HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
3144 return InterleaveLower(a, b);
3145}
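// Editorial example (not part of the upstream header): for 4 x uint32_t with
// a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} (lane 0 first),
//   InterleaveLower(a, b) == {a0, b0, a1, b1}.
// The D-tagged overload directly above lets generic code also pass a
// descriptor; for 128-bit vectors it simply forwards to the two-argument form.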
3146
3147// ================================================== MEMORY (2)
3148
3149// ------------------------------ MaskedLoad
3150
3151#if HWY_TARGET <= HWY_AVX3
3152
3153template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3154HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3155 const TFromD<D>* HWY_RESTRICT p) {
3156 return VFromD<D>{_mm_maskz_loadu_epi8(m.raw, p)};
3157}
3158
3159template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3160HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
3161 const TFromD<D>* HWY_RESTRICT p) {
3162 const RebindToUnsigned<decltype(d)> du; // for float16_t
3163 return BitCast(d, VFromD<decltype(du)>{_mm_maskz_loadu_epi16(m.raw, p)});
3164}
3165
3166template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3167HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3168 const TFromD<D>* HWY_RESTRICT p) {
3169 return VFromD<D>{_mm_maskz_loadu_epi32(m.raw, p)};
3170}
3171
3172template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3173HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3174 const TFromD<D>* HWY_RESTRICT p) {
3175 return VFromD<D>{_mm_maskz_loadu_epi64(m.raw, p)};
3176}
3177
3178template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3179HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3180 const float* HWY_RESTRICT p) {
3181 return VFromD<D>{_mm_maskz_loadu_ps(m.raw, p)};
3182}
3183
3184template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3185HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3186 const double* HWY_RESTRICT p) {
3187 return VFromD<D>{_mm_maskz_loadu_pd(m.raw, p)};
3188}
3189
3190template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3191HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3192 const TFromD<D>* HWY_RESTRICT p) {
3193 return VFromD<D>{_mm_mask_loadu_epi8(v.raw, m.raw, p)};
3194}
3195
3196template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3197HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
3198 const TFromD<D>* HWY_RESTRICT p) {
3199 const RebindToUnsigned<decltype(d)> du; // for float16_t
3200 return BitCast(d, VFromD<decltype(du)>{
3201 _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
3202}
3203
3204template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3205HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3206 const TFromD<D>* HWY_RESTRICT p) {
3207 return VFromD<D>{_mm_mask_loadu_epi32(v.raw, m.raw, p)};
3208}
3209
3210template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3211HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3212 const TFromD<D>* HWY_RESTRICT p) {
3213 return VFromD<D>{_mm_mask_loadu_epi64(v.raw, m.raw, p)};
3214}
3215
3216template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3217HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3218 const float* HWY_RESTRICT p) {
3219 return VFromD<D>{_mm_mask_loadu_ps(v.raw, m.raw, p)};
3220}
3221
3222template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3223HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3224 const double* HWY_RESTRICT p) {
3225 return VFromD<D>{_mm_mask_loadu_pd(v.raw, m.raw, p)};
3226}
3227
3228#elif HWY_TARGET == HWY_AVX2
3229
3230template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3231HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3232 const TFromD<D>* HWY_RESTRICT p) {
3233 auto p_p = reinterpret_cast<const int*>(p); // NOLINT
3234 return VFromD<D>{_mm_maskload_epi32(p_p, m.raw)};
3235}
3236
3237template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3238HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
3239 const TFromD<D>* HWY_RESTRICT p) {
3240 auto p_p = reinterpret_cast<const long long*>(p); // NOLINT
3241 return VFromD<D>{_mm_maskload_epi64(p_p, m.raw)};
3242}
3243
3244template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3245HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const float* HWY_RESTRICT p) {
3246 const RebindToSigned<decltype(d)> di;
3247 return VFromD<D>{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)};
3248}
3249
3250template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3251HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const double* HWY_RESTRICT p) {
3252 const RebindToSigned<decltype(d)> di;
3253 return VFromD<D>{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)};
3254}
3255
3256// There is no maskload_epi8/16, so blend instead.
3257template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
3258 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3259HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
3260 const TFromD<D>* HWY_RESTRICT p) {
3261 return IfThenElseZero(m, LoadU(d, p));
3262}
3263
3264#else // <= SSE4
3265
3266// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
3267template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3268HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
3269 const TFromD<D>* HWY_RESTRICT p) {
3270 return IfThenElseZero(m, LoadU(d, p));
3271}
3272
3273#endif
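// Editorial usage sketch (not part of the upstream header): MaskedLoad is
// typically paired with FirstN for loop tails. `count` and `ptr` below are
// caller-provided placeholders:
//   const Full128<float> d;
//   const auto m = FirstN(d, count);       // count <= Lanes(d)
//   const auto v = MaskedLoad(m, d, ptr);  // masked-off lanes are zero
// Note that the SSE2/SSSE3/SSE4 fallback above performs a full unaligned load
// before zeroing, so on those targets `ptr` must point to a whole vector's
// worth of readable memory.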
3274
3275// ------------------------------ MaskedLoadOr
3276
3277#if HWY_TARGET > HWY_AVX3 // else: native
3278
3279// Generic for all vector lengths.
3280template <class D>
3281HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
3282 const TFromD<D>* HWY_RESTRICT p) {
3283 return IfThenElse(m, LoadU(d, p), v);
3284}
3285
3286#endif // HWY_TARGET > HWY_AVX3
3287
3288// ------------------------------ LoadN (InterleaveLower)
3289
3290#if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
3291
3292#ifdef HWY_NATIVE_LOAD_N
3293#undef HWY_NATIVE_LOAD_N
3294#else
3295#define HWY_NATIVE_LOAD_N
3296#endif
3297
3298// Generic for all vector lengths.
3299template <class D, HWY_IF_T_SIZE_ONE_OF_D(
3300 D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
3301 (1 << 4) | (1 << 8))>
3302HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
3303 size_t num_lanes) {
3304 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
3305 d_full;
3306 return ResizeBitCast(d, MaskedLoad(FirstN(d_full, num_lanes), d_full, p));
3307}
3308
3309// Generic for all vector lengths.
3310template <class D, HWY_IF_T_SIZE_ONE_OF_D(
3311 D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
3312 (1 << 4) | (1 << 8))>
3313HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
3314 size_t num_lanes) {
3315 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
3316 d_full;
3317 return ResizeBitCast(d, MaskedLoadOr(ResizeBitCast(d_full, no),
3318 FirstN(d_full, num_lanes), d_full, p));
3319}
3320
3321#if HWY_TARGET > HWY_AVX3
3322namespace detail {
3323
3324// 'Leading' means the part that fits in 32-bit lanes. With 2-byte vectors,
3325// there are none, so return the remainder (v_trailing).
3326template <class D, HWY_IF_V_SIZE_LE_D(D, 2)>
3327HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(
3328 VFromD<D> /*load_mask*/, D /*d*/, const TFromD<D>* HWY_RESTRICT /*p*/,
3329 VFromD<D> v_trailing) {
3330 return v_trailing;
3331}
3332
3333template <class D, HWY_IF_V_SIZE_LE_D(D, 2)>
3334HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(
3335 VFromD<D> /*no*/, VFromD<D> /*load_mask*/, D /*d*/,
3336 const TFromD<D>* HWY_RESTRICT /*p*/, VFromD<D> v_trailing) {
3337 return v_trailing;
3338}
3339
3340template <class D, HWY_IF_V_SIZE_GT_D(D, 2)>
3341HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(VFromD<D> load_mask, D d,
3342 const TFromD<D>* HWY_RESTRICT p,
3343 VFromD<D> v_trailing) {
3344 using DI32 = Repartition<int32_t, D>;
3345 const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full;
3346
3347 // ResizeBitCast of load_mask to di32_full is okay below: even when
3348 // d.MaxBytes() < di32_full.MaxBytes(), any lanes of load_mask.raw past the
3349 // first (lowest-index) lanes have already been zeroed out by FirstN, so the
3350 // widened mask selects no extra lanes.
3351 return ResizeBitCast(
3352 d, IfVecThenElse(
3353 ResizeBitCast(di32_full, load_mask),
3354 MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)),
3355 di32_full, reinterpret_cast<const int32_t*>(p)),
3356 ResizeBitCast(di32_full, v_trailing)));
3357}
3358
3359template <class D, HWY_IF_V_SIZE_GT_D(D, 2)>
3360HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(VFromD<D> no,
3361 VFromD<D> load_mask, D d,
3362 const TFromD<D>* HWY_RESTRICT p,
3363 VFromD<D> v_trailing) {
3364 using DI32 = Repartition<int32_t, D>;
3365 const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full;
3366
3367 // ResizeBitCast of load_mask to di32 is okay below if
3368 // d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past
3369 // the first (lowest-index) lanes of load_mask.raw will have already been
3370 // zeroed out by FirstN.
3371  return ResizeBitCast(
3372      d, IfVecThenElse(
3373             ResizeBitCast(di32_full, load_mask),
3374 MaskedLoadOr(ResizeBitCast(di32_full, no),
3375 MaskFromVec(ResizeBitCast(di32_full, load_mask)),
3376 di32_full, reinterpret_cast<const int32_t*>(p)),
3377 ResizeBitCast(di32_full, v_trailing)));
3378}
3379
3380// Single lane: load or default value.
3381template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
3382 HWY_IF_LANES_D(D, 1)>
3383HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d,
3384 const TFromD<D>* HWY_RESTRICT p,
3385 size_t num_lanes) {
3386 return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
3387}
3388
3389template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
3390 HWY_IF_LANES_D(D, 1)>
3391HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
3392 VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p,
3393 size_t num_lanes) {
3394 return (num_lanes > 0) ? LoadU(d, p) : no;
3395}
3396
3397// Two lanes: load 1, 2, or default.
3398template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
3399HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d,
3400 const TFromD<D>* HWY_RESTRICT p,
3401 size_t num_lanes) {
3402 if (num_lanes > 1) {
3403 return LoadU(d, p);
3404 } else {
3405 const FixedTag<TFromD<D>, 1> d1;
3406 return (num_lanes == 1) ? ResizeBitCast(d, LoadU(d1, p)) : Zero(d);
3407 }
3408}
3409
3410template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
3411HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
3412 VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p,
3413 size_t num_lanes) {
3414 if (num_lanes > 1) {
3415 return LoadU(d, p);
3416 } else {
3417 if (num_lanes == 0) return no;
3418 // Load one, upper lane is default.
3419 const FixedTag<TFromD<D>, 1> d1;
3420 return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
3421 }
3422}
3423
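// For vectors with more than two u8 lanes, the trailing 1..3 bytes that do not
// fill a whole 32-bit lane are assembled from a broadcast of the last byte
// (masked by load_mask) and, if at least two trailing bytes remain, a 16-bit
// load patched in via the sign bits of load_mask. The leading whole 32-bit
// lanes are filled later by AVX2UIF8Or16LoadLeadingN using a masked load.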
3424template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
3425HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d,
3426 const TFromD<D>* HWY_RESTRICT p,
3427 size_t num_lanes) {
3428 const size_t trailing_n = num_lanes & 3;
3429 if (trailing_n == 0) return Zero(d);
3430
3431 VFromD<D> v_trailing = And(load_mask, Set(d, p[num_lanes - 1]));
3432
3433 if ((trailing_n & 2) != 0) {
3434 const Repartition<int16_t, decltype(d)> di16;
3435 int16_t i16_bits;
3436 CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits);
3437 v_trailing = BitCast(
3438 d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits),
3439 BitCast(di16, v_trailing)));
3440 }
3441
3442 return v_trailing;
3443}
3444
3445template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
3446HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
3447 VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p,
3448 size_t num_lanes) {
3449 const size_t trailing_n = num_lanes & 3;
3450 if (trailing_n == 0) return no;
3451
3452 VFromD<D> v_trailing = IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no);
3453
3454 if ((trailing_n & 2) != 0) {
3455 const Repartition<int16_t, decltype(d)> di16;
3456 int16_t i16_bits;
3457 CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits);
3458 v_trailing = BitCast(
3459 d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits),
3460 BitCast(di16, v_trailing)));
3461 }
3462
3463 return v_trailing;
3464}
3465
3466template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
3467HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d,
3468 const TFromD<D>* HWY_RESTRICT p,
3469 size_t num_lanes) {
3470 if ((num_lanes & 1) != 0) {
3471 return And(load_mask, Set(d, p[num_lanes - 1]));
3472 } else {
3473 return Zero(d);
3474 }
3475}
3476
3477template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
3478HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
3479 VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p,
3480 size_t num_lanes) {
3481 if ((num_lanes & 1) != 0) {
3482 return IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no);
3483 } else {
3484 return no;
3485 }
3486}
3487
3488} // namespace detail
3489
3490// Generic for all vector lengths.
3491template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3492HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, size_t N) {
3493 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
3494 d_full;
3495
3496 const VFromD<D> load_mask =
3497 ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
3498 const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
3499 const VFromD<D> v_trailing =
3500 detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes);
3501
3502#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
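  // If the compiler can prove that fewer elements than fit in one 32-bit lane
  // are requested, the leading masked load below cannot contribute any lanes,
  // so return just the trailing part.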
3503 if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
3504 num_lanes < (4 / sizeof(TFromD<D>))) {
3505 return v_trailing;
3506 }
3507#endif
3508
3509 return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing);
3510}
3511
3512// Generic for all vector lengths.
3513template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3514HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
3515 size_t N) {
3516 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
3517 d_full;
3518
3519 const VFromD<D> load_mask =
3520 ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
3521 const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
3522 const VFromD<D> v_trailing =
3523 detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes);
3524
3525#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
3526 if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
3527 num_lanes < (4 / sizeof(TFromD<D>))) {
3528 return v_trailing;
3529 }
3530#endif
3531
3532 return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing);
3533}
3534
3535#endif // HWY_TARGET > HWY_AVX3
3536#endif // HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
3537
3538// ------------------------------ BlendedStore
3539
3540namespace detail {
3541
3542// There is no maskload_epi8/16 with which we could safely implement
3543// BlendedStore. Manual blending is also unsafe because loading a full vector
3544// that crosses the array end causes asan faults. Resort to scalar code; the
3545// caller should instead use memcpy, assuming m is FirstN(d, n).
3546template <class D>
3547HWY_INLINE void ScalarMaskedStore(VFromD<D> v, MFromD<D> m, D d,
3548                                  TFromD<D>* HWY_RESTRICT p) {
3549  const RebindToSigned<decltype(d)> di;  // for testing mask if T=bfloat16_t.
3550 using TI = TFromD<decltype(di)>;
3551 alignas(16) TI buf[MaxLanes(d)];
3552 alignas(16) TI mask[MaxLanes(d)];
3553 Store(BitCast(di, v), di, buf);
3554 Store(BitCast(di, VecFromMask(d, m)), di, mask);
3555 for (size_t i = 0; i < MaxLanes(d); ++i) {
3556 if (mask[i]) {
3557 CopySameSize(buf + i, p + i);
3558 }
3559 }
3560}
3561} // namespace detail
3562
3563#if HWY_TARGET <= HWY_AVX3
3564
3565template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3566HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3567                          TFromD<D>* HWY_RESTRICT p) {
3568  _mm_mask_storeu_epi8(p, m.raw, v.raw);
3569}
3570template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3571HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3572 TFromD<D>* HWY_RESTRICT p) {
3573 const RebindToUnsigned<decltype(d)> du; // for float16_t
3574 _mm_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p), RebindMask(du, m).raw,
3575 BitCast(du, v).raw);
3576}
3577
3578template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
3579HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3580 TFromD<D>* HWY_RESTRICT p) {
3581 auto pi = reinterpret_cast<int*>(p); // NOLINT
3582 _mm_mask_storeu_epi32(pi, m.raw, v.raw);
3583}
3584
3585template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
3586HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
3587 TFromD<D>* HWY_RESTRICT p) {
3588 auto pi = reinterpret_cast<long long*>(p); // NOLINT
3589 _mm_mask_storeu_epi64(pi, m.raw, v.raw);
3590}
3591
3592template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
3593HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, float* HWY_RESTRICT p) {
3594  _mm_mask_storeu_ps(p, m.raw, v.raw);
3595}
3596
3597template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
3598HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, double* HWY_RESTRICT p) {
3599  _mm_mask_storeu_pd(p, m.raw, v.raw);
3600}
3601
3602#elif HWY_TARGET == HWY_AVX2
3603
3604template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
3605 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3606HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3607 TFromD<D>* HWY_RESTRICT p) {
3608 detail::ScalarMaskedStore(v, m, d, p);
3609}
3610
3611namespace detail {
3612
3613template <class D, class V, class M, HWY_IF_UI32_D(D)>
3614HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) {
3615 auto pi = reinterpret_cast<int*>(p); // NOLINT
3616 _mm_maskstore_epi32(pi, m.raw, v.raw);
3617}
3618
3619template <class D, class V, class M, HWY_IF_UI64_D(D)>
3620HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) {
3621 auto pi = reinterpret_cast<long long*>(p); // NOLINT
3622 _mm_maskstore_epi64(pi, m.raw, v.raw);
3623}
3624
3625template <class D, class V, class M, HWY_IF_F32_D(D)>
3626HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) {
3627 _mm_maskstore_ps(p, m.raw, v.raw);
3628}
3629
3630template <class D, class V, class M, HWY_IF_F64_D(D)>
3631HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) {
3632 _mm_maskstore_pd(p, m.raw, v.raw);
3633}
3634
3635} // namespace detail
3636
3637template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
3638 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
3639HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3640 TFromD<D>* HWY_RESTRICT p) {
3641 const RebindToSigned<decltype(d)> di;
3642 // For partial vectors, avoid writing other lanes by zeroing their mask.
3643 if (d.MaxBytes() < 16) {
3644 const Full128<TFromD<D>> dfull;
3645 const Mask128<TFromD<D>> mfull{m.raw};
3646 m = MFromD<D>{And(mfull, FirstN(dfull, MaxLanes(d))).raw};
3647 }
3648
3649 // Float/double require, and unsigned ints tolerate, signed int masks.
3650 detail::NativeBlendedStore<D>(v, RebindMask(di, m), p);
3651}
3652
3653#else // <= SSE4
3654
3655template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3656HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
3657 TFromD<D>* HWY_RESTRICT p) {
3658 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
3659 detail::ScalarMaskedStore(v, m, d, p);
3660}
3661
3662#endif // SSE4
3663
3664// ================================================== ARITHMETIC
3665
3666// ------------------------------ Addition
3667
3668// Unsigned
3669template <size_t N>
3670HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
3671 const Vec128<uint8_t, N> b) {
3672 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
3673}
3674template <size_t N>
3675HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
3676 const Vec128<uint16_t, N> b) {
3677 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
3678}
3679template <size_t N>
3680HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
3681 const Vec128<uint32_t, N> b) {
3682 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
3683}
3684template <size_t N>
3685HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
3686 const Vec128<uint64_t, N> b) {
3687 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
3688}
3689
3690// Signed
3691template <size_t N>
3692HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
3693 const Vec128<int8_t, N> b) {
3694 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
3695}
3696template <size_t N>
3697HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
3698 const Vec128<int16_t, N> b) {
3699 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
3700}
3701template <size_t N>
3702HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
3703 const Vec128<int32_t, N> b) {
3704 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
3705}
3706template <size_t N>
3707HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
3708 const Vec128<int64_t, N> b) {
3709 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
3710}
3711
3712// Float
3713#if HWY_HAVE_FLOAT16
3714template <size_t N>
3715HWY_API Vec128<float16_t, N> operator+(const Vec128<float16_t, N> a,
3716 const Vec128<float16_t, N> b) {
3717 return Vec128<float16_t, N>{_mm_add_ph(a.raw, b.raw)};
3718}
3719#endif // HWY_HAVE_FLOAT16
3720template <size_t N>
3721HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
3722 const Vec128<float, N> b) {
3723 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
3724}
3725template <size_t N>
3726HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
3727 const Vec128<double, N> b) {
3728 return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
3729}
3730
3731// ------------------------------ Subtraction
3732
3733// Unsigned
3734template <size_t N>
3735HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
3736 const Vec128<uint8_t, N> b) {
3737 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
3738}
3739template <size_t N>
3740HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
3741 Vec128<uint16_t, N> b) {
3742 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
3743}
3744template <size_t N>
3745HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
3746 const Vec128<uint32_t, N> b) {
3747 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
3748}
3749template <size_t N>
3750HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
3751 const Vec128<uint64_t, N> b) {
3752 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
3753}
3754
3755// Signed
3756template <size_t N>
3757HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
3758 const Vec128<int8_t, N> b) {
3759 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
3760}
3761template <size_t N>
3762HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
3763 const Vec128<int16_t, N> b) {
3764 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
3765}
3766template <size_t N>
3767HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
3768 const Vec128<int32_t, N> b) {
3769 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
3770}
3771template <size_t N>
3772HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
3773 const Vec128<int64_t, N> b) {
3774 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
3775}
3776
3777// Float
3778#if HWY_HAVE_FLOAT16
3779template <size_t N>
3780HWY_API Vec128<float16_t, N> operator-(const Vec128<float16_t, N> a,
3781 const Vec128<float16_t, N> b) {
3782 return Vec128<float16_t, N>{_mm_sub_ph(a.raw, b.raw)};
3783}
3784#endif // HWY_HAVE_FLOAT16
3785template <size_t N>
3786HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
3787 const Vec128<float, N> b) {
3788 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
3789}
3790template <size_t N>
3791HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
3792 const Vec128<double, N> b) {
3793 return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
3794}
3795
3796// ------------------------------ AddSub
3797
3798#if HWY_TARGET <= HWY_SSSE3
3799
3800#undef HWY_IF_ADDSUB_V
3801#define HWY_IF_ADDSUB_V(V) \
3802 HWY_IF_V_SIZE_GT_V( \
3803 V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>)))
3804
3805template <size_t N, HWY_IF_LANES_GT(N, 1)>
3806HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
3807  return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
3808}
3809HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
3810  return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)};
3811}
3812#endif // HWY_TARGET <= HWY_SSSE3
3813
3814// ------------------------------ SumsOf8
3815template <size_t N>
3816HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
3817 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
3818}
3819
3820// Generic for all vector lengths
3821template <class V, HWY_IF_I8_D(DFromV<V>)>
3822HWY_API VFromD<Repartition<int64_t, DFromV<V>>> SumsOf8(V v) {
3823  const DFromV<decltype(v)> d;
3824 const RebindToUnsigned<decltype(d)> du;
3825 const Repartition<int64_t, decltype(d)> di64;
3826
3827 // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
3828  // of v (which is the same as a bitwise XOR of each i8 lane with 128) and then
3829  // bitcasting the Xor result to a u8 vector.
3830 const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
3831
3832  // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj)
3833  // operation to undo the adjustment above (8 lanes * 128 bias = 1024).
3834 return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024});
3835}
3836
3837#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3838#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3839#else
3840#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3841#endif
3842
3843template <size_t N>
3844HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a,
3845                                          const Vec128<uint8_t, N> b) {
3846 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)};
3847}
3848
3849// Generic for all vector lengths
3850template <class V, HWY_IF_I8_D(DFromV<V>)>
3851HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
3852  const DFromV<V> d;
3853 const RebindToUnsigned<decltype(d)> du;
3854 const RepartitionToWideX3<decltype(d)> di64;
3855
3856 // Adjust the values of a and b to be in the 0..255 range by adding 128 to
3857  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
3858  // with 128) and then bitcasting the results of the Xor operations to u8
3859 // vectors.
3860 const auto i8_msb = SignBit(d);
3861 const auto a_adj = BitCast(du, Xor(a, i8_msb));
3862 const auto b_adj = BitCast(du, Xor(b, i8_msb));
3863
3864 // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an
3865 // i64 vector as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true
3866 return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj));
3867}
3868
3869// ------------------------------ SumsOf4
3870#if HWY_TARGET <= HWY_AVX3
3871namespace detail {
3872
3873template <size_t N>
3874HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4(
3875    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/,
3876    Vec128<uint8_t, N> v) {
3877  const DFromV<decltype(v)> d;
3878
3879 // _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
3880 // zeroed out and the sums of the 4 consecutive lanes are already in the
3881 // even uint16_t lanes of the _mm_maskz_dbsad_epu8 result.
3882 return Vec128<uint32_t, (N + 3) / 4>{
3883 _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)};
3884}
3885
3886// detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h
3887
3888} // namespace detail
3889#endif // HWY_TARGET <= HWY_AVX3
3890
3891// ------------------------------ SumsOfAdjQuadAbsDiff
3892
3893#if HWY_TARGET <= HWY_SSE4
3894#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3895#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3896#else
3897#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3898#endif
3899
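// Note on the _mm_mpsadbw_epu8 immediate used below: bits [1:0] (kBOffset)
// select which aligned 4-byte block of `b` is the fixed quadruplet, and bit 2
// (kAOffset) selects whether the sliding 11-byte window in `a` starts at byte
// 0 or byte 4. Each of the eight u16 results is the sum of absolute
// differences of that block against the window, advanced by one byte per
// result.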
3900template <int kAOffset, int kBOffset, size_t N>
3901HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff(
3902    Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
3903  static_assert(0 <= kAOffset && kAOffset <= 1,
3904 "kAOffset must be between 0 and 1");
3905 static_assert(0 <= kBOffset && kBOffset <= 3,
3906 "kBOffset must be between 0 and 3");
3907 return Vec128<uint16_t, (N + 1) / 2>{
3908 _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)};
3909}
3910
3911// Generic for all vector lengths
3912template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)>
3913HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) {
3914  const DFromV<decltype(a)> d;
3915 const RebindToUnsigned<decltype(d)> du;
3916 const RepartitionToWide<decltype(d)> dw;
3917
3918 // Adjust the values of a and b to be in the 0..255 range by adding 128 to
3919  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
3920  // with 128) and then bitcasting the results of the Xor operations to u8
3921 // vectors.
3922 const auto i8_msb = SignBit(d);
3923 const auto a_adj = BitCast(du, Xor(a, i8_msb));
3924 const auto b_adj = BitCast(du, Xor(b, i8_msb));
3925
3926 // The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can
3927 // simply be bitcasted to an i16 vector as
3928 // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
3929 return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj));
3930}
3931#endif
3932
3933// ------------------------------ SumsOfShuffledQuadAbsDiff
3934
3935#if HWY_TARGET <= HWY_AVX3
3936#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3937#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3938#else
3939#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3940#endif
3941
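// Note on _mm_dbsad_epu8 below: the second operand (here `a`) is first
// permuted at 32-bit granularity by the _MM_SHUFFLE immediate, and sums of
// absolute differences of byte quadruplets are then formed against the first
// operand (here `b`).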
3942template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N>
3943HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff(
3944    Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
3945  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
3946 static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
3947 static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
3948 static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
3949 return Vec128<uint16_t, (N + 1) / 2>{
3950 _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
3951}
3952
3953// Generic for all vector lengths
3954template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V,
3955             HWY_IF_I8_D(DFromV<V>)>
3956HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a,
3957                                                              V b) {
3958 const DFromV<decltype(a)> d;
3959 const RebindToUnsigned<decltype(d)> du;
3960 const RepartitionToWide<decltype(d)> dw;
3961
3962 // Adjust the values of a and b to be in the 0..255 range by adding 128 to
3963  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
3964  // with 128) and then bitcasting the results of the Xor operations to u8
3965 // vectors.
3966 const auto i8_msb = SignBit(d);
3967 const auto a_adj = BitCast(du, Xor(a, i8_msb));
3968 const auto b_adj = BitCast(du, Xor(b, i8_msb));
3969
3970 // The result of
3971 // SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can
3972 // simply be bitcasted to an i16 vector as
3973 // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
3974 return BitCast(
3975 dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj));
3976}
3977#endif
3978
3979// ------------------------------ SaturatedAdd
3980
3981// Returns a + b clamped to the destination range.
3982
3983// Unsigned
3984template <size_t N>
3985HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
3986 const Vec128<uint8_t, N> b) {
3987 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
3988}
3989template <size_t N>
3990HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
3991 const Vec128<uint16_t, N> b) {
3992 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
3993}
3994
3995// Signed
3996template <size_t N>
3997HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
3998 const Vec128<int8_t, N> b) {
3999 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
4000}
4001template <size_t N>
4002HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
4003 const Vec128<int16_t, N> b) {
4004 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
4005}
4006
4007#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
4008#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
4009#undef HWY_NATIVE_I32_SATURATED_ADDSUB
4010#else
4011#define HWY_NATIVE_I32_SATURATED_ADDSUB
4012#endif
4013
4014#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
4015#undef HWY_NATIVE_I64_SATURATED_ADDSUB
4016#else
4017#define HWY_NATIVE_I64_SATURATED_ADDSUB
4018#endif
4019
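// Note on the ternary-logic constants below: with per-bit inputs (a, b, sum),
// LUT 0x42 evaluates ~(a ^ b) & (a ^ sum), so its sign bit is set exactly when
// a and b share a sign that differs from the sum, i.e. on signed overflow.
// LUT 0x55 is the bitwise NOT of its (here identical) operands, so the masked
// ternarylogic writes ~LimitsMax() == LimitsMin() into lanes where a is
// negative and keeps LimitsMax() elsewhere, which is the saturated result.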
4020template <size_t N>
4021HWY_API Vec128<int32_t, N> SaturatedAdd(Vec128<int32_t, N> a,
4022                                        Vec128<int32_t, N> b) {
4023  const DFromV<decltype(a)> d;
4024 const auto sum = a + b;
4025 const auto overflow_mask = MaskFromVec(
4026 Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)});
4027 const auto i32_max = Set(d, LimitsMax<int32_t>());
4028 const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32(
4029 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
4030 return IfThenElse(overflow_mask, overflow_result, sum);
4031}
4032
4033template <size_t N>
4034HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a,
4035                                        Vec128<int64_t, N> b) {
4036  const DFromV<decltype(a)> d;
4037 const auto sum = a + b;
4038 const auto overflow_mask = MaskFromVec(
4039 Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)});
4040 const auto i64_max = Set(d, LimitsMax<int64_t>());
4041 const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64(
4042 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
4043 return IfThenElse(overflow_mask, overflow_result, sum);
4044}
4045#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
4046
4047// ------------------------------ SaturatedSub
4048
4049// Returns a - b clamped to the destination range.
4050
4051// Unsigned
4052template <size_t N>
4053HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
4054 const Vec128<uint8_t, N> b) {
4055 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
4056}
4057template <size_t N>
4058HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
4059 const Vec128<uint16_t, N> b) {
4060 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
4061}
4062
4063// Signed
4064template <size_t N>
4065HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
4066 const Vec128<int8_t, N> b) {
4067 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
4068}
4069template <size_t N>
4070HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
4071 const Vec128<int16_t, N> b) {
4072 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
4073}
4074
4075#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
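// As in SaturatedAdd above, LUT 0x18 evaluates (a ^ b) & (a ^ diff): its sign
// bit flags lanes where a and b have different signs and the difference has
// the sign of b, i.e. signed subtraction overflow.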
4076template <size_t N>
4077HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a,
4078                                        Vec128<int32_t, N> b) {
4079  const DFromV<decltype(a)> d;
4080 const auto diff = a - b;
4081 const auto overflow_mask = MaskFromVec(
4082 Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)});
4083 const auto i32_max = Set(d, LimitsMax<int32_t>());
4084 const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32(
4085 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
4086 return IfThenElse(overflow_mask, overflow_result, diff);
4087}
4088
4089template <size_t N>
4090HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a,
4091                                        Vec128<int64_t, N> b) {
4092  const DFromV<decltype(a)> d;
4093 const auto diff = a - b;
4094 const auto overflow_mask = MaskFromVec(
4095 Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)});
4096 const auto i64_max = Set(d, LimitsMax<int64_t>());
4097 const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64(
4098 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
4099 return IfThenElse(overflow_mask, overflow_result, diff);
4100}
4101#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
4102
4103// ------------------------------ AverageRound
4104
4105// Returns (a + b + 1) / 2
4106
4107// Unsigned
4108template <size_t N>
4109HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
4110 const Vec128<uint8_t, N> b) {
4111 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
4112}
4113template <size_t N>
4114HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
4115 const Vec128<uint16_t, N> b) {
4116 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
4117}
4118
4119// ------------------------------ Integer multiplication
4120
4121template <size_t N>
4122HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
4123 const Vec128<uint16_t, N> b) {
4124 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
4125}
4126template <size_t N>
4127HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
4128 const Vec128<int16_t, N> b) {
4129 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
4130}
4131
4132// Returns the upper sizeof(T)*8 bits of a * b in each lane.
4133template <size_t N>
4134HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
4135 const Vec128<uint16_t, N> b) {
4136 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
4137}
4138template <size_t N>
4139HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
4140 const Vec128<int16_t, N> b) {
4141 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
4142}
4143
4144template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
4145 HWY_IF_LANES_D(DFromV<V>, 1)>
4146HWY_API V MulHigh(V a, V b) {
4147 const DFromV<decltype(a)> d;
4148 const Full128<TFromD<decltype(d)>> d_full;
4149 return ResizeBitCast(
4150 d, Slide1Down(d_full, ResizeBitCast(d_full, MulEven(a, b))));
4151}
4152
4153// I8/U8/I32/U32 MulHigh is generic for all vector lengths >= 2 lanes
4154template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
4155 HWY_IF_LANES_GT_D(DFromV<V>, 1)>
4156HWY_API V MulHigh(V a, V b) {
4157 const DFromV<decltype(a)> d;
4158
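  // MulEven/MulOdd produce full double-width products. After bitcasting back
  // to the narrow type, the upper half of each product lands in the odd narrow
  // lane, so interleaving the odd lanes of both results reassembles MulHigh in
  // the original lane order.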
4159 const auto p_even = BitCast(d, MulEven(a, b));
4160 const auto p_odd = BitCast(d, MulOdd(a, b));
4161 return InterleaveOdd(d, p_even, p_odd);
4162}
4163
4164// Multiplies even lanes (0, 2 ..) and places the double-wide result into
4165// even and the upper half into its odd neighbor lane.
4166template <class V, HWY_IF_U8_D(DFromV<V>)>
4167HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
4168  const DFromV<decltype(a)> d;
4169 const RepartitionToWide<decltype(d)> dw;
4170 const auto lo8_mask = Set(dw, uint16_t{0x00FF});
4171 return And(ResizeBitCast(dw, a), lo8_mask) *
4172 And(ResizeBitCast(dw, b), lo8_mask);
4173}
4174
4175template <class V, HWY_IF_I8_D(DFromV<V>)>
4176HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
4177 const DFromV<decltype(a)> d;
4178 const RepartitionToWide<decltype(d)> dw;
4179 return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) *
4180 ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b)));
4181}
4182
4183template <class V, HWY_IF_UI16_D(DFromV<V>)>
4184HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
4185 const DFromV<decltype(a)> d;
4186 const RepartitionToWide<decltype(d)> dw;
4187 const RepartitionToNarrow<decltype(dw)> dw_as_d16;
4188
4189 const auto lo = ResizeBitCast(dw, a * b);
4190 const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b)));
4191 return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
4192}
4193
4194template <size_t N>
4195HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
4196 const Vec128<uint32_t, N> b) {
4197 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
4198}
4199
4200template <size_t N>
4201HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
4202 const Vec128<int32_t, N> b) {
4203#if HWY_TARGET >= HWY_SSSE3
4204 const DFromV<decltype(a)> d;
4205 const RepartitionToWide<decltype(d)> dw;
4206 const RebindToUnsigned<decltype(d)> du;
4207
4208 // p[i] = (((a[i] >> 31) * (a[i] >> 31)) << 64) +
4209 // (((a[i] >> 31) * b[i]) << 32) +
4210 // (((b[i] >> 31) * a[i]) << 32) +
4211 // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF}))
4212
4213 // ((a[i] >> 31) * (a[i] >> 31)) << 64 does not need to be computed as the
4214 // lower 64 bits of ((a[i] >> 31) * (a[i] >> 31)) << 64 is zero.
4215
4216 // (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) ==
4217 // -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32)
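  // (This holds because a[i] >> 31 is either 0 or -1, so
  //  (a[i] >> 31) * b[i] == -((a[i] >> 31) & b[i]), and likewise for b.)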
4218
4219 // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be
4220 // computed using MulEven(BitCast(du, a), BitCast(du, b))
4221
4222 const auto neg_p_hi = ShiftLeft<32>(
4223 ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a)));
4224 const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b)));
4225 return p_lo - neg_p_hi;
4226#else
4227 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
4228#endif
4229}
4230
4231template <class V, HWY_IF_T_SIZE_V(V, 1)>
4232HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
4233  const DFromV<decltype(a)> d;
4234 const RepartitionToWide<decltype(d)> dw;
4235 return ShiftRight<8>(ResizeBitCast(dw, a)) *
4236 ShiftRight<8>(ResizeBitCast(dw, b));
4237}
4238
4239template <class V, HWY_IF_UI16_D(DFromV<V>)>
4240HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
4241  const DFromV<decltype(a)> d;
4242 const RepartitionToWide<decltype(d)> dw;
4243 const RebindToUnsigned<decltype(dw)> dw_u;
4244 const RepartitionToNarrow<decltype(dw)> dw_as_d16;
4245
4246 const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b)));
4247 const auto hi = ResizeBitCast(dw, MulHigh(a, b));
4248 return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
4249}
4250
4251template <class V, HWY_IF_UI32_D(DFromV<V>)>
4252HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
4253  return MulEven(DupOdd(a), DupOdd(b));
4254}
4255
4256template <size_t N>
4257HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
4258 const Vec128<uint32_t, N> b) {
4259#if HWY_TARGET >= HWY_SSSE3
4260 // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
4261 // 64-bit right shift would also work but also needs port 5, so no benefit.
4262 // Notation: x=don't care, z=0.
4263 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
4264 const auto mullo_x2x0 = MulEven(a, b);
4265 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
4266 const auto mullo_x3x1 =
4267 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
4268 // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
4269 // the latter requires one more instruction or a constant.
4270 const __m128i mul_20 =
4271 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
4272 const __m128i mul_31 =
4273 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
4274 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
4275#else
4276 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
4277#endif
4278}
4279
4280template <size_t N>
4281HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
4282 const Vec128<int32_t, N> b) {
4283 // Same as unsigned; avoid duplicating the SSSE3 code.
4284 const DFromV<decltype(a)> d;
4285 const RebindToUnsigned<decltype(d)> du;
4286 return BitCast(d, BitCast(du, a) * BitCast(du, b));
4287}
4288
4289// ------------------------------ RotateRight (ShiftRight, Or)
4290
4291// The U8 RotateRight implementation for AVX3_DL lives in x86_512-inl.h,
4292// because U8 RotateRight uses detail::GaloisAffine on AVX3_DL.
4293
4294#if HWY_TARGET > HWY_AVX3_DL
4295template <int kBits, size_t N>
4296HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) {
4297 static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
4298 if (kBits == 0) return v;
4299 // AVX3 does not support 8-bit.
4300 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
4301}
4302#endif
4303
4304template <int kBits, size_t N>
4305HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) {
4306  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
4307 if (kBits == 0) return v;
4308 // AVX3 does not support 16-bit.
4309 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
4310}
4311
4312template <int kBits, size_t N>
4313HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
4314  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
4315#if HWY_TARGET <= HWY_AVX3
4316 return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
4317#else
4318 if (kBits == 0) return v;
4319 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
4320#endif
4321}
4322
4323template <int kBits, size_t N>
4324HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
4325  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
4326#if HWY_TARGET <= HWY_AVX3
4327 return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
4328#else
4329 if (kBits == 0) return v;
4330 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
4331#endif
4332}
4333
4334// I8/I16/I32/I64 RotateRight is generic for all vector lengths
4335template <int kBits, class V, HWY_IF_SIGNED_V(V)>
4336HWY_API V RotateRight(V v) {
4337 const DFromV<decltype(v)> d;
4338 const RebindToUnsigned<decltype(d)> du;
4339 return BitCast(d, RotateRight<kBits>(BitCast(du, v)));
4340}
4341
4342// ------------------------------ Rol/Ror
4343#if HWY_TARGET <= HWY_AVX3
4344
4345#ifdef HWY_NATIVE_ROL_ROR_32_64
4346#undef HWY_NATIVE_ROL_ROR_32_64
4347#else
4348#define HWY_NATIVE_ROL_ROR_32_64
4349#endif
4350
4351template <class T, size_t N, HWY_IF_UI32(T)>
4352HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
4353 return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)};
4354}
4355
4356template <class T, size_t N, HWY_IF_UI32(T)>
4357HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
4358 return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)};
4359}
4360
4361template <class T, size_t N, HWY_IF_UI64(T)>
4362HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
4363 return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)};
4364}
4365
4366template <class T, size_t N, HWY_IF_UI64(T)>
4367HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
4368 return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)};
4369}
4370
4371#endif
4372
4373// ------------------------------ RotateLeftSame/RotateRightSame
4374
4375#if HWY_TARGET <= HWY_AVX3
4376
4377#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
4378#undef HWY_NATIVE_ROL_ROR_SAME_32_64
4379#else
4380#define HWY_NATIVE_ROL_ROR_SAME_32_64
4381#endif
4382
4383// Generic for all vector lengths
4384template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4385 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
4386HWY_API V RotateLeftSame(V v, int bits) {
4387 const DFromV<decltype(v)> d;
4388 return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
4389}
4390
4391template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4392 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
4393HWY_API V RotateRightSame(V v, int bits) {
4394 const DFromV<decltype(v)> d;
4395 return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
4396}
4397#endif // HWY_TARGET <= HWY_AVX3
4398
4399// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
4400
4401template <size_t N>
4402HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
4403 const DFromV<decltype(v)> d;
4404 return VecFromMask(v < Zero(d));
4405}
4406
4407template <size_t N>
4408HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
4409  return ShiftRight<15>(v);
4410}
4411
4412template <size_t N>
4413HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
4414  return ShiftRight<31>(v);
4415}
4416
4417template <size_t N>
4418HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
4419  const DFromV<decltype(v)> d;
4420#if HWY_TARGET <= HWY_AVX3
4421 (void)d;
4422 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
4423#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
4424 return VecFromMask(v < Zero(d));
4425#else
4426 // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
4427 // avoids generating a zero.
4428 const RepartitionToNarrow<decltype(d)> d32;
4429 const auto sign = ShiftRight<31>(BitCast(d32, v));
4430 return Vec128<int64_t, N>{
4431 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4432#endif
4433}
4434
4435// ------------------------------ Integer Abs
4436
4437// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
4438template <size_t N>
4439HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
4440#if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2
4441 const DFromV<decltype(v)> d;
4442 const RebindToUnsigned<decltype(d)> du;
4443 const auto zero = Zero(du);
4444 const auto v_as_u8 = BitCast(du, v);
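  // Interpreting the lanes as u8, Min(x, 0 - x) selects the two's-complement
  // absolute value for negative inputs and x itself for non-negative inputs;
  // 0x80 (LimitsMin) maps to itself, matching the comment above.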
4445 return BitCast(d, Min(v_as_u8, zero - v_as_u8));
4446#else
4447 return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
4448#endif
4449}
4450
4451template <size_t N>
4452HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
4453#if HWY_TARGET == HWY_SSE2
4454 const auto zero = Zero(DFromV<decltype(v)>());
4455 return Max(v, zero - v);
4456#else
4457 return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
4458#endif
4459}
4460
4461template <size_t N>
4462HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
4463#if HWY_TARGET <= HWY_SSSE3
4464 return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
4465#else
4466 const auto zero = Zero(DFromV<decltype(v)>());
4467 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
4468#endif
4469}
4470
4471#if HWY_TARGET <= HWY_AVX3
4472template <size_t N>
4473HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
4474 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
4475}
4476#else
4477// I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
4478template <class V, HWY_IF_I64(TFromV<V>)>
4479HWY_API V Abs(V v) {
4480 const auto zero = Zero(DFromV<decltype(v)>());
4481 return IfNegativeThenElse(v, zero - v, v);
4482}
4483#endif
4484
4485#ifdef HWY_NATIVE_SATURATED_ABS
4486#undef HWY_NATIVE_SATURATED_ABS
4487#else
4488#define HWY_NATIVE_SATURATED_ABS
4489#endif
4490
4491// Generic for all vector lengths
4492template <class V, HWY_IF_I8(TFromV<V>)>
4493HWY_API V SaturatedAbs(V v) {
4494 const DFromV<decltype(v)> d;
4495 const RebindToUnsigned<decltype(d)> du;
4496 return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
4497}
4498
4499// Generic for all vector lengths
4500template <class V, HWY_IF_I16(TFromV<V>)>
4501HWY_API V SaturatedAbs(V v) {
4502 return Max(v, SaturatedSub(Zero(DFromV<V>()), v));
4503}
4504
4505// Generic for all vector lengths
4506template <class V, HWY_IF_I32(TFromV<V>)>
4507HWY_API V SaturatedAbs(V v) {
4508 const auto abs_v = Abs(v);
4509
4510#if HWY_TARGET <= HWY_SSE4
4511 const DFromV<decltype(v)> d;
4512 const RebindToUnsigned<decltype(d)> du;
4513 return BitCast(d, Min(BitCast(du, abs_v),
4514 Set(du, static_cast<uint32_t>(LimitsMax<int32_t>()))));
4515#else
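  // Abs(v) is negative only for LimitsMin(); in that lane BroadcastSignBit is
  // -1 and the addition wraps LimitsMin() around to LimitsMax(). All other
  // lanes add 0 and are unchanged.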
4516 return Add(abs_v, BroadcastSignBit(abs_v));
4517#endif
4518}
4519
4520// Generic for all vector lengths
4521template <class V, HWY_IF_I64(TFromV<V>)>
4522HWY_API V SaturatedAbs(V v) {
4523 const auto abs_v = Abs(v);
4524 return Add(abs_v, BroadcastSignBit(abs_v));
4525}
4526
4527// GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL
4528// srli_epi64: the count should be unsigned int. Note that this is not the same
4529// as the Shift3264Count in x86_512-inl.h (GCC also requires int).
4530#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
4531 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400)
4532using Shift64Count = int;
4533#else
4534// Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this.
4535using Shift64Count = unsigned int;
4536#endif
4537
4538template <int kBits, size_t N>
4539HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
4540#if HWY_TARGET <= HWY_AVX3
4541 return Vec128<int64_t, N>{
4542 _mm_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
4543#else
4544 const DFromV<decltype(v)> di;
4545 const RebindToUnsigned<decltype(di)> du;
4546 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
4547 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
4548 return right | sign;
4549#endif
4550}
4551
4552// ------------------------------ IfNegativeThenElse
4553template <size_t N>
4554HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
4555                                             const Vec128<int8_t, N> yes,
4556 const Vec128<int8_t, N> no) {
4557// int8: IfThenElse only looks at the MSB on SSE4 or newer
4558#if HWY_TARGET <= HWY_SSE4
4559 const auto mask = MaskFromVec(v);
4560#else
4561 const DFromV<decltype(v)> d;
4562 const RebindToSigned<decltype(d)> di;
4563 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
4564#endif
4565
4566 return IfThenElse(mask, yes, no);
4567}
4568
4569template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
4570HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
4571                                        Vec128<T, N> no) {
4572 static_assert(IsSigned<T>(), "Only works for signed/float");
4573
4574// 16-bit: no native blendv on AVX2 or earlier, so copy sign to lower byte's
4575// MSB.
4576#if HWY_TARGET <= HWY_AVX3
4577 const auto mask = MaskFromVec(v);
4578#else
4579 const DFromV<decltype(v)> d;
4580 const RebindToSigned<decltype(d)> di;
4581 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
4582#endif
4583
4584 return IfThenElse(mask, yes, no);
4585}
4586
4587template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
4588HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
4589 Vec128<T, N> no) {
4590 static_assert(IsSigned<T>(), "Only works for signed/float");
4591 const DFromV<decltype(v)> d;
4592
4593#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
4594 // 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB
4595 // on SSE4 or later.
4596 const RebindToFloat<decltype(d)> df;
4597 const auto mask = MaskFromVec(BitCast(df, v));
4598 return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no)));
4599#else // SSE2, SSSE3, or AVX3
4600
4601#if HWY_TARGET <= HWY_AVX3
4602 // No need to cast to float or broadcast sign bit on AVX3 as IfThenElse only
4603 // looks at the MSB on AVX3
4604 (void)d;
4605 const auto mask = MaskFromVec(v);
4606#else
4607 const RebindToSigned<decltype(d)> di;
4608 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
4609#endif
4610
4611 return IfThenElse(mask, yes, no);
4612#endif
4613}
4614
4615#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
4616
4617#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
4618#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
4619#else
4620#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
4621#endif
4622
4623#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
4624#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
4625#else
4626#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
4627#endif
4628
4629// SSE4/AVX2 IfNegativeThenElseZero/IfNegativeThenZeroElse is generic for all
4630// vector lengths
4631template <class V, HWY_IF_NOT_UNSIGNED_V(V),
4632 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
4633HWY_API V IfNegativeThenElseZero(V v, V yes) {
4634 const DFromV<decltype(v)> d;
4635 return IfNegativeThenElse(v, yes, Zero(d));
4636}
4637
4638template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
4639HWY_API V IfNegativeThenElseZero(V v, V yes) {
4640 return IfThenElseZero(IsNegative(v), yes);
4641}
4642
4643template <class V, HWY_IF_NOT_UNSIGNED_V(V),
4644 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
4645HWY_API V IfNegativeThenZeroElse(V v, V no) {
4646 const DFromV<decltype(v)> d;
4647 return IfNegativeThenElse(v, Zero(d), no);
4648}
4649
4650template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
4651HWY_API V IfNegativeThenZeroElse(V v, V no) {
4652 return IfThenZeroElse(IsNegative(v), no);
4653}
4654
4655#endif // HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
4656
4657// ------------------------------ IfNegativeThenNegOrUndefIfZero
4658
4659#if HWY_TARGET <= HWY_SSSE3
4660
4661#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4662#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4663#else
4664#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4665#endif
4666
4667template <size_t N>
4668HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(
4669    Vec128<int8_t, N> mask, Vec128<int8_t, N> v) {
4670  return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)};
4671}
4672
4673template <size_t N>
4674HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero(
4675    Vec128<int16_t, N> mask, Vec128<int16_t, N> v) {
4676  return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)};
4677}
4678
4679template <size_t N>
4680HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero(
4681    Vec128<int32_t, N> mask, Vec128<int32_t, N> v) {
4682  return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)};
4683}
4684
4685// Generic for all vector lengths
4686template <class V, HWY_IF_I64_D(DFromV<V>)>
4687HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
4688#if HWY_TARGET <= HWY_AVX3
4689 // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3
4690 const DFromV<decltype(v)> d;
4691 return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
4692#else
4693 // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2
4694 return IfNegativeThenElse(mask, Neg(v), v);
4695#endif
4696}
4697
4698#endif // HWY_TARGET <= HWY_SSSE3
4699
4700// ------------------------------ ShiftLeftSame
4701
4702template <size_t N>
4703HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
4704 const int bits) {
4705#if HWY_COMPILER_GCC
4706 if (__builtin_constant_p(bits)) {
4707 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)};
4708 }
4709#endif
4710 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4711}
4712template <size_t N>
4713HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
4714 const int bits) {
4715#if HWY_COMPILER_GCC
4716 if (__builtin_constant_p(bits)) {
4717 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)};
4718 }
4719#endif
4720 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4721}
4722template <size_t N>
4723HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
4724 const int bits) {
4725#if HWY_COMPILER_GCC
4726 if (__builtin_constant_p(bits)) {
4727 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)};
4728 }
4729#endif
4730 return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4731}
4732
4733template <size_t N>
4734HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
4735 const int bits) {
4736#if HWY_COMPILER_GCC
4737 if (__builtin_constant_p(bits)) {
4738 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)};
4739 }
4740#endif
4741 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4742}
4743
4744template <size_t N>
4745HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
4746 const int bits) {
4747#if HWY_COMPILER_GCC
4748 if (__builtin_constant_p(bits)) {
4749 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)};
4750 }
4751#endif
4752 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4753}
4754
4755template <size_t N>
4756HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
4757 const int bits) {
4758#if HWY_COMPILER_GCC
4759 if (__builtin_constant_p(bits)) {
4760 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)};
4761 }
4762#endif
4763 return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4764}
4765
4766template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
4767HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
4768 const DFromV<decltype(v)> d8;
4769 // Use raw instead of BitCast to support N=1.
4770 const Vec128<T, N> shifted{
4771 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
4772 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
4773}
4774
4775// ------------------------------ ShiftRightSame (BroadcastSignBit)
4776
4777template <size_t N>
4778HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
4779 const int bits) {
4780#if HWY_COMPILER_GCC
4781 if (__builtin_constant_p(bits)) {
4782 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)};
4783 }
4784#endif
4785 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4786}
4787template <size_t N>
4788HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
4789 const int bits) {
4790#if HWY_COMPILER_GCC
4791 if (__builtin_constant_p(bits)) {
4792 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)};
4793 }
4794#endif
4795 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4796}
4797template <size_t N>
4798HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
4799 const int bits) {
4800#if HWY_COMPILER_GCC
4801 if (__builtin_constant_p(bits)) {
4802 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)};
4803 }
4804#endif
4805 return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4806}
4807
4808template <size_t N>
4809HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
4810 const int bits) {
4811 const DFromV<decltype(v)> d8;
4812 // Use raw instead of BitCast to support N=1.
4813 const Vec128<uint8_t, N> shifted{
4814 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
4815 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
4816}
4817
4818template <size_t N>
4819HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
4820 const int bits) {
4821#if HWY_COMPILER_GCC
4822 if (__builtin_constant_p(bits)) {
4823 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)};
4824 }
4825#endif
4826 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
4827}
4828
4829template <size_t N>
4830HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
4831 const int bits) {
4832#if HWY_COMPILER_GCC
4833 if (__builtin_constant_p(bits)) {
4834 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)};
4835 }
4836#endif
4837 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
4838}
4839template <size_t N>
4840HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
4841 const int bits) {
4842#if HWY_TARGET <= HWY_AVX3
4843#if HWY_COMPILER_GCC
4844 if (__builtin_constant_p(bits)) {
4845 return Vec128<int64_t, N>{
4846 _mm_srai_epi64(v.raw, static_cast<Shift64Count>(bits))};
4847 }
4848#endif
4849 return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
4850#else
4851 const DFromV<decltype(v)> di;
4852 const RebindToUnsigned<decltype(di)> du;
4853 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
4854 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
4855 return right | sign;
4856#endif
4857}
4858
4859template <size_t N>
4860HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
4861 const DFromV<decltype(v)> di;
4862 const RebindToUnsigned<decltype(di)> du;
4863 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
4864 const auto shifted_sign =
4865 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
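  // Sign-extension trick: after the logical shift the original sign bit is at
  // position (7 - bits); XOR-ing with (0x80 >> bits) and then subtracting it
  // propagates that bit through the upper bits, i.e. an arithmetic shift.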
4866 return (shifted ^ shifted_sign) - shifted_sign;
4867}
4868
4869// ------------------------------ Floating-point mul / div
4870
4871#if HWY_HAVE_FLOAT16
4872template <size_t N>
4873HWY_API Vec128<float16_t, N> operator*(Vec128<float16_t, N> a,
4874 Vec128<float16_t, N> b) {
4875 return Vec128<float16_t, N>{_mm_mul_ph(a.raw, b.raw)};
4876}
4877#endif // HWY_HAVE_FLOAT16
4878template <size_t N>
4879HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
4880 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
4881}
4882HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
4883                                 const Vec128<float, 1> b) {
4884 return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
4885}
4886template <size_t N>
4887HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
4888 const Vec128<double, N> b) {
4889 return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
4890}
4891HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
4892  return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
4893}
4894
4895#if HWY_HAVE_FLOAT16
4896template <size_t N>
4897HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a,
4898 const Vec128<float16_t, N> b) {
4899 return Vec128<float16_t, N>{_mm_div_ph(a.raw, b.raw)};
4900}
4901#endif // HWY_HAVE_FLOAT16
4902template <size_t N>
4903HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
4904 const Vec128<float, N> b) {
4905 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
4906}
4907HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
4908                                 const Vec128<float, 1> b) {
4909 return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
4910}
4911template <size_t N>
4912HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
4913 const Vec128<double, N> b) {
4914 return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
4915}
4916HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
4917  return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
4918}
4919
4920// Approximate reciprocal
4921#if HWY_HAVE_FLOAT16
4922template <size_t N>
4923HWY_API Vec128<float16_t, N> ApproximateReciprocal(
4924 const Vec128<float16_t, N> v) {
4925 return Vec128<float16_t, N>{_mm_rcp_ph(v.raw)};
4926}
4927#endif // HWY_HAVE_FLOAT16
4928template <size_t N>
4929HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
4930 return Vec128<float, N>{_mm_rcp_ps(v.raw)};
4931}
4932HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
4933  return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
4934}
4935
4936#if HWY_TARGET <= HWY_AVX3
4937#ifdef HWY_NATIVE_F64_APPROX_RECIP
4938#undef HWY_NATIVE_F64_APPROX_RECIP
4939#else
4940#define HWY_NATIVE_F64_APPROX_RECIP
4941#endif
4942
4943template <size_t N>
4944HWY_API Vec128<double, N> ApproximateReciprocal(
4945    const Vec128<double, N> v) {
4946  return Vec128<double, N>{_mm_rcp14_pd(v.raw)};
4947}
4948
4949#endif
4950
4951// Generic for all vector lengths.
4952template <class V, HWY_IF_FLOAT_V(V)>
4953HWY_API V AbsDiff(V a, V b) {
4954 return Abs(a - b);
4955}
4956
4957// ------------------------------ MaskedMinOr
4958
4959#if HWY_TARGET <= HWY_AVX3
4960
4961#ifdef HWY_NATIVE_MASKED_ARITH
4962#undef HWY_NATIVE_MASKED_ARITH
4963#else
4964#define HWY_NATIVE_MASKED_ARITH
4965#endif
4966
4967template <typename T, size_t N, HWY_IF_U8(T)>
4968HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4969                                 Vec128<T, N> a, Vec128<T, N> b) {
4970  return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
4971}
4972template <typename T, size_t N, HWY_IF_I8(T)>
4973HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4974 Vec128<T, N> a, Vec128<T, N> b) {
4975 return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
4976}
4977
4978template <typename T, size_t N, HWY_IF_U16(T)>
4979HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4980 Vec128<T, N> a, Vec128<T, N> b) {
4981 return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
4982}
4983template <typename T, size_t N, HWY_IF_I16(T)>
4984HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4985 Vec128<T, N> a, Vec128<T, N> b) {
4986 return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
4987}
4988
4989template <typename T, size_t N, HWY_IF_U32(T)>
4990HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4991 Vec128<T, N> a, Vec128<T, N> b) {
4992 return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
4993}
4994template <typename T, size_t N, HWY_IF_I32(T)>
4995HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4996 Vec128<T, N> a, Vec128<T, N> b) {
4997 return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
4998}
4999
5000template <typename T, size_t N, HWY_IF_U64(T)>
5001HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
5002 Vec128<T, N> a, Vec128<T, N> b) {
5003 return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
5004}
5005template <typename T, size_t N, HWY_IF_I64(T)>
5006HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
5007 Vec128<T, N> a, Vec128<T, N> b) {
5008 return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
5009}
5010
5011template <typename T, size_t N, HWY_IF_F32(T)>
5012HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
5013 Vec128<T, N> a, Vec128<T, N> b) {
5014 return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
5015}
5016
5017template <typename T, size_t N, HWY_IF_F64(T)>
5018HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
5019 Vec128<T, N> a, Vec128<T, N> b) {
5020 return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
5021}
5022
5023#if HWY_HAVE_FLOAT16
5024template <typename T, size_t N, HWY_IF_F16(T)>
5025HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
5026 Vec128<T, N> a, Vec128<T, N> b) {
5027 return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
5028}
5029#endif // HWY_HAVE_FLOAT16
5030
5031// ------------------------------ MaskedMaxOr
5032
5033template <typename T, size_t N, HWY_IF_U8(T)>
5034HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5035 Vec128<T, N> a, Vec128<T, N> b) {
5036 return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
5037}
5038template <typename T, size_t N, HWY_IF_I8(T)>
5039HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5040 Vec128<T, N> a, Vec128<T, N> b) {
5041 return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
5042}
5043
5044template <typename T, size_t N, HWY_IF_U16(T)>
5045HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5046 Vec128<T, N> a, Vec128<T, N> b) {
5047 return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
5048}
5049template <typename T, size_t N, HWY_IF_I16(T)>
5050HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5051 Vec128<T, N> a, Vec128<T, N> b) {
5052 return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
5053}
5054
5055template <typename T, size_t N, HWY_IF_U32(T)>
5056HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5057 Vec128<T, N> a, Vec128<T, N> b) {
5058 return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
5059}
5060template <typename T, size_t N, HWY_IF_I32(T)>
5061HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5062 Vec128<T, N> a, Vec128<T, N> b) {
5063 return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
5064}
5065
5066template <typename T, size_t N, HWY_IF_U64(T)>
5067HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5068 Vec128<T, N> a, Vec128<T, N> b) {
5069 return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
5070}
5071template <typename T, size_t N, HWY_IF_I64(T)>
5072HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5073 Vec128<T, N> a, Vec128<T, N> b) {
5074 return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
5075}
5076
5077template <typename T, size_t N, HWY_IF_F32(T)>
5078HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5079 Vec128<T, N> a, Vec128<T, N> b) {
5080 return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
5081}
5082
5083template <typename T, size_t N, HWY_IF_F64(T)>
5084HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5085 Vec128<T, N> a, Vec128<T, N> b) {
5086 return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
5087}
5088
5089#if HWY_HAVE_FLOAT16
5090template <typename T, size_t N, HWY_IF_F16(T)>
5091HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
5092 Vec128<T, N> a, Vec128<T, N> b) {
5093 return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
5094}
5095#endif // HWY_HAVE_FLOAT16
5096
5097// ------------------------------ MaskedAddOr
5098
5099template <typename T, size_t N, HWY_IF_UI8(T)>
5100HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5101 Vec128<T, N> a, Vec128<T, N> b) {
5102 return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
5103}
5104
5105template <typename T, size_t N, HWY_IF_UI16(T)>
5106HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5107 Vec128<T, N> a, Vec128<T, N> b) {
5108 return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
5109}
5110
5111template <typename T, size_t N, HWY_IF_UI32(T)>
5112HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5113 Vec128<T, N> a, Vec128<T, N> b) {
5114 return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
5115}
5116
5117template <typename T, size_t N, HWY_IF_UI64(T)>
5118HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5119 Vec128<T, N> a, Vec128<T, N> b) {
5120 return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
5121}
5122
5123template <typename T, size_t N, HWY_IF_F32(T)>
5124HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5125 Vec128<T, N> a, Vec128<T, N> b) {
5126 return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
5127}
5128
5129template <typename T, size_t N, HWY_IF_F64(T)>
5130HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5131 Vec128<T, N> a, Vec128<T, N> b) {
5132 return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
5133}
5134
5135#if HWY_HAVE_FLOAT16
5136template <typename T, size_t N, HWY_IF_F16(T)>
5137HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
5138 Vec128<T, N> a, Vec128<T, N> b) {
5139 return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
5140}
5141#endif // HWY_HAVE_FLOAT16
5142
5143// ------------------------------ MaskedSubOr
5144
5145template <typename T, size_t N, HWY_IF_UI8(T)>
5146HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5147 Vec128<T, N> a, Vec128<T, N> b) {
5148 return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
5149}
5150
5151template <typename T, size_t N, HWY_IF_UI16(T)>
5152HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5153 Vec128<T, N> a, Vec128<T, N> b) {
5154 return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
5155}
5156
5157template <typename T, size_t N, HWY_IF_UI32(T)>
5158HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5159 Vec128<T, N> a, Vec128<T, N> b) {
5160 return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
5161}
5162
5163template <typename T, size_t N, HWY_IF_UI64(T)>
5164HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5165 Vec128<T, N> a, Vec128<T, N> b) {
5166 return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
5167}
5168
5169template <typename T, size_t N, HWY_IF_F32(T)>
5170HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5171 Vec128<T, N> a, Vec128<T, N> b) {
5172 return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
5173}
5174
5175template <typename T, size_t N, HWY_IF_F64(T)>
5176HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5177 Vec128<T, N> a, Vec128<T, N> b) {
5178 return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
5179}
5180
5181#if HWY_HAVE_FLOAT16
5182template <typename T, size_t N, HWY_IF_F16(T)>
5183HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
5184 Vec128<T, N> a, Vec128<T, N> b) {
5185 return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
5186}
5187#endif // HWY_HAVE_FLOAT16
5188
5189// ------------------------------ MaskedMulOr
5190
5191// There is no elementwise integer mask_mul intrinsic. Generic for all vector lengths.
5192template <class V, class M>
5193HWY_API V MaskedMulOr(V no, M m, V a, V b) {
5194 return IfThenElse(m, a * b, no);
5195}
5196
5197template <size_t N>
5198HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m,
5199 Vec128<float, N> a, Vec128<float, N> b) {
5200 return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
5201}
5202
5203template <size_t N>
5204HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no,
5205 Mask128<double, N> m,
5206 Vec128<double, N> a, Vec128<double, N> b) {
5207 return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
5208}
5209
5210#if HWY_HAVE_FLOAT16
5211template <size_t N>
5212HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no,
5213 Mask128<float16_t, N> m,
5214 Vec128<float16_t, N> a,
5215 Vec128<float16_t, N> b) {
5216 return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
5217}
5218#endif // HWY_HAVE_FLOAT16
5219
5220// ------------------------------ MaskedDivOr
5221
5222template <size_t N>
5223HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m,
5224 Vec128<float, N> a, Vec128<float, N> b) {
5225 return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
5226}
5227
5228template <size_t N>
5229HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no,
5230 Mask128<double, N> m,
5231 Vec128<double, N> a, Vec128<double, N> b) {
5232 return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
5233}
5234
5235#if HWY_HAVE_FLOAT16
5236template <size_t N>
5237HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no,
5238 Mask128<float16_t, N> m,
5239 Vec128<float16_t, N> a,
5240 Vec128<float16_t, N> b) {
5241 return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
5242}
5243#endif // HWY_HAVE_FLOAT16
5244
5245// Generic for all vector lengths
5246template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5247HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) {
5248 return IfThenElse(m, Div(a, b), no);
5249}
5250
5251// ------------------------------ MaskedModOr
5252// Generic for all vector lengths
5253template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5254HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) {
5255 return IfThenElse(m, Mod(a, b), no);
5256}
5257
5258// ------------------------------ MaskedSatAddOr
5259
5260template <typename T, size_t N, HWY_IF_I8(T)>
5261HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5262 Vec128<T, N> a, Vec128<T, N> b) {
5263 return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
5264}
5265
5266template <typename T, size_t N, HWY_IF_U8(T)>
5267HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5268 Vec128<T, N> a, Vec128<T, N> b) {
5269 return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
5270}
5271
5272template <typename T, size_t N, HWY_IF_I16(T)>
5273HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5274 Vec128<T, N> a, Vec128<T, N> b) {
5275 return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
5276}
5277
5278template <typename T, size_t N, HWY_IF_U16(T)>
5279HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5280 Vec128<T, N> a, Vec128<T, N> b) {
5281 return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
5282}
5283
5284// ------------------------------ MaskedSatSubOr
5285
5286template <typename T, size_t N, HWY_IF_I8(T)>
5287HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5288 Vec128<T, N> a, Vec128<T, N> b) {
5289 return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
5290}
5291
5292template <typename T, size_t N, HWY_IF_U8(T)>
5293HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5294 Vec128<T, N> a, Vec128<T, N> b) {
5295 return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
5296}
5297
5298template <typename T, size_t N, HWY_IF_I16(T)>
5299HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5300 Vec128<T, N> a, Vec128<T, N> b) {
5301 return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
5302}
5303
5304template <typename T, size_t N, HWY_IF_U16(T)>
5305HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5306 Vec128<T, N> a, Vec128<T, N> b) {
5307 return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
5308}
5309
5310#endif // HWY_TARGET <= HWY_AVX3
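// Editorial example (not part of the original header): the Masked*Or ops
// return `no` in lanes where the mask is false, so selective updates need no
// separate blend. All names below are hypothetical.
//   const Full128<int32_t> d;
//   const auto m = Gt(v, Zero(d));
//   // Adds `bonus` only to positive lanes; other lanes keep their value.
//   const Vec128<int32_t> updated = MaskedAddOr(v, m, v, bonus);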
5311
5312// ------------------------------ Floating-point multiply-add variants
5313
5314#if HWY_HAVE_FLOAT16
5315template <size_t N>
5316HWY_API Vec128<float16_t, N> MulAdd(Vec128<float16_t, N> mul,
5317 Vec128<float16_t, N> x,
5318 Vec128<float16_t, N> add) {
5319 return Vec128<float16_t, N>{_mm_fmadd_ph(mul.raw, x.raw, add.raw)};
5320}
5321
5322template <size_t N>
5323HWY_API Vec128<float16_t, N> NegMulAdd(Vec128<float16_t, N> mul,
5324 Vec128<float16_t, N> x,
5325 Vec128<float16_t, N> add) {
5326 return Vec128<float16_t, N>{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)};
5327}
5328
5329template <size_t N>
5330HWY_API Vec128<float16_t, N> MulSub(Vec128<float16_t, N> mul,
5331 Vec128<float16_t, N> x,
5332 Vec128<float16_t, N> sub) {
5333 return Vec128<float16_t, N>{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)};
5334}
5335
5336template <size_t N>
5337HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul,
5338 Vec128<float16_t, N> x,
5339 Vec128<float16_t, N> sub) {
5340 return Vec128<float16_t, N>{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)};
5341}
5342
5343#endif // HWY_HAVE_FLOAT16
5344template <size_t N>
5345HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
5346 Vec128<float, N> add) {
5347#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5348 return mul * x + add;
5349#else
5350 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
5351#endif
5352}
5353template <size_t N>
5354HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
5355 Vec128<double, N> add) {
5356#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5357 return mul * x + add;
5358#else
5359 return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
5360#endif
5361}
5362
5363// Returns add - mul * x
5364template <size_t N>
5365HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
5366 Vec128<float, N> add) {
5367#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5368 return add - mul * x;
5369#else
5370 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
5371#endif
5372}
5373template <size_t N>
5374HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
5375 Vec128<double, N> add) {
5376#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5377 return add - mul * x;
5378#else
5379 return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
5380#endif
5381}
5382
5383// Returns mul * x - sub
5384template <size_t N>
5385HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
5386 Vec128<float, N> sub) {
5387#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5388 return mul * x - sub;
5389#else
5390 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
5391#endif
5392}
5393template <size_t N>
5394HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
5395 Vec128<double, N> sub) {
5396#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5397 return mul * x - sub;
5398#else
5399 return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
5400#endif
5401}
5402
5403// Returns -mul * x - sub
5404template <size_t N>
5405HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
5406 Vec128<float, N> sub) {
5407#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5408 return Neg(mul) * x - sub;
5409#else
5410 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
5411#endif
5412}
5413template <size_t N>
5414HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
5415 Vec128<double, N> sub) {
5416#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5417 return Neg(mul) * x - sub;
5418#else
5419 return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
5420#endif
5421}
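// Editorial note (not part of the original header): without FMA the
// fallbacks above round twice (after the multiply and after the add),
// whereas the fused intrinsics round once. Typical use is accumulation;
// names below are hypothetical.
//   Vec128<float> sum = Zero(d);
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     sum = MulAdd(Load(d, a + i), Load(d, b + i), sum);
//   }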
5422
5423#if HWY_TARGET <= HWY_SSSE3
5424
5425#undef HWY_IF_MULADDSUB_V
5426#define HWY_IF_MULADDSUB_V(V) \
5427 HWY_IF_LANES_GT_D(DFromV<V>, 1), \
5428 HWY_IF_T_SIZE_ONE_OF_V( \
5429 V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \
5430 ? 0 \
5431 : ((1 << 2) | (1 << 4) | (1 << 8))))
5432
5433#if HWY_HAVE_FLOAT16
5434template <size_t N, HWY_IF_LANES_GT(N, 1)>
5435HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
5436 Vec128<float16_t, N> x,
5437 Vec128<float16_t, N> sub_or_add) {
5438 return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
5439}
5440#endif // HWY_HAVE_FLOAT16
5441
5442template <size_t N, HWY_IF_LANES_GT(N, 1)>
5443HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x,
5444 Vec128<float, N> sub_or_add) {
5445#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5446 return AddSub(mul * x, sub_or_add);
5447#else
5448 return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
5449#endif
5450}
5451
5452HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x,
5453 Vec128<double> sub_or_add) {
5454#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5455 return AddSub(mul * x, sub_or_add);
5456#else
5457 return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
5458#endif
5459}
5460
5461#endif // HWY_TARGET <= HWY_SSSE3
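// Editorial note (not part of the original header): MulAddSub computes
// mul * x - sub_or_add in even lanes and mul * x + sub_or_add in odd lanes,
// matching the fmaddsub instructions (useful for interleaved complex
// multiplication). Worked example with hypothetical values:
//   mul = {1, 2, 3, 4}, x = {10, 10, 10, 10}, sub_or_add = {1, 1, 1, 1}
//   MulAddSub(mul, x, sub_or_add) == {9, 21, 29, 41}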
5462
5463// ------------------------------ Floating-point square root
5464
5465// Full precision square root
5466#if HWY_HAVE_FLOAT16
5467template <size_t N>
5468HWY_API Vec128<float16_t, N> Sqrt(Vec128<float16_t, N> v) {
5469 return Vec128<float16_t, N>{_mm_sqrt_ph(v.raw)};
5470}
5471#endif // HWY_HAVE_FLOAT16
5472template <size_t N>
5473HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) {
5474 return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
5475}
5476HWY_API Vec128<float, 1> Sqrt(Vec128<float, 1> v) {
5477 return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
5478}
5479template <size_t N>
5480HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) {
5481 return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
5482}
5483HWY_API Vec64<double> Sqrt(Vec64<double> v) {
5484 return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
5485}
5486
5487// Approximate reciprocal square root
5488#if HWY_HAVE_FLOAT16
5489template <size_t N>
5490HWY_API Vec128<float16_t, N> ApproximateReciprocalSqrt(Vec128<float16_t, N> v) {
5491 return Vec128<float16_t, N>{_mm_rsqrt_ph(v.raw)};
5492}
5493#endif // HWY_HAVE_FLOAT16
5494template <size_t N>
5495HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
5496 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
5497}
5498HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(Vec128<float, 1> v) {
5499 return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
5500}
5501
5502#if HWY_TARGET <= HWY_AVX3
5503#ifdef HWY_NATIVE_F64_APPROX_RSQRT
5504#undef HWY_NATIVE_F64_APPROX_RSQRT
5505#else
5506#define HWY_NATIVE_F64_APPROX_RSQRT
5507#endif
5508
5513#if HWY_COMPILER_MSVC
5514 const DFromV<decltype(v)> d;
5515 return Vec128<double>{_mm_mask_rsqrt14_pd(
5516 Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)};
5517#else
5518 return Vec128<double>{_mm_rsqrt14_pd(v.raw)};
5519#endif
5520}
5521#endif
5522
5523// ------------------------------ Min (Gt, IfThenElse)
5524
5525namespace detail {
5526
5527template <typename T, size_t N>
5528HWY_INLINE Vec128<T, N> MinU(const Vec128<T, N> a,
5529 const Vec128<T, N> b) {
5530 const DFromV<decltype(a)> d;
5531 const RebindToUnsigned<decltype(d)> du;
5532 const RebindToSigned<decltype(d)> di;
5533 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
5534 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
5535 return IfThenElse(gt, b, a);
5536}
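// Editorial note (not part of the original header): XOR-ing with the MSB
// maps unsigned order onto signed order, which is what MinU relies on when
// no unsigned min instruction exists. Worked example for uint8_t:
//   a = 200 (0xC8), b = 100 (0x64), msb = 0x80
//   a ^ msb = 0x48 (+72 signed), b ^ msb = 0xE4 (-28 signed)
//   +72 > -28, so gt is true and IfThenElse returns b = 100 = Min(a, b).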
5537
5538} // namespace detail
5539
5540// Unsigned
5541template <size_t N>
5542HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
5543 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
5544}
5545template <size_t N>
5546HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
5547#if HWY_TARGET >= HWY_SSSE3
5548 return detail::MinU(a, b);
5549#else
5550 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
5551#endif
5552}
5553template <size_t N>
5554HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
5555#if HWY_TARGET >= HWY_SSSE3
5556 return detail::MinU(a, b);
5557#else
5558 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
5559#endif
5560}
5561template <size_t N>
5562HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
5563#if HWY_TARGET <= HWY_AVX3
5564 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
5565#else
5566 return detail::MinU(a, b);
5567#endif
5568}
5569
5570// Signed
5571template <size_t N>
5572HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
5573#if HWY_TARGET >= HWY_SSSE3
5574 return IfThenElse(a < b, a, b);
5575#else
5576 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
5577#endif
5578}
5579template <size_t N>
5580HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
5581 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
5582}
5583template <size_t N>
5584HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
5585#if HWY_TARGET >= HWY_SSSE3
5586 return IfThenElse(a < b, a, b);
5587#else
5588 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
5589#endif
5590}
5591template <size_t N>
5592HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
5593#if HWY_TARGET <= HWY_AVX3
5594 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
5595#else
5596 return IfThenElse(a < b, a, b);
5597#endif
5598}
5599
5600// Float
5601#if HWY_HAVE_FLOAT16
5602template <size_t N>
5603HWY_API Vec128<float16_t, N> Min(Vec128<float16_t, N> a,
5604 Vec128<float16_t, N> b) {
5605 return Vec128<float16_t, N>{_mm_min_ph(a.raw, b.raw)};
5606}
5607#endif // HWY_HAVE_FLOAT16
5608template <size_t N>
5609HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
5610 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
5611}
5612template <size_t N>
5613HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
5614 return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
5615}
5616
5617// ------------------------------ Max (Gt, IfThenElse)
5618
5619namespace detail {
5620template <typename T, size_t N>
5621HWY_INLINE Vec128<T, N> MaxU(const Vec128<T, N> a,
5622 const Vec128<T, N> b) {
5623 const DFromV<decltype(a)> d;
5624 const RebindToUnsigned<decltype(d)> du;
5625 const RebindToSigned<decltype(d)> di;
5626 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
5627 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
5628 return IfThenElse(gt, a, b);
5629}
5630
5631} // namespace detail
5632
5633// Unsigned
5634template <size_t N>
5635HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
5636 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
5637}
5638template <size_t N>
5639HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
5640#if HWY_TARGET >= HWY_SSSE3
5641 return detail::MaxU(a, b);
5642#else
5643 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
5644#endif
5645}
5646template <size_t N>
5647HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
5648#if HWY_TARGET >= HWY_SSSE3
5649 return detail::MaxU(a, b);
5650#else
5651 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
5652#endif
5653}
5654template <size_t N>
5655HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
5656#if HWY_TARGET <= HWY_AVX3
5657 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
5658#else
5659 return detail::MaxU(a, b);
5660#endif
5661}
5662
5663// Signed
5664template <size_t N>
5665HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
5666#if HWY_TARGET >= HWY_SSSE3
5667 return IfThenElse(a < b, b, a);
5668#else
5669 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
5670#endif
5671}
5672template <size_t N>
5673HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
5674 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
5675}
5676template <size_t N>
5677HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
5678#if HWY_TARGET >= HWY_SSSE3
5679 return IfThenElse(a < b, b, a);
5680#else
5681 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
5682#endif
5683}
5684template <size_t N>
5685HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
5686#if HWY_TARGET <= HWY_AVX3
5687 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
5688#else
5689 return IfThenElse(a < b, b, a);
5690#endif
5691}
5692
5693// Float
5694#if HWY_HAVE_FLOAT16
5695template <size_t N>
5696HWY_API Vec128<float16_t, N> Max(Vec128<float16_t, N> a,
5697 Vec128<float16_t, N> b) {
5698 return Vec128<float16_t, N>{_mm_max_ph(a.raw, b.raw)};
5699}
5700#endif // HWY_HAVE_FLOAT16
5701template <size_t N>
5702HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
5703 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
5704}
5705template <size_t N>
5706HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
5707 return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
5708}
5709
5710// ================================================== MEMORY (3)
5711
5712// ------------------------------ Non-temporal stores
5713
5714// On clang6, we see incorrect code generated for _mm_stream_pi, so
5715// round even partial vectors up to 16 bytes.
5716template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
5717HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
5718 const RebindToUnsigned<decltype(d)> du; // for float16_t
5719 _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw);
5720}
5721template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
5722HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) {
5723 _mm_stream_ps(aligned, v.raw);
5724}
5725template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
5726HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) {
5727 _mm_stream_pd(aligned, v.raw);
5728}
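// Editorial example (not part of the original header): non-temporal stores
// bypass the cache; the destination should be 16-byte aligned, and a store
// fence is needed before other threads read the data. `out` is hypothetical;
// _mm_sfence is the SSE fence intrinsic.
//   const Full128<float> d;
//   Stream(v, d, out);  // out: float* aligned to 16 bytes
//   _mm_sfence();       // make the streamed data visible to other cores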
5729
5730// ------------------------------ Scatter
5731
5732// Work around warnings in the intrinsic definitions (passing -1 as a mask).
5733HWY_DIAGNOSTICS(push)
5734HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
5735
5736// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
5737using GatherIndex64 = long long int; // NOLINT(runtime/int)
5738static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
5739
5740#if HWY_TARGET <= HWY_AVX3
5741
5742#ifdef HWY_NATIVE_SCATTER
5743#undef HWY_NATIVE_SCATTER
5744#else
5745#define HWY_NATIVE_SCATTER
5746#endif
5747
5748namespace detail {
5749
5750template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
5751HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
5752 VI index) {
5753 if (d.MaxBytes() == 16) {
5754 _mm_i32scatter_epi32(base, index.raw, v.raw, kScale);
5755 } else {
5756 const __mmask8 mask = (1u << MaxLanes(d)) - 1;
5757 _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale);
5758 }
5759}
5760
5761template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
5762HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
5763 VI index) {
5764 if (d.MaxBytes() == 16) {
5765 _mm_i64scatter_epi64(base, index.raw, v.raw, kScale);
5766 } else {
5767 const __mmask8 mask = (1u << MaxLanes(d)) - 1;
5768 _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale);
5769 }
5770}
5771
5772template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
5773HWY_INLINE void NativeScatter128(VFromD<D> v, D d, float* HWY_RESTRICT base,
5774 VI index) {
5775 if (d.MaxBytes() == 16) {
5776 _mm_i32scatter_ps(base, index.raw, v.raw, kScale);
5777 } else {
5778 const __mmask8 mask = (1u << MaxLanes(d)) - 1;
5779 _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale);
5780 }
5781}
5782
5783template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
5784HWY_INLINE void NativeScatter128(VFromD<D> v, D d, double* HWY_RESTRICT base,
5785 VI index) {
5786 if (d.MaxBytes() == 16) {
5787 _mm_i64scatter_pd(base, index.raw, v.raw, kScale);
5788 } else {
5789 const __mmask8 mask = (1u << MaxLanes(d)) - 1;
5790 _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale);
5791 }
5792}
5793
5794template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
5795HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
5796 TFromD<D>* HWY_RESTRICT base, VI index) {
5797 // For partial vectors, ensure upper mask lanes are zero to prevent faults.
5798 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5799 _mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale);
5800}
5801
5802template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
5803HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
5804 TFromD<D>* HWY_RESTRICT base, VI index) {
5805 // For partial vectors, ensure upper mask lanes are zero to prevent faults.
5806 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5807 _mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale);
5808}
5809
5810template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
5811HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
5812 float* HWY_RESTRICT base, VI index) {
5813 // For partial vectors, ensure upper mask lanes are zero to prevent faults.
5814 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5815 _mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale);
5816}
5817
5818template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
5819HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
5820 double* HWY_RESTRICT base, VI index) {
5821 // For partial vectors, ensure upper mask lanes are zero to prevent faults.
5822 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5823 _mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale);
5824}
5825
5826} // namespace detail
5827
5828template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5829HWY_API void ScatterOffset(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
5830 VFromD<RebindToSigned<D>> offset) {
5831 return detail::NativeScatter128<1>(v, d, base, offset);
5832}
5833template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5834HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
5835 VFromD<RebindToSigned<D>> index) {
5836 return detail::NativeScatter128<sizeof(TFromD<D>)>(v, d, base, index);
5837}
5838template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5839HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
5840 TFromD<D>* HWY_RESTRICT base,
5841 VFromD<RebindToSigned<D>> index) {
5842 return detail::NativeMaskedScatter128<sizeof(TFromD<D>)>(v, m, d, base,
5843 index);
5844}
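// Editorial example (not part of the original header): ScatterIndex writes
// lane i of v to base[index[i]] (indices in units of lanes), whereas
// ScatterOffset takes byte offsets. Names below are hypothetical.
//   const Full128<float> d;
//   const RebindToSigned<decltype(d)> di;
//   const auto idx = Reverse(di, Iota(di, 0));  // {3, 2, 1, 0}
//   ScatterIndex(v, d, base, idx);  // base[3]=v[0], base[2]=v[1], ...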
5845
5846#endif // HWY_TARGET <= HWY_AVX3
5847
5848// ------------------------------ Gather (Load/Store)
5849
5850#if HWY_TARGET <= HWY_AVX2
5851
5852#ifdef HWY_NATIVE_GATHER
5853#undef HWY_NATIVE_GATHER
5854#else
5855#define HWY_NATIVE_GATHER
5856#endif
5857
5858namespace detail {
5859
5860template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
5861HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
5862 Vec128<int32_t, N> indices) {
5863 return Vec128<T, N>{_mm_i32gather_epi32(
5864 reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
5865}
5866
5867template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
5868HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
5869 Vec128<int64_t, N> indices) {
5870 return Vec128<T, N>{_mm_i64gather_epi64(
5871 reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
5872}
5873
5874template <int kScale, size_t N>
5875HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base,
5876 Vec128<int32_t, N> indices) {
5877 return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)};
5878}
5879
5880template <int kScale, size_t N>
5881HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base,
5882 Vec128<int64_t, N> indices) {
5883 return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)};
5884}
5885
5886template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
5887HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
5888 Mask128<T, N> m,
5889 const T* HWY_RESTRICT base,
5890 Vec128<int32_t, N> indices) {
5891#if HWY_TARGET <= HWY_AVX3
5892 return Vec128<T, N>{_mm_mmask_i32gather_epi32(
5893 no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
5894 kScale)};
5895#else
5896 return Vec128<T, N>{
5897 _mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base),
5898 indices.raw, m.raw, kScale)};
5899#endif
5900}
5901
5902template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
5903HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
5904 Mask128<T, N> m,
5905 const T* HWY_RESTRICT base,
5906 Vec128<int64_t, N> indices) {
5907#if HWY_TARGET <= HWY_AVX3
5908 return Vec128<T, N>{_mm_mmask_i64gather_epi64(
5909 no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
5910 kScale)};
5911#else
5912 return Vec128<T, N>{_mm_mask_i64gather_epi64(
5913 no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw,
5914 kScale)};
5915#endif
5916}
5917
5918template <int kScale, size_t N>
5919HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128(
5920 Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base,
5921 Vec128<int32_t, N> indices) {
5922#if HWY_TARGET <= HWY_AVX3
5923 return Vec128<float, N>{
5924 _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
5925#else
5926 return Vec128<float, N>{
5927 _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
5928#endif
5929}
5930
5931template <int kScale, size_t N>
5932HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
5933 Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base,
5934 Vec128<int64_t, N> indices) {
5935#if HWY_TARGET <= HWY_AVX3
5936 return Vec128<double, N>{
5937 _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
5938#else
5939 return Vec128<double, N>{
5940 _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
5941#endif
5942}
5943
5944} // namespace detail
5945
5946template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5947HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
5948 VFromD<RebindToSigned<D>> offsets) {
5949 const RebindToSigned<decltype(d)> di;
5950 (void)di; // for HWY_DASSERT
5951 HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
5952 return detail::NativeGather128<1>(base, offsets);
5953}
5954
5955template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
5956HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
5957 VFromD<RebindToSigned<D>> indices) {
5958 const RebindToSigned<decltype(d)> di;
5959 (void)di; // for HWY_DASSERT
5960 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
5961 return detail::NativeGather128<sizeof(T)>(base, indices);
5962}
5963
5964template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
5965HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
5966 const T* HWY_RESTRICT base,
5967 VFromD<RebindToSigned<D>> indices) {
5968 // For partial vectors, ensure upper mask lanes are zero to prevent faults.
5969 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5970
5971 const RebindToSigned<decltype(d)> di;
5972 (void)di; // for HWY_DASSERT
5973 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
5974 return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
5975}
5976
5977// Generic for all vector lengths.
5978template <class D>
5979HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
5980 const TFromD<D>* HWY_RESTRICT base,
5981 VFromD<RebindToSigned<D>> indices) {
5982 return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
5983}
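// Editorial example (not part of the original header): MaskedGatherIndex
// only dereferences base + index in lanes where the mask is true and returns
// zero elsewhere, so it can guard against out-of-range indices. Names are
// hypothetical.
//   const Full128<int32_t> d;
//   const auto valid = Lt(idx, Set(d, table_size));
//   const auto vals = MaskedGatherIndex(valid, d, table, idx);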
5984
5985#endif // HWY_TARGET <= HWY_AVX2
5986
5987HWY_DIAGNOSTICS(pop)
5988
5989// ================================================== SWIZZLE (2)
5990
5991// ------------------------------ LowerHalf
5992
5993template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
5994HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
5995 return VFromD<D>{v.raw};
5996}
5997template <typename T, size_t N>
5998HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
5999 return Vec128<T, N / 2>{v.raw};
6000}
6001
6002// ------------------------------ ShiftLeftBytes
6003
6004template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6005HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
6006 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
6007 const RebindToUnsigned<decltype(d)> du;
6008 return BitCast(
6009 d, VFromD<decltype(du)>{_mm_slli_si128(BitCast(du, v).raw, kBytes)});
6010}
6011
6012// Generic for all vector lengths.
6013template <int kBytes, class V>
6014HWY_API V ShiftLeftBytes(const V v) {
6015 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
6016}
6017
6018// ------------------------------ ShiftLeftLanes
6019
6020// Generic for all vector lengths.
6021template <int kLanes, class D>
6022HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) {
6023 const Repartition<uint8_t, decltype(d)> d8;
6024 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
6025}
6026
6027// Generic for all vector lengths.
6028template <int kLanes, class V>
6029HWY_API V ShiftLeftLanes(const V v) {
6030 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
6031}
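// Editorial note (not part of the original header): ShiftLeftLanes moves
// lanes toward higher indices and shifts in zeros. Worked example with
// hypothetical int32 lanes:
//   v = {1, 2, 3, 4}
//   ShiftLeftLanes<1>(v) == {0, 1, 2, 3}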
6032
6033// ------------------------------ ShiftRightBytes
6034template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6035HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
6036 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
6037 const RebindToUnsigned<decltype(d)> du;
6038 // For partial vectors, clear upper lanes so we shift in zeros.
6039 if (d.MaxBytes() != 16) {
6040 const Full128<TFromD<D>> dfull;
6041 const VFromD<decltype(dfull)> vfull{v.raw};
6042 v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
6043 }
6044 return BitCast(
6045 d, VFromD<decltype(du)>{_mm_srli_si128(BitCast(du, v).raw, kBytes)});
6046}
6047
6048// ------------------------------ ShiftRightLanes
6049// Generic for all vector lengths.
6050template <int kLanes, class D>
6051HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
6052 const Repartition<uint8_t, decltype(d)> d8;
6053 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
6054 return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
6055}
6056
6057// ------------------------------ UpperHalf (ShiftRightBytes)
6058
6059// Full input: copy hi into lo (smaller instruction encoding than shifts).
6060template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
6061HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
6062 const Twice<RebindToUnsigned<decltype(d)>> dut;
6063 using VUT = VFromD<decltype(dut)>; // for float16_t
6064 const VUT vut = BitCast(dut, v);
6065 return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)}));
6066}
6067template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
6068HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
6069 return Vec64<float>{_mm_movehl_ps(v.raw, v.raw)};
6070}
6071template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
6072HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
6073 return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
6074}
6075
6076// Partial
6077template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
6078HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
6079 return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
6080}
6081
6082// ------------------------------ ExtractLane (UpperHalf)
6083
6084namespace detail {
6085
6086template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
6087HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
6088 static_assert(kLane < N, "Lane index out of bounds");
6089#if HWY_TARGET >= HWY_SSSE3
6090 const int pair = _mm_extract_epi16(v.raw, kLane / 2);
6091 constexpr int kShift = kLane & 1 ? 8 : 0;
6092 return static_cast<T>((pair >> kShift) & 0xFF);
6093#else
6094 return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
6095#endif
6096}
6097
6098template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
6099HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
6100 static_assert(kLane < N, "Lane index out of bounds");
6101 const DFromV<decltype(v)> d;
6102 const RebindToUnsigned<decltype(d)> du;
6103 const uint16_t lane = static_cast<uint16_t>(
6104 _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF);
6105 return BitCastScalar<T>(lane);
6106}
6107
6108template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
6109HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
6110 static_assert(kLane < N, "Lane index out of bounds");
6111#if HWY_TARGET >= HWY_SSSE3
6112 return static_cast<T>(_mm_cvtsi128_si32(
6113 (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane)));
6114#else
6115 return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
6116#endif
6117}
6118
6119template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
6120HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
6121 static_assert(kLane < N, "Lane index out of bounds");
6122#if HWY_ARCH_X86_32
6123 alignas(16) T lanes[2];
6124 Store(v, DFromV<decltype(v)>(), lanes);
6125 return lanes[kLane];
6126#elif HWY_TARGET >= HWY_SSSE3
6127 return static_cast<T>(
6128 _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE)));
6129#else
6130 return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
6131#endif
6132}
6133
6134template <size_t kLane, size_t N>
6135HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
6136 static_assert(kLane < N, "Lane index out of bounds");
6137#if HWY_TARGET >= HWY_SSSE3
6138 return _mm_cvtss_f32((kLane == 0) ? v.raw
6139 : _mm_shuffle_ps(v.raw, v.raw, kLane));
6140#else
6141 // Bug in the intrinsic, returns int but should be float.
6142 const int32_t bits = _mm_extract_ps(v.raw, kLane);
6143 return BitCastScalar<float>(bits);
6144#endif
6145}
6146
6147// There is no extract_pd; two overloads because there is no UpperHalf for N=1.
6148template <size_t kLane>
6149HWY_INLINE double ExtractLane(const Vec64<double> v) {
6150 static_assert(kLane == 0, "Lane index out of bounds");
6151 return GetLane(v);
6152}
6153
6154template <size_t kLane>
6155HWY_INLINE double ExtractLane(const Vec128<double> v) {
6156 static_assert(kLane < 2, "Lane index out of bounds");
6157 const Half<DFromV<decltype(v)>> dh;
6158 return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
6159}
6160
6161} // namespace detail
6162
6163// Requires one overload per vector length because ExtractLane<3> may be a
6164// compile error if it calls _mm_extract_epi64.
6165template <typename T>
6166HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
6167 HWY_DASSERT(i == 0);
6168 (void)i;
6169 return GetLane(v);
6170}
6171
6172template <typename T>
6173HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
6174#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6175 if (__builtin_constant_p(i)) {
6176 switch (i) {
6177 case 0:
6178 return detail::ExtractLane<0>(v);
6179 case 1:
6180 return detail::ExtractLane<1>(v);
6181 }
6182 }
6183#endif
6184 alignas(16) T lanes[2];
6185 Store(v, DFromV<decltype(v)>(), lanes);
6186 return lanes[i];
6187}
6188
6189template <typename T>
6190HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
6191#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6192 if (__builtin_constant_p(i)) {
6193 switch (i) {
6194 case 0:
6195 return detail::ExtractLane<0>(v);
6196 case 1:
6197 return detail::ExtractLane<1>(v);
6198 case 2:
6199 return detail::ExtractLane<2>(v);
6200 case 3:
6201 return detail::ExtractLane<3>(v);
6202 }
6203 }
6204#endif
6205 alignas(16) T lanes[4];
6206 Store(v, DFromV<decltype(v)>(), lanes);
6207 return lanes[i];
6208}
6209
6210template <typename T>
6211HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
6212#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6213 if (__builtin_constant_p(i)) {
6214 switch (i) {
6215 case 0:
6216 return detail::ExtractLane<0>(v);
6217 case 1:
6218 return detail::ExtractLane<1>(v);
6219 case 2:
6220 return detail::ExtractLane<2>(v);
6221 case 3:
6222 return detail::ExtractLane<3>(v);
6223 case 4:
6224 return detail::ExtractLane<4>(v);
6225 case 5:
6226 return detail::ExtractLane<5>(v);
6227 case 6:
6228 return detail::ExtractLane<6>(v);
6229 case 7:
6230 return detail::ExtractLane<7>(v);
6231 }
6232 }
6233#endif
6234 alignas(16) T lanes[8];
6235 Store(v, DFromV<decltype(v)>(), lanes);
6236 return lanes[i];
6237}
6238
6239template <typename T>
6240HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
6241#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6242 if (__builtin_constant_p(i)) {
6243 switch (i) {
6244 case 0:
6245 return detail::ExtractLane<0>(v);
6246 case 1:
6247 return detail::ExtractLane<1>(v);
6248 case 2:
6249 return detail::ExtractLane<2>(v);
6250 case 3:
6251 return detail::ExtractLane<3>(v);
6252 case 4:
6253 return detail::ExtractLane<4>(v);
6254 case 5:
6255 return detail::ExtractLane<5>(v);
6256 case 6:
6257 return detail::ExtractLane<6>(v);
6258 case 7:
6259 return detail::ExtractLane<7>(v);
6260 case 8:
6261 return detail::ExtractLane<8>(v);
6262 case 9:
6263 return detail::ExtractLane<9>(v);
6264 case 10:
6265 return detail::ExtractLane<10>(v);
6266 case 11:
6267 return detail::ExtractLane<11>(v);
6268 case 12:
6269 return detail::ExtractLane<12>(v);
6270 case 13:
6271 return detail::ExtractLane<13>(v);
6272 case 14:
6273 return detail::ExtractLane<14>(v);
6274 case 15:
6275 return detail::ExtractLane<15>(v);
6276 }
6277 }
6278#endif
6279 alignas(16) T lanes[16];
6280 Store(v, DFromV<decltype(v)>(), lanes);
6281 return lanes[i];
6282}
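// Editorial note (not part of the original header): when i is a compile-time
// constant on GCC/Clang, the switches above let the compiler emit the
// single-instruction extract; otherwise the lanes are spilled to the stack
// and indexed. Hypothetical usage:
//   const uint8_t third = ExtractLane(v, 2);            // constant index
//   const uint8_t any = ExtractLane(v, runtime_index);  // store + load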
6283
6284// ------------------------------ InsertLane (UpperHalf)
6285
6286namespace detail {
6287
6288template <class V>
6289HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) {
6290 const DFromV<decltype(v)> d;
6291
6292#if HWY_TARGET <= HWY_AVX3
6293 using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw);
6294 const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)};
6295#else
6296 const RebindToUnsigned<decltype(d)> du;
6297 using TU = TFromD<decltype(du)>;
6298 const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i)));
6299#endif
6300
6301 return IfThenElse(mask, Set(d, t), v);
6302}
6303
6304template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
6305HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
6306 static_assert(kLane < N, "Lane index out of bounds");
6307#if HWY_TARGET >= HWY_SSSE3
6308 return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
6309#else
6310 return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
6311#endif
6312}
6313
6314template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
6315HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
6316 static_assert(kLane < N, "Lane index out of bounds");
6317 const DFromV<decltype(v)> d;
6318 const RebindToUnsigned<decltype(d)> du;
6319 const uint16_t bits = BitCastScalar<uint16_t>(t);
6320 return BitCast(d, VFromD<decltype(du)>{
6321 _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)});
6322}
6323
6324template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
6325HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
6326 static_assert(kLane < N, "Lane index out of bounds");
6327#if HWY_TARGET >= HWY_SSSE3
6328 return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
6329#else
6330 const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
6331 return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
6332#endif
6333}
6334
6335template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
6336HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
6337 static_assert(kLane < N, "Lane index out of bounds");
6338#if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32
6339 const DFromV<decltype(v)> d;
6340 const RebindToFloat<decltype(d)> df;
6341 const auto vt = BitCast(df, Set(d, t));
6342 if (kLane == 0) {
6343 return BitCast(
6344 d, Vec128<double, N>{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)});
6345 }
6346 return BitCast(
6347 d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)});
6348#else
6349 const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
6350 return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
6351#endif
6352}
6353
6354template <size_t kLane, size_t N>
6355HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
6356 static_assert(kLane < N, "Lane index out of bounds");
6357#if HWY_TARGET >= HWY_SSSE3
6358 return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
6359#else
6360 return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
6361#endif
6362}
6363
6364// There is no insert_pd; two overloads because there is no UpperHalf for N=1.
6365template <size_t kLane>
6366HWY_INLINE Vec64<double> InsertLane(const Vec64<double> v, double t) {
6367 static_assert(kLane == 0, "Lane index out of bounds");
6368 return Set(DFromV<decltype(v)>(), t);
6369}
6370
6371template <size_t kLane>
6372HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
6373 static_assert(kLane < 2, "Lane index out of bounds");
6374 const DFromV<decltype(v)> d;
6375 const Vec128<double> vt = Set(d, t);
6376 if (kLane == 0) {
6377 return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
6378 }
6379 return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
6380}
6381
6382} // namespace detail
6383
6384// Requires one overload per vector length because InsertLane<3> may be a
6385// compile error if it calls _mm_insert_epi64.
6386
6387template <typename T>
6388HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
6389 HWY_DASSERT(i == 0);
6390 (void)i;
6391 return Set(DFromV<decltype(v)>(), t);
6392}
6393
6394template <typename T>
6395HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
6396#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6397 if (__builtin_constant_p(i)) {
6398 switch (i) {
6399 case 0:
6400 return detail::InsertLane<0>(v, t);
6401 case 1:
6402 return detail::InsertLane<1>(v, t);
6403 }
6404 }
6405#endif
6406 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6407}
6408
6409template <typename T>
6410HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
6411#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6412 if (__builtin_constant_p(i)) {
6413 switch (i) {
6414 case 0:
6415 return detail::InsertLane<0>(v, t);
6416 case 1:
6417 return detail::InsertLane<1>(v, t);
6418 case 2:
6419 return detail::InsertLane<2>(v, t);
6420 case 3:
6421 return detail::InsertLane<3>(v, t);
6422 }
6423 }
6424#endif
6425 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6426}
6427
6428template <typename T>
6429HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
6430#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6431 if (__builtin_constant_p(i)) {
6432 switch (i) {
6433 case 0:
6434 return detail::InsertLane<0>(v, t);
6435 case 1:
6436 return detail::InsertLane<1>(v, t);
6437 case 2:
6438 return detail::InsertLane<2>(v, t);
6439 case 3:
6440 return detail::InsertLane<3>(v, t);
6441 case 4:
6442 return detail::InsertLane<4>(v, t);
6443 case 5:
6444 return detail::InsertLane<5>(v, t);
6445 case 6:
6446 return detail::InsertLane<6>(v, t);
6447 case 7:
6448 return detail::InsertLane<7>(v, t);
6449 }
6450 }
6451#endif
6452 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6453}
6454
6455template <typename T>
6456HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
6457#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
6458 if (__builtin_constant_p(i)) {
6459 switch (i) {
6460 case 0:
6461 return detail::InsertLane<0>(v, t);
6462 case 1:
6463 return detail::InsertLane<1>(v, t);
6464 case 2:
6465 return detail::InsertLane<2>(v, t);
6466 case 3:
6467 return detail::InsertLane<3>(v, t);
6468 case 4:
6469 return detail::InsertLane<4>(v, t);
6470 case 5:
6471 return detail::InsertLane<5>(v, t);
6472 case 6:
6473 return detail::InsertLane<6>(v, t);
6474 case 7:
6475 return detail::InsertLane<7>(v, t);
6476 case 8:
6477 return detail::InsertLane<8>(v, t);
6478 case 9:
6479 return detail::InsertLane<9>(v, t);
6480 case 10:
6481 return detail::InsertLane<10>(v, t);
6482 case 11:
6483 return detail::InsertLane<11>(v, t);
6484 case 12:
6485 return detail::InsertLane<12>(v, t);
6486 case 13:
6487 return detail::InsertLane<13>(v, t);
6488 case 14:
6489 return detail::InsertLane<14>(v, t);
6490 case 15:
6491 return detail::InsertLane<15>(v, t);
6492 }
6493 }
6494#endif
6495 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
6496}
6497
6498// ------------------------------ CombineShiftRightBytes
6499
6500#if HWY_TARGET == HWY_SSE2
6501template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
6502HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
6503 static_assert(0 < kBytes && kBytes < 16, "kBytes invalid");
6504 return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
6505}
6506template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6507HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
6508 constexpr size_t kSize = d.MaxBytes();
6509 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
6510
6511 const Twice<decltype(d)> dt;
6512 return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw};
6513}
6514#else
6515template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
6516HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
6517 const Repartition<uint8_t, decltype(d)> d8;
6518 return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
6519 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
6520}
6521
6522template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6523HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
6524 constexpr size_t kSize = d.MaxBytes();
6525 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
6526 const Repartition<uint8_t, decltype(d)> d8;
6527 using V8 = Vec128<uint8_t>;
6528 const DFromV<V8> dfull8;
6529 const Repartition<TFromD<D>, decltype(dfull8)> dfull;
6530 const V8 hi8{BitCast(d8, hi).raw};
6531 // Move into most-significant bytes
6532 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
6533 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
6534 return VFromD<D>{BitCast(dfull, r).raw};
6535}
6536#endif
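// Editorial note (not part of the original header): CombineShiftRightBytes<k>
// returns the lower 16 bytes of the concatenation {hi, lo} shifted right by
// k bytes: result[i] = lo[i + k] for i + k < 16, else hi[i + k - 16].
// Worked example with hypothetical byte vectors and k = 4:
//   lo = {0..15}, hi = {16..31}
//   CombineShiftRightBytes<4>(d, hi, lo) == {4..19}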
6537
6538// ------------------------------ Broadcast/splat any lane
6539
6540template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
6541HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
6542 const DFromV<decltype(v)> d;
6543 const RebindToUnsigned<decltype(d)> du;
6544 using VU = VFromD<decltype(du)>;
6545 const VU vu = BitCast(du, v); // for float16_t
6546 static_assert(0 <= kLane && kLane < N, "Invalid lane");
6547 if (kLane < 4) {
6548 const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
6549 return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)});
6550 } else {
6551 const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
6552 return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)});
6553 }
6554}
6555
6556template <int kLane, typename T, size_t N, HWY_IF_UI32(T)>
6557HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
6558 static_assert(0 <= kLane && kLane < N, "Invalid lane");
6559 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
6560}
6561
6562template <int kLane, typename T, size_t N, HWY_IF_UI64(T)>
6563HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
6564 static_assert(0 <= kLane && kLane < N, "Invalid lane");
6565 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
6566}
6567
6568template <int kLane, size_t N>
6569HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
6570 static_assert(0 <= kLane && kLane < N, "Invalid lane");
6571 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
6572}
6573
6574template <int kLane, size_t N>
6575HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
6576 static_assert(0 <= kLane && kLane < N, "Invalid lane");
6577 return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
6578}
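// Editorial note (not part of the original header): 0x55 * kLane replicates
// the 2-bit lane index into all four fields of the shuffle control byte.
// For example kLane = 2 gives 0xAA = 0b10'10'10'10, so every output lane
// reads input lane 2; kLane = 0 gives 0x00.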
6579
6580// ------------------------------ TableLookupLanes (Shuffle01)
6581
6582// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
6583template <typename T, size_t N = 16 / sizeof(T)>
6584struct Indices128 {
6585 __m128i raw;
6586};
6587
6588template <class D, typename T = TFromD<D>, typename TI, size_t kN,
6589 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)>
6590HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
6591 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
6592#if HWY_IS_DEBUG_BUILD
6593 const Rebind<TI, decltype(d)> di;
6594 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
6595 AllTrue(di, Lt(vec, Set(di, kN * 2))));
6596#endif
6597
6598 // No change as byte indices are always used for 8-bit lane types
6599 (void)d;
6600 return Indices128<T, kN>{vec.raw};
6601}
6602
6603template <class D, typename T = TFromD<D>, typename TI, size_t kN,
6604 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)>
6605HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
6606 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
6607#if HWY_IS_DEBUG_BUILD
6608 const Rebind<TI, decltype(d)> di;
6609 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
6610 AllTrue(di, Lt(vec, Set(di, kN * 2))));
6611#endif
6612
6613#if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
6614 (void)d;
6615 return Indices128<T, kN>{vec.raw};
6616#else // SSSE3, SSE4, or AVX2
6617 const Repartition<uint8_t, decltype(d)> d8;
6618 using V8 = VFromD<decltype(d8)>;
6619 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
6620 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
6621
6622 // Broadcast each lane index to both bytes of T
6623 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
6624 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
6625 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
6626
6627 // Shift to bytes
6628 const Repartition<uint16_t, decltype(d)> d16;
6629 const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices)));
6630
6631 return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
6632#endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
6633}
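// Editorial note (not part of the original header): on SSSE3/SSE4/AVX2 the
// lane indices above are converted into byte indices for TableLookupBytes.
// For 16-bit lanes, a hypothetical index 3 is first broadcast to both bytes
// of its lane ({3, 3}), doubled by ShiftLeft<1> to {6, 6}, and then the
// {0, 1} byte offsets are added, yielding the byte pair {6, 7}.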
6634
6635template <class D, typename T = TFromD<D>, typename TI, size_t kN,
6636 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)>
6637HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
6638 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
6639#if HWY_IS_DEBUG_BUILD
6640 const Rebind<TI, decltype(d)> di;
6641 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
6642 AllTrue(di, Lt(vec, Set(di, kN * 2))));
6643#endif
6644
6645#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
6646 (void)d;
6647 return Indices128<T, kN>{vec.raw};
6648#else
6649 const Repartition<uint8_t, decltype(d)> d8;
6650 using V8 = VFromD<decltype(d8)>;
6651 alignas(16) static constexpr uint8_t kByteOffsets[16] = {
6652 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
6653
6654 // Broadcast each lane index to all 4 bytes of T
6655 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
6656 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
6657 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
6658
6659 // Shift to bytes
6660 const Repartition<uint16_t, decltype(d)> d16;
6661 const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
6662
6663 return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
6664#endif
6665}
6666
6667template <class D, typename T = TFromD<D>, typename TI, size_t kN,
6668 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)>
6669HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
6670 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
6671#if HWY_IS_DEBUG_BUILD
6672 const Rebind<TI, decltype(d)> di;
6673 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
6674 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2)))));
6675#else
6676 (void)d;
6677#endif
6678
6679 // No change - even without AVX3, we can shuffle+blend.
6680 return Indices128<T, kN>{vec.raw};
6681}
6682
6683template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
6684HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
6685 D d, const TI* idx) {
6686 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
6687 const Rebind<TI, decltype(d)> di;
6688 return IndicesFromVec(d, LoadU(di, idx));
6689}
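// Editorial example (not part of the original header): SetTableIndices plus
// TableLookupLanes permutes lanes within a vector. Hypothetical example
// reversing four int32 lanes:
//   const Full128<int32_t> d;
//   const int32_t rev[4] = {3, 2, 1, 0};
//   const auto idx = SetTableIndices(d, rev);
//   const auto reversed = TableLookupLanes(v, idx);  // {v[3], v[2], v[1], v[0]}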
6690
6691template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
6692HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
6693 return TableLookupBytes(v, Vec128<T, N>{idx.raw});
6694}
6695
6696template <typename T, size_t N, HWY_IF_UI16(T)>
6697HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
6698#if HWY_TARGET <= HWY_AVX3
6699 return {_mm_permutexvar_epi16(idx.raw, v.raw)};
6700#elif HWY_TARGET == HWY_SSE2
6701#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6702 typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16)));
6703 return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
6704 __builtin_shuffle(reinterpret_cast<GccU16RawVectType>(v.raw),
6705 reinterpret_cast<GccU16RawVectType>(idx.raw)))};
6706#else
6707 const Full128<T> d_full;
6708 alignas(16) T src_lanes[8];
6709 alignas(16) uint16_t indices[8];
6710 alignas(16) T result_lanes[8];
6711
6712 Store(Vec128<T>{v.raw}, d_full, src_lanes);
6713 _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw);
6714
6715 for (int i = 0; i < 8; i++) {
6716 result_lanes[i] = src_lanes[indices[i] & 7u];
6717 }
6718
6719 return Vec128<T, N>{Load(d_full, result_lanes).raw};
6720#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6721#else
6722 return TableLookupBytes(v, Vec128<T, N>{idx.raw});
6723#endif
6724}
6725
6726#if HWY_HAVE_FLOAT16
6727template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 2)>
6728HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v,
6729 Indices128<float16_t, N> idx) {
6730 return {_mm_permutexvar_ph(idx.raw, v.raw)};
6731}
6732#endif // HWY_HAVE_FLOAT16
6733
6734template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
6735HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
6736#if HWY_TARGET <= HWY_AVX2
6737 const DFromV<decltype(v)> d;
6738 const RebindToFloat<decltype(d)> df;
6739 const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
6740 return BitCast(d, perm);
6741#elif HWY_TARGET == HWY_SSE2
6742#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6743 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
6744 return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
6745 __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v.raw),
6746 reinterpret_cast<GccU32RawVectType>(idx.raw)))};
6747#else
6748 const Full128<T> d_full;
6749 alignas(16) T src_lanes[4];
6750 alignas(16) uint32_t indices[4];
6751 alignas(16) T result_lanes[4];
6752
6753 Store(Vec128<T>{v.raw}, d_full, src_lanes);
6754 _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw);
6755
6756 for (int i = 0; i < 4; i++) {
6757 result_lanes[i] = src_lanes[indices[i] & 3u];
6758 }
6759
6760 return Vec128<T, N>{Load(d_full, result_lanes).raw};
6761#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6762#else // SSSE3 or SSE4
6763 return TableLookupBytes(v, Vec128<T, N>{idx.raw});
6764#endif
6765}
6766
6767#if HWY_TARGET <= HWY_SSSE3
6768template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
6769HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
6770 Indices128<float, N> idx) {
6771#if HWY_TARGET <= HWY_AVX2
6772 return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
6773#else // SSSE3 or SSE4
6774 const DFromV<decltype(v)> df;
6775 const RebindToSigned<decltype(df)> di;
6776 return BitCast(df,
6777 TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
6778#endif // HWY_TARGET <= HWY_AVX2
6779}
6780#endif // HWY_TARGET <= HWY_SSSE3
6781
6782// Single lane: no change
6783template <typename T>
6784HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
6785 Indices128<T, 1> /* idx */) {
6786 return v;
6787}
6788
6789template <typename T, HWY_IF_UI64(T)>
6790HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
6791 const DFromV<decltype(v)> d;
6792 Vec128<int64_t> vidx{idx.raw};
6793#if HWY_TARGET <= HWY_AVX2
6794 // There is no _mm_permute[x]var_epi64.
6795 vidx += vidx; // bit1 is the decider (unusual)
6796 const RebindToFloat<decltype(d)> df;
6797 return BitCast(
6798 d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
6799#else
6800 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
6801 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
6802 // to obtain an all-zero or all-one mask.
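 // Illustrative (added commentary): for vidx = {0, 1} (identity), each lane
 // computes (idx ^ lane) - 1 = -1, whose set sign bit makes MaskFromVec true,
 // so v is kept; for vidx = {1, 0}, the XOR is 1 and 1 - 1 = 0, so the swapped
 // Shuffle01(v) is chosen instead.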
6803 const RebindToSigned<decltype(d)> di;
6804 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
6805 const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
6806 return IfThenElse(mask_same, v, Shuffle01(v));
6807#endif
6808}
6809
6810HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
6811 Indices128<double> idx) {
6812 Vec128<int64_t> vidx{idx.raw};
6813#if HWY_TARGET <= HWY_AVX2
6814 vidx += vidx; // bit1 is the decider (unusual)
6815 return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
6816#else
6817 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
6818 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
6819 // to obtain an all-zero or all-one mask.
6820 const DFromV<decltype(v)> d;
6821 const RebindToSigned<decltype(d)> di;
6822 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
6823 const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
6824 return IfThenElse(mask_same, v, Shuffle01(v));
6825#endif
6826}
6827
6828// ------------------------------ ReverseBlocks
6829
6830// Single block: no change
6831template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6832HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
6833 return v;
6834}
6835
6836// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
6837
6838// Single lane: no change
6839template <class D, HWY_IF_LANES_D(D, 1)>
6840HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
6841 return v;
6842}
6843
6844// 32-bit x2: shuffle
6845template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
6846HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
6847 return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw};
6848}
6849
6850// 64-bit x2: shuffle
6851template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
6852HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
6853 return Shuffle01(v);
6854}
6855
6856// 32-bit x4: shuffle
6857template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
6858HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
6859 return Shuffle0123(v);
6860}
6861
6862// 16-bit
6863template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
6864 HWY_IF_LANES_GT_D(D, 1)>
6865HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
6866 const RebindToUnsigned<decltype(d)> du;
6867 using VU = VFromD<decltype(du)>;
6868 const VU vu = BitCast(du, v); // for float16_t
6869 constexpr size_t kN = MaxLanes(d);
6870 if (kN == 1) return v;
6871 if (kN == 2) {
6872 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))});
6873 }
6874 if (kN == 4) {
6875 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
6876 }
6877
6878#if HWY_TARGET == HWY_SSE2
6879 const VU rev4{
6880 _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
6881 _MM_SHUFFLE(0, 1, 2, 3))};
6882 return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))});
6883#else
6884 const RebindToSigned<decltype(d)> di;
6885 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6886 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
6887 return BitCast(d, TableLookupBytes(v, shuffle));
6888#endif
6889}
6890
6891template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1),
6892 HWY_IF_LANES_GT_D(D, 1)>
6893HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
6894 constexpr int kN = static_cast<int>(MaxLanes(d));
6895 if (kN == 1) return v;
6896#if HWY_TARGET <= HWY_SSSE3
6897 // NOTE: Lanes with negative shuffle control mask values are set to zero.
6898 alignas(16) static constexpr int8_t kReverse[16] = {
6899 kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8,
6900 kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16};
6901 const RebindToSigned<decltype(d)> di;
6902 const VFromD<decltype(di)> idx = Load(di, kReverse);
6903 return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)};
6904#else
6905 const RepartitionToWide<decltype(d)> d16;
6906 return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v))));
6907#endif
6908}
6909
6910// ------------------------------ Reverse2
6911
6912// Single lane: no change
6913template <class D, HWY_IF_LANES_D(D, 1)>
6914HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
6915 return v;
6916}
6917
6918// Generic for all vector lengths (128-bit sufficient if SSE2).
6919template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
6920HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
6921#if HWY_TARGET <= HWY_AVX3
6922 const Repartition<uint32_t, decltype(d)> du32;
6923 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
6924#elif HWY_TARGET == HWY_SSE2
6925 const RebindToUnsigned<decltype(d)> du;
6926 using VU = VFromD<decltype(du)>;
6927 const VU vu = BitCast(du, v); // for float16_t
6928 constexpr size_t kN = MaxLanes(d);
6929 __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1));
6930 if (kN > 4) {
6931 shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1));
6932 }
6933 return BitCast(d, VU{shuf_result});
6934#else
6935 const RebindToSigned<decltype(d)> di;
6936 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6937 di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C);
6938 return BitCast(d, TableLookupBytes(v, shuffle));
6939#endif
6940}
6941
6942// Generic for all vector lengths.
6943template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)>
6944HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
6945 return Shuffle2301(v);
6946}
6947
6948// Generic for all vector lengths.
6949template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
6950HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
6951 return Shuffle01(v);
6952}
6953
6954// ------------------------------ Reverse4
6955
6956template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
6957HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
6958 const RebindToUnsigned<decltype(d)> du;
6959 using VU = VFromD<decltype(du)>;
6960 const VU vu = BitCast(du, v); // for float16_t
6961 // 4x 16-bit: a single shufflelo suffices.
6962 constexpr size_t kN = MaxLanes(d);
6963 if (kN <= 4) {
6964 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
6965 }
6966
6967#if HWY_TARGET == HWY_SSE2
6968 return BitCast(d, VU{_mm_shufflehi_epi16(
6969 _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
6970 _MM_SHUFFLE(0, 1, 2, 3))});
6971#else
6972 const RebindToSigned<decltype(d)> di;
6973 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6974 di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
6975 return BitCast(d, TableLookupBytes(v, shuffle));
6976#endif
6977}
6978
6979// Generic for all vector lengths.
6980template <class D, HWY_IF_T_SIZE_D(D, 4)>
6981HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
6982 return Shuffle0123(v);
6983}
6984
6985template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
6986HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
6987 HWY_ASSERT(0); // don't have 4 u64 lanes
6988}
6989
6990// ------------------------------ Reverse8
6991
6992template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
6993HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
6994#if HWY_TARGET == HWY_SSE2
6995 const RepartitionToWide<decltype(d)> dw;
6996 return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
6997#else
6998 const RebindToSigned<decltype(d)> di;
6999 const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
7000 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
7001 return BitCast(d, TableLookupBytes(v, shuffle));
7002#endif
7003}
7004
7005template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
7006 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
7007HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
7008 HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit
7009}
7010
7011// ------------------------------ ReverseBits in x86_512
7012
7013// ------------------------------ InterleaveUpper (UpperHalf)
7014
7015// Full
7016template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
7017HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
7018 return VFromD<D>{_mm_unpackhi_epi8(a.raw, b.raw)};
7019}
7020template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
7021HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
7022 const DFromV<decltype(a)> d;
7023 const RebindToUnsigned<decltype(d)> du;
7024 using VU = VFromD<decltype(du)>; // for float16_t
7025 return BitCast(
7026 d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
7027}
7028template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
7029HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
7030 return VFromD<D>{_mm_unpackhi_epi32(a.raw, b.raw)};
7031}
7032template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
7033HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
7034 return VFromD<D>{_mm_unpackhi_epi64(a.raw, b.raw)};
7035}
7036template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
7037HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
7038 return VFromD<D>{_mm_unpackhi_ps(a.raw, b.raw)};
7039}
7040template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
7041HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
7042 return VFromD<D>{_mm_unpackhi_pd(a.raw, b.raw)};
7043}
7044
7045// Partial
7046template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7047HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
7048 const Half<decltype(d)> d2;
7049 return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
7050 VFromD<D>{UpperHalf(d2, b).raw});
7051}
7052
7053// -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper)
7054
7055template <int kLane, class T, size_t N, HWY_IF_T_SIZE(T, 1)>
7056HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
7057 static_assert(0 <= kLane && kLane < N, "Invalid lane");
7058 const DFromV<decltype(v)> d;
7059
7060#if HWY_TARGET == HWY_SSE2
7061 const Full128<T> d_full;
7062 const Vec128<T> v_full{v.raw};
7063 const auto v_interleaved = (kLane < 8)
7064 ? InterleaveLower(d_full, v_full, v_full)
7065 : InterleaveUpper(d_full, v_full, v_full);
7066 return ResizeBitCast(
7067 d, Broadcast<kLane & 7>(BitCast(Full128<uint16_t>(), v_interleaved)));
7068#else
7069 return TableLookupBytes(v, Set(d, static_cast<T>(kLane)));
7070#endif
7071}
7072
7073// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
7074
7075// Same as Interleave*, except that the return lanes are double-width integers;
7076// this is necessary because the single-lane scalar cannot return two values.
7077// Generic for all vector lengths.
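// Illustrative (added commentary): for u8 inputs a = {1, 2, ...} and
// b = {101, 102, ...}, ZipLower returns u16 lanes {0x6501, 0x6602, ...} on
// little-endian x86, i.e. each (a_i, b_i) pair fused into one wide lane.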
7078template <class V, class DW = RepartitionToWide<DFromV<V>>>
7079HWY_API VFromD<DW> ZipLower(V a, V b) {
7080 return BitCast(DW(), InterleaveLower(a, b));
7081}
7082template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
7083HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
7084 return BitCast(dw, InterleaveLower(D(), a, b));
7085}
7086
7087template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
7088HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
7089 return BitCast(dw, InterleaveUpper(D(), a, b));
7090}
7091
7092// ------------------------------ Per4LaneBlockShuffle
7093namespace detail {
7094
7095#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
7096#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
7097#else
7098#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
7099#endif
7100
7101template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7102HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
7103 const uint32_t x2,
7104 const uint32_t x1,
7105 const uint32_t x0) {
7106 return ResizeBitCast(
7107 d, Vec128<uint32_t>{_mm_set_epi32(
7108 static_cast<int32_t>(x3), static_cast<int32_t>(x2),
7109 static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
7110}
7111
7112template <size_t kIdx3210, class V>
7113HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
7114 hwy::SizeTag<2> /*lane_size_tag*/,
7115 hwy::SizeTag<8> /*vect_size_tag*/, V v) {
7116 const DFromV<decltype(v)> d;
7117 const RebindToUnsigned<decltype(d)> du; // for float16_t
7118 return BitCast(d,
7119 VFromD<decltype(du)>{_mm_shufflelo_epi16(
7120 BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
7121}
7122
7123#if HWY_TARGET == HWY_SSE2
7124template <size_t kIdx3210, class V>
7125HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
7126 hwy::SizeTag<2> /*lane_size_tag*/,
7127 hwy::SizeTag<16> /*vect_size_tag*/, V v) {
7128 const DFromV<decltype(v)> d;
7129 const RebindToUnsigned<decltype(d)> du; // for float16_t
7130 constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
7131 return BitCast(
7132 d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
7133 _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)});
7134}
7135
7136template <size_t kIdx3210, size_t kVectSize, class V,
7137 hwy::EnableIf<(kVectSize == 4 || kVectSize == 8)>* = nullptr>
7138HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag,
7139 hwy::SizeTag<1> /*lane_size_tag*/,
7140 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
7141 V v) {
7142 const DFromV<decltype(v)> d;
7143 const RebindToUnsigned<decltype(d)> du;
7144 const Rebind<uint16_t, decltype(d)> du16;
7145 const RebindToSigned<decltype(du16)> di16;
7146
7147 const auto vu16 = PromoteTo(du16, BitCast(du, v));
7148 const auto shuf16_result = Per4LaneBlockShuffle(
7149 idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<kVectSize * 2>(), vu16);
7150 return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result)));
7151}
7152
7153template <size_t kIdx3210, size_t kVectSize, class V>
7154HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag,
7155 hwy::SizeTag<1> /*lane_size_tag*/,
7156 hwy::SizeTag<16> /*vect_size_tag*/, V v) {
7157 const DFromV<decltype(v)> d;
7158 const RebindToUnsigned<decltype(d)> du;
7159 const Repartition<uint16_t, decltype(d)> du16;
7160 const RebindToSigned<decltype(du16)> di16;
7161
7162 const auto zero = Zero(d);
7163 const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero));
7164 const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero));
7165
7166 const auto lo_shuf_result = Per4LaneBlockShuffle(
7167 idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16);
7168 const auto hi_shuf_result = Per4LaneBlockShuffle(
7169 idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16);
7170
7171 return BitCast(d, OrderedDemote2To(du, BitCast(di16, lo_shuf_result),
7172 BitCast(di16, hi_shuf_result)));
7173}
7174#endif
7175
7176template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
7177HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
7178 hwy::SizeTag<4> /*lane_size_tag*/,
7179 hwy::SizeTag<16> /*vect_size_tag*/, V v) {
7180 return V{_mm_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
7181}
7182
7183template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
7184HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
7185 hwy::SizeTag<4> /*lane_size_tag*/,
7186 hwy::SizeTag<16> /*vect_size_tag*/, V v) {
7187 return V{_mm_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))};
7188}
7189
7190} // namespace detail
7191
7192// ------------------------------ SlideUpLanes
7193
7194namespace detail {
7195
7196template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
7197HWY_INLINE V SlideUpLanes(V v, size_t amt) {
7198 const DFromV<decltype(v)> d;
7199 const Full64<uint64_t> du64;
7200 const auto vu64 = ResizeBitCast(du64, v);
7201 return ResizeBitCast(
7202 d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
7203}
7204
7205#if HWY_TARGET <= HWY_SSSE3
7206template <class V, HWY_IF_V_SIZE_V(V, 16)>
7207HWY_INLINE V SlideUpLanes(V v, size_t amt) {
7208 const DFromV<decltype(v)> d;
7209 const Repartition<uint8_t, decltype(d)> du8;
7210 const auto idx =
7211 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
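 // Added commentary: the wrapped Iota makes the first amt * sizeof(T) byte
 // indices >= 0x80, which TableLookupBytesOr0 zero-fills; the remaining
 // indices select byte i - amt * sizeof(T), i.e. the slid-up lanes.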
7212 return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
7213}
7214#else
7215template <class V, HWY_IF_V_SIZE_V(V, 16)>
7216HWY_INLINE V SlideUpLanes(V v, size_t amt) {
7217 const DFromV<decltype(v)> d;
7218 const Repartition<int32_t, decltype(d)> di32;
7219 const Repartition<uint64_t, decltype(d)> du64;
7220 constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
7221
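 // Added commentary: if amt covers at least one whole u64 lane, pre-shift the
 // vector left by 8 bytes (v_hi); v_lo is v_hi shifted a further 8 bytes so
 // that the final Or can pull in the bits crossing the 64-bit boundary via the
 // complementary right shift.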
7222 const auto vu64 = BitCast(du64, v);
7223 const auto v_hi = IfVecThenElse(
7224 BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
7225 BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64);
7226 const auto v_lo = ShiftLeftBytes<8>(du64, v_hi);
7227
7228 const int shl_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
7229 return BitCast(
7230 d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt)));
7231}
7232#endif
7233
7234} // namespace detail
7235
7236template <class D, HWY_IF_LANES_D(D, 1)>
7237HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
7238 return v;
7239}
7240
7241template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
7242HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
7243#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7244 if (__builtin_constant_p(amt)) {
7245 switch (amt) {
7246 case 0:
7247 return v;
7248 case 1:
7249 return ShiftLeftLanes<1>(d, v);
7250 }
7251 }
7252#else
7253 (void)d;
7254#endif
7255
7256 return detail::SlideUpLanes(v, amt);
7257}
7258
7259template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
7260HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
7261#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7262 if (__builtin_constant_p(amt)) {
7263 switch (amt) {
7264 case 0:
7265 return v;
7266 case 1:
7267 return ShiftLeftLanes<1>(d, v);
7268 case 2:
7269 return ShiftLeftLanes<2>(d, v);
7270 case 3:
7271 return ShiftLeftLanes<3>(d, v);
7272 }
7273 }
7274#else
7275 (void)d;
7276#endif
7277
7278 return detail::SlideUpLanes(v, amt);
7279}
7280
7281template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
7282HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
7283#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7284 if (__builtin_constant_p(amt)) {
7285 switch (amt) {
7286 case 0:
7287 return v;
7288 case 1:
7289 return ShiftLeftLanes<1>(d, v);
7290 case 2:
7291 return ShiftLeftLanes<2>(d, v);
7292 case 3:
7293 return ShiftLeftLanes<3>(d, v);
7294 case 4:
7295 return ShiftLeftLanes<4>(d, v);
7296 case 5:
7297 return ShiftLeftLanes<5>(d, v);
7298 case 6:
7299 return ShiftLeftLanes<6>(d, v);
7300 case 7:
7301 return ShiftLeftLanes<7>(d, v);
7302 }
7303 }
7304#else
7305 (void)d;
7306#endif
7307
7308 return detail::SlideUpLanes(v, amt);
7309}
7310
7311template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
7312HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
7313#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7314 if (__builtin_constant_p(amt)) {
7315 switch (amt) {
7316 case 0:
7317 return v;
7318 case 1:
7319 return ShiftLeftLanes<1>(d, v);
7320 case 2:
7321 return ShiftLeftLanes<2>(d, v);
7322 case 3:
7323 return ShiftLeftLanes<3>(d, v);
7324 case 4:
7325 return ShiftLeftLanes<4>(d, v);
7326 case 5:
7327 return ShiftLeftLanes<5>(d, v);
7328 case 6:
7329 return ShiftLeftLanes<6>(d, v);
7330 case 7:
7331 return ShiftLeftLanes<7>(d, v);
7332 case 8:
7333 return ShiftLeftLanes<8>(d, v);
7334 case 9:
7335 return ShiftLeftLanes<9>(d, v);
7336 case 10:
7337 return ShiftLeftLanes<10>(d, v);
7338 case 11:
7339 return ShiftLeftLanes<11>(d, v);
7340 case 12:
7341 return ShiftLeftLanes<12>(d, v);
7342 case 13:
7343 return ShiftLeftLanes<13>(d, v);
7344 case 14:
7345 return ShiftLeftLanes<14>(d, v);
7346 case 15:
7347 return ShiftLeftLanes<15>(d, v);
7348 }
7349 }
7350#else
7351 (void)d;
7352#endif
7353
7354 return detail::SlideUpLanes(v, amt);
7355}
7356
7357// ------------------------------ SlideDownLanes
7358
7359namespace detail {
7360
7361template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
7362HWY_INLINE V SlideDownLanes(V v, size_t amt) {
7363 const DFromV<decltype(v)> d;
7364 const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv;
7365 return BitCast(d,
7366 ShiftRightSame(BitCast(dv, v),
7367 static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
7368}
7369
7370#if HWY_TARGET <= HWY_SSSE3
7371template <class V, HWY_IF_V_SIZE_V(V, 16)>
7372HWY_INLINE V SlideDownLanes(V v, size_t amt) {
7373 const DFromV<decltype(v)> d;
7374 const Repartition<int8_t, decltype(d)> di8;
7375 auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
7376 idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
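 // Added commentary: indices past byte 15 are forced to 0xFF by the Or, so
 // TableLookupBytesOr0 zero-fills the lanes slid in from beyond the end.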
7377 return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
7378}
7379#else
7380template <class V, HWY_IF_V_SIZE_V(V, 16)>
7381HWY_INLINE V SlideDownLanes(V v, size_t amt) {
7382 const DFromV<decltype(v)> d;
7383 const Repartition<int32_t, decltype(d)> di32;
7384 const Repartition<uint64_t, decltype(d)> du64;
7385 constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
7386
7387 const auto vu64 = BitCast(du64, v);
7388 const auto v_lo = IfVecThenElse(
7389 BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
7390 BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64);
7391 const auto v_hi = ShiftRightBytes<8>(du64, v_lo);
7392
7393 const int shr_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
7394 return BitCast(
7395 d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt)));
7396}
7397#endif
7398
7399} // namespace detail
7400
7401template <class D, HWY_IF_LANES_D(D, 1)>
7402HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
7403 return v;
7404}
7405
7406template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
7407HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
7408#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7409 if (__builtin_constant_p(amt)) {
7410 switch (amt) {
7411 case 0:
7412 return v;
7413 case 1:
7414 return ShiftRightLanes<1>(d, v);
7415 }
7416 }
7417#else
7418 (void)d;
7419#endif
7420
7421 return detail::SlideDownLanes(v, amt);
7422}
7423
7424template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
7425HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
7426#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7427 if (__builtin_constant_p(amt)) {
7428 switch (amt) {
7429 case 0:
7430 return v;
7431 case 1:
7432 return ShiftRightLanes<1>(d, v);
7433 case 2:
7434 return ShiftRightLanes<2>(d, v);
7435 case 3:
7436 return ShiftRightLanes<3>(d, v);
7437 }
7438 }
7439#else
7440 (void)d;
7441#endif
7442
7443 return detail::SlideDownLanes(v, amt);
7444}
7445
7446template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
7447HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
7448#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7449 if (__builtin_constant_p(amt)) {
7450 switch (amt) {
7451 case 0:
7452 return v;
7453 case 1:
7454 return ShiftRightLanes<1>(d, v);
7455 case 2:
7456 return ShiftRightLanes<2>(d, v);
7457 case 3:
7458 return ShiftRightLanes<3>(d, v);
7459 case 4:
7460 return ShiftRightLanes<4>(d, v);
7461 case 5:
7462 return ShiftRightLanes<5>(d, v);
7463 case 6:
7464 return ShiftRightLanes<6>(d, v);
7465 case 7:
7466 return ShiftRightLanes<7>(d, v);
7467 }
7468 }
7469#else
7470 (void)d;
7471#endif
7472
7473 return detail::SlideDownLanes(v, amt);
7474}
7475
7476template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
7477HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
7478#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
7479 if (__builtin_constant_p(amt)) {
7480 switch (amt) {
7481 case 0:
7482 return v;
7483 case 1:
7484 return ShiftRightLanes<1>(d, v);
7485 case 2:
7486 return ShiftRightLanes<2>(d, v);
7487 case 3:
7488 return ShiftRightLanes<3>(d, v);
7489 case 4:
7490 return ShiftRightLanes<4>(d, v);
7491 case 5:
7492 return ShiftRightLanes<5>(d, v);
7493 case 6:
7494 return ShiftRightLanes<6>(d, v);
7495 case 7:
7496 return ShiftRightLanes<7>(d, v);
7497 case 8:
7498 return ShiftRightLanes<8>(d, v);
7499 case 9:
7500 return ShiftRightLanes<9>(d, v);
7501 case 10:
7502 return ShiftRightLanes<10>(d, v);
7503 case 11:
7504 return ShiftRightLanes<11>(d, v);
7505 case 12:
7506 return ShiftRightLanes<12>(d, v);
7507 case 13:
7508 return ShiftRightLanes<13>(d, v);
7509 case 14:
7510 return ShiftRightLanes<14>(d, v);
7511 case 15:
7512 return ShiftRightLanes<15>(d, v);
7513 }
7514 }
7515#else
7516 (void)d;
7517#endif
7518
7519 return detail::SlideDownLanes(v, amt);
7520}
7521
7522// ================================================== MEMORY (4)
7523
7524// ------------------------------ StoreN (ExtractLane)
7525
7526#if HWY_TARGET <= HWY_AVX2
7527
7528#ifdef HWY_NATIVE_STORE_N
7529#undef HWY_NATIVE_STORE_N
7530#else
7531#define HWY_NATIVE_STORE_N
7532#endif
7533
7534template <class D, HWY_IF_T_SIZE_ONE_OF_D(
7535 D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
7536 (1 << 4) | (1 << 8))>
7537HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
7538 size_t max_lanes_to_store) {
7539 const size_t num_lanes_to_store =
7540 HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
7541
7542#if HWY_COMPILER_MSVC
7543 // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore
7544 HWY_FENCE;
7545#endif
7546
7547 BlendedStore(v, FirstN(d, num_lanes_to_store), d, p);
7548
7549#if HWY_COMPILER_MSVC
7550 // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
7551 HWY_FENCE;
7552#endif
7553
7554 detail::MaybeUnpoison(p, num_lanes_to_store);
7555}
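// Usage sketch (illustrative, not part of the original source): store only the
// first three lanes of a full f32 vector without touching out[3].
//   const Full128<float> d;
//   float out[3];
//   StoreN(Iota(d, 0.0f), d, out, 3);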
7556
7557#if HWY_TARGET > HWY_AVX3
7558template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
7559 HWY_IF_LANES_D(D, 1)>
7560HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
7561 size_t max_lanes_to_store) {
7562 if (max_lanes_to_store > 0) {
7563 StoreU(v, d, p);
7564 }
7565}
7566
7567template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
7568 HWY_IF_LANES_D(D, 2)>
7569HWY_API void StoreN(VFromD<D> v, D /*d*/, TFromD<D>* HWY_RESTRICT p,
7570 size_t max_lanes_to_store) {
7571 if (max_lanes_to_store >= 1) {
7572 p[static_cast<size_t>(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v);
7573 p[0] = GetLane(v);
7574 }
7575}
7576
7577namespace detail {
7578
7579template <class D, HWY_IF_T_SIZE_D(D, 1)>
7580HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
7581 TFromD<D>* HWY_RESTRICT p,
7582 size_t num_lanes_to_store) {
7583 // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if
7584 // (num_lanes_to_store & 3) != 0 is true
7585 const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing);
7586 if ((num_lanes_to_store & 2) != 0) {
7587 const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128));
7588 p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
7589 CopyBytes<sizeof(uint16_t)>(&u16_bits,
7590 p + (num_lanes_to_store & ~size_t{3}));
7591 } else {
7592 p[num_lanes_to_store - 1] = GetLane(v_full128);
7593 }
7594}
7595
7596template <class D, HWY_IF_T_SIZE_D(D, 2)>
7597HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
7598 TFromD<D>* p,
7599 size_t num_lanes_to_store) {
7600 // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16
7601 // vector if (num_lanes_to_store & 1) == 1 is true
7602 p[num_lanes_to_store - 1] = GetLane(v_trailing);
7603}
7604
7605} // namespace detail
7606
7607template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
7608 HWY_IF_LANES_GT_D(D, 2)>
7609HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) {
7610 const size_t num_lanes_to_store =
7611 HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
7612
7613 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
7614 d_full;
7615 const RebindToUnsigned<decltype(d_full)> du_full;
7616 const Repartition<int32_t, decltype(d_full)> di32_full;
7617
7618 const auto i32_store_mask = BitCast(
7619 di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store)));
7620 const auto vi32 = ResizeBitCast(di32_full, v);
7621
7622#if HWY_COMPILER_MSVC
7623 // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore
7624 HWY_FENCE;
7625#endif
7626
7627 BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full,
7628 reinterpret_cast<int32_t*>(p));
7629
7630 constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>);
7631 constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1;
7632 const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask);
7633
7634 if (trailing_n != 0) {
7635 const VFromD<D> v_trailing = ResizeBitCast(
7636 d, SlideDownLanes(di32_full, vi32,
7637 num_lanes_to_store / kNumOfLanesPerI32));
7638 detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store);
7639 }
7640
7641#if HWY_COMPILER_MSVC
7642 // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
7643 HWY_FENCE;
7644#endif
7645
7646 detail::MaybeUnpoison(p, num_lanes_to_store);
7647}
7648#endif // HWY_TARGET > HWY_AVX3
7649#endif // HWY_TARGET <= HWY_AVX2
7650
7651// ================================================== COMBINE
7652
7653// ------------------------------ Combine (InterleaveLower)
7654
7655// N = N/2 + N/2 (upper half undefined)
7656template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
7657HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
7658 const Half<decltype(d)> dh;
7659 const RebindToUnsigned<decltype(dh)> duh;
7660 // Treat half-width input as one lane, and expand to two lanes.
7661 using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
7662 const VU lo{BitCast(duh, lo_half).raw};
7663 const VU hi{BitCast(duh, hi_half).raw};
7664 return BitCast(d, InterleaveLower(lo, hi));
7665}
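// Illustrative (added commentary): with Full128<uint32_t> d and 64-bit halves
// lo = {0, 1} and hi = {2, 3}, Combine(d, hi, lo) yields {0, 1, 2, 3}; the
// lower half of the result always comes from lo.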
7666
7667// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
7668
7669template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
7670HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
7671 const RebindToUnsigned<decltype(d)> du;
7672 const Half<decltype(du)> duh;
7673 return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)});
7674}
7675
7676template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
7677HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
7678 const Half<D> dh;
7679 return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
7680}
7681
7682#if HWY_HAVE_FLOAT16
7683template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
7684HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
7685 const RebindToUnsigned<decltype(d)> du;
7686 const Half<decltype(du)> duh;
7687 return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
7688}
7689#endif
7690
7691// Generic for all vector lengths.
7692template <class D, HWY_X86_IF_EMULATED_D(D)>
7693HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
7694 const RebindToUnsigned<decltype(d)> du;
7695 const Half<decltype(du)> duh;
7696 return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
7697}
7698
7699// ------------------------------ Concat full (InterleaveLower)
7700
7701// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
7702template <class D, HWY_IF_V_SIZE_D(D, 16)>
7703HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
7704 const Repartition<uint64_t, decltype(d)> d64;
7705 return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
7706}
7707
7708// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
7709template <class D, HWY_IF_V_SIZE_D(D, 16)>
7710HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
7711 const Repartition<uint64_t, decltype(d)> d64;
7712 return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
7713}
7714
7715// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
7716template <class D, HWY_IF_V_SIZE_D(D, 16)>
7717HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
7718 return CombineShiftRightBytes<8>(d, hi, lo);
7719}
7720
7721// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
7722template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
7723HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
7724 const Repartition<double, decltype(d)> dd;
7725#if HWY_TARGET >= HWY_SSSE3
7726 return BitCast(
7727 d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
7728 _MM_SHUFFLE2(1, 0))});
7729#else
7730 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
7731 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
7732 BitCast(dd, lo).raw, 1)});
7733#endif
7734}
7735template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
7736HWY_API Vec128<float> ConcatUpperLower(D d, Vec128<float> hi,
7737 Vec128<float> lo) {
7738#if HWY_TARGET >= HWY_SSSE3
7739 (void)d;
7740 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
7741#else
7742 // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
7743 const RepartitionToWide<decltype(d)> dd;
7744 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
7745 BitCast(dd, lo).raw, 1)});
7746#endif
7747}
7748template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
7749HWY_API Vec128<double> ConcatUpperLower(D /* tag */, Vec128<double> hi,
7750 Vec128<double> lo) {
7751#if HWY_TARGET >= HWY_SSSE3
7752 return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
7753#else
7754 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
7755 return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
7756#endif
7757}
7758
7759// ------------------------------ Concat partial (Combine, LowerHalf)
7760
7761template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7762HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
7763 const Half<decltype(d)> d2;
7764 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
7765}
7766
7767template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7768HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
7769 const Half<decltype(d)> d2;
7770 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
7771}
7772
7773template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7774HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi,
7775 const VFromD<D> lo) {
7776 const Half<decltype(d)> d2;
7777 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
7778}
7779
7780template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7781HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
7782 const Half<decltype(d)> d2;
7783 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
7784}
7785
7786// ------------------------------ ConcatOdd
7787
7788// 8-bit full
7789template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
7790HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7791 const Repartition<uint16_t, decltype(d)> dw;
7792 // Right-shift 8 bits per u16 so we can pack.
7793 const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
7794 const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
7795 return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};
7796}
7797
7798// 8-bit x8
7799template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
7800HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7801#if HWY_TARGET == HWY_SSE2
7802 const Repartition<uint16_t, decltype(d)> dw;
7803 // Right-shift 8 bits per u16 so we can pack.
7804 const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
7805 const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
7806 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
7807 _MM_SHUFFLE(2, 0, 2, 0))};
7808#else
7809 const Repartition<uint32_t, decltype(d)> du32;
7810 // Don't care about upper half, no need to zero.
7811 alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
7812 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
7813 const VFromD<D> L = TableLookupBytes(lo, shuf);
7814 const VFromD<D> H = TableLookupBytes(hi, shuf);
7815 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
7816#endif
7817}
7818
7819// 8-bit x4
7820template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
7821HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7822#if HWY_TARGET == HWY_SSE2
7823 const Repartition<uint16_t, decltype(d)> dw;
7824 const Twice<decltype(dw)> dw_2;
7825 // Right-shift 8 bits per u16 so we can pack.
7826 const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
7827 const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
7828 const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
7829 return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
7830#else
7831 const Repartition<uint16_t, decltype(d)> du16;
7832 // Don't care about upper half, no need to zero.
7833 alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
7834 const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
7835 const VFromD<D> L = TableLookupBytes(lo, shuf);
7836 const VFromD<D> H = TableLookupBytes(hi, shuf);
7837 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
7838#endif
7839}
7840
7841template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
7842HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7843 // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
7844 // 0xFFFF8000, which correctly saturates to 0x8000.
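 // Added worked example: an odd u16 lane equal to 0x8000 sits in an i32 of the
 // form 0x8000xxxx; the arithmetic >>16 gives 0xFFFF8000 = -32768, which
 // _mm_packs_epi32 clamps to int16 as exactly 0x8000 again.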
7845 const RebindToUnsigned<decltype(d)> du;
7846 const Repartition<int32_t, decltype(d)> dw;
7847 const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
7848 const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
7849 return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)});
7850}
7851
7852// 16-bit x4
7853template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
7854HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7855#if HWY_TARGET == HWY_SSE2
7856 // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
7857 // 0xFFFF8000, which correctly saturates to 0x8000.
7858 const Repartition<int32_t, decltype(d)> dw;
7859 const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
7860 const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
7861 return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw),
7862 _MM_SHUFFLE(2, 0, 2, 0))};
7863#else
7864 const Repartition<uint32_t, decltype(d)> du32;
7865 // Don't care about upper half, no need to zero.
7866 alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
7867 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
7868 const VFromD<D> L = TableLookupBytes(lo, shuf);
7869 const VFromD<D> H = TableLookupBytes(hi, shuf);
7870 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
7871#endif
7872}
7873
7874// 32-bit full
7875template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
7876HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7877 const RebindToFloat<decltype(d)> df;
7878 return BitCast(
7879 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
7880 _MM_SHUFFLE(3, 1, 3, 1))});
7881}
7882
7883// Any type x2
7884template <class D, HWY_IF_LANES_D(D, 2)>
7885HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
7886 return InterleaveUpper(d, lo, hi);
7887}
7888
7889// ------------------------------ ConcatEven (InterleaveLower)
7890
7891// 8-bit full
7892template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
7893HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
7894 const Repartition<uint16_t, decltype(d)> dw;
7895 // Isolate lower 8 bits per u16 so we can pack.
7896 const Vec128<uint16_t> mask = Set(dw, 0x00FF);
7897 const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
7898 const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
7899 return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};
7900}
7901
7902// 8-bit x8
7903template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
7904HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
7905#if HWY_TARGET == HWY_SSE2
7906 const Repartition<uint16_t, decltype(d)> dw;
7907 // Isolate lower 8 bits per u16 so we can pack.
7908 const Vec64<uint16_t> mask = Set(dw, 0x00FF);
7909 const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask);
7910 const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask);
7911 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
7912 _MM_SHUFFLE(2, 0, 2, 0))};
7913#else
7914 const Repartition<uint32_t, decltype(d)> du32;
7915 // Don't care about upper half, no need to zero.
7916 alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
7917 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
7918 const VFromD<D> L = TableLookupBytes(lo, shuf);
7919 const VFromD<D> H = TableLookupBytes(hi, shuf);
7920 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
7921#endif
7922}
7923
7924// 8-bit x4
7925template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
7926HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
7927#if HWY_TARGET == HWY_SSE2
7928 const Repartition<uint16_t, decltype(d)> dw;
7929 const Twice<decltype(dw)> dw_2;
7930 // Isolate lower 8 bits per u16 so we can pack.
7931 const Vec32<uint16_t> mask = Set(dw, 0x00FF);
7932 const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask);
7933 const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask);
7934 const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
7935 return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
7936#else
7937 const Repartition<uint16_t, decltype(d)> du16;
7938 // Don't care about upper half, no need to zero.
7939 alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
7940 const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
7941 const VFromD<D> L = TableLookupBytes(lo, shuf);
7942 const VFromD<D> H = TableLookupBytes(hi, shuf);
7943 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
7944#endif
7945}
7946
7947// 16-bit full
7948template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
7949HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
7950#if HWY_TARGET <= HWY_SSE4
7951 // Isolate lower 16 bits per u32 so we can pack.
7952 const RebindToUnsigned<decltype(d)> du; // for float16_t
7953 const Repartition<uint32_t, decltype(d)> dw;
7954 const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
7955 const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
7956 const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
7957 return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)});
7958#elif HWY_TARGET == HWY_SSE2
7959 const Repartition<uint32_t, decltype(d)> dw;
7960 return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
7961 BitCast(d, ShiftLeft<16>(BitCast(dw, lo))));
7962#else
7963 const RebindToUnsigned<decltype(d)> du;
7964 // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two
7965 // inputs, then concatenate them.
7966 alignas(16)
7967 const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
7968 const VFromD<D> shuf = BitCast(d, Load(du, kCompactEvenU16));
7969 const VFromD<D> L = TableLookupBytes(lo, shuf);
7970 const VFromD<D> H = TableLookupBytes(hi, shuf);
7971 return ConcatLowerLower(d, H, L);
7972#endif
7973}
7974
7975// 16-bit x4
7976template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
7977HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
7978#if HWY_TARGET == HWY_SSE2
7979 const Repartition<uint32_t, decltype(d)> dw;
7980 return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
7981 BitCast(d, ShiftLeft<16>(BitCast(dw, lo))));
7982#else
7983 const Repartition<uint32_t, decltype(d)> du32;
7984 // Don't care about upper half, no need to zero.
7985 alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
7986 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
7987 const VFromD<D> L = TableLookupBytes(lo, shuf);
7988 const VFromD<D> H = TableLookupBytes(hi, shuf);
7989 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
7990#endif
7991}
7992
7993// 32-bit full
7994template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
7995HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
7996 const RebindToFloat<decltype(d)> df;
7997 return BitCast(
7998 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
7999 _MM_SHUFFLE(2, 0, 2, 0))});
8000}
8001template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
8002HWY_API VFromD<D> ConcatEven(D /* d */, VFromD<D> hi, VFromD<D> lo) {
8003 return VFromD<D>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
8004}
8005
8006// Any T x2
8007template <class D, HWY_IF_LANES_D(D, 2)>
8008HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
8009 return InterleaveLower(d, lo, hi);
8010}
8011
8012// ------------------------------ DupEven (InterleaveLower)
8013
8014template <typename T>
8015HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) {
8016 return v;
8017}
8018
8019template <typename T>
8020HWY_API Vec128<T, 2> DupEven(const Vec128<T, 2> v) {
8021 return InterleaveLower(DFromV<decltype(v)>(), v, v);
8022}
8023
8024template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 2)>
8025HWY_API V DupEven(V v) {
8026 const DFromV<decltype(v)> d;
8027
8028#if HWY_TARGET <= HWY_SSSE3
8029 const RebindToUnsigned<decltype(d)> du;
8030 const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
8031 du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
8032 return TableLookupBytes(v, BitCast(d, shuffle));
8033#else
8034 const Repartition<uint16_t, decltype(d)> du16;
8035 return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})),
8036 BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v);
8037#endif
8038}
8039
8040template <typename T, HWY_IF_T_SIZE(T, 2)>
8041HWY_API Vec64<T> DupEven(Vec64<T> v) {
8042 const DFromV<decltype(v)> d;
8043 const RebindToUnsigned<decltype(d)> du; // for float16_t
8044 return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
8045 BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))});
8046}
8047
8048// Generic for all vector lengths.
8049template <class V, HWY_IF_T_SIZE_V(V, 2)>
8050HWY_API V DupEven(const V v) {
8051 const DFromV<decltype(v)> d;
8052 const RebindToUnsigned<decltype(d)> du; // for float16_t
8053#if HWY_TARGET <= HWY_SSSE3
8054 const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
8055 du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c);
8056 return TableLookupBytes(v, BitCast(d, shuffle));
8057#else
8058 return BitCast(
8059 d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
8060 _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)),
8061 _MM_SHUFFLE(2, 2, 0, 0))});
8062#endif
8063}
8064
8065template <typename T, HWY_IF_UI32(T)>
8066HWY_API Vec128<T> DupEven(Vec128<T> v) {
8067 return Vec128<T>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
8068}
8069
8070HWY_API Vec128<float> DupEven(Vec128<float> v) {
8071 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
8072}
8073
8074// ------------------------------ DupOdd (InterleaveUpper)
8075
8076template <typename T, HWY_IF_T_SIZE(T, 1)>
8077HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) {
8078 return v;
8079}
8080
8081template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 1)>
8082HWY_API V DupOdd(V v) {
8083 const DFromV<decltype(v)> d;
8084
8085#if HWY_TARGET <= HWY_SSSE3
8086 const RebindToUnsigned<decltype(d)> du;
8087 const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
8088 du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
8089 return TableLookupBytes(v, BitCast(d, shuffle));
8090#else
8091 const Repartition<uint16_t, decltype(d)> du16;
8092 return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})),
8093 BitCast(d, ShiftRight<8>(BitCast(du16, v))), v);
8094#endif
8095}
8096
8097template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
8098HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
8099 const DFromV<decltype(v)> d;
8100 const RebindToUnsigned<decltype(d)> du; // for float16_t
8101 return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
8102 BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))});
8103}
8104
8105// Generic for all vector lengths.
8106template <typename V, HWY_IF_T_SIZE_V(V, 2), HWY_IF_V_SIZE_GT_V(V, 8)>
8107HWY_API V DupOdd(V v) {
8108 const DFromV<decltype(v)> d;
8109 const RebindToUnsigned<decltype(d)> du; // for float16_t
8110#if HWY_TARGET <= HWY_SSSE3
8111 const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
8112 du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e);
8113 return TableLookupBytes(v, BitCast(d, shuffle));
8114#else
8115 return BitCast(
8116 d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
8117 _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)),
8118 _MM_SHUFFLE(3, 3, 1, 1))});
8119#endif
8120}
8121
8122template <typename T, size_t N, HWY_IF_UI32(T)>
8123HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
8124 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
8125}
8126template <size_t N>
8127HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
8128 return Vec128<float, N>{
8129 _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
8130}
8131
8132template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
8133HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
8134 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
8135}
8136
8137// ------------------------------ TwoTablesLookupLanes (DupEven)
8138
8139template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8140HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
8141 Indices128<T, N> idx) {
8142 const DFromV<decltype(a)> d;
8143 const Twice<decltype(d)> dt;
8144// TableLookupLanes currently requires table and index vectors to be the same
8145// size, though a half-length index vector would be sufficient here.
8146#if HWY_IS_MSAN
8147 const Vec128<T, N> idx_vec{idx.raw};
8148 const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
8149#else
8150 // We only keep LowerHalf of the result, which is valid in idx.
8151 const Indices128<T, N * 2> idx2{idx.raw};
8152#endif
8153 return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
8154}
8155
8156template <typename T, HWY_IF_T_SIZE(T, 1)>
8157HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
8158 Indices128<T> idx) {
8159#if HWY_TARGET <= HWY_AVX3_DL
8160 return Vec128<T>{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)};
8161#else // AVX3 or below
8162 const DFromV<decltype(a)> d;
8163 const Vec128<T> idx_vec{idx.raw};
8164
8165#if HWY_TARGET <= HWY_SSE4
8166 const Repartition<uint16_t, decltype(d)> du16;
8167 const auto sel_hi_mask =
8168 MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec))));
8169#else
8170 const RebindToSigned<decltype(d)> di;
8171 const auto sel_hi_mask =
8172 RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15}));
8173#endif
8174
8175 const auto lo_lookup_result = TableLookupBytes(a, idx_vec);
8176#if HWY_TARGET <= HWY_AVX3
8177 const Vec128<T> lookup_result{_mm_mask_shuffle_epi8(
8178 lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)};
8179 return lookup_result;
8180#else
8181 const auto hi_lookup_result = TableLookupBytes(b, idx_vec);
8182 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
8183#endif // HWY_TARGET <= HWY_AVX3
8184#endif // HWY_TARGET <= HWY_AVX3_DL
8185}
8186
8187template <typename T, HWY_IF_T_SIZE(T, 2)>
8188HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
8189 Indices128<T> idx) {
8190#if HWY_TARGET <= HWY_AVX3
8191 return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)};
8192#elif HWY_TARGET == HWY_SSE2
8193 const DFromV<decltype(a)> d;
8194 const RebindToSigned<decltype(d)> di;
8195 const Vec128<T> idx_vec{idx.raw};
8196 const auto sel_hi_mask =
8197 RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7}));
8198 const auto lo_lookup_result = TableLookupLanes(a, idx);
8199 const auto hi_lookup_result = TableLookupLanes(b, idx);
8200 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
8201#else
8202 const DFromV<decltype(a)> d;
8203 const Repartition<uint8_t, decltype(d)> du8;
8204 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
8205 Indices128<uint8_t>{idx.raw}));
8206#endif
8207}
8208
8209template <typename T, HWY_IF_UI32(T)>
8210HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
8211 Indices128<T> idx) {
8212#if HWY_TARGET <= HWY_AVX3
8213 return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)};
8214#else // AVX2 or below
8215 const DFromV<decltype(a)> d;
8216
8217#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
8218 const Vec128<T> idx_vec{idx.raw};
8219
8220#if HWY_TARGET <= HWY_AVX2
8221 const RebindToFloat<decltype(d)> d_sel;
8222 const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec)));
8223#else
8224 const RebindToSigned<decltype(d)> d_sel;
8225 const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3});
8226#endif
8227
8228 const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx));
8229 const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx));
8230 return BitCast(d,
8231 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
8232#else // SSSE3 or SSE4
8233 const Repartition<uint8_t, decltype(d)> du8;
8234 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
8235 Indices128<uint8_t>{idx.raw}));
8236#endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
8237#endif // HWY_TARGET <= HWY_AVX3
8238}
8239
8240#if HWY_HAVE_FLOAT16
8241HWY_API Vec128<float16_t> TwoTablesLookupLanes(Vec128<float16_t> a,
8242 Vec128<float16_t> b,
8243 Indices128<float16_t> idx) {
8244 return Vec128<float16_t>{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)};
8245}
8246#endif // HWY_HAVE_FLOAT16
8247HWY_API Vec128<float> TwoTablesLookupLanes(Vec128<float> a, Vec128<float> b,
8248 Indices128<float> idx) {
8249#if HWY_TARGET <= HWY_AVX3
8250 return Vec128<float>{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)};
8251#elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
8252 const DFromV<decltype(a)> d;
8253
8254#if HWY_TARGET <= HWY_AVX2
8255 const auto sel_hi_mask =
8256 MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128<int32_t>{idx.raw})));
8257#else
8258 const RebindToSigned<decltype(d)> di;
8259 const auto sel_hi_mask =
8260 RebindMask(d, Vec128<int32_t>{idx.raw} > Set(di, int32_t{3}));
8261#endif
8262
8263 const auto lo_lookup_result = TableLookupLanes(a, idx);
8264 const auto hi_lookup_result = TableLookupLanes(b, idx);
8265 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
8266#else // SSSE3 or SSE4
8267 const DFromV<decltype(a)> d;
8268 const Repartition<uint8_t, decltype(d)> du8;
8269 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
8270 Indices128<uint8_t>{idx.raw}));
8271#endif
8272}
8273
8274template <typename T, HWY_IF_UI64(T)>
8275HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
8276 Indices128<T> idx) {
8277#if HWY_TARGET <= HWY_AVX3
8278 return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)};
8279#else
8280 const DFromV<decltype(a)> d;
8281 const Vec128<T> idx_vec{idx.raw};
8282 const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw};
8283
8284#if HWY_TARGET <= HWY_SSE4
8285 const RebindToFloat<decltype(d)> d_sel;
8286 const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec)));
8287#else // SSE2 or SSSE3
8288 const Repartition<int32_t, decltype(d)> di32;
8289 const RebindToSigned<decltype(d)> d_sel;
8290 const auto sel_hi_mask = MaskFromVec(
8291 BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) >
8292 Set(di32, int32_t{1}))));
8293#endif // HWY_TARGET <= HWY_SSE4
8294
8295 const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod));
8296 const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod));
8297 return BitCast(d,
8298 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
8299#endif // HWY_TARGET <= HWY_AVX3
8300}
8301
8302HWY_API Vec128<double> TwoTablesLookupLanes(Vec128<double> a, Vec128<double> b,
8303 Indices128<double> idx) {
8304#if HWY_TARGET <= HWY_AVX3
8305 return Vec128<double>{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)};
8306#else
8307 const DFromV<decltype(a)> d;
8308 const RebindToSigned<decltype(d)> di;
8309 const Vec128<int64_t> idx_vec{idx.raw};
8310 const Indices128<double> idx_mod{And(idx_vec, Set(di, int64_t{1})).raw};
8311
8312#if HWY_TARGET <= HWY_SSE4
8313 const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec)));
8314#else // SSE2 or SSSE3
8315 const Repartition<int32_t, decltype(d)> di32;
8316 const auto sel_hi_mask =
8317 MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) >
8318 Set(di32, int32_t{1}))));
8319#endif // HWY_TARGET <= HWY_SSE4
8320
8321 const auto lo_lookup_result = TableLookupLanes(a, idx_mod);
8322 const auto hi_lookup_result = TableLookupLanes(b, idx_mod);
8323 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
8324#endif // HWY_TARGET <= HWY_AVX3
8325}
8326
8327// ------------------------------ OddEven (IfThenElse)
8328
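// OddEven(a, b) keeps the odd-indexed lanes of a and the even-indexed lanes of
// b (lane 0 is even). Illustrative usage sketch, assuming the usual Highway
// aliases; not part of the original header:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint32_t> d;        // 4 lanes
//   const auto a = hn::Iota(d, 0);        // {0, 1, 2, 3}
//   const auto b = hn::Set(d, 100u);      // {100, 100, 100, 100}
//   const auto r = hn::OddEven(a, b);     // {100, 1, 100, 3}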
8329template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
8330HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
8331 const DFromV<decltype(a)> d;
8332 const Repartition<uint8_t, decltype(d)> d8;
8333 alignas(16) static constexpr uint8_t mask[16] = {
8334 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
8335 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
8336}
8337
8338template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
8339HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
8340 const DFromV<decltype(a)> d;
8341#if HWY_TARGET >= HWY_SSSE3
8342 const Repartition<uint8_t, decltype(d)> d8;
8343 alignas(16) static constexpr uint8_t mask[16] = {
8344 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
8345 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
8346#else
8347 const RebindToUnsigned<decltype(d)> du; // for float16_t
8348 return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16(
8349 BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
8350#endif
8351}
8352
8353template <typename T, size_t N, HWY_IF_UI32(T)>
8354HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
8355#if HWY_TARGET >= HWY_SSSE3
8356 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
8357 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
8358 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
8359#else
8360 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
8361 const DFromV<decltype(a)> d;
8362 const RebindToFloat<decltype(d)> df;
8363 return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
8364 BitCast(df, b).raw, 5)});
8365#endif
8366}
8367
8368template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
8369HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
8370 // Same as ConcatUpperLower for full vectors; do not call that because this
8371 // is more efficient for 64x1 vectors.
8372 const DFromV<decltype(a)> d;
8373 const RebindToFloat<decltype(d)> dd;
8374#if HWY_TARGET >= HWY_SSSE3
8375 return BitCast(
8376 d, Vec128<double, N>{_mm_shuffle_pd(
8377 BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
8378#else
8379 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
8380 return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
8381 BitCast(dd, b).raw, 1)});
8382#endif
8383}
8384
8385template <size_t N>
8386HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
8387#if HWY_TARGET >= HWY_SSSE3
8388 // SHUFPS must fill the lower half of the output from one input, so we
8389 // need another shuffle. Unpack avoids another immediate byte.
8390 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
8391 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
8392 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
8393#else
8394 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
8395#endif
8396}
8397
8398// -------------------------- InterleaveEven
8399
8400template <class D, HWY_IF_LANES_LE_D(D, 2)>
8401HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
8402 return ConcatEven(d, b, a);
8403}
8404
8405// I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes
8406template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
8407HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
8408 const Repartition<uint16_t, decltype(d)> du16;
8409 return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a);
8410}
8411
8412// I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes
8413template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
8414HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
8415 const Repartition<uint32_t, decltype(d)> du32;
8416 return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a);
8417}
8418
8419#if HWY_TARGET <= HWY_AVX3
8420template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
8421HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
8422 return VFromD<D>{_mm_mask_shuffle_epi32(
8423 a.raw, static_cast<__mmask8>(0x0A), b.raw,
8424 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
8425}
8426template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
8427HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
8428 return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A),
8429 b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))};
8430}
8431#else
8432template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
8433HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
8434 const RebindToFloat<decltype(d)> df;
8435 const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a));
8436 return BitCast(
8437 d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw,
8438 _MM_SHUFFLE(3, 1, 2, 0))});
8439}
8440#endif
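// Worked example for the InterleaveEven overloads above (illustrative sketch,
// assuming the usual Highway aliases; not part of the original header):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint32_t> d;
//   const auto a = hn::Iota(d, 0);                  // {0, 1, 2, 3}
//   const auto b = hn::Iota(d, 4);                  // {4, 5, 6, 7}
//   const auto r = hn::InterleaveEven(d, a, b);     // {0, 4, 2, 6}
//
// The InterleaveOdd overloads below instead return {1, 5, 3, 7} for the same
// inputs.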
8441
8442// -------------------------- InterleaveOdd
8443
8444template <class D, HWY_IF_LANES_LE_D(D, 2)>
8445HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
8446 return ConcatOdd(d, b, a);
8447}
8448
8449// I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes
8450template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
8451HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
8452 const Repartition<uint16_t, decltype(d)> du16;
8453 return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a))));
8454}
8455
8456// I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes
8457template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
8458HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
8459 const Repartition<uint32_t, decltype(d)> du32;
8460 return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a))));
8461}
8462
8463#if HWY_TARGET <= HWY_AVX3
8464template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
8465HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
8466 return VFromD<D>{_mm_mask_shuffle_epi32(
8467 b.raw, static_cast<__mmask8>(0x05), a.raw,
8468 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
8469}
8470template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
8471HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
8472 return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05),
8473 a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))};
8474}
8475#else
8476template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
8477HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
8478 const RebindToFloat<decltype(d)> df;
8479 const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a));
8480 return BitCast(
8481 d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw,
8482 _MM_SHUFFLE(3, 1, 2, 0))});
8483}
8484#endif
8485
8486// ------------------------------ OddEvenBlocks
8487template <typename T, size_t N>
8488HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
8489 return even;
8490}
8491
8492// ------------------------------ SwapAdjacentBlocks
8493
8494template <typename T, size_t N>
8495HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
8496 return v;
8497}
8498
8499// ------------------------------ Shl (ZipLower, Mul)
8500
8501// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
8502// two obtained by loading float exponents, which is considerably faster
8503// (according to LLVM-MCA) than scalar code or bit tests: https://gcc.godbolt.org/z/9G7Y9v.
8504
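// The SSSE3/SSE4 Pow2() helpers below synthesize 2^bits per lane by building
// the IEEE-754 exponent field of a float and converting back to integer, then
// multiply by it. A scalar sketch of the same idea (illustrative only, assumes
// <cstring> for memcpy; not part of the original header):
//
//   uint32_t Pow2ViaFloat(uint32_t bits) {          // bits in [0, 31]
//     const uint32_t f_bits = (bits + 127u) << 23;  // exponent of 2.0f^bits
//     float f;
//     std::memcpy(&f, &f_bits, sizeof(f));          // reinterpret as float
//     return static_cast<uint32_t>(f);              // == 1u << bits
//   }
//
// Multiplying v by this per-lane power of two is equivalent to v << bits.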
8505namespace detail {
8506#if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly
8507template <class V>
8508HWY_INLINE V AVX2ShlU16Vec128(V v, V bits) {
8509 const DFromV<decltype(v)> d;
8510 const Rebind<uint32_t, decltype(d)> du32;
8511 return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
8512}
8513#elif HWY_TARGET > HWY_AVX2
8514// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
8515template <typename T, HWY_IF_T_SIZE(T, 2)>
8516HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
8517 const DFromV<decltype(v)> d;
8518 const RebindToUnsigned<decltype(d)> du;
8519 const RepartitionToWide<decltype(d)> dw;
8520 const Rebind<float, decltype(dw)> df;
8521 const auto zero = Zero(d);
8522 // Move into exponent (this u16 will become the upper half of an f32)
8523 const auto exp = ShiftLeft<23 - 16>(v);
8524 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
8525 // Insert 0 into lower halves for reinterpreting as binary32.
8526 const auto f0 = ZipLower(dw, zero, upper);
8527 const auto f1 = ZipUpper(dw, zero, upper);
8528 // See cvtps comment below.
8529 const VFromD<decltype(dw)> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
8530 const VFromD<decltype(dw)> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
8531#if HWY_TARGET <= HWY_SSE4
8532 return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)};
8533#else
8534 return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0));
8535#endif
8536}
8537
8538template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
8539HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
8540 const DFromV<decltype(v)> d;
8541 const RebindToUnsigned<decltype(d)> du;
8542 const Twice<decltype(du)> dt_u;
8543 const RepartitionToWide<decltype(dt_u)> dt_w;
8544 const RebindToFloat<decltype(dt_w)> dt_f;
8545 // Move into exponent (this u16 will become the upper half of an f32)
8546 const auto exp = ShiftLeft<23 - 16>(v);
8547 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
8548 // Insert 0 into lower halves for reinterpreting as binary32.
8549 const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper));
8550 // See cvtps comment below.
8551 const VFromD<decltype(dt_w)> bits0{_mm_cvtps_epi32(BitCast(dt_f, f0).raw)};
8552#if HWY_TARGET <= HWY_SSE4
8553 return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)};
8554#elif HWY_TARGET == HWY_SSSE3
8555 alignas(16)
8556 const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
8557 return TableLookupBytes(bits0, Load(du, kCompactEvenU16));
8558#else
8559 const RebindToSigned<decltype(dt_w)> dt_i32;
8560 const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0)));
8561 return VFromD<decltype(du)>{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)};
8562#endif
8563}
8564
8565// Same, for 32-bit shifts.
8566template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
8567HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
8568 const DFromV<decltype(v)> d;
8569 const auto exp = ShiftLeft<23>(v);
8570 const auto f = exp + Set(d, 0x3F800000); // 1.0f
8571 // Do not use ConvertTo because we rely on the native 0x80..00 overflow
8572 // behavior.
8573 return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
8574}
8575
8576#endif // HWY_TARGET > HWY_AVX2
8577
8578template <size_t N>
8579HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v,
8580 Vec128<uint16_t, N> bits) {
8581#if HWY_TARGET <= HWY_AVX3
8582 return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
8583#elif HWY_TARGET == HWY_AVX2
8584 return AVX2ShlU16Vec128(v, bits);
8585#else
8586 return v * Pow2(bits);
8587#endif
8588}
8589
8590#if HWY_TARGET > HWY_AVX3
8591HWY_API Vec16<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec16<uint16_t> v,
8592 Vec16<uint16_t> bits) {
8593#if HWY_TARGET <= HWY_SSE4
8594 const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
8595#else
8596 const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
8597#endif
8598 return Vec16<uint16_t>{_mm_sll_epi16(v.raw, bits16.raw)};
8599}
8600#endif
8601
8602#if HWY_TARGET <= HWY_AVX3
8603template <class V>
8604HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
8605 const DFromV<decltype(v)> d;
8606 const Rebind<uint16_t, decltype(d)> du16;
8607 return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits));
8608}
8609#elif HWY_TARGET <= HWY_AVX2
8610template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
8611HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
8612 const DFromV<decltype(v)> d;
8613 const Rebind<uint32_t, decltype(d)> du32;
8614 return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
8615}
8616template <class V, HWY_IF_V_SIZE_V(V, 16)>
8617HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
8618 const DFromV<decltype(v)> d;
8619 const Half<decltype(d)> dh;
8620 const Rebind<uint16_t, decltype(d)> du16;
8621 const Rebind<uint32_t, decltype(dh)> dh_u32;
8622
8623 const VFromD<decltype(dh_u32)> lo_shl_result =
8624 PromoteTo(dh_u32, LowerHalf(dh, v))
8625 << PromoteTo(dh_u32, LowerHalf(dh, bits));
8626 const VFromD<decltype(dh_u32)> hi_shl_result =
8627 PromoteTo(dh_u32, UpperHalf(dh, v))
8628 << PromoteTo(dh_u32, UpperHalf(dh, bits));
8629 const VFromD<decltype(du16)> u16_shl_result = ConcatEven(
8630 du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result));
8631 return TruncateTo(d, u16_shl_result);
8632}
8633#endif // HWY_TARGET <= HWY_AVX3
8634
8635// 8-bit: may use the Shl overload for uint16_t.
8636template <size_t N>
8637HWY_API Vec128<uint8_t, N> Shl(hwy::UnsignedTag tag, Vec128<uint8_t, N> v,
8638 Vec128<uint8_t, N> bits) {
8639 const DFromV<decltype(v)> d;
8640#if HWY_TARGET <= HWY_AVX3_DL
8641 (void)tag;
8642 // kMasks[i] = 0xFF >> i
8643 alignas(16) static constexpr uint8_t kMasks[16] = {
8644 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
8645 // kShl[i] = 1 << i
8646 alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10,
8647 0x20, 0x40, 0x80, 0x00};
8648 v = And(v, TableLookupBytes(Load(Full64<uint8_t>(), kMasks), bits));
8649 const VFromD<decltype(d)> mul =
8650 TableLookupBytes(Load(Full64<uint8_t>(), kShl), bits);
8651 return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)};
8652#elif HWY_TARGET <= HWY_AVX2
8653 (void)tag;
8654 (void)d;
8655 return AVX2ShlU8Vec128(v, bits);
8656#else
8657 const Repartition<uint16_t, decltype(d)> dw;
8658 using VW = VFromD<decltype(dw)>;
8659 const VW even_mask = Set(dw, 0x00FF);
8660 const VW odd_mask = Set(dw, 0xFF00);
8661 const VW vw = BitCast(dw, v);
8662 const VW bits16 = BitCast(dw, bits);
8663 // Shift even lanes in-place
8664 const VW evens = Shl(tag, vw, And(bits16, even_mask));
8665 const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16));
8666 return OddEven(BitCast(d, odds), BitCast(d, evens));
8667#endif
8668}
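// Why the AVX3_DL path above is correct (illustrative note, not part of the
// original header): carry-less multiplication by 1 << i is exactly a left
// shift by i, and first masking v with 0xFF >> i guarantees that no bit
// crosses bit 7, so the GF(2^8) polynomial reduction of gf2p8mul never
// triggers. Scalar sketch:
//
//   uint8_t ShlViaClmul(uint8_t v, unsigned i) {    // i in [0, 7]
//     const uint8_t masked = v & static_cast<uint8_t>(0xFF >> i);
//     return static_cast<uint8_t>(masked << i);     // == clmul(masked, 1 << i)
//   }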
8669HWY_API Vec128<uint8_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint8_t, 1> v,
8670 Vec128<uint8_t, 1> bits) {
8671#if HWY_TARGET <= HWY_SSE4
8672 const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
8673#else
8674 const Vec16<uint16_t> bits8 =
8675 And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)});
8676#endif
8677 return Vec128<uint8_t, 1>{_mm_sll_epi16(v.raw, bits8.raw)};
8678}
8679
8680template <size_t N>
8681HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v,
8682 Vec128<uint32_t, N> bits) {
8683#if HWY_TARGET >= HWY_SSE4
8684 return v * Pow2(bits);
8685#else
8686 return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
8687#endif
8688}
8689
8690#if HWY_TARGET >= HWY_SSE4
8691HWY_API Vec32<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec32<uint32_t> v,
8692 const Vec32<uint32_t> bits) {
8693#if HWY_TARGET == HWY_SSE4
8694 const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
8695#else
8696 const auto bits32 =
8697 Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits);
8698#endif
8699 return Vec32<uint32_t>{_mm_sll_epi32(v.raw, bits32.raw)};
8700}
8701#endif
8702
8703HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v,
8704 Vec128<uint64_t> bits) {
8705#if HWY_TARGET >= HWY_SSE4
8706 const DFromV<decltype(v)> d;
8707 // Individual shifts and combine
8708 const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
8709 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
8710 const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
8711 return ConcatUpperLower(d, out1, out0);
8712#else
8713 return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
8714#endif
8715}
8716HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v,
8717 Vec64<uint64_t> bits) {
8718 return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
8719}
8720
8721// Signed left shift is the same as unsigned.
8722template <typename T, size_t N>
8723HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
8724 Vec128<T, N> bits) {
8725 const DFromV<decltype(v)> di;
8726 const RebindToUnsigned<decltype(di)> du;
8727 return BitCast(di,
8728 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
8729}
8730
8731} // namespace detail
8732
8733template <typename T, size_t N>
8734HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
8735 return detail::Shl(hwy::TypeTag<T>(), v, bits);
8736}
8737
8738// ------------------------------ Shr (mul, mask, BroadcastSignBit)
8739
8740// Use AVX2+ variable shifts except for SSE2/SSSE3/SSE4. There, we use
8741// widening multiplication by powers of two obtained by loading float exponents,
8742// followed by a constant right-shift. This is still faster than a scalar or
8743// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
8744
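// In the SSSE3/SSE4 fallback below, a per-lane logical right shift of u16 is
// emulated as MulHigh(in, 2^(16 - bits)): the 32-bit product in * 2^(16 - bits)
// has in >> bits in its upper 16 bits. bits == 0 would require a multiplier of
// 2^16, which does not fit in 16 bits, hence the IfThenElse fixup. Scalar
// sketch (illustrative, not part of the original header):
//
//   uint16_t ShrViaMulHigh(uint16_t in, unsigned bits) {  // bits in [1, 15]
//     const uint32_t product = uint32_t{in} * (1u << (16 - bits));
//     return static_cast<uint16_t>(product >> 16);        // == in >> bits
//   }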
8745#if HWY_TARGET <= HWY_AVX2
8746namespace detail {
8747
8748#if HWY_TARGET <= HWY_AVX3
8749template <class V>
8750HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
8751 const DFromV<decltype(v)> d;
8752 const Rebind<uint16_t, decltype(d)> du16;
8753 const RebindToSigned<decltype(du16)> di16;
8754 return DemoteTo(d,
8755 BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits)));
8756}
8757#else // AVX2
8758template <class V>
8759HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) {
8760 const DFromV<decltype(v)> d;
8761 const Rebind<uint32_t, decltype(d)> du32;
8762 const RebindToSigned<decltype(du32)> di32;
8763 return DemoteTo(d,
8764 BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));
8765}
8766template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
8767HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
8768 const DFromV<decltype(v)> d;
8769 const Rebind<uint32_t, decltype(d)> du32;
8770 const RebindToSigned<decltype(du32)> di32;
8771 return DemoteTo(d,
8772 BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));
8773}
8774template <class V, HWY_IF_V_SIZE_V(V, 16)>
8775HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
8776 const DFromV<decltype(v)> d;
8777 const Half<decltype(d)> dh;
8778 const Rebind<int16_t, decltype(d)> di16;
8779 const Rebind<uint16_t, decltype(d)> du16;
8780 const Rebind<int32_t, decltype(dh)> dh_i32;
8781 const Rebind<uint32_t, decltype(dh)> dh_u32;
8782
8783 const auto lo_shr_result =
8784 BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >>
8785 PromoteTo(dh_u32, LowerHalf(dh, bits)));
8786 const auto hi_shr_result =
8787 BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >>
8788 PromoteTo(dh_u32, UpperHalf(dh, bits)));
8789 const auto i16_shr_result =
8790 BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result));
8791 return DemoteTo(d, i16_shr_result);
8792}
8793#endif // HWY_TARGET <= HWY_AVX3
8794
8795} // namespace detail
8796#endif // HWY_TARGET <= HWY_AVX2
8797
8798template <size_t N>
8799HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
8800 const Vec128<uint16_t, N> bits) {
8801#if HWY_TARGET <= HWY_AVX3
8802 return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
8803#elif HWY_TARGET <= HWY_AVX2
8804 return detail::AVX2ShrU16Vec128(in, bits);
8805#else
8806 const DFromV<decltype(in)> d;
8807 // For bits=0, we cannot mul by 2^16, so fix the result later.
8808 const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
8809 // Replace output with input where bits == 0.
8810 return IfThenElse(bits == Zero(d), in, out);
8811#endif
8812}
8813
8814#if HWY_TARGET > HWY_AVX3
8815HWY_API Vec16<uint16_t> operator>>(const Vec16<uint16_t> in,
8816 const Vec16<uint16_t> bits) {
8817#if HWY_TARGET <= HWY_SSE4
8818 const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
8819#else
8820 const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
8821#endif
8822 return Vec16<uint16_t>{_mm_srl_epi16(in.raw, bits16.raw)};
8823}
8824#endif
8825
8826// 8-bit uses 16-bit shifts.
8827template <size_t N>
8828HWY_API Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> in,
8829 const Vec128<uint8_t, N> bits) {
8830#if HWY_TARGET <= HWY_AVX2
8831 return detail::AVX2ShrU8Vec128(in, bits);
8832#else
8833 const DFromV<decltype(in)> d;
8834 const Repartition<uint16_t, decltype(d)> dw;
8835 using VW = VFromD<decltype(dw)>;
8836 const VW mask = Set(dw, 0x00FF);
8837 const VW vw = BitCast(dw, in);
8838 const VW bits16 = BitCast(dw, bits);
8839 const VW evens = And(vw, mask) >> And(bits16, mask);
8840 // Shift odd lanes in-place
8841 const VW odds = vw >> ShiftRight<8>(bits16);
8842 return OddEven(BitCast(d, odds), BitCast(d, evens));
8843#endif
8844}
8845HWY_API Vec128<uint8_t, 1> operator>>(const Vec128<uint8_t, 1> in,
8846 const Vec128<uint8_t, 1> bits) {
8847#if HWY_TARGET <= HWY_SSE4
8848 const Vec16<uint16_t> in8{_mm_cvtepu8_epi16(in.raw)};
8849 const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
8850#else
8851 const Vec16<uint16_t> mask{_mm_set_epi64x(0, 0xFF)};
8852 const Vec16<uint16_t> in8 = And(Vec16<uint16_t>{in.raw}, mask);
8853 const Vec16<uint16_t> bits8 = And(Vec16<uint16_t>{bits.raw}, mask);
8854#endif
8855 return Vec128<uint8_t, 1>{_mm_srl_epi16(in8.raw, bits8.raw)};
8856}
8857
8858template <size_t N>
8859HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
8860 const Vec128<uint32_t, N> bits) {
8861#if HWY_TARGET >= HWY_SSE4
8862 // 32x32 -> 64 bit mul, then shift right by 32.
8863 const DFromV<decltype(in)> d32;
8864 // Move odd lanes into position for the second mul. A shuffle handles N=1
8865 // more gracefully than repartitioning to u64 and shifting right by 32 bits.
8866 const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
8867 // For bits=0, we cannot mul by 2^32, so fix the result later.
8868 const auto mul = detail::Pow2(Set(d32, 32) - bits);
8869 const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
8870 const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
8871 // No need to shift right, already in the correct position.
8872 const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
8873 const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
8874 // Replace output with input where bits == 0.
8875 return IfThenElse(bits == Zero(d32), in, out);
8876#else
8877 return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
8878#endif
8879}
8880
8881#if HWY_TARGET >= HWY_SSE4
8882HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
8883 const Vec128<uint32_t, 1> bits) {
8884#if HWY_TARGET == HWY_SSE4
8885 const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
8886#else
8887 const auto bits32 =
8888 Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits);
8889#endif
8890 return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits32.raw)};
8891}
8892#endif
8893
8894HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
8895 const Vec128<uint64_t> bits) {
8896#if HWY_TARGET >= HWY_SSE4
8897 const DFromV<decltype(v)> d;
8898 // Individual shifts and combine
8899 const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
8900 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
8901 const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
8902 return ConcatUpperLower(d, out1, out0);
8903#else
8904 return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
8905#endif
8906}
8907HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
8908 const Vec64<uint64_t> bits) {
8909 return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
8910}
8911
8912namespace detail {
8913
8914#if HWY_TARGET <= HWY_AVX3
8915template <class V>
8916HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
8917 const DFromV<decltype(v)> d;
8918 const Rebind<int16_t, decltype(d)> di16;
8919 return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits));
8920}
8921#elif HWY_TARGET <= HWY_AVX2 // AVX2
8922template <class V>
8923HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) {
8924 const DFromV<decltype(v)> d;
8925 const Rebind<int32_t, decltype(d)> di32;
8926 return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));
8927}
8928template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
8929HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
8930 const DFromV<decltype(v)> d;
8931 const Rebind<int32_t, decltype(d)> di32;
8932 return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));
8933}
8934template <class V, HWY_IF_V_SIZE_V(V, 16)>
8935HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
8936 const DFromV<decltype(v)> d;
8937 const Half<decltype(d)> dh;
8938 const Rebind<int16_t, decltype(d)> di16;
8939 const Rebind<int32_t, decltype(dh)> dh_i32;
8940
8941 const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >>
8942 PromoteTo(dh_i32, LowerHalf(dh, bits));
8943 const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >>
8944 PromoteTo(dh_i32, UpperHalf(dh, bits));
8945 const auto i16_shr_result =
8946 OrderedDemote2To(di16, lo_shr_result, hi_shr_result);
8947 return DemoteTo(d, i16_shr_result);
8948}
8949#endif
8950
8951#if HWY_TARGET > HWY_AVX3
8952// Also used in x86_256-inl.h.
8953template <class DI, class V>
8954HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
8955 const RebindToUnsigned<DI> du;
8956 const auto count = BitCast(du, count_i); // same type as value to shift
8957 // Clear sign and restore afterwards. This is preferable to shifting the MSB
8958 // downwards because Shr is somewhat more expensive than Shl.
8959 const auto sign = BroadcastSignBit(v);
8960 const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
8961 return BitCast(di, abs >> count) ^ sign;
8962}
8963#endif
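// Why SignedShr above is correct (illustrative note, not part of the original
// header): sign is 0 for non-negative v and all-ones for negative v, so the
// XORs are no-ops in the non-negative case. For negative v, v ^ sign == ~v is
// non-negative; shifting it logically and XOR-ing with all-ones again matches
// an arithmetic shift of v, because complementing turns the incoming sign bits
// into zeros and back. Example with int8_t: v = -6 (0xFA), count = 1:
// abs = 0x05, abs >> 1 = 0x02, 0x02 ^ 0xFF = 0xFD = -3 == (-6 >> 1).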
8964
8965} // namespace detail
8966
8967template <size_t N>
8968HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v,
8969 Vec128<int16_t, N> bits) {
8970#if HWY_TARGET <= HWY_AVX3
8971 return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
8972#elif HWY_TARGET <= HWY_AVX2
8973 return detail::AVX2ShrI16Vec128(v, bits);
8974#else
8975 const DFromV<decltype(v)> d;
8976 return detail::SignedShr(d, v, bits);
8977#endif
8978}
8979
8980#if HWY_TARGET > HWY_AVX3
8981HWY_API Vec16<int16_t> operator>>(Vec16<int16_t> v, Vec16<int16_t> bits) {
8982#if HWY_TARGET <= HWY_SSE4
8983 const Vec16<int16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
8984#else
8985 const auto bits16 = And(bits, Vec16<int16_t>{_mm_set_epi64x(0, 0xFFFF)});
8986#endif
8987 return Vec16<int16_t>{_mm_sra_epi16(v.raw, bits16.raw)};
8988}
8989#endif
8990
8991template <size_t N>
8992HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v,
8993 Vec128<int8_t, N> bits) {
8994#if HWY_TARGET <= HWY_AVX2
8995 return detail::AVX2ShrI8Vec128(v, bits);
8996#else
8997 const DFromV<decltype(v)> d;
8998 return detail::SignedShr(d, v, bits);
8999#endif
9000}
9001HWY_API Vec128<int8_t, 1> operator>>(Vec128<int8_t, 1> v,
9002 Vec128<int8_t, 1> bits) {
9003#if HWY_TARGET <= HWY_SSE4
9004 const Vec16<int16_t> vi16{_mm_cvtepi8_epi16(v.raw)};
9005 const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
9006#else
9007 const DFromV<decltype(v)> d;
9008 const Rebind<int16_t, decltype(d)> di16;
9009 const Twice<decltype(d)> dt;
9010
9011 const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v)));
9012 const Vec16<uint16_t> bits8 =
9013 And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)});
9014#endif
9015 return Vec128<int8_t, 1>{_mm_sra_epi16(vi16.raw, bits8.raw)};
9016}
9017
9018template <size_t N>
9019HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v,
9020 Vec128<int32_t, N> bits) {
9021#if HWY_TARGET <= HWY_AVX2
9022 return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
9023#else
9024 const DFromV<decltype(v)> d;
9025 return detail::SignedShr(d, v, bits);
9026#endif
9027}
9028
9029#if HWY_TARGET > HWY_AVX2
9030HWY_API Vec32<int32_t> operator>>(Vec32<int32_t> v, Vec32<int32_t> bits) {
9031#if HWY_TARGET == HWY_SSE4
9032 const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
9033#else
9034 const auto bits32 = Combine(Full64<int32_t>(), Zero(Full32<int32_t>()), bits);
9035#endif
9036 return Vec32<int32_t>{_mm_sra_epi32(v.raw, bits32.raw)};
9037}
9038#endif
9039
9040template <size_t N>
9041HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v,
9042 Vec128<int64_t, N> bits) {
9043#if HWY_TARGET <= HWY_AVX3
9044 return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
9045#else
9046 const DFromV<decltype(v)> d;
9047 return detail::SignedShr(d, v, bits);
9048#endif
9049}
9050
9051// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
9052
9053namespace detail {
9054
9055template <class V, HWY_IF_U64(TFromV<V>)>
9056static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
9057 const DFromV<decltype(a)> du64;
9058 const RepartitionToNarrow<decltype(du64)> du32;
9059 const auto maskL = Set(du64, 0xFFFFFFFFULL);
9060 const auto a32 = BitCast(du32, a);
9061 const auto b32 = BitCast(du32, b);
9062 // Inputs for MulEven: we only need the lower 32 bits
9063 const auto aH = Shuffle2301(a32);
9064 const auto bH = Shuffle2301(b32);
9065
9066 // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
9067 // the even (lower 64 bits of every 128-bit block) results. See
9068 // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
9069 const auto aLbL = MulEven(a32, b32);
9070 const auto w3 = aLbL & maskL;
9071
9072 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
9073 const auto w2 = t2 & maskL;
9074 const auto w1 = ShiftRight<32>(t2);
9075
9076 const auto t = MulEven(a32, bH) + w2;
9077 const auto k = ShiftRight<32>(t);
9078
9079 mulH = MulEven(aH, bH) + w1 + k;
9080 return ShiftLeft<32>(t) + w3;
9081}
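// Scalar reference for the 64x64 -> 128 bit decomposition above (illustrative,
// not part of the original header). With a = aH*2^32 + aL and b = bH*2^32 + bL,
// a*b = aH*bH*2^64 + (aH*bL + aL*bH)*2^32 + aL*bL, evaluated via four
// 32x32 -> 64 bit products exactly like the vector code:
//
//   uint64_t Mul128Scalar(uint64_t a, uint64_t b, uint64_t* hi) {
//     const uint64_t aL = a & 0xFFFFFFFF, aH = a >> 32;
//     const uint64_t bL = b & 0xFFFFFFFF, bH = b >> 32;
//     const uint64_t aLbL = aL * bL;
//     const uint64_t t2 = aH * bL + (aLbL >> 32);   // cannot overflow
//     const uint64_t t = aL * bH + (t2 & 0xFFFFFFFF);
//     *hi = aH * bH + (t2 >> 32) + (t >> 32);
//     return (t << 32) + (aLbL & 0xFFFFFFFF);
//   }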
9082
9083template <class V, HWY_IF_I64(TFromV<V>)>
9084static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
9085 const DFromV<decltype(a)> di64;
9086 const RebindToUnsigned<decltype(di64)> du64;
9087 using VU64 = VFromD<decltype(du64)>;
9088
9089 VU64 unsigned_mulH;
9090 const auto mulL = BitCast(
9091 di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH));
9092 mulH = BitCast(di64, unsigned_mulH) - And(BroadcastSignBit(a), b) -
9093 And(a, BroadcastSignBit(b));
9094 return mulL;
9095}
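// Why the signed correction above works (illustrative note, not part of the
// original header): reinterpreting an i64 x as u64 adds 2^64 when x < 0.
// Hence (a + 2^64*[a<0]) * (b + 2^64*[b<0]) exceeds the signed product by
// 2^64 * (b*[a<0] + a*[b<0]) modulo 2^128, which only affects the upper half.
// Therefore mulH_signed = mulH_unsigned - b*[a<0] - a*[b<0]; the code selects
// those terms with BroadcastSignBit masks.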
9096
9097} // namespace detail
9098
9099#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
9100
9101template <class V, HWY_IF_UI64(TFromV<V>),
9102 HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
9103HWY_API V MulEven(V a, V b) {
9104 V mulH;
9105 const V mulL = detail::SSE2Mul128(a, b, mulH);
9106 return InterleaveLower(mulL, mulH);
9107}
9108
9109template <class V, HWY_IF_UI64(TFromV<V>),
9110 HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
9111HWY_API V MulOdd(V a, V b) {
9112 const DFromV<decltype(a)> du64;
9113 V mulH;
9114 const V mulL = detail::SSE2Mul128(a, b, mulH);
9115 return InterleaveUpper(du64, mulL, mulH);
9116}
9117
9118#endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
9119
9120template <class V, HWY_IF_UI64(TFromV<V>),
9121 HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))>
9122HWY_API V MulHigh(V a, V b) {
9123 V mulH;
9124 detail::SSE2Mul128(a, b, mulH);
9125 return mulH;
9126}
9127
9128#if HWY_ARCH_X86_64
9129
9130template <class T, HWY_IF_UI64(T)>
9131HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
9132 const DFromV<decltype(a)> d;
9133 alignas(16) T mul[2];
9134 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
9135 return Load(d, mul);
9136}
9137
9138template <class T, HWY_IF_UI64(T)>
9139HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
9140 const DFromV<decltype(a)> d;
9141 const Half<decltype(d)> d2;
9142 alignas(16) T mul[2];
9143 const T a1 = GetLane(UpperHalf(d2, a));
9144 const T b1 = GetLane(UpperHalf(d2, b));
9145 mul[0] = Mul128(a1, b1, &mul[1]);
9146 return Load(d, mul);
9147}
9148
9149template <class T, HWY_IF_UI64(T)>
9150HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
9151 T hi;
9152 Mul128(GetLane(a), GetLane(b), &hi);
9153 return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))};
9154}
9155
9156#endif // HWY_ARCH_X86_64
9157
9158// ------------------------------ WidenMulPairwiseAdd
9159
9160// Generic for all vector lengths.
9161template <class D32, HWY_IF_F32_D(D32),
9162 class V16 = VFromD<Repartition<bfloat16_t, D32>>>
9163HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
9164 // TODO(janwas): _mm_dpbf16_ps when available
9165 const RebindToUnsigned<decltype(df32)> du32;
9166 // Lane order within sum0/1 is undefined, hence we can avoid the
9167 // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
9168 // leads to the odd/even order that RearrangeToOddPlusEven prefers.
9169 using VU32 = VFromD<decltype(du32)>;
9170 const VU32 odd = Set(du32, 0xFFFF0000u);
9171 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
9172 const VU32 ao = And(BitCast(du32, a), odd);
9173 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
9174 const VU32 bo = And(BitCast(du32, b), odd);
9175 return MulAdd(BitCast(df32, ae), BitCast(df32, be),
9176 Mul(BitCast(df32, ao), BitCast(df32, bo)));
9177}
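// Usage sketch for the bf16 overload above (illustrative; from_a and from_b
// are hypothetical aligned arrays of bfloat16_t; not part of the original
// header):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> df32;                               // 4 x f32
//   const hn::Repartition<hwy::bfloat16_t, decltype(df32)> dbf;  // 8 x bf16
//   const auto a = hn::Load(dbf, from_a);
//   const auto b = hn::Load(dbf, from_b);
//   // Each f32 lane i receives a[2i]*b[2i] + a[2i+1]*b[2i+1].
//   const auto sums = hn::WidenMulPairwiseAdd(df32, a, b);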
9178
9179// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
9180template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
9181 class V16 = VFromD<RepartitionToNarrow<D32>>>
9182HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
9183 return VFromD<D32>{_mm_madd_epi16(a.raw, b.raw)};
9184}
9185
9186// Generic for all vector lengths.
9187template <class DU32, HWY_IF_U32_D(DU32),
9188 class VU16 = VFromD<RepartitionToNarrow<DU32>>>
9189HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
9190 const auto p_lo = a * b;
9191 const auto p_hi = MulHigh(a, b);
9192
9193 const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo));
9194 const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)),
9195 ShiftRight<16>(BitCast(du32, p_lo)));
9196 return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1));
9197}
9198
9199// ------------------------------ SatWidenMulPairwiseAdd
9200
9201#if HWY_TARGET <= HWY_SSSE3
9202
9203#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
9204#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
9205#else
9206#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
9207#endif
9208
9209// Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16
9210// is safe.
9211template <class DI16, HWY_IF_I16_D(DI16), HWY_IF_V_SIZE_LE_D(DI16, 16)>
9212HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
9213 DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
9214 VFromD<Repartition<int8_t, DI16>> b) {
9215 return VFromD<DI16>{_mm_maddubs_epi16(a.raw, b.raw)};
9216}
9217
9218#endif
9219
9220// ------------------------------ SatWidenMulPairwiseAccumulate
9221
9222#if HWY_TARGET <= HWY_AVX3_DL
9223
9224#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
9225#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
9226#else
9227#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
9228#endif
9229
9230// Even if N=1, the I16 vectors have at least 2 lanes, hence _mm_dpwssds_epi32
9231// is safe.
9232template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
9233HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
9234 DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
9235 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
9236 return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)};
9237}
9238
9239#endif // HWY_TARGET <= HWY_AVX3_DL
9240
9241// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ShiftLeft)
9242
9243// Generic for all vector lengths.
9244template <class D32, HWY_IF_F32_D(D32),
9245 class V16 = VFromD<Repartition<bfloat16_t, D32>>>
9246HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
9247 const VFromD<D32> sum0,
9248 VFromD<D32>& sum1) {
9249 // TODO(janwas): _mm_dpbf16_ps when available
9250 const RebindToUnsigned<decltype(df32)> du32;
9251 // Lane order within sum0/1 is undefined, hence we can avoid the
9252 // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
9253 // leads to the odd/even order that RearrangeToOddPlusEven prefers.
9254 using VU32 = VFromD<decltype(du32)>;
9255 const VU32 odd = Set(du32, 0xFFFF0000u);
9256 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
9257 const VU32 ao = And(BitCast(du32, a), odd);
9258 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
9259 const VU32 bo = And(BitCast(du32, b), odd);
9260 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
9261 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
9262}
9263
9264// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
9265template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
9266 class V16 = VFromD<RepartitionToNarrow<D32>>>
9267HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b,
9268 const VFromD<D32> sum0,
9269 VFromD<D32>& /*sum1*/) {
9270 (void)d;
9271#if HWY_TARGET <= HWY_AVX3_DL
9272 return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)};
9273#else
9274 return sum0 + WidenMulPairwiseAdd(d, a, b);
9275#endif
9276}
9277
9278template <class DU32, HWY_IF_U32_D(DU32),
9281 const VFromD<DU32> sum0,
9282 VFromD<DU32>& /*sum1*/) {
9283 (void)d;
9284 return sum0 + WidenMulPairwiseAdd(d, a, b);
9285}
9286
9287// ------------------------------ RearrangeToOddPlusEven
9288template <size_t N>
9289HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0,
9290 Vec128<int32_t, N> /*sum1*/) {
9291 return sum0; // invariant already holds
9292}
9293
9294template <size_t N>
9295HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
9296 const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
9297 return sum0; // invariant already holds
9298}
9299
9300template <class VW>
9301HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
9302 return Add(sum0, sum1);
9303}
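// Typical i16 dot-product loop combining the ops above (illustrative sketch;
// pa, pb and num are hypothetical; not part of the original header):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<int32_t> d32;
//   const hn::Repartition<int16_t, decltype(d32)> d16;
//   auto sum0 = hn::Zero(d32);
//   auto sum1 = hn::Zero(d32);
//   for (size_t i = 0; i < num; i += hn::Lanes(d16)) {
//     const auto a = hn::LoadU(d16, pa + i);
//     const auto b = hn::LoadU(d16, pb + i);
//     sum0 = hn::ReorderWidenMulAccumulate(d32, a, b, sum0, sum1);
//   }
//   // Restore the even/odd pairing, then reduce across lanes.
//   const auto total = hn::SumOfLanes(d32, hn::RearrangeToOddPlusEven(sum0, sum1));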
9304
9305// ------------------------------ SumOfMulQuadAccumulate
9306#if HWY_TARGET <= HWY_AVX3_DL
9307
9308#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
9309#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
9310#else
9311#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
9312#endif
9313
9314template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
9315HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
9316 DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
9317 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
9318 return VFromD<DI32>{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
9319}
9320
9321#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
9322#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
9323#else
9324#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
9325#endif
9326template <class DI32, HWY_IF_I32_D(DI32)>
9327HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
9328 VFromD<Repartition<int8_t, DI32>> a,
9329 VFromD<Repartition<int8_t, DI32>> b,
9330 VFromD<DI32> sum) {
9331 // TODO(janwas): AVX-VNNI-INT8 has dpbssd.
9332 const Repartition<uint8_t, decltype(di32)> du8;
9333
9334 const auto a_u = BitCast(du8, a);
9335 const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum);
9336 const auto result_sum_1 = ShiftLeft<8>(
9337 SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32)));
9338 return result_sum_0 - result_sum_1;
9339}
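// Why the i8 x i8 overload above is correct (illustrative note, not part of
// the original header): reading a signed byte a as unsigned adds 256 when
// a < 0, i.e. a_u = a + 256*[a < 0], so per group of four lanes
//   sum(a_u[j]*b[j]) = sum(a[j]*b[j]) + 256*sum(b[j]*[a[j] < 0]).
// ShiftRight<7>(a_u) is exactly [a[j] < 0], so the second dpbusd computes
// sum(b[j]*[a[j] < 0]) and ShiftLeft<8> supplies the factor 256 that is then
// subtracted.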
9340
9341#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
9342#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
9343#else
9344#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
9345#endif
9346template <class DU32, HWY_IF_U32_D(DU32)>
9347HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
9348 DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
9349 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
9350 // TODO(janwas): AVX-VNNI-INT8 has dpbuud.
9351 const Repartition<uint8_t, decltype(du32)> du8;
9352 const RebindToSigned<decltype(du8)> di8;
9353 const RebindToSigned<decltype(du32)> di32;
9354
9355 const auto b_i = BitCast(di8, b);
9356 const auto result_sum_0 =
9357 SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum));
9358 const auto result_sum_1 = ShiftLeft<8>(
9359 SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32)));
9360
9361 return BitCast(du32, result_sum_0 - result_sum_1);
9362}
9363
9364#endif // HWY_TARGET <= HWY_AVX3_DL
9365
9366// ================================================== CONVERT
9367
9368// ------------------------------ Promotions (part w/ narrow lanes -> full)
9369
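// The PromoteTo overloads below take a destination tag whose lane count equals
// that of the narrower source vector, typically obtained via Rebind.
// Illustrative usage sketch (src is a hypothetical array; not part of the
// original header):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full64<uint8_t> d8;                  // 8 x u8 (64 bits)
//   const hn::Rebind<uint16_t, decltype(d8)> d16;  // 8 x u16 (128 bits)
//   const auto bytes = hn::Load(d8, src);
//   const auto words = hn::PromoteTo(d16, bytes);  // zero-extends each lane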
9370// Unsigned: zero-extend.
9371template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
9372HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
9373#if HWY_TARGET >= HWY_SSSE3
9374 const __m128i zero = _mm_setzero_si128();
9375 return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
9376#else
9377 return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
9378#endif
9379}
9380template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
9381HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
9382#if HWY_TARGET >= HWY_SSSE3
9383 return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
9384#else
9385 return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
9386#endif
9387}
9388template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
9389HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
9390#if HWY_TARGET >= HWY_SSSE3
9391 return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
9392#else
9393 return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
9394#endif
9395}
9396template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
9397HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
9398#if HWY_TARGET >= HWY_SSSE3
9399 const __m128i zero = _mm_setzero_si128();
9400 const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
9401 return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
9402#else
9403 return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
9404#endif
9405}
9406template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
9407HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
9408#if HWY_TARGET > HWY_SSSE3
9409 const Rebind<uint32_t, decltype(d)> du32;
9410 return PromoteTo(d, PromoteTo(du32, v));
9411#elif HWY_TARGET == HWY_SSSE3
9412 alignas(16) static constexpr int8_t kShuffle[16] = {
9413 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
9414 const Repartition<int8_t, decltype(d)> di8;
9415 return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
9416#else
9417 (void)d;
9418 return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
9419#endif
9420}
9421template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
9422HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
9423#if HWY_TARGET > HWY_SSSE3
9424 const Rebind<uint32_t, decltype(d)> du32;
9425 return PromoteTo(d, PromoteTo(du32, v));
9426#elif HWY_TARGET == HWY_SSSE3
9427 alignas(16) static constexpr int8_t kShuffle[16] = {
9428 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
9429 const Repartition<int8_t, decltype(d)> di8;
9430 return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
9431#else
9432 (void)d;
9433 return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
9434#endif
9435}
9436
9437// Unsigned to signed: same plus cast.
9438template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
9439 HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
9441HWY_API VFromD<D> PromoteTo(D di, V v) {
9442 const RebindToUnsigned<decltype(di)> du;
9443 return BitCast(di, PromoteTo(du, v));
9444}
9445
9446// Signed: replicate sign bit.
9447template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
9448HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
9449#if HWY_TARGET >= HWY_SSSE3
9450 return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
9451#else
9452 return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
9453#endif
9454}
9455template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
9456HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
9457#if HWY_TARGET >= HWY_SSSE3
9458 return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
9459#else
9460 return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
9461#endif
9462}
9463template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
9464HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9465#if HWY_TARGET >= HWY_SSSE3
9466 return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
9467#else
9468 return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
9469#endif
9470}
9471template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
9472HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
9473#if HWY_TARGET >= HWY_SSSE3
9474 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
9475 const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
9476 return ShiftRight<24>(VFromD<D>{x4});
9477#else
9478 return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
9479#endif
9480}
9481template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
9482HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
9483#if HWY_TARGET >= HWY_SSSE3
9484 const Repartition<int32_t, decltype(d)> di32;
9485 const Half<decltype(di32)> dh_i32;
9486 const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
9487 const VFromD<decltype(di32)> s4{
9488 _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
9489 return ZipLower(d, x4, s4);
9490#else
9491 (void)d;
9492 return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
9493#endif
9494}
9495template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
9496HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
9497#if HWY_TARGET >= HWY_SSSE3
9498 const Repartition<int32_t, decltype(d)> di32;
9499 const Half<decltype(di32)> dh_i32;
9500 const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
9501 const VFromD<decltype(di32)> s2{
9502 _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
9503 return ZipLower(d, x2, s2);
9504#else
9505 (void)d;
9506 return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
9507#endif
9508}
9509
9510#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
9511
9512// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
9513#ifdef HWY_NATIVE_F16C
9514#undef HWY_NATIVE_F16C
9515#else
9516#define HWY_NATIVE_F16C
9517#endif
9518
9519// Workaround for origin tracking bug in Clang msan prior to 11.0
9520// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
9521#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
9522#define HWY_INLINE_F16 HWY_NOINLINE
9523#else
9524#define HWY_INLINE_F16 HWY_INLINE
9525#endif
9526template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
9527HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
9528#if HWY_HAVE_FLOAT16
9529 const RebindToUnsigned<DFromV<decltype(v)>> du16;
9530 return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
9531#else
9532 return VFromD<D>{_mm_cvtph_ps(v.raw)};
9533#endif
9534}
9535
9536#endif // HWY_NATIVE_F16C
9537
9538#if HWY_HAVE_FLOAT16
9539
9540#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
9541#undef HWY_NATIVE_PROMOTE_F16_TO_F64
9542#else
9543#define HWY_NATIVE_PROMOTE_F16_TO_F64
9544#endif
9545
9546template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9547HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
9548 return VFromD<D>{_mm_cvtph_pd(v.raw)};
9549}
9550
9551#endif // HWY_HAVE_FLOAT16
9552
9553template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
9554HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
9555 const Rebind<uint16_t, decltype(df32)> du16;
9556 const RebindToSigned<decltype(df32)> di32;
9557 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
9558}
9559
9560template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9561HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
9562 return VFromD<D>{_mm_cvtps_pd(v.raw)};
9563}
9564
9565template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9566HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9567 return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
9568}
9569
9570#if HWY_TARGET <= HWY_AVX3
9571template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9572HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
9573 return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
9574}
9575#else
9576// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
9577template <class D, HWY_IF_F64_D(D)>
9578HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
9579 const Rebind<int32_t, decltype(df64)> di32;
9580 const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
9581 return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
9582 Set(df64, 4294967296.0),
9583 Zero(df64));
9584}
9585#endif
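// Why the non-AVX3 u32 -> f64 path above works (illustrative note, not part of
// the original header): converting the same bits as i32 yields the exact value
// when the top bit is clear, and u - 2^32 (a negative double) when it is set.
// Adding 4294967296.0 = 2^32 back only where the converted value is negative
// therefore restores the unsigned value, e.g. 0xFFFFFFFF -> -1.0 + 2^32 =
// 4294967295.0.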
9586
9587// ------------------------------ PromoteEvenTo/PromoteOddTo
9588
9589#if HWY_TARGET > HWY_AVX3
9590namespace detail {
9591
9592// I32->I64 PromoteEvenTo/PromoteOddTo
9593
9594template <class D, HWY_IF_LANES_D(D, 1)>
9595HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9596 hwy::SizeTag<8> /*to_lane_size_tag*/,
9597 hwy::SignedTag /*from_type_tag*/, D d_to,
9598 Vec64<int32_t> v) {
9599 return PromoteLowerTo(d_to, v);
9600}
9601
9602template <class D, HWY_IF_LANES_D(D, 2)>
9603HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9604 hwy::SizeTag<8> /*to_lane_size_tag*/,
9605 hwy::SignedTag /*from_type_tag*/, D d_to,
9606 Vec128<int32_t> v) {
9607 const Repartition<int32_t, D> d_from;
9608 return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
9609}
9610
9611template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
9612HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
9613 hwy::SizeTag<8> /*to_lane_size_tag*/,
9614 hwy::SignedTag /*from_type_tag*/, D d_to,
9615 V v) {
9616 const Repartition<int32_t, D> d_from;
9617 return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
9618}
9619
9620} // namespace detail
9621#endif
9622
9623// ------------------------------ Demotions (full -> part w/ narrow lanes)
9624
9625template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
9626HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9627 return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)};
9628}
9629
9630template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
9631HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9632#if HWY_TARGET >= HWY_SSSE3
9633 const Rebind<int32_t, D> di32;
9634 const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
9635 const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
9636 const auto clamped = Or(zero_if_neg, too_big);
9637#if HWY_TARGET == HWY_SSE2
9638 const Rebind<uint16_t, decltype(di32)> du16;
9639 const RebindToSigned<decltype(du16)> di16;
9640 return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
9641#else
9642 const Repartition<uint16_t, decltype(di32)> du16;
9643 // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
9644 alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
9645 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
9646 const auto lo2 = Load(du16, kLower2Bytes);
9647 return VFromD<D>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
9648#endif
9649#else
9650 return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)};
9651#endif
9652}
9653
9654template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
9655HWY_API VFromD<D> DemoteTo(D du16, VFromD<Rebind<uint32_t, D>> v) {
9656 const DFromV<decltype(v)> du32;
9657 const RebindToSigned<decltype(du32)> di32;
9658#if HWY_TARGET >= HWY_SSSE3
9659 const auto too_big =
9660 VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32)));
9661 const auto clamped = Or(BitCast(di32, v), too_big);
9662#if HWY_TARGET == HWY_SSE2
9663 const RebindToSigned<decltype(du16)> di16;
9664 return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
9665#else
9666 (void)du16;
9667 const Repartition<uint16_t, decltype(di32)> du16_full;
9668 // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
9669 alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
9670 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
9671 const auto lo2 = Load(du16_full, kLower2Bytes);
9672 return VFromD<D>{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw};
9673#endif
9674#else
9675 return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF))));
9676#endif
9677}
9678
9679template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
9680HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9681 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
9682 return VFromD<D>{_mm_packus_epi16(i16, i16)};
9683}
9684
9685template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
9686HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
9687 return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)};
9688}
9689
9690template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
9691HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9692 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
9693 return VFromD<D>{_mm_packs_epi16(i16, i16)};
9694}
9695
9696template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
9697HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
9698 return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)};
9699}
9700
9701template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
9702HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint32_t, D>> v) {
9703#if HWY_TARGET <= HWY_AVX3
9704 // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned
9705 // integers to 8-bit unsigned integers
9706 (void)du8;
9707 return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)};
9708#else
9709 const DFromV<decltype(v)> du32;
9710 const RebindToSigned<decltype(du32)> di32;
9711 const auto max_i32 = Set(du32, 0x7FFFFFFFu);
9712
9713#if HWY_TARGET >= HWY_SSSE3
9714 // On SSE2/SSSE3, clamp u32 values to an i32 using the u8 Min operation
9715 // as SSE2/SSSE3 can do an u8 Min operation in a single instruction.
9716
9717 // The u8 Min operation below leaves the lower 24 bits of each 32-bit
9718 // lane unchanged.
9719
9720 // The u8 Min operation below will leave any values that are less than or
9721 // equal to 0x7FFFFFFF unchanged.
9722
9723 // For values that are greater than or equal to 0x80000000, the u8 Min
9724 // operation below will force the upper 8 bits to 0x7F and leave the lower
9725 // 24 bits unchanged.
9726
9727 // An u8 Min operation is okay here as any clamped value that is greater than
9728 // or equal to 0x80000000 will be clamped to a value between 0x7F000000 and
9729 // 0x7FFFFFFF through the u8 Min operation below, which will then be converted
9730 // to 0xFF through the i32->u8 demotion.
9731 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
9732 const auto clamped = BitCast(
9733 di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32)));
9734#else
9735 const auto clamped = BitCast(di32, Min(v, max_i32));
9736#endif
9737
9738 return DemoteTo(du8, clamped);
9739#endif
9740}
9741
9742template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
9743HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
9744 const DFromV<decltype(v)> du16;
9745 const RebindToSigned<decltype(du16)> di16;
9746 const auto max_i16 = Set(du16, 0x7FFF);
9747
9748#if HWY_TARGET >= HWY_SSSE3
9749 // On SSE2/SSSE3, clamp u16 values to an i16 using the u8 Min operation
9750 // as SSE2/SSSE3 can do an u8 Min operation in a single instruction.
9751
9752 // The u8 Min operation below leaves the lower 8 bits of each 16-bit
9753 // lane unchanged.
9754
9755 // The u8 Min operation below will leave any values that are less than or
9756 // equal to 0x7FFF unchanged.
9757
9758 // For values that are greater than or equal to 0x8000, the u8 Min
9759 // operation below will force the upper 8 bits to 0x7F and leave the lower
9760 // 8 bits unchanged.
9761
9762 // An u8 Min operation is okay here as any clamped value that is greater than
9763 // or equal to 0x8000 will be clamped to a value between 0x7F00 and
9764 // 0x7FFF through the u8 Min operation below, which will then be converted
9765 // to 0xFF through the i16->u8 demotion.
9766 const Repartition<uint8_t, decltype(du16)> du16_as_du8;
9767 const auto clamped = BitCast(
9768 di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16)));
9769#else
9770 const auto clamped = BitCast(di16, Min(v, max_i16));
9771#endif
9772
9773 return DemoteTo(du8, clamped);
9774}
9775
9776#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
9777
9778// HWY_NATIVE_F16C was already toggled above.
9779
9780// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
9781// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
9782HWY_DIAGNOSTICS(push)
9783HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
9784
9785template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
9786HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
9787 const RebindToUnsigned<decltype(df16)> du16;
9788 return BitCast(
9789 df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
9790}
9791
9792HWY_DIAGNOSTICS(pop)
9793
9794#endif // F16C
9795
9796#if HWY_HAVE_FLOAT16
9797
9798#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
9799#undef HWY_NATIVE_DEMOTE_F64_TO_F16
9800#else
9801#define HWY_NATIVE_DEMOTE_F64_TO_F16
9802#endif
9803
9804template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
9805HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
9806 return VFromD<D>{_mm_cvtpd_ph(v.raw)};
9807}
9808
9809#endif // HWY_HAVE_FLOAT16
9810
9811// The _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics require GCC 9 or later
9812// or Clang 10 or later
9813
9814// Also need GCC or Clang to bit cast the __m128bh, __m256bh, or __m512bh vector
9815// returned by the _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics to a
9816// __m128i, __m256i, or __m512i as there are currently no intrinsics available
9817// (as of GCC 13 and Clang 17) to bit cast a __m128bh, __m256bh, or __m512bh
9818// vector to a __m128i, __m256i, or __m512i vector
9819
9820#if HWY_AVX3_HAVE_F32_TO_BF16C
9821#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
9822#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
9823#else
9824#define HWY_NATIVE_DEMOTE_F32_TO_BF16
9825#endif
9826
9827template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
9828HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
9829#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
9830 // Inline assembly workaround for LLVM codegen bug
9831 __m128i raw_result;
9832 __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
9833 return VFromD<D>{raw_result};
9834#else
9835 // The _mm_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
9836 // bit casted to a __m128i vector
9837 return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))};
9838#endif
9839}
9840
9841template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
9842HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec128<float> a,
9843 Vec128<float> b) {
9844#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
9845 // Inline assembly workaround for LLVM codegen bug
9846 __m128i raw_result;
9847 __asm__("vcvtne2ps2bf16 %2, %1, %0"
9848 : "=v"(raw_result)
9849 : "v"(b.raw), "v"(a.raw));
9850 return VFromD<D>{raw_result};
9851#else
9852 // The _mm_cvtne2ps_pbh intrinsic returns a __m128bh vector that needs to be
9853 // bit casted to a __m128i vector
9854 return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))};
9855#endif
9856}
9857
9858template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
9859HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<float> a,
9860 Vec64<float> b) {
9861 return VFromD<D>{_mm_shuffle_epi32(
9862 detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)),
9863 _MM_SHUFFLE(2, 0, 2, 0))};
9864}
9865
9866template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
9867HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec32<float> a, Vec32<float> b) {
9868 const DFromV<decltype(a)> d;
9869 const Twice<decltype(d)> dt;
9870 return DemoteTo(dbf16, Combine(dt, b, a));
9871}
9872#endif // HWY_AVX3_HAVE_F32_TO_BF16C
9873
9874// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
9875template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
9876HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
9877 const DFromV<decltype(a)> d;
9878 const Twice<decltype(d)> dt;
9879 return DemoteTo(dn, Combine(dt, b, a));
9880}
9881template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
9882HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int32_t> a,
9883 Vec64<int32_t> b) {
9884 return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw),
9885 _MM_SHUFFLE(2, 0, 2, 0))};
9886}
9887template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
9888HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
9889 Vec128<int32_t> b) {
9890 return VFromD<D>{_mm_packs_epi32(a.raw, b.raw)};
9891}
9892
9893template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
9894HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
9895 const DFromV<decltype(a)> d;
9896 const Twice<decltype(d)> dt;
9897 return DemoteTo(dn, Combine(dt, b, a));
9898}
9899template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
9900HWY_API VFromD<D> ReorderDemote2To(D dn, Vec64<int32_t> a, Vec64<int32_t> b) {
9901#if HWY_TARGET >= HWY_SSSE3
9902 const DFromV<decltype(a)> d;
9903 const Twice<decltype(d)> dt;
9904 return DemoteTo(dn, Combine(dt, b, a));
9905#else
9906 (void)dn;
9907 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw),
9908 _MM_SHUFFLE(2, 0, 2, 0))};
9909#endif
9910}
9911template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
9912HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<int32_t> a, Vec128<int32_t> b) {
9913#if HWY_TARGET >= HWY_SSSE3
9914 const Half<decltype(dn)> dnh;
9915 const auto u16_a = DemoteTo(dnh, a);
9916 const auto u16_b = DemoteTo(dnh, b);
9917 return Combine(dn, u16_b, u16_a);
9918#else
9919 (void)dn;
9920 return VFromD<D>{_mm_packus_epi32(a.raw, b.raw)};
9921#endif
9922}
9923
9924template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
9925HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a,
9926 Vec128<uint32_t> b) {
9927 const DFromV<decltype(a)> du32;
9928 const RebindToSigned<decltype(du32)> di32;
9929 const auto max_i32 = Set(du32, 0x7FFFFFFFu);
9930
9931#if HWY_TARGET >= HWY_SSSE3
9932 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
9933 // On SSE2/SSSE3, clamp a and b using u8 Min operation
9934 const auto clamped_a = BitCast(
9935 di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32)));
9936 const auto clamped_b = BitCast(
9937 di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32)));
9938#else
9939 const auto clamped_a = BitCast(di32, Min(a, max_i32));
9940 const auto clamped_b = BitCast(di32, Min(b, max_i32));
9941#endif
9942
9943 return ReorderDemote2To(dn, clamped_a, clamped_b);
9944}
9945
9946template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
9947HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a,
9948 VFromD<Repartition<uint32_t, D>> b) {
9949 const DFromV<decltype(a)> d;
9950 const Twice<decltype(d)> dt;
9951 return DemoteTo(dn, Combine(dt, b, a));
9952}
9953
9954// Specializations for partial vectors because packs_epi16 sets lanes above 2*N.
9955template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
9956HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
9957 VFromD<Repartition<int16_t, D>> b) {
9958 const DFromV<decltype(a)> d;
9959 const Twice<decltype(d)> dt;
9960 return DemoteTo(dn, Combine(dt, b, a));
9961}
9962template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
9963HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a,
9964 Vec64<int16_t> b) {
9965 return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw),
9966 _MM_SHUFFLE(2, 0, 2, 0))};
9967}
9968template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
9969HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
9970 Vec128<int16_t> b) {
9971 return VFromD<D>{_mm_packs_epi16(a.raw, b.raw)};
9972}
9973
9974template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
9975HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
9976 VFromD<Repartition<int16_t, D>> b) {
9977 const DFromV<decltype(a)> d;
9978 const Twice<decltype(d)> dt;
9979 return DemoteTo(dn, Combine(dt, b, a));
9980}
9981template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
9982HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a,
9983 Vec64<int16_t> b) {
9984 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw),
9985 _MM_SHUFFLE(2, 0, 2, 0))};
9986}
9987template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
9988HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
9989 Vec128<int16_t> b) {
9990 return VFromD<D>{_mm_packus_epi16(a.raw, b.raw)};
9991}
9992
9993template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
9994HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a,
9995 Vec128<uint16_t> b) {
9996 const DFromV<decltype(a)> du16;
9997 const RebindToSigned<decltype(du16)> di16;
9998 const auto max_i16 = Set(du16, 0x7FFFu);
9999
10000#if HWY_TARGET >= HWY_SSSE3
10001 const Repartition<uint8_t, decltype(du16)> du16_as_du8;
10002 // On SSE2/SSSE3, clamp a and b using u8 Min operation
10003 const auto clamped_a = BitCast(
10004 di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16)));
10005 const auto clamped_b = BitCast(
10006 di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16)));
10007#else
10008 const auto clamped_a = BitCast(di16, Min(a, max_i16));
10009 const auto clamped_b = BitCast(di16, Min(b, max_i16));
10010#endif
10011
10012 return ReorderDemote2To(dn, clamped_a, clamped_b);
10013}
10014
10015template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
10016HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a,
10017 VFromD<Repartition<uint16_t, D>> b) {
10018 const DFromV<decltype(a)> d;
10019 const Twice<decltype(d)> dt;
10020 return DemoteTo(dn, Combine(dt, b, a));
10021}
10022
10023template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
10024 HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
10025 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
10026          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
10027HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
10028 return ReorderDemote2To(d, a, b);
10029}
10030
10031#if HWY_AVX3_HAVE_F32_TO_BF16C
10032// F32 to BF16 OrderedDemote2To is generic for all vector lengths on targets
10033// that support AVX512BF16
10034template <class D, HWY_IF_BF16_D(D)>
10035HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
10036 VFromD<Repartition<float, D>> b) {
10037 return ReorderDemote2To(dbf16, a, b);
10038}
10039#endif // HWY_AVX3_HAVE_F32_TO_BF16C
10040
10041template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
10042HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10043 return VFromD<D>{_mm_cvtpd_ps(v.raw)};
10044}
10045
10046namespace detail {
10047
10048// Generic for all vector lengths.
10049template <class D>
10050HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) {
10051 // The max can be exactly represented in binary64, so clamping beforehand
10052 // prevents x86 conversion from raising an exception and returning 80..00.
10053 return Min(v, Set(d, 2147483647.0));
10054}
10055
10056// For ConvertTo float->int of same size, clamping before conversion would
10057// change the result because the max integer value is not exactly representable.
10058// Instead detect the overflow result after conversion and fix it.
10059// Generic for all vector lengths.
10060template <class DI>
10061HWY_INLINE VFromD<DI> FixConversionOverflow(DI di,
10062 VFromD<RebindToFloat<DI>> original,
10063 VFromD<DI> converted) {
10064 // Combinations of original and output sign:
10065 // --: normal <0 or -huge_val to 80..00: OK
10066 // -+: -0 to 0 : OK
10067 // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
10068 // ++: normal >0 : OK
10069 const VFromD<DI> sign_wrong = AndNot(BitCast(di, original), converted);
10070#if HWY_COMPILER_GCC_ACTUAL
10071 // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
10072 // Add() if using that instead. Work around with one more instruction.
10073 const RebindToUnsigned<DI> du;
10074 const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
10075 const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
10076 return IfVecThenElse(mask, max, converted);
10077#else
10078 return Xor(converted, BroadcastSignBit(sign_wrong));
10079#endif
10080}
10081
10082} // namespace detail
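// --- Illustrative scalar sketch (not part of the original file) ---
// Scalar analogue of detail::FixConversionOverflow above, for the f32->i32
// case and non-NaN inputs. x86 truncation returns 0x80000000 ("integer
// indefinite") on overflow, which is only wrong when the input sign bit is
// clear; flipping all bits then yields the saturated 0x7FFFFFFF. The helper
// name below is made up for illustration.
inline int32_t FixF32ToI32OverflowScalar(float original, int32_t converted) {
  // Equivalent of AndNot(sign of original, sign of converted) being set.
  const bool sign_wrong = !(original < 0.0f) && (converted < 0);
  // Equivalent of Xor with BroadcastSignBit: 0x80000000 -> 0x7FFFFFFF.
  return sign_wrong ? ~converted : converted;
}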
10083
10084#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
10085#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
10086#else
10087#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
10088#endif
10089
10090template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
10091HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10092 return VFromD<D>{_mm_cvttpd_epi32(v.raw)};
10093}
10094
10095// F64 to I32 DemoteTo is generic for all vector lengths
10096template <class D, HWY_IF_I32_D(D)>
10097HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) {
10098 const Rebind<double, decltype(di32)> df64;
10099 const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v);
10100 return DemoteInRangeTo(di32, clamped);
10101}
10102
10103#if HWY_TARGET <= HWY_AVX3
10104template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10105HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10106 return VFromD<D>{_mm_cvttpd_epu32(v.raw)};
10107}
10108
10109template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10110HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10111 return VFromD<D>{
10112 _mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10113}
10114#else // HWY_TARGET > HWY_AVX3
10115
10116// F64 to U32 DemoteInRangeTo is generic for all vector lengths on
10117// SSE2/SSSE3/SSE4/AVX2
10118template <class D, HWY_IF_U32_D(D)>
10119HWY_API VFromD<D> DemoteInRangeTo(D du32, VFromD<Rebind<double, D>> v) {
10120 const RebindToSigned<decltype(du32)> di32;
10121 const Rebind<double, decltype(du32)> df64;
10122 const RebindToUnsigned<decltype(df64)> du64;
10123
10124 const auto k2_31 = Set(df64, 2147483648.0);
10125 const auto v_is_ge_k2_31 = (v >= k2_31);
10126 const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31);
10127 const auto clamped_lo31_u32 =
10128 BitCast(du32, DemoteInRangeTo(di32, clamped_lo31_f64));
10129 const auto clamped_u32_msb = ShiftLeft<31>(
10130 TruncateTo(du32, BitCast(du64, VecFromMask(df64, v_is_ge_k2_31))));
10131 return Or(clamped_lo31_u32, clamped_u32_msb);
10132}
10133
10134// F64 to U32 DemoteTo is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10135template <class D, HWY_IF_U32_D(D)>
10136HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
10137 const Rebind<double, decltype(du32)> df64;
10138 const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
10139 return DemoteInRangeTo(du32, clamped);
10140}
10141#endif // HWY_TARGET <= HWY_AVX3
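// --- Illustrative scalar sketch (not part of the original file) ---
// Scalar view of the SSE2..AVX2 DemoteInRangeTo(u32, f64) above: only a signed
// f64->i32 conversion exists, so inputs >= 2^31 are reduced by 2^31 before the
// conversion and the removed bit is re-inserted as the MSB of the result. The
// helper name below is made up for illustration.
inline uint32_t DemoteInRangeF64ToU32Scalar(double v) {
  const bool ge_2p31 = (v >= 2147483648.0);
  const double reduced = ge_2p31 ? (v - 2147483648.0) : v;  // now < 2^31
  const uint32_t lo31 = static_cast<uint32_t>(static_cast<int32_t>(reduced));
  return lo31 | (static_cast<uint32_t>(ge_2p31) << 31);
}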
10142
10143#if HWY_TARGET <= HWY_AVX3
10144template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
10145HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10146 return VFromD<D>{_mm_cvtepi64_ps(v.raw)};
10147}
10148template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
10149HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
10150 return VFromD<D>{_mm_cvtepu64_ps(v.raw)};
10151}
10152#else
10153// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10154template <class D, HWY_IF_F32_D(D)>
10155HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
10156 const Rebind<double, decltype(df32)> df64;
10157 const RebindToUnsigned<decltype(df64)> du64;
10158 const RebindToSigned<decltype(df32)> di32;
10159 const RebindToUnsigned<decltype(df32)> du32;
10160
10161 const auto k2p64_63 = Set(df64, 27670116110564327424.0);
10162 const auto f64_hi52 =
10163 Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
10164 const auto f64_lo12 =
10165 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
10166 Set(du32, uint32_t{0x00000FFF}))));
10167
10168 const auto f64_sum = f64_hi52 + f64_lo12;
10169 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
10170
10171 const auto f64_sum_is_inexact =
10172 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
10173 const auto f64_bits_decrement =
10174 And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
10175 f64_sum_is_inexact);
10176
10177 const auto adj_f64_val = BitCast(
10178 df64,
10179 Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
10180
10181 return DemoteTo(df32, adj_f64_val);
10182}
10183
10184// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10185template <class D, HWY_IF_F32_D(D)>
10186HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
10187 const Rebind<double, decltype(df32)> df64;
10188 const RebindToUnsigned<decltype(df64)> du64;
10189 const RebindToSigned<decltype(df32)> di32;
10190 const RebindToUnsigned<decltype(df32)> du32;
10191
10192 const auto k2p64 = Set(df64, 18446744073709551616.0);
10193 const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
10194 const auto f64_lo12 =
10195 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
10196 Set(du32, uint32_t{0x00000FFF}))));
10197
10198 const auto f64_sum = f64_hi52 + f64_lo12;
10199 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
10200 const auto f64_sum_is_inexact =
10201 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
10202
10203 const auto adj_f64_val = BitCast(
10204 df64,
10205 Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
10206 f64_sum_is_inexact));
10207
10208 return DemoteTo(df32, adj_f64_val);
10209}
10210#endif
10211
10212// For already range-limited input [0, 255].
10213template <size_t N>
10214HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
10215#if HWY_TARGET == HWY_SSE2
10216 const RebindToSigned<DFromV<decltype(v)>> di32;
10217 const Rebind<uint8_t, decltype(di32)> du8;
10218 return DemoteTo(du8, BitCast(di32, v));
10219#else
10220 const DFromV<decltype(v)> d32;
10221 const Repartition<uint8_t, decltype(d32)> d8;
10222 alignas(16) static constexpr uint32_t k8From32[4] = {
10223 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
10224 // Also replicate bytes into all 32 bit lanes for safety.
10225 const auto quad = TableLookupBytes(v, Load(d32, k8From32));
10226 return LowerHalf(LowerHalf(BitCast(d8, quad)));
10227#endif
10228}
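// --- Illustrative usage sketch (not part of the original file) ---
// U8FromU32 assumes every u32 lane is already in [0, 255]; a typical caller
// masks or saturates first. The wrapper name below is made up for illustration.
inline Vec128<uint8_t, 4> ExampleU8FromU32(Vec128<uint32_t, 4> v) {
  const DFromV<decltype(v)> d32;
  // Establish the precondition by keeping only the low byte of each lane.
  return U8FromU32(And(v, Set(d32, uint32_t{0xFF})));
}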
10229
10230// ------------------------------ F32->UI64 PromoteTo
10231#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
10232#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
10233#else
10234#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
10235#endif
10236
10237#if HWY_TARGET <= HWY_AVX3
10238template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
10239HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
10240 const Rebind<float, decltype(di64)> df32;
10241 const RebindToFloat<decltype(di64)> df64;
10242 const Twice<decltype(df32)> dt_f32;
10243
10244 return detail::FixConversionOverflow(
10245 di64,
10246 BitCast(df64, InterleaveLower(ResizeBitCast(dt_f32, v),
10247 ResizeBitCast(dt_f32, v))),
10248 PromoteInRangeTo(di64, v));
10249}
10250template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
10251HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
10252 return VFromD<D>{_mm_cvttps_epi64(v.raw)};
10253}
10254template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
10255HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
10256 return VFromD<D>{
10257 _mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10258}
10259template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
10260HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
10261 return VFromD<D>{_mm_cvttps_epu64(v.raw)};
10262}
10263#else // AVX2 or below
10264
10265// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10266template <class D, HWY_IF_I64_D(D)>
10267HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
10268 const Rebind<int32_t, decltype(di64)> di32;
10269 const RebindToFloat<decltype(di32)> df32;
10270 const RebindToUnsigned<decltype(di32)> du32;
10271 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
10272
10273 const auto exponent_adj = BitCast(
10274 du32,
10275 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
10276 BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
10277 BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
10278 const auto adj_v =
10279 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
10280
10281 const auto f32_to_i32_result = ConvertTo(di32, adj_v);
10282 const auto lo64_or_mask = PromoteTo(
10283 di64,
10284 BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
10285 Set(di32, LimitsMax<int32_t>())))));
10286
10287 return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
10288 << PromoteTo(di64, exponent_adj),
10289 lo64_or_mask);
10290}
10291
10292// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10293template <class D, HWY_IF_UI64_D(D)>
10294HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
10295 const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
10296 const RebindToSigned<decltype(d32)> di32;
10297 const RebindToFloat<decltype(d32)> df32;
10298 const RebindToUnsigned<decltype(d32)> du32;
10299 const Repartition<uint8_t, decltype(d32)> du32_as_du8;
10300
10301 const auto exponent_adj = BitCast(
10302 du32,
10303 SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
10304 BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du}))));
10305 const auto adj_v =
10306 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
10307
10308 const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
10309 return PromoteTo(d64, BitCast(d32, f32_to_i32_result))
10310 << PromoteTo(d64, exponent_adj);
10311}
10312
10313namespace detail {
10314
10315template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)>
10316HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64(
10317 DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
10318 const Rebind<int32_t, decltype(du64)> di32;
10319 const Twice<decltype(di32)> dt_i32;
10320
10321 const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask);
10322 return BitCast(du64,
10323 InterleaveLower(vt_i32_overflow_mask, vt_i32_overflow_mask));
10324}
10325
10326template <class DU64, HWY_IF_V_SIZE_GT_D(DU64, 16)>
10327HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64(
10328 DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
10329 const RebindToSigned<decltype(du64)> di64;
10330 return BitCast(du64, PromoteTo(di64, i32_overflow_mask));
10331}
10332
10333} // namespace detail
10334
10335// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10336template <class D, HWY_IF_U64_D(D)>
10337HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
10338 const Rebind<int32_t, decltype(du64)> di32;
10339 const RebindToFloat<decltype(di32)> df32;
10340 const RebindToUnsigned<decltype(di32)> du32;
10341 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
10342
10343 const auto non_neg_v = ZeroIfNegative(v);
10344
10345 const auto exponent_adj = BitCast(
10346 du32, Min(SaturatedSub(BitCast(du32_as_du8,
10347 ShiftRight<23>(BitCast(du32, non_neg_v))),
10348 BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
10349 BitCast(du32_as_du8, Set(du32, uint32_t{33}))));
10350
10351 const auto adj_v =
10352 BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj));
10353 const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
10354
10355 const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result);
10356 const auto overflow_result =
10357 detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask);
10358
10359 return Or(PromoteTo(du64, BitCast(du32, f32_to_i32_result))
10360 << PromoteTo(du64, exponent_adj),
10361 overflow_result);
10362}
10363#endif // HWY_TARGET <= HWY_AVX3
10364
10365// ------------------------------ MulFixedPoint15
10366
10367#if HWY_TARGET == HWY_SSE2
10368HWY_API Vec128<int16_t> MulFixedPoint15(const Vec128<int16_t> a,
10369 const Vec128<int16_t> b) {
10370 const DFromV<decltype(a)> d;
10371 const Repartition<int32_t, decltype(d)> di32;
10372
10373 auto lo_product = a * b;
10374 auto hi_product = MulHigh(a, b);
10375
10376 const VFromD<decltype(di32)> i32_product_lo{
10377 _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
10378 const VFromD<decltype(di32)> i32_product_hi{
10379 _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)};
10380
10381 const auto round_up_incr = Set(di32, 0x4000);
10382 return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr),
10383 ShiftRight<15>(i32_product_hi + round_up_incr));
10384}
10385
10386template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
10387HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
10388 const Vec128<int16_t, N> b) {
10389 const DFromV<decltype(a)> d;
10390 const Rebind<int32_t, decltype(d)> di32;
10391
10392 const auto lo_product = a * b;
10393 const auto hi_product = MulHigh(a, b);
10394 const VFromD<decltype(di32)> i32_product{
10395 _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
10396
10397 return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000)));
10398}
10399#else
10400template <size_t N>
10401HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
10402 const Vec128<int16_t, N> b) {
10403 return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
10404}
10405#endif
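// --- Illustrative scalar sketch (not part of the original file) ---
// Both branches above compute the Q15 product ((a * b) + 0x4000) >> 15, which
// is what _mm_mulhrs_epi16 computes per lane on SSSE3 and later (the sole
// overflow case, a == b == -32768, wraps to -32768 in hardware). The helper
// name below is made up for illustration.
inline int16_t MulFixedPoint15Scalar(int16_t a, int16_t b) {
  const int32_t product = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  return static_cast<int16_t>((product + 0x4000) >> 15);
}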
10406
10407// ------------------------------ Truncations
10408
10409template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
10410HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
10411 // BitCast requires the same size; DTo might be u8x1 and v u16x1.
10412 const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
10413 return VFromD<DTo>{BitCast(dto, v).raw};
10414}
10415
10416template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
10417HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) {
10418#if HWY_TARGET == HWY_SSE2
10419 const Vec128<uint8_t, 1> lo{v.raw};
10420 const Vec128<uint8_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
10421 return Combine(d, hi, lo);
10422#else
10423 const Repartition<uint8_t, DFromV<decltype(v)>> d8;
10424 (void)d;
10425 alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8,
10426 0, 8, 0, 8, 0, 8, 0, 8};
10427 const Vec128<uint8_t> v8 = TableLookupBytes(v, Load(d8, kIdx));
10428 return LowerHalf(LowerHalf(LowerHalf(v8)));
10429#endif
10430}
10431
10432template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
10433HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) {
10434#if HWY_TARGET == HWY_SSE2
10435 const Vec128<uint16_t, 1> lo{v.raw};
10436 const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
10437 return Combine(d, hi, lo);
10438#else
10439 (void)d;
10440 const Repartition<uint16_t, DFromV<decltype(v)>> d16;
10441 alignas(16) static constexpr uint16_t kIdx[8] = {
10442 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
10443 const Vec128<uint16_t> v16 = TableLookupBytes(v, Load(d16, kIdx));
10444 return LowerHalf(LowerHalf(v16));
10445#endif
10446}
10447
10448template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
10449HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
10450 return VFromD<D>{_mm_shuffle_epi32(v.raw, 0x88)};
10451}
10452
10453template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
10454HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
10455 const DFromV<decltype(v)> du32;
10456#if HWY_TARGET == HWY_SSE2
10457 const RebindToSigned<decltype(du32)> di32;
10458 const Rebind<uint8_t, decltype(di32)> du8;
10459 return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v))));
10460#else
10461 const Repartition<uint8_t, decltype(du32)> d;
10462 alignas(16) static constexpr uint8_t kIdx[16] = {
10463 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
10464 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
10465 return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx))));
10466#endif
10467}
10468
10469template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
10470HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
10471 const DFromV<decltype(v)> du32;
10472#if HWY_TARGET == HWY_SSE2
10473 const RebindToSigned<decltype(du32)> di32;
10474 const Rebind<uint16_t, decltype(di32)> du16;
10475 const RebindToSigned<decltype(du16)> di16;
10476 return BitCast(
10477 du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v)))));
10478#else
10479 const Repartition<uint16_t, decltype(du32)> d;
10480 return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v)));
10481#endif
10482}
10483
10484template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
10485HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
10486 const DFromV<decltype(v)> du16;
10487#if HWY_TARGET == HWY_SSE2
10488 const RebindToSigned<decltype(du16)> di16;
10489 const Rebind<uint8_t, decltype(di16)> du8;
10490 const RebindToSigned<decltype(du8)> di8;
10491 return BitCast(du8,
10492 DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v)))));
10493#else
10494 const Repartition<uint8_t, decltype(du16)> d;
10495 return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v)));
10496#endif
10497}
10498
10499// ------------------------------ Demotions to/from i64
10500
10501#if HWY_TARGET <= HWY_AVX3
10502template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
10503HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10504 return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)};
10505}
10506template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
10507HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10508 return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)};
10509}
10510template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
10511HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10512 return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)};
10513}
10514
10515template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10516HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10517 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
10518 return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
10519}
10520template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
10521HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10522 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
10523 return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
10524}
10525template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
10526HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
10527 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
10528 return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
10529}
10530
10531template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10532HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
10533 return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)};
10534}
10535template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
10536HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
10537 return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)};
10538}
10539template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
10540HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
10541 return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)};
10542}
10543#else // AVX2 or below
10544
10545// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
10546// implementations in generic_ops-inl.h for U64->I8/I16/I32 demotions on
10547// SSE2/SSSE3/SSE4/AVX2 as U64->I8/I16/I32 DemoteTo/ReorderDemote2To for
10548// SSE2/SSSE3/SSE4/AVX2 is implemented in x86_128-inl.h
10549
10550// The default unsigned to signed DemoteTo/ReorderDemote2To
10551// implementations in generic_ops-inl.h are still used for U32->I8/I16 and
10552// U16->I8 demotions on SSE2/SSSE3/SSE4/AVX2
10553
10554#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
10555#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8)
10556
10557namespace detail {
10558template <class D, HWY_IF_UNSIGNED_D(D)>
10559HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
10560 D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
10561 return v;
10562}
10563
10564template <class D, HWY_IF_SIGNED_D(D)>
10565HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
10566 D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
10567 const DFromV<decltype(v)> du64;
10568 return And(v,
10569 Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>())));
10570}
10571
10572template <class D>
10573HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate(
10574 D dn, VFromD<Rebind<uint64_t, D>> v) {
10575 const Rebind<uint64_t, D> du64;
10576 const RebindToSigned<decltype(du64)> di64;
10577 constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) -
10578 static_cast<int>(hwy::IsSigned<TFromD<D>>());
10579
10580 const auto too_big = BitCast(
10581 du64, VecFromMask(
10582 di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
10583 return DemoteFromU64MaskOutResult(dn, Or(v, too_big));
10584}
10585
10586template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V>
10588 return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
10589}
10590
10591} // namespace detail
10592
10593template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
10594 HWY_IF_SIGNED_D(D)>
10595HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
10596 const DFromV<decltype(v)> di64;
10597 const RebindToUnsigned<decltype(di64)> du64;
10598 const RebindToUnsigned<decltype(dn)> dn_u;
10599
10600 // Negative values are saturated by first saturating their bitwise inverse
10601 // and then inverting the saturation result
10602 const auto invert_mask = BitCast(du64, BroadcastSignBit(v));
10603 const auto saturated_vals = Xor(
10604 invert_mask,
10605 detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v))));
10606 return BitCast(dn, TruncateTo(dn_u, saturated_vals));
10607}
10608
10609template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
10610          HWY_IF_UNSIGNED_D(D)>
10611HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
10612 const DFromV<decltype(v)> di64;
10613 const RebindToUnsigned<decltype(di64)> du64;
10614
10615 const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v));
10616 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
10617}
10618
10619template <class D,
10620          HWY_IF_T_SIZE_ONE_OF_D(
10621 D, ((HWY_TARGET != HWY_SSE2) ? ((1 << 1) | (1 << 2)) : 0) |
10622 (1 << 4)),
10623 HWY_IF_SIGNED_D(D)>
10624HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
10625 const RebindToUnsigned<decltype(dn)> dn_u;
10626 return BitCast(dn, TruncateTo(dn_u, detail::DemoteFromU64Saturate(dn, v)));
10627}
10628
10629#if HWY_TARGET == HWY_SSE2
10630template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
10631 HWY_IF_SIGNED_D(D)>
10632HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
10633 const Rebind<int32_t, decltype(dn)> di32;
10634 return DemoteTo(dn, DemoteTo(di32, v));
10635}
10636#endif // HWY_TARGET == HWY_SSE2
10637
10638template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
10639          HWY_IF_UNSIGNED_D(D)>
10640HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
10641 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v));
10642}
10643#endif // HWY_TARGET <= HWY_AVX3
10644
10645template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2),
10647HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
10648 VFromD<Repartition<int64_t, D>> b) {
10649 const DFromV<decltype(a)> d;
10650 const Twice<decltype(d)> dt;
10651 return DemoteTo(dn, Combine(dt, b, a));
10652}
10653
10654template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_U32_D(D)>
10655HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
10656 VFromD<Repartition<uint64_t, D>> b) {
10657 const DFromV<decltype(a)> d;
10658 const Twice<decltype(d)> dt;
10659 return DemoteTo(dn, Combine(dt, b, a));
10660}
10661
10662#if HWY_TARGET > HWY_AVX3
10663template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_I32_D(D)>
10664HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
10665 VFromD<Repartition<uint64_t, D>> b) {
10666 const DFromV<decltype(a)> d;
10667 const Twice<decltype(d)> dt;
10668 return DemoteTo(dn, Combine(dt, b, a));
10669}
10670#endif
10671
10672#if HWY_TARGET > HWY_AVX2
10673template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
10674HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
10675 Vec128<int64_t> b) {
10676 const DFromV<decltype(a)> di64;
10677 const RebindToUnsigned<decltype(di64)> du64;
10678 const Half<decltype(dn)> dnh;
10679
10680 // Negative values are saturated by first saturating their bitwise inverse
10681 // and then inverting the saturation result
10682 const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a));
10683 const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b));
10684 const auto saturated_a = Xor(
10685 invert_mask_a,
10686 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
10687 const auto saturated_b = Xor(
10688 invert_mask_b,
10689 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));
10690
10691 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
10692}
10693
10694template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
10695HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
10696 Vec128<int64_t> b) {
10697 const DFromV<decltype(a)> di64;
10698 const RebindToUnsigned<decltype(di64)> du64;
10699 const Half<decltype(dn)> dnh;
10700
10701 const auto saturated_a = detail::DemoteFromU64Saturate(
10702 dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a)));
10703 const auto saturated_b = detail::DemoteFromU64Saturate(
10704 dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b)));
10705
10706 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
10707}
10708
10709template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
10710HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint64_t> a,
10711 Vec128<uint64_t> b) {
10712 const Half<decltype(dn)> dnh;
10713
10714 const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
10715 const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);
10716
10717 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
10718}
10719#endif // HWY_TARGET > HWY_AVX2
10720
10721// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
10722
10723#if HWY_HAVE_FLOAT16
10724template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
10725HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
10726 return VFromD<D>{_mm_cvtepu16_ph(v.raw)};
10727}
10728template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
10729HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
10730 return VFromD<D>{_mm_cvtepi16_ph(v.raw)};
10731}
10732#endif // HWY_HAVE_FLOAT16
10733
10734template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
10735HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
10736 return VFromD<D>{_mm_cvtepi32_ps(v.raw)};
10737}
10738
10739#if HWY_TARGET <= HWY_AVX3
10740template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
10741HWY_API VFromD<D> ConvertTo(D /*df*/, VFromD<Rebind<uint32_t, D>> v) {
10742 return VFromD<D>{_mm_cvtepu32_ps(v.raw)};
10743}
10744
10745template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
10746HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<int64_t, D>> v) {
10747 return VFromD<D>{_mm_cvtepi64_pd(v.raw)};
10748}
10749
10750template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
10751HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<uint64_t, D>> v) {
10752 return VFromD<D>{_mm_cvtepu64_pd(v.raw)};
10753}
10754#else // AVX2 or below
10755// Generic for all vector lengths.
10756template <class D, HWY_IF_F32_D(D)>
10757HWY_API VFromD<D> ConvertTo(D df, VFromD<Rebind<uint32_t, D>> v) {
10758 // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
10759 const RebindToUnsigned<decltype(df)> du32;
10760 const RebindToSigned<decltype(df)> d32;
10761
10762 const auto msk_lo = Set(du32, 0xFFFF);
10763 const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
10764
10765 // Extract the 16 lowest/highest significant bits of v and cast to signed int
10766 const auto v_lo = BitCast(d32, And(v, msk_lo));
10767 const auto v_hi = BitCast(d32, ShiftRight<16>(v));
10768 return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
10769}
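// --- Illustrative scalar sketch (not part of the original file) ---
// Per-lane view of the u32->f32 conversion above: both 16-bit halves are
// exactly representable in f32, and the vector code recombines them with a
// single MulAdd. The helper name below is made up for illustration.
inline float U32ToF32Scalar(uint32_t v) {
  const float hi = static_cast<float>(v >> 16);      // exact, <= 65535
  const float lo = static_cast<float>(v & 0xFFFFu);  // exact, <= 65535
  return 65536.0f * hi + lo;
}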
10770
10771// Generic for all vector lengths.
10772template <class D, HWY_IF_F64_D(D)>
10773HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) {
10774 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
10775 const Repartition<uint32_t, decltype(dd)> d32;
10776 const Repartition<uint64_t, decltype(dd)> d64;
10777
10778 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
10779 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
10780 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
10781
10782 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
10783 const auto k52 = Set(d32, 0x43300000);
10784 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
10785
10786 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
10787 return (v_upper - k84_63_52) + v_lower; // order matters!
10788}
10789
10790namespace detail {
10791template <class VW>
10792HWY_INLINE VFromD<RebindToFloat<DFromV<VW>>> U64ToF64VecFast(VW w) {
10793 const DFromV<decltype(w)> d64;
10794 const RebindToFloat<decltype(d64)> dd;
10795 const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52
10796 return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl;
10797}
10798} // namespace detail
10799
10800// Generic for all vector lengths.
10801template <class D, HWY_IF_F64_D(D)>
10802HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
10803 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
10804 const RebindToUnsigned<decltype(dd)> d64;
10805 using VU = VFromD<decltype(d64)>;
10806
10807 const VU msk_lo = Set(d64, 0xFFFFFFFF);
10808 const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
10809
10810 // Extract the 32 lowest/highest significant bits of v
10811 const VU v_lo = And(v, msk_lo);
10812 const VU v_hi = ShiftRight<32>(v);
10813
10814 const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo);
10815 return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl);
10816}
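// --- Illustrative scalar sketch (not part of the original file) ---
// Per-lane view of the u64->f64 conversion above: each 32-bit half is below
// 2^52, so detail::U64ToF64VecFast converts it exactly (by OR-ing it into the
// mantissa of 2^52 and subtracting 2^52), and the halves are recombined with
// MulAdd. The helper name below is made up for illustration.
inline double U64ToF64Scalar(uint64_t v) {
  const double hi = static_cast<double>(v >> 32);          // exact, < 2^32
  const double lo = static_cast<double>(v & 0xFFFFFFFFu);  // exact, < 2^32
  return 4294967296.0 * hi + lo;  // 2^32 * hi + lo
}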
10817#endif // HWY_TARGET <= HWY_AVX3
10818
10819// Truncates (rounds toward zero).
10820
10821#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
10822#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
10823#else
10824#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
10825#endif
10826
10827#if HWY_HAVE_FLOAT16
10828template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
10829HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
10830 return VFromD<D>{_mm_cvttph_epi16(v.raw)};
10831}
10832
10833// F16 to I16 ConvertTo is generic for all vector lengths
10834template <class D, HWY_IF_I16_D(D)>
10835HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
10836 return detail::FixConversionOverflow(di, v, ConvertInRangeTo(di, v));
10837}
10838
10839template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
10840HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
10841 return VFromD<D>{_mm_cvttph_epu16(v.raw)};
10842}
10843
10844template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
10845HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
10846 return VFromD<D>{
10847 _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10848}
10849#endif // HWY_HAVE_FLOAT16
10850
10851template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
10852HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
10853 return VFromD<D>{_mm_cvttps_epi32(v.raw)};
10854}
10855
10856// F32 to I32 ConvertTo is generic for all vector lengths
10857template <class D, HWY_IF_I32_D(D)>
10858HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
10859 return detail::FixConversionOverflow(di, v, ConvertInRangeTo(di, v));
10860}
10861
10862#if HWY_TARGET <= HWY_AVX3
10863template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
10864HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
10865 return VFromD<DI>{_mm_cvttpd_epi64(v.raw)};
10866}
10867
10868// F64 to I64 ConvertTo is generic for all vector lengths on AVX3
10869template <class DI, HWY_IF_I64_D(DI)>
10870HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
10871 return detail::FixConversionOverflow(di, v, ConvertInRangeTo(di, v));
10872}
10873
10874template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
10875HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
10876 return VFromD<DU>{_mm_cvttps_epu32(v.raw)};
10877}
10878
10879template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
10880HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
10881 return VFromD<DU>{
10882 _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10883}
10884
10885template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
10886HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
10887 return VFromD<DU>{_mm_cvttpd_epu64(v.raw)};
10888}
10889
10890template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
10891HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
10892 return VFromD<DU>{
10893 _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10894}
10895
10896#else // AVX2 or below
10897
10898namespace detail {
10899
10900template <class DU32, HWY_IF_U32_D(DU32)>
10901static HWY_INLINE VFromD<DU32> ConvInRangeF32ToU32(
10902 DU32 du32, VFromD<RebindToFloat<DU32>> v, VFromD<DU32>& exp_diff) {
10903 const RebindToSigned<decltype(du32)> di32;
10904 const RebindToFloat<decltype(du32)> df32;
10905
10906 exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v));
10907 const auto scale_down_f32_val_mask =
10908 VecFromMask(du32, Eq(exp_diff, Zero(du32)));
10909
10910 const auto v_scaled =
10911 BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask));
10912 const auto f32_to_u32_result =
10913 BitCast(du32, ConvertInRangeTo(di32, v_scaled));
10914
10915 return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask);
10916}
10917
10918} // namespace detail
10919
10920// F32 to U32 ConvertInRangeTo is generic for all vector lengths on
10921// SSE2/SSSE3/SSE4/AVX2
10922template <class DU32, HWY_IF_U32_D(DU32)>
10923HWY_API VFromD<DU32> ConvertInRangeTo(DU32 du32,
10924 VFromD<RebindToFloat<DU32>> v) {
10925 VFromD<DU32> exp_diff;
10926 const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff);
10927 return f32_to_u32_result;
10928}
10929
10930// F32 to U32 ConvertTo is generic for all vector lengths on
10931// SSE2/SSSE3/SSE4/AVX2
10932template <class DU32, HWY_IF_U32_D(DU32)>
10933HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
10934 const RebindToSigned<decltype(du32)> di32;
10935
10936 const auto non_neg_v = ZeroIfNegative(v);
10937 VFromD<DU32> exp_diff;
10938 const auto f32_to_u32_result =
10939 detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff);
10940
10941 return Or(f32_to_u32_result,
10942 BitCast(du32, BroadcastSignBit(BitCast(di32, exp_diff))));
10943}
10944
10945namespace detail {
10946
10947template <class D64, HWY_IF_UI64_D(D64)>
10948HWY_API VFromD<D64> ConvAbsInRangeF64ToUI64(D64 d64,
10949 VFromD<Rebind<double, D64>> v,
10950 VFromD<D64>& biased_exp) {
10951 const RebindToSigned<decltype(d64)> di64;
10952 const RebindToUnsigned<decltype(d64)> du64;
10953 using VU64 = VFromD<decltype(du64)>;
10954 const Repartition<uint16_t, decltype(di64)> du16;
10955 const VU64 k1075 = Set(du64, 1075); /* biased exponent of 2^52 */
10956
10957 // Exponent indicates whether the number can be represented as int64_t.
10958 biased_exp = BitCast(d64, ShiftRight<52>(BitCast(du64, v)));
10959 HWY_IF_CONSTEXPR(IsSigned<TFromD<D64>>()) {
10960 biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF}));
10961 }
10962
10963 // If we were to cap the exponent at 51 and add 2^52, the number would be in
10964 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
10965 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
10966 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
10967 // manually shift the mantissa into place (we already have many of the
10968 // inputs anyway).
10969
10970 // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
10971 // shift_int since biased_exp[i] is a non-negative integer that is less than
10972 // or equal to 2047.
10973
10974 // 16-bit saturated unsigned subtraction is also more efficient than a
10975 // 64-bit subtraction followed by a 64-bit signed Max operation on
10976 // SSE2/SSSE3/SSE4/AVX2.
10977
10978 // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
10979 // zero as the upper 48 bits of both k1075 and biased_exp are zero.
10980
10981 const VU64 shift_mnt = BitCast(
10982 du64, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
10983 const VU64 shift_int = BitCast(
10984 du64, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
10985 const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1);
10986 // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
10987 // returning zero in that case.
10988 const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt;
10989
10990 // For inputs larger than 2^53 - 1, insert zeros at the bottom.
10991
10992 // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
10993 // shifted out of the left shift result below as shift_int[i] <= 11 is true
10994 // for any inputs that are less than 2^64.
10995
10996 return BitCast(d64, int53 << shift_int);
10997}
10998
10999} // namespace detail
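// --- Illustrative scalar sketch (not part of the original file) ---
// Per-lane view of detail::ConvAbsInRangeF64ToUI64 above for a non-negative,
// in-range input given as its IEEE-754 bit pattern: the mantissa (plus the
// implicit 1-bit) is shifted according to how far the biased exponent is from
// 1075, the biased exponent of 2^52. The helper name below is made up for
// illustration.
inline uint64_t AbsF64BitsToU64Scalar(uint64_t f64_bits) {
  const uint64_t biased_exp = (f64_bits >> 52) & 0x7FF;
  const uint64_t mantissa = (f64_bits & ((1ULL << 52) - 1)) | (1ULL << 52);
  if (biased_exp < 1075) {  // value < 2^52: shift fractional bits out
    const uint64_t shift_mnt = 1075 - biased_exp;
    // C++ shifts by >= 64 are undefined, hence the explicit check; the vector
    // code instead relies on x86 SIMD shifts returning zero in that case.
    return (shift_mnt >= 64) ? 0 : (mantissa >> shift_mnt);
  }
  const uint64_t shift_int = biased_exp - 1075;  // <= 11 for in-range inputs
  return mantissa << shift_int;
}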
11000
11001#if HWY_ARCH_X86_64
11002template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
11003HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec64<double> v) {
11004 return VFromD<DI>{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
11005}
11006template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
11007HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec128<double> v) {
11008 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
11009 const Full64<double> dd2;
11010 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
11011 return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
11012}
11013
11014template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
11015HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
11016 return detail::FixConversionOverflow(di, v, ConvertInRangeTo(di, v));
11017}
11018#endif // HWY_ARCH_X86_64
11019
11020#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
11021template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
11022 HWY_IF_I64_D(DI)>
11023HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<Rebind<double, DI>> v) {
11024 using VI = VFromD<DI>;
11025
11026 VI biased_exp;
11027 const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
11028 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
11029
11030 // If the input was negative, negate the integer (two's complement).
11031 return (shifted ^ sign_mask) - sign_mask;
11032}
11033
11034template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
11035 HWY_IF_I64_D(DI)>
11036HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
11037 using VI = VFromD<DI>;
11038
11039 VI biased_exp;
11040 const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
11041
11042#if HWY_TARGET <= HWY_SSE4
11043 const auto in_range = biased_exp < Set(di, 1086);
11044#else
11045 const Repartition<int32_t, decltype(di)> di32;
11046 const auto in_range = MaskFromVec(BitCast(
11047 di,
11048 VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
11049#endif
11050
11051 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
11052 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
11053 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
11054 const VI magnitude = IfThenElse(in_range, shifted, limit);
11055
11056 // If the input was negative, negate the integer (two's complement).
11057 return (magnitude ^ sign_mask) - sign_mask;
11058}
11059#endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
11060
11061// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
11062template <class DU, HWY_IF_U64_D(DU)>
11063HWY_API VFromD<DU> ConvertInRangeTo(DU du, VFromD<Rebind<double, DU>> v) {
11064 VFromD<DU> biased_exp;
11065 const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp);
11066 return shifted;
11067}
11068
11069// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
11070template <class DU, HWY_IF_U64_D(DU)>
11071HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
11072 const RebindToSigned<DU> di;
11073 using VU = VFromD<DU>;
11074
11075 VU biased_exp;
11076 const VU shifted =
11077 detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp);
11078
11079 // Exponent indicates whether the number can be represented as uint64_t.
11080#if HWY_TARGET <= HWY_SSE4
11081 const VU out_of_range =
11082 BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));
11083#else
11084 const Repartition<int32_t, decltype(di)> di32;
11085 const VU out_of_range = BitCast(
11086 du,
11087 VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086)));
11088#endif
11089
11090 return (shifted | out_of_range);
11091}
11092#endif // HWY_TARGET <= HWY_AVX3
11093
11094template <size_t N>
11095HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
11096 const RebindToSigned<DFromV<decltype(v)>> di;
11097 return detail::FixConversionOverflow(
11098 di, v, VFromD<decltype(di)>{_mm_cvtps_epi32(v.raw)});
11099}
11100
11101// ------------------------------ Floating-point rounding (ConvertTo)
11102
11103#if HWY_TARGET >= HWY_SSSE3
11104
11105// Toward nearest integer, ties to even
11106template <typename T, size_t N>
11107HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
11108 static_assert(IsFloat<T>(), "Only for float");
11109 // Rely on rounding after addition with a large value such that no mantissa
11110 // bits remain (assuming the current mode is nearest-even). We may need a
11111 // compiler flag for precise floating-point to prevent "optimizing" this out.
11112 const DFromV<decltype(v)> df;
11113 const auto max = Set(df, MantissaEnd<T>());
11114 const auto large = CopySignToAbs(max, v);
11115 const auto added = large + v;
11116 const auto rounded = added - large;
11117 // Keep original if NaN or the magnitude is large (already an int).
11118 return IfThenElse(Abs(v) < max, rounded, v);
11119}
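// --- Illustrative scalar sketch (not part of the original file) ---
// Same trick for one float lane: adding +/-2^23 pushes all fractional bits out
// of the mantissa, so the hardware's round-to-nearest-even does the rounding,
// and subtracting the constant back recovers the result. As noted above, this
// assumes the default rounding mode and no fast-math reassociation. The helper
// name below is made up for illustration.
inline float RoundToNearestEvenScalar(float v) {
  const float kMantissaEnd = 8388608.0f;  // 2^23
  // Large magnitudes (already integers) and NaN are returned unchanged.
  if (!(v < kMantissaEnd && v > -kMantissaEnd)) return v;
  const float large = (v < 0.0f) ? -kMantissaEnd : kMantissaEnd;
  return (v + large) - large;
}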
11120
11121namespace detail {
11122
11123// Truncating to integer and converting back to float is correct except when the
11124// input magnitude is large, in which case the input was already an integer
11125// (because mantissa >> exponent is zero).
11126template <typename T, size_t N>
11127HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
11128 static_assert(IsFloat<T>(), "Only for float");
11129 const DFromV<decltype(v)> d;
11130 return Abs(v) < Set(d, MantissaEnd<T>());
11131}
11132
11133} // namespace detail
11134
11135// Toward zero, aka truncate
11136template <typename T, size_t N>
11137HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
11138 static_assert(IsFloat<T>(), "Only for float");
11139 const DFromV<decltype(v)> df;
11140 const RebindToSigned<decltype(df)> di;
11141
11142 const auto integer = ConvertTo(di, v); // round toward 0
11143 const auto int_f = ConvertTo(df, integer);
11144
11145 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
11146}
11147
11148// Toward +infinity, aka ceiling
11149template <typename T, size_t N>
11150HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
11151 static_assert(IsFloat<T>(), "Only for float");
11152 const DFromV<decltype(v)> df;
11153 const RebindToSigned<decltype(df)> di;
11154
11155 const auto integer = ConvertTo(di, v); // round toward 0
11156 const auto int_f = ConvertTo(df, integer);
11157
11158 // Truncating a positive non-integer ends up smaller; if so, add 1.
11159 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
11160
11161 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
11162}
11163
11164// Toward -infinity, aka floor
11165template <typename T, size_t N>
11166HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
11167 static_assert(IsFloat<T>(), "Only for float");
11168 const DFromV<decltype(v)> df;
11169 const RebindToSigned<decltype(df)> di;
11170
11171 const auto integer = ConvertTo(di, v); // round toward 0
11172 const auto int_f = ConvertTo(df, integer);
11173
11174 // Truncating a negative non-integer ends up larger; if so, subtract 1.
11175 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
11176
11177 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
11178}
11179
11180#else
11181
11182// Toward nearest integer, ties to even
11183#if HWY_HAVE_FLOAT16
11184template <size_t N>
11185HWY_API Vec128<float16_t, N> Round(const Vec128<float16_t, N> v) {
11186 return Vec128<float16_t, N>{
11187 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
11188}
11189#endif // HWY_HAVE_FLOAT16
11190template <size_t N>
11191HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
11192 return Vec128<float, N>{
11193 _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
11194}
11195template <size_t N>
11196HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
11197 return Vec128<double, N>{
11198 _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
11199}
11200
11201// Toward zero, aka truncate
11202#if HWY_HAVE_FLOAT16
11203template <size_t N>
11204HWY_API Vec128<float16_t, N> Trunc(const Vec128<float16_t, N> v) {
11205 return Vec128<float16_t, N>{
11206 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
11207}
11208#endif // HWY_HAVE_FLOAT16
11209template <size_t N>
11210HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
11211 return Vec128<float, N>{
11212 _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
11213}
11214template <size_t N>
11215HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
11216 return Vec128<double, N>{
11217 _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
11218}
11219
11220// Toward +infinity, aka ceiling
11221#if HWY_HAVE_FLOAT16
11222template <size_t N>
11223HWY_API Vec128<float16_t, N> Ceil(const Vec128<float16_t, N> v) {
11224 return Vec128<float16_t, N>{
11225 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
11226}
11227#endif // HWY_HAVE_FLOAT16
11228template <size_t N>
11229HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
11230 return Vec128<float, N>{
11231 _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
11232}
11233template <size_t N>
11234HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
11235 return Vec128<double, N>{
11236 _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
11237}
11238
11239// Toward -infinity, aka floor
11240#if HWY_HAVE_FLOAT16
11241template <size_t N>
11242HWY_API Vec128<float16_t, N> Floor(const Vec128<float16_t, N> v) {
11243 return Vec128<float16_t, N>{
11244 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
11245}
11246#endif // HWY_HAVE_FLOAT16
11247template <size_t N>
11248HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
11249 return Vec128<float, N>{
11250 _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
11251}
11252template <size_t N>
11253HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
11254 return Vec128<double, N>{
11255 _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
11256}
11257
11258#endif // !HWY_SSSE3
11259
11260// ------------------------------ Floating-point classification
11261
11262#define HWY_X86_FPCLASS_QNAN 0x01
11263#define HWY_X86_FPCLASS_POS0 0x02
11264#define HWY_X86_FPCLASS_NEG0 0x04
11265#define HWY_X86_FPCLASS_POS_INF 0x08
11266#define HWY_X86_FPCLASS_NEG_INF 0x10
11267#define HWY_X86_FPCLASS_SUBNORMAL 0x20
11268#define HWY_X86_FPCLASS_NEG 0x40
11269#define HWY_X86_FPCLASS_SNAN 0x80
11270
11271#if HWY_HAVE_FLOAT16 || HWY_IDE
11272
11273template <size_t N>
11274HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) {
11275 return Mask128<float16_t, N>{
11276 _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
11277}
11278
11279template <size_t N>
11280HWY_API Mask128<float16_t, N> IsEitherNaN(Vec128<float16_t, N> a,
11281 Vec128<float16_t, N> b) {
11282 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
11283 HWY_DIAGNOSTICS(push)
11284 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
11285 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11286 HWY_DIAGNOSTICS(pop)
11287}
11288
11289template <size_t N>
11290HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) {
11291  return Mask128<float16_t, N>{_mm_fpclass_ph_mask(
11292      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
11293}
11294
11295template <size_t N>
11296HWY_API Mask128<float16_t, N> IsFinite(const Vec128<float16_t, N> v) {
11297 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
11298 // and negate the mask.
11299 return Not(Mask128<float16_t, N>{_mm_fpclass_ph_mask(
11300      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
11301                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
11302}
11303
11304#endif // HWY_HAVE_FLOAT16
11305
11306template <size_t N>
11307HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
11308#if HWY_TARGET <= HWY_AVX3
11309 return Mask128<float, N>{
11310 _mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
11311#else
11312 return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
11313#endif
11314}
11315template <size_t N>
11316HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
11317#if HWY_TARGET <= HWY_AVX3
11318 return Mask128<double, N>{
11319 _mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
11320#else
11321 return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
11322#endif
11323}
11324
11325#ifdef HWY_NATIVE_IS_EITHER_NAN
11326#undef HWY_NATIVE_IS_EITHER_NAN
11327#else
11328#define HWY_NATIVE_IS_EITHER_NAN
11329#endif
11330
11331template <size_t N>
11332HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) {
11333#if HWY_TARGET <= HWY_AVX3
11334 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11335#else
11336 return Mask128<float, N>{_mm_cmpunord_ps(a.raw, b.raw)};
11337#endif
11338}
11339
11340template <size_t N>
11341HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a,
11342                                       Vec128<double, N> b) {
11343#if HWY_TARGET <= HWY_AVX3
11344 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11345#else
11346 return Mask128<double, N>{_mm_cmpunord_pd(a.raw, b.raw)};
11347#endif
11348}
11349
11350#if HWY_TARGET <= HWY_AVX3
11351
11352// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
11353#ifdef HWY_NATIVE_ISINF
11354#undef HWY_NATIVE_ISINF
11355#else
11356#define HWY_NATIVE_ISINF
11357#endif
11358
11359template <size_t N>
11360HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
11361 return Mask128<float, N>{_mm_fpclass_ps_mask(
11362      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
11363}
11364template <size_t N>
11365HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
11366 return Mask128<double, N>{_mm_fpclass_pd_mask(
11367      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
11368}
11369
11370// Returns whether normal/subnormal/zero.
11371template <size_t N>
11372HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
11373 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
11374 // and negate the mask.
11375 return Not(Mask128<float, N>{_mm_fpclass_ps_mask(
11376      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
11377                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
11378}
11379template <size_t N>
11380HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
11381 return Not(Mask128<double, N>{_mm_fpclass_pd_mask(
11382      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
11383                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
11384}
11385
11386#endif // HWY_TARGET <= HWY_AVX3
11387
11388// ================================================== CRYPTO
11389
11390#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4
11391
11392// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
11393#ifdef HWY_NATIVE_AES
11394#undef HWY_NATIVE_AES
11395#else
11396#define HWY_NATIVE_AES
11397#endif
11398
11399HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
11400 Vec128<uint8_t> round_key) {
11401 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
11402}
11403
11404HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
11405 Vec128<uint8_t> round_key) {
11406 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
11407}
11408
11409HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
11410 return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)};
11411}
11412
11413HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
11414 Vec128<uint8_t> round_key) {
11415 return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)};
11416}
11417
11418HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
11419 Vec128<uint8_t> round_key) {
11420 return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)};
11421}
11422
11423template <uint8_t kRcon>
11424HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
11425 return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)};
11426}
11427
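// Usage sketch (illustrative only, not part of this header): encrypting one
// 16-byte block given a pre-expanded AES-128 key schedule. `block` and
// `round_keys` (11 Vec128<uint8_t> round keys) are hypothetical; the structure
// (initial whitening, 9 full rounds, 1 final round) follows the _mm_aesenc /
// _mm_aesenclast semantics wrapped by AESRound/AESLastRound above.
//   Vec128<uint8_t> state = Xor(block, round_keys[0]);
//   for (int r = 1; r < 10; ++r) {
//     state = AESRound(state, round_keys[r]);
//   }
//   state = AESLastRound(state, round_keys[10]);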
11428template <size_t N>
11429HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
11430                                       Vec128<uint64_t, N> b) {
11431 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
11432}
11433
11434template <size_t N>
11435HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
11436                                       Vec128<uint64_t, N> b) {
11437 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
11438}
11439
11440#endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4
11441
11442// ================================================== MISC
11443
11444// ------------------------------ LoadMaskBits (TestBit)
11445
11446#if HWY_TARGET > HWY_AVX3
11447namespace detail {
11448
11449template <class D, HWY_IF_T_SIZE_D(D, 1)>
11450HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
11451 const RebindToUnsigned<decltype(d)> du;
11452 // Easier than Set(), which would require an >8-bit type, which would not
11453 // compile for T=uint8_t, kN=1.
11454 const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
11455
11456#if HWY_TARGET == HWY_SSE2
11457 // {b0, b1, ...} ===> {b0, b0, b1, b1, ...}
11458 __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw);
11459 // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...}
11460 unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits);
11461 // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==>
11462 // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1}
11463 const VFromD<decltype(du)> rep8{
11464 _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)};
11465#else
11466 // Replicate bytes 8x such that each byte contains the bit that governs it.
11467 alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
11468 1, 1, 1, 1, 1, 1, 1, 1};
11469 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
11470#endif
11471 const VFromD<decltype(du)> bit = Dup128VecFromValues(
11472 du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
11473 return RebindMask(d, TestBit(rep8, bit));
11474}
11475
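// Worked example of the replicate-and-test idea above (illustrative): for
// mask_bits = 0xA5 = 0b1010'0101, rep8 holds byte 0xA5 in lanes 0..7 and the
// second mask byte in lanes 8..15. TestBit against the per-lane constants
// {1, 2, 4, 8, ...} then selects bit i for lane i, so lanes 0, 2, 5 and 7
// become true - exactly the set bits of 0xA5. The SSE2 branch reaches the
// same byte replication with three unpacks instead of TableLookupBytes.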
11476template <class D, HWY_IF_T_SIZE_D(D, 2)>
11477HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
11478 const RebindToUnsigned<decltype(d)> du;
11479 alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
11480 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
11481 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
11482}
11483
11484template <class D, HWY_IF_T_SIZE_D(D, 4)>
11485HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
11486 const RebindToUnsigned<decltype(d)> du;
11487 alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
11488 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
11489 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
11490}
11491
11492template <class D, HWY_IF_T_SIZE_D(D, 8)>
11493HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
11494 const RebindToUnsigned<decltype(d)> du;
11495 alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
11496 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
11497}
11498
11499} // namespace detail
11500#endif // HWY_TARGET > HWY_AVX3
11501
11502// `p` points to at least 8 readable bytes, not all of which need be valid.
11503template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11504HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
11505 constexpr size_t kN = MaxLanes(d);
11506#if HWY_TARGET <= HWY_AVX3
11507 (void)d;
11508 uint64_t mask_bits = 0;
11509 constexpr size_t kNumBytes = (kN + 7) / 8;
11510 CopyBytes<kNumBytes>(bits, &mask_bits);
11511 if (kN < 8) {
11512 mask_bits &= (1ull << kN) - 1;
11513 }
11514
11515 return MFromD<D>::FromBits(mask_bits);
11516#else
11517 uint64_t mask_bits = 0;
11518 constexpr size_t kNumBytes = (kN + 7) / 8;
11519 CopyBytes<kNumBytes>(bits, &mask_bits);
11520 if (kN < 8) {
11521 mask_bits &= (1ull << kN) - 1;
11522 }
11523
11524 return detail::LoadMaskBits128(d, mask_bits);
11525#endif
11526}
11527
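// Usage sketch (illustrative only): rebuilding a mask from bit-packed storage.
// `bits` is hypothetical and must point to at least 8 readable bytes as noted
// above.
//   const Full128<uint32_t> d;                    // 4 lanes
//   alignas(8) uint8_t bits[8] = {0b0101};        // lanes 0 and 2
//   const auto m = LoadMaskBits(d, bits);
//   // CountTrue(d, m) == 2 and FindKnownFirstTrue(d, m) == 0.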
11528// ------------------------------ Dup128MaskFromMaskBits
11529
11530template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11531HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
11532 constexpr size_t kN = MaxLanes(d);
11533 if (kN < 8) mask_bits &= (1u << kN) - 1;
11534
11535#if HWY_TARGET <= HWY_AVX3
11536 return MFromD<D>::FromBits(mask_bits);
11537#else
11538 return detail::LoadMaskBits128(d, mask_bits);
11539#endif
11540}
11541
11542template <typename T>
11543struct CompressIsPartition {
11544#if HWY_TARGET <= HWY_AVX3
11545 // AVX3 supports native compress, but a table-based approach allows
11546 // 'partitioning' (also moving mask=false lanes to the top), which helps
11547 // vqsort. This is only feasible for eight or fewer lanes, i.e. sizeof(T) == 8
11548 // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
11549 // u32x8 etc.).
11550 enum { value = (sizeof(T) == 8) };
11551#else
11552 // generic_ops-inl does not guarantee IsPartition for 8-bit.
11553 enum { value = (sizeof(T) != 1) };
11554#endif
11555};
11556
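// Illustrative note: when CompressIsPartition<T>::value is 1, Compress
// additionally guarantees a partition: the mask=true lanes appear first (in
// original order) followed by the mask=false lanes (also in order). E.g. for
// 4 lanes with mask {0, 1, 0, 1}, Compress({a, b, c, d}) == {b, d, a, c},
// whereas plain compaction would only promise {b, d, ?, ?}. vqsort relies on
// the stronger guarantee, hence the table-based path where it is affordable.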
11557#if HWY_TARGET <= HWY_AVX3
11558
11559// ------------------------------ StoreMaskBits
11560
11561// `p` points to at least 8 writable bytes.
11562template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11563HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
11564 constexpr size_t kN = MaxLanes(d);
11565 constexpr size_t kNumBytes = (kN + 7) / 8;
11566 CopyBytes<kNumBytes>(&mask.raw, bits);
11567
11568 // Non-full byte, need to clear the undefined upper bits.
11569 if (kN < 8) {
11570 const int mask_bits = (1 << kN) - 1;
11571 bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
11572 }
11573
11574 return kNumBytes;
11575}
11576
11577// ------------------------------ Mask testing
11578
11579// Beware: the suffix indicates the number of mask bits, not lane size!
11580
11581template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11582HWY_API size_t CountTrue(D d, MFromD<D> mask) {
11583 constexpr size_t kN = MaxLanes(d);
11584 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
11585 return PopCount(mask_bits);
11586}
11587
11588template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11589HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
11590 constexpr size_t kN = MaxLanes(d);
11591 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11592 return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
11593}
11594
11595template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11596HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
11597 constexpr size_t kN = MaxLanes(d);
11598 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11599 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
11600}
11601
11602template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11603HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
11604 constexpr size_t kN = MaxLanes(d);
11605 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11606 return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
11607}
11608
11609template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11610HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
11611 constexpr size_t kN = MaxLanes(d);
11612 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
11613 return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
11614 : -1;
11615}
11616
11617template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11618HWY_API bool AllFalse(D d, MFromD<D> mask) {
11619 constexpr size_t kN = MaxLanes(d);
11620 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
11621 return mask_bits == 0;
11622}
11623
11624template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11625HWY_API bool AllTrue(D d, MFromD<D> mask) {
11626 constexpr size_t kN = MaxLanes(d);
11627 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
11628 // Cannot use _kortestc because we may have less than 8 mask bits.
11629 return mask_bits == (1ull << kN) - 1;
11630}
11631
11632// ------------------------------ Compress
11633
11634// 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512.
11635
11636// Single lane: no-op
11637template <typename T>
11638HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
11639 return v;
11640}
11641
11642template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
11643HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
11644 return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
11645}
11646
11647template <typename T, HWY_IF_T_SIZE(T, 8)>
11648HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
11649 HWY_DASSERT(mask.raw < 4);
11650
11651 // There are only 2 lanes, so we can afford to load the index vector directly.
11652 alignas(16) static constexpr uint8_t u8_indices[64] = {
11653 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
11654 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
11655 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
11656 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
11657
11658 const DFromV<decltype(v)> d;
11659 const Repartition<uint8_t, decltype(d)> d8;
11660 const auto index = Load(d8, u8_indices + 16 * mask.raw);
11661 return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
11662}
11663
11664// ------------------------------ CompressNot (Compress)
11665
11666// Single lane: no-op
11667template <typename T>
11668HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
11669 return v;
11670}
11671
11672template <typename T, HWY_IF_T_SIZE(T, 8)>
11673HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
11674 // See CompressIsPartition, PrintCompressNot64x2NibbleTables
11675 alignas(16) static constexpr uint64_t packed_array[16] = {
11676 0x00000010, 0x00000001, 0x00000010, 0x00000010};
11677
11678 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
11679 // _mm_permutexvar_epi64 will ignore the upper bits.
11680 const DFromV<decltype(v)> d;
11681 const RebindToUnsigned<decltype(d)> du64;
11682 const auto packed = Set(du64, packed_array[mask.raw]);
11683 alignas(16) static constexpr uint64_t shifts[2] = {0, 4};
11684 const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw};
11685 return TableLookupLanes(v, indices);
11686}
11687
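// Worked example of the nibble decoding above (illustrative): for mask.raw ==
// 1 (only lane 0 true, so CompressNot must move lane 1 to the front),
// packed_array[1] == 0x00000001 is broadcast to both u64 lanes. Lane 0 shifts
// by 0 and reads nibble value 1, lane 1 shifts by 4 and reads 0, giving
// permutation indices {1, 0}, i.e. the two lanes are swapped.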
11688// ------------------------------ CompressBlocksNot
11689HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
11690 Mask128<uint64_t> /* m */) {
11691 return v;
11692}
11693
11694// ------------------------------ CompressStore (defined in x86_512)
11695
11696// ------------------------------ CompressBlendedStore (CompressStore)
11697template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
11698HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
11699 TFromD<D>* HWY_RESTRICT unaligned) {
11700 // AVX-512 already does the blending at no extra cost (latency 11,
11701 // rthroughput 2 - same as compress plus store).
11702 if (HWY_TARGET == HWY_AVX3_DL ||
11703 (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
11704 // We're relying on the mask to blend. Clear the undefined upper bits.
11705 constexpr size_t kN = MaxLanes(d);
11706 if (kN != 16 / sizeof(TFromD<D>)) {
11707 m = And(m, FirstN(d, kN));
11708 }
11709 return CompressStore(v, m, d, unaligned);
11710 } else {
11711 const size_t count = CountTrue(d, m);
11712 const VFromD<D> compressed = Compress(v, m);
11713#if HWY_MEM_OPS_MIGHT_FAULT
11714 // BlendedStore tests mask for each lane, but we know that the mask is
11715 // FirstN, so we can just copy.
11716 alignas(16) TFromD<D> buf[MaxLanes(d)];
11717 Store(compressed, d, buf);
11718 CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
11719#else
11720 BlendedStore(compressed, FirstN(d, count), d, unaligned);
11721#endif
11722 detail::MaybeUnpoison(unaligned, count);
11723 return count;
11724 }
11725}
11726
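// Usage sketch (illustrative only): appending only the selected lanes to an
// output buffer while leaving bytes past the compressed run untouched. `in`,
// `out` and `num_written` are hypothetical.
//   const Full64<float> d;                         // 2 lanes of f32
//   const auto v = LoadU(d, in);
//   const auto m = Lt(v, Zero(d));                 // keep negative lanes
//   num_written += CompressBlendedStore(v, m, d, out + num_written);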
11727// ------------------------------ CompressBitsStore (defined in x86_512)
11728
11729#else // AVX2 or below
11730
11731// ------------------------------ StoreMaskBits
11732
11733namespace detail {
11734
11735constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
11736 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
11737}
11738
11739template <typename T, size_t N>
11740HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
11741 const Mask128<T, N> mask) {
11742 const Simd<T, N, 0> d;
11743 const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
11744 return U64FromInt(_mm_movemask_epi8(sign_bits));
11745}
11746
11747template <typename T, size_t N>
11748HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
11749 const Mask128<T, N> mask) {
11750 // Remove useless lower half of each u16 while preserving the sign bit.
11751 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
11752 return U64FromInt(_mm_movemask_epi8(sign_bits));
11753}
11754
11755template <typename T, size_t N>
11756HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
11757 const Simd<T, N, 0> d;
11758 const Simd<float, N, 0> df;
11759 const auto sign_bits = BitCast(df, VecFromMask(d, mask));
11760 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
11761}
11762
11763template <typename T, size_t N>
11764HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
11765 const Simd<T, N, 0> d;
11766 const Simd<double, N, 0> df;
11767 const auto sign_bits = BitCast(df, VecFromMask(d, mask));
11768 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
11769}
11770
11771template <typename T, size_t N>
11772HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
11773 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
11774}
11775
11776} // namespace detail
11777
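// Illustrative note: each BitsFromMask overload reduces the mask to one bit
// per lane via the MOVMSK family, which extracts the sign (top) bit of every
// byte / float / double. E.g. for 4 u32 lanes with mask {1, 0, 1, 0},
// _mm_movemask_ps returns 0b0101. The 2-byte path first packs lanes to bytes
// (preserving the sign bit) so that one byte per lane remains for MOVMSKB.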
11778// `p` points to at least 8 writable bytes.
11779template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11780HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
11781 constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
11782 const uint64_t mask_bits = detail::BitsFromMask(mask);
11783 CopyBytes<kNumBytes>(&mask_bits, bits);
11784 return kNumBytes;
11785}
11786
11787// ------------------------------ Mask testing
11788
11789template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11790HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
11791 // Cheaper than PTEST, which is 2 uop / 3L.
11792 return detail::BitsFromMask(mask) == 0;
11793}
11794
11795template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11796HWY_API bool AllTrue(D d, MFromD<D> mask) {
11797 constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
11798 return detail::BitsFromMask(mask) == kAllBits;
11799}
11800
11801template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11802HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
11803 return PopCount(detail::BitsFromMask(mask));
11804}
11805
11806template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11807HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
11808 return Num0BitsBelowLS1Bit_Nonzero32(
11809      static_cast<uint32_t>(detail::BitsFromMask(mask)));
11810}
11811
11812template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11813HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) {
11814 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
11815 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
11816}
11817
11818template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11819HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
11820 return 31 - Num0BitsAboveMS1Bit_Nonzero32(
11821      static_cast<uint32_t>(detail::BitsFromMask(mask)));
11822}
11823
11824template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11825HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) {
11826 const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
11827 return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
11828 : -1;
11829}
11830
11831// ------------------------------ Compress, CompressBits
11832
11833namespace detail {
11834
11835// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
11836template <class D, HWY_IF_T_SIZE_D(D, 2)>
11837HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
11838 HWY_DASSERT(mask_bits < 256);
11839 const Rebind<uint8_t, decltype(d)> d8;
11840 const Twice<decltype(d8)> d8t;
11841 const RebindToUnsigned<decltype(d)> du;
11842
11843 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
11844 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
11845 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
11846 // store lane indices and convert to byte indices (2*lane + 0..1), with the
11847 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
11848 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
11849 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
11850 // is likely more costly than the higher cache footprint from storing bytes.
11851 alignas(16) static constexpr uint8_t table[2048] = {
11852 // PrintCompress16x8Tables
11853 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11854 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11855 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
11856 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11857 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
11858 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
11859 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
11860 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11861 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
11862 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
11863 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
11864 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
11865 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
11866 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
11867 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
11868 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11869 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
11870 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
11871 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
11872 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
11873 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
11874 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
11875 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
11876 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
11877 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
11878 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
11879 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
11880 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
11881 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
11882 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
11883 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
11884 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11885 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
11886 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
11887 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
11888 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
11889 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
11890 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
11891 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
11892 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
11893 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
11894 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
11895 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
11896 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
11897 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
11898 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
11899 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
11900 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
11901 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
11902 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
11903 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
11904 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
11905 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
11906 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
11907 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
11908 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
11909 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
11910 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
11911 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
11912 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
11913 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
11914 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
11915 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
11916 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
11917 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
11918 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
11919 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
11920 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
11921 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
11922 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
11923 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
11924 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
11925 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
11926 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
11927 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
11928 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
11929 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
11930 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
11931 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
11932 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
11933 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
11934 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
11935 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
11936 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
11937 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
11938 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
11939 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
11940 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
11941 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
11942 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
11943 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
11944 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
11945 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
11946 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
11947 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
11948 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
11949 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
11950 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
11951 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
11952 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
11953 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
11954 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
11955 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
11956 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
11957 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
11958 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
11959 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
11960 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
11961 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
11962 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
11963 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
11964 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
11965 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
11966 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
11967 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
11968 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
11969 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
11970 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
11971 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
11972 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
11973 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
11974 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
11975 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
11976 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
11977 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
11978 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
11979 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
11980 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
11981
11982 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
11983 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
11984 return BitCast(d, pairs + Set(du, 0x0100));
11985}
11986
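// Worked example of the index widening above (illustrative): the table stores
// the byte offset of the low byte of each u16 lane (2 * lane index).
// ZipLower(byte_idx, byte_idx) duplicates each offset into a u16 (offset 4
// becomes 0x0404) and adding 0x0100 yields 0x0504, whose little-endian bytes
// {4, 5} address both bytes of source lane 2 - exactly what TableLookupBytes
// (PSHUFB) needs.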
11987template <class D, HWY_IF_T_SIZE_D(D, 2)>
11988HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
11989 HWY_DASSERT(mask_bits < 256);
11990 const Rebind<uint8_t, decltype(d)> d8;
11991 const Twice<decltype(d8)> d8t;
11992 const RebindToUnsigned<decltype(d)> du;
11993
11994 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
11995 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
11996 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
11997 // store lane indices and convert to byte indices (2*lane + 0..1), with the
11998 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
11999 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
12000 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
12001 // is likely more costly than the higher cache footprint from storing bytes.
12002 alignas(16) static constexpr uint8_t table[2048] = {
12003 // PrintCompressNot16x8Tables
12004 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
12005 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
12006 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
12007 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
12008 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
12009 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
12010 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
12011 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
12012 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
12013 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
12014 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
12015 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
12016 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
12017 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
12018 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
12019 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
12020 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
12021 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
12022 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
12023 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
12024 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
12025 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
12026 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
12027 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
12028 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
12029 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
12030 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
12031 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
12032 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
12033 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
12034 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
12035 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
12036 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
12037 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
12038 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
12039 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
12040 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
12041 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
12042 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
12043 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
12044 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
12045 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
12046 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
12047 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
12048 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
12049 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
12050 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
12051 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
12052 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
12053 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
12054 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
12055 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
12056 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
12057 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
12058 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
12059 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
12060 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
12061 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
12062 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
12063 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
12064 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
12065 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
12066 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
12067 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
12068 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
12069 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
12070 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
12071 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
12072 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
12073 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
12074 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
12075 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
12076 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
12077 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
12078 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
12079 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
12080 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
12081 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
12082 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
12083 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
12084 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
12085 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
12086 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
12087 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
12088 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
12089 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
12090 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
12091 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
12092 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
12093 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
12094 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
12095 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
12096 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
12097 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
12098 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
12099 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
12100 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
12101 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
12102 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
12103 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
12104 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
12105 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
12106 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
12107 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
12108 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
12109 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
12110 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
12111 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
12112 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
12113 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
12114 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
12115 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
12116 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
12117 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
12118 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
12119 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
12120 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
12121 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
12122 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
12123 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
12124 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
12125 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
12126 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
12127 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
12128 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
12129 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
12130 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
12131 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
12132
12133 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
12134 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
12135 return BitCast(d, pairs + Set(du, 0x0100));
12136}
12137
12138template <class D, HWY_IF_T_SIZE_D(D, 4)>
12139HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
12140 HWY_DASSERT(mask_bits < 16);
12141
12142 // There are only 4 lanes, so we can afford to load the index vector directly.
12143 alignas(16) static constexpr uint8_t u8_indices[256] = {
12144 // PrintCompress32x4Tables
12145 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
12146 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
12147 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
12148 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
12149 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
12150 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
12151 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
12152 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
12153 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
12154 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
12155 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
12156 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
12157 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
12158 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
12159 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
12160 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
12161
12162 const Repartition<uint8_t, decltype(d)> d8;
12163 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12164}
12165
12166template <class D, HWY_IF_T_SIZE_D(D, 4)>
12167HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
12168 HWY_DASSERT(mask_bits < 16);
12169
12170 // There are only 4 lanes, so we can afford to load the index vector directly.
12171 alignas(16) static constexpr uint8_t u8_indices[256] = {
12172 // PrintCompressNot32x4Tables
12173 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
12174 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
12175 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
12176 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
12177 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
12178 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
12179 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
12180 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12181 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
12182 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
12183 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
12184 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
12185 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
12186 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12187 12, 13, 14, 15};
12188
12189 const Repartition<uint8_t, decltype(d)> d8;
12190 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12191}
12192
12193template <class D, HWY_IF_T_SIZE_D(D, 8)>
12194HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
12195 HWY_DASSERT(mask_bits < 4);
12196
12197 // There are only 2 lanes, so we can afford to load the index vector directly.
12198 alignas(16) static constexpr uint8_t u8_indices[64] = {
12199 // PrintCompress64x2Tables
12200 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12201 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12202 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
12203 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
12204
12205 const Repartition<uint8_t, decltype(d)> d8;
12206 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12207}
12208
12209template <class D, HWY_IF_T_SIZE_D(D, 8)>
12210HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
12211 HWY_DASSERT(mask_bits < 4);
12212
12213 // There are only 2 lanes, so we can afford to load the index vector directly.
12214 alignas(16) static constexpr uint8_t u8_indices[64] = {
12215 // PrintCompressNot64x2Tables
12216 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12217 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
12218 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
12219 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
12220
12221 const Repartition<uint8_t, decltype(d)> d8;
12222 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
12223}
12224
12225template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
12226HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
12227 const DFromV<decltype(v)> d;
12228 const RebindToUnsigned<decltype(d)> du;
12229
12230 HWY_DASSERT(mask_bits < (1ull << N));
12231 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12232 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
12233}
12234
12235template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
12236HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
12237 const DFromV<decltype(v)> d;
12238 const RebindToUnsigned<decltype(d)> du;
12239
12240 HWY_DASSERT(mask_bits < (1ull << N));
12241 const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
12242 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
12243}
12244
12245} // namespace detail
12246
12247// Single lane: no-op
12248template <typename T>
12249HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
12250 return v;
12251}
12252
12253// Two lanes: conditional swap
12254template <typename T, HWY_IF_T_SIZE(T, 8)>
12255HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
12256 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
12257 const DFromV<decltype(v)> d;
12258 const Vec128<T> m = VecFromMask(d, mask);
12259 const Vec128<T> maskL = DupEven(m);
12260 const Vec128<T> maskH = DupOdd(m);
12261 const Vec128<T> swap = AndNot(maskL, maskH);
12262 return IfVecThenElse(swap, Shuffle01(v), v);
12263}
12264
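// Illustrative truth table for the conditional swap above: swap =
// AndNot(maskL, maskH) is true only for mask = {0, 1}, the single case where
// lane 1 is the only lane kept and must move to slot 0 (Shuffle01 swaps the
// halves). For {0, 0}, {1, 0} and {1, 1} the input is already a valid
// partition and is returned unchanged.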
12265// General case, 2 or 4 bytes
12266template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
12267HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
12268 return detail::CompressBits(v, detail::BitsFromMask(mask));
12269}
12270
12271// ------------------------------ CompressNot
12272
12273// Single lane: no-op
12274template <typename T>
12275HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
12276 return v;
12277}
12278
12279// Two lanes: conditional swap
12280template <typename T, HWY_IF_T_SIZE(T, 8)>
12281HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
12282 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
12283 const DFromV<decltype(v)> d;
12284 const Vec128<T> m = VecFromMask(d, mask);
12285 const Vec128<T> maskL = DupEven(m);
12286 const Vec128<T> maskH = DupOdd(m);
12287 const Vec128<T> swap = AndNot(maskH, maskL);
12288 return IfVecThenElse(swap, Shuffle01(v), v);
12289}
12290
12291template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
12292HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
12293 // For partial vectors, we cannot pull the Not() into the table because
12294 // BitsFromMask clears the upper bits.
12295 if (N < 16 / sizeof(T)) {
12296 return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
12297 }
12298 return detail::CompressNotBits(v, detail::BitsFromMask(mask));
12299}
12300
12301// ------------------------------ CompressBlocksNot
12302HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
12303 Mask128<uint64_t> /* m */) {
12304 return v;
12305}
12306
12307template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
12308HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
12309 const uint8_t* HWY_RESTRICT bits) {
12310 uint64_t mask_bits = 0;
12311 constexpr size_t kNumBytes = (N + 7) / 8;
12312 CopyBytes<kNumBytes>(bits, &mask_bits);
12313 if (N < 8) {
12314 mask_bits &= (1ull << N) - 1;
12315 }
12316
12317 return detail::CompressBits(v, mask_bits);
12318}
12319
12320// ------------------------------ CompressStore, CompressBitsStore
12321
12322template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
12323HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
12324 TFromD<D>* HWY_RESTRICT unaligned) {
12325 const RebindToUnsigned<decltype(d)> du;
12326
12327 const uint64_t mask_bits = detail::BitsFromMask(m);
12328 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
12329 const size_t count = PopCount(mask_bits);
12330
12331 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
12332 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12333 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
12334 StoreU(compressed, d, unaligned);
12335 detail::MaybeUnpoison(unaligned, count);
12336 return count;
12337}
12338
12339template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
12340HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
12341 TFromD<D>* HWY_RESTRICT unaligned) {
12342 const RebindToUnsigned<decltype(d)> du;
12343
12344 const uint64_t mask_bits = detail::BitsFromMask(m);
12345 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
12346 const size_t count = PopCount(mask_bits);
12347
12348 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
12349 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12350 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
12351 BlendedStore(compressed, FirstN(d, count), d, unaligned);
12352 detail::MaybeUnpoison(unaligned, count);
12353 return count;
12354}
12355
12356template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
12357HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
12358 D d, TFromD<D>* HWY_RESTRICT unaligned) {
12359 const RebindToUnsigned<decltype(d)> du;
12360
12361 uint64_t mask_bits = 0;
12362 constexpr size_t kN = MaxLanes(d);
12363 constexpr size_t kNumBytes = (kN + 7) / 8;
12364 CopyBytes<kNumBytes>(bits, &mask_bits);
12365 if (kN < 8) {
12366 mask_bits &= (1ull << kN) - 1;
12367 }
12368 const size_t count = PopCount(mask_bits);
12369
12370 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
12371 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
12372 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
12373 StoreU(compressed, d, unaligned);
12374
12375 detail::MaybeUnpoison(unaligned, count);
12376 return count;
12377}
12378
12379#endif // HWY_TARGET <= HWY_AVX3
12380
12381// ------------------------------ Expand
12382
12383// Otherwise, use the generic_ops-inl.h fallback.
12384#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
12385
12386// The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL),
12387// but we still want to override generic_ops-inl's table-based implementation
12388// whenever we have the 32-bit expand provided by AVX3.
12389#ifdef HWY_NATIVE_EXPAND
12390#undef HWY_NATIVE_EXPAND
12391#else
12392#define HWY_NATIVE_EXPAND
12393#endif
12394
12395namespace detail {
12396
12397#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2
12398
12399template <size_t N>
12400HWY_INLINE Vec128<uint8_t, N> NativeExpand(Vec128<uint8_t, N> v,
12401                                           Mask128<uint8_t, N> mask) {
12402 return Vec128<uint8_t, N>{_mm_maskz_expand_epi8(mask.raw, v.raw)};
12403}
12404
12405template <size_t N>
12406HWY_INLINE Vec128<uint16_t, N> NativeExpand(Vec128<uint16_t, N> v,
12407                                            Mask128<uint16_t, N> mask) {
12408 return Vec128<uint16_t, N>{_mm_maskz_expand_epi16(mask.raw, v.raw)};
12409}
12410
12411template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
12412HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
12413                                      const uint8_t* HWY_RESTRICT unaligned) {
12414 return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)};
12415}
12416
12417template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
12418HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
12419                                      const uint16_t* HWY_RESTRICT unaligned) {
12420 return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)};
12421}
12422
12423#endif // HWY_TARGET <= HWY_AVX3_DL
12424
12425template <size_t N>
12426HWY_INLINE Vec128<uint32_t, N> NativeExpand(Vec128<uint32_t, N> v,
12427                                            Mask128<uint32_t, N> mask) {
12428 return Vec128<uint32_t, N>{_mm_maskz_expand_epi32(mask.raw, v.raw)};
12429}
12430
12431template <size_t N>
12432HWY_INLINE Vec128<uint64_t, N> NativeExpand(Vec128<uint64_t, N> v,
12433                                            Mask128<uint64_t, N> mask) {
12434 return Vec128<uint64_t, N>{_mm_maskz_expand_epi64(mask.raw, v.raw)};
12435}
12436
12437template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
12438HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
12439                                      const uint32_t* HWY_RESTRICT unaligned) {
12440 return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)};
12441}
12442
12443template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
12444HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
12445                                      const uint64_t* HWY_RESTRICT unaligned) {
12446 return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)};
12447}
12448
12449} // namespace detail
12450
12451// Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo.
12452#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2
12453
12454template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
12455HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
12456 const DFromV<decltype(v)> d;
12457 const RebindToUnsigned<decltype(d)> du;
12458 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12459 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
12460}
12461
12462#endif // HWY_TARGET <= HWY_AVX3_DL
12463
12464template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
12465HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
12466 const DFromV<decltype(v)> d;
12467 const RebindToUnsigned<decltype(d)> du;
12468 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12469 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
12470}
12471
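// Illustrative note: Expand is the inverse scatter of Compress. Source lanes
// are consumed in order and written to the mask=true positions; mask=false
// positions become zero (cf. the maskz_ intrinsics above). E.g. for 4 u32
// lanes,
//   Expand({a, b, c, d}, mask {1, 0, 1, 0}) == {a, 0, b, 0}.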
12472// ------------------------------ LoadExpand
12473
12474template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
12475 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
12476HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
12477 const TFromD<D>* HWY_RESTRICT unaligned) {
12478#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
12479 const RebindToUnsigned<decltype(d)> du;
12480 using TU = TFromD<decltype(du)>;
12481 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
12482 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12483 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
12484#else
12485 return Expand(LoadU(d, unaligned), mask);
12486#endif
12487}
12488
12489template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
12490 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
12491HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
12492 const TFromD<D>* HWY_RESTRICT unaligned) {
12493#if HWY_TARGET <= HWY_AVX3
12494 const RebindToUnsigned<decltype(d)> du;
12495 using TU = TFromD<decltype(du)>;
12496 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
12497 const MFromD<decltype(du)> mu = RebindMask(du, mask);
12498 return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
12499#else
12500 return Expand(LoadU(d, unaligned), mask);
12501#endif
12502}
12503
12504#endif // HWY_TARGET <= HWY_AVX3
12505
12506// ------------------------------ StoreInterleaved2/3/4
12507
12508// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
12509// generic_ops-inl.h.
12510
12511// ------------------------------ Additional mask logical operations
12512
12513#if HWY_TARGET <= HWY_AVX3
12514namespace detail {
12515
12516template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
12517static HWY_INLINE uint32_t AVX3Blsi(T x) {
12518 using TU = MakeUnsigned<T>;
12519 const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
12520#if HWY_COMPILER_CLANGCL
12521 return static_cast<uint32_t>(u32_val & (0u - u32_val));
12522#else
12523 return static_cast<uint32_t>(_blsi_u32(u32_val));
12524#endif
12525}
12526template <class T, HWY_IF_T_SIZE(T, 8)>
12527static HWY_INLINE uint64_t AVX3Blsi(T x) {
12528 const auto u64_val = static_cast<uint64_t>(x);
12529#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
12530 return static_cast<uint64_t>(u64_val & (0ULL - u64_val));
12531#else
12532 return static_cast<uint64_t>(_blsi_u64(u64_val));
12533#endif
12534}
12535
12536template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
12537static HWY_INLINE uint32_t AVX3Blsmsk(T x) {
12538 using TU = MakeUnsigned<T>;
12539 const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
12540#if HWY_COMPILER_CLANGCL
12541 return static_cast<uint32_t>(u32_val ^ (u32_val - 1u));
12542#else
12543 return static_cast<uint32_t>(_blsmsk_u32(u32_val));
12544#endif
12545}
12546template <class T, HWY_IF_T_SIZE(T, 8)>
12547static HWY_INLINE uint64_t AVX3Blsmsk(T x) {
12548 const auto u64_val = static_cast<uint64_t>(x);
12549#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
12550 return static_cast<uint64_t>(u64_val ^ (u64_val - 1ULL));
12551#else
12552 return static_cast<uint64_t>(_blsmsk_u64(u64_val));
12553#endif
12554}
12555
12556} // namespace detail
12557
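// Worked example (illustrative): for mask bits 0b0110'1000, AVX3Blsi returns
// the lowest set bit 0b0000'1000 and AVX3Blsmsk returns 0b0000'1111 (all bits
// up to and including it). The mask ops below therefore yield
//   SetAtOrAfterFirst : (0 - Blsi) & active = 0b1111'1000 (for 8 lanes)
//   SetBeforeFirst    : Blsi - 1            = 0b0000'0111
//   SetAtOrBeforeFirst: Blsmsk              = 0b0000'1111
//   SetOnlyFirst      : Blsi                = 0b0000'1000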
12558template <class T, size_t N>
12559HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
12560 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
12561 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
12562 (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)};
12563}
12564template <class T, size_t N>
12565HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
12566 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
12567 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
12568 (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)};
12569}
12570template <class T, size_t N>
12571HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
12572 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
12573 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
12574 detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)};
12575}
12576template <class T, size_t N>
12577HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
12578 return Mask128<T, N>{
12579 static_cast<typename Mask128<T, N>::Raw>(detail::AVX3Blsi(mask.raw))};
12580}
12581#else // AVX2 or below
12582template <class T>
12583HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
12584 return mask;
12585}
12586template <class T>
12587HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
12588 const FixedTag<T, 2> d;
12589 const auto vmask = VecFromMask(d, mask);
12590 return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
12591}
12592template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
12593HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
12594 const Simd<T, N, 0> d;
12595 const auto vmask = VecFromMask(d, mask);
12596 const auto neg_vmask =
12597 ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
12598 return MaskFromVec(Or(vmask, neg_vmask));
12599}
12600template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
12601HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
12602 const Full128<T> d;
12603 const Repartition<int64_t, decltype(d)> di64;
12604 const Repartition<float, decltype(d)> df32;
12605 const Repartition<int32_t, decltype(d)> di32;
12606 using VF = VFromD<decltype(df32)>;
12607
12608 auto vmask = BitCast(di64, VecFromMask(d, mask));
12609 vmask = Or(vmask, Neg(vmask));
12610
12611 // Copy the sign bit of the first int64_t lane to the second int64_t lane
12612 const auto vmask2 = BroadcastSignBit(
12613 BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw,
12614 _MM_SHUFFLE(1, 1, 0, 0))}));
12615 return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2))));
12616}
12617
12618template <class T, size_t N>
12619HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
12620 return Not(SetAtOrAfterFirst(mask));
12621}
12622
12623template <class T>
12624HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
12625 return mask;
12626}
12627template <class T>
12628HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
12629 const FixedTag<T, 2> d;
12630 const RebindToSigned<decltype(d)> di;
12631
12632 const auto vmask = BitCast(di, VecFromMask(d, mask));
12633 const auto zero = Zero(di);
12634 const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
12635 return MaskFromVec(BitCast(d, And(vmask, vmask2)));
12636}
12637template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
12638HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
12639 const Simd<T, N, 0> d;
12640 const RebindToSigned<decltype(d)> di;
12641
12642 const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
12643 const auto only_first_vmask =
12644 BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
12645 return MaskFromVec(only_first_vmask);
12646}
12647template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
12648HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
12649 const Full128<T> d;
12650 const RebindToSigned<decltype(d)> di;
12651 const Repartition<int64_t, decltype(d)> di64;
12652
12653 const auto zero = Zero(di64);
12654 const auto vmask = BitCast(di64, VecFromMask(d, mask));
12655 const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
12656 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
12657 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
12658}
12659
12660template <class T>
12661HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
12662 const FixedTag<T, 1> d;
12663 const RebindToSigned<decltype(d)> di;
12664 using TI = MakeSigned<T>;
12665
12666 return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
12667}
12668template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
12669HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
12670 const Simd<T, N, 0> d;
12671 return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
12672}
12673#endif // HWY_TARGET <= HWY_AVX3
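// Usage sketch (illustrative, not from the upstream header; assumes the usual
// hwy/highway.h setup inside HWY_NAMESPACE). With the first true lane at
// index 2 of a 4-lane mask, the four ops partition the lanes as follows:
//   const Full128<uint32_t> d;
//   const auto v = Iota(d, 0);            // lanes 0 1 2 3
//   const auto m = Eq(v, Set(d, 2u));     // 0 0 1 0
//   SetAtOrAfterFirst(m);                 // 0 0 1 1
//   SetBeforeFirst(m);                    // 1 1 0 0
//   SetOnlyFirst(m);                      // 0 0 1 0
//   SetAtOrBeforeFirst(m);                // 1 1 1 0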
12674
12675// ------------------------------ Reductions
12676
12677// Nothing fully native, generic_ops-inl defines SumOfLanes and ReduceSum.
12678
12679// We provide specializations of u8x8 and u8x16, so exclude those.
12680#undef HWY_IF_SUM_OF_LANES_D
12681#define HWY_IF_SUM_OF_LANES_D(D) \
12682 HWY_IF_LANES_GT_D(D, 1), \
12683 hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() || \
12684 (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
12685 nullptr
12686
12687template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
12688HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
12689 return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
12690}
12691template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
12692HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
12693 const Repartition<uint64_t, decltype(d)> d64;
12694 VFromD<decltype(d64)> sums = SumsOf8(v);
12695 sums = SumOfLanes(d64, sums);
12696 return Broadcast<0>(BitCast(d, sums));
12697}
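// Usage sketch (illustrative, not from the upstream header): the u8
// specializations reduce via SumsOf8, which adds each group of 8 bytes into a
// u64 lane, then fold the u64 lanes and broadcast the low byte, so the result
// is the sum mod 256:
//   const Full128<uint8_t> d;
//   const auto v = Set(d, uint8_t{1});
//   GetLane(SumOfLanes(d, v));  // == 16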
12698
12699#if HWY_TARGET <= HWY_SSE4
12700// We provide specializations of u8x8, u8x16, and u16x8, so exclude those.
12701#undef HWY_IF_MINMAX_OF_LANES_D
12702#define HWY_IF_MINMAX_OF_LANES_D(D) \
12703 HWY_IF_LANES_GT_D(D, 1), \
12704 hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() || \
12705 ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \
12706 (!hwy::IsSame<TFromD<D>, uint16_t>() || \
12707 (HWY_V_SIZE_D(D) != 16))>* = nullptr
12708
12709template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
12710HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
12711 return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)});
12712}
12713
12714template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
12715HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
12716 const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>());
12717 return max - MinOfLanes(d, max - v);
12718}
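// Illustrative note (not part of the upstream header): SSE4 only offers a
// horizontal u16 minimum (_mm_minpos_epu16), so MaxOfLanes uses the complement
// identity max(v) == 0xFFFF - min(0xFFFF - v), valid because x -> 0xFFFF - x
// reverses the unsigned order. The u8 variants below reuse the same trick
// with 0xFF.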
12719
12720template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
12721HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
12722 const Rebind<uint16_t, decltype(d)> d16;
12723 return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v)));
12724}
12725template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
12726HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
12727 const Half<decltype(d)> dh;
12728 Vec64<uint8_t> result =
12729 Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v)));
12730 return Combine(d, result, result);
12731}
12732
12733template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
12734HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
12735 const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
12736 return m - MinOfLanes(d, m - v);
12737}
12738template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
12739HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
12740 const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
12741 return m - MinOfLanes(d, m - v);
12742}
12743
12744#endif // HWY_TARGET <= HWY_SSE4
12745
12746// ------------------------------ Lt128
12747
12748namespace detail {
12749
12750// Returns vector-mask for Lt128. Generic for all vector lengths.
12751template <class D, HWY_IF_U64_D(D)>
12752HWY_INLINE VFromD<D> Lt128Vec(D d, VFromD<D> a, VFromD<D> b) {
12753 // Truth table of Eq and Lt for Hi and Lo u64.
12754 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
12755 // =H =L cH cL | out = cH | (=H & cL)
12756 // 0 0 0 0 | 0
12757 // 0 0 0 1 | 0
12758 // 0 0 1 0 | 1
12759 // 0 0 1 1 | 1
12760 // 0 1 0 0 | 0
12761 // 0 1 0 1 | 0
12762 // 0 1 1 0 | 1
12763 // 1 0 0 0 | 0
12764 // 1 0 0 1 | 1
12765 // 1 1 0 0 | 0
12766 const auto eqHL = Eq(a, b);
12767 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
12768 const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL);
12769 const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL);
12770 return InterleaveUpper(d, vecHx, vecHx);
12771}
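// Worked example (illustrative, not part of the upstream header): for
// a = (hi=1, lo=5) and b = (hi=1, lo=7), stored as u64 lanes {lo, hi}, we have
// =H true, cH false and cL true, so out = cH | (=H & cL) = 1: Lt128Vec returns
// all-ones in both lanes of the block, i.e. a < b as a 128-bit integer.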
12772
12773// Returns vector-mask for Eq128. Generic for all vector lengths.
12774template <class D, HWY_IF_U64_D(D)>
12775HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) {
12776 const auto eqHL = VecFromMask(d, Eq(a, b));
12777 const auto eqLH = Reverse2(d, eqHL);
12778 return And(eqHL, eqLH);
12779}
12780
12781template <class D, HWY_IF_U64_D(D)>
12782HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) {
12783 const auto neHL = VecFromMask(d, Ne(a, b));
12784 const auto neLH = Reverse2(d, neHL);
12785 return Or(neHL, neLH);
12786}
12787
12788template <class D, HWY_IF_U64_D(D)>
12789HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
12790 // No specialization required for AVX-512: Mask <-> Vec is fast, and
12791 // copying mask bits to their neighbor seems infeasible.
12792 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
12793 return InterleaveUpper(d, ltHL, ltHL);
12794}
12795
12796template <class D, HWY_IF_U64_D(D)>
12797HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
12798 // No specialization required for AVX-512: Mask <-> Vec is fast, and
12799 // copying mask bits to their neighbor seems infeasible.
12800 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
12801 return InterleaveUpper(d, eqHL, eqHL);
12802}
12803
12804template <class D, HWY_IF_U64_D(D)>
12805HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
12806 // No specialization required for AVX-512: Mask <-> Vec is fast, and
12807 // copying mask bits to their neighbor seems infeasible.
12808 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
12809 return InterleaveUpper(d, neHL, neHL);
12810}
12811
12812} // namespace detail
12813
12814template <class D, HWY_IF_U64_D(D)>
12815HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
12816 return MaskFromVec(detail::Lt128Vec(d, a, b));
12817}
12818
12819template <class D, HWY_IF_U64_D(D)>
12820HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
12821 return MaskFromVec(detail::Eq128Vec(d, a, b));
12822}
12823
12824template <class D, HWY_IF_U64_D(D)>
12825HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
12826 return MaskFromVec(detail::Ne128Vec(d, a, b));
12827}
12828
12829template <class D, HWY_IF_U64_D(D)>
12830HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
12831 return MaskFromVec(detail::Lt128UpperVec(d, a, b));
12832}
12833
12834template <class D, HWY_IF_U64_D(D)>
12835HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
12836 return MaskFromVec(detail::Eq128UpperVec(d, a, b));
12837}
12838
12839template <class D, HWY_IF_U64_D(D)>
12840HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
12841 return MaskFromVec(detail::Ne128UpperVec(d, a, b));
12842}
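// Usage sketch (illustrative, not from the upstream header;
// Dup128VecFromValues is used only for exposition). Each 128-bit key occupies
// two u64 lanes, with the high half in the upper lane:
//   const Full128<uint64_t> d;
//   const auto a = Dup128VecFromValues(d, /*lo=*/5u, /*hi=*/1u);
//   const auto b = Dup128VecFromValues(d, /*lo=*/7u, /*hi=*/1u);
//   Lt128(d, a, b);       // true in both lanes (a < b as 128-bit integers)
//   Eq128(d, a, b);       // false (low halves differ)
//   Lt128Upper(d, a, b);  // false (compares only the upper u64 lanes)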
12843
12844// ------------------------------ Min128, Max128 (Lt128)
12845
12846// Avoids the extra MaskFromVec in Lt128.
12847template <class D, HWY_IF_U64_D(D)>
12848HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
12849 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
12850}
12851
12852template <class D, HWY_IF_U64_D(D)>
12853HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
12854 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
12855}
12856
12857template <class D, HWY_IF_U64_D(D)>
12858HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
12859 return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
12860}
12861
12862template <class D, HWY_IF_U64_D(D)>
12863HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
12864 return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
12865}
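// Illustrative note (not part of the upstream header): with a and b as in the
// Lt128 sketch above, Min128(d, a, b) returns the block {lo=5, hi=1} and
// Max128(d, a, b) the block {lo=7, hi=1}. The *Upper variants order blocks by
// the upper u64 lane only, so with equal upper lanes (as here) both
// Min128Upper and Max128Upper return the second operand b.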
12866
12867// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
12868
12869#if HWY_TARGET <= HWY_AVX3
12870
12871#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
12872#undef HWY_NATIVE_LEADING_ZERO_COUNT
12873#else
12874#define HWY_NATIVE_LEADING_ZERO_COUNT
12875#endif
12876
12877template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
12878HWY_API V LeadingZeroCount(V v) {
12879 return V{_mm_lzcnt_epi32(v.raw)};
12880}
12881
12882template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
12883HWY_API V LeadingZeroCount(V v) {
12884 return V{_mm_lzcnt_epi64(v.raw)};
12885}
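// Example (illustrative, not part of the upstream header): for u32 lanes,
// _mm_lzcnt_epi32 counts leading zero bits per lane and is defined for zero:
//   const Full128<uint32_t> d;
//   LeadingZeroCount(Set(d, 1u));  // 31 in every lane
//   LeadingZeroCount(Zero(d));     // 32 in every lane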
12886
12887// HighestSetBitIndex and TrailingZeroCount are implemented in x86_512-inl.h
12888// for AVX3 targets.
12889
12890#endif // HWY_TARGET <= HWY_AVX3
12891
12892// NOLINTNEXTLINE(google-readability-namespace-comments)
12893} // namespace HWY_NAMESPACE
12894} // namespace hwy
12895HWY_AFTER_NAMESPACE();
12896
12897#undef HWY_X86_IF_EMULATED_D
12898
12899// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
12900// the warning seems to be issued at the call site of intrinsics, i.e. our code.
12901HWY_DIAGNOSTICS(pop)