ppc_vsx-inl.h
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors for VSX/Z14
// External include guard in highway.h - see comment there.

#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
#define HWY_S390X_HAVE_Z14 1
#else
#define HWY_S390X_HAVE_Z14 0
#endif

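// The system vector headers define "vector", "pixel" and "bool" as macros,
// which would clash with the C++ keyword and common identifiers. Save any
// existing definitions here and restore them after the include.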
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if HWY_S390X_HAVE_Z14
#include <vecintrin.h>
#else
#include <altivec.h>
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include "hwy/ops/shared-inl.h"

// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__,
// and some GCC versions do the same for _ARCH_PWR10.
// This means we can only use POWER10-specific intrinsics in static dispatch
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
// On other compilers, the usual target check is sufficient.
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
    (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

template <typename T>
struct Raw128;

// Each Raw128 specialization defines the following typedefs:
// - type:
//   the backing Altivec/VSX raw vector type of the Vec128<T, N> type
// - RawBoolVec:
//   the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
// - RawT:
//   the lane type for intrinsics, in particular vec_splat
// - AlignedRawVec:
//   the 128-bit GCC/Clang vector type for aligned loads/stores
// - UnalignedRawVec:
//   the 128-bit GCC/Clang vector type for unaligned loads/stores
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                 \
  struct Raw128<LANE_TYPE> {                                                  \
    using type = __vector RAW_VECT_LANE_TYPE;                                 \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;               \
    using RawT = RAW_VECT_LANE_TYPE;                                          \
    typedef LANE_TYPE AlignedRawVec                                           \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                         \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };

HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short)     // NOLINT(runtime/int)
HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long)     // NOLINT(runtime/int)
HWY_VSX_RAW128(uint64_t, unsigned long long, long long)  // NOLINT(runtime/int)
HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)  // NOLINT(runtime/int)

template <>
struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};

template <>
struct Raw128<float16_t> : public Raw128<uint16_t> {};

#undef HWY_VSX_RAW128

120} // namespace detail
121
122template <typename T, size_t N = 16 / sizeof(T)>
123class Vec128 {
124 using Raw = typename detail::Raw128<T>::type;
125
126 public:
127 using PrivateT = T; // only for DFromV
128 static constexpr size_t kPrivateN = N; // only for DFromV
129
130 // Compound assignment. Only usable if there is a corresponding non-member
131 // binary operator overload. For example, only f32 and f64 support division.
133 return *this = (*this * other);
134 }
136 return *this = (*this / other);
137 }
139 return *this = (*this + other);
140 }
142 return *this = (*this - other);
143 }
145 return *this = (*this % other);
146 }
148 return *this = (*this & other);
149 }
151 return *this = (*this | other);
152 }
154 return *this = (*this ^ other);
155 }
156
157 Raw raw;
158};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::RawBoolVec raw;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

187// ------------------------------ Zero
188
189// Returns an all-zero vector/part.
190template <class D, typename T = TFromD<D>>
191HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
192 // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
193 // argument in order to select the correct overload. We instead cast the
194 // return vector type; see also the comment in BitCast.
195 return Vec128<T, HWY_MAX_LANES_D(D)>{
196 reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
197}
198
199template <class D>
200using VFromD = decltype(Zero(D()));
201
202// ------------------------------ Tuple (VFromD)
203#include "hwy/ops/tuple-inl.h"
204
205// ------------------------------ BitCast
206
207template <class D, typename FromT>
209 Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
210 // C-style casts are not sufficient when compiling with
211 // -fno-lax-vector-conversions, which will be the future default in Clang,
212 // but reinterpret_cast is.
213 return VFromD<D>{
214 reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
215}
216
217// ------------------------------ ResizeBitCast
218
219template <class D, typename FromV>
220HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
221 // C-style casts are not sufficient when compiling with
222 // -fno-lax-vector-conversions, which will be the future default in Clang,
223 // but reinterpret_cast is.
224 return VFromD<D>{
225 reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
226}
227
228// ------------------------------ Set
229
230// Returns a vector/part with all lanes set to "t".
231template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
233 using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
234 return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
235}
236
237template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
238HWY_API VFromD<D> Set(D d, TFromD<D> t) {
239 const RebindToUnsigned<decltype(d)> du;
240 return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
241}
242
243// Returns a vector with uninitialized elements.
244template <class D>
246#if HWY_COMPILER_GCC_ACTUAL
247 // Suppressing maybe-uninitialized both here and at the caller does not work,
248 // so initialize.
249 return Zero(d);
250#else
251 HWY_DIAGNOSTICS(push)
252 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
253 typename detail::Raw128<TFromD<D>>::type raw;
254 return VFromD<decltype(d)>{raw};
255 HWY_DIAGNOSTICS(pop)
256#endif
257}

// ------------------------------ GetLane

// Gets the single value stored in a vector/part.

template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
  return static_cast<T>(v.raw[0]);
}

268// ------------------------------ Dup128VecFromValues
269
270template <class D, HWY_IF_T_SIZE_D(D, 1)>
271HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
272 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
273 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
274 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
275 TFromD<D> t11, TFromD<D> t12,
276 TFromD<D> t13, TFromD<D> t14,
277 TFromD<D> t15) {
278 const typename detail::Raw128<TFromD<D>>::type raw = {
279 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
280 return VFromD<D>{raw};
281}
282
283template <class D, HWY_IF_UI16_D(D)>
284HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
285 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
286 TFromD<D> t5, TFromD<D> t6,
287 TFromD<D> t7) {
288 const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
289 t4, t5, t6, t7};
290 return VFromD<D>{raw};
291}
292
293template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
294HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
295 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
296 TFromD<D> t5, TFromD<D> t6,
297 TFromD<D> t7) {
298 const RebindToUnsigned<decltype(d)> du;
299 return BitCast(
301 du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
302 BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
303 BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
304 BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
305}
306
307template <class D, HWY_IF_T_SIZE_D(D, 4)>
308HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
309 TFromD<D> t2, TFromD<D> t3) {
310 const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
311 return VFromD<D>{raw};
312}
313
314template <class D, HWY_IF_T_SIZE_D(D, 8)>
315HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
316 const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
317 return VFromD<D>{raw};
318}
319
320// ================================================== LOGICAL
321
322// ------------------------------ And
323
324template <typename T, size_t N>
325HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
326 const DFromV<decltype(a)> d;
327 const RebindToUnsigned<decltype(d)> du;
328 using VU = VFromD<decltype(du)>;
329#if HWY_S390X_HAVE_Z14
330 return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
331#else
332 return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
333#endif
334}
335
336// ------------------------------ AndNot
337
338// Returns ~not_mask & mask.
339template <typename T, size_t N>
340HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
341 const DFromV<decltype(mask)> d;
342 const RebindToUnsigned<decltype(d)> du;
343 using VU = VFromD<decltype(du)>;
344 return BitCast(
345 d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
346}
347
348// ------------------------------ Or
349
350template <typename T, size_t N>
351HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
352 const DFromV<decltype(a)> d;
353 const RebindToUnsigned<decltype(d)> du;
354 using VU = VFromD<decltype(du)>;
355#if HWY_S390X_HAVE_Z14
356 return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
357#else
358 return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
359#endif
360}
361
362// ------------------------------ Xor
363
364template <typename T, size_t N>
365HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
366 const DFromV<decltype(a)> d;
367 const RebindToUnsigned<decltype(d)> du;
368 using VU = VFromD<decltype(du)>;
369#if HWY_S390X_HAVE_Z14
370 return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
371#else
372 return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
373#endif
374}
375
376// ------------------------------ Not
377template <typename T, size_t N>
378HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
379 const DFromV<decltype(v)> d;
380 const RebindToUnsigned<decltype(d)> du;
381 using VU = VFromD<decltype(du)>;
382 return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
383}
384
385// ------------------------------ IsConstantRawAltivecVect
386namespace detail {
387
388template <class RawV>
390 hwy::SizeTag<1> /* lane_size_tag */, RawV v) {
391 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
392 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
393 __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
394 __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
395 __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
396 __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
397 __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
398 __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
399}
400
401template <class RawV>
403 hwy::SizeTag<2> /* lane_size_tag */, RawV v) {
404 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
405 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
406 __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
407 __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
408}
409
410template <class RawV>
412 hwy::SizeTag<4> /* lane_size_tag */, RawV v) {
413 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
414 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
415}
416
417template <class RawV>
419 hwy::SizeTag<8> /* lane_size_tag */, RawV v) {
420 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
421}
422
423template <class RawV>
425 return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
426}
427
428} // namespace detail
429
430// ------------------------------ TernaryLogic
431#if HWY_PPC_HAVE_10
432namespace detail {
433
434// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
435// order of the kTernLogOp bits of AVX3
436// _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
template <uint8_t kTernLogOp, class V>
HWY_INLINE V TernaryLogic(V a, V b, V c) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;

#if HWY_COMPILER_GCC_ACTUAL
  // Use inline assembly on GCC to work around a GCC compiler bug
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
            "n"(static_cast<unsigned>(kTernLogOp))
          :);
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif

  return BitCast(d, VU{raw_ternlog_result});
}

}  // namespace detail
#endif  // HWY_PPC_HAVE_10

465// ------------------------------ Xor3
466template <typename T, size_t N>
467HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
468#if HWY_PPC_HAVE_10
469#if defined(__OPTIMIZE__)
470 if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
471 static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
472 static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
473 2) {
474 return Xor(x1, Xor(x2, x3));
475 } else // NOLINT
476#endif
477 {
478 return detail::TernaryLogic<0x69>(x1, x2, x3);
479 }
480#else
481 return Xor(x1, Xor(x2, x3));
482#endif
483}
484
485// ------------------------------ Or3
486template <typename T, size_t N>
487HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
488#if HWY_PPC_HAVE_10
489#if defined(__OPTIMIZE__)
490 if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
491 static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
492 static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
493 2) {
494 return Or(o1, Or(o2, o3));
495 } else // NOLINT
496#endif
497 {
498 return detail::TernaryLogic<0x7F>(o1, o2, o3);
499 }
500#else
501 return Or(o1, Or(o2, o3));
502#endif
503}
504
505// ------------------------------ OrAnd
506template <typename T, size_t N>
507HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
508#if HWY_PPC_HAVE_10
509#if defined(__OPTIMIZE__)
512 return Or(o, And(a1, a2));
513 } else // NOLINT
514#endif
515 {
516 return detail::TernaryLogic<0x1F>(o, a1, a2);
517 }
518#else
519 return Or(o, And(a1, a2));
520#endif
521}
522
523// ------------------------------ IfVecThenElse
524template <typename T, size_t N>
525HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
526 Vec128<T, N> no) {
527 const DFromV<decltype(yes)> d;
528 const RebindToUnsigned<decltype(d)> du;
529 return BitCast(
530 d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
531 BitCast(du, mask).raw)});
532}
533
534// ------------------------------ BitwiseIfThenElse
535
536#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
537#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
538#else
539#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
540#endif
541
542template <class V>
543HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
544 return IfVecThenElse(mask, yes, no);
545}
546
547// ------------------------------ Operator overloads (internal-only if float)
548
549template <typename T, size_t N>
550HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
551 return And(a, b);
552}
553
554template <typename T, size_t N>
555HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
556 return Or(a, b);
557}
558
559template <typename T, size_t N>
560HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
561 return Xor(a, b);
562}
563
564// ================================================== SIGN
565
566// ------------------------------ Neg
567
568template <typename T, size_t N, HWY_IF_SIGNED(T)>
570 // If T is an signed integer type, use Zero(d) - v instead of vec_neg to
571 // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
572 const DFromV<decltype(v)> d;
573 return Zero(d) - v;
574}
575
576template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
577HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
578#if HWY_S390X_HAVE_Z14
579 return Xor(v, SignBit(DFromV<decltype(v)>()));
580#else
581 return Vec128<T, N>{vec_neg(v.raw)};
582#endif
583}
584
585template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
586HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
587 return Xor(v, SignBit(DFromV<decltype(v)>()));
588}
589
590// ------------------------------ Abs
591
592// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
593template <class T, size_t N, HWY_IF_SIGNED(T)>
595 // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
596 // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
597 return Max(v, Neg(v));
598}
599
600template <class T, size_t N, HWY_IF_FLOAT3264(T)>
601HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
602 return Vec128<T, N>{vec_abs(v.raw)};
603}
604
605// ------------------------------ CopySign
606
607#if HWY_S390X_HAVE_Z14
608template <class V>
609HWY_API V CopySign(const V magn, const V sign) {
610 static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
611
612 const DFromV<decltype(magn)> d;
613 const auto msb = SignBit(d);
614
615 // Truth table for msb, magn, sign | bitwise msb ? sign : mag
616 // 0 0 0 | 0
617 // 0 0 1 | 0
618 // 0 1 0 | 1
619 // 0 1 1 | 1
620 // 1 0 0 | 0
621 // 1 0 1 | 1
622 // 1 1 0 | 0
623 // 1 1 1 | 1
624 return BitwiseIfThenElse(msb, sign, magn);
625}
626#else // VSX
627template <size_t N>
628HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
629 Vec128<float, N> sign) {
630 // Work around compiler bugs that are there with vec_cpsgn on older versions
631 // of GCC/Clang
632#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
633 return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
634#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
635 HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
636 return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
637#else
638 return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
639#endif
640}
641
642template <size_t N>
643HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
644 Vec128<double, N> sign) {
645 // Work around compiler bugs that are there with vec_cpsgn on older versions
646 // of GCC/Clang
647#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
648 return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
649#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
650 HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
651 return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
652#else
653 return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
654#endif
655}
656#endif // HWY_S390X_HAVE_Z14
657
658template <typename T, size_t N>
659HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
660 // PPC8 can also handle abs < 0, so no extra action needed.
661 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
662 return CopySign(abs, sign);
663}
664
665// ================================================== MEMORY (1)
666
667// Note: type punning is safe because the types are tagged with may_alias.
668// (https://godbolt.org/z/fqrWjfjsP)
669
670// ------------------------------ Load
671
672template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
673HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
674// Suppress the ignoring attributes warning that is generated by
675// HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
676#if HWY_COMPILER_GCC
677 HWY_DIAGNOSTICS(push)
678 HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
679#endif
680
681 using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
682 const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
683 using ResultRaw = typename detail::Raw128<T>::type;
684 return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
685
686#if HWY_COMPILER_GCC
687 HWY_DIAGNOSTICS(pop)
688#endif
689}
690
691// Any <= 64 bit
692template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
694 using BitsT = UnsignedFromSize<d.MaxBytes()>;
695
696 BitsT bits;
697 const Repartition<BitsT, decltype(d)> d_bits;
698 CopyBytes<d.MaxBytes()>(p, &bits);
699 return BitCast(d, Set(d_bits, bits));
700}
701
702// ================================================== MASK
703
704// ------------------------------ Mask
705
706// Mask and Vec are both backed by vector types (true = FF..FF).
707template <typename T, size_t N>
708HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
709 using Raw = typename detail::Raw128<T>::RawBoolVec;
710 return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
711}
712
713template <class D>
714using MFromD = decltype(MaskFromVec(VFromD<D>()));
715
716template <typename T, size_t N>
718 return Vec128<T, N>{
719 reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)};
720}
721
722template <class D>
723HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
724 return VFromD<D>{
725 reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
726}
727
728// mask ? yes : no
729template <typename T, size_t N>
730HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
731 Vec128<T, N> no) {
732 const DFromV<decltype(yes)> d;
733 const RebindToUnsigned<decltype(d)> du;
734 return BitCast(d, VFromD<decltype(du)>{vec_sel(
735 BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
736}
737
738// mask ? yes : 0
739template <typename T, size_t N>
740HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
741 return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
742}
743
744// mask ? 0 : no
745template <typename T, size_t N>
746HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
747 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
748}
749
750// ------------------------------ Mask logical
751
752template <typename T, size_t N>
753HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
754 return Mask128<T, N>{vec_nor(m.raw, m.raw)};
755}
756
757template <typename T, size_t N>
758HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
759#if HWY_S390X_HAVE_Z14
760 return Mask128<T, N>{a.raw & b.raw};
761#else
762 return Mask128<T, N>{vec_and(a.raw, b.raw)};
763#endif
764}
765
766template <typename T, size_t N>
767HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
768 return Mask128<T, N>{vec_andc(b.raw, a.raw)};
769}
770
771template <typename T, size_t N>
772HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
773#if HWY_S390X_HAVE_Z14
774 return Mask128<T, N>{a.raw | b.raw};
775#else
776 return Mask128<T, N>{vec_or(a.raw, b.raw)};
777#endif
778}
779
780template <typename T, size_t N>
781HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
782#if HWY_S390X_HAVE_Z14
783 return Mask128<T, N>{a.raw ^ b.raw};
784#else
785 return Mask128<T, N>{vec_xor(a.raw, b.raw)};
786#endif
787}
788
789template <typename T, size_t N>
790HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
791 return Mask128<T, N>{vec_nor(a.raw, b.raw)};
792}
793
794// ------------------------------ ShiftLeftSame
795
796template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
798 const DFromV<decltype(v)> d;
799 const RebindToUnsigned<decltype(d)> du;
800 using TU = TFromD<decltype(du)>;
801
802#if HWY_S390X_HAVE_Z14
803 return BitCast(d,
804 VFromD<decltype(du)>{BitCast(du, v).raw
805 << Set(du, static_cast<TU>(bits)).raw});
806#else
807 // Do an unsigned vec_sl operation to avoid undefined behavior
808 return BitCast(
809 d, VFromD<decltype(du)>{
810 vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
811#endif
812}
813
814// ------------------------------ ShiftRightSame
815
816template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
818 using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
819#if HWY_S390X_HAVE_Z14
820 return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
821#else
822 return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
823#endif
824}
825
826template <typename T, size_t N, HWY_IF_SIGNED(T)>
827HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
828#if HWY_S390X_HAVE_Z14
829 using TI = typename detail::Raw128<T>::RawT;
830 return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
831#else
832 using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
833 return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
834#endif
835}
836
837// ------------------------------ ShiftLeft
838
839template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
841 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
842 return ShiftLeftSame(v, kBits);
843}
844
845// ------------------------------ ShiftRight
846
847template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
849 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
850 return ShiftRightSame(v, kBits);
851}
852
853// ------------------------------ BroadcastSignBit
854
855template <typename T, size_t N, HWY_IF_SIGNED(T)>
856HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
857 return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
858}
859
860// ================================================== SWIZZLE (1)
861
862// ------------------------------ TableLookupBytes
863template <typename T, size_t N, typename TI, size_t NI>
864HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
865 Vec128<TI, NI> from) {
866 const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
867 return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
868 vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
869}
870
871// ------------------------------ TableLookupBytesOr0
872// For all vector widths; Altivec/VSX needs zero out
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  const DFromV<VI> di;
  Repartition<int8_t, decltype(di)> di8;
  const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
  return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
}

881// ------------------------------ Reverse
882template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
883HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
884 return Vec128<T>{vec_reve(v.raw)};
885}
886
887// ------------------------------ Shuffles (Reverse)
888
889// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
890// Shuffle0321 rotates one lane to the right (the previous least-significant
891// lane is now most-significant). These could also be implemented via
892// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
893
894// Swap 32-bit halves in 64-bit halves.
895template <typename T, size_t N>
896HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
897 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
898 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
899 const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3,
900 12, 13, 14, 15, 8, 9, 10, 11};
901 return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
902}
903
904// These are used by generic_ops-inl to implement LoadInterleaved3. As with
905// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
906// comes from the first argument.
namespace detail {

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0,  1,  2,  3,
                                           28, 29, 30, 31, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {0, 3, 18, 17};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  12, 13, 14, 15,
                                           24, 25, 26, 27, 20, 21, 22, 23};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {2, 1, 16, 19};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {8,  9,  10, 11, 4,  5,  6,  7,
                                           16, 17, 18, 19, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

}  // namespace detail

962// Swap 64-bit halves
963template <class T, HWY_IF_T_SIZE(T, 4)>
965 const Full128<T> d;
966 const Full128<uint64_t> du64;
967 return BitCast(d, Reverse(du64, BitCast(du64, v)));
968}
969template <class T, HWY_IF_T_SIZE(T, 8)>
973
974// Rotate right 32 bits
975template <class T, HWY_IF_T_SIZE(T, 4)>
977#if HWY_IS_LITTLE_ENDIAN
978 return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
979#else
980 return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
981#endif
982}
983// Rotate left 32 bits
984template <class T, HWY_IF_T_SIZE(T, 4)>
986#if HWY_IS_LITTLE_ENDIAN
987 return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
988#else
989 return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
990#endif
991}
992
993template <class T, HWY_IF_T_SIZE(T, 4)>
997
998// ================================================== COMPARE
999
1000// Comparisons fill a lane with 1-bits if the condition is true, else 0.
1001
1002template <class DTo, typename TFrom, size_t NFrom>
1003HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
1004 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
1005 return MFromD<DTo>{m.raw};
1006}
1007
1008template <typename T, size_t N>
1009HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1010 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1011 return (v & bit) == bit;
1012}
1013
1014// ------------------------------ Equality
1015
1016template <typename T, size_t N>
1017HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
1018 return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
1019}
1020
1021// ------------------------------ Inequality
1022
1023// This cannot have T as a template argument, otherwise it is not more
1024// specialized than rewritten operator== in C++20, leading to compile
1025// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
1026template <size_t N>
1029#if HWY_PPC_HAVE_9
1030 return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
1031#else
1032 return Not(a == b);
1033#endif
1034}
1035template <size_t N>
1038#if HWY_PPC_HAVE_9
1039 return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
1040#else
1041 return Not(a == b);
1042#endif
1043}
1044template <size_t N>
1047#if HWY_PPC_HAVE_9
1048 return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
1049#else
1050 return Not(a == b);
1051#endif
1052}
1053template <size_t N>
1058template <size_t N>
1061#if HWY_PPC_HAVE_9
1062 return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
1063#else
1064 return Not(a == b);
1065#endif
1066}
1067template <size_t N>
1070#if HWY_PPC_HAVE_9
1071 return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
1072#else
1073 return Not(a == b);
1074#endif
1075}
1076template <size_t N>
1079#if HWY_PPC_HAVE_9
1080 return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
1081#else
1082 return Not(a == b);
1083#endif
1084}
1085template <size_t N>
1090
1091template <size_t N>
1095
1096template <size_t N>
1101
1102// ------------------------------ Strict inequality
1103
1104template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1108
1109// ------------------------------ Weak inequality
1110
1111template <typename T, size_t N, HWY_IF_FLOAT(T)>
1113 return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
1114}
1115
1116template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1117HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
1118 return Not(b > a);
1119}
1120
1121// ------------------------------ Reversed comparisons
1122
1123template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1127
1128template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1132
1133// ================================================== MEMORY (2)
1134
1135// ------------------------------ Load
1136template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1137HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {
1138 using LoadRaw = typename detail::Raw128<T>::UnalignedRawVec;
1139 const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
1140 using ResultRaw = typename detail::Raw128<T>::type;
1141 return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};
1142}
1143
1144// For < 128 bit, LoadU == Load.
1145template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
1147 return Load(d, p);
1148}
1149
1150// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1151template <class D, typename T = TFromD<D>>
1153 return LoadU(d, p);
1154}
1155
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

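// Loads the first HWY_MIN(max_lanes_to_load, Lanes(d)) lanes from p; any
// remaining lanes of the result are zero.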
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
                        size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
    return Zero(d);
  }

  if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
    return LoadU(d, p);
  }
#endif

  const size_t num_of_bytes_to_load =
      HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  return (num_of_bytes_to_load > 0)
             ? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
                              const_cast<unsigned char*>(
                                  reinterpret_cast<const unsigned char*>(p)),
                              static_cast<unsigned>(num_of_bytes_to_load - 1))})
             : Zero(d);
#else
  return BitCast(
      d,
      VFromD<decltype(du8)>{vec_xl_len(
          const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
          num_of_bytes_to_load)});
#endif
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
                          size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
    return no;
  }

  if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
    return LoadU(d, p);
  }
#endif

  return IfThenElse(FirstN(d, max_lanes_to_load),
                    LoadN(d, p, max_lanes_to_load), no);
}

#endif  // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14

1216// Returns a vector with lane i=[0, N) set to "first" + i.
1217namespace detail {
1218
1219template <class D, HWY_IF_T_SIZE_D(D, 1)>
1221 constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
1222 8, 9, 10, 11, 12, 13, 14, 15};
1223 return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
1224}
1225
1226template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
1228 constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
1229 return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
1230}
1231
1232template <class D, HWY_IF_UI32_D(D)>
1234 constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};
1235 return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
1236}
1237
1238template <class D, HWY_IF_UI64_D(D)>
1240 constexpr __vector unsigned long long kU64Iota0 = {0, 1};
1241 return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
1242}
1243
1244template <class D, HWY_IF_F32_D(D)>
1245HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1246 constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
1247 return VFromD<D>{kF32Iota0};
1248}
1249
1250template <class D, HWY_IF_F64_D(D)>
1251HWY_INLINE VFromD<D> Iota0(D /*d*/) {
1252 constexpr __vector double kF64Iota0 = {0.0, 1.0};
1253 return VFromD<D>{kF64Iota0};
1254}
1255
1256} // namespace detail
1257
1258template <class D, typename T2>
1259HWY_API VFromD<D> Iota(D d, const T2 first) {
1260 return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
1261}
1262
1263// ------------------------------ FirstN (Iota, Lt)
1264
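// Returns a mask in which the first num lanes are true and all later lanes
// are false, via a lane-wise compare of Iota against num.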
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return RebindMask(d, Iota(du, 0) < Set(du, static_cast<TU>(num)));
}

// ------------------------------ MaskedLoad
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// ------------------------------ MaskedLoadOr
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const T* HWY_RESTRICT p) {
  return IfThenElse(m, LoadU(d, p), v);
}

1285// ------------------------------ Store
1286
1287template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1288HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1289// Suppress the ignoring attributes warning that is generated by
1290// HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
1291#if HWY_COMPILER_GCC
1292 HWY_DIAGNOSTICS(push)
1293 HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
1294#endif
1295
1296 using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
1297 *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
1298
1299#if HWY_COMPILER_GCC
1300 HWY_DIAGNOSTICS(pop)
1301#endif
1302}
1303
1304template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1305HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
1306 using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec;
1307 *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);
1308}
1309
1310template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
1312 using BitsT = UnsignedFromSize<d.MaxBytes()>;
1313
1314 const Repartition<BitsT, decltype(d)> d_bits;
1315 const BitsT bits = GetLane(BitCast(d_bits, v));
1316 CopyBytes<d.MaxBytes()>(&bits, p);
1317}
1318
1319// For < 128 bit, StoreU == Store.
1320template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
1322 Store(v, d, p);
1323}
1324
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

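// Stores the first HWY_MIN(max_lanes_to_store, Lanes(d)) lanes of v to p;
// memory past that point is left untouched.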
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
    return;
  }

  if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_store >= HWY_MAX_LANES_D(D)) {
    StoreU(v, d, p);
    return;
  }
#endif

  const size_t num_of_bytes_to_store =
      HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  if (num_of_bytes_to_store > 0) {
    vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
                  static_cast<unsigned>(num_of_bytes_to_store - 1));
  }
#else
  vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
              num_of_bytes_to_store);
#endif
}
#endif

1363// ------------------------------ BlendedStore
1364
1365template <class D>
1366HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1367 TFromD<D>* HWY_RESTRICT p) {
1368 const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
1369 using TI = TFromD<decltype(di)>;
1370 alignas(16) TI buf[MaxLanes(d)];
1371 alignas(16) TI mask[MaxLanes(d)];
1372 Store(BitCast(di, v), di, buf);
1373 Store(BitCast(di, VecFromMask(d, m)), di, mask);
1374 for (size_t i = 0; i < MaxLanes(d); ++i) {
1375 if (mask[i]) {
1376 CopySameSize(buf + i, p + i);
1377 }
1378 }
1379}
1380
1381// ================================================== ARITHMETIC
1382
1383namespace detail {
1384// If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
1385// rebinds D to MakeUnsigned<TFromD<D>>.
1386
1387// Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
1388// detail::RebindToUnsignedIfNotFloat<D> is the same as D.
1389template <class D>
1391 hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
1393} // namespace detail
1394
1395// ------------------------------ Addition
1396
1397template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1399 const DFromV<decltype(a)> d;
1400 const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
1401
1402 // If T is an integer type, do an unsigned vec_add to avoid undefined behavior
1403#if HWY_S390X_HAVE_Z14
1404 return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
1405 BitCast(d_arith, b).raw});
1406#else
1407 return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
1408 BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
1409#endif
1410}
1411
1412// ------------------------------ Subtraction
1413
1414template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1416 const DFromV<decltype(a)> d;
1417 const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
1418
1419 // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
1420#if HWY_S390X_HAVE_Z14
1421 return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
1422 BitCast(d_arith, b).raw});
1423#else
1424 return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
1425 BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
1426#endif
1427}
1428
1429// ------------------------------ SumsOf8
1430template <class V, HWY_IF_U8(TFromV<V>)>
1434
1435template <class V, HWY_IF_I8(TFromV<V>)>
1437#if HWY_S390X_HAVE_Z14
1438 const DFromV<decltype(v)> di8;
1439 const RebindToUnsigned<decltype(di8)> du8;
1440 const RepartitionToWideX3<decltype(di8)> di64;
1441
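  // Xor with SignBit biases each i8 lane by +128 so it can be summed as u8;
  // each group of 8 lanes is then 8 * 128 = 1024 too large, which the
  // Set(di64, -1024) below cancels.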
  return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
         Set(di64, int64_t{-1024});
#else
  return SumsOf2(SumsOf4(v));
#endif
}

1449// ------------------------------ SaturatedAdd
1450
1451// Returns a + b clamped to the destination range.
1452
1453#if HWY_S390X_HAVE_Z14
1454// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
1455// other integer SIMD instruction sets
1456
1457template <typename T, size_t N, HWY_IF_UNSIGNED(T),
1458 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1459HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
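  // For unsigned T, Not(a) equals LimitsMax<T>() - a, so clamping b to Not(a)
  // ensures the sum cannot wrap around.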
  return Add(a, Min(b, Not(a)));
}

template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
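  // Signed overflow occurred iff a and b have the same sign but the wrapped
  // sum does not; AndNot(Xor(a, b), Xor(a, sum)) has its sign bit set exactly
  // in that case.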
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

#else  // VSX

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
}
#endif  // HWY_S390X_HAVE_Z14

#if HWY_PPC_HAVE_10

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

#endif  // HWY_PPC_HAVE_10

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

#if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 do not have I8/U8/I16/U16 SaturatedSub instructions, unlike most
// other integer SIMD instruction sets

template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
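  // Clamping the subtrahend to Min(a, b) ensures the unsigned difference
  // cannot wrap below zero.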
  return Sub(a, Min(a, b));
}

template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}

#else  // VSX

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_subs(a.raw, b.raw)};
}
#endif  // HWY_S390X_HAVE_Z14

#if HWY_PPC_HAVE_10

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}

#endif  // HWY_PPC_HAVE_10

// ------------------------------ AverageRound

// Returns (a + b + 1) / 2

template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, 0x6)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_avg(a.raw, b.raw)};
}

1573// ------------------------------ Multiplication
1574
1575// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
1576#ifdef HWY_NATIVE_MUL_8
1577#undef HWY_NATIVE_MUL_8
1578#else
1579#define HWY_NATIVE_MUL_8
1580#endif
1581#ifdef HWY_NATIVE_MUL_64
1582#undef HWY_NATIVE_MUL_64
1583#else
1584#define HWY_NATIVE_MUL_64
1585#endif
1586
1587template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1589 const DFromV<decltype(a)> d;
1590 const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
1591
1592 // If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
1593#if HWY_S390X_HAVE_Z14
1594 return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
1595 BitCast(d_arith, b).raw});
1596#else
1597 return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
1598 BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
1599#endif
1600}
1601
1602// Returns the upper sizeof(T)*8 bits of a * b in each lane.
1603
1604#if HWY_S390X_HAVE_Z14
1605#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
1606 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
1607#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
1608 hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
1609#elif HWY_PPC_HAVE_10
1610#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
1611 HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
1612#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
1613 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
1614#else
1615#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
1616 hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
1617#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
1618 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
1619#endif
1620
1621#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
1622template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
1624HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
1625 return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
1626}
1627#endif
1628
1629template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
1632 const auto p_even = MulEven(a, b);
1633
1634#if HWY_IS_LITTLE_ENDIAN
1635 const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
1636 return Vec128<T, 1>{
1637 vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
1638#else
1639 const DFromV<decltype(a)> d;
1640 return ResizeBitCast(d, p_even);
1641#endif
1642}
1643
1644template <typename T, size_t N,
1648 const DFromV<decltype(a)> d;
1649
1650 const auto p_even = BitCast(d, MulEven(a, b));
1651 const auto p_odd = BitCast(d, MulOdd(a, b));
1652
1653#if HWY_IS_LITTLE_ENDIAN
1654 return InterleaveOdd(d, p_even, p_odd);
1655#else
1656 return InterleaveEven(d, p_even, p_odd);
1657#endif
1658}
1659
1660#if !HWY_PPC_HAVE_10
1661template <class T, HWY_IF_UI64(T)>
1662HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
1663 T p_hi;
1664 Mul128(GetLane(a), GetLane(b), &p_hi);
1665 return Set(Full64<T>(), p_hi);
1666}
1667
1668template <class T, HWY_IF_UI64(T)>
1669HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
1670 const DFromV<decltype(a)> d;
1671 const Half<decltype(d)> dh;
1672 return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
1673 MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
1674}
1675#endif // !HWY_PPC_HAVE_10
1676
1677#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
1678#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
1679
1680// Multiplies even lanes (0, 2, ..) and places the double-wide result into
1681// even and the upper half into its odd neighbor lane.
1682template <typename T, size_t N,
1683 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
1685HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
1686 Vec128<T, N> b) {
1687 return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};
1688}
1689
1690// Multiplies odd lanes (1, 3, ..) and places the double-wide result into
1691// even and the upper half into its odd neighbor lane.
1692template <typename T, size_t N,
1693 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
1695HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
1696 Vec128<T, N> b) {
1697 return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
1698}
1699
1700// ------------------------------ Rol/Ror
1701
1702#ifdef HWY_NATIVE_ROL_ROR_8
1703#undef HWY_NATIVE_ROL_ROR_8
1704#else
1705#define HWY_NATIVE_ROL_ROR_8
1706#endif
1707
1708#ifdef HWY_NATIVE_ROL_ROR_16
1709#undef HWY_NATIVE_ROL_ROR_16
1710#else
1711#define HWY_NATIVE_ROL_ROR_16
1712#endif
1713
1714#ifdef HWY_NATIVE_ROL_ROR_32_64
1715#undef HWY_NATIVE_ROL_ROR_32_64
1716#else
1717#define HWY_NATIVE_ROL_ROR_32_64
1718#endif
1719
1720template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1722 const DFromV<decltype(a)> d;
1723 const RebindToUnsigned<decltype(d)> du;
1724 return BitCast(
1725 d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
1726}
1727
1728template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1730 const DFromV<decltype(a)> d;
1731 const RebindToSigned<decltype(d)> di;
1732 return Rol(a, BitCast(d, Neg(BitCast(di, b))));
1733}
1734
1735// ------------------------------ RotateRight
1736template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1737HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
1738 const DFromV<decltype(v)> d;
1739 constexpr size_t kSizeInBits = sizeof(T) * 8;
1740 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1741
1742 return (kBits == 0)
1743 ? v
1744 : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
1745 kBits)));
1746}
1747
1748// ------------------------------ RotateLeftSame/RotateRightSame
1749#ifdef HWY_NATIVE_ROL_ROR_SAME_8
1750#undef HWY_NATIVE_ROL_ROR_SAME_8
1751#else
1752#define HWY_NATIVE_ROL_ROR_SAME_8
1753#endif
1754
1755#ifdef HWY_NATIVE_ROL_ROR_SAME_16
1756#undef HWY_NATIVE_ROL_ROR_SAME_16
1757#else
1758#define HWY_NATIVE_ROL_ROR_SAME_16
1759#endif
1760
1761#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
1762#undef HWY_NATIVE_ROL_ROR_SAME_32_64
1763#else
1764#define HWY_NATIVE_ROL_ROR_SAME_32_64
1765#endif
1766
1767template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1769 const DFromV<decltype(v)> d;
1770 return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
1771}
1772
1773template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1775 const DFromV<decltype(v)> d;
1776 return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
1777}
1778
1779// ------------------------------ IfNegativeThenElse
1780
1781template <typename T, size_t N>
1782HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1783 Vec128<T, N> no) {
1784 static_assert(IsSigned<T>(), "Only works for signed/float");
1785
1786 const DFromV<decltype(v)> d;
1787#if HWY_PPC_HAVE_10
1788 const RebindToUnsigned<decltype(d)> du;
1789 return BitCast(
1790 d, VFromD<decltype(du)>{vec_blendv(
1791 BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
1792#else
1793 const RebindToSigned<decltype(d)> di;
1794 return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
1795#endif
1796}
1797
1798#if HWY_PPC_HAVE_10
1799#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
1800#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
1801#else
1802#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
1803#endif
1804
1805#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
1806#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
1807#else
1808#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
1809#endif
1810
1811template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1812HWY_API V IfNegativeThenElseZero(V v, V yes) {
1813 const DFromV<decltype(v)> d;
1814 return IfNegativeThenElse(v, yes, Zero(d));
1815}
1816
1817template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1818HWY_API V IfNegativeThenZeroElse(V v, V no) {
1819 const DFromV<decltype(v)> d;
1820 return IfNegativeThenElse(v, Zero(d), no);
1821}
1822#endif
1823
1824// generic_ops takes care of integer T.
1825template <typename T, size_t N, HWY_IF_FLOAT(T)>
1826HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
1827 return Abs(a - b);
1828}
1829
1830// ------------------------------ Floating-point multiply-add variants
1831
1832// Returns mul * x + add
1833template <typename T, size_t N, HWY_IF_FLOAT(T)>
1834HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
1835 Vec128<T, N> add) {
1836 return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
1837}
1838
1839// Returns add - mul * x
1840template <typename T, size_t N, HWY_IF_FLOAT(T)>
1841HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
1842 Vec128<T, N> add) {
1843 // NOTE: the vec_nmsub operation below computes -(mul * x - add),
1844 // which is equivalent to add - mul * x in the round-to-nearest
1845 // and round-towards-zero rounding modes
1846 return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
1847}
1848
1849// Returns mul * x - sub
1850template <typename T, size_t N, HWY_IF_FLOAT(T)>
1851HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
1852 Vec128<T, N> sub) {
1853 return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
1854}
1855
1856// Returns -mul * x - sub
1857template <typename T, size_t N, HWY_IF_FLOAT(T)>
1858HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
1859 Vec128<T, N> sub) {
1860 // NOTE: The vec_nmadd operation below computes -(mul * x + sub),
1861 // which is equivalent to -mul * x - sub in the round-to-nearest
1862 // and round-towards-zero rounding modes
1863 return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
1864}
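// Taken together, with mul=2, x=3 and add/sub=1 per lane:
// MulAdd -> 2*3+1 = 7, NegMulAdd -> 1-2*3 = -5, MulSub -> 2*3-1 = 5, and
// NegMulSub -> -2*3-1 = -7; each maps to a single fused vec_* instruction.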
1865
1866// ------------------------------ Floating-point div
1867// Approximate reciprocal
1868
1869#ifdef HWY_NATIVE_F64_APPROX_RECIP
1870#undef HWY_NATIVE_F64_APPROX_RECIP
1871#else
1872#define HWY_NATIVE_F64_APPROX_RECIP
1873#endif
1874
1875template <typename T, size_t N, HWY_IF_FLOAT(T)>
1876HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
1877#if HWY_S390X_HAVE_Z14
1878 return Vec128<T, N>{a.raw / b.raw};
1879#else
1880 return Vec128<T, N>{vec_div(a.raw, b.raw)};
1881#endif
1882}
1883
1884template <typename T, size_t N, HWY_IF_FLOAT(T)>
1885HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
1886#if HWY_S390X_HAVE_Z14
1887 const DFromV<decltype(v)> d;
1888 return Set(d, T(1.0)) / v;
1889#else
1890 return Vec128<T, N>{vec_re(v.raw)};
1891#endif
1892}
1893
1894// ------------------------------ Floating-point square root
1895
1896#if HWY_S390X_HAVE_Z14
1897// Approximate reciprocal square root
1898template <size_t N>
1899HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
1900 const DFromV<decltype(v)> d;
1901 const RebindToUnsigned<decltype(d)> du;
1902
1903 const auto half = v * Set(d, 0.5f);
1904 // Initial guess based on log2(f)
1905 const auto guess = BitCast(
1906 d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
1907 // One Newton-Raphson iteration
1908 return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
1909}
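// The constant 0x5F3759DF above is the classic fast-inverse-square-root bit
// trick: subtracting half of the float's bit pattern roughly negates and
// halves the exponent, yielding an estimate of 1/sqrt(v) that the single
// Newton-Raphson step then refines. A scalar sketch of the same idea
// (illustrative only; the function name is hypothetical):
//   float RsqrtEstimate(float x) {
//     uint32_t bits = BitCastScalar<uint32_t>(x);
//     bits = 0x5F3759DFu - (bits >> 1);      // initial guess
//     const float y = BitCastScalar<float>(bits);
//     return y * (1.5f - 0.5f * x * y * y);  // one Newton-Raphson step
//   }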
1910#else // VSX
1911
1912#ifdef HWY_NATIVE_F64_APPROX_RSQRT
1913#undef HWY_NATIVE_F64_APPROX_RSQRT
1914#else
1915#define HWY_NATIVE_F64_APPROX_RSQRT
1916#endif
1917
1918// Approximate reciprocal square root
1919template <class T, size_t N, HWY_IF_FLOAT(T)>
1920HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
1921 return Vec128<T, N>{vec_rsqrte(v.raw)};
1922}
1923#endif // HWY_S390X_HAVE_Z14
1924
1925// Full precision square root
1926template <class T, size_t N, HWY_IF_FLOAT(T)>
1927HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
1928 return Vec128<T, N>{vec_sqrt(v.raw)};
1929}
1930
1931// ------------------------------ Min (Gt, IfThenElse)
1932
1933template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1934HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
1935 return Vec128<T, N>{vec_min(a.raw, b.raw)};
1936}
1937
1938// ------------------------------ Max (Gt, IfThenElse)
1939
1940template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1941HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
1942 return Vec128<T, N>{vec_max(a.raw, b.raw)};
1943}
1944
1945// ------------------------------- Integer AbsDiff for PPC9/PPC10
1946
1947#if HWY_PPC_HAVE_9
1948#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
1949#undef HWY_NATIVE_INTEGER_ABS_DIFF
1950#else
1951#define HWY_NATIVE_INTEGER_ABS_DIFF
1952#endif
1953
1954template <class V, HWY_IF_UNSIGNED_V(V),
1955 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
1956HWY_API V AbsDiff(const V a, const V b) {
1957 return V{vec_absd(a.raw, b.raw)};
1958}
1959
1960template <class V, HWY_IF_U64_D(DFromV<V>)>
1961HWY_API V AbsDiff(const V a, const V b) {
1962 return Sub(Max(a, b), Min(a, b));
1963}
1964
1965template <class V, HWY_IF_SIGNED_V(V)>
1966HWY_API V AbsDiff(const V a, const V b) {
1967 return Sub(Max(a, b), Min(a, b));
1968}
1969
1970#endif // HWY_PPC_HAVE_9
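// Example: for u8 lanes a={3, 10, ...} and b={10, 3, ...}, AbsDiff yields
// {7, 7, ...}; vec_absd computes this directly, while the Max-Min form used
// for u64 and signed lanes computes the same |a - b| (modulo 2^bits).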
1971
1972// ------------------------------ Integer Div for PPC10
1973#if HWY_PPC_HAVE_10
1974#ifdef HWY_NATIVE_INT_DIV
1975#undef HWY_NATIVE_INT_DIV
1976#else
1977#define HWY_NATIVE_INT_DIV
1978#endif
1979
1980template <size_t N>
1981HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
1982 Vec128<int32_t, N> b) {
1983 // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
1984 // undefined behavior if b[i] == 0 or
1985 // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
1986
1987 // Clang will also optimize out I32 vec_div on PPC10 if optimizations are
1988 // enabled and any of the lanes of b are known to be zero (even in the unused
1989 // lanes of a partial vector)
1990 __vector signed int raw_result;
1991 __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1992 return Vec128<int32_t, N>{raw_result};
1993}
1994
1995template <size_t N>
1996HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
1997 Vec128<uint32_t, N> b) {
1998 // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
1999 // undefined behavior if b[i] == 0
2000
2001 // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
2002 // enabled and any of the lanes of b are known to be zero (even in the unused
2003 // lanes of a partial vector)
2004 __vector unsigned int raw_result;
2005 __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2006 return Vec128<uint32_t, N>{raw_result};
2007}
2008
2009template <size_t N>
2010HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
2011 Vec128<int64_t, N> b) {
2012 // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
2013 // undefined behavior if b[i] == 0 or
2014 // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
2015
2016 // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
2017 // enabled and any of the lanes of b are known to be zero (even in the unused
2018 // lanes of a partial vector)
2019 __vector signed long long raw_result;
2020 __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2021 return Vec128<int64_t, N>{raw_result};
2022}
2023
2024template <size_t N>
2025HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
2026 Vec128<uint64_t, N> b) {
2027 // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
2028 // undefined behavior if b[i] == 0
2029
2030 // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
2031 // enabled and any of the lanes of b are known to be zero (even in the unused
2032 // lanes of a partial vector)
2033 __vector unsigned long long raw_result;
2034 __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2035 return Vec128<uint64_t, N>{raw_result};
2036}
2037
2038template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2039 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2040HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
2041 const DFromV<decltype(a)> d;
2042 const RepartitionToWide<decltype(d)> dw;
2043 return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
2044 PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
2045}
2046
2047template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2048 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
2049 HWY_IF_V_SIZE_LE(T, N, 8)>
2050HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
2051 const DFromV<decltype(a)> d;
2052 const Rebind<MakeWide<T>, decltype(d)> dw;
2053 return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
2054}
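// Hence an i16/u16 (or i8/u8) division reuses the 32/64-bit hardware divide
// by widening, dividing and narrowing again. Hypothetical usage sketch:
//   const Full128<int16_t> d;
//   const auto q = Set(d, int16_t{100}) / Set(d, int16_t{7});  // lanes = 14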
2055
2056template <size_t N>
2057HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
2058 Vec128<int32_t, N> b) {
2059 // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
2060 // undefined behavior if b[i] == 0 or
2061 // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
2062
2063 // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
2064 // enabled and any of the lanes of b are known to be zero (even in the unused
2065 // lanes of a partial vector)
2066 __vector signed int raw_result;
2067 __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2068 return Vec128<int32_t, N>{raw_result};
2069}
2070
2071template <size_t N>
2072HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
2073 Vec128<uint32_t, N> b) {
2074 // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
2075 // undefined behavior if b[i] == 0
2076
2077 // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
2078 // enabled and any of the lanes of b are known to be zero (even in the unused
2079 // lanes of a partial vector)
2080 __vector unsigned int raw_result;
2081 __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2082 return Vec128<uint32_t, N>{raw_result};
2083}
2084
2085template <size_t N>
2086HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
2087 Vec128<int64_t, N> b) {
2088 // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
2089 // undefined behavior if b[i] == 0 or
2090 // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
2091
2092 // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
2093 // enabled and any of the lanes of b are known to be zero (even in the unused
2094 // lanes of a partial vector)
2095 __vector signed long long raw_result;
2096 __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2097 return Vec128<int64_t, N>{raw_result};
2098}
2099
2100template <size_t N>
2101HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
2102 Vec128<uint64_t, N> b) {
2103 // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
2104 // undefined behavior if b[i] == 0
2105
2106 // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
2107 // enabled and any of the lanes of b are known to be zero (even in the unused
2108 // lanes of a partial vector)
2109 __vector unsigned long long raw_result;
2110 __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2111 return Vec128<uint64_t, N>{raw_result};
2112}
2113
2114template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2115 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2116HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
2117 const DFromV<decltype(a)> d;
2118 const RepartitionToWide<decltype(d)> dw;
2119 return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
2120 PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
2121}
2122
2123template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2124 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
2125 HWY_IF_V_SIZE_LE(T, N, 8)>
2126HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
2127 const DFromV<decltype(a)> d;
2128 const Rebind<MakeWide<T>, decltype(d)> dw;
2129 return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
2130}
2131#endif
2132
2133// ================================================== MEMORY (3)
2134
2135// ------------------------------ Non-temporal stores
2136
2137template <class D>
2138HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
2139 __builtin_prefetch(aligned, 1, 0);
2140 Store(v, d, aligned);
2141}
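// Note: __builtin_prefetch(aligned, 1, 0) hints "written soon, low temporal
// locality", approximating a non-temporal store; VSX has no true streaming
// vector store, so Stream is an ordinary Store plus this hint.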
2142
2143// ------------------------------ Scatter in generic_ops-inl.h
2144// ------------------------------ Gather in generic_ops-inl.h
2145
2146// ================================================== SWIZZLE (2)
2147
2148// ------------------------------ LowerHalf
2149
2150// Returns upper/lower half of a vector.
2151template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2152HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
2153 return VFromD<D>{v.raw};
2154}
2155template <typename T, size_t N>
2156HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
2157 return Vec128<T, N / 2>{v.raw};
2158}
2159
2160// ------------------------------ ShiftLeftBytes
2161
2162// NOTE: The ShiftLeftBytes operation moves the elements of v to the right
2163// by kBytes bytes and zeroes out the first kBytes bytes of v on both
2164// little-endian and big-endian PPC targets
2165// (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both
2166// little-endian and big-endian targets)
2167
2168template <int kBytes, class D>
2169HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
2170 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2171 if (kBytes == 0) return v;
2172 const auto zeros = Zero(d);
2173#if HWY_IS_LITTLE_ENDIAN
2174 return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
2175#else
2176 return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
2177#endif
2178}
2179
2180template <int kBytes, typename T, size_t N>
2181HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2182 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
2183}
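// Example: with u8 lanes {0, 1, 2, ..., 15}, ShiftLeftBytes<2> yields
// {0, 0, 0, 1, ..., 13}: every lane moves up by two byte positions and the
// vacated low bytes become zero, regardless of target endianness.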
2184
2185// ------------------------------ ShiftLeftLanes
2186
2187// NOTE: The ShiftLeftLanes operation moves the elements of v to the right
2188// by kLanes lanes and zeroes out the first kLanes lanes of v on both
2189// little-endian and big-endian PPC targets
2190// (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both
2191// little-endian and big-endian targets)
2192
2193template <int kLanes, class D, typename T = TFromD<D>>
2194HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
2195 const Repartition<uint8_t, decltype(d)> d8;
2196 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2197}
2198
2199template <int kLanes, typename T, size_t N>
2200HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
2201 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2202}
2203
2204// ------------------------------ ShiftRightBytes
2205
2206// NOTE: The ShiftRightBytes operation moves the elements of v to the left
2207// by kBytes bytes and zeroes out the last kBytes bytes of v on both
2208// little-endian and big-endian PPC targets
2209// (same behavior as the HWY_EMU128 ShiftRightBytes operation on both
2210// little-endian and big-endian targets)
2211
2212template <int kBytes, class D>
2213HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
2214 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2215 if (kBytes == 0) return v;
2216
2217 // For partial vectors, clear upper lanes so we shift in zeros.
2218 if (d.MaxBytes() != 16) {
2219 const Full128<TFromD<D>> dfull;
2220 VFromD<decltype(dfull)> vfull{v.raw};
2221 v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
2222 }
2223
2224 const auto zeros = Zero(d);
2225#if HWY_IS_LITTLE_ENDIAN
2226 return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
2227#else
2228 return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
2229#endif
2230}
2231
2232// ------------------------------ ShiftRightLanes
2233
2234// NOTE: The ShiftRightLanes operation moves the elements of v to the left
2235// by kLanes lanes and zeroes out the last kLanes lanes of v on both
2236// little-endian and big-endian PPC targets
2237// (same behavior as the HWY_EMU128 ShiftRightLanes operation on both
2238// little-endian and big-endian targets)
2239
2240template <int kLanes, class D>
2241HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
2242 const Repartition<uint8_t, decltype(d)> d8;
2243 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2244 return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
2245}
2246
2247// ------------------------------ UpperHalf (ShiftRightBytes)
2248
2249template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2250HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
2251 return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
2252}
2253
2254// ------------------------------ ExtractLane
2255template <typename T, size_t N>
2256HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
2257 return static_cast<T>(v.raw[i]);
2258}
2259
2260// ------------------------------ InsertLane
2261template <typename T, size_t N>
2262HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
2263#if HWY_IS_LITTLE_ENDIAN
2264 typename detail::Raw128<T>::type raw_result = v.raw;
2265 raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
2266 return Vec128<T, N>{raw_result};
2267#else
2268 // On ppc64be without this, mul_test fails, but swizzle_test passes.
2269 DFromV<decltype(v)> d;
2270 alignas(16) T lanes[16 / sizeof(T)];
2271 Store(v, d, lanes);
2272 lanes[i] = t;
2273 return Load(d, lanes);
2274#endif
2275}
2276
2277// ------------------------------ CombineShiftRightBytes
2278
2279// NOTE: The CombineShiftRightBytes operation below moves the elements of lo to
2280// the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes()
2281// - kBytes) bytes on both little-endian and big-endian PPC targets.
2282
2283template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
2284HWY_API Vec128<T> CombineShiftRightBytes(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
2285 constexpr size_t kSize = 16;
2286 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2287#if HWY_IS_LITTLE_ENDIAN
2288 return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)};
2289#else
2290 return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)};
2291#endif
2292}
2293
2294template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2295HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
2296 constexpr size_t kSize = d.MaxBytes();
2297 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2298 const Repartition<uint8_t, decltype(d)> d8;
2299 using V8 = Vec128<uint8_t>;
2300 const DFromV<V8> dfull8;
2301 const Repartition<TFromD<D>, decltype(dfull8)> dfull;
2302 const V8 hi8{BitCast(d8, hi).raw};
2303 // Move into most-significant bytes
2304 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
2305 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
2306 return VFromD<D>{BitCast(dfull, r).raw};
2307}
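// Example: on a full u8 vector, CombineShiftRightBytes<4>(d, hi, lo) returns
// {lo[4..15], hi[0..3]}, i.e. the 16-byte window starting 4 bytes into the
// 32-byte concatenation hi:lo.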
2308
2309// ------------------------------ Broadcast/splat any lane
2310
2311template <int kLane, typename T, size_t N>
2312HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
2313 static_assert(0 <= kLane && kLane < N, "Invalid lane");
2314 return Vec128<T, N>{vec_splat(v.raw, kLane)};
2315}
2316
2317// ------------------------------ TableLookupLanes (Shuffle01)
2318
2319// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2320template <typename T, size_t N = 16 / sizeof(T)>
2321struct Indices128 {
2322 __vector unsigned char raw;
2323};
2324
2325namespace detail {
2326
2327template <class D, HWY_IF_T_SIZE_D(D, 1)>
2328HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2329 D d) {
2330 const Repartition<uint8_t, decltype(d)> d8;
2331 return Iota(d8, 0);
2332}
2333
2334template <class D, HWY_IF_T_SIZE_D(D, 2)>
2335HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2336 D d) {
2337 const Repartition<uint8_t, decltype(d)> d8;
2338#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2339 constexpr __vector unsigned char kBroadcastLaneBytes = {
2340 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
2341#else
2342 constexpr __vector unsigned char kBroadcastLaneBytes = {
2343 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
2344#endif
2345 return VFromD<decltype(d8)>{kBroadcastLaneBytes};
2346}
2347
2348template <class D, HWY_IF_T_SIZE_D(D, 4)>
2349HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2350 D d) {
2351 const Repartition<uint8_t, decltype(d)> d8;
2352#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2353 constexpr __vector unsigned char kBroadcastLaneBytes = {
2354 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2355#else
2356 constexpr __vector unsigned char kBroadcastLaneBytes = {
2357 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
2358#endif
2359 return VFromD<decltype(d8)>{kBroadcastLaneBytes};
2360}
2361
2362template <class D, HWY_IF_T_SIZE_D(D, 8)>
2363HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
2364 D d) {
2365 const Repartition<uint8_t, decltype(d)> d8;
2366#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
2367 constexpr __vector unsigned char kBroadcastLaneBytes = {
2368 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2369#else
2370 constexpr __vector unsigned char kBroadcastLaneBytes = {
2371 7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
2372#endif
2373 return VFromD<decltype(d8)>{kBroadcastLaneBytes};
2374}
2375
2376template <class D, HWY_IF_T_SIZE_D(D, 1)>
2377HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2378 const Repartition<uint8_t, decltype(d)> d8;
2379 return Zero(d8);
2380}
2381
2382template <class D, HWY_IF_T_SIZE_D(D, 2)>
2383HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2384 const Repartition<uint8_t, decltype(d)> d8;
2385 constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
2386 0, 1, 0, 1, 0, 1, 0, 1};
2387 return VFromD<decltype(d8)>{kByteOffsets};
2388}
2389
2390template <class D, HWY_IF_T_SIZE_D(D, 4)>
2391HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2392 const Repartition<uint8_t, decltype(d)> d8;
2393 constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
2394 0, 1, 2, 3, 0, 1, 2, 3};
2395 return VFromD<decltype(d8)>{kByteOffsets};
2396}
2397
2398template <class D, HWY_IF_T_SIZE_D(D, 8)>
2399HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
2400 const Repartition<uint8_t, decltype(d)> d8;
2401 constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
2402 0, 1, 2, 3, 4, 5, 6, 7};
2403 return VFromD<decltype(d8)>{kByteOffsets};
2404}
2405
2406} // namespace detail
2407
2408template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
2409HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
2410 D d, Vec128<TI, MaxLanes(D())> vec) {
2411 using T = TFromD<D>;
2412 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2413#if HWY_IS_DEBUG_BUILD
2414 const RebindToUnsigned<decltype(d)> du;
2415 using TU = TFromD<decltype(du)>;
2416 HWY_DASSERT(AllTrue(
2417 du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
2418#endif
2419
2420 const Repartition<uint8_t, decltype(d)> d8;
2421 return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw};
2422}
2423
2424template <class D, typename TI,
2425 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
2426HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
2427 D d, Vec128<TI, MaxLanes(D())> vec) {
2428 using T = TFromD<D>;
2429 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2430#if HWY_IS_DEBUG_BUILD
2431 const RebindToUnsigned<decltype(d)> du;
2432 using TU = TFromD<decltype(du)>;
2433 HWY_DASSERT(AllTrue(
2434 du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
2435#endif
2436
2437 const Repartition<uint8_t, decltype(d)> d8;
2438 using V8 = VFromD<decltype(d8)>;
2439
2440 // Broadcast each lane index to all bytes of T and shift to bytes
2441 const V8 lane_indices = TableLookupBytes(
2442      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
2443 constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
2444 const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
2445 const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
2446 return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
2447}
2448
2449template <class D, typename TI>
2450HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
2451 D d, const TI* idx) {
2452 const Rebind<TI, decltype(d)> di;
2453 return IndicesFromVec(d, LoadU(di, idx));
2454}
2455
2456template <typename T, size_t N>
2457HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2458 const DFromV<decltype(v)> d;
2459 const Repartition<uint8_t, decltype(d)> d8;
2460 return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw}));
2461}
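// Usage sketch (hypothetical index values): reversing four u32 lanes with a
// runtime table. IndicesFromVec expands each lane index into the byte
// shuffle that vec_perm expects.
//   const Full128<uint32_t> d;
//   const int32_t kRev[4] = {3, 2, 1, 0};
//   const auto idx = SetTableIndices(d, kRev);
//   const auto reversed = TableLookupLanes(Iota(d, 0), idx);  // {3,2,1,0}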
2462
2463// Single lane: no change
2464template <typename T>
2465HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
2466 Indices128<T, 1> /* idx */) {
2467 return v;
2468}
2469
2470template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
2471HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
2472 Indices128<T, N> idx) {
2473 const DFromV<decltype(a)> d;
2474 const Twice<decltype(d)> dt;
2475 const Repartition<uint8_t, decltype(dt)> dt_u8;
2476// TableLookupLanes currently requires table and index vectors to be the same
2477// size, though a half-length index vector would be sufficient here.
2478#if HWY_IS_MSAN
2479 const Vec128<T, N> idx_vec{idx.raw};
2480 const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
2481#else
2482 // We only keep LowerHalf of the result, which is valid in idx.
2483 const Indices128<T, N * 2> idx2{idx.raw};
2484#endif
2485 return LowerHalf(
2486 d, TableLookupBytes(Combine(dt, b, a),
2487 BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
2488}
2489
2490template <typename T>
2491HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
2492 Indices128<T> idx) {
2493 return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)};
2494}
2495
2496// ------------------------------ ReverseBlocks
2497
2498// Single block: no change
2499template <class D>
2500HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
2501 return v;
2502}
2503
2504// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
2505
2506// Single lane: no change
2507template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
2508HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
2509 return v;
2510}
2511
2512// 32-bit x2: shuffle
2513template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
2514HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
2515 return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
2516}
2517
2518// 16-bit x4: shuffle
2519template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
2520HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
2521 const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
2522 14, 15, 12, 13, 10, 11, 8, 9};
2523 return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};
2524}
2525
2526// 16-bit x2: rotate bytes
2527template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
2528HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
2529 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2530 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
2531}
2532
2533// ------------------------------- ReverseLaneBytes
2534
2535#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
2536 (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
2537
2538// Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
2539#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
2540#undef HWY_NATIVE_REVERSE_LANE_BYTES
2541#else
2542#define HWY_NATIVE_REVERSE_LANE_BYTES
2543#endif
2544
2545template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
2546 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
2547HWY_API V ReverseLaneBytes(V v) {
2548 return V{vec_revb(v.raw)};
2549}
2550
2551// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
2552#ifdef HWY_NATIVE_REVERSE2_8
2553#undef HWY_NATIVE_REVERSE2_8
2554#else
2555#define HWY_NATIVE_REVERSE2_8
2556#endif
2557
2558template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2559HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
2560 const Repartition<uint16_t, decltype(d)> du16;
2561 return BitCast(d, ReverseLaneBytes(BitCast(du16, v)));
2562}
2563
2564template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2565HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
2566 const Repartition<uint32_t, decltype(d)> du32;
2567 return BitCast(d, ReverseLaneBytes(BitCast(du32, v)));
2568}
2569
2570template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2571HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
2572 const Repartition<uint64_t, decltype(d)> du64;
2573 return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
2574}
2575
2576#endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
2577
2578template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2579HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
2580 return Reverse2(d, v);
2581}
2582
2583template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2584HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
2585 return Reverse4(d, v);
2586}
2587
2588template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2589HWY_API Vec64<T> Reverse(D d, Vec64<T> v) {
2590 return Reverse8(d, v);
2591}
2592
2593// ------------------------------ Reverse2
2594
2595// Single lane: no change
2596template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
2597HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
2598 return v;
2599}
2600
2601template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
2602HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
2603 const Repartition<uint32_t, decltype(d)> du32;
2604 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2605}
2606
2607template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
2608HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
2609 const Repartition<uint64_t, decltype(d)> du64;
2610 return BitCast(d, RotateRight<32>(BitCast(du64, v)));
2611}
2612
2613template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
2614HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
2615 return Shuffle01(v);
2616}
2617
2618// ------------------------------ Reverse4
2619
2620template <class D, HWY_IF_T_SIZE_D(D, 2)>
2621HWY_API VFromD<D> Reverse4(D /*d*/, VFromD<D> v) {
2622 const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
2623 14, 15, 12, 13, 10, 11, 8, 9};
2624 return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)};
2625}
2626
2627template <class D, HWY_IF_T_SIZE_D(D, 4)>
2628HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
2629 return Reverse(d, v);
2630}
2631
2632template <class D, HWY_IF_T_SIZE_D(D, 8)>
2633HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
2634 HWY_ASSERT(0); // don't have 4 u64 lanes
2635}
2636
2637// ------------------------------ Reverse8
2638
2639template <class D, HWY_IF_T_SIZE_D(D, 2)>
2640HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
2641 return Reverse(d, v);
2642}
2643
2644template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
2645HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
2646 HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit
2647}
2648
2649// ------------------------------ InterleaveLower
2650
2651// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2652// the least-significant lane) and "b". To concatenate two half-width integers
2653// into one, use ZipLower/Upper instead (also works with scalar).
2654
2655template <typename T, size_t N>
2656HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
2657 return Vec128<T, N>{vec_mergeh(a.raw, b.raw)};
2658}
2659
2660// Additional overload for the optional tag
2661template <class D>
2662HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
2663 return InterleaveLower(a, b);
2664}
2665
2666// ------------------------------ InterleaveUpper (UpperHalf)
2667
2668// Full
2669template <class D, typename T = TFromD<D>>
2670HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) {
2671 return Vec128<T>{vec_mergel(a.raw, b.raw)};
2672}
2673
2674// Partial
2675template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2676HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
2677 const Half<decltype(d)> d2;
2678 return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
2679 VFromD<D>{UpperHalf(d2, b).raw});
2680}
2681
2682// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2683
2684// Same as Interleave*, except that the return lanes are double-width integers;
2685// this is necessary because the single-lane scalar cannot return two values.
2686template <class V, class DW = RepartitionToWide<DFromV<V>>>
2687HWY_API VFromD<DW> ZipLower(V a, V b) {
2688 return BitCast(DW(), InterleaveLower(a, b));
2689}
2690template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2691HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2692 return BitCast(dw, InterleaveLower(D(), a, b));
2693}
2694
2695template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2696HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2697 return BitCast(dw, InterleaveUpper(D(), a, b));
2698}
2699
2700// ------------------------------ Per4LaneBlkShufDupSet4xU32
2701
2702// Used by hwy/ops/generic_ops-inl.h to implement Per4LaneBlockShuffle
2703namespace detail {
2704
2705#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
2706#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
2707#else
2708#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
2709#endif
2710
2711template <class D>
2712HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
2713 const uint32_t x2,
2714 const uint32_t x1,
2715 const uint32_t x0) {
2716 const __vector unsigned int raw = {x0, x1, x2, x3};
2717 return ResizeBitCast(d, Vec128<uint32_t>{raw});
2718}
2719
2720} // namespace detail
2721
2722// ------------------------------ SlideUpLanes
2723
2724template <class D>
2725HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
2726 const Repartition<uint8_t, decltype(d)> du8;
2727 using VU8 = VFromD<decltype(du8)>;
2728 const auto v_shift_amt =
2729 BitCast(Full128<uint8_t>(),
2730 Set(Full128<uint32_t>(),
2731 static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
2732
2733#if HWY_S390X_HAVE_Z14
2734 return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
2735#else // VSX
2736#if HWY_IS_LITTLE_ENDIAN
2737 return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
2738#else
2739 return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
2740#endif // HWY_IS_LITTLE_ENDIAN
2741#endif // HWY_S390X_HAVE_Z14
2742}
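// Example: SlideUpLanes(d, v, 1) on u32 lanes {a, b, c, d} yields
// {0, a, b, c}. The lane count is converted to a bit count and applied as a
// whole-vector octet shift; vec_slo vs vec_sro is chosen so lanes always move
// toward higher indices despite the differing lane order on LE/BE.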
2743
2744// ------------------------------ SlideDownLanes
2745
2746template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2747HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
2748 using TU = UnsignedFromSize<d.MaxBytes()>;
2749 const Repartition<TU, decltype(d)> du;
2750 const auto v_shift_amt =
2751 Set(du, static_cast<TU>(amt * sizeof(TFromD<D>) * 8));
2752
2753#if HWY_IS_LITTLE_ENDIAN
2754 return BitCast(d, BitCast(du, v) >> v_shift_amt);
2755#else
2756 return BitCast(d, BitCast(du, v) << v_shift_amt);
2757#endif
2758}
2759
2760template <class D, HWY_IF_V_SIZE_D(D, 16)>
2761HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
2762 const Repartition<uint8_t, decltype(d)> du8;
2763 using VU8 = VFromD<decltype(du8)>;
2764 const auto v_shift_amt =
2765 BitCast(Full128<uint8_t>(),
2766 Set(Full128<uint32_t>(),
2767 static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
2768
2769#if HWY_S390X_HAVE_Z14
2770 return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
2771#else // VSX
2772#if HWY_IS_LITTLE_ENDIAN
2773 return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
2774#else
2775 return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
2776#endif // HWY_IS_LITTLE_ENDIAN
2777#endif // HWY_S390X_HAVE_Z14
2778}
2779
2780// ================================================== COMBINE
2781
2782// ------------------------------ Combine (InterleaveLower)
2783
2784// N = N/2 + N/2 (upper half undefined)
2785template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
2786HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
2787 const Half<decltype(d)> dh;
2788 // Treat half-width input as one lane, and expand to two lanes.
2789 using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
2790 using Raw = typename detail::Raw128<TFromV<VU>>::type;
2791 const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
2792 const VU hi{reinterpret_cast<Raw>(hi_half.raw)};
2793 return BitCast(d, InterleaveLower(lo, hi));
2794}
2795
2796// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
2797
2798template <class D>
2799HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
2800 const Half<D> dh;
2801 return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
2802}
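// Example: ZeroExtendVector of a u32 half-vector {a, b} into a full vector
// yields {a, b, 0, 0}; FirstN zeroes the upper lanes, whose contents are
// otherwise undefined after reinterpreting the half-width register.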
2803
2804// ------------------------------ Concat full (InterleaveLower)
2805
2806// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2807template <class D, typename T = TFromD<D>>
2808HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) {
2809 const Repartition<uint64_t, decltype(d)> d64;
2810 return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
2811}
2812
2813// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2814template <class D, typename T = TFromD<D>>
2815HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) {
2816 const Repartition<uint64_t, decltype(d)> d64;
2817 return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
2818}
2819
2820// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
2821template <class D, typename T = TFromD<D>>
2822HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
2823 return CombineShiftRightBytes<8>(d, hi, lo);
2824}
2825
2826// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2827template <class D, typename T = TFromD<D>>
2828HWY_API Vec128<T> ConcatUpperLower(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
2829 const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7,
2830 24, 25, 26, 27, 28, 29, 30, 31};
2831 return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
2832}
2833
2834// ------------------------------ Concat partial (Combine, LowerHalf)
2835
2836template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2837HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
2838 const Half<decltype(d)> d2;
2839 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
2840}
2841
2842template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2843HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
2844 const Half<decltype(d)> d2;
2845 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
2846}
2847
2848template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2849HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
2850 const Half<decltype(d)> d2;
2851 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
2852}
2853
2854template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
2855HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
2856 const Half<decltype(d)> d2;
2857 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
2858}
2859
2860// ------------------------------ TruncateTo
2861
2862template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
2863 hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
2864 HWY_IF_LANES_D(D, 1)>
2865HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<FromT, 1> v) {
2866 using Raw = typename detail::Raw128<TFromD<D>>::type;
2867#if HWY_IS_LITTLE_ENDIAN
2868 return VFromD<D>{reinterpret_cast<Raw>(v.raw)};
2869#else
2870 return VFromD<D>{reinterpret_cast<Raw>(
2871 vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))};
2872#endif
2873}
2874
2875namespace detail {
2876
2877template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
2878 HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
2879HWY_INLINE VFromD<D> Truncate2To(
2880 D /* tag */, Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo,
2881 Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) {
2882 return VFromD<D>{vec_pack(lo.raw, hi.raw)};
2883}
2884
2885} // namespace detail
2886
2887template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
2888 HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
2889HWY_API VFromD<D> TruncateTo(D /* tag */,
2890 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
2891 return VFromD<D>{vec_pack(v.raw, v.raw)};
2892}
2893
2894template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
2895 hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
2896 HWY_IF_LANES_GT_D(D, 1)>
2897HWY_API VFromD<D> TruncateTo(D d,
2898 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
2899 const Rebind<MakeNarrow<FromT>, decltype(d)> d2;
2900 return TruncateTo(d, TruncateTo(d2, v));
2901}
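// Example: truncating u64 lanes {0x1122334455667788, ...} to u8 keeps only
// the low byte 0x88 of each lane; ratios above 2x recurse through the
// intermediate widths (u64 -> u32 -> u16 -> u8), one vec_pack per 2x step.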
2902
2903// ------------------------------ ConcatOdd (TruncateTo)
2904
2905// 8-bit full
2906template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2907HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
2908 const Repartition<uint16_t, decltype(d)> dw;
2909 const RebindToUnsigned<decltype(d)> du;
2910#if HWY_IS_LITTLE_ENDIAN
2911 // Right-shift 8 bits per u16 so we can pack.
2912 const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
2913 const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
2914#else
2915 const Vec128<uint16_t> uH = BitCast(dw, hi);
2916 const Vec128<uint16_t> uL = BitCast(dw, lo);
2917#endif
2918 return BitCast(d, detail::Truncate2To(du, uL, uH));
2919}
2920
2921// 8-bit x8
2922template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2923HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
2924 // Don't care about upper half, no need to zero.
2925 const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};
2926 return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
2927}
2928
2929// 8-bit x4
2930template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2931HWY_API Vec32<T> ConcatOdd(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
2932 // Don't care about upper half, no need to zero.
2933 const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
2934 return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
2935}
2936
2937// 16-bit full
2938template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
2939HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
2940 const Repartition<uint32_t, decltype(d)> dw;
2941 const RebindToUnsigned<decltype(d)> du;
2942#if HWY_IS_LITTLE_ENDIAN
2943 const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
2944 const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
2945#else
2946 const Vec128<uint32_t> uH = BitCast(dw, hi);
2947 const Vec128<uint32_t> uL = BitCast(dw, lo);
2948#endif
2949 return BitCast(d, detail::Truncate2To(du, uL, uH));
2950}
2951
2952// 16-bit x4
2953template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
2954HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
2955 // Don't care about upper half, no need to zero.
2956 const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
2957 return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};
2958}
2959
2960// 32-bit full
2961template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
2962HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
2963#if HWY_IS_LITTLE_ENDIAN
2964 (void)d;
2965 const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15,
2966 20, 21, 22, 23, 28, 29, 30, 31};
2967 return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
2968#else
2969 const RebindToUnsigned<decltype(d)> du;
2970 const Repartition<uint64_t, decltype(d)> dw;
2971 return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
2972#endif
2973}
2974
2975// Any type x2
2976template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
2977HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
2978 return InterleaveUpper(d, lo, hi);
2979}
2980
2981// ------------------------------ ConcatEven (TruncateTo)
2982
2983// 8-bit full
2984template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2985HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
2986 const Repartition<uint16_t, decltype(d)> dw;
2987 const RebindToUnsigned<decltype(d)> du;
2988#if HWY_IS_LITTLE_ENDIAN
2989 const Vec128<uint16_t> uH = BitCast(dw, hi);
2990 const Vec128<uint16_t> uL = BitCast(dw, lo);
2991#else
2992 // Right-shift 8 bits per u16 so we can pack.
2993 const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
2994 const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
2995#endif
2996 return BitCast(d, detail::Truncate2To(du, uL, uH));
2997}
2998
2999// 8-bit x8
3000template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3001HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
3002 // Don't care about upper half, no need to zero.
3003 const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};
3004 return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
3005}
3006
3007// 8-bit x4
3008template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
3009HWY_API Vec32<T> ConcatEven(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
3010 // Don't care about upper half, no need to zero.
3011 const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
3012 return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
3013}
3014
3015// 16-bit full
3016template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
3017HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
3018 // Isolate lower 16 bits per u32 so we can pack.
3019 const Repartition<uint32_t, decltype(d)> dw;
3020 const RebindToUnsigned<decltype(d)> du;
3021#if HWY_IS_LITTLE_ENDIAN
3022 const Vec128<uint32_t> uH = BitCast(dw, hi);
3023 const Vec128<uint32_t> uL = BitCast(dw, lo);
3024#else
3025 const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
3026 const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
3027#endif
3028 return BitCast(d, detail::Truncate2To(du, uL, uH));
3029}
3030
3031// 16-bit x4
3032template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
3033HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
3034 // Don't care about upper half, no need to zero.
3035 const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
3036 return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};
3037}
3038
3039// 32-bit full
3040template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
3041HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
3042#if HWY_IS_LITTLE_ENDIAN
3043 const Repartition<uint64_t, decltype(d)> dw;
3044 const RebindToUnsigned<decltype(d)> du;
3045 return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
3046#else
3047 (void)d;
3048 constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11,
3049 16, 17, 18, 19, 24, 25, 26, 27};
3050 return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
3051#endif
3052}
3053
3054// Any T x2
3055template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
3056HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
3057 return InterleaveLower(d, lo, hi);
3058}
3059
3060// ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd)
3061#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3062#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3063#else
3064#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
3065#endif
3066
3067template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V),
3068 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
3069 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
3070HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) {
3071#if HWY_IS_LITTLE_ENDIAN
3072 return ConcatEven(d, BitCast(d, b), BitCast(d, a));
3073#else
3074 return ConcatOdd(d, BitCast(d, b), BitCast(d, a));
3075#endif
3076}
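// Example: for u16 inputs a={a0..a7} and b={b0..b7}, OrderedTruncate2To
// returns the 16 bytes {a0..a7, b0..b7}, each truncated to its low 8 bits;
// ConcatEven vs ConcatOdd picks whichever byte of each u16 holds the low
// bits on the current endianness.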
3077
3078// ------------------------------ DupEven (InterleaveLower)
3079
3080template <typename T>
3081HWY_API Vec128<T, 1> DupEven(Vec128<T, 1> v) {
3082 return v;
3083}
3084
3085template <typename T>
3086HWY_API Vec128<T, 2> DupEven(Vec128<T, 2> v) {
3087 return InterleaveLower(DFromV<decltype(v)>(), v, v);
3088}
3089
3090template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
3091HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3092 const DFromV<decltype(v)> d;
3093 const Repartition<uint8_t, decltype(d)> du8;
3094 constexpr __vector unsigned char kShuffle = {0, 0, 2, 2, 4, 4, 6, 6,
3095 8, 8, 10, 10, 12, 12, 14, 14};
3096 return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
3097}
3098
3099template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3100HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
3101 const DFromV<decltype(v)> d;
3102 const Repartition<uint8_t, decltype(d)> du8;
3103 constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4, 5, 4, 5,
3104 8, 9, 8, 9, 12, 13, 12, 13};
3105 return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
3106}
3107
3108template <typename T, HWY_IF_T_SIZE(T, 4)>
3109HWY_API Vec128<T> DupEven(Vec128<T> v) {
3110#if HWY_S390X_HAVE_Z14
3111 const DFromV<decltype(v)> d;
3112 const Repartition<uint8_t, decltype(d)> du8;
3113 return TableLookupBytes(
3114 v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
3115 11, 8, 9, 10, 11)));
3116#else
3117 return Vec128<T>{vec_mergee(v.raw, v.raw)};
3118#endif
3119}
3120
3121// ------------------------------ DupOdd (InterleaveUpper)
3122
3123template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
3124HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3125 const DFromV<decltype(v)> d;
3126 const Repartition<uint8_t, decltype(d)> du8;
3127 constexpr __vector unsigned char kShuffle = {1, 1, 3, 3, 5, 5, 7, 7,
3128 9, 9, 11, 11, 13, 13, 15, 15};
3129 return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
3130}
3131
3132template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3133HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3134 const DFromV<decltype(v)> d;
3135 const Repartition<uint8_t, decltype(d)> du8;
3136 constexpr __vector unsigned char kShuffle = {2, 3, 2, 3, 6, 7, 6, 7,
3137 10, 11, 10, 11, 14, 15, 14, 15};
3138 return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
3139}
3140
3141template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
3142HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3143#if HWY_S390X_HAVE_Z14
3144 const DFromV<decltype(v)> d;
3145 const Repartition<uint8_t, decltype(d)> du8;
3146 return TableLookupBytes(
3147 v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
3148 15, 12, 13, 14, 15)));
3149#else
3150 return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
3151#endif
3152}
3153
3154template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
3155HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
3156 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
3157}
3158
3159// ------------------------------ OddEven (IfThenElse)
3160
3161template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
3162HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
3163 const DFromV<decltype(a)> d;
3164 const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3165 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3166 return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a);
3167}
3168
3169template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3170HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
3171 const DFromV<decltype(a)> d;
3172 const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
3173 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
3174 return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a);
3175}
3176
3177template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
3178HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
3179 const DFromV<decltype(a)> d;
3180 const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
3181 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
3182 return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a);
3183}
3184
3185template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
3186HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
3187 // Same as ConcatUpperLower for full vectors; do not call that because this
3188 // is more efficient for 64x1 vectors.
3189 const DFromV<decltype(a)> d;
3190 const __vector unsigned char mask = {
3191 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
3192 return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
3193}
3194
3195// ------------------------------ InterleaveEven
3196
3197template <class D, HWY_IF_T_SIZE_D(D, 1)>
3198HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
3199 const Full128<TFromD<D>> d_full;
3200 const Indices128<TFromD<D>> idx{
3201 Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
3202 10, 26, 12, 28, 14, 30)
3203 .raw};
3204 return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3205 ResizeBitCast(d_full, b), idx));
3206}
3207
3208template <class D, HWY_IF_T_SIZE_D(D, 2)>
3209HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
3210 const Full128<TFromD<D>> d_full;
3211 const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
3212 16, 17, 4, 5, 20, 21, 8,
3213 9, 24, 25, 12, 13, 28, 29)
3214 .raw};
3215 return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3216 ResizeBitCast(d_full, b), idx));
3217}
3218
3219template <class D, HWY_IF_T_SIZE_D(D, 4)>
3220HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
3221#if HWY_S390X_HAVE_Z14
3222 const Full128<TFromD<D>> d_full;
3223 const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
3224 2, 3, 16, 17, 18, 19, 8,
3225 9, 10, 11, 24, 25, 26, 27)
3226 .raw};
3227 return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3228 ResizeBitCast(d_full, b), idx));
3229#else
3230 (void)d;
3231 return VFromD<D>{vec_mergee(a.raw, b.raw)};
3232#endif
3233}
3234
3235template <class D, HWY_IF_T_SIZE_D(D, 8)>
3236HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3237 return InterleaveLower(a, b);
3238}
3239
3240// ------------------------------ InterleaveOdd
3241
3242template <class D, HWY_IF_T_SIZE_D(D, 1)>
3243HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3244 const Full128<TFromD<D>> d_full;
3245 const Indices128<TFromD<D>> idx{
3246 Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
3247 11, 27, 13, 29, 15, 31)
3248 .raw};
3249 return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3250 ResizeBitCast(d_full, b), idx));
3251}
3252
3253template <class D, HWY_IF_T_SIZE_D(D, 2)>
3254HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3255 const Full128<TFromD<D>> d_full;
3256 const Indices128<TFromD<D>> idx{
3257 Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
3258 11, 26, 27, 14, 15, 30, 31)
3259 .raw};
3260 return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3261 ResizeBitCast(d_full, b), idx));
3262}
3263
3264template <class D, HWY_IF_T_SIZE_D(D, 4)>
3265HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3266#if HWY_S390X_HAVE_Z14
3267 const Full128<TFromD<D>> d_full;
3268 const Indices128<TFromD<D>> idx{
3269 Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
3270 13, 14, 15, 28, 29, 30, 31)
3271 .raw};
3272 return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3273 ResizeBitCast(d_full, b), idx));
3274#else
3275 (void)d;
3276 return VFromD<D>{vec_mergeo(a.raw, b.raw)};
3277#endif
3278}
3279
3280template <class D, HWY_IF_T_SIZE_D(D, 8)>
3281HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3282 return InterleaveUpper(d, a, b);
3283}
3284
3285// ------------------------------ OddEvenBlocks
3286template <typename T, size_t N>
3287HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3288 return even;
3289}
3290
3291// ------------------------------ SwapAdjacentBlocks
3292
3293template <typename T, size_t N>
3294HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3295 return v;
3296}
3297
3298// ------------------------------ MulFixedPoint15 (OddEven)
3299
3300#if HWY_S390X_HAVE_Z14
3301HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
3302 const DFromV<decltype(a)> di16;
3303 const RepartitionToWide<decltype(di16)> di32;
3304
3305 const auto round_up_incr = Set(di32, 0x4000);
3306 const auto i32_product = MulEven(a, b) + round_up_incr;
3307
3308 return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
3309}
3310template <size_t N, HWY_IF_LANES_GT(N, 1)>
3311HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
3312 Vec128<int16_t, N> b) {
3313 const DFromV<decltype(a)> di16;
3314 const RepartitionToWide<decltype(di16)> di32;
3315
3316 const auto round_up_incr = Set(di32, 0x4000);
3317 const auto even_product = MulEven(a, b) + round_up_incr;
3318 const auto odd_product = MulOdd(a, b) + round_up_incr;
3319
3320 return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
3321 BitCast(di16, ShiftLeft<1>(even_product)));
3322}
3323#else
3324template <size_t N>
3325HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
3326 Vec128<int16_t, N> b) {
3327 const Vec128<int16_t> zero = Zero(Full128<int16_t>());
3328 return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
3329}
3330#endif
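// MulFixedPoint15 computes (a*b*2 + 0x8000) >> 16 per i16 lane, i.e. a
// rounded Q15 multiply. For example, a=0x4000 (0.5 in Q15) and b=0x2000
// (0.25) give 0x1000 (0.125). vec_mradds does this in one instruction; the
// Z14 path above rebuilds it from MulEven/MulOdd plus the rounding constant.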
3331
3332// ------------------------------ Shl
3333
3334namespace detail {
3335template <typename T, size_t N>
3336HWY_INLINE Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
3337 Vec128<T, N> bits) {
3338#if HWY_S390X_HAVE_Z14
3339 return Vec128<T, N>{v.raw << bits.raw};
3340#else
3341 return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
3342#endif
3343}
3344
3345// Signed left shift is the same as unsigned.
3346template <typename T, size_t N>
3347HWY_INLINE Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
3348 Vec128<T, N> bits) {
3349 const DFromV<decltype(v)> di;
3350 const RebindToUnsigned<decltype(di)> du;
3351 return BitCast(di,
3352 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
3353}
3354
3355} // namespace detail
3356
3357template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
3358HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
3359 return detail::Shl(hwy::TypeTag<T>(), v, bits);
3360}
3361
3362// ------------------------------ Shr
3363
3364namespace detail {
3365template <typename T, size_t N>
3366HWY_INLINE Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
3367 Vec128<T, N> bits) {
3368#if HWY_S390X_HAVE_Z14
3369 return Vec128<T, N>{v.raw >> bits.raw};
3370#else
3371 return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
3372#endif
3373}
3374
3375template <typename T, size_t N>
3376HWY_INLINE Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
3377 Vec128<T, N> bits) {
3378#if HWY_S390X_HAVE_Z14
3379 return Vec128<T, N>{v.raw >> bits.raw};
3380#else
3381 const DFromV<decltype(v)> di;
3382 const RebindToUnsigned<decltype(di)> du;
3383 return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
3384#endif
3385}
3386
3387} // namespace detail
3388
3389template <typename T, size_t N>
3390HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
3391 return detail::Shr(hwy::TypeTag<T>(), v, bits);
3392}
3393
3394// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
3395
3396template <class T, HWY_IF_UI64(T)>
3397HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
3398#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
3399 using V64 = typename detail::Raw128<T>::type;
3400 const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
3401#if HWY_IS_LITTLE_ENDIAN
3402 return Vec128<T>{mul128_result};
3403#else
3404 // Need to swap the two halves of mul128_result on big-endian targets as
3405 // the upper 64 bits of the product are in lane 0 of mul128_result and
3406 // the lower 64 bits of the product are in lane 1 of mul128_result
3407 return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
3408#endif
3409#else
3410 alignas(16) T mul[2];
3411 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
3412 return Load(Full128<T>(), mul);
3413#endif
3414}
3415
3416template <class T, HWY_IF_UI64(T)>
3417HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
3418#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
3419 using V64 = typename detail::Raw128<T>::type;
3420 const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
3421#if HWY_IS_LITTLE_ENDIAN
3422 return Vec128<T>{mul128_result};
3423#else
3424 // Need to swap the two halves of mul128_result on big-endian targets as
3425 // the upper 64 bits of the product are in lane 0 of mul128_result and
3426 // the lower 64 bits of the product are in lane 1 of mul128_result
3427 return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
3428#endif
3429#else
3430 alignas(16) T mul[2];
3431 const Full64<T> d2;
3432 mul[0] =
3433 Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
3434 return Load(Full128<T>(), mul);
3435#endif
3436}
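// Example: for u64 lanes {x0, x1} and {y0, y1}, MulEven returns the 128-bit
// product x0*y0 (low 64 bits in lane 0, high in lane 1) and MulOdd likewise
// returns x1*y1; without PPC10's doubleword vec_mule/vec_mulo, the scalar
// Mul128 helper from hwy/base.h computes both halves.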
3437
3438// ------------------------------ WidenMulPairwiseAdd
3439
3440template <class D32, HWY_IF_F32_D(D32),
3441          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
3442HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
3443 const RebindToUnsigned<decltype(df32)> du32;
3444 // Lane order within sum0/1 is undefined, hence we can avoid the
3445 // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
3446 // leads to the odd/even order that RearrangeToOddPlusEven prefers.
3447 using VU32 = VFromD<decltype(du32)>;
3448 const VU32 odd = Set(du32, 0xFFFF0000u);
3449 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
3450 const VU32 ao = And(BitCast(du32, a), odd);
3451 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
3452 const VU32 bo = And(BitCast(du32, b), odd);
3453 return MulAdd(BitCast(df32, ae), BitCast(df32, be),
3454 Mul(BitCast(df32, ao), BitCast(df32, bo)));
3455}
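// In scalar terms, each f32 output lane i above equals
//   F32(a[2*i]) * F32(b[2*i]) + F32(a[2*i+1]) * F32(b[2*i+1]),
// where F32 widens a bf16 lane by placing its 16 bits in the upper half of a
// 32-bit word (the remaining mantissa bits are zero). This is why a shift or
// mask within each 32-bit word suffices instead of a lane-crossing PromoteTo.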
3456
3457// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
3458template <class D32, HWY_IF_UI32_D(D32),
3459          class V16 = VFromD<RepartitionToNarrow<D32>>>
3460HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
3461#if HWY_S390X_HAVE_Z14
3462 (void)d32;
3463 return MulEven(a, b) + MulOdd(a, b);
3464#else
3465 return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
3466#endif
3467}
3468
3469// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3470
3471template <class D32, HWY_IF_F32_D(D32),
3472          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
3473HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
3474 VFromD<D32> sum0,
3475 VFromD<D32>& sum1) {
3476 const RebindToUnsigned<decltype(df32)> du32;
3477 // Lane order within sum0/1 is undefined, hence we can avoid the
3478 // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
3479 // leads to the odd/even order that RearrangeToOddPlusEven prefers.
3480 using VU32 = VFromD<decltype(du32)>;
3481 const VU32 odd = Set(du32, 0xFFFF0000u);
3482 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
3483 const VU32 ao = And(BitCast(du32, a), odd);
3484 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
3485 const VU32 bo = And(BitCast(du32, b), odd);
3486 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
3487 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
3488}
3489
3490// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
3491template <class D32, HWY_IF_UI32_D(D32),
3492          class V16 = VFromD<RepartitionToNarrow<D32>>>
3493HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
3494 VFromD<D32> sum0,
3495 VFromD<D32>& /*sum1*/) {
3496#if HWY_S390X_HAVE_Z14
3497 return MulEven(a, b) + MulOdd(a, b) + sum0;
3498#else
3499 return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
3500#endif
3501}
3502
3503// ------------------------------ RearrangeToOddPlusEven
3504template <size_t N>
3505HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0,
3506                                                  Vec128<int32_t, N> /*sum1*/) {
3507 return sum0; // invariant already holds
3508}
3509
3510template <size_t N>
3511HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
3512    Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
3513 return sum0; // invariant already holds
3514}
3515
3516template <class VW>
3517HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
3518 return Add(sum0, sum1);
3519}
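// Usage sketch (illustrative, not part of this header): accumulate inside a
// loop and rearrange once at the end, e.g. for a bf16 dot product. The names
// pa, pb and count are hypothetical inputs.
//
//   const hn::Full128<float> df32;
//   const hn::Repartition<hwy::bfloat16_t, decltype(df32)> dbf16;
//   auto sum0 = hn::Zero(df32);
//   auto sum1 = hn::Zero(df32);
//   for (size_t i = 0; i < count; i += hn::Lanes(dbf16)) {
//     const auto a = hn::LoadU(dbf16, pa + i);
//     const auto b = hn::LoadU(dbf16, pb + i);
//     sum0 = hn::ReorderWidenMulAccumulate(df32, a, b, sum0, sum1);
//   }
//   const float dot =
//       hn::ReduceSum(df32, hn::RearrangeToOddPlusEven(sum0, sum1));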
3520
3521// ------------------------------ SatWidenMulPairwiseAccumulate
3522#if !HWY_S390X_HAVE_Z14
3523
3524#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
3525#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
3526#else
3527#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
3528#endif
3529
3530template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
3531HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
3532    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
3533 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
3534 return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
3535}
3536
3537#endif // !HWY_S390X_HAVE_Z14
3538
3539// ------------------------------ SumOfMulQuadAccumulate
3540#if !HWY_S390X_HAVE_Z14
3541
3542#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
3543#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
3544#else
3545#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
3546#endif
3547template <class DU32, HWY_IF_U32_D(DU32)>
3548HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
3549    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
3550 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
3551 return VFromD<DU32>{vec_msum(a.raw, b.raw, sum.raw)};
3552}
3553
3554#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
3555#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
3556#else
3557#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
3558#endif
3559
3560template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
3561HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
3562    DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
3563 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
3564 return VFromD<DI32>{vec_msum(b_i.raw, a_u.raw, sum.raw)};
3565}
3566
3567#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
3568#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
3569#else
3570#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
3571#endif
3572template <class DI32, HWY_IF_I32_D(DI32)>
3573HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
3574                                            VFromD<Repartition<int8_t, DI32>> a,
3575 VFromD<Repartition<int8_t, DI32>> b,
3576 VFromD<DI32> sum) {
3577 const Repartition<uint8_t, decltype(di32)> du8;
3578
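 // A signed byte equals its unsigned reinterpretation minus 256 when it is
 // negative: a = U8(a) - 256 * (a < 0). Summing over each quad,
 //   sum(a * b) = sum(U8(a) * b) - 256 * sum(b where a < 0),
 // and the second term is exactly
 // ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a)))) below.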
3579 const auto result_sum_0 =
3580 SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
3581 const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
3582 return result_sum_0 - result_sum_1;
3583}
3584
3585#endif // !HWY_S390X_HAVE_Z14
3586
3587// ================================================== CONVERT
3588
3589// ------------------------------ Promotions (part w/ narrow lanes -> full)
3590
3591// Unsigned to signed/unsigned: zero-extend.
3592template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
3593          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
3594HWY_API VFromD<D> PromoteTo(D /* d */,
3595                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
3596 // First pretend the input has twice the lanes - the upper half will be
3597 // ignored by ZipLower.
3598 const Rebind<FromT, Twice<D>> d2;
3599 const VFromD<decltype(d2)> twice{v.raw};
3600 // Then cast to narrow as expected by ZipLower, in case the sign of FromT
3601 // differs from that of D.
3602 const RepartitionToNarrow<D> dn;
3603
3604#if HWY_IS_LITTLE_ENDIAN
3605 return ZipLower(BitCast(dn, twice), Zero(dn));
3606#else
3607 return ZipLower(Zero(dn), BitCast(dn, twice));
3608#endif
3609}
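// For example, promoting u8 {1, 2, 3, ...} to u16 zips the source with zero:
// on little-endian this produces bytes {1, 0, 2, 0, ...}, which read back as
// u16 lanes {1, 2, ...}; big-endian swaps the ZipLower operands so the zero
// byte lands in the high half of each lane instead.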
3610
3611// Signed: replicate sign bit.
3612template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
3613          HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT)>
3614HWY_API VFromD<D> PromoteTo(D /* d */,
3615 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
3616 using Raw = typename detail::Raw128<TFromD<D>>::type;
3617 return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};
3618}
3619
3620// 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit.
3621template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D),
3622 HWY_IF_T_SIZE(FromT, 1)>
3623HWY_API VFromD<D> PromoteTo(D d32,
3624                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
3625 const DFromV<decltype(v)> d8;
3626 const Rebind<MakeWide<FromT>, decltype(d8)> d16;
3627 return PromoteTo(d32, PromoteTo(d16, v));
3628}
3629
3630// 8-bit or 16-bit to 64-bit: First, promote to MakeWide<FromT>, and then
3631// convert to 64-bit.
3632template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D),
3633          HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT),
3634 HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))>
3635HWY_API VFromD<D> PromoteTo(D d64,
3636                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
3637 const Rebind<MakeWide<FromT>, decltype(d64)> dw;
3638 return PromoteTo(d64, PromoteTo(dw, v));
3639}
3640
3641#if HWY_PPC_HAVE_9
3642
3643// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
3644#ifdef HWY_NATIVE_F16C
3645#undef HWY_NATIVE_F16C
3646#else
3647#define HWY_NATIVE_F16C
3648#endif
3649
3650template <class D, HWY_IF_F32_D(D)>
3651HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
3652 return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};
3653}
3654
3655#endif // HWY_PPC_HAVE_9
3656
3657template <class D, HWY_IF_F32_D(D)>
3658HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
3659 const Rebind<uint16_t, decltype(df32)> du16;
3660 const RebindToSigned<decltype(df32)> di32;
3661 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3662}
3663
3664template <class D, HWY_IF_F64_D(D)>
3665HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
3666 const __vector float raw_v = InterleaveLower(v, v).raw;
3667#if HWY_IS_LITTLE_ENDIAN
3668 return VFromD<D>{vec_doubleo(raw_v)};
3669#else
3670 return VFromD<D>{vec_doublee(raw_v)};
3671#endif
3672}
3673
3674template <class D, HWY_IF_F64_D(D)>
3675HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
3676#if HWY_S390X_HAVE_Z14
3677 const RebindToSigned<decltype(df64)> di64;
3678 return ConvertTo(df64, PromoteTo(di64, v));
3679#else // VSX
3680 (void)df64;
3681 const __vector signed int raw_v = InterleaveLower(v, v).raw;
3682#if HWY_IS_LITTLE_ENDIAN
3683 return VFromD<D>{vec_doubleo(raw_v)};
3684#else
3685 return VFromD<D>{vec_doublee(raw_v)};
3686#endif
3687#endif // HWY_S390X_HAVE_Z14
3688}
3689
3690template <class D, HWY_IF_F64_D(D)>
3691HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
3692#if HWY_S390X_HAVE_Z14
3693 const RebindToUnsigned<decltype(df64)> du64;
3694 return ConvertTo(df64, PromoteTo(du64, v));
3695#else // VSX
3696 (void)df64;
3697 const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
3698#if HWY_IS_LITTLE_ENDIAN
3699 return VFromD<D>{vec_doubleo(raw_v)};
3700#else
3701 return VFromD<D>{vec_doublee(raw_v)};
3702#endif
3703#endif // HWY_S390X_HAVE_Z14
3704}
3705
3706#if !HWY_S390X_HAVE_Z14
3707namespace detail {
3708
3709template <class V>
3710static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
3711#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
3712 // Workaround for QEMU 7/8 VSX float to int conversion bug
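 // (v == v) is false only for NaN lanes, so IfThenElseZero maps NaN inputs
 // to zero before the conversion, giving a well-defined result (0) for NaN
 // regardless of the emulator bug; in-range and out-of-range values are
 // passed through unchanged.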
3713 return IfThenElseZero(v == v, v);
3714#else
3715 return v;
3716#endif
3717}
3718
3719} // namespace detail
3720#endif // !HWY_S390X_HAVE_Z14
3721
3722template <class D, HWY_IF_I64_D(D)>
3723HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
3724#if !HWY_S390X_HAVE_Z14 && \
3725 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3726 const __vector float raw_v =
3727 detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
3728 return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
3729#else
3730 const RebindToFloat<decltype(di64)> df64;
3731 return ConvertTo(di64, PromoteTo(df64, v));
3732#endif
3733}
3734
3735template <class D, HWY_IF_U64_D(D)>
3736HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
3737#if !HWY_S390X_HAVE_Z14 && \
3738 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3739 const __vector float raw_v =
3740 detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
3741 return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
3742 __builtin_vsx_xvcvspuxds(raw_v))};
3743#else
3744 const RebindToFloat<decltype(du64)> df64;
3745 return ConvertTo(du64, PromoteTo(df64, v));
3746#endif
3747}
3748
3749// ------------------------------ PromoteUpperTo
3750
3751#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
3752#undef HWY_NATIVE_PROMOTE_UPPER_TO
3753#else
3754#define HWY_NATIVE_PROMOTE_UPPER_TO
3755#endif
3756
3757// Unsigned to signed/unsigned: zero-extend.
3758template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
3759 HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
3760          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
3761HWY_API VFromD<D> PromoteUpperTo(D d, Vec128<FromT> v) {
3762 const RebindToUnsigned<D> du;
3763 const RepartitionToNarrow<decltype(du)> dn;
3764
3765#if HWY_IS_LITTLE_ENDIAN
3766 return BitCast(d, ZipUpper(du, v, Zero(dn)));
3767#else
3768 return BitCast(d, ZipUpper(du, Zero(dn), v));
3769#endif
3770}
3771
3772// Signed: replicate sign bit.
3773template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
3774 HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
3775          HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT)>
3776HWY_API VFromD<D> PromoteUpperTo(D /* d */, Vec128<FromT> v) {
3777 using Raw = typename detail::Raw128<TFromD<D>>::type;
3778 return VFromD<D>{reinterpret_cast<Raw>(vec_unpackl(v.raw))};
3779}
3780
3781// F16 to F32
3782template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3783HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<float16_t> v) {
3784#if HWY_PPC_HAVE_9
3785 (void)df32;
3786 return VFromD<D>{vec_extract_fp32_from_shortl(v.raw)};
3787#else
3788 const Rebind<float16_t, decltype(df32)> dh;
3789 return PromoteTo(df32, UpperHalf(dh, v));
3790#endif
3791}
3792
3793// BF16 to F32
3794template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3795HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<bfloat16_t> v) {
3796 const Repartition<uint16_t, decltype(df32)> du16;
3797 const RebindToSigned<decltype(df32)> di32;
3798 return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
3799}
3800
3801template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3802HWY_API VFromD<D> PromoteUpperTo(D /* tag */, Vec128<float> v) {
3803 const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
3804#if HWY_IS_LITTLE_ENDIAN
3805 return VFromD<D>{vec_doubleo(raw_v)};
3806#else
3807 return VFromD<D>{vec_doublee(raw_v)};
3808#endif
3809}
3810
3811template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3812HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
3813#if HWY_S390X_HAVE_Z14
3814 const RebindToSigned<decltype(df64)> di64;
3815 return ConvertTo(df64, PromoteUpperTo(di64, v));
3816#else // VSX
3817 (void)df64;
3818 const __vector signed int raw_v =
3819 InterleaveUpper(Full128<int32_t>(), v, v).raw;
3820#if HWY_IS_LITTLE_ENDIAN
3821 return VFromD<D>{vec_doubleo(raw_v)};
3822#else
3823 return VFromD<D>{vec_doublee(raw_v)};
3824#endif
3825#endif // HWY_S390X_HAVE_Z14
3826}
3827
3828template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3829HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
3830#if HWY_S390X_HAVE_Z14
3831 const RebindToUnsigned<decltype(df64)> du64;
3832 return ConvertTo(df64, PromoteUpperTo(du64, v));
3833#else // VSX
3834 (void)df64;
3835 const __vector unsigned int raw_v =
3836      InterleaveUpper(Full128<uint32_t>(), v, v).raw;
3837#if HWY_IS_LITTLE_ENDIAN
3838 return VFromD<D>{vec_doubleo(raw_v)};
3839#else
3840 return VFromD<D>{vec_doublee(raw_v)};
3841#endif
3842#endif // HWY_S390X_HAVE_Z14
3843}
3844
3845template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3846HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
3847#if !HWY_S390X_HAVE_Z14 && \
3848 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3849 const __vector float raw_v =
3850 detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
3851 .raw;
3852 return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
3853#else
3854 const RebindToFloat<decltype(di64)> df64;
3855 return ConvertTo(di64, PromoteUpperTo(df64, v));
3856#endif
3857}
3858
3859template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3860HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
3861#if !HWY_S390X_HAVE_Z14 && \
3862 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3863 const __vector float raw_v =
3864 detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
3865 .raw;
3866 return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
3867 __builtin_vsx_xvcvspuxds(raw_v))};
3868#else
3869 const RebindToFloat<decltype(du64)> df64;
3870 return ConvertTo(du64, PromoteUpperTo(df64, v));
3871#endif
3872}
3873
3874// Generic version for <=64 bit input/output
3875template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
3876HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
3877 const Rebind<TFromV<V>, decltype(d)> dh;
3878 return PromoteTo(d, UpperHalf(dh, v));
3879}
3880
3881// ------------------------------ PromoteEvenTo/PromoteOddTo
3882
3883namespace detail {
3884
3885// Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
3886#if HWY_PPC_HAVE_9 && \
3887 (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
3888
3889#if HWY_IS_LITTLE_ENDIAN
3890template <class D, class V>
3891HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3892                                   hwy::SizeTag<4> /*to_lane_size_tag*/,
3893 hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3894 V v) {
3895 return VFromD<D>{vec_signexti(v.raw)};
3896}
3897template <class D, class V>
3898HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3899                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
3900 hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3901 V v) {
3902 return VFromD<D>{vec_signextll(v.raw)};
3903}
3904#else
3905template <class D, class V>
3906HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
3907                                  hwy::SizeTag<4> /*to_lane_size_tag*/,
3908 hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3909 V v) {
3910 return VFromD<D>{vec_signexti(v.raw)};
3911}
3912template <class D, class V>
3913HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
3914                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
3915 hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3916 V v) {
3917 return VFromD<D>{vec_signextll(v.raw)};
3918}
3919#endif // HWY_IS_LITTLE_ENDIAN
3920
3921#endif // HWY_PPC_HAVE_9
3922
3923// I32/U32/F32->F64 PromoteEvenTo
3924#if HWY_S390X_HAVE_Z14
3925template <class D, class V>
3926HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3927                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
3928 hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
3929 V v) {
3930 return VFromD<D>{vec_doublee(v.raw)};
3931}
3932template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
3933HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3934                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
3935 FromTypeTag /*from_type_tag*/, D d_to, V v) {
3936 const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
3937 return ConvertTo(d_to, PromoteEvenTo(dw, v));
3938}
3939#else // VSX
3940template <class D, class V, class FromTypeTag>
3941HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3942                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
3943 FromTypeTag /*from_type_tag*/, D /*d_to*/,
3944 V v) {
3945 return VFromD<D>{vec_doublee(v.raw)};
3946}
3947#endif // HWY_S390X_HAVE_Z14
3948
3949// F32->I64 PromoteEvenTo
3950template <class D, class V>
3951HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3952                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
3953 hwy::FloatTag /*from_type_tag*/, D d_to,
3954 V v) {
3955#if !HWY_S390X_HAVE_Z14 && \
3956 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3957 (void)d_to;
3958 const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3959#if HWY_IS_LITTLE_ENDIAN
3960 // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
3961 // on little-endian PPC, and the vec_sld operation below will shift the even
3962 // lanes of normalized_v into the odd lanes.
3963 return VFromD<D>{
3964 __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
3965#else
3966 // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
3967 // on big-endian PPC.
3968 return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
3969#endif
3970#else
3971 const RebindToFloat<decltype(d_to)> df64;
3972 return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
3973                                      hwy::FloatTag(), df64, v));
3974#endif
3975}
3976
3977// F32->U64 PromoteEvenTo
3978template <class D, class V>
3979HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
3980                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
3981 hwy::FloatTag /*from_type_tag*/, D d_to,
3982 V v) {
3983#if !HWY_S390X_HAVE_Z14 && \
3984 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3985 (void)d_to;
3986 const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3987#if HWY_IS_LITTLE_ENDIAN
3988 // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
3989 // on little-endian PPC, and the vec_sld operation below will shift the even
3990 // lanes of normalized_v into the odd lanes.
3991 return VFromD<D>{
3992 reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
3993 vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
3994#else
3995 // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
3996 // on big-endian PPC.
3997 return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
3998 __builtin_vsx_xvcvspuxds(normalized_v.raw))};
3999#endif
4000#else
4001 const RebindToFloat<decltype(d_to)> df64;
4002 return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
4003                                      hwy::FloatTag(), df64, v));
4004#endif
4005}
4006
4007// I32/U32/F32->F64 PromoteOddTo
4008#if HWY_S390X_HAVE_Z14
4009template <class D, class V>
4010HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
4011                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
4012 hwy::FloatTag /*from_type_tag*/, D d_to,
4013                                  V v) {
4014 return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
4015                      d_to, V{vec_sld(v.raw, v.raw, 4)});
4016}
4017template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
4018HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
4019                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
4020 FromTypeTag /*from_type_tag*/, D d_to, V v) {
4021 const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
4022 return ConvertTo(d_to, PromoteOddTo(dw, v));
4023}
4024#else
4025template <class D, class V, class FromTypeTag>
4026HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
4027                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
4028 FromTypeTag /*from_type_tag*/, D /*d_to*/,
4029 V v) {
4030 return VFromD<D>{vec_doubleo(v.raw)};
4031}
4032#endif
4033
4034// F32->I64 PromoteOddTo
4035template <class D, class V>
4036HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
4037                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
4038 hwy::FloatTag /*from_type_tag*/, D d_to,
4039 V v) {
4040#if !HWY_S390X_HAVE_Z14 && \
4041 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
4042 (void)d_to;
4043 const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
4044#if HWY_IS_LITTLE_ENDIAN
4045 // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
4046 // on little-endian PPC
4047 return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
4048#else
4049 // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
4050 // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
4051 // of normalized_v into the even lanes.
4052 return VFromD<D>{
4053 __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
4054#endif
4055#else
4056 const RebindToFloat<decltype(d_to)> df64;
4057 return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
4058                                     hwy::FloatTag(), df64, v));
4059#endif
4060}
4061
4062// F32->U64 PromoteOddTo
4063template <class D, class V>
4064HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
4065                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
4066 hwy::FloatTag /*from_type_tag*/, D d_to,
4067 V v) {
4068#if !HWY_S390X_HAVE_Z14 && \
4069 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
4070 (void)d_to;
4071 const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
4072#if HWY_IS_LITTLE_ENDIAN
4073 // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
4074 // on little-endian PPC
4075 return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
4076 __builtin_vsx_xvcvspuxds(normalized_v.raw))};
4077#else
4078 // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
4079 // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
4080 // of normalized_v into the even lanes.
4081 return VFromD<D>{
4082 reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
4083 vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
4084#endif
4085#else
4086 const RebindToFloat<decltype(d_to)> df64;
4087 return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
4088                                     hwy::FloatTag(), df64, v));
4089#endif
4090}
4091
4092} // namespace detail
4093
4094// ------------------------------ Demotions (full -> part w/ narrow lanes)
4095
4096template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
4097 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4098 HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
4099HWY_API VFromD<D> DemoteTo(D /* tag */,
4100                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4101 return VFromD<D>{vec_packsu(v.raw, v.raw)};
4102}
4103
4104template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
4105 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4106 HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
4107HWY_API VFromD<D> DemoteTo(D /* tag */,
4108 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4109 return VFromD<D>{vec_packs(v.raw, v.raw)};
4110}
4111
4112template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
4113 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4114 HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
4115HWY_API VFromD<D> DemoteTo(D /* tag */,
4116 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4117 return VFromD<D>{vec_packs(v.raw, v.raw)};
4118}
4119
4120template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
4121 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
4122 hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
4123HWY_API VFromD<D> DemoteTo(D d,
4124                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4125 const Rebind<MakeNarrow<FromT>, D> d2;
4126 return DemoteTo(d, DemoteTo(d2, v));
4127}
4128
4129template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
4130 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
4131 hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
4132HWY_API VFromD<D> DemoteTo(D d,
4133                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4134 const Rebind<MakeNarrow<FromT>, D> d2;
4135 return DemoteTo(d, DemoteTo(d2, v));
4136}
4137
4138template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT),
4139 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
4140 hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
4141HWY_API VFromD<D> DemoteTo(D d,
4142                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4143 const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;
4144 return DemoteTo(d, DemoteTo(d2, v));
4145}
4146
4147#if HWY_PPC_HAVE_9 && \
4148 (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp))
4149
4150// We already toggled HWY_NATIVE_F16C above.
4151
4152template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
4153HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
4154// Avoid vec_pack_to_short_fp32 on Clang because its implementation is buggy.
4155#if HWY_COMPILER_GCC_ACTUAL
4156 (void)df16;
4157 return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
4158#elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
4159 // Work around bug in the clang implementation of vec_pack_to_short_fp32
4160 // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets
4161 // if the __builtin_vsx_xvcvsphp intrinsic is available
4162 const RebindToUnsigned<decltype(df16)> du16;
4163 const Rebind<uint32_t, D> du;
4164 const VFromD<decltype(du)> bits16{
4165 reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
4166 return BitCast(df16, TruncateTo(du16, bits16));
4167#else
4168#error "Only define the function if we have a native implementation"
4169#endif
4170}
4171
4172#endif // HWY_PPC_HAVE_9
4173
4174#if HWY_PPC_HAVE_9
4175
4176#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
4177#undef HWY_NATIVE_DEMOTE_F64_TO_F16
4178#else
4179#define HWY_NATIVE_DEMOTE_F64_TO_F16
4180#endif
4181
4182namespace detail {
4183
4184// On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
4185// vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
4186
4187// On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
4188// an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
4189static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
4190 // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
4191 // currently no intrinsic available for the PPC9 xscvdphp instruction
4192 __vector unsigned long long raw_result;
4193 __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
4194 return Vec128<uint64_t>{raw_result};
4195}
4196
4197} // namespace detail
4198
4199template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
4200HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
4201 const RebindToUnsigned<decltype(df16)> du16;
4202 const Rebind<uint64_t, decltype(df16)> du64;
4203
4204 const Full128<double> df64_full;
4205#if HWY_IS_LITTLE_ENDIAN
4206 const auto bits16_as_u64 =
4207 UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
4208#else
4209 const auto bits16_as_u64 =
4210 LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
4211#endif
4212
4213 return BitCast(df16, TruncateTo(du16, bits16_as_u64));
4214}
4215
4216template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
4217HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
4218 const RebindToUnsigned<decltype(df16)> du16;
4219 const Rebind<uint64_t, decltype(df16)> du64;
4220 const Rebind<double, decltype(df16)> df64;
4221
4222#if HWY_IS_LITTLE_ENDIAN
4223 const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
4224 const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
4225 const auto bits64_as_u64 =
4226 InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
4227#else
4228 const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
4229 const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
4230 const auto bits64_as_u64 =
4231 InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
4232#endif
4233
4234 return BitCast(df16, TruncateTo(du16, bits64_as_u64));
4235}
4236
4237#elif HWY_S390X_HAVE_Z14
4238
4239#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
4240#undef HWY_NATIVE_DEMOTE_F64_TO_F16
4241#else
4242#define HWY_NATIVE_DEMOTE_F64_TO_F16
4243#endif
4244
4245namespace detail {
4246
4247template <class DF32, HWY_IF_F32_D(DF32)>
4248static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
4249    DF32 df32, VFromD<Rebind<double, DF32>> v) {
4250 const Twice<DF32> dt_f32;
4251
4252 __vector float raw_f32_in_even;
4253 __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
4254
4255 const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
4256 return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
4257}
4258
4259} // namespace detail
4260
4261template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
4262HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
4263 const Rebind<float, decltype(df16)> df32;
4264 return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
4265}
4266
4267#endif // HWY_PPC_HAVE_9
4268
4269#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4270
4271#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
4272#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
4273#else
4274#define HWY_NATIVE_DEMOTE_F32_TO_BF16
4275#endif
4276
4277namespace detail {
4278
4279// VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
4280// vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
4281template <class D, HWY_IF_BF16_D(D)>
4282static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
4283 D dbf16, VFromD<Rebind<float, D>> v) {
4284 const Rebind<uint32_t, decltype(dbf16)> du32;
4285 const Repartition<uint8_t, decltype(du32)> du32_as_du8;
4286
4287 using VU32 = __vector unsigned int;
4288
4289 // Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
4290 // conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
4291 // __vector unsigned char argument (at least as of GCC 13 and Clang 17)
4292 return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
4293 __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
4294}
4295
4296} // namespace detail
4297
4298template <class D, HWY_IF_BF16_D(D)>
4299HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
4300 const RebindToUnsigned<decltype(dbf16)> du16;
4301 return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
4302}
4303
4304#endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4305
4306// Specializations for partial vectors because vec_packs sets lanes above 2*N.
4307template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
4308 HWY_IF_SIGNED_V(V),
4309 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4310 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4311HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
4312 const DFromV<decltype(a)> d;
4313 const Twice<decltype(d)> dt;
4314 return DemoteTo(dn, Combine(dt, b, a));
4315}
4316template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN),
4317 HWY_IF_SIGNED_V(V),
4318 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4319 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4320HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
4321 const Twice<decltype(dn)> dn_full;
4322 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4323
4324 const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
4325 const auto vu32_full = BitCast(du32_full, v_full);
4326 return LowerHalf(
4327 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4328}
4329template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN),
4330 HWY_IF_SIGNED_V(V),
4331 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4332 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4333HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
4334 return VFromD<DN>{vec_packs(a.raw, b.raw)};
4335}
4336
4337template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
4338          HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V),
4339 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4340 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4341HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
4342 const DFromV<decltype(a)> d;
4343 const Twice<decltype(d)> dt;
4344 return DemoteTo(dn, Combine(dt, b, a));
4345}
4346template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
4347 HWY_IF_SIGNED_V(V),
4348 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4349 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4350HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
4351 const Twice<decltype(dn)> dn_full;
4352 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4353
4354 const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)};
4355 const auto vu32_full = BitCast(du32_full, v_full);
4356 return LowerHalf(
4357 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4358}
4359template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
4360 HWY_IF_SIGNED_V(V),
4361 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4362 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4363HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
4364 return VFromD<DN>{vec_packsu(a.raw, b.raw)};
4365}
4366
4367template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
4368          HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V),
4369 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4370 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4371HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
4372 const DFromV<decltype(a)> d;
4373 const Twice<decltype(d)> dt;
4374 return DemoteTo(dn, Combine(dt, b, a));
4375}
4376template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
4377          HWY_IF_UNSIGNED_V(V),
4378 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4379 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4380HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
4381 const Twice<decltype(dn)> dn_full;
4382 const Repartition<uint32_t, decltype(dn_full)> du32_full;
4383
4384 const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
4385 const auto vu32_full = BitCast(du32_full, v_full);
4386 return LowerHalf(
4387 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
4388}
4389template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
4390          HWY_IF_UNSIGNED_V(V),
4391 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
4392 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
4393HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
4394 return VFromD<DN>{vec_packs(a.raw, b.raw)};
4395}
4396
4397#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4398template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
4399 HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
4400HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
4401 const RebindToUnsigned<decltype(dbf16)> du16;
4402 const Half<decltype(dbf16)> dh_bf16;
4403 return BitCast(dbf16,
4404 OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
4405 detail::VsxXvcvspbf16(dh_bf16, b)));
4406}
4407#endif
4408
4409template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
4410 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4411 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
4412 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
4413HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4414 return ReorderDemote2To(d, a, b);
4415}
4416
4417#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4418template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
4419 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
4420HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4421 return ReorderDemote2To(d, a, b);
4422}
4423#endif
4424
4425template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4426HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
4427 return Vec32<float>{vec_floate(v.raw)};
4428}
4429
4430template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4431HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
4432#if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
4433 const Vec128<float> f64_to_f32{vec_floate(v.raw)};
4434#else
4435 const Vec128<float> f64_to_f32{vec_floato(v.raw)};
4436#endif
4437
4438#if HWY_S390X_HAVE_Z14
4439 const Twice<decltype(d)> dt;
4440 return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
4441#else
4442 const RebindToUnsigned<D> du;
4443 const Rebind<uint64_t, D> du64;
4444 return Vec64<float>{
4445 BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
4446#endif
4447}
4448
4449template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
4450HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
4451#if HWY_S390X_HAVE_Z14
4452 const Rebind<int64_t, decltype(di32)> di64;
4453 return DemoteTo(di32, ConvertTo(di64, v));
4454#else
4455 (void)di32;
4456 return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
4457#endif
4458}
4459
4460template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
4461HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
4462#if HWY_S390X_HAVE_Z14
4463 const Rebind<int64_t, decltype(di32)> di64;
4464 return DemoteTo(di32, ConvertTo(di64, v));
4465#else
4466 (void)di32;
4467
4468#if HWY_IS_LITTLE_ENDIAN
4469 const Vec128<int32_t> f64_to_i32{
4470 vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
4471#else
4472 const Vec128<int32_t> f64_to_i32{
4473 vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
4474#endif
4475
4476 const Rebind<int64_t, D> di64;
4477 const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
4478 return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
4479#endif
4480}
4481
4482template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
4483HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
4484#if HWY_S390X_HAVE_Z14
4485 const Rebind<uint64_t, decltype(du32)> du64;
4486 return DemoteTo(du32, ConvertTo(du64, v));
4487#else
4488 (void)du32;
4489 return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
4490#endif
4491}
4492
4493template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
4494HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
4495#if HWY_S390X_HAVE_Z14
4496 const Rebind<uint64_t, decltype(du32)> du64;
4497 return DemoteTo(du32, ConvertTo(du64, v));
4498#else
4499 (void)du32;
4500#if HWY_IS_LITTLE_ENDIAN
4501 const Vec128<uint32_t> f64_to_u32{
4502 vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
4503#else
4504 const Vec128<uint32_t> f64_to_u32{
4505 vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
4506#endif
4507
4508 const Rebind<uint64_t, D> du64;
4509 const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
4510 return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
4511#endif
4512}
4513
4514#if HWY_S390X_HAVE_Z14
4515namespace detail {
4516
4517template <class V, HWY_IF_I64(TFromV<V>)>
4518HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> I64ToF64VecWithRoundToOdd(V v) {
4519 __vector double raw_result;
4520 // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
4521 __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
4522 return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
4523}
4524
4525template <class V, HWY_IF_U64(TFromV<V>)>
4526HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> U64ToF64VecWithRoundToOdd(V v) {
4527 __vector double raw_result;
4528 // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
4529 __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
4530 return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
4531}
4532
4533} // namespace detail
4534#endif // HWY_S390X_HAVE_Z14
4535
4536template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4537HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
4538#if HWY_S390X_HAVE_Z14
4539 return DemoteTo(df32, detail::I64ToF64VecWithRoundToOdd(v));
4540#else // VSX
4541 (void)df32;
4542 return Vec32<float>{vec_floate(v.raw)};
4543#endif
4544}
4545
4546template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4547HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
4548#if HWY_S390X_HAVE_Z14
4549 return DemoteTo(df32, detail::I64ToF64VecWithRoundToOdd(v));
4550#else // VSX
4551#if HWY_IS_LITTLE_ENDIAN
4552 const Vec128<float> i64_to_f32{vec_floate(v.raw)};
4553#else
4554 const Vec128<float> i64_to_f32{vec_floato(v.raw)};
4555#endif
4556
4557 const RebindToUnsigned<decltype(df32)> du32;
4558 const Rebind<uint64_t, decltype(df32)> du64;
4559 return Vec64<float>{
4560 BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
4561#endif
4562}
4563
4564template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4565HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
4566#if HWY_S390X_HAVE_Z14
4567 return DemoteTo(df32, detail::U64ToF64VecWithRoundToOdd(v));
4568#else // VSX
4569 (void)df32;
4570 return Vec32<float>{vec_floate(v.raw)};
4571#endif
4572}
4573
4574template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4575HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
4576#if HWY_S390X_HAVE_Z14
4577 return DemoteTo(df32, detail::U64ToF64VecWithRoundToOdd(v));
4578#else // VSX
4579#if HWY_IS_LITTLE_ENDIAN
4580 const Vec128<float> u64_to_f32{vec_floate(v.raw)};
4581#else
4582 const Vec128<float> u64_to_f32{vec_floato(v.raw)};
4583#endif
4584
4585 const RebindToUnsigned<decltype(df32)> du;
4586 const Rebind<uint64_t, decltype(df32)> du64;
4587 return Vec64<float>{
4588 BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
4589#endif
4590}
4591
4592// For already range-limited input [0, 255].
4593template <size_t N>
4594HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
4595 const Rebind<uint16_t, DFromV<decltype(v)>> du16;
4596 const Rebind<uint8_t, decltype(du16)> du8;
4597 return TruncateTo(du8, TruncateTo(du16, v));
4598}
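// Usage sketch (illustrative, not part of this header): packing pixel values
// already clamped to [0, 255] into bytes.
//
//   const hn::Full128<uint32_t> d32;
//   const auto v = hn::Set(d32, 200u);
//   const auto bytes = hn::U8FromU32(v);  // four u8 lanes, each 200
//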
4599// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4600
4601// Note: altivec.h vec_ct* currently contain C casts which trigger
4602// -Wdeprecate-lax-vec-conv-all warnings, so disable them.
4603
4604#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4605template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4606 HWY_IF_V_SIZE_LE_D(D, 8)>
4607HWY_API VFromD<D> ConvertTo(D df32,
4608                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4609 const Rebind<double, decltype(df32)> df64;
4610 return DemoteTo(df32, PromoteTo(df64, v));
4611}
4612template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4613 HWY_IF_V_SIZE_D(D, 16)>
4614HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
4615 const RepartitionToWide<decltype(df32)> df64;
4616
4617 const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
4618 const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
4619 return ConcatEven(df32, vf32_hi, vf32_lo);
4620}
4621#else // Z15 or PPC
4622template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
4623HWY_API VFromD<D> ConvertTo(D /* tag */,
4624 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4625 HWY_DIAGNOSTICS(push)
4626#if HWY_COMPILER_CLANG
4627 HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
4628#endif
4629#if HWY_S390X_HAVE_Z15
4630 return VFromD<D>{vec_float(v.raw)};
4631#else
4632 return VFromD<D>{vec_ctf(v.raw, 0)};
4633#endif
4634 HWY_DIAGNOSTICS(pop)
4635}
4636#endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4637
4638template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
4639 HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4640HWY_API VFromD<D> ConvertTo(D /* tag */,
4641 Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4642 return VFromD<D>{vec_double(v.raw)};
4643}
4644
4645// Truncates (rounds toward zero).
4646#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4647template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4648HWY_API VFromD<D> ConvertTo(D di32,
4649                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4650 const Rebind<int64_t, decltype(di32)> di64;
4651 return DemoteTo(di32, PromoteTo(di64, v));
4652}
4653template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4654HWY_API VFromD<D> ConvertTo(D di32,
4655                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4656 const RepartitionToWide<decltype(di32)> di64;
4657 return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
4658 PromoteUpperTo(di64, v));
4659}
4660#else // Z15 or PPC
4661template <class D, HWY_IF_I32_D(D)>
4662HWY_API VFromD<D> ConvertTo(D /* tag */,
4663 Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4664#if defined(__OPTIMIZE__)
4665 if (detail::IsConstantRawAltivecVect(v.raw)) {
4666 constexpr int32_t kMinI32 = LimitsMin<int32_t>();
4667 constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
4668 return Dup128VecFromValues(
4669 D(),
4670 (v.raw[0] >= -2147483648.0f)
4671 ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
4672 : kMaxI32)
4673 : ((v.raw[0] < 0) ? kMinI32 : 0),
4674 (v.raw[1] >= -2147483648.0f)
4675 ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
4676 : kMaxI32)
4677 : ((v.raw[1] < 0) ? kMinI32 : 0),
4678 (v.raw[2] >= -2147483648.0f)
4679 ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
4680 : kMaxI32)
4681 : ((v.raw[2] < 0) ? kMinI32 : 0),
4682 (v.raw[3] >= -2147483648.0f)
4683 ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
4684 : kMaxI32)
4685 : ((v.raw[3] < 0) ? kMinI32 : 0));
4686 }
4687#endif
4688
4689#if HWY_S390X_HAVE_Z15
4690 // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
4691 // the range of an int32_t
4692 __vector signed int raw_result;
4693 __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4694 return VFromD<D>{raw_result};
4695#else
4696 HWY_DIAGNOSTICS(push)
4697#if HWY_COMPILER_CLANG
4698 HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
4699#endif
4700 return VFromD<D>{vec_cts(v.raw, 0)};
4701 HWY_DIAGNOSTICS(pop)
4702#endif // HWY_S390X_HAVE_Z15
4703}
4704#endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4705
4706template <class D, HWY_IF_I64_D(D)>
4707HWY_API VFromD<D> ConvertTo(D /* tag */,
4708                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
4709#if defined(__OPTIMIZE__)
4710 if (detail::IsConstantRawAltivecVect(v.raw)) {
4711 constexpr int64_t kMinI64 = LimitsMin<int64_t>();
4712 constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
4713 return Dup128VecFromValues(D(),
4714 (v.raw[0] >= -9223372036854775808.0)
4715 ? ((v.raw[0] < 9223372036854775808.0)
4716 ? static_cast<int64_t>(v.raw[0])
4717 : kMaxI64)
4718 : ((v.raw[0] < 0) ? kMinI64 : 0LL),
4719 (v.raw[1] >= -9223372036854775808.0)
4720 ? ((v.raw[1] < 9223372036854775808.0)
4721 ? static_cast<int64_t>(v.raw[1])
4722 : kMaxI64)
4723 : ((v.raw[1] < 0) ? kMinI64 : 0LL));
4724 }
4725#endif
4726
4727 // Use inline assembly to avoid undefined behavior if v[i] is not within the
4728 // range of an int64_t
4729 __vector signed long long raw_result;
4730#if HWY_S390X_HAVE_Z14
4731 __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4732#else
4733 __asm__("xvcvdpsxds %x0,%x1"
4734 : "=wa"(raw_result)
4735 : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4736#endif
4737 return VFromD<D>{raw_result};
4738}
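// Usage sketch (illustrative, not part of this header): both inline-assembly
// paths saturate out-of-range inputs instead of invoking undefined behavior.
//
//   const hn::Full128<double> df64;
//   const hn::RebindToSigned<decltype(df64)> di64;
//   const auto big = hn::ConvertTo(di64, hn::Set(df64, 1e300));
//   // every lane == LimitsMax<int64_t>()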
4739
4740#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4741template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4742HWY_API VFromD<D> ConvertTo(D du32,
4743                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4744 const Rebind<uint64_t, decltype(du32)> du64;
4745 return DemoteTo(du32, PromoteTo(du64, v));
4746}
4747template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4748HWY_API VFromD<D> ConvertTo(D du32,
4749                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4750 const RepartitionToWide<decltype(du32)> du64;
4751 return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
4752 PromoteUpperTo(du64, v));
4753}
4754#else // Z15 or VSX
4755template <class D, HWY_IF_U32_D(D)>
4756HWY_API VFromD<D> ConvertTo(D /* tag */,
4757 Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4758#if defined(__OPTIMIZE__)
4759 if (detail::IsConstantRawAltivecVect(v.raw)) {
4760 constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
4761 return Dup128VecFromValues(
4762 D(),
4763 (v.raw[0] >= 0.0f)
4764 ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
4765 : kMaxU32)
4766 : 0,
4767 (v.raw[1] >= 0.0f)
4768 ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
4769 : kMaxU32)
4770 : 0,
4771 (v.raw[2] >= 0.0f)
4772 ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
4773 : kMaxU32)
4774 : 0,
4775 (v.raw[3] >= 0.0f)
4776 ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
4777 : kMaxU32)
4778 : 0);
4779 }
4780#endif
4781
4782#if HWY_S390X_HAVE_Z15
4783 // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
4784 // the range of an uint32_t
4785 __vector unsigned int raw_result;
4786 __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4787 return VFromD<D>{raw_result};
4788#else // VSX
4789 HWY_DIAGNOSTICS(push)
4790#if HWY_COMPILER_CLANG
4791 HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
4792#endif
4793 VFromD<D> result{vec_ctu(v.raw, 0)};
4794 HWY_DIAGNOSTICS(pop)
4795 return result;
4796#endif // HWY_S390X_HAVE_Z15
4797}
4798#endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4799
4800template <class D, HWY_IF_U64_D(D)>
4801HWY_API VFromD<D> ConvertTo(D /* tag */,
4802 Vec128<double, Rebind<double, D>().MaxLanes()> v) {
4803 HWY_DIAGNOSTICS(push)
4804#if HWY_COMPILER_CLANG
4805 HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
4806#endif
4807
4808#if defined(__OPTIMIZE__)
4809 if (detail::IsConstantRawAltivecVect(v.raw)) {
4810 constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
4811 return Dup128VecFromValues(
4812 D(),
4813 (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
4814 ? static_cast<uint64_t>(v.raw[0])
4815 : kMaxU64)
4816 : 0,
4817 (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
4818 ? static_cast<uint64_t>(v.raw[1])
4819 : kMaxU64)
4820 : 0);
4821 }
4822#endif
4823
4824 // Use inline assembly to avoid undefined behavior if v[i] is not within the
4825 // range of an uint64_t
4826 __vector unsigned long long raw_result;
4827#if HWY_S390X_HAVE_Z14
4828 __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4829#else // VSX
4830 __asm__("xvcvdpuxds %x0,%x1"
4831 : "=wa"(raw_result)
4832 : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4833#endif
4834 return VFromD<D>{raw_result};
4835}
4836
4837// ------------------------------ Floating-point rounding (ConvertTo)
4838
4839// Toward nearest integer, ties to even
4840template <size_t N>
4841HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
4842 return Vec128<float, N>{vec_round(v.raw)};
4843}
4844
4845template <size_t N>
4846HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
4847#if HWY_S390X_HAVE_Z14
4848 return Vec128<double, N>{vec_round(v.raw)};
4849#else
4850 return Vec128<double, N>{vec_rint(v.raw)};
4851#endif
4852}
4853
4854template <size_t N>
4855HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
4856 const DFromV<decltype(v)> d;
4857 const RebindToSigned<decltype(d)> di;
4858 return ConvertTo(di, Round(v));
4859}
4860
4861// Toward zero, aka truncate
4862template <typename T, size_t N, HWY_IF_FLOAT(T)>
4863HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
4864 return Vec128<T, N>{vec_trunc(v.raw)};
4865}
4866
4867// Toward +infinity, aka ceiling
4868template <typename T, size_t N, HWY_IF_FLOAT(T)>
4869HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) {
4870 return Vec128<T, N>{vec_ceil(v.raw)};
4871}
4872
4873// Toward -infinity, aka floor
4874template <typename T, size_t N, HWY_IF_FLOAT(T)>
4875HWY_API Vec128<T, N> Floor(Vec128<T, N> v) {
4876 return Vec128<T, N>{vec_floor(v.raw)};
4877}
4878
4879// ------------------------------ Floating-point classification
4880
4881template <typename T, size_t N>
4882HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
4883 static_assert(IsFloat<T>(), "Only for float");
4884 return v != v;
4885}
4886
4887template <typename T, size_t N>
4888HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
4889 static_assert(IsFloat<T>(), "Only for float");
4890 using TU = MakeUnsigned<T>;
4891 const DFromV<decltype(v)> d;
4892 const RebindToUnsigned<decltype(d)> du;
4893 const VFromD<decltype(du)> vu = BitCast(du, v);
4894 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
4895 return RebindMask(
4896 d,
4897 Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
4898}
4899
4900// Returns whether normal/subnormal/zero.
4901template <typename T, size_t N>
4902HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
4903 static_assert(IsFloat<T>(), "Only for float");
4904 using TU = MakeUnsigned<T>;
4905 const DFromV<decltype(v)> d;
4906 const RebindToUnsigned<decltype(d)> du;
4907 const VFromD<decltype(du)> vu = BitCast(du, v);
4908 // 'Shift left' to clear the sign bit, check for exponent<max.
4909 return RebindMask(
4910 d,
4911 Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
4912}
4913
4914// ================================================== CRYPTO
4915
4916#if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
4917
4918// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4919#ifdef HWY_NATIVE_AES
4920#undef HWY_NATIVE_AES
4921#else
4922#define HWY_NATIVE_AES
4923#endif
4924
4925namespace detail {
4926#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
4927using CipherTag = Full128<uint64_t>;
4928#else
4929using CipherTag = Full128<uint8_t>;
4930#endif // HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
4931using CipherVec = VFromD<CipherTag>;
4932} // namespace detail
4933
4934HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4935 Vec128<uint8_t> round_key) {
4936 const detail::CipherTag dc;
4937 const Full128<uint8_t> du8;
4938#if HWY_IS_LITTLE_ENDIAN
4939 return Reverse(du8,
4940 BitCast(du8, detail::CipherVec{vec_cipher_be(
4941 BitCast(dc, Reverse(du8, state)).raw,
4942 BitCast(dc, Reverse(du8, round_key)).raw)}));
4943#else
4944 return BitCast(du8, detail::CipherVec{vec_cipher_be(
4945 BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
4946#endif
4947}
4948
4949HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
4950 Vec128<uint8_t> round_key) {
4951 const detail::CipherTag dc;
4952 const Full128<uint8_t> du8;
4953#if HWY_IS_LITTLE_ENDIAN
4954 return Reverse(du8,
4955 BitCast(du8, detail::CipherVec{vec_cipherlast_be(
4956 BitCast(dc, Reverse(du8, state)).raw,
4957 BitCast(dc, Reverse(du8, round_key)).raw)}));
4958#else
4959 return BitCast(du8, detail::CipherVec{vec_cipherlast_be(
4960 BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
4961#endif
4962}
4963
4964HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
4965 Vec128<uint8_t> round_key) {
4966 const detail::CipherTag dc;
4967 const Full128<uint8_t> du8;
4968#if HWY_IS_LITTLE_ENDIAN
4969 return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be(
4970 BitCast(dc, Reverse(du8, state)).raw,
4971 Zero(dc).raw)})),
4972 round_key);
4973#else
4974 return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be(
4975 BitCast(dc, state).raw, Zero(dc).raw)}),
4976 round_key);
4977#endif
4978}
4979
4980HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
4981 Vec128<uint8_t> round_key) {
4982 const detail::CipherTag dc;
4983 const Full128<uint8_t> du8;
4984#if HWY_IS_LITTLE_ENDIAN
4985 return Reverse(du8,
4986 BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
4987 BitCast(dc, Reverse(du8, state)).raw,
4988 BitCast(dc, Reverse(du8, round_key)).raw)}));
4989#else
4990 return BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
4991 BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
4992#endif
4993}
4994
4995HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
4996 const Full128<uint8_t> du8;
4997 const auto zero = Zero(du8);
4998
4999 // PPC8/PPC9/PPC10 does not have a single instruction for the AES
5000 // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do.
5001
5002 // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10
5003 // by doing an AESLastRound operation with a zero round_key followed by an
5004 // AESRoundInv operation with a zero round_key.
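 // With zero round keys, AESLastRound applies only SubBytes and ShiftRows,
 // and AESRoundInv applies InvShiftRows, InvSubBytes and then InvMixColumns;
 // the forward SubBytes/ShiftRows cancel their inverses, leaving exactly
 // InvMixColumns(state).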
5005 return AESRoundInv(AESLastRound(state, zero), zero);
5006}
5007
5008template <uint8_t kRcon>
5009HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
5010 constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0,
5011 0, 0, 0, 0, kRcon, 0, 0, 0};
5012 constexpr __vector unsigned char kRotWordShuffle = {
5013 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
5014 const detail::CipherTag dc;
5015 const Full128<uint8_t> du8;
5016 const auto sub_word_result =
5017 BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)});
5018 const auto rot_word_result =
5019 TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle});
5020 return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask});
5021}
5022
5023template <size_t N>
5024HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
5025 Vec128<uint64_t, N> b) {
5026 // NOTE: Lane 1 of both a and b needs to be zeroed out for the
5027 // vec_pmsum_be operation below, as the vec_pmsum_be operation
5028 // does a carryless multiplication of each 64-bit half and then
5029 // adds the two halves using a bitwise XOR operation.
5030
5031 const DFromV<decltype(a)> d;
5032 const auto zero = Zero(d);
5033
5034 using VU64 = __vector unsigned long long;
5035 const VU64 pmsum_result = reinterpret_cast<VU64>(
5036 vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw));
5037
5038#if HWY_IS_LITTLE_ENDIAN
5039 return Vec128<uint64_t, N>{pmsum_result};
5040#else
5041 // Need to swap the two halves of pmsum_result on big-endian targets as
5042 // the upper 64 bits of the carryless multiplication result are in lane 0 of
5043 // pmsum_result and the lower 64 bits of the carryless multiplication result
5044 // are in lane 1 of pmsum_result
5045 return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
5046#endif
5047}
5048
5049template <size_t N>
5050HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
5051 Vec128<uint64_t, N> b) {
5052 // NOTE: Lane 0 of both a and b needs to be zeroed out for the
5053 // vec_pmsum_be operation below, as the vec_pmsum_be operation
5054 // does a carryless multiplication of each 64-bit half and then
5055 // adds the two halves using a bitwise XOR operation.
5056
5057 const DFromV<decltype(a)> d;
5058 const auto zero = Zero(d);
5059
5060 using VU64 = __vector unsigned long long;
5061 const VU64 pmsum_result = reinterpret_cast<VU64>(
5062 vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));
5063
5064#if HWY_IS_LITTLE_ENDIAN
5065 return Vec128<uint64_t, N>{pmsum_result};
5066#else
5067 // Need to swap the two halves of pmsum_result on big-endian targets as
5068 // the upper 64 bits of the carryless multiplication result are in lane 0 of
5069 // pmsum_result and the lower 64 bits of the carryless multiplication result
5070 // are in lane 1 of pmsum_result
5071 return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
5072#endif
5073}
5074
5075#endif // !defined(HWY_DISABLE_PPC8_CRYPTO)
5076
5077// ================================================== MISC
5078
5079// ------------------------------ LoadMaskBits (TestBit)
5080
5081namespace detail {
5082
5083template <class D, HWY_IF_T_SIZE_D(D, 1)>
5084HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
5085#if HWY_PPC_HAVE_10
5086 const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)};
5087
5088#if HWY_IS_LITTLE_ENDIAN
5089 return MFromD<D>{MaskFromVec(mask_vec).raw};
5090#else
5091 return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw};
5092#endif // HWY_IS_LITTLE_ENDIAN
5093
5094#else // PPC9 or earlier
5095 const Full128<uint8_t> du8;
5096 const Full128<uint16_t> du16;
5097 const Vec128<uint8_t> vbits =
5098 BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
5099
5100 // Replicate bytes 8x such that each byte contains the bit that governs it.
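 // For example, with mask_bits = 0b101: the low byte of mask_bits (0x05) is
 // replicated across result bytes 0..7 and tested against {1, 2, 4, ...}, so
 // only mask bytes 0 and 2 become all-ones; the high byte of mask_bits
 // governs bytes 8..15 the same way.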
5101#if HWY_IS_LITTLE_ENDIAN
5102 const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0,
5103 1, 1, 1, 1, 1, 1, 1, 1};
5104#else
5105 const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1,
5106 0, 0, 0, 0, 0, 0, 0, 0};
5107#endif // HWY_IS_LITTLE_ENDIAN
5108
5109 const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)};
5110 const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128,
5111 1, 2, 4, 8, 16, 32, 64, 128};
5112 return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw};
5113#endif // HWY_PPC_HAVE_10
5114}
5115
5116template <class D, HWY_IF_T_SIZE_D(D, 2)>
5117HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
5118#if HWY_PPC_HAVE_10
5119 const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)};
5120
5121#if HWY_IS_LITTLE_ENDIAN
5122 return MFromD<D>{MaskFromVec(mask_vec).raw};
5123#else
5124 return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw};
5125#endif // HWY_IS_LITTLE_ENDIAN
5126
5127#else // PPC9 or earlier
5128 const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128};
5129 const auto vmask_bits =
5130 Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits));
5131 return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw};
5132#endif // HWY_PPC_HAVE_10
5133}
5134
5135template <class D, HWY_IF_T_SIZE_D(D, 4)>
5136HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
5137#if HWY_PPC_HAVE_10
5138 const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)};
5139
5140#if HWY_IS_LITTLE_ENDIAN
5141 return MFromD<D>{MaskFromVec(mask_vec).raw};
5142#else
5143 return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw};
5144#endif // HWY_IS_LITTLE_ENDIAN
5145
5146#else // PPC9 or earlier
5147 const __vector unsigned int kBit = {1, 2, 4, 8};
5148 const auto vmask_bits =
5149 Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits));
5150 return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw};
5151#endif // HWY_PPC_HAVE_10
5152}
5153
5154template <class D, HWY_IF_T_SIZE_D(D, 8)>
5155HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
5156#if HWY_PPC_HAVE_10
5157 const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)};
5158
5159#if HWY_IS_LITTLE_ENDIAN
5160 return MFromD<D>{MaskFromVec(mask_vec).raw};
5161#else
5162 return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw};
5163#endif // HWY_IS_LITTLE_ENDIAN
5164
5165#else // PPC9 or earlier
5166 const __vector unsigned long long kBit = {1, 2};
5167 const auto vmask_bits =
5168 Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits));
5169 return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw};
5170#endif // HWY_PPC_HAVE_10
5171}
5172
5173} // namespace detail
5174
5175// `p` points to at least 8 readable bytes, not all of which need be valid.
5176template <class D, HWY_IF_LANES_LE_D(D, 8)>
5177HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
5178 // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t
5179 uint64_t mask_bits = bits[0];
5180
5181 constexpr size_t kN = MaxLanes(d);
5182 if (kN < 8) mask_bits &= (1u << kN) - 1;
5183
5184 return detail::LoadMaskBits128(d, mask_bits);
5185}
5186
5187template <class D, HWY_IF_LANES_D(D, 16)>
5188HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
5189 // First, copy the mask bits to a uint16_t as there are at most
5190 // 16 lanes in a vector.
5191
5192 // Copying the mask bits to a uint16_t first will also ensure that the
5193 // mask bits are loaded into the lower 16 bits on big-endian PPC targets.
5194 uint16_t u16_mask_bits;
5195 CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);
5196
5197#if HWY_IS_LITTLE_ENDIAN
5198 return detail::LoadMaskBits128(d, u16_mask_bits);
5199#else
5200 // On big-endian targets, u16_mask_bits need to be byte swapped as bits
5201 // contains the mask bits in little-endian byte order
5202
5203 // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a
5204 // single lhbrx instruction on big-endian PPC targets when optimizations
5205 // are enabled.
5206#if HWY_HAS_BUILTIN(__builtin_bswap16)
5207 return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits));
5208#else
5209 return detail::LoadMaskBits128(
5210 d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));
5211#endif
5212#endif
5213}
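// Illustrative sketch (assumed usage, not original code): materializing a
// mask from packed bits and zeroing the unselected lanes. The bit patterns
// are arbitrary assumptions; bit i of bits[] governs lane i.
static HWY_MAYBE_UNUSED Vec128<uint8_t> LoadMaskBitsSketch(Vec128<uint8_t> v) {
  const Full128<uint8_t> d;
  // 0xA5 selects lanes 0, 2, 5, 7; 0x0F selects lanes 8..11.
  const uint8_t bits[8] = {0xA5, 0x0F};
  return IfThenElseZero(LoadMaskBits(d, bits), v);
}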
5214
5215template <typename T>
5216struct CompressIsPartition {
5217 // generic_ops-inl does not guarantee IsPartition for 8-bit.
5218 enum { value = (sizeof(T) != 1) };
5219};
5220
5221// ------------------------------ Dup128MaskFromMaskBits
5222
5223template <class D>
5224HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5225 constexpr size_t kN = MaxLanes(d);
5226 if (kN < 8) mask_bits &= (1u << kN) - 1;
5227 return detail::LoadMaskBits128(d, mask_bits);
5228}
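// Sketch of the difference from LoadMaskBits (spelled out as an assumption):
// Dup128MaskFromMaskBits takes the bits as an immediate and repeats them per
// 128-bit block; with the 128-bit vectors of this target the two agree.
static HWY_MAYBE_UNUSED Mask128<uint32_t> DupMaskSketch() {
  const Full128<uint32_t> d;
  return Dup128MaskFromMaskBits(d, 0x5u);  // lanes 0 and 2 active
}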
5229
5230// ------------------------------ StoreMaskBits
5231
5232namespace detail {
5233
5234#if !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5235// fallback for missing vec_extractm
5236template <size_t N>
5237HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
5238 __vector unsigned char bit_shuffle) {
5239 // clang POWER8 and 9 targets appear to differ in their return type of
5240 // vec_vbpermq: unsigned or signed, so cast to avoid a warning.
5241 using VU64 = detail::Raw128<uint64_t>::type;
5242 const Vec128<uint64_t> extracted{
5243 reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
5244 return extracted.raw[HWY_IS_LITTLE_ENDIAN];
5245}
5246
5247#endif  // !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5248
5249#if HWY_S390X_HAVE_Z14
5250template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
5251HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
5252 const DFromM<decltype(mask)> d;
5253 const Repartition<uint8_t, decltype(d)> du8;
5254 const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5255
5256 return ReduceSum(
5257 du8, And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128,
5258 1, 2, 4, 8, 16, 32, 64, 128)));
5259}
5260
5261template <typename T>
5262HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
5263 const DFromM<decltype(mask)> d;
5264 const Repartition<uint8_t, decltype(d)> du8;
5265 const Repartition<uint64_t, decltype(d)> du64;
5266 const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5267
5268 const auto mask_bytes = SumsOf8(
5269 And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
5270 4, 8, 16, 32, 64, 128)));
5271
5272 const Rebind<uint8_t, decltype(du64)> du8_2;
5273 const Repartition<uint16_t, decltype(du8_2)> du16_1;
5274 return GetLane(
5275 BitCast(du16_1, TruncateTo(du8_2, Reverse2(du64, mask_bytes))));
5276}
5277#else
5278template <typename T, size_t N>
5279HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
5280 const DFromM<decltype(mask)> d;
5281 const Repartition<uint8_t, decltype(d)> du8;
5282 const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5283
5284#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5285 return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
5286#else // PPC8, PPC9, or big-endian PPC10
5287 const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
5288 56, 48, 40, 32, 24, 16, 8, 0};
5289 return ExtractSignBits(sign_bits, kBitShuffle);
5290#endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5291}
5292#endif // HWY_S390X_HAVE_Z14
5293
5294template <typename T, size_t N>
5295HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
5296 const DFromM<decltype(mask)> d;
5297 const RebindToUnsigned<decltype(d)> du;
5298
5299#if HWY_S390X_HAVE_Z14
5300 const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5301 return ReduceSum(
5302 du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128)));
5303#else // VSX
5304 const Repartition<uint8_t, decltype(d)> du8;
5305 const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5306
5307#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5308 return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5309#else // PPC8, PPC9, or big-endian PPC10
5310 (void)du;
5311#if HWY_IS_LITTLE_ENDIAN
5312 const __vector unsigned char kBitShuffle = {
5313 112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
5314#else
5315 const __vector unsigned char kBitShuffle = {
5316 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
5317#endif
5318 return ExtractSignBits(sign_bits, kBitShuffle);
5319#endif // HWY_PPC_HAVE_10
5320#endif // HWY_S390X_HAVE_Z14
5321}
5322
5323template <typename T, size_t N>
5324HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
5325 const DFromM<decltype(mask)> d;
5326 const RebindToUnsigned<decltype(d)> du;
5327
5328#if HWY_S390X_HAVE_Z14
5329 const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5330 return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8)));
5331#else // VSX
5332 const Repartition<uint8_t, decltype(d)> du8;
5333 const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5334
5335#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5336 return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5337#else // PPC8, PPC9, or big-endian PPC10
5338 (void)du;
5339#if HWY_IS_LITTLE_ENDIAN
5340 const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
5341 128, 128, 128, 128, 128, 128,
5342 128, 128, 128, 128};
5343#else
5344 const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
5345 128, 128, 128, 128, 128, 128,
5346 96, 64, 32, 0};
5347#endif
5348 return ExtractSignBits(sign_bits, kBitShuffle);
5349#endif // HWY_PPC_HAVE_10
5350#endif // HWY_S390X_HAVE_Z14
5351}
5352
5353template <typename T, size_t N>
5354HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
5355 const DFromM<decltype(mask)> d;
5356 const RebindToUnsigned<decltype(d)> du;
5357
5358#if HWY_S390X_HAVE_Z14
5359 const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5360 return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2)));
5361#else // VSX
5362 const Repartition<uint8_t, decltype(d)> du8;
5363 const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5364
5365#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5366 return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5367#else
5368 (void)du;
5369#if HWY_IS_LITTLE_ENDIAN
5370 const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
5371 128, 128, 128, 128, 128, 128,
5372 128, 128, 128, 128};
5373#else
5374 const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
5375 128, 128, 128, 128, 128, 128,
5376 128, 128, 64, 0};
5377#endif
5378 return ExtractSignBits(sign_bits, kBitShuffle);
5379#endif // HWY_PPC_HAVE_10
5380#endif // HWY_S390X_HAVE_Z14
5381}
5382
5383// Returns the lowest N of the mask bits.
5384template <typename T, size_t N>
5385constexpr uint64_t OnlyActive(uint64_t mask_bits) {
5386 return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
5387}
5388
5389template <typename T, size_t N>
5390HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
5391 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5392}
5393
5394} // namespace detail
5395
5396// `p` points to at least 8 writable bytes.
5397template <class D, HWY_IF_LANES_LE_D(D, 8)>
5398HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
5399 // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
5400 // to a uint8_t and store the result in bits[0].
5401 bits[0] = static_cast<uint8_t>(detail::BitsFromMask(mask));
5402 return sizeof(uint8_t);
5403}
5404
5405template <class D, HWY_IF_LANES_D(D, 16)>
5406HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
5407 const auto mask_bits = detail::BitsFromMask(mask);
5408
5409 // First convert mask_bits to a uint16_t as we only want to store
5410 // the lower 16 bits of mask_bits as there are 16 lanes in mask.
5411
5412 // Converting mask_bits to a uint16_t first will also ensure that
5413 // the lower 16 bits of mask_bits are stored instead of the upper 16 bits
5414 // of mask_bits on big-endian PPC targets.
5415#if HWY_IS_LITTLE_ENDIAN
5416 const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits);
5417#else
5418 // On big-endian targets, the bytes of mask_bits need to be swapped
5419 // as StoreMaskBits expects the mask bits to be stored in little-endian
5420 // byte order.
5421
5422 // GCC will also optimize the byte swap and CopyBytes operations below
5423 // to a single sthbrx instruction when optimizations are enabled on
5424 // big-endian PPC targets
5425#if HWY_HAS_BUILTIN(__builtin_bswap16)
5426 const uint16_t u16_mask_bits =
5427 __builtin_bswap16(static_cast<uint16_t>(mask_bits));
5428#else
5429 const uint16_t u16_mask_bits = static_cast<uint16_t>(
5430 (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8));
5431#endif
5432#endif
5433
5434 CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits);
5435 return sizeof(uint16_t);
5436}
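// Round-trip sketch (illustrative, not original code): StoreMaskBits packs
// one bit per lane in little-endian order, so feeding the bytes back to
// LoadMaskBits recovers the mask. The helper name is an assumption.
template <class D>
static MFromD<D> MaskBitsRoundTripSketch(D d, MFromD<D> m) {
  uint8_t bits[8] = {0};
  (void)StoreMaskBits(d, m, bits);  // returns the number of bytes written
  return LoadMaskBits(d, bits);     // == m
}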
5437
5438// ------------------------------ Mask testing
5439
5440template <class D, HWY_IF_V_SIZE_D(D, 16)>
5441HWY_API bool AllFalse(D d, MFromD<D> mask) {
5442 const RebindToUnsigned<decltype(d)> du;
5443 return static_cast<bool>(
5444 vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
5445}
5446
5447template <class D, HWY_IF_V_SIZE_D(D, 16)>
5448HWY_API bool AllTrue(D d, MFromD<D> mask) {
5449 const RebindToUnsigned<decltype(d)> du;
5450 using TU = TFromD<decltype(du)>;
5451 return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
5452 Set(du, hwy::LimitsMax<TU>()).raw));
5453}
5454
5455template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
5456HWY_API bool AllFalse(D d, MFromD<D> mask) {
5457 const Full128<TFromD<D>> d_full;
5458 constexpr size_t kN = MaxLanes(d);
5459 return AllFalse(d_full,
5460 And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
5461}
5462
5463template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
5464HWY_API bool AllTrue(D d, MFromD<D> mask) {
5465 const Full128<TFromD<D>> d_full;
5466 constexpr size_t kN = MaxLanes(d);
5467 return AllTrue(
5468 d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
5469}
5470
5471template <class D>
5472HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
5473 return PopCount(detail::BitsFromMask(mask));
5474}
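// Small composition sketch of the mask tests above (an assumed helper, not
// original code): count the selected lanes only when at least one is set.
template <class D>
static size_t CountIfAnySketch(D d, MFromD<D> m) {
  return AllFalse(d, m) ? 0 : CountTrue(d, m);
}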
5475
5476#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5477namespace detail {
5478
5479template <class V>
5480static HWY_INLINE size_t VsxCntlzLsbb(V v) {
5481#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
5482 HWY_IS_LITTLE_ENDIAN
5483 // Use inline assembly to work around bug in GCC 11 and earlier on
5484 // little-endian PPC9
5485 int idx;
5486 __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
5487 return static_cast<size_t>(idx);
5488#else
5489 return static_cast<size_t>(vec_cntlz_lsbb(v.raw));
5490#endif
5491}
5492
5493template <class V>
5494static HWY_INLINE size_t VsxCnttzLsbb(V v) {
5495#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
5496 HWY_IS_LITTLE_ENDIAN
5497 // Use inline assembly to work around bug in GCC 11 and earlier on
5498 // little-endian PPC9
5499 int idx;
5500 __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
5501 return static_cast<size_t>(idx);
5502#else
5503 return static_cast<size_t>(vec_cnttz_lsbb(v.raw));
5504#endif
5505}
5506
5507} // namespace detail
5508#endif
5509
5510template <class D, typename T = TFromD<D>>
5511HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
5512// For little-endian PPC10, BitsFromMask is already efficient.
5513#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5514 if (detail::IsFull(d)) {
5515 const Repartition<uint8_t, D> d8;
5516 const auto bytes = BitCast(d8, VecFromMask(d, mask));
5517 return detail::VsxCntlzLsbb(bytes) / sizeof(T);
5518 }
5519#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5520 (void)d;
5521 return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask));
5522}
5523
5524template <class D, typename T = TFromD<D>>
5525HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
5526// For little-endian PPC10, BitsFromMask is already efficient.
5527#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5528 constexpr size_t kN = 16 / sizeof(T);
5529 if (detail::IsFull(d)) {
5530 const Repartition<uint8_t, D> d8;
5531 const auto bytes = BitCast(d8, VecFromMask(d, mask));
5532 const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T);
5533 return idx == kN ? -1 : static_cast<intptr_t>(idx);
5534 }
5535#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5536 (void)d;
5537 const uint64_t mask_bits = detail::BitsFromMask(mask);
5538 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
5539}
5540
5541template <class D, typename T = TFromD<D>>
5542HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
5543// For little-endian PPC10, BitsFromMask is already efficient.
5544#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5545 if (detail::IsFull(d)) {
5546 const Repartition<uint8_t, D> d8;
5547 const auto bytes = BitCast(d8, VecFromMask(d, mask));
5548 const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
5549 return 16 / sizeof(T) - 1 - idx;
5550 }
5551#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5552 (void)d;
5553 return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask));
5554}
5555
5556template <class D, typename T = TFromD<D>>
5557HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
5558// For little-endian PPC10, BitsFromMask is already efficient.
5559#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5560 constexpr size_t kN = 16 / sizeof(T);
5561 if (detail::IsFull(d)) {
5562 const Repartition<uint8_t, D> d8;
5563 const auto bytes = BitCast(d8, VecFromMask(d, mask));
5564 const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
5565 return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
5566 }
5567#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5568 (void)d;
5569 const uint64_t mask_bits = detail::BitsFromMask(mask);
5570 return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
5571 : -1;
5572}
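// Sketch (assumed usage): FindFirstTrue/FindLastTrue return -1 for an
// all-false mask, whereas the FindKnown* variants above require at least one
// set lane and can skip that check. The helper name is illustrative.
template <class D>
static bool TrueLaneRangeSketch(D d, MFromD<D> m, intptr_t* first,
                                intptr_t* last) {
  *first = FindFirstTrue(d, m);
  *last = FindLastTrue(d, m);
  return *first >= 0;  // if true, *last >= *first also holds
}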
5573
5574// ------------------------------ Compress, CompressBits
5575
5576namespace detail {
5577
5578#if HWY_PPC_HAVE_10
5579template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 1)>
5580HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
5581 constexpr unsigned kGenPcvmMode =
5582 (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
5583
5584 // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
5585 // compiler bugs on little-endian PPC10
5586 typename detail::Raw128<TFromD<D>>::type idx;
5587 __asm__("xxgenpcvbm %x0, %1, %2"
5588 : "=wa"(idx)
5589 : "v"(mask.raw), "i"(kGenPcvmMode));
5590 return VFromD<decltype(d)>{idx};
5591}
5592template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
5593HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
5594 constexpr unsigned kGenPcvmMode =
5595 (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
5596
5597 // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
5598 // compiler bugs on little-endian PPC10
5599 typename detail::Raw128<TFromD<D>>::type idx;
5600 __asm__("xxgenpcvhm %x0, %1, %2"
5601 : "=wa"(idx)
5602 : "v"(mask.raw), "i"(kGenPcvmMode));
5603 return VFromD<decltype(d)>{idx};
5604}
5605template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
5606HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
5607 constexpr unsigned kGenPcvmMode =
5608 (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
5609
5610 // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
5611 // compiler bugs on little-endian PPC10
5612 typename detail::Raw128<TFromD<D>>::type idx;
5613 __asm__("xxgenpcvwm %x0, %1, %2"
5614 : "=wa"(idx)
5615 : "v"(mask.raw), "i"(kGenPcvmMode));
5616 return VFromD<decltype(d)>{idx};
5617}
5618#endif
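// Hedged model of the xxgenpcv immediate used above, mirroring kGenPcvmMode:
// bit 0 selects compress (vs. expand) index generation and bit 1 selects
// little-endian index order. The helper name is illustrative.
constexpr unsigned GenPcvmModeSketch(bool is_compress, bool little_endian) {
  return (is_compress ? 1u : 0u) | (little_endian ? 2u : 0u);
}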
5619
5620// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
5621template <class D, HWY_IF_T_SIZE_D(D, 2)>
5622HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
5623 HWY_DASSERT(mask_bits < 256);
5624 const Rebind<uint8_t, decltype(d)> d8;
5625 const Twice<decltype(d8)> d8t;
5626 const RebindToUnsigned<decltype(d)> du;
5627
5628 // To reduce cache footprint, store lane indices and convert to byte indices
5629 // (2*lane + 0..1), with the doubling baked into the table. It's not clear
5630 // that the additional cost of unpacking nibbles is worthwhile.
5631 alignas(16) static constexpr uint8_t table[2048] = {
5632 // PrintCompress16x8Tables
5633 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5634 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5635 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
5636 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5637 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
5638 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
5639 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
5640 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5641 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
5642 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
5643 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
5644 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
5645 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
5646 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
5647 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
5648 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5649 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
5650 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
5651 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
5652 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
5653 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
5654 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
5655 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
5656 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
5657 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
5658 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
5659 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
5660 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
5661 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
5662 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
5663 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
5664 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5665 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
5666 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
5667 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
5668 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
5669 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
5670 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
5671 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
5672 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
5673 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
5674 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
5675 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
5676 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
5677 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
5678 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
5679 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
5680 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
5681 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
5682 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
5683 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
5684 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
5685 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
5686 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
5687 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
5688 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
5689 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
5690 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
5691 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
5692 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
5693 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
5694 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
5695 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
5696 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5697 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
5698 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
5699 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
5700 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
5701 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
5702 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
5703 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
5704 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
5705 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
5706 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
5707 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
5708 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
5709 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
5710 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
5711 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
5712 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
5713 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
5714 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
5715 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
5716 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
5717 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
5718 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
5719 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
5720 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
5721 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
5722 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
5723 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
5724 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
5725 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
5726 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
5727 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
5728 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
5729 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
5730 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
5731 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
5732 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
5733 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
5734 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
5735 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
5736 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
5737 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
5738 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
5739 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
5740 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
5741 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
5742 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
5743 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
5744 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
5745 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
5746 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
5747 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
5748 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
5749 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
5750 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
5751 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
5752 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
5753 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
5754 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
5755 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
5756 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
5757 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
5758 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
5759 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
5760 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5761
5762 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
5763 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
5764 constexpr uint16_t kPairIndexIncrement =
5765 HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
5766
5767 return BitCast(d, pairs + Set(du, kPairIndexIncrement));
5768}
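// Scalar model of the pair-index trick above (illustrative): duplicating a
// byte index b into a u16 (as ZipLower does) and adding the little-endian
// increment 0x0100 yields the byte offsets {b, b+1} that select one u16 lane.
static HWY_MAYBE_UNUSED uint16_t PairIndexSketch(uint8_t byte_idx) {
  const uint16_t pair = static_cast<uint16_t>(byte_idx * 0x0101u);
  return static_cast<uint16_t>(pair + 0x0100u);  // bytes {b, b+1} on LE
}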
5769
5770template <class D, HWY_IF_T_SIZE_D(D, 2)>
5771HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
5772 HWY_DASSERT(mask_bits < 256);
5773 const Rebind<uint8_t, decltype(d)> d8;
5774 const Twice<decltype(d8)> d8t;
5775 const RebindToUnsigned<decltype(d)> du;
5776
5777 // To reduce cache footprint, store lane indices and convert to byte indices
5778 // (2*lane + 0..1), with the doubling baked into the table. It's not clear
5779 // that the additional cost of unpacking nibbles is worthwhile.
5780 alignas(16) static constexpr uint8_t table[2048] = {
5781 // PrintCompressNot16x8Tables
5782 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
5783 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
5784 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
5785 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
5786 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
5787 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
5788 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
5789 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
5790 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
5791 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
5792 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
5793 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
5794 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
5795 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
5796 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
5797 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
5798 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
5799 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
5800 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
5801 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
5802 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
5803 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
5804 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
5805 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
5806 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
5807 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
5808 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
5809 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
5810 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
5811 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
5812 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
5813 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
5814 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
5815 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
5816 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
5817 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
5818 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
5819 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
5820 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
5821 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
5822 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
5823 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
5824 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
5825 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
5826 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
5827 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
5828 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
5829 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
5830 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
5831 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
5832 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
5833 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
5834 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
5835 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
5836 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
5837 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
5838 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
5839 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
5840 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
5841 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
5842 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
5843 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
5844 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
5845 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
5846 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
5847 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
5848 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
5849 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
5850 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
5851 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
5852 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
5853 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
5854 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
5855 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
5856 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
5857 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
5858 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
5859 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
5860 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
5861 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
5862 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
5863 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
5864 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
5865 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
5866 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
5867 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
5868 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
5869 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
5870 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
5871 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
5872 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
5873 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
5874 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
5875 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
5876 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
5877 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
5878 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
5879 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
5880 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
5881 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
5882 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
5883 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
5884 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
5885 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
5886 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
5887 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
5888 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
5889 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
5890 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
5891 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
5892 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
5893 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
5894 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
5895 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
5896 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
5897 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
5898 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
5899 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
5900 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
5901 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
5902 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
5903 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
5904 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
5905 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
5906 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
5907 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
5908 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
5909 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
5910
5911 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
5912 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
5913 constexpr uint16_t kPairIndexIncrement =
5914 HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
5915
5916 return BitCast(d, pairs + Set(du, kPairIndexIncrement));
5917}
5918
5919template <class D, HWY_IF_T_SIZE_D(D, 4)>
5920HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
5921 HWY_DASSERT(mask_bits < 16);
5922
5923 // There are only 4 lanes, so we can afford to load the index vector directly.
5924 alignas(16) static constexpr uint8_t u8_indices[256] = {
5925 // PrintCompress32x4Tables
5926 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5927 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5928 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
5929 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5930 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
5931 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
5932 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
5933 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5934 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
5935 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
5936 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
5937 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
5938 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
5939 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
5940 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5941 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5942
5943 const Repartition<uint8_t, decltype(d)> d8;
5944 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5945}
5946
5947template <class D, HWY_IF_T_SIZE_D(D, 4)>
5948HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
5949 HWY_DASSERT(mask_bits < 16);
5950
5951 // There are only 4 lanes, so we can afford to load the index vector directly.
5952 alignas(16) static constexpr uint8_t u8_indices[256] = {
5953 // PrintCompressNot32x4Tables
5954 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5955 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5956 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5957 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5958 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5959 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5960 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5961 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5962 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5963 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5964 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5965 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5966 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5967 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5968 12, 13, 14, 15};
5969
5970 const Repartition<uint8_t, decltype(d)> d8;
5971 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5972}
5973
5974template <class D, HWY_IF_T_SIZE_D(D, 8)>
5975HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
5976 HWY_DASSERT(mask_bits < 4);
5977
5978 // There are only 2 lanes, so we can afford to load the index vector directly.
5979 alignas(16) static constexpr uint8_t u8_indices[64] = {
5980 // PrintCompress64x2Tables
5981 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5982 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5983 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5984 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5985
5986 const Repartition<uint8_t, decltype(d)> d8;
5987 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
5988}
5989
5990template <class D, HWY_IF_T_SIZE_D(D, 8)>
5991HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
5992 HWY_DASSERT(mask_bits < 4);
5993
5994 // There are only 2 lanes, so we can afford to load the index vector directly.
5995 alignas(16) static constexpr uint8_t u8_indices[64] = {
5996 // PrintCompressNot64x2Tables
5997 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5998 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5999 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6000 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6001
6002 const Repartition<uint8_t, decltype(d)> d8;
6003 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6004}
6005
6006template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
6007HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
6008 const DFromV<decltype(v)> d;
6009 const RebindToUnsigned<decltype(d)> du;
6010
6011 HWY_DASSERT(mask_bits < (1ull << N));
6012 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
6013 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6014}
6015
6016template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
6017HWY_INLINE Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
6018 const DFromV<decltype(v)> d;
6019 const RebindToUnsigned<decltype(d)> du;
6020
6021 HWY_DASSERT(mask_bits < (1ull << N));
6022 const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
6023 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6024}
6025
6026} // namespace detail
6027
6028// Single lane: no-op
6029template <typename T>
6030HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6031 return v;
6032}
6033
6034// Two lanes: conditional swap
6035template <typename T, HWY_IF_T_SIZE(T, 8)>
6036HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6037 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
6038 const Full128<T> d;
6039 const Vec128<T> m = VecFromMask(d, mask);
6040 const Vec128<T> maskL = DupEven(m);
6041 const Vec128<T> maskH = DupOdd(m);
6042 const Vec128<T> swap = AndNot(maskL, maskH);
6043 return IfVecThenElse(swap, Shuffle01(v), v);
6044}
6045
6046#if HWY_PPC_HAVE_10
6047#ifdef HWY_NATIVE_COMPRESS8
6048#undef HWY_NATIVE_COMPRESS8
6049#else
6050#define HWY_NATIVE_COMPRESS8
6051#endif
6052
6053// General case, 1 byte
6054template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
6055HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6056 const DFromV<decltype(v)> d;
6057 return TableLookupBytes(
6058 v, detail::CompressOrExpandIndicesFromMask<true>(d, mask));
6059}
6060#endif
6061
6062// General case, 2 or 4 bytes
6063template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
6064HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6065 return detail::CompressBits(v, detail::BitsFromMask(mask));
6066}
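// Usage sketch (assumed helper, not original code): Compress moves the
// mask=1 lanes to the front in order; because CompressIsPartition holds for
// these lane sizes, the tail holds the unselected lanes. Callers typically
// pair it with CountTrue, as here. out must have Lanes(d) writable elements.
static HWY_MAYBE_UNUSED size_t CompressSketch(Vec128<uint32_t> v,
                                              Mask128<uint32_t> m,
                                              uint32_t* out) {
  const Full128<uint32_t> d;
  const size_t n = CountTrue(d, m);
  StoreU(Compress(v, m), d, out);  // out[0, n) are the selected lanes
  return n;
}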
6067
6068// ------------------------------ CompressNot
6069
6070// Single lane: no-op
6071template <typename T>
6072HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6073 return v;
6074}
6075
6076// Two lanes: conditional swap
6077template <typename T, HWY_IF_T_SIZE(T, 8)>
6078HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
6079 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
6080 const Full128<T> d;
6081 const Vec128<T> m = VecFromMask(d, mask);
6082 const Vec128<T> maskL = DupEven(m);
6083 const Vec128<T> maskH = DupOdd(m);
6084 const Vec128<T> swap = AndNot(maskH, maskL);
6085 return IfVecThenElse(swap, Shuffle01(v), v);
6086}
6087
6088#if HWY_PPC_HAVE_10
6089// General case, 1 byte
6090template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
6091HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6092 const DFromV<decltype(v)> d;
6093 return TableLookupBytes(
6094 v, detail::CompressOrExpandIndicesFromMask<true>(d, Not(mask)));
6095}
6096#endif
6097
6098// General case, 2 or 4 bytes
6099template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
6100HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6101 // For partial vectors, we cannot pull the Not() into the table because
6102 // BitsFromMask clears the upper bits.
6103 if (N < 16 / sizeof(T)) {
6104 return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
6105 }
6106 return detail::CompressNotBits(v, detail::BitsFromMask(mask));
6107}
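// Equivalence sketch (an assumption spelled out for clarity): for these lane
// sizes, CompressNot(v, m) computes the same result as Compress(v, Not(m)),
// but full vectors can use the Not tables directly instead of negating m.
template <typename T, size_t N>
static Vec128<T, N> CompressNotViaNotSketch(Vec128<T, N> v, Mask128<T, N> m) {
  return Compress(v, Not(m));
}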
6108
6109// ------------------------------ CompressBlocksNot
6110HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6111 Mask128<uint64_t> /* m */) {
6112 return v;
6113}
6114
6115#if HWY_PPC_HAVE_10
6116template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
6117HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6118 const uint8_t* HWY_RESTRICT bits) {
6119 const DFromV<decltype(v)> d;
6120 return Compress(v, LoadMaskBits(d, bits));
6121}
6122#endif
6123
6124template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
6125HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6126 const uint8_t* HWY_RESTRICT bits) {
6127 // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
6128 // convert bits[0] to a uint64_t
6129 uint64_t mask_bits = bits[0];
6130 if (N < 8) {
6131 mask_bits &= (1ull << N) - 1;
6132 }
6133
6134 return detail::CompressBits(v, mask_bits);
6135}
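// Sketch of compressing directly from packed mask bytes (illustrative; the
// pattern 0x2D selects lanes 0, 2, 3 and 5), avoiding a separate
// LoadMaskBits call.
static HWY_MAYBE_UNUSED Vec128<uint16_t> CompressBitsSketch(
    Vec128<uint16_t> v) {
  const uint8_t bits[8] = {0x2D};
  return CompressBits(v, bits);
}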
6136
6137// ------------------------------ CompressStore, CompressBitsStore
6138
6139#if HWY_PPC_HAVE_10
6140template <class D, HWY_IF_T_SIZE_D(D, 1)>
6141HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
6142 TFromD<D>* HWY_RESTRICT unaligned) {
6143 const size_t count = CountTrue(d, m);
6144 const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
6145 const auto compressed = TableLookupBytes(v, indices);
6146 StoreU(compressed, d, unaligned);
6147 return count;
6148}
6149#endif
6150
6151template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6152HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
6153 TFromD<D>* HWY_RESTRICT unaligned) {
6154 const RebindToUnsigned<decltype(d)> du;
6155
6156 const uint64_t mask_bits = detail::BitsFromMask(m);
6157 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
6158 const size_t count = PopCount(mask_bits);
6159
6160 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
6161 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6162 StoreU(compressed, d, unaligned);
6163 return count;
6164}
6165
6166#if HWY_PPC_HAVE_10
6167template <class D, HWY_IF_T_SIZE_D(D, 1)>
6168HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
6169 TFromD<D>* HWY_RESTRICT unaligned) {
6170 const size_t count = CountTrue(d, m);
6171 const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
6172 const auto compressed = TableLookupBytes(v, indices);
6173 StoreN(compressed, d, unaligned, count);
6174 return count;
6175}
6176#endif
6177
6178template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6179HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
6180 TFromD<D>* HWY_RESTRICT unaligned) {
6181 const RebindToUnsigned<decltype(d)> du;
6182
6183 const uint64_t mask_bits = detail::BitsFromMask(m);
6184 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
6185 const size_t count = PopCount(mask_bits);
6186
6187 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
6188 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6189#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
6190 StoreN(compressed, d, unaligned, count);
6191#else
6192 BlendedStore(compressed, FirstN(d, count), d, unaligned);
6193#endif
6194 return count;
6195}
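// A hedged sketch of the filtering loop these stores enable: append only the
// lanes satisfying a predicate to a dense output. The names, the predicate
// and the omitted scalar remainder handling are assumptions.
static HWY_MAYBE_UNUSED size_t FilterPositiveSketch(const int32_t* in,
                                                    size_t count,
                                                    int32_t* out) {
  const Full128<int32_t> d;
  size_t written = 0;
  for (size_t i = 0; i + Lanes(d) <= count; i += Lanes(d)) {
    const Vec128<int32_t> v = LoadU(d, in + i);
    // CompressBlendedStore writes only CountTrue lanes beyond out + written.
    written += CompressBlendedStore(v, Gt(v, Zero(d)), d, out + written);
  }
  return written;
}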
6196
6197#if HWY_PPC_HAVE_10
6198template <class D, HWY_IF_T_SIZE_D(D, 1)>
6199HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
6200 D d, TFromD<D>* HWY_RESTRICT unaligned) {
6201 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
6202}
6203#endif
6204
6205template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
6206HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
6207 D d, TFromD<D>* HWY_RESTRICT unaligned) {
6208 const RebindToUnsigned<decltype(d)> du;
6209
6210 // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
6211 // convert bits[0] to a uint64_t
6212 uint64_t mask_bits = bits[0];
6213 constexpr size_t kN = MaxLanes(d);
6214 if (kN < 8) {
6215 mask_bits &= (1ull << kN) - 1;
6216 }
6217 const size_t count = PopCount(mask_bits);
6218
6219 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
6220 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6221 StoreU(compressed, d, unaligned);
6222
6223 return count;
6224}
6225
6226// ------------------------------ Expand
6227#if HWY_PPC_HAVE_10
6228#ifdef HWY_NATIVE_EXPAND
6229#undef HWY_NATIVE_EXPAND
6230#else
6231#define HWY_NATIVE_EXPAND
6232#endif
6233
6234template <typename T, size_t N,
6235 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
6236HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
6237 const DFromV<decltype(v)> d;
6238 const auto idx = detail::CompressOrExpandIndicesFromMask<false>(d, mask);
6239 return IfThenElseZero(mask, TableLookupBytes(v, idx));
6240}
6241
6242template <typename T, HWY_IF_T_SIZE(T, 8)>
6243HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
6244 // Same as Compress, just zero out the mask=false lanes.
6245 return IfThenElseZero(mask, Compress(v, mask));
6246}
6247
6248// For single-element vectors, this is at least as fast as native.
6249template <typename T>
6250HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
6251 return IfThenElseZero(mask, v);
6252}
6253
6254template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
6255HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
6256 const TFromD<D>* HWY_RESTRICT unaligned) {
6257 return Expand(LoadU(d, unaligned), mask);
6258}
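// Round-trip sketch (assumed property, for illustration): expanding a
// compressed vector with the same mask zeroes the unselected lanes and
// restores the selected ones, i.e. it equals IfThenElseZero(m, v).
static HWY_MAYBE_UNUSED Vec128<uint32_t> CompressExpandSketch(
    Vec128<uint32_t> v, Mask128<uint32_t> m) {
  return Expand(Compress(v, m), m);
}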
6259#endif // HWY_PPC_HAVE_10
6260
6261// ------------------------------ StoreInterleaved2/3/4
6262
6263// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
6264// generic_ops-inl.h.
6265
6266// ------------------------------ Additional mask logical operations
6267namespace detail {
6268
6269#if HWY_IS_LITTLE_ENDIAN
6270template <class V>
6271HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
6272 return v;
6273}
6274template <class V>
6275HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
6276 return v;
6277}
6278#else
6279template <class V, HWY_IF_T_SIZE_V(V, 1)>
6280HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
6281 const DFromV<decltype(v)> d;
6282 return Reverse8(d, v);
6283}
6284template <class V, HWY_IF_T_SIZE_V(V, 2)>
6285HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
6286 const DFromV<decltype(v)> d;
6287 return Reverse4(d, v);
6288}
6289template <class V, HWY_IF_T_SIZE_V(V, 4)>
6290HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
6291 const DFromV<decltype(v)> d;
6292 return Reverse2(d, v);
6293}
6294template <class V, HWY_IF_T_SIZE_V(V, 8)>
6295HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
6296 return v;
6297}
6298template <class V>
6299HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
6300 const DFromV<decltype(v)> d;
6301 return Reverse(d, v);
6302}
6303#endif
6304
6305template <class V>
6306HWY_INLINE V I128Subtract(V a, V b) {
6307#if HWY_S390X_HAVE_Z14
6308 const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
6309 vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
6310 reinterpret_cast<__vector unsigned char>(b.raw)))};
6311#elif defined(__SIZEOF_INT128__)
6312 using VU128 = __vector unsigned __int128;
6313 const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
6314 vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
6315#else
6316 const DFromV<decltype(a)> d;
6317 const Repartition<uint64_t, decltype(d)> du64;
6318
6319 const auto u64_a = BitCast(du64, a);
6320 const auto u64_b = BitCast(du64, b);
6321
6322 const auto diff_u64 = u64_a - u64_b;
6323 const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b);
6324
6325#if HWY_IS_LITTLE_ENDIAN
6326 const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64);
6327#else
6328 const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64);
6329#endif
6330
6331 const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted);
6332#endif
6333
6334 return diff_i128;
6335}
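// Scalar model of the borrow propagation above (illustrative, little-endian
// u64 halves): adding the all-ones borrow mask to the upper half is the same
// as subtracting the borrow, mirroring the ShiftLeftBytes<8> of borrow_u64.
static HWY_MAYBE_UNUSED void I128SubtractModelSketch(uint64_t a_lo,
                                                     uint64_t a_hi,
                                                     uint64_t b_lo,
                                                     uint64_t b_hi,
                                                     uint64_t* lo,
                                                     uint64_t* hi) {
  *lo = a_lo - b_lo;
  *hi = a_hi - b_hi + ((a_lo < b_lo) ? ~uint64_t{0} : uint64_t{0});
}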
6336
6337} // namespace detail
6338
6339template <class T>
6340HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
6341 return mask;
6342}
6343template <class T>
6344HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
6345 const FixedTag<T, 2> d;
6346 const auto vmask = VecFromMask(d, mask);
6347 return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
6348}
6349template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
6350HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
6351 const Simd<T, N, 0> d;
6352 const Full64<T> d_full64;
6353
6354 const auto vmask = VecFromMask(d, mask);
6355 const auto vmask_le64 =
6356 BitCast(Full64<int64_t>(),
6357 detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
6358 const auto neg_vmask_le64 = Neg(vmask_le64);
6359 const auto neg_vmask = ResizeBitCast(
6360 d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
6361
6362 return MaskFromVec(Or(vmask, neg_vmask));
6363}
6364template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
6365HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
6366 const Full128<T> d;
6367 auto vmask = VecFromMask(d, mask);
6368
6369 const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
6370 const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
6371 const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
6372
6373 return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
6374}
6375
6376template <class T, size_t N>
6377HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
6378 return Not(SetAtOrAfterFirst(mask));
6379}
6380
6381template <class T>
6382HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
6383 return mask;
6384}
6385template <class T>
6386HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
6387 const FixedTag<T, 2> d;
6388 const RebindToSigned<decltype(d)> di;
6389
6390 const auto vmask = BitCast(di, VecFromMask(d, mask));
6391 const auto zero = Zero(di);
6392 const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
6393 return MaskFromVec(BitCast(d, And(vmask, vmask2)));
6394}
6395template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
6396HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
6397 const Simd<T, N, 0> d;
6398 const Full64<T> d_full64;
6399 const RebindToSigned<decltype(d)> di;
6400
6401 const auto vmask = VecFromMask(d, mask);
6402 const auto vmask_le64 =
6403 BitCast(Full64<int64_t>(),
6404 detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
6405 const auto neg_vmask_le64 = Neg(vmask_le64);
6406 const auto neg_vmask = ResizeBitCast(
6407 d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
6408
6409 const auto first_vmask = BitCast(di, And(vmask, neg_vmask));
6410 return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask))));
6411}
6412template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
6413HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
6414 const Full128<T> d;
6415 const RebindToSigned<decltype(d)> di;
6416
6417 const auto vmask = VecFromMask(d, mask);
6418 const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
6419 const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
6420 const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
6421
6422 return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask)))));
6423}
6424
6425template <class T>
6426HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
6427 const FixedTag<T, 1> d;
6428 const RebindToSigned<decltype(d)> di;
6429 using TI = MakeSigned<T>;
6430
6431 return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
6432}
6433template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
6434HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
6435 const Simd<T, N, 0> d;
6436 return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
6437}
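// Worked example of the four prefix ops above (a sketch; lane 0 is the first
// lane): for mask = {0,0,1,0,1,0,0,0}, SetAtOrAfterFirst -> {0,0,1,1,1,1,1,1},
// SetBeforeFirst -> {1,1,0,0,0,0,0,0}, SetOnlyFirst -> {0,0,1,0,0,0,0,0} and
// SetAtOrBeforeFirst -> {1,1,1,0,0,0,0,0}. One identity that follows:
template <class D>
static MFromD<D> OnlyFirstViaPrefixSketch(D d, MFromD<D> m) {
  (void)d;
  // The first set lane is exactly where "at or after" meets "at or before".
  return And(SetAtOrAfterFirst(m), SetAtOrBeforeFirst(m));
}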
6438
6439// ------------------------------ SumsOf2 and SumsOf4
6440namespace detail {
6441
6442#if !HWY_S390X_HAVE_Z14
6443// Casts nominally int32_t result to D.
6444template <class D>
6445HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
6446 __vector signed int b) {
6447 const Repartition<int32_t, D> di32;
6448#ifdef __OPTIMIZE__
6449 if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6450 const int64_t sum0 =
6451 static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
6452 static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
6453 static_cast<int64_t>(b[0]);
6454 const int64_t sum1 =
6455 static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
6456 static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
6457 static_cast<int64_t>(b[1]);
6458 const int64_t sum2 =
6459 static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
6460 static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
6461 static_cast<int64_t>(b[2]);
6462 const int64_t sum3 =
6463 static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
6464 static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
6465 static_cast<int64_t>(b[3]);
6466 const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6467 const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6468 const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
6469 const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
6470 using Raw = typename detail::Raw128<int32_t>::type;
6471 return BitCast(
6472 d,
6473 VFromD<decltype(di32)>{Raw{
6474 (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
6475 : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
6476 (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
6477 : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
6478 (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
6479 : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
6480 (sign3 == (sum3 >> 31))
6481 ? static_cast<int32_t>(sum3)
6482 : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
6483 } else // NOLINT
6484#endif
6485 {
6486 return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
6487 }
6488}
6489
6490// Casts nominally uint32_t result to D.
6491template <class D>
6492HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
6493 __vector unsigned int b) {
6494 const Repartition<uint32_t, D> du32;
6495#ifdef __OPTIMIZE__
6496 if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6497 const uint64_t sum0 =
6498 static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
6499 static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
6500 static_cast<uint64_t>(b[0]);
6501 const uint64_t sum1 =
6502 static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
6503 static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
6504 static_cast<uint64_t>(b[1]);
6505 const uint64_t sum2 =
6506 static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
6507 static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
6508 static_cast<uint64_t>(b[2]);
6509 const uint64_t sum3 =
6510 static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
6511 static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
6512 static_cast<uint64_t>(b[3]);
6513 return BitCast(
6514 d,
6515 VFromD<decltype(du32)>{(__vector unsigned int){
6516 static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
6517 static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
6518 static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
6519 static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
6520 : 0xFFFFFFFFu)}});
6521 } else // NOLINT
6522#endif
6523 {
6524 return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
6525 }
6526}
6527
6528// Casts nominally int32_t result to D.
6529template <class D>
6530HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
6531 __vector signed int b) {
6532 const Repartition<int32_t, D> di32;
6533#ifdef __OPTIMIZE__
6534 const Repartition<uint64_t, D> du64;
6535 constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
6536 if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
6537 __builtin_constant_p(b[kDestLaneOffset + 2])) {
6538 const int64_t sum0 = static_cast<int64_t>(a[0]) +
6539 static_cast<int64_t>(a[1]) +
6540 static_cast<int64_t>(b[kDestLaneOffset]);
6541 const int64_t sum1 = static_cast<int64_t>(a[2]) +
6542 static_cast<int64_t>(a[3]) +
6543 static_cast<int64_t>(b[kDestLaneOffset + 2]);
6544 const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6545 const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6546 return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
6547 (sign0 == (sum0 >> 31))
6548 ? static_cast<uint32_t>(sum0)
6549 : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
6550 (sign1 == (sum1 >> 31))
6551 ? static_cast<uint32_t>(sum1)
6552 : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
6553 } else // NOLINT
6554#endif
6555 {
6556 __vector signed int sum;
6557
6558 // Inline assembly is used for vsum2sws because, on little-endian PowerPC
6559 // targets, the result of the vsum2sws instruction is already in the
6560 // desired lanes; the intrinsic would add an unnecessary shuffle to
6561 // emulate big-endian lane order.
6562 __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
6563
6564 return BitCast(d, VFromD<decltype(di32)>{sum});
6565 }
6566}
6567
6568// Casts nominally int32_t result to D.
6569template <class D>
6570HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a,
6571 __vector signed int b) {
6572 const Repartition<int32_t, D> di32;
6573#ifdef __OPTIMIZE__
6574 if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6575 const int64_t sum0 = static_cast<int64_t>(a[0]) +
6576 static_cast<int64_t>(a[1]) +
6577 static_cast<int64_t>(b[0]);
6578 const int64_t sum1 = static_cast<int64_t>(a[2]) +
6579 static_cast<int64_t>(a[3]) +
6580 static_cast<int64_t>(b[1]);
6581 const int64_t sum2 = static_cast<int64_t>(a[4]) +
6582 static_cast<int64_t>(a[5]) +
6583 static_cast<int64_t>(b[2]);
6584 const int64_t sum3 = static_cast<int64_t>(a[6]) +
6585 static_cast<int64_t>(a[7]) +
6586 static_cast<int64_t>(b[3]);
6587 const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6588 const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6589 const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
6590 const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
6591 using Raw = typename detail::Raw128<int32_t>::type;
6592 return BitCast(
6593 d,
6594 VFromD<decltype(di32)>{Raw{
6595 (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
6596 : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
6597 (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
6598 : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
6599 (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
6600 : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
6601 (sign3 == (sum3 >> 31))
6602 ? static_cast<int32_t>(sum3)
6603 : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
6604 } else // NOLINT
6605#endif
6606 {
6607 return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)});
6608 }
6609}
6610
6611// Casts nominally int32_t result to D.
6612template <class D>
6613HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a,
6614 __vector signed int b) {
6615 const Repartition<int32_t, D> di32;
6616#ifdef __OPTIMIZE__
6617 constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6618 if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) {
6619 const int64_t sum =
6620 static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
6621 static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
6622 static_cast<int64_t>(b[kDestLaneOffset]);
6623 const int32_t sign = static_cast<int32_t>(sum >> 63);
6624#if HWY_IS_LITTLE_ENDIAN
6625 return BitCast(
6626 d, VFromD<decltype(di32)>{(__vector signed int){
6627 (sign == (sum >> 31)) ? static_cast<int32_t>(sum)
6628 : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
6629 0, 0, 0}});
6630#else
6631 return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){
6632 0, 0, 0,
6633 (sign == (sum >> 31))
6634 ? static_cast<int32_t>(sum)
6635 : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
6636#endif
6637 } else // NOLINT
6638#endif
6639 {
6640 __vector signed int sum;
6641
6642 // Inline assembly is used for vsumsws because, on little-endian PowerPC
6643 // targets, the result of the vsumsws instruction is already in the
6644 // desired lanes; the intrinsic would add an unnecessary shuffle to
6645 // emulate big-endian lane order.
6646 __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
6647
6648 return BitCast(d, VFromD<decltype(di32)>{sum});
6649 }
6650}
6651
6652template <size_t N>
6653HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
6654 const RebindToSigned<DFromV<decltype(v)>> di16;
6655 const RepartitionToWide<decltype(di16)> di32;
6656 return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
6657 Set(di32, 65536).raw);
6658}
6659#endif // !HWY_S390X_HAVE_Z14
6660
6661// U16->U32 SumsOf2
6662template <class V>
6663 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6664 hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6665 const DFromV<V> d;
6666 const RepartitionToWide<decltype(d)> dw;
6667
6668#if HWY_S390X_HAVE_Z14
6669 return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
6670#else
6671 return BitCast(dw, AltivecU16SumsOf2(v));
6672#endif
6673}
6674
6675// I16->I32 SumsOf2
6676template <class V>
6677 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6678 hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6679 const DFromV<V> d;
6680 const RepartitionToWide<decltype(d)> dw;
6681
6682#if HWY_S390X_HAVE_Z14
6683 const RebindToUnsigned<decltype(d)> du;
6684 return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
6685 BitCast(du, Xor(v, SignBit(d))))) +
6686 Set(dw, int32_t{-65536});
6687#else
6688 return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
6689#endif
6690}
6691
6692#if HWY_S390X_HAVE_Z14
6693// U32->U64 SumsOf2
6694template <class V>
6695 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6696 hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
6697 const DFromV<V> d;
6698 const RepartitionToWide<decltype(d)> dw;
6699 return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
6700}
6701
6702// I32->I64 SumsOf2
6703template <class V>
6704 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6705 hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
6706 const DFromV<V> d;
6707 const RepartitionToWide<decltype(d)> dw;
6708 const RebindToUnsigned<decltype(d)> du;
6709
6710 return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
6711 BitCast(du, Xor(v, SignBit(d))))) +
6712 Set(dw, int64_t{-4294967296LL});
6713}
6714#endif
6715
6716// U8->U32 SumsOf4
6717template <class V>
6718 HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6719 hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
6720 const DFromV<V> d;
6721 const RepartitionToWideX2<decltype(d)> dw2;
6722
6723#if HWY_S390X_HAVE_Z14
6724 return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
6725#else
6726 return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
6727#endif
6728}
6729
6730// I8->I32 SumsOf4
6731template <class V>
6732 HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6733 hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
6734 const DFromV<V> d;
6735 const RepartitionToWideX2<decltype(d)> dw2;
6736
6737#if HWY_S390X_HAVE_Z14
6738 const RebindToUnsigned<decltype(d)> du;
6739 return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
6740 BitCast(du, Xor(v, SignBit(d))))) +
6741 Set(dw2, int32_t{-512});
6742#else
6743 return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
6744#endif
6745}
6746
6747// U16->U64 SumsOf4
6748template <class V>
6749 HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6750 hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6751 const DFromV<V> d;
6752 const RepartitionToWide<decltype(d)> dw;
6753 const RepartitionToWide<decltype(dw)> dw2;
6754
6755#if HWY_S390X_HAVE_Z14
6756 return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
6757#else
6758 const RebindToSigned<decltype(dw)> dw_i;
6759 return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
6760#endif
6761}
6762
6763// I16->I64 SumsOf4
6764template <class V>
6765 HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6766 hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6767 const DFromV<V> d;
6768 const RepartitionToWide<decltype(d)> dw;
6769 const RepartitionToWide<decltype(dw)> dw2;
6770
6771#if HWY_S390X_HAVE_Z14
6772 const RebindToUnsigned<decltype(d)> du;
6773 return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
6774 BitCast(du, Xor(v, SignBit(d))))) +
6775 Set(dw2, int64_t{-131072});
6776#else // VSX
6777 const auto sums_of_4_in_lo32 =
6778 AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
6779
6780#if HWY_IS_LITTLE_ENDIAN
6781 return PromoteEvenTo(dw2, sums_of_4_in_lo32);
6782#else
6783 return PromoteOddTo(dw2, sums_of_4_in_lo32);
6784#endif // HWY_IS_LITTLE_ENDIAN
6785#endif // HWY_S390X_HAVE_Z14
6786}
6787
6788} // namespace detail
6789
6790// ------------------------------ SumOfLanes
6791
6792// We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
6793// enable generic for the rest.
6794#undef HWY_IF_SUM_OF_LANES_D
6795#if HWY_S390X_HAVE_Z14
6796#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
6797#else
6798#define HWY_IF_SUM_OF_LANES_D(D) \
6799 HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
6800#endif
6801
6802#if HWY_S390X_HAVE_Z14
6803namespace detail {
6804
6805template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
6806 HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
6807 HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
6808 const DFromV<decltype(v)> d;
6809 const RebindToUnsigned<decltype(d)> du;
6810 return BitCast(
6811 d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
6812}
6813
6814} // namespace detail
6815
6816template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
6817 HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
6818 return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
6819}
6820#endif
6821
6822template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
6823 HWY_API VFromD<D> SumOfLanes(D du16, VFromD<D> v) {
6824 constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
6825 return Broadcast<kSumLaneIdx>(
6826 BitCast(du16, SumsOf2(v)));
6827}
6828
6829template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6830 HWY_API VFromD<D> SumOfLanes(D du16, VFromD<D> v) {
6831 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6832 return Broadcast<kSumLaneIdx>(
6833 BitCast(du16, SumsOf4(v)));
6834}
6835
6836template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6837 HWY_API VFromD<D> SumOfLanes(D du16, VFromD<D> v) {
6838 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6839#if HWY_S390X_HAVE_Z14
6840 return Broadcast<kSumLaneIdx>(
6841 BitCast(du16, detail::SumOfU32OrU64LanesAsU128(SumsOf4(v))));
6843#else // VSX
6844 const auto zero = Zero(Full128<int32_t>());
6845 return Broadcast<kSumLaneIdx>(
6846 detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
6847#endif
6848}
6849
6850template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
6851 HWY_API VFromD<D> SumOfLanes(D di16, VFromD<D> v) {
6852 #if HWY_S390X_HAVE_Z14
6853 const RebindToUnsigned<decltype(di16)> du16;
6854 return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6855#else
6856 constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
6857 return Broadcast<kSumLaneIdx>(
6858 BitCast(di16, SumsOf2(v)));
6859#endif
6860}
6861
6862template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
6863 HWY_API VFromD<D> SumOfLanes(D di16, VFromD<D> v) {
6864 #if HWY_S390X_HAVE_Z14
6865 const RebindToUnsigned<decltype(di16)> du16;
6866 return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6867#else
6868 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6869 return Broadcast<kSumLaneIdx>(
6870 BitCast(di16, SumsOf4(v)));
6871#endif
6872}
6873
6874template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
6875 HWY_API VFromD<D> SumOfLanes(D di16, VFromD<D> v) {
6876 #if HWY_S390X_HAVE_Z14
6877 const RebindToUnsigned<decltype(di16)> du16;
6878 return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6879#else
6880 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6881 const Full128<int32_t> di32;
6882 const auto zero = Zero(di32);
6883 return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6884 di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6885#endif
6886}
6887
6888template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6889 HWY_API VFromD<D> SumOfLanes(D du8, VFromD<D> v) {
6890 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6891 return Broadcast<kSumLaneIdx>(
6892 BitCast(du8, SumsOf4(v)));
6893}
6894
6895template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
6896 HWY_API VFromD<D> SumOfLanes(D du8, VFromD<D> v) {
6897 const Twice<decltype(du8)> dt_u8;
6898 return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
6899}
6900
6901template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6902 HWY_API VFromD<D> SumOfLanes(D du8, VFromD<D> v) {
6903 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6904 return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
6905}
6906
6907template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6908 HWY_API VFromD<D> SumOfLanes(D du8, VFromD<D> v) {
6909 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
6910
6911#if HWY_S390X_HAVE_Z14
6912 return Broadcast<kSumLaneIdx>(
6913 BitCast(du8, detail::SumOfU32OrU64LanesAsU128(SumsOf4(v))));
6915#else
6916 const Full128<uint32_t> du32;
6917 const RebindToSigned<decltype(du32)> di32;
6918 const Vec128<uint32_t> zero = Zero(du32);
6919 return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6920 du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
6921 BitCast(di32, zero).raw));
6922#endif
6923}
6924
6925template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
6926 HWY_API VFromD<D> SumOfLanes(D di8, VFromD<D> v) {
6927 #if HWY_S390X_HAVE_Z14
6928 const RebindToUnsigned<decltype(di8)> du8;
6929 return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6930#else
6931 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
6932 return Broadcast<kSumLaneIdx>(
6933 BitCast(di8, SumsOf4(v)));
6934#endif
6935}
6936
6937template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
6938 HWY_API VFromD<D> SumOfLanes(D di8, VFromD<D> v) {
6939 const Twice<decltype(di8)> dt_i8;
6940 return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
6941}
6942
6943template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
6944 HWY_API VFromD<D> SumOfLanes(D di8, VFromD<D> v) {
6945 #if HWY_S390X_HAVE_Z14
6946 const RebindToUnsigned<decltype(di8)> du8;
6947 return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6948#else
6949 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
6950 return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
6951#endif
6952}
6953
6954template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
6955 HWY_API VFromD<D> SumOfLanes(D di8, VFromD<D> v) {
6956 #if HWY_S390X_HAVE_Z14
6957 const RebindToUnsigned<decltype(di8)> du8;
6958 return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6959#else
6960 constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
6961 const Full128<int32_t> di32;
6962 const Vec128<int32_t> zero = Zero(di32);
6963 return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6964 di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
6965#endif
6966}
6967
6968#if HWY_S390X_HAVE_Z14
6969template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
6970 HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
6971 const RebindToUnsigned<decltype(d32)> du32;
6972 return Broadcast<1>(
6973 BitCast(d32, SumsOf2(
6974 BitCast(du32, v))));
6975}
6976
6977template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
6978HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
6979 return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
6980}
6981#endif
6982
6983// generic_ops defines MinOfLanes and MaxOfLanes.
6984
6985// ------------------------------ ReduceSum for N=4 I8/U8
6986
6987// GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
6988// I8/U8 ReduceSum implementation in generic_ops-inl.h
6989#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
6990#undef HWY_NATIVE_REDUCE_SUM_4_UI8
6991#else
6992#define HWY_NATIVE_REDUCE_SUM_4_UI8
6993#endif
6994
6995template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
6996HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
6997 return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
6998}
6999
7000// ------------------------------ Lt128
7001
7002namespace detail {
7003
7004// Returns vector-mask for Lt128.
7005template <class D, class V = VFromD<D>>
7006HWY_INLINE V Lt128Vec(D d, V a, V b) {
7007 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
7008#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
7009 (void)d;
7010 using VU64 = __vector unsigned long long;
7011 using VU128 = __vector unsigned __int128;
7012#if HWY_IS_LITTLE_ENDIAN
7013 const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
7014 const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
7015#else
7016 // NOTE: Need to swap the halves of both a and b on big-endian targets
7017 // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits
7018 // of a and b are in lane 0 whereas the vec_cmplt operation below expects
7019 // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on
7020 // big-endian PPC targets.
7021 const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
7022 const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
7023#endif
7024 return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
7025#else // !HWY_PPC_HAVE_10
7026 // Truth table of Eq and Lt for Hi and Lo u64.
7027 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
7028 // =H =L cH cL | out = cH | (=H & cL)
7029 // 0 0 0 0 | 0
7030 // 0 0 0 1 | 0
7031 // 0 0 1 0 | 1
7032 // 0 0 1 1 | 1
7033 // 0 1 0 0 | 0
7034 // 0 1 0 1 | 0
7035 // 0 1 1 0 | 1
7036 // 1 0 0 0 | 0
7037 // 1 0 0 1 | 1
7038 // 1 1 0 0 | 0
7039 const auto eqHL = Eq(a, b);
7040 const V ltHL = VecFromMask(d, Lt(a, b));
7041 const V ltLX = ShiftLeftLanes<1>(ltHL);
7042 const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
7043 return InterleaveUpper(d, vecHx, vecHx);
7044#endif
7045}
7046
7047// Returns vector-mask for Eq128.
7048template <class D, class V = VFromD<D>>
7049HWY_INLINE V Eq128Vec(D d, V a, V b) {
7050 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
7051#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
7052 (void)d;
7053 using VU64 = __vector unsigned long long;
7054 using VU128 = __vector unsigned __int128;
7055 return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
7056 reinterpret_cast<VU128>(b.raw)))};
7057#else
7058 const auto eqHL = VecFromMask(d, Eq(a, b));
7059 const auto eqLH = Reverse2(d, eqHL);
7060 return And(eqHL, eqLH);
7061#endif
7062}
7063
7064template <class D, class V = VFromD<D>>
7065HWY_INLINE V Ne128Vec(D d, V a, V b) {
7066 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
7067#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
7068 (void)d;
7069 using VU64 = __vector unsigned long long;
7070 using VU128 = __vector unsigned __int128;
7071 return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
7072 reinterpret_cast<VU128>(b.raw)))};
7073#else
7074 const auto neHL = VecFromMask(d, Ne(a, b));
7075 const auto neLH = Reverse2(d, neHL);
7076 return Or(neHL, neLH);
7077#endif
7078}
7079
7080template <class D, class V = VFromD<D>>
7081HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
7082 const V ltHL = VecFromMask(d, Lt(a, b));
7083 return InterleaveUpper(d, ltHL, ltHL);
7084}
7085
7086template <class D, class V = VFromD<D>>
7087HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
7088 const V eqHL = VecFromMask(d, Eq(a, b));
7089 return InterleaveUpper(d, eqHL, eqHL);
7090}
7091
7092template <class D, class V = VFromD<D>>
7093HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
7094 const V neHL = VecFromMask(d, Ne(a, b));
7095 return InterleaveUpper(d, neHL, neHL);
7096}
7097
7098} // namespace detail
7099
7100template <class D, class V = VFromD<D>>
7101HWY_API MFromD<D> Lt128(D d, V a, V b) {
7102 return MaskFromVec(detail::Lt128Vec(d, a, b));
7103}
7104
7105template <class D, class V = VFromD<D>>
7106HWY_API MFromD<D> Eq128(D d, V a, V b) {
7107 return MaskFromVec(detail::Eq128Vec(d, a, b));
7108}
7109
7110template <class D, class V = VFromD<D>>
7111HWY_API MFromD<D> Ne128(D d, V a, V b) {
7112 return MaskFromVec(detail::Ne128Vec(d, a, b));
7113}
7114
7115template <class D, class V = VFromD<D>>
7116 HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
7117 return MaskFromVec(detail::Lt128UpperVec(d, a, b));
7118}
7119
7120template <class D, class V = VFromD<D>>
7121 HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
7122 return MaskFromVec(detail::Eq128UpperVec(d, a, b));
7123}
7124
7125template <class D, class V = VFromD<D>>
7126 HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
7127 return MaskFromVec(detail::Ne128UpperVec(d, a, b));
7128}
7129
7130// ------------------------------ Min128, Max128 (Lt128)
7131
7132// Avoids the extra MaskFromVec in Lt128.
7133template <class D, class V = VFromD<D>>
7134HWY_API V Min128(D d, const V a, const V b) {
7135 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
7136}
7137
7138template <class D, class V = VFromD<D>>
7139HWY_API V Max128(D d, const V a, const V b) {
7140 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
7141}
7142
7143template <class D, class V = VFromD<D>>
7144HWY_API V Min128Upper(D d, const V a, const V b) {
7145 return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
7146}
7147
7148template <class D, class V = VFromD<D>>
7149HWY_API V Max128Upper(D d, const V a, const V b) {
7150 return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
7151}
7152
7153// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
7154
7155#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
7156#undef HWY_NATIVE_LEADING_ZERO_COUNT
7157#else
7158#define HWY_NATIVE_LEADING_ZERO_COUNT
7159#endif
7160
7161template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
7162 HWY_API V LeadingZeroCount(V v) {
7163 #if HWY_S390X_HAVE_Z14
7164 const DFromV<decltype(v)> d;
7165 const RebindToUnsigned<decltype(d)> du;
7166
7167#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
7168 // Workaround for a GCC compiler bug in vec_cntlz on Z14/Z15 if v[i] is a
7169 // constant
7170 __asm__("" : "+v"(v.raw));
7171#endif
7172
7173 return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
7174#else
7175 return V{vec_cntlz(v.raw)};
7176#endif
7177}
7178
7179template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
7180 HWY_API V HighestSetBitIndex(V v) {
7181 const DFromV<decltype(v)> d;
7182 using T = TFromD<decltype(d)>;
7183 return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
7184}
7185
7186#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
7187template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
7188 HWY_API V TrailingZeroCount(V v) {
7189 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
7190 return V{vec_vctz(v.raw)};
7191#else
7192#if HWY_S390X_HAVE_Z14
7193 const DFromV<decltype(v)> d;
7194 const RebindToUnsigned<decltype(d)> du;
7195
7196#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
7197 // Workaround for a GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
7198 // constant
7199 __asm__("" : "+v"(v.raw));
7200#endif
7201
7202 return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
7203#else
7204 return V{vec_cnttz(v.raw)};
7205#endif // HWY_S390X_HAVE_Z14
7206#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
7207}
7208#else
7209template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
7210 HWY_API V TrailingZeroCount(V v) {
7211 const DFromV<decltype(v)> d;
7212 const RebindToSigned<decltype(d)> di;
7213 using TI = TFromD<decltype(di)>;
7214
7215 const auto vi = BitCast(di, v);
7216 const auto lowest_bit = And(vi, Neg(vi));
7217 constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
7218 const auto bit_idx = HighestSetBitIndex(lowest_bit);
7219 return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
7220 Set(di, kNumOfBitsInT), bit_idx));
7221}
7222#endif
7223
7224#undef HWY_PPC_HAVE_9
7225#undef HWY_PPC_HAVE_10
7226#undef HWY_S390X_HAVE_Z14
7227#undef HWY_S390X_HAVE_Z15
7228
7229// NOLINTNEXTLINE(google-readability-namespace-comments)
7230} // namespace HWY_NAMESPACE
7231} // namespace hwy