Grok 12.0.1
ops/shared-inl.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target definitions shared by ops/*.h and user code.
17
18// IWYU pragma: begin_exports
19// Export does not seem to be recursive, so re-export these (also in base.h)
20#include <stddef.h>
21
22#include "hwy/base.h"
23// "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
24#if !HWY_IDE
25#include <stdint.h>
26#endif
27
29
30// Separate header because foreach_target.h re-enables its include guard.
32
33// IWYU pragma: end_exports
34
35#if HWY_IS_MSAN
36#include <sanitizer/msan_interface.h>
37#endif
38
39// We are covered by the highway.h include guard, but generic_ops-inl.h
40// includes this again #if HWY_IDE.
41// clang-format off
42#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE) // NOLINT
43// clang-format on
44#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
45#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
46#else
47#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
48#endif
49
51namespace hwy {
52namespace HWY_NAMESPACE {
53
// VecArg<V> is the parameter type to use for all vector/mask arguments of
// functions marked HWY_NOINLINE, and possibly also of other functions that are
// not inlined.
//
// GCC generates incorrect code for by-value vector arguments of non-inlined
// functions in two situations:
// - Windows with GCC 10.3: crashes due to unaligned loads, see
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - aarch64 with GCC 9.3.0 or 11.2.1: many (but not all) tests fail.
//
// Vectors are therefore passed by value everywhere, except on GCC combined
// with (Windows or aarch64), where they are passed by const reference.
#if !HWY_COMPILER_GCC_ACTUAL || !(HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = V;
#else
template <class V>
using VecArg = const V&;
#endif
71
72namespace detail {
73
74template <typename T>
76 using type = T;
77};
78template <>
80#if HWY_HAVE_SCALAR_F16_TYPE
81 using type = hwy::float16_t::Native;
82#else
83 using type = uint16_t;
84#endif
85};
86template <>
88#if HWY_HAVE_SCALAR_BF16_TYPE
89 using type = hwy::bfloat16_t::Native;
90#else
91 using type = uint16_t;
92#endif
93};
94
95// The type expected by intrinsics for the given Highway lane type T. This
96// usually matches T, but differs for our wrapper types [b]float16_t. Use this
97// only when defining intrinsic wrappers, and NOT for casting, which is UB.
98template <typename T>
100
101// Returns the same pointer after changing type to NativeLaneType. Use this only
102// for wrapper functions that call intrinsics (e.g. load/store) where some of
103// the overloads expect _Float16* or __bf16* arguments. For non-special floats,
104// this returns the same pointer and type.
105//
106// This makes use of the fact that a wrapper struct is pointer-interconvertible
107// with its first member (a union), thus also with the union members. Do NOT
108// call both this and U16LanePointer on the same object - they access different
109// union members, and this is not guaranteed to be safe.
110template <typename T, HWY_IF_NOT_SPECIAL_FLOAT(T)>
112 return p;
113}
114template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
115 HWY_IF_F16(T)>
116HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {
117#if HWY_HAVE_SCALAR_F16_TYPE
118 return &p->native;
119#else
120 return &p->bits;
121#endif
122}
123template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
124 HWY_IF_BF16(T)>
125HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {
126#if HWY_HAVE_SCALAR_BF16_TYPE
127 return &p->native;
128#else
129 return &p->bits;
130#endif
131}
132
133// Returns a pointer to the u16 member of our [b]float16_t wrapper structs.
134// Use this in Highway targets that lack __bf16 intrinsics; for storing to
135// memory, we BitCast vectors to u16 and write to the pointer returned here.
136// Do NOT call both this and U16LanePointer on the same object - they access
137// different union members, and this is not guaranteed to be safe.
138template <typename T, HWY_IF_SPECIAL_FLOAT(T)>
139HWY_INLINE If<IsConst<T>(), const uint16_t*, uint16_t*> U16LanePointer(T* p) {
140 return &p->bits;
141}
142
// Scales N by 2^pow2 and returns the result. N is the number of lanes in a
// full vector and pow2 selects the desired fraction (negative) or multiple
// (positive) of it, see Simd<>. `pow2` is most often in [-3, 3] but can also
// be lower for user-specified fractions. Fractions are computed by shifting
// right, so the result is zero once all set bits of N have been shifted out.
constexpr size_t ScaleByPower(size_t N, int pow2) {
  return (pow2 < 0) ? (N >> (-pow2)) : (N << pow2);
}
149
150template <typename T>
151HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
152 // Workaround for MSAN not marking compressstore as initialized (b/233326619)
153#if HWY_IS_MSAN
154 __msan_unpoison(unaligned, count * sizeof(T));
155#else
156 (void)unaligned;
157 (void)count;
158#endif
159}
160
161} // namespace detail
162
163// Highway operations are implemented as overloaded functions selected using a
164// zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type.
165//
166// N defines how many lanes are in a 'full' vector, typically equal to
167// HWY_LANES(T) (which is the actual count on targets with vectors of known
168// size, and an upper bound in case of scalable vectors), otherwise a
169// user-specified limit at most that large.
170//
171// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the
172// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3
173// means two/four/eight full vectors ganged together. The largest supported
174// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping
175// user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>`
176// have the same `MaxLanes` and `Lanes`.
177//
178// We can theoretically keep halving Lanes(), but recursive instantiations of
179// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
180// Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
181//
182// WARNING: do not use N directly because it may be a special representation of
183// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
184// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
185// but we want MaxLanes to be the same in both cases. Hence ?? is a
186// fixed-point encoding of 1/4.
187//
188// Instead of referring to Simd<> directly, users create D via aliases:
189// - ScalableTag<T> for a full vector;
190// - ScalableTag<T, kPow2>() for a fraction/group, where `kPow2` is
191// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
192// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
193// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
194//
195// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
196// D().MaxLanes() for a constexpr upper bound. Both are powers of two.
197template <typename Lane, size_t N, int kPow2>
198struct Simd {
199 constexpr Simd() = default;
200 using T = Lane;
201
202 private:
203 static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
204 static_assert(IsSame<Lane, RemoveCvRef<Lane>>(),
205 "Lane must not be a reference type, const-qualified type, or "
206 "volatile-qualified type");
207 static_assert(IsIntegerLaneType<Lane>() || IsFloat<Lane>() ||
208 IsSpecialFloat<Lane>(),
209 "IsIntegerLaneType<T>(), IsFloat<T>(), or IsSpecialFloat<T>() "
210 "must be true");
211 // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of
212 // N when kFrac == 0, otherwise it is one (see FracN).
213 static constexpr size_t kWhole = N & 0xFFFFF;
214 // Fractional part is in the bits above kWhole.
215 static constexpr int kFrac = static_cast<int>(N >> 20);
216 // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger
217 // type to u8 results in fractions).
218 static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range");
219 static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
220 static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");
221 // Important to check this here because kPow2 <= -64 causes confusing
222 // compile errors (invalid shift count).
223 static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?");
224 // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to
225 // Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its
226 // kPow2 is out of bounds.
227
228 public:
229 // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the
230 // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2.
231 // E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>.
232 // The resulting number of lanes is still 1 because this N represents 1/4
233 // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of
234 // the sizes so that the correct LMUL overloads are chosen, even if N is
235 // small enough that it would fit in an LMUL=1 vector.
236 //
237 // Cannot be an enum because GCC warns when using enums and non-enums in the
238 // same expression. Cannot be a static constexpr function (MSVC limitation).
239 // Rounded up to one so this is a valid array length.
240 //
241 // Do not use this directly - only 'public' so it is visible from the accessor
242 // macro required by MSVC.
243 static constexpr size_t kPrivateLanes =
244 HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac));
245 // Do not use this directly - only 'public' so it is visible from the accessor
246 // macro required by MSVC.
247 static constexpr int kPrivatePow2 = kPow2;
248
249 constexpr size_t MaxLanes() const { return kPrivateLanes; }
250 constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); }
251 constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; }
252 // For SFINAE (HWY_IF_POW2_GT_D).
253 constexpr int Pow2() const { return kPow2; }
254
255 // ------------------------------ Changing lane type or count
256 // Do not use any of these directly. Anything used from member typedefs cannot
257 // be made private, but functions only used within other functions can.
258
259 // Returns number of NewT lanes that fit within MaxBytes().
260 template <typename NewT>
261 static constexpr size_t RepartitionLanes() {
262 // Round up to correctly handle larger NewT.
263 return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
264 }
265
266 // Returns the new kPow2 required for lanes of type NewT.
267 template <typename NewT>
268 static constexpr int RebindPow2() {
269 return kPow2 +
270 ((sizeof(NewT) >= sizeof(T))
271 ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
272 : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
273 }
274
275 private:
  // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
  // Computed as kNewMaxLanes scaled by 2^-kNewPow2; the result is 0 when that
  // right-shift discards every set bit of kNewMaxLanes, in which case NewN()
  // uses FracN() instead.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t WholeN() {
    return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
  }
281
282 // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
283 template <int kNewPow2, size_t kNewMaxLanes>
284 static constexpr size_t FracN() {
285 // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN
286 // would not have been zero), but clamp to zero to avoid warnings. kFrac is
287 // the difference, stored in the upper bits of N, and we also set kWhole =
288 // 1 so that the new kPrivateLanes = kNewMaxLanes.
289 static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift");
290 return static_cast<size_t>(
291 1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes)))
292 << 20));
293 }
294
295 public:
296 // Returns (whole or fractional) NewN, see above.
297 template <int kNewPow2, size_t kNewMaxLanes>
298 static constexpr size_t NewN() {
299 // We require a fraction if inverting kNewPow2 results in 0.
300 return WholeN<kNewPow2, kNewMaxLanes>() == 0
301 ? FracN<kNewPow2, kNewMaxLanes>()
302 : WholeN<kNewPow2, kNewMaxLanes>();
303 }
304
305 // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
306 template <typename NewT>
307 using Rebind =
308 Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;
309
310 // Change lane type while keeping the same vector size, e.g. for MulEven.
311 template <typename NewT>
314
315 // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
316 using Half = Simd<T, N, kPow2 - 1>;
317
318 // Twice the lanes while keeping the same lane type, e.g. for Combine.
320};
321
322namespace detail {
323
324template <typename T, size_t N, int kPow2>
325constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
326 return N == HWY_LANES(T) && kPow2 == 0;
327}
328
329// Struct wrappers enable validation of arguments via static_assert.
330template <typename T, size_t N, int kPow2>
332 using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>;
333};
334
335template <typename T, int kPow2>
337 using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type;
338};
339
340template <typename T, size_t kLimit, int kPow2>
342 static_assert(kLimit != 0, "Does not make sense to have zero lanes");
343 // Safely handle non-power-of-two inputs by rounding down, which is allowed by
344 // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
345 static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
346 static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T));
348};
349
350template <typename T, size_t kNumLanes>
352 static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
353 static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
355};
356
357} // namespace detail
358
359// ------------------------------ Aliases for Simd<>
360
361// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
362// loops where the application does not care about the vector size) or a
363// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
364// return values of type promotion and demotion. User-specified kPow2 is
365// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
366template <typename T, int kPow2 = 0>
368
369// Tag describing a vector with *up to* kLimit active lanes, even on targets
370// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
371// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for
372// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
373// DCTs. However, it is better if data structures are designed to be
374// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
375// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
376// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
377// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
378template <typename T, size_t kLimit, int kPow2 = 0>
380
381#if !HWY_HAVE_SCALABLE
382// If the vector size is known, and the app knows it does not want more than
383// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
384// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
385template <typename T, size_t kLimit, int kPow2 = 0>
387#else // HWY_HAVE_SCALABLE
388// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
389template <typename T, size_t kLimit, int kPow2 = 0>
391#endif
392
393// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
394// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
395// two not exceeding `HWY_LANES(T)`.
396//
397// NOTE: if the application does not need to support HWY_SCALAR (+), use this
398// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
399// This is useful for data structures that rely on exactly 128-bit SIMD, but
400// these are discouraged because they cannot benefit from wider vectors.
401// Instead, applications would ideally define a larger problem size and loop
402// over it with the (unknown size) vectors from ScalableTag.
403//
404// + e.g. if the baseline is known to support SIMD, or the application requires
405// ops such as TableLookupBytes not supported by HWY_SCALAR.
406template <typename T, size_t kNumLanes>
408
// Convenience form for fixed sizes: tags with N = bytes / sizeof(T) lanes and
// kPow2 = 0, where bytes is 2, 4, 8 or 16. If T is larger than that many
// bytes, N is zero and Simd<> rejects it via static_assert.

// 2 bytes (16 bits) worth of lanes.
template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;

// 4 bytes (32 bits) worth of lanes.
template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;

// 8 bytes (64 bits) worth of lanes.
template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;

// 16 bytes (128 bits) worth of lanes.
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
421
422// ------------------------------ Accessors for Simd<>
423
// Lane type of the tag D, i.e. its member typedef D::T (for Simd<>, this is
// the first template argument).
template <class D>
using TFromD = typename D::T;
427
428// Upper bound on the number of lanes, typically used for SFINAE conditions and
429// to allocate storage for targets with known vector sizes. Note: this may be a
430// loose bound, instead use Lanes() as the actual size for AllocateAligned.
431// MSVC workaround: use static constant directly instead of a function.
432#define HWY_MAX_LANES_D(D) D::kPrivateLanes
433
434// Same as D().Pow2(), but this is too complex for SFINAE with MSVC, so we use a
435// static constant directly.
436#define HWY_POW2_D(D) D::kPrivatePow2
437
438// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
439// macro form may be required for MSVC, which has limitations on deducing
440// arguments.
441template <class D>
443 return HWY_MAX_LANES_D(D);
444}
445
446#if !HWY_HAVE_SCALABLE
447
// If non-scalable, this is constexpr; otherwise the target's header defines a
// non-constexpr version of this function. This is the actual vector length,
// used when advancing loop counters.
// The argument is unused; only its type D matters. The result is the same
// constant that HWY_MAX_LANES_D(D) yields.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) {
  return HWY_MAX_LANES_D(D);
}
455
456#endif // !HWY_HAVE_SCALABLE
457
// Tag for the same number of lanes as D, but with the LaneType T. Forwards to
// the member alias template D::Rebind, which Simd<> describes as intended for
// PromoteTo/DemoteTo().
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
461
462template <class D>
464template <class D>
466template <class D>
468
// Tag for the same total size as D, but with the LaneType T (the number of
// lanes changes accordingly). Forwards to the member alias template
// D::Repartition.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
472
473template <class D>
475template <class D>
477
478// Shorthand for applying RepartitionToWide twice (for 8/16-bit types).
479template <class D>
481// Shorthand for applying RepartitionToWide three times (for 8-bit types).
482template <class D>
484
// Tag for the same lane type as D, but half the lanes. Forwards to the member
// typedef D::Half, which for Simd<T, N, kPow2> is Simd<T, N, kPow2 - 1>.
template <class D>
using Half = typename D::Half;
488
// Tag for the same lane type as D, but twice the lanes. Forwards to the member
// typedef D::Twice.
template <class D>
using Twice = typename D::Twice;
492
493// Tag for a 16-byte block with the same lane type as D
494#if HWY_HAVE_SCALABLE
495namespace detail {
496
497template <class D>
498class BlockDFromD_t {};
499
500template <typename T, size_t N, int kPow2>
501class BlockDFromD_t<Simd<T, N, kPow2>> {
502 using D = Simd<T, N, kPow2>;
503 static constexpr int kNewPow2 = HWY_MIN(kPow2, 0);
504 static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D));
505 static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>();
506
507 public:
508 using type = Simd<T, kNewN, kNewPow2>;
509};
510
511} // namespace detail
512
// Variant for targets with scalable vectors: looks up the block tag via
// detail::BlockDFromD_t after removing any const qualifier from D.
template <class D>
using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
515#else
516template <class D>
518 Simd<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), HWY_MAX_LANES_D(D)), 0>;
519#endif
520
521// Returns whether `ptr` is a multiple of `Lanes(d)` elements.
522template <class D, typename T>
523HWY_API bool IsAligned(D d, T* ptr) {
524 const size_t N = Lanes(d);
525 return reinterpret_cast<uintptr_t>(ptr) % (N * sizeof(T)) == 0;
526}
527
528// ------------------------------ Choosing overloads (SFINAE)
529
530// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
531#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
532#define HWY_IF_NOT_UNSIGNED_D(D) \
533 HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
534#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
535#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
536#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
537#define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
538#define HWY_IF_NOT_FLOAT3264_D(D) \
539 HWY_IF_NOT_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
540#define HWY_IF_SPECIAL_FLOAT_D(D) \
541 HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
542#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) \
543 HWY_IF_NOT_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
544#define HWY_IF_FLOAT_OR_SPECIAL_D(D) \
545 HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)
546#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \
547 HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)
548
549#define HWY_IF_T_SIZE_D(D, bytes) \
550 HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
551#define HWY_IF_NOT_T_SIZE_D(D, bytes) \
552 HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
553#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \
554 HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromD<D>, bit_array)
555#define HWY_IF_T_SIZE_LE_D(D, bytes) \
556 HWY_IF_T_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
557#define HWY_IF_T_SIZE_GT_D(D, bytes) \
558 HWY_IF_T_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
559
560#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes)
561#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes)
562#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes)
563#define HWY_IF_LANES_PER_BLOCK_D(D, lanes) \
564 HWY_IF_LANES_PER_BLOCK(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), \
565 lanes)
566
567#if HWY_COMPILER_MSVC
568#define HWY_IF_POW2_LE_D(D, pow2) \
569 hwy::EnableIf<HWY_POW2_D(D) <= pow2>* = nullptr
570#define HWY_IF_POW2_GT_D(D, pow2) \
571 hwy::EnableIf<(HWY_POW2_D(D) > pow2)>* = nullptr
572#else
573#define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr
574#define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr
575#endif // HWY_COMPILER_MSVC
576
577#define HWY_IF_U8_D(D) HWY_IF_U8(hwy::HWY_NAMESPACE::TFromD<D>)
578#define HWY_IF_U16_D(D) HWY_IF_U16(hwy::HWY_NAMESPACE::TFromD<D>)
579#define HWY_IF_U32_D(D) HWY_IF_U32(hwy::HWY_NAMESPACE::TFromD<D>)
580#define HWY_IF_U64_D(D) HWY_IF_U64(hwy::HWY_NAMESPACE::TFromD<D>)
581
582#define HWY_IF_I8_D(D) HWY_IF_I8(hwy::HWY_NAMESPACE::TFromD<D>)
583#define HWY_IF_I16_D(D) HWY_IF_I16(hwy::HWY_NAMESPACE::TFromD<D>)
584#define HWY_IF_I32_D(D) HWY_IF_I32(hwy::HWY_NAMESPACE::TFromD<D>)
585#define HWY_IF_I64_D(D) HWY_IF_I64(hwy::HWY_NAMESPACE::TFromD<D>)
586
587// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double
588// overloads.
589#define HWY_IF_UI8_D(D) HWY_IF_UI8(hwy::HWY_NAMESPACE::TFromD<D>)
590#define HWY_IF_UI16_D(D) HWY_IF_UI16(hwy::HWY_NAMESPACE::TFromD<D>)
591#define HWY_IF_UI32_D(D) HWY_IF_UI32(hwy::HWY_NAMESPACE::TFromD<D>)
592#define HWY_IF_UI64_D(D) HWY_IF_UI64(hwy::HWY_NAMESPACE::TFromD<D>)
593
594#define HWY_IF_BF16_D(D) HWY_IF_BF16(hwy::HWY_NAMESPACE::TFromD<D>)
595#define HWY_IF_NOT_BF16_D(D) HWY_IF_NOT_BF16(hwy::HWY_NAMESPACE::TFromD<D>)
596
597#define HWY_IF_F16_D(D) HWY_IF_F16(hwy::HWY_NAMESPACE::TFromD<D>)
598#define HWY_IF_NOT_F16_D(D) HWY_IF_NOT_F16(hwy::HWY_NAMESPACE::TFromD<D>)
599
600#define HWY_IF_F32_D(D) HWY_IF_F32(hwy::HWY_NAMESPACE::TFromD<D>)
601#define HWY_IF_F64_D(D) HWY_IF_F64(hwy::HWY_NAMESPACE::TFromD<D>)
602
603#define HWY_V_SIZE_D(D) \
604 (HWY_MAX_LANES_D(D) * sizeof(hwy::HWY_NAMESPACE::TFromD<D>))
605#define HWY_IF_V_SIZE_D(D, bytes) \
606 HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
607#define HWY_IF_V_SIZE_LE_D(D, bytes) \
608 HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
609#define HWY_IF_V_SIZE_GT_D(D, bytes) \
610 HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
611
612// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
613#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
614#define HWY_IF_NOT_UNSIGNED_V(V) \
615 HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
616#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
617#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
618#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
619#define HWY_IF_SPECIAL_FLOAT_V(V) \
620 HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
621#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
622 HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
623
624#define HWY_IF_T_SIZE_V(V, bytes) \
625 HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
626#define HWY_IF_NOT_T_SIZE_V(V, bytes) \
627 HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
628#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
629 HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromV<V>, bit_array)
630
631#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>)
632#define HWY_IF_V_SIZE_V(V, bytes) \
633 HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
634#define HWY_IF_V_SIZE_LE_V(V, bytes) \
635 HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
636#define HWY_IF_V_SIZE_GT_V(V, bytes) \
637 HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
638
639// Use in implementations of ReduceSum etc. to avoid conflicts with the N=1 and
640// N=4 8-bit specializations in generic_ops-inl.
641#undef HWY_IF_REDUCE_D
642#define HWY_IF_REDUCE_D(D) \
643 hwy::EnableIf<HWY_MAX_LANES_D(D) != 1 && \
644 (HWY_MAX_LANES_D(D) != 4 || \
645 sizeof(hwy::HWY_NAMESPACE::TFromD<D>) != 1)>* = nullptr
646
647#undef HWY_IF_SUM_OF_LANES_D
648#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
649
650#undef HWY_IF_MINMAX_OF_LANES_D
651#define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
652
653#undef HWY_IF_ADDSUB_V
654#define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
655
656#undef HWY_IF_MULADDSUB_V
657#define HWY_IF_MULADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
658
659// HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
660// implementation of unsigned to signed DemoteTo/ReorderDemote2To in
661// generic_ops-inl.h for at least some of the unsigned to signed demotions on
662// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2
663
664#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
665#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr
666
667// Old names (deprecated)
668#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
669#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
670
671// NOLINTNEXTLINE(google-readability-namespace-comments)
672} // namespace HWY_NAMESPACE
673} // namespace hwy
675
676#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_MAYBE_UNUSED
Definition base.h:113
HWY_INLINE void MaybeUnpoison(T *HWY_RESTRICT unaligned, size_t count)
Definition ops/shared-inl.h:151
HWY_INLINE T * NativeLanePointer(T *p)
Definition ops/shared-inl.h:111
typename NativeLaneTypeT< T >::type NativeLaneType
Definition ops/shared-inl.h:99
HWY_INLINE If< IsConst< T >(), const uint16_t *, uint16_t * > U16LanePointer(T *p)
Definition ops/shared-inl.h:139
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition ops/shared-inl.h:146
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition ops/shared-inl.h:325
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
Simd< TFromD< D >, HWY_MIN(16/sizeof(TFromD< D >), HWY_MAX_LANES_D(D)), 0 > BlockDFromD
Definition ops/shared-inl.h:517
D d
Definition arm_sve-inl.h:1915
RepartitionToWide< RepartitionToWideX2< D > > RepartitionToWideX3
Definition ops/shared-inl.h:483
V VecArg
Definition ops/shared-inl.h:69
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API bool IsAligned(D d, T *ptr)
Definition ops/shared-inl.h:523
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:476
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
RepartitionToWide< RepartitionToWide< D > > RepartitionToWideX2
Definition ops/shared-inl.h:480
typename D::Half Half
Definition ops/shared-inl.h:487
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition ops/shared-inl.h:407
CappedTag< T, kLimit, kPow2 > CappedTagIfFixed
Definition ops/shared-inl.h:386
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:467
typename D::Twice Twice
Definition ops/shared-inl.h:491
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
Definition abort.h:8
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
constexpr size_t CeilLog2(TI x)
Definition base.h:2669
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_LANES(T)
Definition set_macros-inl.h:169
#define HWY_MAX_N
Definition set_macros-inl.h:61
#define HWY_MIN_POW2
Definition set_macros-inl.h:78
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
#define HWY_MAX_POW2
Definition set_macros-inl.h:72
Definition ops/shared-inl.h:198
static constexpr size_t WholeN()
Definition ops/shared-inl.h:278
static constexpr size_t kPrivateLanes
Definition ops/shared-inl.h:243
constexpr Simd()=default
static constexpr size_t NewN()
Definition ops/shared-inl.h:298
static constexpr int kFrac
Definition ops/shared-inl.h:215
static constexpr size_t FracN()
Definition ops/shared-inl.h:284
constexpr size_t MaxBytes() const
Definition ops/shared-inl.h:250
constexpr size_t MaxLanes() const
Definition ops/shared-inl.h:249
constexpr int Pow2() const
Definition ops/shared-inl.h:253
static constexpr int RebindPow2()
Definition ops/shared-inl.h:268
static constexpr int kPrivatePow2
Definition ops/shared-inl.h:247
constexpr size_t MaxBlocks() const
Definition ops/shared-inl.h:251
static constexpr size_t kWhole
Definition ops/shared-inl.h:213
Lane T
Definition ops/shared-inl.h:200
static constexpr size_t RepartitionLanes()
Definition ops/shared-inl.h:261
Definition ops/shared-inl.h:341
static constexpr size_t N
Definition ops/shared-inl.h:346
typename ClampNAndPow2< T, N, kPow2 >::type type
Definition ops/shared-inl.h:347
static constexpr size_t kLimitPow2
Definition ops/shared-inl.h:345
Definition ops/shared-inl.h:331
Definition ops/shared-inl.h:351
Definition ops/shared-inl.h:75
T type
Definition ops/shared-inl.h:76
Definition ops/shared-inl.h:336
typename ClampNAndPow2< T, HWY_LANES(T), kPow2 >::type type
Definition ops/shared-inl.h:337
Definition base.h:1594
uint16_t bits
Definition base.h:1606
Definition base.h:1117