Grok 12.0.1
base.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_BASE_H_
17#define HIGHWAY_HWY_BASE_H_
18
19// For SIMD module implementations and their callers, target-independent.
20
21// IWYU pragma: begin_exports
22#include <stddef.h>
23#include <stdint.h>
24
26#include "hwy/highway_export.h"
27
28#if HWY_COMPILER_MSVC && defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
29#define HWY_CXX_LANG _MSVC_LANG
30#else
31#define HWY_CXX_LANG __cplusplus
32#endif
33
34// "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
35#if !HWY_IDE
36
37#if !defined(HWY_NO_LIBCXX)
38#ifndef __STDC_FORMAT_MACROS
39#define __STDC_FORMAT_MACROS // before inttypes.h
40#endif
41#include <inttypes.h>
42#endif
43
44#if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
45#include <atomic>
46#endif
47
48#endif // !HWY_IDE
49
50#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
51 __cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
52 !defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
53#if __has_include(<compare>)
54#include <compare>
55#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
56#endif
57#endif
58
59// IWYU pragma: end_exports
60
61#if HWY_COMPILER_MSVC
62#include <string.h> // memcpy
63#endif
64
65//------------------------------------------------------------------------------
66// Compiler-specific definitions
67
// Turns the argument into a string literal exactly as written (no expansion).
#define HWY_STR_IMPL(macro) #macro
// Stringifies `macro` AFTER expanding it; the extra level of indirection is
// what forces the expansion before the # operator is applied.
#define HWY_STR(macro) HWY_STR_IMPL(macro)
70
// Portable spellings of compiler-specific keywords and attributes. The first
// branch targets MSVC, the second any GCC-compatible compiler.
#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// No branch-prediction hints here: the expression is passed through as-is.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
// Selects the first (MSVC) of the two per-compiler diagnostic spellings.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
// force inlining without optimization enabled creates very inefficient code
// that can cause compiler timeout
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// `!!` normalizes the expression to 0/1, as __builtin_expect compares values.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
// Selects the second (GCC/Clang) of the two per-compiler diagnostic spellings.
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
117
118//------------------------------------------------------------------------------
119// Builtin/attributes (no more #include after this point due to namespace!)
120
121namespace hwy {
122
123// Enables error-checking of format strings.
124#if HWY_HAS_ATTRIBUTE(__format__)
125#define HWY_FORMAT(idx_fmt, idx_arg) \
126 __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
127#else
128#define HWY_FORMAT(idx_fmt, idx_arg)
129#endif
130
131// Returns a void* pointer which the compiler then assumes is N-byte aligned.
132// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
133//
134// The assignment semantics are required by GCC/Clang. ICC provides an in-place
135// __assume_aligned, whereas MSVC's __assume appears unsuitable.
136#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
137#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
138#else
139#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
140#endif
141
// Returns a pointer whose type is `type` (T*), while allowing the compiler to
// assume that the untyped pointer `ptr` is aligned to a multiple of alignof(T).
// RemovePtr is namespace-qualified so the macro also works when expanded
// outside of namespace hwy.
#define HWY_RCAST_ALIGNED(type, ptr) \
  reinterpret_cast<type>(            \
      HWY_ASSUME_ALIGNED((ptr), alignof(hwy::RemovePtr<type>)))
146
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
// HWY_PUSH_ATTRIBUTES(targets_str) starts a region in which functions receive
// the given target string; HWY_POP_ATTRIBUTES ends it.
#if HWY_COMPILER_ICC
// As of ICC 2021.{1-9} the pragma is neither implemented nor required.
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#elif HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC_ACTUAL
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
167
168//------------------------------------------------------------------------------
169// Macros
170
// Prefix for functions defined in headers: internal linkage, inlined,
// flattened, and exempt from unused-function warnings.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Pastes two tokens; HWY_CONCAT expands its arguments first.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// NOTE: each argument may be evaluated twice; do not pass expressions with
// side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
178
// Loop-unrolling hints, placed directly before a loop. Both expand to nothing
// on compilers without a known unroll pragma.
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
// Empty factor: lets the compiler choose.
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
190
191// Tell a compiler that the expression always evaluates to true.
192// The expression should be free from any side effects.
193// Some older compilers may have trouble with complex expressions, therefore
194// it is advisable to split multiple conditions into separate assume statements,
195// and manually check the generated code.
196// OK but could fail:
197// HWY_ASSUME(x == 2 && y == 3);
198// Better:
199// HWY_ASSUME(x == 2);
200// HWY_ASSUME(y == 3);
201#if HWY_HAS_CPP_ATTRIBUTE(assume)
202#define HWY_ASSUME(expr) [[assume(expr)]]
203#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
204#define HWY_ASSUME(expr) __assume(expr)
205// __builtin_assume() was added in clang 3.6.
206#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
207#define HWY_ASSUME(expr) __builtin_assume(expr)
208// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
209// later, so check for the compiler version directly.
210#elif HWY_COMPILER_GCC_ACTUAL >= 405
211#define HWY_ASSUME(expr) \
212 ((expr) ? static_cast<void>(0) : __builtin_unreachable())
213#else
214#define HWY_ASSUME(expr) static_cast<void>(0)
215#endif
216
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
// Requires <atomic>, hence disabled when HWY_NO_LIBCXX is defined.
#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
#define HWY_FENCE
#endif
226
// 4 instances of a given literal value, useful as input to LoadDup128.
// Expands to a comma-separated list, e.g. for use inside a brace initializer.
#define HWY_REP4(literal) literal, literal, literal, literal
229
231 Abort(const char* file, int line, const char* format, ...);
232
// Calls ::hwy::Abort with the current file/line and a printf-style message.
// `##__VA_ARGS__` drops the trailing comma when no extra arguments are given.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled. Aborts with the stringified condition if it is false; the
// do/while(0) wrapper makes the macro usable as a single statement.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
243
244#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
245#define HWY_IS_MSAN 1
246#else
247#define HWY_IS_MSAN 0
248#endif
249
250#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
251#define HWY_IS_ASAN 1
252#else
253#define HWY_IS_ASAN 0
254#endif
255
256#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
257#define HWY_IS_TSAN 1
258#else
259#define HWY_IS_TSAN 0
260#endif
261
262#if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
263 defined(UNDEFINED_BEHAVIOR_SANITIZER)
264#define HWY_IS_UBSAN 1
265#else
266#define HWY_IS_UBSAN 0
267#endif
268
269// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
270// You can disable MSAN by adding this attribute to the function that fails.
271#if HWY_IS_MSAN
272#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
273#else
274#define HWY_ATTR_NO_MSAN
275#endif
276
// For enabling HWY_DASSERT and shortening tests in slower debug builds.
// May be predefined by the build system to override the detection below.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only assertion. In non-debug builds the condition is not evaluated.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
// `constexpr` only if the compiler implements the C++17 constexpr rules.
#if __cpp_constexpr >= 201603L
#define HWY_CXX17_CONSTEXPR constexpr
#else
#define HWY_CXX17_CONSTEXPR
#endif
// `constexpr` only if the compiler implements the C++14 (relaxed) rules.
#if __cpp_constexpr >= 201304L
#define HWY_CXX14_CONSTEXPR constexpr
#else
#define HWY_CXX14_CONSTEXPR
#endif

// `if constexpr` from C++17 onwards, otherwise a plain run-time `if`. Both
// branches must therefore remain compilable when targeting older standards.
#if HWY_CXX_LANG >= 201703L
#define HWY_IF_CONSTEXPR if constexpr
#else
#define HWY_IF_CONSTEXPR if
#endif

// Defaults to 0 unless already defined.
#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
#endif
316
317//------------------------------------------------------------------------------
318// CopyBytes / ZeroBytes
319
320#if HWY_COMPILER_MSVC
321#pragma intrinsic(memcpy)
322#pragma intrinsic(memset)
323#endif
324
325// The source/destination must not overlap/alias.
326template <size_t kBytes, typename From, typename To>
327HWY_API void CopyBytes(const From* from, To* to) {
328#if HWY_COMPILER_MSVC
329 memcpy(to, from, kBytes);
330#else
331 __builtin_memcpy(to, from, kBytes);
332#endif
333}
334
335HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
336 size_t num_of_bytes_to_copy) {
337#if HWY_COMPILER_MSVC
338 memcpy(to, from, num_of_bytes_to_copy);
339#else
340 __builtin_memcpy(to, from, num_of_bytes_to_copy);
341#endif
342}
343
344// Same as CopyBytes, but for same-sized objects; avoids a size argument.
345template <typename From, typename To>
346HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
347 static_assert(sizeof(From) == sizeof(To), "");
348 CopyBytes<sizeof(From)>(from, to);
349}
350
351template <size_t kBytes, typename To>
352HWY_API void ZeroBytes(To* to) {
353#if HWY_COMPILER_MSVC
354 memset(to, 0, kBytes);
355#else
356 __builtin_memset(to, 0, kBytes);
357#endif
358}
359
360HWY_API void ZeroBytes(void* to, size_t num_bytes) {
361#if HWY_COMPILER_MSVC
362 memset(to, 0, num_bytes);
363#else
364 __builtin_memset(to, 0, num_bytes);
365#endif
366}
367
368//------------------------------------------------------------------------------
369// kMaxVectorSize (undocumented, pending removal)
370
371#if HWY_ARCH_X86
372static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
373#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
374 __riscv_v_intrinsic >= 11000
375// Not actually an upper bound on the size.
376static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
377#else
378static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
379#endif
380
381//------------------------------------------------------------------------------
382// Alignment
383
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// Expands to an alignas() specifier to be placed before a declaration.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
395
396//------------------------------------------------------------------------------
397// Lane types
398
// hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
// BitCastScalar to be implemented before the implementations of the
// hwy::float16_t and hwy::bfloat16_t types
struct float16_t;
struct bfloat16_t;

// Width-explicit aliases for the built-in floating-point types.
using float32_t = float;
using float64_t = double;
407
#pragma pack(push, 1)

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};

// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};

// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;
};

#pragma pack(pop)
432
433static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
434 const uint128_t& b) {
435 return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
436}
437// Required for std::greater.
438static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
439 const uint128_t& b) {
440 return b < a;
441}
442static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
443 const uint128_t& b) {
444 return a.lo == b.lo && a.hi == b.hi;
445}
446
447static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
448 const K64V64& b) {
449 return a.key < b.key;
450}
451// Required for std::greater.
452static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
453 const K64V64& b) {
454 return b < a;
455}
456static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
457 const K64V64& b) {
458 return a.key == b.key;
459}
460
461static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
462 const K32V32& b) {
463 return a.key < b.key;
464}
465// Required for std::greater.
466static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
467 const K32V32& b) {
468 return b < a;
469}
470static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
471 const K32V32& b) {
472 return a.key == b.key;
473}
474
475//------------------------------------------------------------------------------
476// Controlling overload resolution (SFINAE)
477
// SFINAE helper: the nested `type` (void) exists only when Condition is true,
// so naming it with a false condition is a substitution failure.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};
484
485template <bool Condition>
487
// `value` is 1 if T and U are exactly the same type (including cv-qualifiers
// and references), otherwise 0.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

// Partial specialization chosen when both arguments are the same type.
template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};
497
498template <typename T, typename U>
499HWY_API constexpr bool IsSame() {
501}
502
503// Returns whether T matches either of U1 or U2
504template <typename T, typename U1, typename U2>
508
// Compile-time type selection: `type` is Then if Condition is true...
template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

// ...and Else if Condition is false.
template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};
518
519template <bool Condition, typename Then, typename Else>
521
// `value` is 1 if T is const-qualified at the top level, otherwise 0.
template <typename T>
struct IsConstT {
  enum { value = 0 };
};

// Partial specialization chosen for top-level const types.
template <typename T>
struct IsConstT<const T> {
  enum { value = 1 };
};
531
532template <typename T>
533HWY_API constexpr bool IsConst() {
534 return IsConstT<T>::value;
535}
536
537template <class T>
539 using type = T;
540};
541template <class T>
542struct RemoveConstT<const T> {
543 using type = T;
544};
545
546template <class T>
548
549template <class T>
551 using type = T;
552};
553template <class T>
554struct RemoveVolatileT<volatile T> {
555 using type = T;
556};
557
558template <class T>
560
561template <class T>
563 using type = T;
564};
565template <class T>
566struct RemoveRefT<T&> {
567 using type = T;
568};
569template <class T>
570struct RemoveRefT<T&&> {
571 using type = T;
572};
573
574template <class T>
576
577template <class T>
579
580template <class T>
582 using type = T;
583};
584template <class T>
585struct RemovePtrT<T*> {
586 using type = T;
587};
588template <class T>
589struct RemovePtrT<const T*> {
590 using type = T;
591};
592template <class T>
593struct RemovePtrT<volatile T*> {
594 using type = T;
595};
596template <class T>
597struct RemovePtrT<const volatile T*> {
598 using type = T;
599};
600
601template <class T>
603
// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
//
// Macro arguments are parenthesized so that expressions such as `N + 1` are
// multiplied/compared as a whole.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) == (bytes))>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<((kN) * sizeof(T) > (bytes))>* = nullptr

// Same idea, but comparing the number of lanes rather than the byte size.
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<((kN) == (lanes))>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<((kN) <= (lanes))>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<((kN) > (lanes))>* = nullptr
619
// Enable an overload depending on the category of lane type T. Note that
// HWY_IF_SIGNED additionally excludes floating-point and special-float types,
// whereas HWY_IF_NOT_UNSIGNED does not.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                    \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
                !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
638
// Enable an overload depending on sizeof(T) in bytes.
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
// Bit i of bit_array being set means that sizeof(T) == i is accepted.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
#define HWY_IF_T_SIZE_LE(T, bytes) \
  hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_T_SIZE_GT(T, bytes) \
  hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
651
// Enable an overload if T, after removing cv-qualifiers and references, is
// (or is not) exactly `expected`.
#define HWY_IF_SAME(T, expected) \
  hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
#define HWY_IF_NOT_SAME(T, expected) \
  hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr

// One of two expected types
#define HWY_IF_SAME2(T, expected1, expected2)                            \
  hwy::EnableIf<                                                         \
      hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
      nullptr

#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)

#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)

#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)

#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)

#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)

// Enable an overload if the number of T lanes within one block (a vector
// capped at 16 bytes) equals LANES. N is parenthesized so that expressions
// such as `N / 2` are scaled as a whole.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES)                                  \
  hwy::EnableIf<(HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES))>* = \
      nullptr
691
// Empty struct used as a size tag type: distinct N yield distinct types, which
// allows dispatching on a compile-time size via overloading.
template <size_t N>
struct SizeTag {};
695
696template <class T>
697class DeclValT {
698 private:
699 template <class U, class URef = U&&>
700 static URef TryAddRValRef(int);
701 template <class U, class Arg>
702 static U TryAddRValRef(Arg);
703
704 public:
705 using type = decltype(TryAddRValRef<T>(0));
707};
708
709// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
710// expression of a decltype specifier.
711
712// hwy::DeclVal<T>() does not require that T have a public default constructor
713template <class T>
714HWY_API typename DeclValT<T>::type DeclVal() noexcept {
716 "DeclVal() cannot be used in an evaluated context");
717}
718
// `value` is 1 if T is an array type (of known or unknown bound), else 0.
template <class T>
struct IsArrayT {
  enum { value = 0 };
};

// Array of unknown bound.
template <class T>
struct IsArrayT<T[]> {
  enum { value = 1 };
};

// Array of known bound N.
template <class T, size_t N>
struct IsArrayT<T[N]> {
  enum { value = 1 };
};

// Function form of IsArrayT. References to arrays and pointers are not
// arrays, hence yield false.
template <class T>
static constexpr bool IsArray() {
  return IsArrayT<T>::value;
}
738
739#if HWY_COMPILER_MSVC
740HWY_DIAGNOSTICS(push)
741HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
742#endif
743
744template <class From, class To>
746 private:
747 template <class T>
748 static hwy::SizeTag<1> TestFuncWithToArg(T);
749
750 template <class T, class U>
751 static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
752 DeclVal<T>()))
754
755 template <class T, class U, class Arg>
757
758 public:
759 enum {
760 value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
762 (!IsArray<To>() &&
763 (IsSame<To, decltype(DeclVal<To>())>() ||
764 !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
765 IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
766 };
767};
768
769#if HWY_COMPILER_MSVC
771#endif
772
773template <class From, class To>
774HWY_API constexpr bool IsConvertible() {
776}
777
778template <class From, class To>
780 private:
781 template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
783
784 template <class T, class U, class Arg>
786
787 public:
788 enum {
789 value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
790 };
791};
792
793template <class From, class To>
794static constexpr bool IsStaticCastable() {
796}
797
// Enable an overload if an expression of type From can be static_cast to To.
// Helpers are namespace-qualified, like in the other HWY_IF_* macros, so that
// the macro also works when expanded outside of namespace hwy.
#define HWY_IF_CASTABLE(From, To) \
  hwy::EnableIf<hwy::IsStaticCastable<From, To>()>* = nullptr

// Enable an overload if the result of `Native op T` can be static_cast back
// to Native.
#define HWY_IF_OP_CASTABLE(op, T, Native) \
  HWY_IF_CASTABLE(decltype(hwy::DeclVal<Native>() op hwy::DeclVal<T>()), Native)
803
804template <class T, class From>
806 private:
807 template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
809
810 template <class T1, class T2, class Arg>
812
813 public:
814 enum {
815 value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
816 };
817};
818
819template <class T, class From>
820static constexpr bool IsAssignable() {
822}
823
// Enable an overload if a value of type From can be assigned to a T.
// IsAssignable is namespace-qualified, like in the other HWY_IF_* macros, so
// that the macro also works when expanded outside of namespace hwy.
#define HWY_IF_ASSIGNABLE(T, From) \
  hwy::EnableIf<hwy::IsAssignable<T, From>()>* = nullptr
826
827// ----------------------------------------------------------------------------
828// IsSpecialFloat
829
830// These types are often special-cased and not supported in all ops.
831template <typename T>
832HWY_API constexpr bool IsSpecialFloat() {
833 return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
834}
835
836// -----------------------------------------------------------------------------
837// IsIntegerLaneType and IsInteger
838
839template <class T>
840HWY_API constexpr bool IsIntegerLaneType() {
841 return false;
842}
843template <>
844HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
845 return true;
846}
847template <>
848HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
849 return true;
850}
851template <>
852HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
853 return true;
854}
855template <>
856HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
857 return true;
858}
859template <>
860HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
861 return true;
862}
863template <>
864HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
865 return true;
866}
867template <>
868HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
869 return true;
870}
871template <>
872HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
873 return true;
874}
875
876template <class T>
877HWY_API constexpr bool IsInteger() {
878 // NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
879 // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
880 // with the /Zc:wchar_t- option.
881 return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
882 IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
883 IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
884}
885template <>
886HWY_INLINE constexpr bool IsInteger<bool>() {
887 return true;
888}
889template <>
890HWY_INLINE constexpr bool IsInteger<char>() {
891 return true;
892}
893template <>
894HWY_INLINE constexpr bool IsInteger<signed char>() {
895 return true;
896}
897template <>
898HWY_INLINE constexpr bool IsInteger<unsigned char>() {
899 return true;
900}
901template <>
902HWY_INLINE constexpr bool IsInteger<short>() { // NOLINT
903 return true;
904}
905template <>
906HWY_INLINE constexpr bool IsInteger<unsigned short>() { // NOLINT
907 return true;
908}
909template <>
910HWY_INLINE constexpr bool IsInteger<int>() {
911 return true;
912}
913template <>
914HWY_INLINE constexpr bool IsInteger<unsigned>() {
915 return true;
916}
917template <>
918HWY_INLINE constexpr bool IsInteger<long>() { // NOLINT
919 return true;
920}
921template <>
922HWY_INLINE constexpr bool IsInteger<unsigned long>() { // NOLINT
923 return true;
924}
925template <>
926HWY_INLINE constexpr bool IsInteger<long long>() { // NOLINT
927 return true;
928}
929template <>
930HWY_INLINE constexpr bool IsInteger<unsigned long long>() { // NOLINT
931 return true;
932}
933#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
934template <>
935HWY_INLINE constexpr bool IsInteger<char8_t>() {
936 return true;
937}
938#endif
939template <>
940HWY_INLINE constexpr bool IsInteger<char16_t>() {
941 return true;
942}
943template <>
944HWY_INLINE constexpr bool IsInteger<char32_t>() {
945 return true;
946}
947
948// -----------------------------------------------------------------------------
949// BitCastScalar
950
951#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
952#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
953#else
954#define HWY_BITCASTSCALAR_CONSTEXPR
955#endif
956
957#if __cpp_constexpr >= 201304L
958#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
959#else
960#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
961#endif
962
963#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
964namespace detail {
965
966template <class From>
967struct BitCastScalarSrcCastHelper {
968 static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
969 return val;
970 }
971};
972
973#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
974// Workaround for Clang 9 constexpr __builtin_bit_cast bug
975template <class To, class From,
977 hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
979BuiltinBitCastScalar(const From& val) {
980 static_assert(sizeof(To) == sizeof(From),
981 "sizeof(To) == sizeof(From) must be true");
982 return static_cast<To>(val);
983}
984
985template <class To, class From,
986 hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
987 hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
989BuiltinBitCastScalar(const From& val) {
990 return __builtin_bit_cast(To, val);
991}
992#endif // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
993
994} // namespace detail
995
996template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
998 // If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
999 // const typename From::Native& or const uint16_t& using
1000 // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
1001 // allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
1002 // if To is not a pointer type, union type, or a struct/class containing a
1003 // pointer, union, or reference subobject
1004#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
1005 return detail::BuiltinBitCastScalar<To>(
1006 detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
1007 val));
1008#else
1009 return __builtin_bit_cast(
1010 To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
1011 val));
1012#endif
1013}
1014template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
1016 // If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
1017 // to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
1018 // as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
1019 // be constexpr if the __builtin_bit_cast intrinsic is available.
1020 return To::FromBits(BitCastScalar<uint16_t>(val));
1021}
1022#else
1023template <class To, class From>
1025 To result;
1026 CopySameSize(&val, &result);
1027 return result;
1028}
1029#endif
1030
1031//------------------------------------------------------------------------------
1032// F16 lane type
1033
1034#pragma pack(push, 1)
1035
1036// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
1037// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
1038// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
// HWY_NEON_HAVE_F16C is 1 if any one of the following holds, else 0. Each
// alternative is tested in its own branch instead of one combined expression.
#if HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// AArch64 with any compiler other than MSVC.
#define HWY_NEON_HAVE_F16C 1
#elif HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)
// Clang with bit 1 of __ARM_FP set.
#define HWY_NEON_HAVE_F16C 1
#elif HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE)
// GCC with __ARM_FP16_FORMAT_IEEE defined.
#define HWY_NEON_HAVE_F16C 1
#else
#define HWY_NEON_HAVE_F16C 0
#endif
1046
1047// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
1048// HWY_HAVE_FLOAT16.
// Requires all of: RISC-V target, __riscv_zvfh defined, and Clang >= 16.
#if !(HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600)
#define HWY_RVV_HAVE_F16_VEC 0
#else
#define HWY_RVV_HAVE_F16_VEC 1
#endif
1054
1055// x86 compiler supports _Float16, not necessarily with operators.
1056// Avoid clang-cl because it lacks __extendhfsf2.
// Common precondition first: x86 with SSE2 and a compiler-provided
// __FLT16_MAX__. After that, either supported compiler enables the type.
#if !(HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__))
#define HWY_SSE2_HAVE_F16_TYPE 0
#elif HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL
// Clang 15+, excluding clang-cl.
#define HWY_SSE2_HAVE_F16_TYPE 1
#elif HWY_COMPILER_GCC_ACTUAL >= 1200
// GCC 12+.
#define HWY_SSE2_HAVE_F16_TYPE 1
#else
#define HWY_SSE2_HAVE_F16_TYPE 0
#endif
1064
// A definition supplied before this point takes precedence.
#if !defined(HWY_HAVE_SCALAR_F16_TYPE)
// Compiler supports _Float16, not necessarily with operators. Any one of the
// per-architecture flags is sufficient.
#if HWY_NEON_HAVE_F16C
#define HWY_HAVE_SCALAR_F16_TYPE 1
#elif HWY_RVV_HAVE_F16_VEC
#define HWY_HAVE_SCALAR_F16_TYPE 1
#elif HWY_SSE2_HAVE_F16_TYPE
#define HWY_HAVE_SCALAR_F16_TYPE 1
#else
#define HWY_HAVE_SCALAR_F16_TYPE 0
#endif
#endif  // !defined(HWY_HAVE_SCALAR_F16_TYPE)
1073
// A definition supplied before this point takes precedence.
#if !defined(HWY_HAVE_SCALAR_F16_OPERATORS)
// Recent enough compiler also has operators. Without a scalar F16 type there
// is nothing to enable; otherwise any of the compiler checks below suffices.
#if !HWY_HAVE_SCALAR_F16_TYPE
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
#elif HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#elif HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL && !defined(_WIN32)
// Clang 15..17: only outside clang-cl and Windows.
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#elif HWY_ARCH_ARM && \
    (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)
// Arm targets accept older compilers.
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
#endif
#endif  // !defined(HWY_HAVE_SCALAR_F16_OPERATORS)
1087
1088namespace detail {
1089
1090template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
1092
1093template <class T, class TVal>
1097
1098template <class T>
1101
1102template <class T, class TVal = RemoveCvRef<T>>
1104 using type = T;
1105};
1106
1107template <class T>
1110
1111} // namespace detail
1112
1113// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
1114// by concatenating base type and bits. We use a wrapper class instead of a
1115// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
1116// are generated regardless of F16 support; see #1684.
1117struct alignas(2) float16_t {
1118#if HWY_HAVE_SCALAR_F16_TYPE
1119#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
1120 using Native = _Float16;
1121#elif HWY_NEON_HAVE_F16C
1122 using Native = __fp16;
1123#else
1124#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
1125#endif
1126#endif // HWY_HAVE_SCALAR_F16_TYPE
1127
1128 union {
1129#if HWY_HAVE_SCALAR_F16_TYPE
1130 // Accessed via NativeLaneType, and used directly if
1131 // HWY_HAVE_SCALAR_F16_OPERATORS.
1132 Native native;
1133#endif
1134 // Only accessed via NativeLaneType or U16LaneType.
1135 uint16_t bits;
1136 };
1137
1138 // Default init and copying.
1139 float16_t() noexcept = default;
1140 constexpr float16_t(const float16_t&) noexcept = default;
1141 constexpr float16_t(float16_t&&) noexcept = default;
1142 float16_t& operator=(const float16_t&) noexcept = default;
1143 float16_t& operator=(float16_t&&) noexcept = default;
1144
1145#if HWY_HAVE_SCALAR_F16_TYPE
1146 // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
1147 // float16_t(intrinsic()), but user code expects implicit conversions.
1148 constexpr float16_t(Native arg) noexcept : native(arg) {}
1149 constexpr operator Native() const noexcept { return native; }
1150#endif
1151
1152#if HWY_HAVE_SCALAR_F16_TYPE
1153 static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
1154 return float16_t(BitCastScalar<Native>(bits));
1155 }
1156#else
1157
1158 private:
1160 constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
1161 : bits(u16_bits) {}
1162
1163 public:
1164 static constexpr float16_t FromBits(uint16_t bits) {
1165 return float16_t(F16FromU16BitsTag(), bits);
1166 }
1167#endif
1168
1169 // When backed by a native type, ensure the wrapper behaves like the native
1170 // type by forwarding all operators. Unfortunately it seems difficult to reuse
1171 // this code in a base class, so we repeat it in bfloat16_t.
1172#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1173 template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
1174 IsConvertible<T, Native>()>* = nullptr>
1175 constexpr float16_t(T&& arg) noexcept
1176 : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1177
1178 template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
1179 !IsConvertible<T, Native>() &&
1180 IsStaticCastable<T, Native>()>* = nullptr>
1181 explicit constexpr float16_t(T&& arg) noexcept
1182 : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1183
1184 // pre-decrement operator (--x)
1185 HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
1186 native = static_cast<Native>(native - Native{1});
1187 return *this;
1188 }
1189
1190 // post-decrement operator (x--)
1191 HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
1192 float16_t result = *this;
1193 native = static_cast<Native>(native - Native{1});
1194 return result;
1195 }
1196
1197 // pre-increment operator (++x)
1198 HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
1199 native = static_cast<Native>(native + Native{1});
1200 return *this;
1201 }
1202
1203 // post-increment operator (x++)
1204 HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
1205 float16_t result = *this;
1206 native = static_cast<Native>(native + Native{1});
1207 return result;
1208 }
1209
1210 constexpr float16_t operator-() const noexcept {
1211 return float16_t(static_cast<Native>(-native));
1212 }
1213 constexpr float16_t operator+() const noexcept { return *this; }
1214
1215 // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
1216 // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
1217#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func) \
1218 constexpr float16_t op_func(const float16_t& rhs) const noexcept { \
1219 return float16_t(static_cast<Native>(native op rhs.native)); \
1220 } \
1221 template <typename T, HWY_IF_NOT_F16(T), \
1222 typename UnwrappedT = \
1223 detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
1224 typename RawResultT = \
1225 decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
1226 typename ResultT = \
1227 detail::NativeSpecialFloatToWrapper<RawResultT>, \
1228 HWY_IF_CASTABLE(RawResultT, ResultT)> \
1229 constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
1230 static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
1231 return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
1232 } \
1233 HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func( \
1234 const hwy::float16_t& rhs) noexcept { \
1235 native = static_cast<Native>(native op rhs.native); \
1236 return *this; \
1237 } \
1238 template <typename T, HWY_IF_NOT_F16(T), \
1239 HWY_IF_OP_CASTABLE(op, const T&, Native), \
1240 HWY_IF_ASSIGNABLE( \
1241 Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
1242 HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept( \
1243 noexcept( \
1244 static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
1245 native = static_cast<Native>(native op rhs); \
1246 return *this; \
1247 }
1248
1249 HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
1250 HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
1251 HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
1252 HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
1253#undef HWY_FLOAT16_BINARY_OP
1254
1255#endif // HWY_HAVE_SCALAR_F16_OPERATORS
1256};
1257static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
1258
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {

// When the second template argument is the native 16-bit float type, `type`
// names the hwy::float16_t wrapper.
template <class TAny>
struct NativeSpecialFloatToWrapperT<TAny, hwy::float16_t::Native> {
  using type = hwy::float16_t;
};

#if HWY_HAVE_SCALAR_F16_OPERATORS
// When the second template argument is hwy::float16_t and the third is true,
// `type` names float16_t's native type. Only provided when native operators
// are available.
template <class TAny>
struct SpecialFloatUnwrapArithOpOperandT<TAny, hwy::float16_t, true> {
  using type = hwy::float16_t::Native;
};
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE
1276
1277#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1278namespace detail {
1279
1280template <>
1281struct BitCastScalarSrcCastHelper<hwy::float16_t> {
1282#if HWY_HAVE_SCALAR_F16_TYPE
1283 static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
1284 const hwy::float16_t& val) {
1285 return val.native;
1286 }
1287#else
1288 static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
1289 const hwy::float16_t& val) {
1290 return val.bits;
1291 }
1292#endif
1293};
1294
1295} // namespace detail
1296#endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1297
// Qualifier applied to the float16_t helper functions and comparison
// operators: plain constexpr when native operators are available, otherwise
// whatever HWY_BITCASTSCALAR_CXX14_CONSTEXPR expands to.
#if !HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#else
#define HWY_F16_CONSTEXPR constexpr
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
1303
1305#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
1306 return static_cast<float>(f16);
1307#endif
1308#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1309 const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
1310 const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1311 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1312 const uint32_t mantissa = bits16 & 0x3FF;
1313
1314 // Subnormal or zero
1315 if (biased_exp == 0) {
1316 const float subnormal =
1317 (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1318 return sign ? -subnormal : subnormal;
1319 }
1320
1321 // Normalized, infinity or NaN: convert the representation directly
1322 // (faster than ldexp/tables).
1323 const uint32_t biased_exp32 =
1324 biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
1325 const uint32_t mantissa32 = mantissa << (23 - 10);
1326 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1327
1328 return BitCastScalar<float>(bits32);
1329#endif // !HWY_HAVE_SCALAR_F16_OPERATORS
1330}
1331
1332#if HWY_IS_DEBUG_BUILD && \
1333 (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
1334#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
1335// If C++23 if !consteval support is available, only execute
1336// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
1337// context to avoid compilation errors.
1338#define HWY_F16_FROM_F32_DASSERT(condition) \
1339 do { \
1340 if !consteval { \
1341 HWY_DASSERT(condition); \
1342 } \
1343 } while (0)
1344#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
1345 HWY_COMPILER_MSVC >= 1926
1346// If the __builtin_is_constant_evaluated() intrinsic is available,
1347// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
1348// false to avoid compilation errors if F16FromF32 is called from a
1349// constant-evaluated context.
1350#define HWY_F16_FROM_F32_DASSERT(condition) \
1351 do { \
1352 if (!__builtin_is_constant_evaluated()) { \
1353 HWY_DASSERT(condition); \
1354 } \
1355 } while (0)
1356#else
1357// If C++23 if !consteval support is not available,
1358// the __builtin_is_constant_evaluated() intrinsic is not available,
1359// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
1360// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
1361// called from a constant-evaluated context.
1362#define HWY_F16_FROM_F32_DASSERT(condition) \
1363 do { \
1364 } while (0)
1365#endif // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
1366#else
1367// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
1368// available, define HWY_F16_FROM_F32_DASSERT(condition) as
1369// HWY_DASSERT(condition)
1370#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
1371#endif // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
1372 // HWY_COMPILER_MSVC >= 1926)
1373
1375#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
1376 return float16_t(static_cast<float16_t::Native>(f32));
1377#endif
1378#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1379 const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
1380 const uint32_t sign = bits32 >> 31;
1381 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1382 constexpr uint32_t kMantissaMask = 0x7FFFFF;
1383 const uint32_t mantissa32 = bits32 & kMantissaMask;
1384
1385 // Before shifting (truncation), round to nearest even to reduce bias. If
1386 // the lowest remaining mantissa bit is odd, increase the offset. Example
1387 // with the lowest remaining bit (left) and next lower two bits; the
1388 // latter, plus two more, will be truncated.
1389 // 0[00] + 1 = 0[01]
1390 // 0[01] + 1 = 0[10]
1391 // 0[10] + 1 = 0[11] (round down toward even)
1392 // 0[11] + 1 = 1[00] (round up)
1393 // 1[00] + 10 = 1[10]
1394 // 1[01] + 10 = 1[11]
1395 // 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
1396 // 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)
1397 const uint32_t odd_bit = (mantissa32 >> 13) & 1;
1398 const uint32_t rounded = mantissa32 + odd_bit + 0xFFF;
1399 const bool carry = rounded >= (1u << 23);
1400
1401 const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
1402
1403 // Tiny or zero => zero.
1404 if (exp < -24) {
1405 // restore original sign
1406 return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
1407 }
1408
1409 // If biased_exp16 would be >= 31, first check whether the input was NaN so we
1410 // can set the mantissa to nonzero.
1411 const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
1412 const bool overflowed = exp >= 16;
1413 const uint32_t biased_exp16 =
1414 static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
1415 // exp = [-24, -15] => subnormal, shift the mantissa.
1416 const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
1417 HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
1418 const uint32_t shifted_mantissa =
1419 (rounded & kMantissaMask) >> (23 - 10 + sub_exp);
1420 const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
1421 const uint32_t mantissa16 = is_nan ? 0x3FF
1422 : overflowed ? 0u
1423 : (leading + shifted_mantissa);
1424
1425#if HWY_IS_DEBUG_BUILD
1426 if (exp < -14) {
1427 HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
1428 HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
1429 } else if (exp <= 15) {
1430 HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1431 HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
1432 }
1433#endif
1434
1435 HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
1436 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1437 HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
1438 const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
1439 return float16_t::FromBits(narrowed);
1440#endif // !HWY_HAVE_SCALAR_F16_OPERATORS
1441}
1442
1444#if HWY_HAVE_SCALAR_F16_OPERATORS
1445 return float16_t(static_cast<float16_t::Native>(f64));
1446#else
1447 // The mantissa bits of f64 are first rounded using round-to-odd rounding
1448 // to the nearest f64 value that has the lower 29 bits zeroed out to
1449 // ensure that the result is correctly rounded to a F16.
1450
1451 // The F64 round-to-odd operation below will round a normal F64 value
1452 // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.
1453
1454 // It is okay if the magnitude of a denormal F64 value is rounded up in the
1455 // F64 round-to-odd step below as the magnitude of a denormal F64 value is
1456 // much smaller than 2^(-24) (the smallest positive denormal F16 value).
1457
1458 // It is also okay if bit 29 of a NaN F64 value is changed by the F64
1459 // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
1460 // discarded or ignored by the conversion of a F32 NaN value to a F16.
1461
1462 // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
1463 // NaN value as the result of the F64 round-to-odd step will have at least one
1464 // mantissa bit if f64 is a NaN value.
1465
1466 // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
1467 // exact if the magnitude of the rounded F64 value (using round-to-odd
1468 // rounding) is between 2^(-126) (the smallest normal F32 value) and
1469 // HighestValue<float>() (the largest finite F32 value)
1470
1471 // It is okay if the F64 to F32 conversion is inexact for F64 values that have
1472 // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
1473 // value is much smaller than 2^(-24) (the smallest positive denormal F16
1474 // value).
1475
1476 return F16FromF32(
1477 static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
1478 (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
1479 ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
1480 0x0000000020000000ULL)))));
1481#endif
1482}
1483
1484// More convenient to define outside float16_t because these may use
1485// F32FromF16, which is defined after the struct.
1487 float16_t rhs) noexcept {
1488#if HWY_HAVE_SCALAR_F16_OPERATORS
1489 return lhs.native == rhs.native;
1490#else
1491 return F32FromF16(lhs) == F32FromF16(rhs);
1492#endif
1493}
1495 float16_t rhs) noexcept {
1496#if HWY_HAVE_SCALAR_F16_OPERATORS
1497 return lhs.native != rhs.native;
1498#else
1499 return F32FromF16(lhs) != F32FromF16(rhs);
1500#endif
1501}
1502HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
1503#if HWY_HAVE_SCALAR_F16_OPERATORS
1504 return lhs.native < rhs.native;
1505#else
1506 return F32FromF16(lhs) < F32FromF16(rhs);
1507#endif
1508}
1510 float16_t rhs) noexcept {
1511#if HWY_HAVE_SCALAR_F16_OPERATORS
1512 return lhs.native <= rhs.native;
1513#else
1514 return F32FromF16(lhs) <= F32FromF16(rhs);
1515#endif
1516}
1517HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
1518#if HWY_HAVE_SCALAR_F16_OPERATORS
1519 return lhs.native > rhs.native;
1520#else
1521 return F32FromF16(lhs) > F32FromF16(rhs);
1522#endif
1523}
1525 float16_t rhs) noexcept {
1526#if HWY_HAVE_SCALAR_F16_OPERATORS
1527 return lhs.native >= rhs.native;
1528#else
1529 return F32FromF16(lhs) >= F32FromF16(rhs);
1530#endif
1531}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// Three-way comparison for float16_t, returning std::partial_ordering.
// Compares the native members when native operators are available; otherwise
// compares the operands after F32FromF16.
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
    float16_t lhs, float16_t rhs) noexcept {
#if !HWY_HAVE_SCALAR_F16_OPERATORS
  return F32FromF16(lhs) <=> F32FromF16(rhs);
#else
  return lhs.native <=> rhs.native;
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
1542
1543//------------------------------------------------------------------------------
1544// BF16 lane type
1545
1546// Compiler supports ACLE __bf16, not necessarily with operators.
1547
1548// Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
1549// in GCC 13 and earlier that sometimes causes BF16 constant values to be
1550// incorrectly loaded on AArch64, and this GCC bug on AArch64 is
1551// described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
1552
// Only AArch64 qualifies, and only with Clang 17+ or GCC 14+.
#if !HWY_ARCH_ARM_A64
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
#elif HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
#endif
1559
1560// x86 compiler supports __bf16, not necessarily with operators.
// A definition supplied before this point takes precedence.
#if !defined(HWY_SSE2_HAVE_SCALAR_BF16_TYPE)
// Common precondition first: x86 with SSE2. After that, either supported
// compiler enables the type.
#if !(HWY_ARCH_X86 && defined(__SSE2__))
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
#elif HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL
// Clang 17+, excluding clang-cl.
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
#elif HWY_COMPILER_GCC_ACTUAL >= 1300
// GCC 13+.
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
#endif
#endif  // !defined(HWY_SSE2_HAVE_SCALAR_BF16_TYPE)
1570
1571// Compiler supports __bf16, not necessarily with operators.
// Either per-architecture flag is sufficient.
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE
#define HWY_HAVE_SCALAR_BF16_TYPE 1
#elif HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#define HWY_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_HAVE_SCALAR_BF16_TYPE 0
#endif
1577
// A definition supplied before this point takes precedence.
#if !defined(HWY_HAVE_SCALAR_BF16_OPERATORS)
// Recent enough compiler also has operators. aarch64 clang 18 hits internal
// compiler errors on bf16 ToString, hence only enable on GCC (13+) for now.
#if !(HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300))
#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
#else
#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
#endif
#endif  // !defined(HWY_HAVE_SCALAR_BF16_OPERATORS)
1587
// Qualifier applied to the bfloat16_t comparison operators: plain constexpr
// when native operators are available, otherwise whatever
// HWY_BITCASTSCALAR_CONSTEXPR expands to.
#if !HWY_HAVE_SCALAR_BF16_OPERATORS
#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#else
#define HWY_BF16_CONSTEXPR constexpr
#endif  // !HWY_HAVE_SCALAR_BF16_OPERATORS
1593
1594struct alignas(2) bfloat16_t {
1595#if HWY_HAVE_SCALAR_BF16_TYPE
1596 using Native = __bf16;
1597#endif
1598
1599 union {
1600#if HWY_HAVE_SCALAR_BF16_TYPE
1601 // Accessed via NativeLaneType, and used directly if
1602 // HWY_HAVE_SCALAR_BF16_OPERATORS.
1603 Native native;
1604#endif
1605 // Only accessed via NativeLaneType or U16LaneType.
1606 uint16_t bits;
1607 };
1608
1609 // Default init and copying
1610 bfloat16_t() noexcept = default;
1611 constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
1612 constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
1613 bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
1614 bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
1615
1616// Only enable implicit conversions if we have a native type.
1617#if HWY_HAVE_SCALAR_BF16_TYPE
1618 constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
1619 constexpr operator Native() const noexcept { return native; }
1620#endif
1621
1622#if HWY_HAVE_SCALAR_BF16_TYPE
1623 static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
1624 return bfloat16_t(BitCastScalar<Native>(bits));
1625 }
1626#else
1627
1628 private:
1630 constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
1631 : bits(u16_bits) {}
1632
1633 public:
1634 static constexpr bfloat16_t FromBits(uint16_t bits) {
1635 return bfloat16_t(BF16FromU16BitsTag(), bits);
1636 }
1637#endif
1638
1639 // When backed by a native type, ensure the wrapper behaves like the native
1640 // type by forwarding all operators. Unfortunately it seems difficult to reuse
1641 // this code in a base class, so we repeat it in float16_t.
1642#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
1643 template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
1644 !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
1645 IsConvertible<T, Native>()>* = nullptr>
1646 constexpr bfloat16_t(T&& arg) noexcept(
1647 noexcept(static_cast<Native>(DeclVal<T>())))
1648 : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1649
1650 template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
1651 !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
1652 !IsConvertible<T, Native>() &&
1653 IsStaticCastable<T, Native>()>* = nullptr>
1654 explicit constexpr bfloat16_t(T&& arg) noexcept(
1655 noexcept(static_cast<Native>(DeclVal<T>())))
1656 : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1657
1658 HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
1659 native = arg;
1660 return *this;
1661 }
1662
1663 // pre-decrement operator (--x)
1664 HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
1665 native = static_cast<Native>(native - Native{1});
1666 return *this;
1667 }
1668
1669 // post-decrement operator (x--)
1670 HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
1671 bfloat16_t result = *this;
1672 native = static_cast<Native>(native - Native{1});
1673 return result;
1674 }
1675
1676 // pre-increment operator (++x)
1677 HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
1678 native = static_cast<Native>(native + Native{1});
1679 return *this;
1680 }
1681
1682 // post-increment operator (x++)
1683 HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
1684 bfloat16_t result = *this;
1685 native = static_cast<Native>(native + Native{1});
1686 return result;
1687 }
1688
1689 constexpr bfloat16_t operator-() const noexcept {
1690 return bfloat16_t(static_cast<Native>(-native));
1691 }
1692 constexpr bfloat16_t operator+() const noexcept { return *this; }
1693
1694 // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
1695 // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
1696#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func) \
1697 constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept { \
1698 return bfloat16_t(static_cast<Native>(native op rhs.native)); \
1699 } \
1700 template <typename T, HWY_IF_NOT_BF16(T), \
1701 typename UnwrappedT = \
1702 detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
1703 typename RawResultT = \
1704 decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
1705 typename ResultT = \
1706 detail::NativeSpecialFloatToWrapper<RawResultT>, \
1707 HWY_IF_CASTABLE(RawResultT, ResultT)> \
1708 constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
1709 static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
1710 return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
1711 } \
1712 HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func( \
1713 const hwy::bfloat16_t& rhs) noexcept { \
1714 native = static_cast<Native>(native op rhs.native); \
1715 return *this; \
1716 } \
1717 template <typename T, HWY_IF_NOT_BF16(T), \
1718 HWY_IF_OP_CASTABLE(op, const T&, Native), \
1719 HWY_IF_ASSIGNABLE( \
1720 Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
1721 HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept( \
1722 noexcept( \
1723 static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
1724 native = static_cast<Native>(native op rhs); \
1725 return *this; \
1726 }
1727 HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
1728 HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
1729 HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
1730 HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
1731#undef HWY_BFLOAT16_BINARY_OP
1732
1733#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
1734};
1735static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
1736
1737#pragma pack(pop)
1738
#if HWY_HAVE_SCALAR_BF16_TYPE
namespace detail {

// When the second template argument is the native bf16 type, `type` names the
// hwy::bfloat16_t wrapper.
template <class TAny>
struct NativeSpecialFloatToWrapperT<TAny, hwy::bfloat16_t::Native> {
  using type = hwy::bfloat16_t;
};

#if HWY_HAVE_SCALAR_BF16_OPERATORS
// When the second template argument is hwy::bfloat16_t and the third is true,
// `type` names bfloat16_t's native type. Only provided when native operators
// are available.
template <class TAny>
struct SpecialFloatUnwrapArithOpOperandT<TAny, hwy::bfloat16_t, true> {
  using type = hwy::bfloat16_t::Native;
};
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_BF16_TYPE
1756
1757#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1758namespace detail {
1759
1760template <>
1761struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
1762#if HWY_HAVE_SCALAR_BF16_TYPE
1763 static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
1764 const hwy::bfloat16_t& val) {
1765 return val.native;
1766 }
1767#else
1768 static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
1769 const hwy::bfloat16_t& val) {
1770 return val.bits;
1771 }
1772#endif
1773};
1774
1775} // namespace detail
1776#endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1777
1779#if HWY_HAVE_SCALAR_BF16_OPERATORS
1780 return static_cast<float>(bf);
1781#else
1782 return BitCastScalar<float>(static_cast<uint32_t>(
1783 static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
1784#endif
1785}
1786
1787namespace detail {
1788
1789// Returns the increment to add to the bits of a finite F32 value to round a
1790// finite F32 to the nearest BF16 value
1792 const uint32_t f32_bits) {
1793 return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
1794 ? (0x7FFFu + ((f32_bits >> 16) & 1u))
1795 : 0u);
1796}
1797
1798// Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
1799// rounded to the nearest BF16 value
1801 const uint32_t f32_bits) {
1802 // Round f32_bits to the nearest BF16 by first adding
1803 // F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
1804 // f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
1805
1806 // If f32_bits is the bit representation of a NaN F32 value, make sure that
1807 // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
1808 // values and to prevent NaN F32 values from being converted to an infinite
1809 // BF16 value
1810 return static_cast<uint16_t>(
1811 ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16) |
1812 (static_cast<uint32_t>((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) << 6));
1813}
1814
1815} // namespace detail
1816
1818#if HWY_HAVE_SCALAR_BF16_OPERATORS
1819 return static_cast<bfloat16_t>(f);
1820#else
1821 return bfloat16_t::FromBits(
1822 detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
1823#endif
1824}
1825
1827#if HWY_HAVE_SCALAR_BF16_OPERATORS
1828 return static_cast<bfloat16_t>(f64);
1829#else
1830 // The mantissa bits of f64 are first rounded using round-to-odd rounding
1831 // to the nearest f64 value that has the lower 38 bits zeroed out to
1832 // ensure that the result is correctly rounded to a BF16.
1833
1834 // The F64 round-to-odd operation below will round a normal F64 value
1835 // (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
1836
1837 // It is okay if the magnitude of a denormal F64 value is rounded up in the
1838 // F64 round-to-odd step below as the magnitude of a denormal F64 value is
1839 // much smaller than 2^(-133) (the smallest positive denormal BF16 value).
1840
1841 // It is also okay if bit 38 of a NaN F64 value is changed by the F64
1842 // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
1843 // discarded or ignored by the conversion of a F32 NaN value to a BF16.
1844
1845 // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
1846 // NaN value as the result of the F64 round-to-odd step will have at least one
1847 // mantissa bit if f64 is a NaN value.
1848
1849 // The F64 round-to-odd step below will ensure that the F64 to F32 conversion
1850 // is exact if the magnitude of the rounded F64 value (using round-to-odd
1851 // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
1852 // BF16 value) and HighestValue<float>() (the largest finite F32 value).
1853
1854 // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
1855 // F32 conversion is guaranteed to be less than or equal to 2^(-135), which
1856 // ensures that the F32 to BF16 conversion is correctly rounded, even if the
1857 // conversion of a rounded F64 value whose magnitude is less than 2^(-135)
1858 // to a F32 is inexact.
1859
1860 return BF16FromF32(
1861 static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
1862 (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
1863 ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
1864 0x0000004000000000ULL)))));
1865#endif
1866}
1867
1868// More convenient to define outside bfloat16_t because these may use
1869// F32FromBF16, which is defined after the struct.
1870
1872 bfloat16_t rhs) noexcept {
1873#if HWY_HAVE_SCALAR_BF16_OPERATORS
1874 return lhs.native == rhs.native;
1875#else
1876 return F32FromBF16(lhs) == F32FromBF16(rhs);
1877#endif
1878}
1879
1881 bfloat16_t rhs) noexcept {
1882#if HWY_HAVE_SCALAR_BF16_OPERATORS
1883 return lhs.native != rhs.native;
1884#else
1885 return F32FromBF16(lhs) != F32FromBF16(rhs);
1886#endif
1887}
1889 bfloat16_t rhs) noexcept {
1890#if HWY_HAVE_SCALAR_BF16_OPERATORS
1891 return lhs.native < rhs.native;
1892#else
1893 return F32FromBF16(lhs) < F32FromBF16(rhs);
1894#endif
1895}
1897 bfloat16_t rhs) noexcept {
1898#if HWY_HAVE_SCALAR_BF16_OPERATORS
1899 return lhs.native <= rhs.native;
1900#else
1901 return F32FromBF16(lhs) <= F32FromBF16(rhs);
1902#endif
1903}
1905 bfloat16_t rhs) noexcept {
1906#if HWY_HAVE_SCALAR_BF16_OPERATORS
1907 return lhs.native > rhs.native;
1908#else
1909 return F32FromBF16(lhs) > F32FromBF16(rhs);
1910#endif
1911}
1913 bfloat16_t rhs) noexcept {
1914#if HWY_HAVE_SCALAR_BF16_OPERATORS
1915 return lhs.native >= rhs.native;
1916#else
1917 return F32FromBF16(lhs) >= F32FromBF16(rhs);
1918#endif
1919}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// C++20 three-way comparison of two bfloat16_t values. Compares the `native`
// members when scalar BF16 operators are available; otherwise compares the
// operands after converting each with F32FromBF16.
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
    bfloat16_t lhs, bfloat16_t rhs) noexcept {
#if !HWY_HAVE_SCALAR_BF16_OPERATORS
  return F32FromBF16(lhs) <=> F32FromBF16(rhs);
#else
  return lhs.native <=> rhs.native;
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
1930
1931//------------------------------------------------------------------------------
1932// Type relations
1933
1934namespace detail {
1935
1936template <typename T>
1938template <>
1939struct Relations<uint8_t> {
1940 using Unsigned = uint8_t;
1941 using Signed = int8_t;
1942 using Wide = uint16_t;
1943 enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
1944};
1945template <>
1946struct Relations<int8_t> {
1947 using Unsigned = uint8_t;
1948 using Signed = int8_t;
1949 using Wide = int16_t;
1950 enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
1951};
1952template <>
1953struct Relations<uint16_t> {
1954 using Unsigned = uint16_t;
1955 using Signed = int16_t;
1957 using Wide = uint32_t;
1958 using Narrow = uint8_t;
1959 enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
1960};
1961template <>
1962struct Relations<int16_t> {
1963 using Unsigned = uint16_t;
1964 using Signed = int16_t;
1966 using Wide = int32_t;
1967 using Narrow = int8_t;
1968 enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
1969};
1970template <>
1971struct Relations<uint32_t> {
1972 using Unsigned = uint32_t;
1973 using Signed = int32_t;
1974 using Float = float;
1975 using Wide = uint64_t;
1976 using Narrow = uint16_t;
1977 enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
1978};
1979template <>
1980struct Relations<int32_t> {
1981 using Unsigned = uint32_t;
1982 using Signed = int32_t;
1983 using Float = float;
1984 using Wide = int64_t;
1985 using Narrow = int16_t;
1986 enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
1987};
1988template <>
1989struct Relations<uint64_t> {
1990 using Unsigned = uint64_t;
1991 using Signed = int64_t;
1992 using Float = double;
1994 using Narrow = uint32_t;
1995 enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
1996};
1997template <>
1998struct Relations<int64_t> {
1999 using Unsigned = uint64_t;
2000 using Signed = int64_t;
2001 using Float = double;
2002 using Narrow = int32_t;
2003 enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
2004};
2005template <>
2008 using Narrow = uint64_t;
2009 enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
2010};
2011template <>
2013 using Unsigned = uint16_t;
2014 using Signed = int16_t;
2016 using Wide = float;
2017 enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
2018};
2019template <>
2021 using Unsigned = uint16_t;
2022 using Signed = int16_t;
2023 using Wide = float;
2024 enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
2025};
2026template <>
2027struct Relations<float> {
2028 using Unsigned = uint32_t;
2029 using Signed = int32_t;
2030 using Float = float;
2031 using Wide = double;
2033 enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
2034};
2035template <>
2036struct Relations<double> {
2037 using Unsigned = uint64_t;
2038 using Signed = int64_t;
2039 using Float = double;
2040 using Narrow = float;
2041 enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
2042};
2043
2044template <size_t N>
2046template <>
2047struct TypeFromSize<1> {
2048 using Unsigned = uint8_t;
2049 using Signed = int8_t;
2050};
2051template <>
2052struct TypeFromSize<2> {
2053 using Unsigned = uint16_t;
2054 using Signed = int16_t;
2056};
2057template <>
2058struct TypeFromSize<4> {
2059 using Unsigned = uint32_t;
2060 using Signed = int32_t;
2061 using Float = float;
2062};
2063template <>
2064struct TypeFromSize<8> {
2065 using Unsigned = uint64_t;
2066 using Signed = int64_t;
2067 using Float = double;
2068};
2069template <>
2070struct TypeFromSize<16> {
2072};
2073
2074} // namespace detail
2075
2076// Aliases for types of a different category, but the same size.
2077template <typename T>
2079template <typename T>
2081template <typename T>
2083
2084// Aliases for types of the same category, but different size.
2085template <typename T>
2087template <typename T>
2089
2090// Obtain type from its size [bytes].
2091template <size_t N>
2093template <size_t N>
2095template <size_t N>
2097
2098// Avoid confusion with SizeTag where the parameter is a lane size.
2100using SignedTag = SizeTag<0x100>; // integer
2103
2104template <typename T, class R = detail::Relations<T>>
2105constexpr auto TypeTag()
2106 -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
2108}
2109
2110// For when we only want to distinguish FloatTag from everything else.
2112
2113template <typename T, class R = detail::Relations<T>>
2114constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
2116}
2117
2118//------------------------------------------------------------------------------
2119// Type traits
2120
2121template <typename T>
2122HWY_API constexpr bool IsFloat3264() {
2123 return IsSameEither<RemoveCvRef<T>, float, double>();
2124}
2125
2126template <typename T>
2127HWY_API constexpr bool IsFloat() {
2128 // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
2129 // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
2130 return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
2131}
2132
2133template <typename T>
2134HWY_API constexpr bool IsSigned() {
2135 return static_cast<T>(0) > static_cast<T>(-1);
2136}
2137template <>
2138constexpr bool IsSigned<float16_t>() {
2139 return true;
2140}
2141template <>
2142constexpr bool IsSigned<bfloat16_t>() {
2143 return true;
2144}
2145template <>
2146constexpr bool IsSigned<hwy::uint128_t>() {
2147 return false;
2148}
2149template <>
2150constexpr bool IsSigned<hwy::K64V64>() {
2151 return false;
2152}
2153template <>
2154constexpr bool IsSigned<hwy::K32V32>() {
2155 return false;
2156}
2157
2158template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
2160 using type = T;
2161};
2162
2163template <typename T>
2164struct MakeLaneTypeIfIntegerT<T, true> {
2166 UnsignedFromSize<sizeof(T)>>;
2167};
2168
2169template <typename T>
2171
2172// Largest/smallest representable integer values.
2173template <typename T>
2174HWY_API constexpr T LimitsMax() {
2175 static_assert(IsInteger<T>(), "Only for integer types");
2176 using TU = UnsignedFromSize<sizeof(T)>;
2177 return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
2178 : static_cast<TU>(~TU(0)));
2179}
2180template <typename T>
2181HWY_API constexpr T LimitsMin() {
2182 static_assert(IsInteger<T>(), "Only for integer types");
2183 return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
2184 : static_cast<T>(0);
2185}
2186
2187// Largest/smallest representable value (integer or float). This naming avoids
2188// confusion with numeric_limits<float>::min() (the smallest positive value).
2189// Cannot be constexpr because we use CopySameSize for [b]float16_t.
2190template <typename T>
2192 return LimitsMin<T>();
2193}
2194template <>
2196 return bfloat16_t::FromBits(uint16_t{0xFF7Fu}); // -1.1111111 x 2^127
2197}
2198template <>
2200 return float16_t::FromBits(uint16_t{0xFBFFu}); // -1.1111111111 x 2^15
2201}
2202template <>
2203HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
2204 return -3.402823466e+38F;
2205}
2206template <>
2207HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
2208 return -1.7976931348623158e+308;
2209}
2210
2211template <typename T>
2213 return LimitsMax<T>();
2214}
2215template <>
2217 return bfloat16_t::FromBits(uint16_t{0x7F7Fu}); // 1.1111111 x 2^127
2218}
2219template <>
2221 return float16_t::FromBits(uint16_t{0x7BFFu}); // 1.1111111111 x 2^15
2222}
2223template <>
2224HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
2225 return 3.402823466e+38F;
2226}
2227template <>
2228HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
2229 return 1.7976931348623158e+308;
2230}
2231
2232// Difference between 1.0 and the next representable value. Equal to
2233// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
2234template <typename T>
2238template <>
2240 return bfloat16_t::FromBits(uint16_t{0x3C00u}); // 0.0078125
2241}
2242template <>
2244 return float16_t::FromBits(uint16_t{0x1400u}); // 0.0009765625
2245}
2246template <>
2248 return 1.192092896e-7f;
2249}
2250template <>
2251HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
2252 return 2.2204460492503131e-16;
2253}
2254
2255// Returns width in bits of the mantissa field in IEEE binary16/32/64.
2256template <typename T>
2257constexpr int MantissaBits() {
2258 static_assert(sizeof(T) == 0, "Only instantiate the specializations");
2259 return 0;
2260}
2261template <>
2262constexpr int MantissaBits<bfloat16_t>() {
2263 return 7;
2264}
2265template <>
2266constexpr int MantissaBits<float16_t>() {
2267 return 10;
2268}
2269template <>
2270constexpr int MantissaBits<float>() {
2271 return 23;
2272}
2273template <>
2274constexpr int MantissaBits<double>() {
2275 return 52;
2276}
2277
2278// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
2279// the largest possible (biased) exponent field. Used by IsInf.
2280template <typename T>
2282 return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
2283}
2284
2285// Returns bitmask of the sign bit in IEEE binary16/32/64.
2286template <typename T>
2288 return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
2289}
2290
2291// Returns bitmask of the exponent field in IEEE binary16/32/64.
2292template <typename T>
2294 return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
2295 static_cast<MakeUnsigned<T>>(~SignMask<T>());
2296}
2297
2298// Returns bitmask of the mantissa field in IEEE binary16/32/64.
2299template <typename T>
2301 return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
2302}
2303
2304// Returns 1 << mantissa_bits as a floating-point number. All integers whose
2305// absolute value is less than this can be represented exactly.
2306template <typename T>
2308 static_assert(sizeof(T) == 0, "Only instantiate the specializations");
2309 return 0;
2310}
2311template <>
2313 return bfloat16_t::FromBits(uint16_t{0x4300u}); // 1.0 x 2^7
2314}
2315template <>
2317 return float16_t::FromBits(uint16_t{0x6400u}); // 1.0 x 2^10
2318}
2319template <>
2320HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
2321 return 8388608.0f; // 1 << 23
2322}
2323template <>
2324HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
2325 // floating point literal with p52 requires C++17.
2326 return 4503599627370496.0; // 1 << 52
2327}
2328
2329// Returns width in bits of the exponent field in IEEE binary16/32/64.
2330template <typename T>
2331constexpr int ExponentBits() {
2332 // Exponent := remaining bits after deducting sign and mantissa.
2333 return 8 * sizeof(T) - 1 - MantissaBits<T>();
2334}
2335
2336// Returns largest value of the biased exponent field in IEEE binary16/32/64,
2337// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
2338// This is expressed as a signed integer for more efficient comparison.
2339template <typename T>
2341 return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
2342}
2343
2344//------------------------------------------------------------------------------
2345// Additional F16/BF16 operators
2346
2347#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
2348
// Defines `op_func` (e.g. operator+) for a left operand `a` of type T1 and a
// right operand `b` of the special-float type T2. The overload is enabled
// only when T1 (after RemoveCvRef) satisfies hwy::IsInteger or
// hwy::IsFloat3264. It applies `op` to `a` and `b.native` and casts the raw
// result to ResultT, which is derived from the raw result type via
// detail::NativeSpecialFloatToWrapper.
2349#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2) \
2350 template < \
2351 typename T1, \
2352 hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
2353 hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
2354 typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
2355 typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
2356 HWY_IF_CASTABLE(RawResultT, ResultT)> \
2357 static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
2358 return static_cast<ResultT>(a op b.native); \
2359 }
2360
// Defines both operand orders of `op_func` for the special-float type T1
// versus an integer or float/double operand. It first expands
// HWY_RHS_SPECIAL_FLOAT_ARITH_OP (special float on the right), then adds the
// mirrored overload with the special float on the left, which evaluates
// `a.native op b` and casts the raw result to ResultT.
2361#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
2362 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
2363 template < \
2364 typename T2, \
2365 hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() || \
2366 hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr, \
2367 typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
2368 typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
2369 HWY_IF_CASTABLE(RawResultT, ResultT)> \
2370 static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
2371 return static_cast<ResultT>(a.native op b); \
2372 }
2373
2374#if HWY_HAVE_SCALAR_F16_OPERATORS
2375HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
2376HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
2377HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
2378HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
2379HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
2380HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
2381HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
2382HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
2383HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
2384HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
2385#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
2386HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
2387#endif
2388#endif // HWY_HAVE_SCALAR_F16_OPERATORS
2389
2390#if HWY_HAVE_SCALAR_BF16_OPERATORS
2391HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
2392HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
2393HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
2394HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
2395HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
2396HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
2397HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
2398HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
2399HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
2400HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
2401#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
2402HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
2403#endif
2404#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
2405
2406#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
2407#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
2408
2409#endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
2410
2411//------------------------------------------------------------------------------
2412// Type conversions (after IsSpecialFloat)
2413
2414HWY_API float F32FromF16Mem(const void* ptr) {
2415 float16_t f16;
2416 CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
2417 return F32FromF16(f16);
2418}
2419
2420HWY_API float F32FromBF16Mem(const void* ptr) {
2421 bfloat16_t bf;
2422 CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
2423 return F32FromBF16(bf);
2424}
2425
2426#if HWY_HAVE_SCALAR_F16_OPERATORS
2427#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
2428#else
2429#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
2430#endif
2431
2432// For casting from TFrom to TTo
2433template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
2434 HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
2435HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2436 return static_cast<TTo>(in);
2437}
2438template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
2439 HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2440HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2441 return F16FromF32(static_cast<float>(in));
2442}
2443template <typename TTo, HWY_IF_F16(TTo)>
2446 return F16FromF32(F32FromBF16(in));
2447}
2448template <typename TTo, HWY_IF_F16(TTo)>
2450 return F16FromF64(in);
2451}
2452template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
2453 HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2455 return BF16FromF32(static_cast<float>(in));
2456}
2457template <typename TTo, HWY_IF_BF16(TTo)>
2461template <typename TTo, HWY_IF_BF16(TTo)>
2463 return BF16FromF64(in);
2464}
2465template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
2468 return static_cast<TTo>(F32FromF16(in));
2469}
2470template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
2473 return static_cast<TTo>(F32FromBF16(in));
2474}
2475// Same: return unchanged
2476template <typename TTo>
2477HWY_API constexpr TTo ConvertScalarTo(TTo in) {
2478 return in;
2479}
2480
2481//------------------------------------------------------------------------------
2482// Helper functions
2483
// Returns a / b rounded up, computed as (a + b - 1) / b and converted to T1.
// b must be nonzero; the intermediate sum a + b - 1 is not guarded against
// overflow.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return ((a + b) - 1) / b;
}
2488
2489// Works for any `align`; if a power of two, compiler emits ADD+AND.
2490constexpr inline size_t RoundUpTo(size_t what, size_t align) {
2491 return DivCeil(what, align) * align;
2492}
2493
// Returns the largest multiple of `align` that is <= `what`.
// Works for any nonzero `align`; if a power of two, compiler emits AND.
constexpr inline size_t RoundDownTo(size_t what, size_t align) {
  return (what / align) * align;
}
2498
2499namespace detail {
2500
2501// T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
2502// shift
2503template <class T>
2504static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
2505 int shift_amt) {
2506 return static_cast<T>(val >> shift_amt);
2507}
2508
2509// T is signed and (val >> shift_amt) is a non-arithmetic right shift
2510template <class T>
2511static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
2512 int shift_amt) {
2514 return static_cast<T>(
2515 (val < 0) ? static_cast<TU>(
2516 ~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
2517 : static_cast<TU>(static_cast<TU>(val) >> shift_amt));
2518}
2519
2520} // namespace detail
2521
2522// If T is a signed integer type, ScalarShr is guaranteed to perform an
2523// arithmetic right shift
2524
2525// Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
2526// perform a logical right shift
2527template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
2528HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
2529 using NonCvRefT = RemoveCvRef<T>;
2530 return detail::ScalarShr(
2531 hwy::SizeTag<((IsSigned<NonCvRefT>() &&
2532 (LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
2533 static_cast<NonCvRefT>(-1))
2534 ? 0x100
2535 : 0)>(),
2536 static_cast<NonCvRefT>(val), shift_amt);
2537}
2538
2539// Undefined results for x == 0.
2540HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
2541 HWY_DASSERT(x != 0);
2542#if HWY_COMPILER_MSVC
2543 unsigned long index; // NOLINT
2544 _BitScanForward(&index, x);
2545 return index;
2546#else // HWY_COMPILER_MSVC
2547 return static_cast<size_t>(__builtin_ctz(x));
2548#endif // HWY_COMPILER_MSVC
2549}
2550
2551HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
2552 HWY_DASSERT(x != 0);
2553#if HWY_COMPILER_MSVC
2554#if HWY_ARCH_X86_64
2555 unsigned long index; // NOLINT
2556 _BitScanForward64(&index, x);
2557 return index;
2558#else // HWY_ARCH_X86_64
2559 // _BitScanForward64 not available
2560 uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
2561 unsigned long index; // NOLINT
2562 if (lsb == 0) {
2563 uint32_t msb = static_cast<uint32_t>(x >> 32u);
2564 _BitScanForward(&index, msb);
2565 return 32 + index;
2566 } else {
2567 _BitScanForward(&index, lsb);
2568 return index;
2569 }
2570#endif // HWY_ARCH_X86_64
2571#else // HWY_COMPILER_MSVC
2572 return static_cast<size_t>(__builtin_ctzll(x));
2573#endif // HWY_COMPILER_MSVC
2574}
2575
2576// Undefined results for x == 0.
2577HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
2578 HWY_DASSERT(x != 0);
2579#if HWY_COMPILER_MSVC
2580 unsigned long index; // NOLINT
2581 _BitScanReverse(&index, x);
2582 return 31 - index;
2583#else // HWY_COMPILER_MSVC
2584 return static_cast<size_t>(__builtin_clz(x));
2585#endif // HWY_COMPILER_MSVC
2586}
2587
2588HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
2589 HWY_DASSERT(x != 0);
2590#if HWY_COMPILER_MSVC
2591#if HWY_ARCH_X86_64
2592 unsigned long index; // NOLINT
2593 _BitScanReverse64(&index, x);
2594 return 63 - index;
2595#else // HWY_ARCH_X86_64
2596 // _BitScanReverse64 not available
2597 const uint32_t msb = static_cast<uint32_t>(x >> 32u);
2598 unsigned long index; // NOLINT
2599 if (msb == 0) {
2600 const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
2601 _BitScanReverse(&index, lsb);
2602 return 63 - index;
2603 } else {
2604 _BitScanReverse(&index, msb);
2605 return 31 - index;
2606 }
2607#endif // HWY_ARCH_X86_64
2608#else // HWY_COMPILER_MSVC
2609 return static_cast<size_t>(__builtin_clzll(x));
2610#endif // HWY_COMPILER_MSVC
2611}
2612
2613template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
2614 HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
2615HWY_API size_t PopCount(T x) {
2616 uint32_t u32_x = static_cast<uint32_t>(
2617 static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
2618
2619#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
2620 return static_cast<size_t>(__builtin_popcountl(u32_x));
2621#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
2622 return static_cast<size_t>(_mm_popcnt_u32(u32_x));
2623#else
2624 u32_x -= ((u32_x >> 1) & 0x55555555u);
2625 u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
2626 u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
2627 u32_x += (u32_x >> 8);
2628 u32_x += (u32_x >> 16);
2629 return static_cast<size_t>(u32_x & 0x3Fu);
2630#endif
2631}
2632
2633template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
2634 HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
2635HWY_API size_t PopCount(T x) {
2636 uint64_t u64_x = static_cast<uint64_t>(
2637 static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
2638
2639#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
2640 return static_cast<size_t>(__builtin_popcountll(u64_x));
2641#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
2642 return _mm_popcnt_u64(u64_x);
2643#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
2644 return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
2645 _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
2646#else
2647 u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
2648 u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
2649 (u64_x & 0x3333333333333333ULL));
2650 u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
2651 u64_x += (u64_x >> 8);
2652 u64_x += (u64_x >> 16);
2653 u64_x += (u64_x >> 32);
2654 return static_cast<size_t>(u64_x & 0x7Fu);
2655#endif
2656}
2657
// Returns floor(log2(x)) by recursively halving x until it equals 1. x must be
// positive: zero never reaches 1, so the recursion would not terminate.
//
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return (x != TI{1})
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x >> 1)))
             : 0;
}
2667
// Returns ceil(log2(x)): 0 for x == 1, otherwise FloorLog2(x - 1) + 1.
// Intended for positive x (FloorLog2 requires a positive argument).
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return (x != TI{1})
             ? static_cast<size_t>(1 + FloorLog2(static_cast<TI>(x - 1)))
             : 0;
}
2674
2675template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
2676HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
2677 return t + static_cast<T>(increment);
2678}
2679
2680template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
2681HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
2682 return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
2683 ConvertScalarTo<float>(increment));
2684}
2685
2686template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
2687HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
2688 using TU = MakeUnsigned<T>;
2689 // Sub-int types would promote to int, not unsigned, which would trigger
2690 // warnings, so first promote to the largest unsigned type. Due to
2691 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
2692 // until fixed in 9.3, we use built-in types rather than uint64_t.
2693 return static_cast<T>(static_cast<TU>(
2694 static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
2695 static_cast<unsigned long long>(n)) &
2696 uint64_t{hwy::LimitsMax<TU>()}));
2697}
2698
2699#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2700#pragma intrinsic(_mul128)
2701#pragma intrinsic(_umul128)
2702#endif
2703
2704// 64 x 64 = 128 bit multiplication
2705HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
2706#if defined(__SIZEOF_INT128__)
2707 __uint128_t product = (__uint128_t)a * (__uint128_t)b;
2708 *upper = (uint64_t)(product >> 64);
2709 return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
2710#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2711 return _umul128(a, b, upper);
2712#else
2713 constexpr uint64_t kLo32 = 0xFFFFFFFFU;
2714 const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
2715 const uint64_t hi_lo = (a >> 32) * (b & kLo32);
2716 const uint64_t lo_hi = (a & kLo32) * (b >> 32);
2717 const uint64_t hi_hi = (a >> 32) * (b >> 32);
2718 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
2719 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
2720 return (t << 32) | (lo_lo & kLo32);
2721#endif
2722}
2723
2724HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
2725#if defined(__SIZEOF_INT128__)
2726 __int128_t product = (__int128_t)a * (__int128_t)b;
2727 *upper = (int64_t)(product >> 64);
2728 return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
2729#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2730 return _mul128(a, b, upper);
2731#else
2732 uint64_t unsigned_upper;
2733 const int64_t lower = static_cast<int64_t>(Mul128(
2734 static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
2735 *upper = static_cast<int64_t>(
2736 unsigned_upper -
2737 (static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
2738 (static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
2739 return lower;
2740#endif
2741}
2742
2743// Precomputation for fast n / divisor and n % divisor, where n is a variable
2744// and divisor is unchanging but unknown at compile-time.
2745class Divisor {
2746 public:
2747 explicit Divisor(uint32_t divisor) : divisor_(divisor) {
2748 if (divisor <= 1) return;
2749
2750 const uint32_t len =
2751 static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
2752 const uint64_t u_hi = (2ULL << len) - divisor;
2753 const uint32_t q = Truncate((u_hi << 32) / divisor);
2754
2755 mul_ = q + 1;
2756 shift1_ = 1;
2757 shift2_ = len;
2758 }
2759
2760 uint32_t GetDivisor() const { return divisor_; }
2761
2762 // Returns n / divisor_.
2763 uint32_t Divide(uint32_t n) const {
2764 const uint64_t mul = mul_;
2765 const uint32_t t = Truncate((mul * n) >> 32);
2766 return (t + ((n - t) >> shift1_)) >> shift2_;
2767 }
2768
2769 // Returns n % divisor_.
2770 uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
2771
2772 private:
2773 static uint32_t Truncate(uint64_t x) {
2774 return static_cast<uint32_t>(x & 0xFFFFFFFFu);
2775 }
2776
2777 uint32_t divisor_;
2778 uint32_t mul_ = 1;
2779 uint32_t shift1_ = 0;
2780 uint32_t shift2_ = 0;
2781};
2782
2783namespace detail {
2784
2785template <typename T>
2787 T val) {
2788 using TU = MakeUnsigned<T>;
2789 return BitCastScalar<T>(
2790 static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
2791}
2792
2793template <typename T>
2796 return ScalarAbs(hwy::FloatTag(), val);
2797}
2798
2799template <typename T>
2801ScalarAbs(hwy::SignedTag /*tag*/, T val) {
2802 using TU = MakeUnsigned<T>;
2803 return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
2804}
2805
2806template <typename T>
2809 return val;
2810}
2811
2812} // namespace detail
2813
2814template <typename T>
2816 using TVal = MakeLaneTypeIfInteger<
2818 return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
2819}
2820
2821template <typename T>
2824 using TU = MakeUnsigned<TF>;
2825 return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
2826}
2827
2828template <typename T>
2831 using TU = MakeUnsigned<TF>;
2832 return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
2833 static_cast<TU>(MaxExponentTimes2<TF>());
2834}
2835
2836namespace detail {
2837
2838template <typename T>
2840 hwy::FloatTag /*tag*/, T val) {
2841 using TU = MakeUnsigned<T>;
2842 return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
2843}
2844
2845template <typename T>
2847 hwy::NonFloatTag /*tag*/, T /*val*/) {
2848 // Integer values are always finite
2849 return true;
2850}
2851
2852} // namespace detail
2853
2854template <typename T>
2855HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
2856 using TVal = MakeLaneTypeIfInteger<
2858 return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
2859 static_cast<TVal>(val));
2860}
2861
2862template <typename T>
2864 T sign) {
2866 using TU = MakeUnsigned<TF>;
2867 return BitCastScalar<TF>(static_cast<TU>(
2868 (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
2869 (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
2870}
2871
2872template <typename T>
2874 using TVal = MakeLaneTypeIfInteger<
2876 using TU = MakeUnsigned<TVal>;
2877 return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
2878}
2879
2880// Prevents the compiler from eliding the computations that led to "output".
2881#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
2882 !defined(_SOFT_FLOAT)
2883// Workaround to avoid test failures on PPC if compiled with Clang
2884template <class T, HWY_IF_F32(T)>
2885HWY_API void PreventElision(T&& output) {
2886 asm volatile("" : "+f"(output)::"memory");
2887}
2888template <class T, HWY_IF_F64(T)>
2889HWY_API void PreventElision(T&& output) {
2890 asm volatile("" : "+d"(output)::"memory");
2891}
2892template <class T, HWY_IF_NOT_FLOAT3264(T)>
2893HWY_API void PreventElision(T&& output) {
2894 asm volatile("" : "+r"(output)::"memory");
2895}
2896#else
2897template <class T>
2898HWY_API void PreventElision(T&& output) {
2899#if HWY_COMPILER_MSVC
2900 // MSVC does not support inline assembly anymore (and never supported GCC's
2901 // RTL constraints). Self-assignment with #pragma optimize("off") might be
2902 // expected to prevent elision, but it does not with MSVC 2015. Type-punning
2903 // with volatile pointers generates inefficient code on MSVC 2017.
2904 static std::atomic<RemoveCvRef<T>> sink;
2905 sink.store(output, std::memory_order_relaxed);
2906#else
2907 // Works by indicating to the compiler that "output" is being read and
2908 // modified. The +r constraint avoids unnecessary writes to memory, but only
2909 // works for built-in types (typically FuncOutput).
2910 asm volatile("" : "+r"(output) : : "memory");
2911#endif
2912}
2913#endif
2914
2915} // namespace hwy
2916
2917#endif // HIGHWAY_HWY_BASE_H_
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:109
#define HWY_BITCASTSCALAR_CONSTEXPR
Definition base.h:954
#define HWY_NORETURN
Definition base.h:105
#define HWY_FORMAT(idx_fmt, idx_arg)
Definition base.h:128
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_BF16_CONSTEXPR
Definition base.h:1591
#define HWY_CXX14_CONSTEXPR
Definition base.h:304
#define HWY_INLINE
Definition base.h:101
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:110
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_MAYBE_UNUSED
Definition base.h:113
#define HWY_IF_BF16(T)
Definition base.h:673
#define HWY_F16_CONSTEXPR
Definition base.h:1301
#define HWY_ASSUME_ALIGNED(ptr, align)
Definition base.h:139
#define HWY_BF16_TO_F16_CONSTEXPR
Definition base.h:2429
#define HWY_IF_NOT_SPECIAL_FLOAT(T)
Definition base.h:631
#define HWY_IF_F16(T)
Definition base.h:676
#define HWY_IF_NOT_SAME(T, expected)
Definition base.h:654
#define HWY_F16_FROM_F32_DASSERT(condition)
Definition base.h:1370
Definition base.h:697
@ kDisableDeclValEvaluation
Definition base.h:706
decltype(TryAddRValRef< T >(0)) type
Definition base.h:705
static U TryAddRValRef(Arg)
static URef TryAddRValRef(int)
Definition base.h:2745
uint32_t divisor_
Definition base.h:2777
Divisor(uint32_t divisor)
Definition base.h:2747
uint32_t Divide(uint32_t n) const
Definition base.h:2763
static uint32_t Truncate(uint64_t x)
Definition base.h:2773
uint32_t GetDivisor() const
Definition base.h:2760
uint32_t Remainder(uint32_t n) const
Definition base.h:2770
Definition base.h:805
static hwy::SizeTag< 1 > TryAssignTest(int)
static hwy::SizeTag< 0 > TryAssignTest(Arg)
Definition base.h:745
static decltype(IsConvertibleT< T, U >::template TestFuncWithToArg< U >(DeclVal< T >())) TryConvTest(int)
static hwy::SizeTag< 0 > TryConvTest(Arg)
Definition base.h:779
static hwy::SizeTag< 1 > TryStaticCastTest(int)
static hwy::SizeTag< 0 > TryStaticCastTest(Arg)
#define HWY_DLLEXPORT
Definition highway_export.h:13
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:605
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:601
typename SpecialFloatUnwrapArithOpOperandT< T >::type SpecialFloatUnwrapArithOpOperand
Definition base.h:1099
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag, T val)
Definition base.h:2786
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(const uint32_t f32_bits)
Definition base.h:1800
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag, T val, int shift_amt)
Definition base.h:2504
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(hwy::FloatTag, T val)
Definition base.h:2839
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(const uint32_t f32_bits)
Definition base.h:1791
typename NativeSpecialFloatToWrapperT< T >::type NativeSpecialFloatToWrapper
Definition base.h:1108
Definition abort.h:8
double float64_t
Definition base.h:406
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16)
Definition base.h:1304
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
HWY_API float F32FromF16Mem(const void *ptr)
Definition base.h:2414
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2551
typename MakeLaneTypeIfIntegerT< T >::type MakeLaneTypeIfInteger
Definition base.h:2170
constexpr MakeSigned< T > MaxExponentTimes2()
Definition base.h:2281
constexpr MakeUnsigned< T > MantissaMask()
Definition base.h:2300
typename RemoveConstT< T >::type RemoveConst
Definition base.h:547
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue()
Definition base.h:2191
HWY_API void ZeroBytes(To *to)
Definition base.h:352
HWY_API void PreventElision(T &&output)
Definition base.h:2898
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef< T > ScalarCopySign(T magn, T sign)
Definition base.h:2863
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:2705
HWY_API constexpr T LimitsMin()
Definition base.h:2181
RemoveConst< RemoveVolatile< RemoveRef< T > > > RemoveCvRef
Definition base.h:578
constexpr bool operator!=(const AlignedAllocator< T > &, const AlignedAllocator< V > &) noexcept
Definition aligned_allocator.h:166
HWY_API constexpr bool IsFloat3264()
Definition base.h:2122
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From &val)
Definition base.h:1024
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment)
Definition base.h:2676
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf)
Definition base.h:1778
constexpr T1 DivCeil(T1 a, T2 b)
Definition base.h:2485
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
constexpr bool operator==(const AlignedAllocator< T > &, const AlignedAllocator< V > &) noexcept
Definition aligned_allocator.h:160
HWY_API constexpr bool IsSame()
Definition base.h:499
typename RemoveVolatileT< T >::type RemoveVolatile
Definition base.h:559
HWY_API constexpr bool IsConst()
Definition base.h:533
HWY_API constexpr bool IsSigned()
Definition base.h:2134
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:346
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
typename RemovePtrT< T >::type RemovePtr
Definition base.h:602
constexpr MakeUnsigned< T > ExponentMask()
Definition base.h:2293
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val)
Definition base.h:2873
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val)
Definition base.h:2822
typename detail::Relations< T >::Float MakeFloat
Definition base.h:2082
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val)
Definition base.h:2829
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2577
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:2092
HWY_API constexpr bool IsIntegerLaneType()
Definition base.h:840
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:2114
HWY_API constexpr bool IsConvertible()
Definition base.h:774
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon()
Definition base.h:2235
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64)
Definition base.h:1443
HWY_API DeclValT< T >::type DeclVal() noexcept
Definition base.h:714
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition base.h:2094
static HWY_MAYBE_UNUSED bool operator>(const uint128_t &a, const uint128_t &b)
Definition base.h:438
float float32_t
Definition base.h:405
constexpr int MantissaBits()
Definition base.h:2257
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:2540
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64)
Definition base.h:1826
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f)
Definition base.h:1817
constexpr MakeSigned< T > MaxExponentField()
Definition base.h:2340
HWY_API constexpr bool IsSameEither()
Definition base.h:505
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd()
Definition base.h:2307
HWY_F16_CONSTEXPR bool operator>=(float16_t lhs, float16_t rhs) noexcept
Definition base.h:1524
static constexpr bool IsArray()
Definition base.h:735
HWY_API constexpr bool IsInteger()
Definition base.h:877
static constexpr bool IsStaticCastable()
Definition base.h:794
HWY_API constexpr bool IsSpecialFloat()
Definition base.h:832
constexpr size_t CeilLog2(TI x)
Definition base.h:2669
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2588
HWY_API float F32FromBF16Mem(const void *ptr)
Definition base.h:2420
typename detail::TypeFromSize< N >::Float FloatFromSize
Definition base.h:2096
constexpr MakeUnsigned< T > SignMask()
Definition base.h:2287
static HWY_MAYBE_UNUSED bool operator<(const uint128_t &a, const uint128_t &b)
Definition base.h:433
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32)
Definition base.h:1374
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed+R::is_float+R::is_bf16)<< 8)>
Definition base.h:2105
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize
Definition base.h:378
typename detail::Relations< T >::Narrow MakeNarrow
Definition base.h:2088
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef< T > ScalarAbs(T val)
Definition base.h:2815
typename RemoveRefT< T >::type RemoveRef
Definition base.h:575
HWY_API constexpr bool IsFloat()
Definition base.h:2127
static constexpr bool IsAssignable()
Definition base.h:820
constexpr size_t RoundDownTo(size_t what, size_t align)
Definition base.h:2495
HWY_DLLEXPORT HWY_NORETURN void int const char * format
Definition base.h:231
HWY_DLLEXPORT HWY_NORETURN void int line
Definition base.h:231
HWY_API constexpr T LimitsMax()
Definition base.h:2174
constexpr size_t RoundUpTo(size_t what, size_t align)
Definition base.h:2490
HWY_F16_CONSTEXPR bool operator<=(float16_t lhs, float16_t rhs) noexcept
Definition base.h:1509
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
constexpr int ExponentBits()
Definition base.h:2331
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue()
Definition base.h:2212
void type
Definition base.h:482
Definition base.h:479
Else type
Definition base.h:516
Definition base.h:510
Then type
Definition base.h:511
Definition base.h:720
@ value
Definition base.h:721
Definition base.h:523
@ value
Definition base.h:524
Definition base.h:489
@ value
Definition base.h:490
Definition base.h:426
uint32_t value
Definition base.h:427
uint32_t key
Definition base.h:428
Definition base.h:419
uint64_t value
Definition base.h:420
uint64_t key
Definition base.h:421
hwy::If< IsSigned< T >(), SignedFromSize< sizeof(T)>, UnsignedFromSize< sizeof(T)> > type
Definition base.h:2165
Definition base.h:2159
T type
Definition base.h:2160
T type
Definition base.h:543
Definition base.h:538
T type
Definition base.h:539
T type
Definition base.h:586
T type
Definition base.h:590
T type
Definition base.h:594
Definition base.h:581
T type
Definition base.h:582
T type
Definition base.h:567
T type
Definition base.h:571
Definition base.h:562
T type
Definition base.h:563
T type
Definition base.h:555
Definition base.h:550
T type
Definition base.h:551
Definition base.h:694
Definition base.h:1594
constexpr bfloat16_t(BF16FromU16BitsTag, uint16_t u16_bits)
Definition base.h:1630
bfloat16_t() noexcept=default
uint16_t bits
Definition base.h:1606
static constexpr bfloat16_t FromBits(uint16_t bits)
Definition base.h:1634
int16_t Signed
Definition base.h:2022
float Wide
Definition base.h:2023
uint16_t Unsigned
Definition base.h:2021
double Float
Definition base.h:2039
uint64_t Unsigned
Definition base.h:2037
int64_t Signed
Definition base.h:2038
float Narrow
Definition base.h:2040
int16_t Signed
Definition base.h:2014
float Wide
Definition base.h:2016
uint16_t Unsigned
Definition base.h:2013
uint32_t Unsigned
Definition base.h:2028
double Wide
Definition base.h:2031
float Float
Definition base.h:2030
int32_t Signed
Definition base.h:2029
uint16_t Unsigned
Definition base.h:1963
int16_t Signed
Definition base.h:1964
int32_t Wide
Definition base.h:1966
int8_t Narrow
Definition base.h:1967
uint32_t Unsigned
Definition base.h:1981
int64_t Wide
Definition base.h:1984
float Float
Definition base.h:1983
int16_t Narrow
Definition base.h:1985
int32_t Signed
Definition base.h:1982
int32_t Narrow
Definition base.h:2002
double Float
Definition base.h:2001
uint64_t Unsigned
Definition base.h:1999
int64_t Signed
Definition base.h:2000
int16_t Wide
Definition base.h:1949
int8_t Signed
Definition base.h:1948
uint8_t Unsigned
Definition base.h:1947
uint64_t Narrow
Definition base.h:2008
uint8_t Narrow
Definition base.h:1958
int16_t Signed
Definition base.h:1955
uint32_t Wide
Definition base.h:1957
uint16_t Unsigned
Definition base.h:1954
uint32_t Unsigned
Definition base.h:1972
uint64_t Wide
Definition base.h:1975
uint16_t Narrow
Definition base.h:1976
float Float
Definition base.h:1974
int32_t Signed
Definition base.h:1973
uint32_t Narrow
Definition base.h:1994
int64_t Signed
Definition base.h:1991
uint64_t Unsigned
Definition base.h:1990
double Float
Definition base.h:1992
int8_t Signed
Definition base.h:1941
uint8_t Unsigned
Definition base.h:1940
uint16_t Wide
Definition base.h:1942
Definition base.h:1937
int8_t Signed
Definition base.h:2049
uint8_t Unsigned
Definition base.h:2048
int16_t Signed
Definition base.h:2054
uint16_t Unsigned
Definition base.h:2053
int32_t Signed
Definition base.h:2060
uint32_t Unsigned
Definition base.h:2059
float Float
Definition base.h:2061
double Float
Definition base.h:2067
int64_t Signed
Definition base.h:2066
uint64_t Unsigned
Definition base.h:2065
Definition base.h:2045
Definition base.h:1159
Definition base.h:1117
static constexpr float16_t FromBits(uint16_t bits)
Definition base.h:1164
float16_t() noexcept=default
constexpr float16_t(F16FromU16BitsTag, uint16_t u16_bits)
Definition base.h:1160
uint16_t bits
Definition base.h:1135
Definition base.h:412
uint64_t lo
Definition base.h:413
uint64_t hi
Definition base.h:414