// Source listing of detect_targets.h, as rendered in the Grok 12.0.1
// documentation.
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

16#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
17#define HIGHWAY_HWY_DETECT_TARGETS_H_
18
19// Defines targets and chooses which to enable.
20
22
//------------------------------------------------------------------------------
// Optional configuration

// See g3doc/quick_reference.md for documentation of these macros.

// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)

// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3

// Uncomment to definitely avoid generating the listed target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4

// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating the
// AVX2 target for VMs which support AVX2 but not the other instruction sets):
// #define HWY_DISABLE_BMI2_FMA

// Uncomment to enable these on MSVC even if the predefined macros are not set:
// #define HWY_WANT_SSE2 1
// #define HWY_WANT_SSSE3 1
// #define HWY_WANT_SSE4 1

//------------------------------------------------------------------------------
// Targets

// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.

// --------------------------- x86: 15 targets (+ one fallback)
// Bits 0..3 reserved (4 targets)
// The baseline check for this target requires HWY_AVX3_DL plus AVX512BF16 and
// AVX512FP16 - see HWY_BASELINE_AVX3_SPR.
#define HWY_AVX3_SPR (1LL << 4)
// Bit 5 reserved (likely AVX10.2 with 256-bit vectors)
// Currently HWY_AVX3_DL plus AVX512BF16 and a special case for CompressStore
// (10x as fast).
// We may later also use VPCONFLICT.
#define HWY_AVX3_ZEN4 (1LL << 6)  // see HWY_WANT_AVX3_ZEN4 below

// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is
// only in Tiger Lake?
#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 (1LL << 8)     // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
#define HWY_AVX2 (1LL << 9)     // HWY_SSE4 plus BMI2 + F16C + FMA
// Bit 10: reserved
#define HWY_SSE4 (1LL << 11)   // SSE4.2 plus AES + CLMUL
#define HWY_SSSE3 (1LL << 12)  // S-SSE3
// Bit 13: reserved for SSE3
#define HWY_SSE2 (1LL << 14)
// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be less than or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 14

// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..23 reserved (9 targets)
#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
#define HWY_SVE_256 (1LL << 25)   // specialized target (e.g. Arm V1)
#define HWY_SVE2 (1LL << 26)
#define HWY_SVE (1LL << 27)
#define HWY_NEON (1LL << 28)  // Implies support for AES
#define HWY_NEON_WITHOUT_AES (1LL << 29)
// Highest bit in the HWY_TARGETS mask that an Arm target can have.
#define HWY_HIGHEST_TARGET_BIT_ARM 29

// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
// Highest bit in the HWY_TARGETS mask that a RISC-V target can have.
#define HWY_HIGHEST_TARGET_BIT_RVV 38

// --------------------------- Future expansion: 4 targets
// Bits 39..42 reserved

// --------------------------- IBM Power/ZSeries: 9 targets (+ one fallback)
// Bits 43..46 reserved (4 targets)
#define HWY_PPC10 (1LL << 47)  // v3.1
#define HWY_PPC9 (1LL << 48)   // v3.0
#define HWY_PPC8 (1LL << 49)   // v2.07
#define HWY_Z15 (1LL << 50)    // Z15
#define HWY_Z14 (1LL << 51)    // Z14
// Highest bit in the HWY_TARGETS mask that a Power/Z target can have.
#define HWY_HIGHEST_TARGET_BIT_PPC 51

// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
#define HWY_WASM (1LL << 59)
// Bit 60 reserved
// Highest bit in the HWY_TARGETS mask that a WebAssembly target can have.
#define HWY_HIGHEST_TARGET_BIT_WASM 60

// --------------------------- Emulation: 2 targets

#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62

// Do not use bit 63 - would be confusing to have negative numbers.

//------------------------------------------------------------------------------
// Set default blocklists

// Disabled means excluded from enabled at user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
// Defaults to the empty set unless the user already defined it.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif

// Broken means excluded from enabled due to known compiler issues. We define
// separate HWY_BROKEN_* and then OR them together (more than one might apply).

// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)

// HWY_SSE4 plus every lower (i.e. better) target bit.
#define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif

#else
#define HWY_BROKEN_CLANG6 0
#endif

// 32-bit may fail to compile AVX2/3, so block HWY_AVX2 and every lower
// (i.e. better) target bit.
#if HWY_ARCH_X86_32
#define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
#else
#define HWY_BROKEN_32BIT 0
#endif

// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
// Block HWY_AVX3 and every lower (i.e. better) target bit.
#if HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))
#else
#define HWY_BROKEN_MSVC 0
#endif

// AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
// 2021.
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
    (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
#define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4)
#else
#define HWY_BROKEN_AVX3_DL_ZEN4 0
#endif

// AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) ||         \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) ||    \
    (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
#define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR)
#else
#define HWY_BROKEN_AVX3_SPR 0
#endif

// armv7be has not been tested and is not yet supported, so block both NEON
// targets there.
#if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
#define HWY_BROKEN_ARM7_BIG_ENDIAN (HWY_NEON | HWY_NEON_WITHOUT_AES)
#else
#define HWY_BROKEN_ARM7_BIG_ENDIAN 0
#endif

// armv7-a without a detected vfpv4 is not supported
// (for example Cortex-A8, Cortex-A9).
// vfpv4 always has NEON half-float _and_ FMA, so seeing both of those feature
// macros is accepted in place of __ARM_VFPV4__.
#if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
    !defined(__ARM_VFPV4__) &&                         \
    !((__ARM_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 (HWY_NEON | HWY_NEON_WITHOUT_AES)
#else
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
#endif

// SVE[2] require recent clang or gcc versions: block all four SVE targets for
// clang < 11 and gcc < 10.
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
#define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
#else
#define HWY_BROKEN_SVE 0
#endif

// HWY_BROKEN_PPC10: blocks HWY_PPC10 for compilers that cannot build it or
// are known to miscompile it.
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
// GCC 10 supports the -mcpu=power10 option but does not support the PPC10
// vector intrinsics
#define HWY_BROKEN_PPC10 (HWY_PPC10)
#elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN &&                                    \
    ((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) ||                 \
     (HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) ||  \
     (HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301))
// GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the
// vsldoi instruction is sometimes incorrectly optimized out (and this causes
// some of the Highway unit tests to fail on big-endian PPC10). Details about
// this compiler bug can be found at
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be
// fixed in the upcoming GCC 12.4 and 13.2 releases.

// Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler
// bug in the LLVM DAGCombiner that causes a zero-extend followed by an
// element insert into a vector, followed by a vector shuffle to be incorrectly
// optimized on big-endian PPC (and which caused some of the Highway unit tests
// to fail on big-endian PPC10).

// Details about this bug, which has already been fixed in Clang 16.0.1 and
// later, can be found at https://github.com/llvm/llvm-project/issues/61315.
#define HWY_BROKEN_PPC10 (HWY_PPC10)
#else
#define HWY_BROKEN_PPC10 0
#endif

// Union of all HWY_BROKEN_* blocklists defined above.
// Allow the user to override this without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS

#define HWY_BROKEN_TARGETS                                     \
  (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC |    \
   HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR |             \
   HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \
   HWY_BROKEN_SVE | HWY_BROKEN_PPC10)

#endif  // HWY_BROKEN_TARGETS

// Enabled means not disabled nor blocklisted: clears from `targets` every bit
// that is set in HWY_DISABLED_TARGETS or HWY_BROKEN_TARGETS.
#define HWY_ENABLED(targets) \
  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))

// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). An issue still
// remains with 13.2, see #1683. This is separate from HWY_BROKEN_TARGETS
// because it affects the fallback target, which must always be enabled. If 1,
// we instead choose HWY_SCALAR even without HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128)  // allow overriding
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
    defined(HWY_NO_LIBCXX)
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
#endif
#endif  // HWY_BROKEN_EMU128

//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros

// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.

// Fallback target: HWY_EMU128 unless the user requested scalar-only or EMU128
// is known to be broken for this compiler.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
#endif

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_BASELINE_SCALAR.

#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif  // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif

// GCC or Clang. PPC8 requires AltiVec, VSX and POWER8 vector support, plus
// crypto unless the user defined HWY_DISABLE_PPC8_CRYPTO.
#if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \
    defined(__VSX__) && defined(__POWER8_VECTOR__) &&           \
    (defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO))
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif

// Each newer Power baseline requires the previous one.
#if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__)
#define HWY_BASELINE_PPC9 HWY_PPC9
#else
#define HWY_BASELINE_PPC9 0
#endif

#if HWY_BASELINE_PPC9 != 0 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_BASELINE_PPC10 HWY_PPC10
#else
#define HWY_BASELINE_PPC10 0
#endif

// S390X: Z14 requires the vector facility macro and __ARCH__ >= 12; Z15
// additionally requires __ARCH__ >= 13.
#if HWY_ARCH_S390X && defined(__VEC__) && defined(__ARCH__) && __ARCH__ >= 12
#define HWY_BASELINE_Z14 HWY_Z14
#else
#define HWY_BASELINE_Z14 0
#endif

#if HWY_BASELINE_Z14 && __ARCH__ >= 13
#define HWY_BASELINE_Z15 HWY_Z15
#else
#define HWY_BASELINE_Z15 0
#endif

// Arm baselines default to 0 and are re-defined below when the corresponding
// feature macros are present.
#define HWY_BASELINE_SVE2 0
#define HWY_BASELINE_SVE 0
#define HWY_BASELINE_NEON 0

#if HWY_ARCH_ARM

#if defined(__ARM_FEATURE_SVE2)
#undef HWY_BASELINE_SVE2  // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif  // __ARM_FEATURE_SVE_BITS
#endif  // __ARM_FEATURE_SVE2

#if defined(__ARM_FEATURE_SVE)
#undef HWY_BASELINE_SVE  // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif  // __ARM_FEATURE_SVE_BITS
#endif  // __ARM_FEATURE_SVE

// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON  // was 0, will be re-defined
#if defined(__ARM_FEATURE_AES)
#define HWY_BASELINE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES)
#else
#define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES)
#endif
#endif

#endif  // HWY_ARCH_ARM

// HWY_CHECK_*: 1 if the compiler indicates the instruction set is enabled,
// otherwise 0. Consumed by the HWY_BASELINE_* x86 checks below.

// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC

#if HWY_ARCH_X86_32
#if _M_IX86_FP >= 2
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif
#elif HWY_ARCH_X86_64
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif

// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
//    https://stackoverflow.com/questions/18563978/.
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSSE3 0
#define HWY_CHECK_SSE4 0
#endif

// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
//    PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1

#else  // non-MSVC

#if defined(__SSE2__)
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif

#if defined(__SSSE3__)
#define HWY_CHECK_SSSE3 1
#else
#define HWY_CHECK_SSSE3 0
#endif

#if defined(__SSE4_1__) && defined(__SSE4_2__)
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSE4 0
#endif

// If these are disabled, they should not gate the availability of SSE4/AVX2.
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
#define HWY_CHECK_PCLMUL_AES 1
#else
#define HWY_CHECK_PCLMUL_AES 0
#endif

#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
#define HWY_CHECK_BMI2_FMA 1
#else
#define HWY_CHECK_BMI2_FMA 0
#endif

#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
#define HWY_CHECK_F16C 1
#else
#define HWY_CHECK_F16C 0
#endif

#endif  // non-MSVC

// x86 SSE baselines: enabled if the user opted in via HWY_WANT_* or the
// corresponding HWY_CHECK_* is set.
#if HWY_ARCH_X86 && (HWY_WANT_SSE2 || HWY_CHECK_SSE2)
#define HWY_BASELINE_SSE2 HWY_SSE2
#else
#define HWY_BASELINE_SSE2 0
#endif

#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif

// Without HWY_WANT_SSE4, SSE4 additionally requires PCLMUL/AES.
#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif

// x86 AVX baselines: each one requires the previous baseline in the chain
// (SSE4 -> AVX2 -> AVX3 -> AVX3_DL -> AVX3_ZEN4 / AVX3_SPR).
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
    defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif

// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
    defined(__AVX512DQ__) && defined(__AVX512VL__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif

// TODO(janwas): not yet known whether these will be set by MSVC
#if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && defined(__VAES__) && \
    defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) &&                     \
    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) &&               \
    defined(__AVX512BITALG__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
#endif

// The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus
// considered better. Do not enable it unless the user explicitly requests it -
// we do not want to choose the ZEN4 path on Intel because it could be slower.
#if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0
#define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4
#else
#define HWY_BASELINE_AVX3_ZEN4 0
#endif

#if HWY_BASELINE_AVX3_DL != 0 && defined(__AVX512BF16__) && \
    defined(__AVX512FP16__)
#define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
#else
#define HWY_BASELINE_AVX3_SPR 0
#endif

// RVV requires intrinsics 0.11 or later, see #1156.
#if HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif

// Union of all per-platform baselines detected above.
// Allow the user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS                                               \
  (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 |           \
   HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_Z14 |             \
   HWY_BASELINE_Z15 | HWY_BASELINE_SVE2 | HWY_BASELINE_SVE |               \
   HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 |            \
   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 |             \
   HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \
   HWY_BASELINE_RVV)
#endif  // HWY_BASELINE_TARGETS

527//------------------------------------------------------------------------------
528// Choose target for static dispatch
529
530#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
531#if HWY_ENABLED_BASELINE == 0
532#error "At least one baseline target must be defined and enabled"
533#endif
534
535// Best baseline, used for static dispatch. This is the least-significant 1-bit
536// within HWY_ENABLED_BASELINE and lower bit values imply "better".
537#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
538
539// Start by assuming static dispatch. If we later use dynamic dispatch, this
540// will be defined to other targets during the multiple-inclusion, and finally
541// return to the initial value. Defining this outside begin/end_target ensures
542// inl headers successfully compile by themselves (required by Bazel).
543#define HWY_TARGET HWY_STATIC_TARGET
544
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies

// The HWY_COMPILE_ONLY_* macros are mutually exclusive: reject builds that
// define more than one of them.
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
         defined(HWY_COMPILE_ONLY_STATIC))
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.

// HWY_HAVE_RUNTIME_DISPATCH: 1 if runtime dispatch is supported for this
// compiler/platform combination, otherwise 0.
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm, PPC, S390X, and RISC-V: GCC and Clang 16+ do, and we require Linux
// to detect CPU capabilities. Currently require opt-in for Clang on Arm
// because it is experimental.
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV) && \
    (HWY_COMPILER_GCC_ACTUAL ||                                             \
     (HWY_COMPILER_CLANG >= 1600 &&                                         \
      (!HWY_ARCH_ARM || defined(HWY_ENABLE_CLANG_ARM_DISPATCH)))) &&        \
    HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif

// AVX3_DL is not widely available yet. To reduce code size and compile time,
// only include it in the set of attainable targets (for dynamic dispatch) if
// the user opts in, OR it is in the baseline (we check whether enabled below).
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL)
#else
#define HWY_ATTAINABLE_AVX3_DL 0
#endif

// Arm attainable targets. On A64 with runtime dispatch, both NEON variants
// are attainable; otherwise only the NEON baseline is.
#if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES)
#elif HWY_ARCH_ARM  // static dispatch, or HWY_ARCH_ARM_V7
#define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON)
#else
#define HWY_ATTAINABLE_NEON 0
#endif

// SVE targets are attainable on A64 with runtime dispatch, or when one of them
// is already in the enabled baseline.
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
                         (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif

#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
                         (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
#endif

// Power attainable targets: requires AltiVec, and on Clang also a PPC8
// baseline.
#if HWY_ARCH_PPC && defined(__ALTIVEC__) && \
    (!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0)

#if (HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10) && \
    !defined(HWY_SKIP_NON_BEST_BASELINE)
// On POWER with -m flags, we get compile errors (#1707) for targets older than
// the baseline specified via -m, so only generate the static target and better.
// Note that some Linux distros actually do set POWER9 as the baseline.
// This works by skipping case 3 below, so case 4 is reached.
#define HWY_SKIP_NON_BEST_BASELINE
#endif

#define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)

#else
#define HWY_ATTAINABLE_PPC 0
#endif

// S390X attainable targets: both Z targets once Z14 is in the baseline.
#if HWY_ARCH_S390X && HWY_BASELINE_Z14 != 0
#define HWY_ATTAINABLE_S390X (HWY_Z14 | HWY_Z15)
#else
#define HWY_ATTAINABLE_S390X 0
#endif

// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#if HWY_COMPILER_MSVC
// Fewer targets for faster builds.
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2)
#else  // !HWY_COMPILER_MSVC
#define HWY_ATTAINABLE_TARGETS                                          \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 |   \
              HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL |            \
              HWY_AVX3_ZEN4 | HWY_AVX3_SPR)
#endif  // !HWY_COMPILER_MSVC
#elif HWY_ARCH_ARM
#define HWY_ATTAINABLE_TARGETS                                  \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON |       \
              HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
#elif HWY_ARCH_PPC
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC)
#elif HWY_ARCH_S390X
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
#else
// All other platforms: only the enabled baseline is attainable.
#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
#endif  // HWY_ARCH_*

// HWY_TARGETS: the set of targets to compile, chosen by the first matching
// policy below.

// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
#define HWY_TARGETS HWY_EMU128

// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
// we currently still support it for backwards compatibility.
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
#define HWY_TARGETS HWY_SCALAR

// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET

// 3) For tests: include all attainable targets (in particular: scalar)
#elif (defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)) && \
    !defined(HWY_SKIP_NON_BEST_BASELINE)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS

// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
// sets all lower bits (better targets), then we also include the static target.
#else
#define HWY_TARGETS \
  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))

#endif  // target policy

685// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
686// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
687// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
688#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
689#error "Logic error: best baseline should be included in dynamic targets"
690#endif
691
692#endif // HIGHWAY_HWY_DETECT_TARGETS_H_