Grok 12.0.1
targets.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_TARGETS_H_
17#define HIGHWAY_HWY_TARGETS_H_
18
19// Allows opting out of C++ standard library usage, which is not available in
20// some Compiler Explorer environments.
21#ifndef HWY_NO_LIBCXX
22#include <vector>
23#endif
24
25// For SIMD module implementations and their callers. Defines which targets to
26// generate and call.
27
28#include "hwy/base.h"
29#include "hwy/detect_targets.h"
30#include "hwy/highway_export.h"
31
32#if !HWY_ARCH_RISCV && !defined(HWY_NO_LIBCXX)
33#include <atomic>
34#endif
35
36namespace hwy {
37
38// Returns bitfield of enabled targets that are supported on this CPU; there is
39// always at least one such target, hence the return value is never 0. The
40// targets returned may change after calling DisableTargets. This function is
41// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
42// calls to it if there is only a single target enabled.
43HWY_DLLEXPORT int64_t SupportedTargets();
44
45// Evaluates to a function call, or to the literal HWY_TARGETS if at most one bit is set in it (x & (x - 1) clears the lowest set bit of x).
46#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
47#define HWY_SUPPORTED_TARGETS HWY_TARGETS
48#else
49#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
50#endif
51
52// Subsequent calls to SupportedTargets will not return targets whose bit(s) are
53// set in `disabled_targets`. Exception: if SupportedTargets would return 0, it will
54// instead return HWY_STATIC_TARGET (there must always be one target to call).
55//
56// This function is useful for disabling targets known to be buggy, or if the
57// best available target is undesirable (perhaps due to throttling or memory
58// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
59// function for iteratively enabling specific targets for testing.
60HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
61
62// Subsequent SupportedTargets will return the given set of targets, except
63// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
64// and return to the normal SupportedTargets behavior. Used to run tests for
65// all targets.
66HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
67
68#ifndef HWY_NO_LIBCXX
69
70// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
71// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
72// is affected by the current SetSupportedTargetsForTest() mock if any.
73HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
74 std::vector<int64_t> ret;
75 for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
76 targets = targets & (targets - 1)) {  // clears the lowest set bit
77 int64_t current_target = targets & ~(targets - 1);  // isolates the lowest set bit
78 ret.push_back(current_target);
79 }
80 return ret;
81}
82
83#endif // HWY_NO_LIBCXX
84// Returns the name of the single target bit `target`, or "Unknown" if no case matches on this architecture.
85static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
86 switch (target) {
87#if HWY_ARCH_X86
88 case HWY_SSE2:
89 return "SSE2";
90 case HWY_SSSE3:
91 return "SSSE3";
92 case HWY_SSE4:
93 return "SSE4";
94 case HWY_AVX2:
95 return "AVX2";
96 case HWY_AVX3:
97 return "AVX3";
98 case HWY_AVX3_DL:
99 return "AVX3_DL";
100 case HWY_AVX3_ZEN4:
101 return "AVX3_ZEN4";
102 case HWY_AVX3_SPR:
103 return "AVX3_SPR";
104#endif
105
106#if HWY_ARCH_ARM
107 case HWY_SVE2_128:
108 return "SVE2_128";
109 case HWY_SVE_256:
110 return "SVE_256";
111 case HWY_SVE2:
112 return "SVE2";
113 case HWY_SVE:
114 return "SVE";
115 case HWY_NEON:
116 return "NEON";
117 case HWY_NEON_WITHOUT_AES:
118 return "NEON_WITHOUT_AES";
119#endif
120
121#if HWY_ARCH_PPC
122 case HWY_PPC8:
123 return "PPC8";
124 case HWY_PPC9:
125 return "PPC9";
126 case HWY_PPC10:
127 return "PPC10";
128#endif
129
130#if HWY_ARCH_S390X
131 case HWY_Z14:
132 return "Z14";
133 case HWY_Z15:
134 return "Z15";
135#endif
136
137#if HWY_ARCH_WASM
138 case HWY_WASM:
139 return "WASM";
140 case HWY_WASM_EMU256:
141 return "WASM_EMU256";
142#endif
143
144#if HWY_ARCH_RISCV
145 case HWY_RVV:
146 return "RVV";
147#endif
148
149 case HWY_EMU128:
150 return "EMU128";
151 case HWY_SCALAR:
152 return "SCALAR";
153
154 default:
155 return "Unknown"; // must satisfy gtest IsValidParamName()
156 }
157}
158
159// The maximum number of dynamic targets on any architecture is defined by
160// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
161
162// For the ChosenTarget mask and index we use a different bit arrangement than
163// in the HWY_TARGETS mask. Only the targets involved in the current
164// architecture are used in this mask, and therefore only the least significant
165// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
166// significant bit is set when the mask is not initialized, the next
167// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
168// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
169// that position and the next more significant bit is used for HWY_SCALAR (if
170// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
171// define equivalent values for HWY_TARGETS in this representation.
172// This mask representation allows using ctz() on the mask to obtain a small
173// number that is used as the index into the table for dynamic dispatch. In this
174// way the first entry is used when the mask is uninitialized, the following
175// HWY_MAX_DYNAMIC_TARGETS entries are for dynamic dispatch and the last one is
176// for scalar.
177
178// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format, i.e. the bit directly above the HWY_MAX_DYNAMIC_TARGETS dynamic-target bits. HWY_MAX_DYNAMIC_TARGETS is defined further below; this is fine because macros are only expanded where they are used.
179#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
180
181// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
182// current architecture: keeps the HWY_MAX_DYNAMIC_TARGETS bits of X that end at bit HWY_HIGHEST_TARGET_BIT and moves them to bits 1..HWY_MAX_DYNAMIC_TARGETS, leaving bit 0 clear.
183#define HWY_CHOSEN_TARGET_SHIFT(X) \
184 ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
185 ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
186 << 1)
187
188// The HWY_TARGETS mask in the ChosenTarget mask format, plus the scalar bit and bit 0 (the bit that is set while the mask is not initialized).
189#define HWY_CHOSEN_TARGET_MASK_TARGETS \
190 (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
191
192#if HWY_ARCH_X86
193// Maximum number of dynamic targets. Changing this value is an ABI-incompatible
194// change.
195#define HWY_MAX_DYNAMIC_TARGETS 15
196#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
197// These must match the order in which the HWY_TARGETS are defined,
198// starting from the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
199// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
200// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
201// corresponds to the best target. Don't include a "," at the end of the list.
202#define HWY_CHOOSE_TARGET_LIST(func_name) \
203 nullptr, /* reserved */ \
204 nullptr, /* reserved */ \
205 nullptr, /* reserved */ \
206 nullptr, /* reserved */ \
207 HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \
208 nullptr, /* reserved */ \
209 HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \
210 HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
211 HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
212 HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
213 nullptr, /* AVX */ \
214 HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
215 HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
216 nullptr, /* reserved - SSE3? */ \
217 HWY_CHOOSE_SSE2(func_name) /* SSE2 */
218
219#elif HWY_ARCH_ARM
220// See HWY_ARCH_X86 above for details. Only the last six of the 15 entries are populated.
221#define HWY_MAX_DYNAMIC_TARGETS 15
222#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
223#define HWY_CHOOSE_TARGET_LIST(func_name) \
224 nullptr, /* reserved */ \
225 nullptr, /* reserved */ \
226 nullptr, /* reserved */ \
227 nullptr, /* reserved */ \
228 nullptr, /* reserved */ \
229 nullptr, /* reserved */ \
230 nullptr, /* reserved */ \
231 nullptr, /* reserved */ \
232 nullptr, /* reserved */ \
233 HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
234 HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
235 HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
236 HWY_CHOOSE_SVE(func_name), /* SVE */ \
237 HWY_CHOOSE_NEON(func_name), /* NEON */ \
238 HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */
239
240#elif HWY_ARCH_RISCV
241// See HWY_ARCH_X86 above for details. RVV is the only populated entry.
242#define HWY_MAX_DYNAMIC_TARGETS 9
243#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
244#define HWY_CHOOSE_TARGET_LIST(func_name) \
245 nullptr, /* reserved */ \
246 nullptr, /* reserved */ \
247 nullptr, /* reserved */ \
248 nullptr, /* reserved */ \
249 nullptr, /* reserved */ \
250 nullptr, /* reserved */ \
251 nullptr, /* reserved */ \
252 HWY_CHOOSE_RVV(func_name), /* RVV */ \
253 nullptr /* reserved */
254
255#elif HWY_ARCH_PPC || HWY_ARCH_S390X
256// See HWY_ARCH_X86 above for details. PPC and S390X share this table and HWY_HIGHEST_TARGET_BIT_PPC.
257#define HWY_MAX_DYNAMIC_TARGETS 9
258#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
259#define HWY_CHOOSE_TARGET_LIST(func_name) \
260 nullptr, /* reserved */ \
261 nullptr, /* reserved */ \
262 nullptr, /* reserved */ \
263 nullptr, /* reserved */ \
264 HWY_CHOOSE_PPC10(func_name), /* PPC10 */ \
265 HWY_CHOOSE_PPC9(func_name), /* PPC9 */ \
266 HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
267 HWY_CHOOSE_Z15(func_name), /* Z15 */ \
268 HWY_CHOOSE_Z14(func_name) /* Z14 */
269
270#elif HWY_ARCH_WASM
271// See HWY_ARCH_X86 above for details. Only WASM_EMU256 and WASM are populated.
272#define HWY_MAX_DYNAMIC_TARGETS 9
273#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
274#define HWY_CHOOSE_TARGET_LIST(func_name) \
275 nullptr, /* reserved */ \
276 nullptr, /* reserved */ \
277 nullptr, /* reserved */ \
278 nullptr, /* reserved */ \
279 nullptr, /* reserved */ \
280 nullptr, /* reserved */ \
281 HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
282 HWY_CHOOSE_WASM(func_name), /* WASM */ \
283 nullptr /* reserved */
284
285#else
286// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
287// still creating single-entry tables in HWY_EXPORT to ensure portability. No HWY_CHOOSE_TARGET_LIST is defined in this branch.
288#define HWY_MAX_DYNAMIC_TARGETS 1
289#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
290#endif
291
292// Bitfield of supported and enabled targets. The format differs from that of
293// HWY_TARGETS; the lowest bit governs the first function pointer (which is
294// special in that it calls FunctionCache, then Update, then dispatches to the
295// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
296// GetChosenTarget), thread-safe except on RVV or if HWY_NO_LIBCXX is defined (the mask is then a plain int64_t).
297struct ChosenTarget {
298 public:
299 // Reset bits according to `targets` (typically the return value of
300 // SupportedTargets()). Postcondition: IsInitialized() == true.
301 void Update(int64_t targets) {
302 // These are `targets` shifted downwards, see above. Also include SCALAR
303 // (corresponds to the last entry in the function table) as fallback.
304 StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
305 }
306
307 // Reset to the uninitialized state, so that FunctionCache will call Update
308 // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
309 void DeInit() { StoreMask(1); }
310
311 // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
312 // function was called, which we check in tests.
313 bool IsInitialized() const { return LoadMask() != 1; }
314
315 // Return the index in the dynamic dispatch table to be used by the current
316 // CPU. Note that this method must be in the header file so it uses the value
317 // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
318 // calls it, which may be different from others. This means we only enable
319 // those targets that were actually compiled in this module.
320 size_t HWY_INLINE GetIndex() const {
321 return hwy::Num0BitsBelowLS1Bit_Nonzero64(
322 static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
323 }
324
325 private:
326 // TODO(janwas): remove RVV once <atomic> is available
327#if HWY_ARCH_RISCV || defined(HWY_NO_LIBCXX)
328 int64_t LoadMask() const { return mask_; }
329 void StoreMask(int64_t mask) { mask_ = mask; }
330
331 int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
332#else
333 int64_t LoadMask() const { return mask_.load(); }
334 void StoreMask(int64_t mask) { mask_.store(mask); }
335
336 std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
337#endif // HWY_ARCH_RISCV || defined(HWY_NO_LIBCXX)
338};
339
340// For internal use (e.g. by FunctionCache and DisableTargets). Returns the ChosenTarget monostate.
341HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
342
343} // namespace hwy
344
345#endif // HIGHWAY_HWY_TARGETS_H_
#define HWY_INLINE
Definition base.h:101
#define HWY_MAYBE_UNUSED
Definition base.h:113
#define HWY_WASM_EMU256
Definition detect_targets.h:117
#define HWY_SSE2
Definition detect_targets.h:80
#define HWY_AVX3_DL
Definition detect_targets.h:73
#define HWY_NEON
Definition detect_targets.h:93
#define HWY_EMU128
Definition detect_targets.h:124
#define HWY_AVX3_SPR
Definition detect_targets.h:63
#define HWY_PPC8
Definition detect_targets.h:110
#define HWY_Z15
Definition detect_targets.h:111
#define HWY_PPC10
Definition detect_targets.h:108
#define HWY_SVE2
Definition detect_targets.h:91
#define HWY_AVX3
Definition detect_targets.h:74
#define HWY_AVX2
Definition detect_targets.h:75
#define HWY_SCALAR
Definition detect_targets.h:126
#define HWY_SVE_256
Definition detect_targets.h:90
#define HWY_SVE2_128
Definition detect_targets.h:89
#define HWY_AVX3_ZEN4
Definition detect_targets.h:68
#define HWY_Z14
Definition detect_targets.h:112
#define HWY_PPC9
Definition detect_targets.h:109
#define HWY_WASM
Definition detect_targets.h:118
#define HWY_SVE
Definition detect_targets.h:92
#define HWY_RVV
Definition detect_targets.h:99
#define HWY_TARGETS
Definition detect_targets.h:680
#define HWY_SSE4
Definition detect_targets.h:77
#define HWY_NEON_WITHOUT_AES
Definition detect_targets.h:94
#define HWY_SSSE3
Definition detect_targets.h:78
#define HWY_DLLEXPORT
Definition highway_export.h:13
Definition abort.h:8
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:2551
static HWY_MAYBE_UNUSED const char * TargetName(int64_t target)
Definition targets.h:85
HWY_DLLEXPORT ChosenTarget & GetChosenTarget()
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets)
HWY_INLINE std::vector< int64_t > SupportedAndGeneratedTargets()
Definition targets.h:73
HWY_DLLEXPORT int64_t SupportedTargets()
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets)
Definition targets.h:297
bool IsInitialized() const
Definition targets.h:313
void StoreMask(int64_t mask)
Definition targets.h:334
size_t HWY_INLINE GetIndex() const
Definition targets.h:320
void DeInit()
Definition targets.h:309
void Update(int64_t targets)
Definition targets.h:301
int64_t LoadMask() const
Definition targets.h:333
std::atomic< int64_t > mask_
Definition targets.h:336
#define HWY_CHOSEN_TARGET_MASK_TARGETS
Definition targets.h:189
#define HWY_CHOSEN_TARGET_SHIFT(X)
Definition targets.h:183
#define HWY_CHOSEN_TARGET_MASK_SCALAR
Definition targets.h:179