Grok 12.0.1
cache_control.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
17#define HIGHWAY_HWY_CACHE_CONTROL_H_
18
19#include "hwy/base.h"
20
21// Requires SSE2; fails to compile on 32-bit Clang 7 (see
22// https://github.com/gperftools/gperftools/issues/946).
23#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
24#undef HWY_DISABLE_CACHE_CONTROL
25#define HWY_DISABLE_CACHE_CONTROL
26#endif
27
28#ifndef HWY_DISABLE_CACHE_CONTROL
29// intrin.h is sufficient on MSVC and already included by base.h.
30#if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
31#include <emmintrin.h> // SSE2
32#include <xmmintrin.h> // _mm_prefetch
33#elif HWY_ARCH_ARM_A64
34#include <arm_acle.h>
35#endif
36#endif // HWY_DISABLE_CACHE_CONTROL
37
38namespace hwy {
39
40// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
41#define HWY_STREAM_MULTIPLE 16
42
43// The following functions may also require an attribute.
44#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
45#define HWY_ATTR_CACHE __attribute__((target("sse2")))
46#else
47#define HWY_ATTR_CACHE
48#endif
49
50// Windows.h #defines this, which causes infinite recursion. Temporarily
51// undefine to avoid conflict with our function.
52// TODO(janwas): remove when this function is removed.
53#pragma push_macro("LoadFence")
54#undef LoadFence
55
56// Delays subsequent loads until prior loads are visible. Beware of potentially
57// differing behavior across architectures and vendors: on Intel but not
58// AMD CPUs, also serves as a full fence (waits for all prior instructions to
59// complete).
60HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
61#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
62 _mm_lfence();
63#endif
64}
65
66// TODO(janwas): remove when this function is removed. (See above.)
67#pragma pop_macro("LoadFence")
68
69// Ensures values written by previous `Stream` calls are visible on the current
70// core. This is NOT sufficient for synchronizing across cores; when `Stream`
71// outputs are to be consumed by other core(s), the producer must publish
72// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
73HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
74#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
75 _mm_sfence();
76#endif
77}
78
79// Optionally begins loading the cache line containing "p" to reduce latency of
80// subsequent actual loads.
81template <typename T>
82HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
83 (void)p;
84#ifndef HWY_DISABLE_CACHE_CONTROL
85#if HWY_ARCH_X86
86 _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
87#elif HWY_COMPILER_GCC // includes clang
88 // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
89 // desirable, so use the default 3 (keep in caches).
90 __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
91#endif
92#endif // HWY_DISABLE_CACHE_CONTROL
93}
94
95// Invalidates and flushes the cache line containing "p", if possible.
96HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
97#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
98 _mm_clflush(p);
99#else
100 (void)p;
101#endif
102}
103
104// Hints that we are inside a spin loop and potentially reduces power
105// consumption and coherency traffic. For example, x86 avoids multiple
106// outstanding load requests, which reduces the memory order violation penalty
107// when exiting the loop.
108HWY_INLINE HWY_ATTR_CACHE void Pause() {
109#ifndef HWY_DISABLE_CACHE_CONTROL
110#if HWY_ARCH_X86
111 _mm_pause();
112#elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
113 // This is documented in ACLE and the YIELD instruction is also available in
114 // Armv7, but the intrinsic is broken for Armv7 clang, hence A64 only.
115 __yield();
116#elif HWY_ARCH_ARM && HWY_COMPILER_GCC // includes clang
117 __asm__ volatile("yield" ::: "memory");
118#elif HWY_ARCH_PPC && HWY_COMPILER_GCC // includes clang
119 __asm__ volatile("or 27,27,27" ::: "memory");
120#endif
121#endif // HWY_DISABLE_CACHE_CONTROL
122}
123
124} // namespace hwy
125
126#endif // HIGHWAY_HWY_CACHE_CONTROL_H_
#define HWY_INLINE
Definition base.h:101
#define HWY_ATTR_CACHE
Definition cache_control.h:47
Definition abort.h:8
HWY_INLINE HWY_ATTR_CACHE void FlushStream()
Definition cache_control.h:73
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T *p)
Definition cache_control.h:82
HWY_INLINE HWY_ATTR_CACHE void Pause()
Definition cache_control.h:108
HWY_INLINE HWY_ATTR_CACHE void LoadFence()
Definition cache_control.h:60
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void *p)
Definition cache_control.h:96