Grok 12.0.1
timer-inl.h
Go to the documentation of this file.
1// Copyright 2023 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// High-resolution and high-precision timer
17
18// Per-target include guard
19#if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE)
20#ifdef HIGHWAY_HWY_TIMER_INL_H_
21#undef HIGHWAY_HWY_TIMER_INL_H_
22#else
23#define HIGHWAY_HWY_TIMER_INL_H_
24#endif
25
26#include "hwy/highway.h"
27
28#if defined(_WIN32) || defined(_WIN64)
29#ifndef NOMINMAX
30#define NOMINMAX
31#endif // NOMINMAX
32#include <windows.h>
33#endif
34
35#if defined(__APPLE__)
36#include <mach/mach.h>
37#include <mach/mach_time.h>
38#endif
39
40#if defined(__HAIKU__)
41#include <OS.h>
42#endif
43
44#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
45#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
46#endif
47
48#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
49#include <intrin.h>
50#endif
51
52#include <stdint.h>
53#include <time.h> // clock_gettime
54
56namespace hwy {
57namespace HWY_NAMESPACE {
58namespace timer {
59
60// Ticks := platform-specific timer values (CPU cycles on x86). Must be
61// unsigned to guarantee wraparound on overflow.
// This is the return type of Start() and Stop() in this namespace.
62using Ticks = uint64_t;
63
64// Start/Stop return absolute timestamps and must be placed immediately before
65// and after the region to measure. We provide separate Start/Stop functions
66// because they use different fences.
67//
68// Background: RDTSC is not 'serializing'; earlier instructions may complete
69// after it, and/or later instructions may complete before it. 'Fences' ensure
70// regions' elapsed times are independent of such reordering. The only
71// documented unprivileged serializing instruction is CPUID, which acts as a
72// full fence (no reordering across it in either direction). Unfortunately
73// the latency of CPUID varies wildly (perhaps made worse by not initializing
74// its EAX input). Because it cannot reliably be deducted from the region's
75// elapsed time, it must not be included in the region to measure (i.e.
76// between the two RDTSC).
77//
78// The newer RDTSCP is sometimes described as serializing, but it actually
79// only serves as a half-fence with release semantics. Although all
80// instructions in the region will complete before the final timestamp is
81// captured, subsequent instructions may leak into the region and increase the
82// elapsed time. Inserting another fence after the final RDTSCP would prevent
83// such reordering without affecting the measured region.
84//
85// Fortunately, such a fence exists. The LFENCE instruction is only documented
86// to delay later loads until earlier loads are visible. However, Intel's
87// reference manual says it acts as a full fence (waiting until all earlier
88// instructions have completed, and delaying later instructions until it
89// completes). AMD assigns the same behavior to MFENCE.
90//
91// We need a fence before the initial RDTSC to prevent earlier instructions
92// from leaking into the region, and arguably another after RDTSC to prevent
93// region instructions from completing before the timestamp is recorded.
94// When surrounded by fences, the additional RDTSCP half-fence provides no
95// benefit, so the initial timestamp can be recorded via RDTSC, which has
96// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
97// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
98//
99// Using Start+Start leads to higher variance and overhead than Stop+Stop.
100// However, Stop+Stop includes an LFENCE in the region measurements, which
101// adds a delay dependent on earlier loads. The combination of Start+Stop
102// is faster than Start+Start and more consistent than Stop+Stop because
103// the first LFENCE already delayed subsequent loads before the measured
104// region. This combination seems not to have been considered in prior work:
105// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
106//
107// Note: performance counters can measure 'exact' instructions-retired or
108// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
109// requires fences. Unfortunately, it is not accessible on all OSes and we
110// prefer to avoid kernel-mode drivers. Performance counters are also affected
111// by several under/over-count errata, so we use the TSC instead.
112
113// Returns a 64-bit timestamp in units of 'ticks'; to convert to seconds,
114// divide by InvariantTicksPerSecond.
//
// Intended to be placed immediately before the region to measure and paired
// with Stop(). The tick source is selected at compile time by the #if chain
// below; the first matching branch wins, so the unit of a tick differs between
// platforms (a hardware counter on some, nanoseconds on the POSIX fallback).
115inline Ticks Start() {
116 Ticks t;
117#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
    // Reads special-purpose register 268 via mfspr. Presumably this is the
    // time base whose frequency __ppc_get_timebase_freq reports - TODO confirm.
118 asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
119#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
120 // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
121 asm volatile("mrs %0, cntvct_el0" : "=r"(t));
122#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
    // Intrinsic form of LFENCE / RDTSC / LFENCE. Every step is separated by
    // _ReadWriteBarrier(), presumably to stop the compiler from moving memory
    // accesses across the fences and the timestamp read - TODO confirm.
123 _ReadWriteBarrier();
124 _mm_lfence();
125 _ReadWriteBarrier();
126 t = __rdtsc();
127 _ReadWriteBarrier();
128 _mm_lfence();
129 _ReadWriteBarrier();
130#elif HWY_ARCH_X86_64
    // LFENCE / RDTSC / LFENCE in a single asm statement. The upper 32 bits of
    // the counter arrive in rdx; they are shifted left by 32 and OR-ed into the
    // output operand to form the full 64-bit value.
131 asm volatile(
132 "lfence\n\t"
133 "rdtsc\n\t"
134 "shl $32, %%rdx\n\t"
135 "or %%rdx, %0\n\t"
136 "lfence"
137 : "=a"(t)
138 :
139 // "memory" avoids reordering. rdx = TSC >> 32.
140 // "cc" = flags modified by SHL.
141 : "rdx", "memory", "cc");
142#elif HWY_ARCH_RISCV
    // Issues a fence instruction, then reads the counter with rdtime.
143 asm volatile("fence; rdtime %0" : "=r"(t));
144#elif defined(_WIN32) || defined(_WIN64)
    // The return value of QueryPerformanceCounter is deliberately discarded.
145 LARGE_INTEGER counter;
146 (void)QueryPerformanceCounter(&counter);
147 t = counter.QuadPart;
148#elif defined(__APPLE__)
149 t = mach_absolute_time();
150#elif defined(__HAIKU__)
151 t = system_time_nsecs(); // since boot
152#else // POSIX
    // Monotonic clock converted to nanoseconds (seconds * 1e9 + nanoseconds).
    // The return value of clock_gettime is not checked.
153 timespec ts;
154 clock_gettime(CLOCK_MONOTONIC, &ts);
155 t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
156#endif
157 return t;
158}
159
// Returns a timestamp taken from the same tick source as Start(); intended to
// be placed immediately after the region to measure. Only the x86 branches
// differ from Start(): they read the counter with RDTSCP and then issue a
// single LFENCE. All other platforms either repeat the Start() instruction or
// call Start() directly.
160// WARNING: on x86, caller must check HasRDTSCP before using this!
161inline Ticks Stop() {
    // Ticks is an alias of uint64_t, so this matches the return type.
162 uint64_t t;
163#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
    // Same register read as in Start().
164 asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
165#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
166 // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
167 asm volatile("mrs %0, cntvct_el0" : "=r"(t));
168#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
    // Intrinsic form of RDTSCP / LFENCE. __rdtscp requires an output location
    // for its auxiliary value; aux is written but never read here.
169 _ReadWriteBarrier();
170 unsigned aux;
171 t = __rdtscp(&aux);
172 _ReadWriteBarrier();
173 _mm_lfence();
174 _ReadWriteBarrier();
175#elif HWY_ARCH_X86_64
176 // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
    // As in Start(), the upper 32 bits arrive in rdx and are shifted and OR-ed
    // into the output operand; rcx is listed as clobbered and is not used.
177 asm volatile(
178 "rdtscp\n\t"
179 "shl $32, %%rdx\n\t"
180 "or %%rdx, %0\n\t"
181 "lfence"
182 : "=a"(t)
183 :
184 // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
185 // "cc" = flags modified by SHL.
186 : "rcx", "rdx", "memory", "cc");
187#else
    // No distinct stop sequence on the remaining platforms: reuse Start().
188 t = Start();
189#endif
190 return t;
191}
192
193} // namespace timer
194
195// NOLINTNEXTLINE(google-readability-namespace-comments)
196} // namespace HWY_NAMESPACE
197} // namespace hwy
199
200#endif // per-target include guard
uint64_t Ticks
Definition timer-inl.h:62
Ticks Stop()
Definition timer-inl.h:161
Ticks Start()
Definition timer-inl.h:115
Definition abort.h:8
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()