profiler.h
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_PROFILER_H_
#define HIGHWAY_HWY_PROFILER_H_

// High-precision, low-overhead time measurements. Returns exact call counts
// and total elapsed time for user-defined 'zones' (code regions, i.e. C++
// scopes).
//
// Uses RAII to capture begin/end timestamps, with user-specified zone names:
//   { PROFILER_ZONE("name"); /*code*/ }
// or the name of the current function:
//   void FuncToMeasure() { PROFILER_FUNC; /*code*/ }
//
// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
// print call counts and average durations [CPU cycles] to stdout, sorted in
// descending order of total duration.
//
// The binary MUST be built with --dynamic_mode=off because we rely on the
// data segments being nearby; if not, an assertion will likely fail.
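//
// Usage sketch (illustrative only; LoadFile and main are hypothetical names,
// not part of this header):
//
//   void LoadFile() {
//     PROFILER_FUNC;  // zone named after the enclosing function
//     {
//       PROFILER_ZONE("Parse");  // nested zone with a literal name
//       /* parsing code */
//     }
//   }
//
//   int main() {
//     LoadFile();
//     PROFILER_PRINT_RESULTS();  // once all threads have exited all zones
//     return 0;
//   }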

#include "hwy/base.h"

// Configuration settings:

// If zero, this file has no effect and no measurements will be recorded.
#ifndef PROFILER_ENABLED
#define PROFILER_ENABLED 0
#endif

// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// and discard packets, thus temporarily adding some observer overhead.
// Each zone occupies 16 bytes.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 200ULL
#endif

#if PROFILER_ENABLED || HWY_IDE

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>  // strcmp

#include <algorithm>  // std::sort
#include <atomic>

#include "hwy/aligned_allocator.h"
#include "hwy/cache_control.h"  // FlushStream
// #include "hwy/contrib/sort/vqsort.h"
#include "hwy/highway.h"  // Stream
#include "hwy/robust_statistics.h"
#include "hwy/timer-inl.h"
#include "hwy/timer.h"

#define PROFILER_PRINT_OVERHEAD 0

namespace hwy {

// Upper bounds for fixed-size data structures (guarded via HWY_DASSERT):

// How many threads can actually enter a zone (those that don't do not count).
// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
// WARNING: a fiber library can spawn hundreds of threads.
static constexpr size_t kMaxThreads = 256;

static constexpr size_t kMaxDepth = 64;  // Maximum nesting of zones.

static constexpr size_t kMaxZones = 256;  // Total number of zones.

// Overwrites "to" without loading it into the cache (read-for-ownership).
// Both pointers must be aligned.
HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
                                     uint64_t* HWY_RESTRICT to) {
  namespace hn = HWY_NAMESPACE;
  const hn::ScalableTag<uint64_t> d;
  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += hn::Lanes(d)) {
    hn::Stream(hn::Load(d, from + i), d, to + i);
  }
}

#pragma pack(push, 1)

// Represents zone entry/exit events. Stores a full-resolution timestamp plus
// an offset (representing zone name or identifying exit packets). POD.
class Packet {
 public:
  // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
  // (governed by kMaxZones). We have seen multi-megabyte offsets.
  static constexpr size_t kOffsetBits = 25;
  static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);

  // We need full-resolution timestamps; at an effective rate of 4 GHz,
  // this permits 1 minute zone durations (for longer durations, split into
  // multiple zones). Wraparound is handled by masking.
  static constexpr size_t kTimestampBits = 64 - kOffsetBits;
  static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;

  static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
    HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));

    Packet packet;
    packet.bits_ =
        (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
    return packet;
  }

  uint64_t Timestamp() const { return bits_ & kTimestampMask; }

  size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }

 private:
  uint64_t bits_;
};
static_assert(sizeof(Packet) == 8, "Wrong Packet size");
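
// Example of the packing (illustrative): with kOffsetBits = 25 and therefore
// kTimestampBits = 39, Make(3, 0x123) stores
//   bits_ = (3 << 39) + (0x123 & kTimestampMask),
// so BiasedOffset() returns 3 and Timestamp() returns 0x123. Timestamps wrap
// every 2^39 ticks; AnalyzePackets masks the begin/end difference, so
// durations stay correct as long as a single zone lasts less than 2^39 ticks.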

// Returns the address of a string literal. Assuming zone names are also
// literals and stored nearby, we can represent them as offsets, which are
// faster to compute than hashes or even a static index.
//
// This function must not be static - each call (even from other translation
// units) must return the same value.
inline const char* StringOrigin() {
  // Chosen such that no zone name is a prefix nor suffix of this string
  // to ensure they aren't merged (offset 0 identifies zone-exit packets).
  static const char* string_origin = "__#__";
  return string_origin - Packet::kOffsetBias;
}
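
// For illustration: WriteEntry (below) computes
//   biased_offset = name - StringOrigin()
//                 = (name - string_origin) + Packet::kOffsetBias,
// which remains positive even if the literal "name" is stored slightly before
// string_origin. A biased offset equal to kOffsetBias (i.e. an unbiased
// offset of 0) is reserved for zone-exit packets.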

// Representation of an active zone, stored in a stack. Used to deduct
// child duration from the parent's self time. POD.
struct Node {
  Packet packet;
  uint64_t child_total;
};
static_assert(sizeof(Node) == 16, "Wrong Node size");

// Holds statistics for all zones with the same name. POD.
struct Accumulator {
  static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;

  uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
  uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
  uint64_t Duration() const { return u128.hi; }

  void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
    u128.hi = duration;
    u128.lo = (biased_offset << kNumCallBits) + num_calls;
  }

  void Add(uint64_t num_calls, uint64_t duration) {
    u128.lo += num_calls;
    u128.hi += duration;
  }

  // For fast sorting by duration, which must therefore be the hi element.
  // lo holds BiasedOffset and NumCalls.
  uint128_t u128;
};
static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");

template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}

// Per-thread call graph (stack) and Accumulator for each zone.
class Results {
 public:
  Results() { ZeroBytes(zones_, sizeof(zones_)); }

  // Used for computing overhead when this thread encounters its first Zone.
  // This has no observable effect apart from increasing "analyze_elapsed_".
  uint64_t ZoneDuration(const Packet* packets) {
    HWY_DASSERT(depth_ == 0);
    HWY_DASSERT(num_zones_ == 0);
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].Duration();
    zones_[0].Set(0, 0, 0);
    HWY_DASSERT(depth_ == 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }

  // Draw all required information from the packets, which can be discarded
  // afterwards. Called whenever this thread's storage is full.
  void AnalyzePackets(const Packet* packets, const size_t num_packets) {
    namespace hn = HWY_NAMESPACE;
    const uint64_t t0 = hn::timer::Start();

    for (size_t i = 0; i < num_packets; ++i) {
      const Packet p = packets[i];
      // Entering a zone
      if (p.BiasedOffset() != Packet::kOffsetBias) {
        HWY_DASSERT(depth_ < kMaxDepth);
        nodes_[depth_].packet = p;
        nodes_[depth_].child_total = 0;
        ++depth_;
        continue;
      }

      HWY_DASSERT(depth_ != 0);
      const Node& node = nodes_[depth_ - 1];
      // Masking correctly handles unsigned wraparound.
      const uint64_t duration =
          (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + node.child_total);

      UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
      --depth_;

      // Deduct this nested node's time from its parent's self_duration.
      if (depth_ != 0) {
        nodes_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0;
  }
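
  // Worked example (illustrative): if a parent zone lasts 100 ticks and
  // contains one child zone of 30 ticks, the child's exit packet credits the
  // child with 30 ticks minus the measured overhead and adds
  // 30 + child_overhead_ to the parent's child_total. The parent's exit
  // packet then records a self-duration of roughly 100 - 30 = 70 ticks,
  // again minus the per-zone overheads.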

  // Incorporates results from another thread. Call after all threads have
  // exited any zones.
  void Assimilate(const Results& other) {
    namespace hn = HWY_NAMESPACE;
    const uint64_t t0 = hn::timer::Start();
    HWY_DASSERT(depth_ == 0);
    HWY_DASSERT(other.depth_ == 0);

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const Accumulator& zone = other.zones_[i];
      UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
    }
    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }

  // Single-threaded.
  void Print() {
    namespace hn = HWY_NAMESPACE;
    const uint64_t t0 = hn::timer::Start();
    MergeDuplicates();

    // Sort by decreasing total (self) cost.
    // VQSort(&zones_[0].u128, num_zones_, SortDescending());
    std::sort(zones_, zones_ + num_zones_,
              [](const Accumulator& r1, const Accumulator& r2) {
                return r1.Duration() > r2.Duration();
              });

    const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();

    const char* string_origin = StringOrigin();
    for (size_t i = 0; i < num_zones_; ++i) {
      const Accumulator& r = zones_[i];
      const uint64_t num_calls = r.NumCalls();
      printf("%-40s: %10zu x %15zu = %9.6f\n", string_origin + r.BiasedOffset(),
             num_calls, r.Duration() / num_calls,
             static_cast<double>(r.Duration()) * inv_freq);
    }

    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0;
    printf("Total analysis [s]: %f\n",
           static_cast<double>(analyze_elapsed_) * inv_freq);
  }

 private:
  // Updates an existing Accumulator (uniquely identified by biased_offset) or
  // adds one if this is the first time this thread analyzed that zone.
  // Uses a self-organizing list data structure, which avoids dynamic memory
  // allocations and is far faster than unordered_map. Loads, updates and
  // stores the entire Accumulator with vector instructions.
  void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
                   const uint64_t duration) {
    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));

    // Special case for first zone: (maybe) update, without swapping.
    if (zones_[0].BiasedOffset() == biased_offset) {
      zones_[0].Add(num_calls, duration);
      HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
      return;
    }

    // Look for a zone with the same offset.
    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].BiasedOffset() == biased_offset) {
        zones_[i].Add(num_calls, duration);
        HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
        // Swap with predecessor (more conservative than move to front,
        // but at least as successful).
        const Accumulator prev = zones_[i - 1];
        zones_[i - 1] = zones_[i];
        zones_[i] = prev;
        return;
      }
    }

    // Not found; create a new Accumulator.
    HWY_DASSERT(num_zones_ < kMaxZones);
    Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
    zone->Set(biased_offset, num_calls, duration);
    HWY_DASSERT(zone->BiasedOffset() == biased_offset);
    ++num_zones_;
  }
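
  // Illustrative behavior of the self-organizing list: with zones_ holding
  // [A, B, C], an update to C swaps it with its predecessor, giving
  // [A, C, B]. Frequently updated zones therefore migrate toward the front,
  // so the common case inspects only the first entry or two instead of
  // scanning all num_zones_ entries.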

  // Each instantiation of a function template seems to get its own copy of
  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
  // acceptable because we only expect a few dozen zones.
  void MergeDuplicates() {
    const char* string_origin = StringOrigin();
    for (size_t i = 0; i < num_zones_; ++i) {
      const size_t biased_offset = zones_[i].BiasedOffset();
      const char* name = string_origin + biased_offset;
      // Separate num_calls from biased_offset so we can add them together.
      uint64_t num_calls = zones_[i].NumCalls();

      // Add any subsequent duplicates to num_calls and total_duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
          num_calls += zones_[j].NumCalls();
          zones_[i].Add(0, zones_[j].Duration());
          // Fill hole with last item.
          zones_[j] = zones_[--num_zones_];
        } else {  // Name differed, try next Accumulator.
          ++j;
        }
      }

      HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));

      // Re-pack regardless of whether any duplicates were found.
      zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones.
  size_t num_zones_ = 0;  // Number of retired zones.

  alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth];         // Stack
  alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones];  // Self-organizing list
};

// Per-thread packet storage, dynamically allocated.
class ThreadSpecific {
  static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);

 public:
  // "name" is used to sanity-check offsets fit in kOffsetBits.
  explicit ThreadSpecific(const char* name)
      : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
        packets_(AllocateAligned<Packet>(max_packets_)),
        num_packets_(0),
        string_origin_(StringOrigin()) {
    // Even in optimized builds, verify that this zone's name offset fits
    // within the allotted space. If not, UpdateOrAdd is likely to overrun
    // zones_[]. Checking here on the cold path (only reached once per thread)
    // is cheap, but it only covers one zone.
    const size_t biased_offset = name - string_origin_;
    HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
  }

  // Depends on Zone => defined below.
  void ComputeOverhead();

  void WriteEntry(const char* name, const uint64_t timestamp) {
    const size_t biased_offset = name - string_origin_;
    Write(Packet::Make(biased_offset, timestamp));
  }

  void WriteExit(const uint64_t timestamp) {
    const size_t biased_offset = Packet::kOffsetBias;
    Write(Packet::Make(biased_offset, timestamp));
  }

  void AnalyzeRemainingPackets() {
    // Ensures prior weakly-ordered streaming stores are globally visible.
    FlushStream();

    // Storage full => empty it.
    if (num_packets_ + buffer_size_ > max_packets_) {
      results_.AnalyzePackets(packets_.get(), num_packets_);
      num_packets_ = 0;
    }
    CopyBytes(buffer_, packets_.get() + num_packets_,
              buffer_size_ * sizeof(Packet));
    num_packets_ += buffer_size_;

    results_.AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  Results& GetResults() { return results_; }

 private:
  // Write packet to buffer/storage, emptying them as needed.
  void Write(const Packet packet) {
    // Buffer full => copy to storage.
    if (buffer_size_ == kBufferCapacity) {
      // Storage full => empty it.
      if (num_packets_ + kBufferCapacity > max_packets_) {
        results_.AnalyzePackets(packets_.get(), num_packets_);
        num_packets_ = 0;
      }
      // This buffering halves observer overhead and decreases the overall
      // runtime by about 3%. Casting is safe because the first member is u64.
      StreamCacheLine(
          reinterpret_cast<const uint64_t*>(buffer_),
          reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
      num_packets_ += kBufferCapacity;
      buffer_size_ = 0;
    }
    buffer_[buffer_size_] = packet;
    ++buffer_size_;
  }

  // Write-combining buffer to avoid cache pollution. Must be the first
  // non-static member to ensure cache-line alignment.
  Packet buffer_[kBufferCapacity];
  size_t buffer_size_ = 0;

  const size_t max_packets_;
  // Contiguous storage for zone enter/exit packets.
  AlignedFreeUniquePtr<Packet[]> packets_;
  size_t num_packets_;
  // Cached here because we already read this cache line on zone entry/exit.
  const char* HWY_RESTRICT string_origin_;
  Results results_;
};
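
// Illustrative data flow (assuming HWY_ALIGNMENT is 64, so kBufferCapacity is
// 8 packets, i.e. one cache line): Write() appends to buffer_; once full, the
// whole line is streamed into packets_[] via StreamCacheLine without
// polluting the cache. When packets_[] itself would overflow
// (PROFILER_THREAD_STORAGE MiB), AnalyzePackets folds everything into
// results_ and the storage is reused.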

class ThreadList {
 public:
  // Called from any thread.
  ThreadSpecific* Add(const char* name) {
    const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
    HWY_DASSERT(index < kMaxThreads);

    ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
    threads_[index].store(ts, std::memory_order_release);
    return ts;
  }

  // Single-threaded.
  void PrintResults() {
    const auto acq = std::memory_order_acquire;
    const size_t num_threads = num_threads_.load(acq);

    ThreadSpecific* main = threads_[0].load(acq);
    main->AnalyzeRemainingPackets();

    for (size_t i = 1; i < num_threads; ++i) {
      ThreadSpecific* ts = threads_[i].load(acq);
      ts->AnalyzeRemainingPackets();
      main->GetResults().Assimilate(ts->GetResults());
    }

    if (num_threads != 0) {
      main->GetResults().Print();
    }
  }

 private:
  // Owning pointers.
  alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
  std::atomic<size_t> num_threads_{0};
};

// RAII zone enter/exit recorder constructed by the ZONE macro; also
// responsible for initializing ThreadSpecific.
class Zone {
 public:
  // "name" must be a string literal (see StringOrigin).
  HWY_NOINLINE explicit Zone(const char* name) {
    HWY_FENCE;
    ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
    if (HWY_UNLIKELY(thread_specific == nullptr)) {
      // Ensure the CPU supports our timer.
      char cpu[100];
      if (!platform::HaveTimerStop(cpu)) {
        HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
      }

      thread_specific = StaticThreadSpecific() = Threads().Add(name);
      // Must happen after setting StaticThreadSpecific, because
      // ComputeOverhead also calls Zone().
      thread_specific->ComputeOverhead();
    }

    // (Capture timestamp ASAP, not inside WriteEntry.)
    HWY_FENCE;
    const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
    thread_specific->WriteEntry(name, timestamp);
  }

  HWY_NOINLINE ~Zone() {
    HWY_FENCE;
    const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
    StaticThreadSpecific()->WriteExit(timestamp);
    HWY_FENCE;
  }

  // Call exactly once after all threads have exited all zones.
  static void PrintResults() { Threads().PrintResults(); }

 private:
  // Returns reference to the thread's ThreadSpecific pointer (initially null).
  // Function-local static avoids needing a separate definition.
  static ThreadSpecific*& StaticThreadSpecific() {
    static thread_local ThreadSpecific* thread_specific;
    return thread_specific;
  }

  // Returns the singleton ThreadList. Non time-critical.
  static ThreadList& Threads() {
    static ThreadList threads_;
    return threads_;
  }
};

// Creates a zone starting from here until the end of the current scope.
// Timestamps will be recorded when entering and exiting the zone.
// "name" must be a string literal, which is ensured by merging with "".
#define PROFILER_ZONE(name)      \
  HWY_FENCE;                     \
  const hwy::Zone zone("" name); \
  HWY_FENCE

// Creates a zone for an entire function (when placed at its beginning).
// Shorter/more convenient than ZONE.
#define PROFILER_FUNC             \
  HWY_FENCE;                      \
  const hwy::Zone zone(__func__); \
  HWY_FENCE

#define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults

inline void ThreadSpecific::ComputeOverhead() {
  namespace hn = HWY_NAMESPACE;
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution,
  // including 32, 34, 48, 52, 59, 62.
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        {
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_.ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
        HWY_DASSERT(num_packets_ == 0);
      }
      robust_statistics::CountingSort(durations, kNumDurations);
      samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
    }
    // Median.
    robust_statistics::CountingSort(samples, kNumSamples);
    self_overhead = samples[kNumSamples / 2];
    if (PROFILER_PRINT_OVERHEAD) {
      printf("Overhead: %zu\n", self_overhead);
    }
    results_.SetSelfOverhead(self_overhead);
  }

  // Delay before capturing start timestamp / after end timestamp.
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      // Analysis time should not be included => must fit within buffer.
      HWY_DASSERT(kReps * 2 < max_packets_);
      std::atomic_thread_fence(std::memory_order_seq_cst);
      const uint64_t t0 = hn::timer::Start();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      FlushStream();
      const uint64_t t1 = hn::timer::Stop();
      HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;
      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    robust_statistics::CountingSort(durations, kNumDurations);
    samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
  }
  robust_statistics::CountingSort(samples, kNumSamples);
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
  if (PROFILER_PRINT_OVERHEAD) {
    printf("Child overhead: %zu\n", child_overhead);
  }
  results_.SetChildOverhead(child_overhead);
}

#pragma pack(pop)

}  // namespace hwy

#endif  // PROFILER_ENABLED || HWY_IDE

#if !PROFILER_ENABLED && !HWY_IDE
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif

#endif  // HIGHWAY_HWY_PROFILER_H_