#ifndef HIGHWAY_HWY_PROFILER_H_
#define HIGHWAY_HWY_PROFILER_H_
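// Zone-based profiler: PROFILER_ZONE / PROFILER_FUNC record entry and exit
// timestamps into per-thread buffers; the packets are analyzed and per-zone
// totals printed later by PROFILER_PRINT_RESULTS.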
#ifndef PROFILER_ENABLED
#define PROFILER_ENABLED 0
#endif
// MiB of packet storage to allocate per thread that enters at least one zone.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 200ULL
#endif
#if PROFILER_ENABLED || HWY_IDE

#define PROFILER_PRINT_OVERHEAD 0
// Upper bounds for fixed-size data structures (checked via HWY_DASSERT):
static constexpr size_t kMaxThreads = 256;
static constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
static constexpr size_t kMaxZones = 256;  // Number of unique zone names.
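// The following loop copies one aligned cache line of packets in vector-sized
// steps, using non-temporal Stream stores so that flushing the per-thread
// write buffer does not evict useful data from the cache; it is part of the
// streaming-copy helper invoked from ThreadSpecific::Write. `d` is presumably
// a ScalableTag<uint64_t>.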
  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += Lanes(d)) {
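// A Packet records one zone entry or exit: the low kTimestampBits hold the
// (wrapping) timestamp, the high kOffsetBits hold the biased offset of the
// zone name; exit packets store kOffsetBias, which no real name maps to.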
struct Packet {
  static constexpr size_t kOffsetBits = 25;
  static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);

  static constexpr size_t kTimestampBits = 64 - kOffsetBits;
  static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;

  static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
    HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));
    Packet packet;
    packet.bits_ =
        (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
    return packet;
  }

  uint64_t Timestamp() const { return bits_ & kTimestampMask; }

  size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }

  uint64_t bits_;
};
static_assert(sizeof(Packet) == 8, "Wrong Packet size");
inline const char* StringOrigin() {
  // Dummy literal; real zone names must not be merged with it by the compiler.
  static const char* string_origin = "__#__";
  return string_origin - Packet::kOffsetBias;
}
// One active (entered but not yet exited) zone, stored in a stack.
struct Node {
  Packet packet;
  uint64_t child_total;  // Accumulated duration of completed child zones.
};
static_assert(sizeof(Node) == 16, "Wrong Node size");
struct Accumulator {
  static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;

  uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
  uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
  uint64_t Duration() const { return u128.hi; }

  void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
    u128.lo = (biased_offset << kNumCallBits) + num_calls;
    u128.hi = duration;
  }

  void Add(uint64_t num_calls, uint64_t duration) {
    u128.lo += num_calls;
    u128.hi += duration;
  }

  uint128_t u128;
};
static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");
// Returns 0 instead of wrapping around when the result would be negative.
template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}
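// Per-thread analysis state: turns the packet stream into per-zone call counts
// and self-durations (child time and measurement overhead deducted), merges
// results from other threads, and prints the final table.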
class Results {
 public:
  Results() { ZeroBytes(zones_, sizeof(zones_)); }

  // Used only while computing overhead; leaves no trace in the output.
  uint64_t ZoneDuration(const Packet* packets) {
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].Duration();
    zones_[0].Set(0, 0, 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }
  // Computes durations for all completed zones in "packets", which can be
  // discarded afterwards; called whenever a thread's storage is full.
  void AnalyzePackets(const Packet* packets, const size_t num_packets) {
    const uint64_t t0 = hn::timer::Start();

    for (size_t i = 0; i < num_packets; ++i) {
      const Packet p = packets[i];
      // Entering a zone: push onto the stack of active zones.
      if (p.BiasedOffset() != Packet::kOffsetBias) {
        HWY_DASSERT(depth_ < kMaxDepth);
        nodes_[depth_].packet = p;
        nodes_[depth_].child_total = 0;
        ++depth_;
        continue;
      }
      // Exiting the most recently entered zone.
      HWY_DASSERT(depth_ != 0);
      const Node& node = nodes_[depth_ - 1];
      // Masking handles timestamp wraparound.
      const uint64_t duration =
          (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + node.child_total);
      UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
      --depth_;
      // Deduct this zone's time from its parent's self-duration.
      if (depth_ != 0) {
        nodes_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0;
  }
  // Merges another thread's results into this one. Call after all threads
  // have exited all zones.
  void Assimilate(const Results& other) {
    const uint64_t t0 = hn::timer::Start();

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const Accumulator& zone = other.zones_[i];
      UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
    }
    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }
  // Single-threaded; prints one line per zone, sorted by decreasing duration.
  void Print() {
    const uint64_t t0 = hn::timer::Start();
    MergeDuplicates();

    std::sort(zones_, zones_ + num_zones_,
              [](const Accumulator& r1, const Accumulator& r2) {
                return r1.Duration() > r2.Duration();
              });

    const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();

    const char* string_origin = StringOrigin();
    for (size_t i = 0; i < num_zones_; ++i) {
      const Accumulator& r = zones_[i];
      const uint64_t num_calls = r.NumCalls();
      // Name: calls x average ticks = total seconds.
      printf("%-40s: %10zu x %15zu = %9.6f\n", string_origin + r.BiasedOffset(),
             num_calls, r.Duration() / num_calls,
             static_cast<double>(r.Duration()) * inv_freq);
    }

    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0;
    printf("Total analysis [s]: %f\n",
           static_cast<double>(analyze_elapsed_) * inv_freq);
  }
 private:
  // Updates the Accumulator identified by biased_offset, or adds a new one.
  // Uses a self-organizing list: frequently updated zones migrate toward the
  // front, which is cheaper than a hash map for a few dozen zones.
  void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
                   const uint64_t duration) {
    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));

    // Special case for the front zone: update without swapping.
    if (zones_[0].BiasedOffset() == biased_offset) {
      zones_[0].Add(num_calls, duration);
      HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
      return;
    }

    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].BiasedOffset() == biased_offset) {
        zones_[i].Add(num_calls, duration);
        HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
        // Swap with predecessor (conservative move-to-front).
        const Accumulator prev = zones_[i - 1];
        zones_[i - 1] = zones_[i];
        zones_[i] = prev;
        return;
      }
    }

    // Not found; add a new Accumulator.
    HWY_DASSERT(num_zones_ < kMaxZones);
    Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
    zone->Set(biased_offset, num_calls, duration);
    HWY_DASSERT(zone->BiasedOffset() == biased_offset);
    ++num_zones_;
  }
  // Function templates get their own __func__ literal per instantiation, so
  // the same name can appear in several zones; merge them. N^2 search is fine
  // because we only expect a few dozen zones.
  void MergeDuplicates() {
    const char* string_origin = StringOrigin();
    for (size_t i = 0; i < num_zones_; ++i) {
      const size_t biased_offset = zones_[i].BiasedOffset();
      const char* name = string_origin + biased_offset;
      uint64_t num_calls = zones_[i].NumCalls();
      // Add subsequent duplicates to num_calls and the total duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
          num_calls += zones_[j].NumCalls();
          zones_[i].Add(0, zones_[j].Duration());
          zones_[j] = zones_[--num_zones_];  // Fill the hole with the last.
        } else {
          ++j;
        }
      }
      HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));
      zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones (stack height).
  size_t num_zones_ = 0;  // Number of retired zones.

  Node nodes_[kMaxDepth];         // Stack of active zones.
  Accumulator zones_[kMaxZones];  // Self-organizing list.
};
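// Per-thread storage: packets are first written to a small cache-line-sized
// buffer, then streamed into a large aligned allocation; when that fills up,
// the packets are analyzed in place and the storage is reused.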
class ThreadSpecific {
  static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);

 public:
  // "name" is one zone's name, used to verify that offsets fit in kOffsetBits.
  explicit ThreadSpecific(const char* name)
      : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
        packets_(AllocateAligned<Packet>(max_packets_)),
        num_packets_(0),
        string_origin_(StringOrigin()) {
    const size_t biased_offset = name - string_origin_;
    HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
  }

  // Estimates timer and zone overheads; called once per thread.
  void ComputeOverhead();
  void WriteEntry(const char* name, const uint64_t timestamp) {
    const size_t biased_offset = name - string_origin_;
    Write(Packet::Make(biased_offset, timestamp));
  }

  void WriteExit(const uint64_t timestamp) {
    const size_t biased_offset = Packet::kOffsetBias;
    Write(Packet::Make(biased_offset, timestamp));
  }
  // Analyzes all packets accumulated so far (including the partial buffer).
  void AnalyzeRemainingPackets() {
    // Storage full => empty it first.
    if (num_packets_ + buffer_size_ > max_packets_) {
      results_.AnalyzePackets(packets_.get(), num_packets_);
      num_packets_ = 0;
    }
    // Move the partial buffer into storage, then analyze everything.
    CopyBytes(buffer_, packets_.get() + num_packets_,
              buffer_size_ * sizeof(Packet));
    num_packets_ += buffer_size_;

    results_.AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  Results& GetResults() { return results_; }
 private:
  void Write(const Packet packet) {
    // Buffer full => copy it to storage.
    if (buffer_size_ == kBufferCapacity) {
      // Storage full => analyze the packets and reuse the storage.
      if (num_packets_ + kBufferCapacity > max_packets_) {
        results_.AnalyzePackets(packets_.get(), num_packets_);
        num_packets_ = 0;
      }
      // Stream the buffer to storage without polluting the cache.
      hn::StreamCacheLine(
          reinterpret_cast<const uint64_t*>(buffer_),
          reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
      num_packets_ += kBufferCapacity;
      buffer_size_ = 0;
    }
    buffer_[buffer_size_] = packet;
    ++buffer_size_;
  }

  Packet buffer_[kBufferCapacity];  // Write-combining buffer (one cache line).
  size_t buffer_size_ = 0;

  const size_t max_packets_;
  AlignedFreeUniquePtr<Packet[]> packets_;  // Contiguous storage for analysis.
  size_t num_packets_;
  const char* string_origin_;
  Results results_;
};
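// Registry of all threads' ThreadSpecific data, so the main thread can
// assimilate and print every thread's results.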
class ThreadList {
 public:
  // Thread-safe; called when a thread first enters a zone.
  ThreadSpecific* Add(const char* name) {
    const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
    HWY_DASSERT(index < kMaxThreads);

    ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
    threads_[index].store(ts, std::memory_order_release);
    return ts;
  }

  // Single-threaded.
  void PrintResults() {
    const auto acq = std::memory_order_acquire;
    const size_t num_threads = num_threads_.load(acq);

    ThreadSpecific* main = threads_[0].load(acq);
    main->AnalyzeRemainingPackets();

    for (size_t i = 1; i < num_threads; ++i) {
      ThreadSpecific* ts = threads_[i].load(acq);
      ts->AnalyzeRemainingPackets();
      main->GetResults().Assimilate(ts->GetResults());
    }

    if (num_threads != 0) {
      main->GetResults().Print();
    }
  }

 private:
  alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];  // Owning.
  std::atomic<size_t> num_threads_{0};
};
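// RAII zone: the constructor records an entry packet, the destructor an exit
// packet, so a zone covers the enclosing scope. The first zone on each thread
// also allocates its storage and measures timer overhead.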
class Zone {
 public:
  HWY_NOINLINE explicit Zone(const char* name) {
    ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
    if (HWY_UNLIKELY(thread_specific == nullptr)) {
      char cpu[100];
      if (!platform::HaveTimerStop(cpu)) {
        HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
      }
      thread_specific = StaticThreadSpecific() = Threads().Add(name);
      thread_specific->ComputeOverhead();
    }
    const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
    thread_specific->WriteEntry(name, timestamp);
  }

  HWY_NOINLINE ~Zone() {
    const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
    StaticThreadSpecific()->WriteExit(timestamp);
  }

  // Call exactly once after all threads have exited all zones.
  static void PrintResults() { Threads().PrintResults(); }

 private:
  static ThreadSpecific*& StaticThreadSpecific() {
    static thread_local ThreadSpecific* thread_specific;
    return thread_specific;
  }
  static ThreadList& Threads() {
    static ThreadList threads_;
    return threads_;
  }
};
#define PROFILER_ZONE(name)       \
  HWY_FENCE;                      \
  const hwy::Zone zone("" name);  \
  HWY_FENCE

#define PROFILER_FUNC             \
  HWY_FENCE;                      \
  const hwy::Zone zone(__func__); \
  HWY_FENCE
#define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults
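// Usage sketch (not part of this header; identifiers below are illustrative).
// Build with -DPROFILER_ENABLED=1; zone names must be string literals, which
// PROFILER_ZONE enforces by concatenating with "".
//
//   #include "hwy/profiler.h"
//
//   void LoadPixels() {
//     PROFILER_FUNC;  // zone named after __func__, ends when it returns
//     // ... work ...
//   }
//
//   int main() {
//     {
//       PROFILER_ZONE("Startup");  // child time is deducted from this zone
//       LoadPixels();
//     }
//     PROFILER_PRINT_RESULTS();  // single-threaded, after all zones exited
//     return 0;
//   }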
// Estimates the overhead of capturing timestamps and writing packets, so that
// it can be deducted from zone durations during analysis.
inline void ThreadSpecific::ComputeOverhead() {
  // Per-zone overhead: timestamp/write cost within a single zone.
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        // Measure an empty zone; its name is never shown.
        {
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_.ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
      }
      samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
    }
    // Median of the per-sample modes.
    robust_statistics::CountingSort(samples, kNumSamples);
    self_overhead = samples[kNumSamples / 2];
    if (PROFILER_PRINT_OVERHEAD) {
      printf("Overhead: %zu\n", self_overhead);
    }
    results_.SetSelfOverhead(self_overhead);
  }

  // Per-child overhead: cost of entering/exiting a zone, as seen by its parent.
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      std::atomic_thread_fence(std::memory_order_seq_cst);
      const uint64_t t0 = hn::timer::Start();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      FlushStream();
      const uint64_t t1 = hn::timer::Stop();
      HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;

      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
  }
  robust_statistics::CountingSort(samples, kNumSamples);
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
  if (PROFILER_PRINT_OVERHEAD) {
    printf("Child overhead: %zu\n", child_overhead);
  }
  results_.SetChildOverhead(child_overhead);
}

#endif  // PROFILER_ENABLED || HWY_IDE
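// When the profiler is disabled, the macros expand to nothing so that
// instrumented code compiles unchanged and with zero overhead.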
#if !PROFILER_ENABLED && !HWY_IDE
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif
#endif  // HIGHWAY_HWY_PROFILER_H_