Grok 12.0.1
contrib/sort/shared-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Definitions shared between vqsort-inl and sorting_networks-inl.
17
18// Normal include guard for target-independent parts
19#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
20#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
21
22#include "hwy/base.h"
23
24namespace hwy {
25
26// Internal constants - these are to avoid magic numbers/literals and cannot be
27// changed without also changing the associated code.
29 // SortingNetwork reshapes its input into a matrix. This is the maximum number
30 // of *lanes* per vector. Must be at least 8 because SortSamples assumes the
31 // sorting network can handle 128 bytes with 8 rows, so 16 bytes per vector,
32 // which means 8 lanes for 16-bit types.
33#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
34 static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
35#else
36 static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
37#endif
38
39 // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
40 // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
41 // code size reasonable, and minimizing the extra logN factor for larger
42 // networks (for which only loose upper bounds on size are known).
43 static constexpr size_t kMaxRows = 16;
44
45 // Template argument ensures there is no actual division instruction.
46 template <size_t kLPK>
47 static constexpr HWY_INLINE size_t BaseCaseNumLanes(size_t N) {
48 // We use 8, 8x2, 8x4, and 16x{4..} networks, in units of keys. For N/kLPK
49 // < 4, we cannot use the 16-row networks.
50 return (((N / kLPK) >= 4) ? kMaxRows : 8) * HWY_MIN(N, kMaxCols);
51 }
52
53 // Unrolling is important (pipelining and amortizing branch mispredictions);
54 // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
55 // somewhat slower for sorting than 4x.
56 //
57 // To change, must also update left + 3 * N etc. in the loop.
58 static constexpr size_t kPartitionUnroll = 4;
59
60 // Chunk := group of keys loaded for sampling a pivot. Matches the typical
61 // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
62 // ensures vectors are no larger than that, so this can be independent of the
63 // vector size and thus constexpr.
64 static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
65 return 64 / sizeof_t;
66 }
67
68 template <typename T>
69 static constexpr HWY_INLINE size_t SampleLanes() {
70 return 2 * LanesPerChunk(sizeof(T)); // Stored samples
71 }
72
73 static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
74 // The main loop reads kPartitionUnroll vectors, and first loads from
75 // both left and right beforehand, so it requires 2 * kPartitionUnroll
76 // vectors. To handle amounts between that and BaseCaseNumLanes(), we
77 // partition up 3 * kPartitionUnroll + 1 vectors into a two-part buffer.
78 return 2 * (3 * kPartitionUnroll + 1) * N;
79 }
80
81 // Max across the three buffer usages.
82 template <typename T, size_t kLPK>
83 static constexpr HWY_INLINE size_t BufNum(size_t N) {
84 // BaseCase may write one padding vector, and SortSamples uses the space
85 // after samples as the buffer.
86 return HWY_MAX(SampleLanes<T>() + BaseCaseNumLanes<kLPK>(N) + N,
88 }
89
90 // Translates vector_size to lanes and returns size in bytes.
91 template <typename T, size_t kLPK>
92 static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
93 return BufNum<T, kLPK>(vector_size / sizeof(T)) * sizeof(T);
94 }
95
96 // Returns max for any type.
97 template <size_t kLPK>
98 static constexpr HWY_INLINE size_t MaxBufBytes(size_t vector_size) {
99 // If 2 lanes per key, it's a 128-bit key with u64 lanes.
100 return kLPK == 2 ? BufBytes<uint64_t, 2>(vector_size)
101 : HWY_MAX((BufBytes<uint16_t, 1>(vector_size)),
102 HWY_MAX((BufBytes<uint32_t, 1>(vector_size)),
103 (BufBytes<uint64_t, 1>(vector_size))));
104 }
105};
106
107static_assert(SortConstants::MaxBufBytes<1>(64) <= 1664, "Unexpectedly high");
108static_assert(SortConstants::MaxBufBytes<2>(64) <= 1664, "Unexpectedly high");
109
110} // namespace hwy
111
112#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
113
114// Per-target
115// clang-format off
116#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE) // NOLINT
117// clang-format on
118#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
119#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
120#else
121#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
122#endif
123
124#include "hwy/highway.h"
125
126// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
127// Armv7 debug, and Armv8 GCC 11 asan hits an internal compiler error likely
128// due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97696.
129#undef VQSORT_ENABLED
130#if (HWY_TARGET == HWY_SCALAR) || \
131 (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
132 (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) || \
133 (HWY_ARCH_ARM_A64 && HWY_COMPILER_GCC_ACTUAL && HWY_IS_ASAN)
134#define VQSORT_ENABLED 0
135#else
136#define VQSORT_ENABLED 1
137#endif
138
139namespace hwy {
140namespace HWY_NAMESPACE {
141
142// Default tag / vector width selector.
143#if HWY_TARGET == HWY_RVV
144// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
145template <typename T>
146using SortTag = ScalableTag<T, -1>;
147#else
148template <typename T>
149using SortTag = ScalableTag<T>;
150#endif
151
152// NOLINTNEXTLINE(google-readability-namespace-comments)
153} // namespace HWY_NAMESPACE
154} // namespace hwy
155
156#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
ScalableTag< T, -1 > SortTag
Definition contrib/sort/shared-inl.h:146
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
Definition abort.h:8
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition contrib/sort/shared-inl.h:28
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size)
Definition contrib/sort/shared-inl.h:92
static constexpr HWY_INLINE size_t BaseCaseNumLanes(size_t N)
Definition contrib/sort/shared-inl.h:47
static constexpr size_t kMaxCols
Definition contrib/sort/shared-inl.h:36
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N)
Definition contrib/sort/shared-inl.h:73
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t)
Definition contrib/sort/shared-inl.h:64
static constexpr HWY_INLINE size_t MaxBufBytes(size_t vector_size)
Definition contrib/sort/shared-inl.h:98
static constexpr size_t kMaxRows
Definition contrib/sort/shared-inl.h:43
static constexpr HWY_INLINE size_t BufNum(size_t N)
Definition contrib/sort/shared-inl.h:83
static constexpr HWY_INLINE size_t SampleLanes()
Definition contrib/sort/shared-inl.h:69
static constexpr size_t kPartitionUnroll
Definition contrib/sort/shared-inl.h:58