Grok 12.0.1
sorting_networks-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target
17#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
20#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
21#else
22#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
23#endif
24
25#include "hwy/contrib/sort/shared-inl.h" // SortConstants
26#include "hwy/highway.h"
27
29namespace hwy {
30namespace HWY_NAMESPACE {
31namespace detail {
32
33#if VQSORT_ENABLED
34
36
37// ------------------------------ SharedTraits
38
39// Code shared between all traits. It's unclear whether these can profitably be
40// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
41// Compare/DupOdd.
42template <class Base>
43struct SharedTraits : public Base {
44 using SharedTraitsForSortingNetwork =
46
47 // Conditionally swaps lane 0 with 2, 1 with 3 etc.
48 template <class D>
49 HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
50 const Base* base = static_cast<const Base*>(this);
51 Vec<D> swapped = base->SwapAdjacentPairs(d, v);
52 base->Sort2(d, v, swapped);
53 return base->OddEvenPairs(d, swapped, v);
54 }
55
56 // Swaps with the vector formed by reversing contiguous groups of 8 keys.
57 template <class D>
58 HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
59 const Base* base = static_cast<const Base*>(this);
60 Vec<D> swapped = base->ReverseKeys8(d, v);
61 base->Sort2(d, v, swapped);
62 return base->OddEvenQuads(d, swapped, v);
63 }
64
65 // Swaps with the vector formed by reversing contiguous groups of 8 keys.
66 template <class D>
67 HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
68 const Base* base = static_cast<const Base*>(this);
69 static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
70 Vec<D> swapped = base->ReverseKeys(d, v);
71 base->Sort2(d, v, swapped);
72 return ConcatUpperLower(d, swapped, v); // 8 = half of the vector
73 }
74};
75
76// ------------------------------ Sorting network
77
78// Sorting networks for independent columns in 2, 4 and 8 vectors from
79// https://bertdobbelaere.github.io/sorting_networks.html.
80
81template <class D, class Traits, class V = Vec<D>>
82HWY_INLINE void Sort2(D d, Traits st, V& v0, V& v1) {
83 st.Sort2(d, v0, v1);
84}
85
86template <class D, class Traits, class V = Vec<D>>
87HWY_INLINE void Sort4(D d, Traits st, V& v0, V& v1, V& v2, V& v3) {
88 st.Sort2(d, v0, v2);
89 st.Sort2(d, v1, v3);
90 st.Sort2(d, v0, v1);
91 st.Sort2(d, v2, v3);
92 st.Sort2(d, v1, v2);
93}
94
95template <class D, class Traits, class V = Vec<D>>
96HWY_INLINE void Sort8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
97 V& v6, V& v7) {
98 st.Sort2(d, v0, v2);
99 st.Sort2(d, v1, v3);
100 st.Sort2(d, v4, v6);
101 st.Sort2(d, v5, v7);
102
103 st.Sort2(d, v0, v4);
104 st.Sort2(d, v1, v5);
105 st.Sort2(d, v2, v6);
106 st.Sort2(d, v3, v7);
107
108 st.Sort2(d, v0, v1);
109 st.Sort2(d, v2, v3);
110 st.Sort2(d, v4, v5);
111 st.Sort2(d, v6, v7);
112
113 st.Sort2(d, v2, v4);
114 st.Sort2(d, v3, v5);
115
116 st.Sort2(d, v1, v4);
117 st.Sort2(d, v3, v6);
118
119 st.Sort2(d, v1, v2);
120 st.Sort2(d, v3, v4);
121 st.Sort2(d, v5, v6);
122}
123
124// (Green's irregular) sorting network for independent columns in 16 vectors.
125template <class D, class Traits, class V = Vec<D>>
126HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
127 V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
128 V& ve, V& vf) {
129 st.Sort2(d, v0, v1);
130 st.Sort2(d, v2, v3);
131 st.Sort2(d, v4, v5);
132 st.Sort2(d, v6, v7);
133 st.Sort2(d, v8, v9);
134 st.Sort2(d, va, vb);
135 st.Sort2(d, vc, vd);
136 st.Sort2(d, ve, vf);
137 st.Sort2(d, v0, v2);
138 st.Sort2(d, v1, v3);
139 st.Sort2(d, v4, v6);
140 st.Sort2(d, v5, v7);
141 st.Sort2(d, v8, va);
142 st.Sort2(d, v9, vb);
143 st.Sort2(d, vc, ve);
144 st.Sort2(d, vd, vf);
145 st.Sort2(d, v0, v4);
146 st.Sort2(d, v1, v5);
147 st.Sort2(d, v2, v6);
148 st.Sort2(d, v3, v7);
149 st.Sort2(d, v8, vc);
150 st.Sort2(d, v9, vd);
151 st.Sort2(d, va, ve);
152 st.Sort2(d, vb, vf);
153 st.Sort2(d, v0, v8);
154 st.Sort2(d, v1, v9);
155 st.Sort2(d, v2, va);
156 st.Sort2(d, v3, vb);
157 st.Sort2(d, v4, vc);
158 st.Sort2(d, v5, vd);
159 st.Sort2(d, v6, ve);
160 st.Sort2(d, v7, vf);
161 st.Sort2(d, v5, va);
162 st.Sort2(d, v6, v9);
163 st.Sort2(d, v3, vc);
164 st.Sort2(d, v7, vb);
165 st.Sort2(d, vd, ve);
166 st.Sort2(d, v4, v8);
167 st.Sort2(d, v1, v2);
168 st.Sort2(d, v1, v4);
169 st.Sort2(d, v7, vd);
170 st.Sort2(d, v2, v8);
171 st.Sort2(d, vb, ve);
172 st.Sort2(d, v2, v4);
173 st.Sort2(d, v5, v6);
174 st.Sort2(d, v9, va);
175 st.Sort2(d, vb, vd);
176 st.Sort2(d, v3, v8);
177 st.Sort2(d, v7, vc);
178 st.Sort2(d, v3, v5);
179 st.Sort2(d, v6, v8);
180 st.Sort2(d, v7, v9);
181 st.Sort2(d, va, vc);
182 st.Sort2(d, v3, v4);
183 st.Sort2(d, v5, v6);
184 st.Sort2(d, v7, v8);
185 st.Sort2(d, v9, va);
186 st.Sort2(d, vb, vc);
187 st.Sort2(d, v6, v7);
188 st.Sort2(d, v8, v9);
189}
190
191// ------------------------------ Merging networks
192
193// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
194// For acceptable performance, these must be inlined, otherwise vectors are
195// loaded from the stack. The kKeysPerVector allows calling from generic code
196// but skipping the functions when vectors have too few lanes for
197// st.SortPairsDistance1 to compile. `if constexpr` in the caller would also
198// work, but is not available in C++11. We write out the (unused) argument types
199// rather than `...` because GCC 9 (but not 10) fails to compile with `...`.
200
201template <size_t kKeysPerVector, class D, class Traits, class V,
202 HWY_IF_LANES_LE(kKeysPerVector, 1)>
203HWY_INLINE void Merge8x2(D, Traits, V, V, V, V, V, V, V, V) {}
204template <size_t kKeysPerVector, class D, class Traits, class V,
205 HWY_IF_LANES_LE(kKeysPerVector, 2)>
206HWY_INLINE void Merge8x4(D, Traits, V, V, V, V, V, V, V, V) {}
207
208template <size_t kKeysPerVector, class D, class Traits, class V,
209 HWY_IF_LANES_LE(kKeysPerVector, 1)>
210HWY_INLINE void Merge16x2(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
211 V, V) {}
212template <size_t kKeysPerVector, class D, class Traits, class V,
213 HWY_IF_LANES_LE(kKeysPerVector, 2)>
214HWY_INLINE void Merge16x4(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
215 V, V) {}
216template <size_t kKeysPerVector, class D, class Traits, class V,
217 HWY_IF_LANES_LE(kKeysPerVector, 4)>
218HWY_INLINE void Merge16x8(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
219 V, V) {}
220template <size_t kKeysPerVector, class D, class Traits, class V,
221 HWY_IF_LANES_LE(kKeysPerVector, 8)>
222HWY_INLINE void Merge16x16(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
223 V, V) {}
224
225template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
226 HWY_IF_LANES_GT(kKeysPerVector, 1)>
227HWY_INLINE void Merge8x2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
228 V& v5, V& v6, V& v7) {
229 v7 = st.ReverseKeys2(d, v7);
230 v6 = st.ReverseKeys2(d, v6);
231 v5 = st.ReverseKeys2(d, v5);
232 v4 = st.ReverseKeys2(d, v4);
233 st.Sort2(d, v0, v7);
234 st.Sort2(d, v1, v6);
235 st.Sort2(d, v2, v5);
236 st.Sort2(d, v3, v4);
237
238 v3 = st.ReverseKeys2(d, v3);
239 v2 = st.ReverseKeys2(d, v2);
240 v7 = st.ReverseKeys2(d, v7);
241 v6 = st.ReverseKeys2(d, v6);
242 st.Sort2(d, v0, v3);
243 st.Sort2(d, v1, v2);
244 st.Sort2(d, v4, v7);
245 st.Sort2(d, v5, v6);
246
247 v1 = st.ReverseKeys2(d, v1);
248 v3 = st.ReverseKeys2(d, v3);
249 v5 = st.ReverseKeys2(d, v5);
250 v7 = st.ReverseKeys2(d, v7);
251 st.Sort2(d, v0, v1);
252 st.Sort2(d, v2, v3);
253 st.Sort2(d, v4, v5);
254 st.Sort2(d, v6, v7);
255
256 v0 = st.SortPairsDistance1(d, v0);
257 v1 = st.SortPairsDistance1(d, v1);
258 v2 = st.SortPairsDistance1(d, v2);
259 v3 = st.SortPairsDistance1(d, v3);
260 v4 = st.SortPairsDistance1(d, v4);
261 v5 = st.SortPairsDistance1(d, v5);
262 v6 = st.SortPairsDistance1(d, v6);
263 v7 = st.SortPairsDistance1(d, v7);
264}
265
266template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
267 HWY_IF_LANES_GT(kKeysPerVector, 2)>
268HWY_INLINE void Merge8x4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
269 V& v5, V& v6, V& v7) {
270 v7 = st.ReverseKeys4(d, v7);
271 v6 = st.ReverseKeys4(d, v6);
272 v5 = st.ReverseKeys4(d, v5);
273 v4 = st.ReverseKeys4(d, v4);
274 st.Sort2(d, v0, v7);
275 st.Sort2(d, v1, v6);
276 st.Sort2(d, v2, v5);
277 st.Sort2(d, v3, v4);
278
279 v3 = st.ReverseKeys4(d, v3);
280 v2 = st.ReverseKeys4(d, v2);
281 v7 = st.ReverseKeys4(d, v7);
282 v6 = st.ReverseKeys4(d, v6);
283 st.Sort2(d, v0, v3);
284 st.Sort2(d, v1, v2);
285 st.Sort2(d, v4, v7);
286 st.Sort2(d, v5, v6);
287
288 v1 = st.ReverseKeys4(d, v1);
289 v3 = st.ReverseKeys4(d, v3);
290 v5 = st.ReverseKeys4(d, v5);
291 v7 = st.ReverseKeys4(d, v7);
292 st.Sort2(d, v0, v1);
293 st.Sort2(d, v2, v3);
294 st.Sort2(d, v4, v5);
295 st.Sort2(d, v6, v7);
296
297 v0 = st.SortPairsReverse4(d, v0);
298 v1 = st.SortPairsReverse4(d, v1);
299 v2 = st.SortPairsReverse4(d, v2);
300 v3 = st.SortPairsReverse4(d, v3);
301 v4 = st.SortPairsReverse4(d, v4);
302 v5 = st.SortPairsReverse4(d, v5);
303 v6 = st.SortPairsReverse4(d, v6);
304 v7 = st.SortPairsReverse4(d, v7);
305
306 v0 = st.SortPairsDistance1(d, v0);
307 v1 = st.SortPairsDistance1(d, v1);
308 v2 = st.SortPairsDistance1(d, v2);
309 v3 = st.SortPairsDistance1(d, v3);
310 v4 = st.SortPairsDistance1(d, v4);
311 v5 = st.SortPairsDistance1(d, v5);
312 v6 = st.SortPairsDistance1(d, v6);
313 v7 = st.SortPairsDistance1(d, v7);
314}
315
316// Only used by the now-deprecated SortingNetwork().
317template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
318 HWY_IF_LANES_GT(kKeysPerVector, 1)>
319HWY_INLINE void Merge16x2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
320 V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
321 V& vc, V& vd, V& ve, V& vf) {
322 vf = st.ReverseKeys2(d, vf);
323 ve = st.ReverseKeys2(d, ve);
324 vd = st.ReverseKeys2(d, vd);
325 vc = st.ReverseKeys2(d, vc);
326 vb = st.ReverseKeys2(d, vb);
327 va = st.ReverseKeys2(d, va);
328 v9 = st.ReverseKeys2(d, v9);
329 v8 = st.ReverseKeys2(d, v8);
330 st.Sort2(d, v0, vf);
331 st.Sort2(d, v1, ve);
332 st.Sort2(d, v2, vd);
333 st.Sort2(d, v3, vc);
334 st.Sort2(d, v4, vb);
335 st.Sort2(d, v5, va);
336 st.Sort2(d, v6, v9);
337 st.Sort2(d, v7, v8);
338
339 v7 = st.ReverseKeys2(d, v7);
340 v6 = st.ReverseKeys2(d, v6);
341 v5 = st.ReverseKeys2(d, v5);
342 v4 = st.ReverseKeys2(d, v4);
343 vf = st.ReverseKeys2(d, vf);
344 ve = st.ReverseKeys2(d, ve);
345 vd = st.ReverseKeys2(d, vd);
346 vc = st.ReverseKeys2(d, vc);
347 st.Sort2(d, v0, v7);
348 st.Sort2(d, v1, v6);
349 st.Sort2(d, v2, v5);
350 st.Sort2(d, v3, v4);
351 st.Sort2(d, v8, vf);
352 st.Sort2(d, v9, ve);
353 st.Sort2(d, va, vd);
354 st.Sort2(d, vb, vc);
355
356 v3 = st.ReverseKeys2(d, v3);
357 v2 = st.ReverseKeys2(d, v2);
358 v7 = st.ReverseKeys2(d, v7);
359 v6 = st.ReverseKeys2(d, v6);
360 vb = st.ReverseKeys2(d, vb);
361 va = st.ReverseKeys2(d, va);
362 vf = st.ReverseKeys2(d, vf);
363 ve = st.ReverseKeys2(d, ve);
364 st.Sort2(d, v0, v3);
365 st.Sort2(d, v1, v2);
366 st.Sort2(d, v4, v7);
367 st.Sort2(d, v5, v6);
368 st.Sort2(d, v8, vb);
369 st.Sort2(d, v9, va);
370 st.Sort2(d, vc, vf);
371 st.Sort2(d, vd, ve);
372
373 v1 = st.ReverseKeys2(d, v1);
374 v3 = st.ReverseKeys2(d, v3);
375 v5 = st.ReverseKeys2(d, v5);
376 v7 = st.ReverseKeys2(d, v7);
377 v9 = st.ReverseKeys2(d, v9);
378 vb = st.ReverseKeys2(d, vb);
379 vd = st.ReverseKeys2(d, vd);
380 vf = st.ReverseKeys2(d, vf);
381 st.Sort2(d, v0, v1);
382 st.Sort2(d, v2, v3);
383 st.Sort2(d, v4, v5);
384 st.Sort2(d, v6, v7);
385 st.Sort2(d, v8, v9);
386 st.Sort2(d, va, vb);
387 st.Sort2(d, vc, vd);
388 st.Sort2(d, ve, vf);
389
390 v0 = st.SortPairsDistance1(d, v0);
391 v1 = st.SortPairsDistance1(d, v1);
392 v2 = st.SortPairsDistance1(d, v2);
393 v3 = st.SortPairsDistance1(d, v3);
394 v4 = st.SortPairsDistance1(d, v4);
395 v5 = st.SortPairsDistance1(d, v5);
396 v6 = st.SortPairsDistance1(d, v6);
397 v7 = st.SortPairsDistance1(d, v7);
398 v8 = st.SortPairsDistance1(d, v8);
399 v9 = st.SortPairsDistance1(d, v9);
400 va = st.SortPairsDistance1(d, va);
401 vb = st.SortPairsDistance1(d, vb);
402 vc = st.SortPairsDistance1(d, vc);
403 vd = st.SortPairsDistance1(d, vd);
404 ve = st.SortPairsDistance1(d, ve);
405 vf = st.SortPairsDistance1(d, vf);
406}
407
408template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
409 HWY_IF_LANES_GT(kKeysPerVector, 2)>
410HWY_INLINE void Merge16x4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
411 V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
412 V& vc, V& vd, V& ve, V& vf) {
413 vf = st.ReverseKeys4(d, vf);
414 ve = st.ReverseKeys4(d, ve);
415 vd = st.ReverseKeys4(d, vd);
416 vc = st.ReverseKeys4(d, vc);
417 vb = st.ReverseKeys4(d, vb);
418 va = st.ReverseKeys4(d, va);
419 v9 = st.ReverseKeys4(d, v9);
420 v8 = st.ReverseKeys4(d, v8);
421 st.Sort2(d, v0, vf);
422 st.Sort2(d, v1, ve);
423 st.Sort2(d, v2, vd);
424 st.Sort2(d, v3, vc);
425 st.Sort2(d, v4, vb);
426 st.Sort2(d, v5, va);
427 st.Sort2(d, v6, v9);
428 st.Sort2(d, v7, v8);
429
430 v7 = st.ReverseKeys4(d, v7);
431 v6 = st.ReverseKeys4(d, v6);
432 v5 = st.ReverseKeys4(d, v5);
433 v4 = st.ReverseKeys4(d, v4);
434 vf = st.ReverseKeys4(d, vf);
435 ve = st.ReverseKeys4(d, ve);
436 vd = st.ReverseKeys4(d, vd);
437 vc = st.ReverseKeys4(d, vc);
438 st.Sort2(d, v0, v7);
439 st.Sort2(d, v1, v6);
440 st.Sort2(d, v2, v5);
441 st.Sort2(d, v3, v4);
442 st.Sort2(d, v8, vf);
443 st.Sort2(d, v9, ve);
444 st.Sort2(d, va, vd);
445 st.Sort2(d, vb, vc);
446
447 v3 = st.ReverseKeys4(d, v3);
448 v2 = st.ReverseKeys4(d, v2);
449 v7 = st.ReverseKeys4(d, v7);
450 v6 = st.ReverseKeys4(d, v6);
451 vb = st.ReverseKeys4(d, vb);
452 va = st.ReverseKeys4(d, va);
453 vf = st.ReverseKeys4(d, vf);
454 ve = st.ReverseKeys4(d, ve);
455 st.Sort2(d, v0, v3);
456 st.Sort2(d, v1, v2);
457 st.Sort2(d, v4, v7);
458 st.Sort2(d, v5, v6);
459 st.Sort2(d, v8, vb);
460 st.Sort2(d, v9, va);
461 st.Sort2(d, vc, vf);
462 st.Sort2(d, vd, ve);
463
464 v1 = st.ReverseKeys4(d, v1);
465 v3 = st.ReverseKeys4(d, v3);
466 v5 = st.ReverseKeys4(d, v5);
467 v7 = st.ReverseKeys4(d, v7);
468 v9 = st.ReverseKeys4(d, v9);
469 vb = st.ReverseKeys4(d, vb);
470 vd = st.ReverseKeys4(d, vd);
471 vf = st.ReverseKeys4(d, vf);
472 st.Sort2(d, v0, v1);
473 st.Sort2(d, v2, v3);
474 st.Sort2(d, v4, v5);
475 st.Sort2(d, v6, v7);
476 st.Sort2(d, v8, v9);
477 st.Sort2(d, va, vb);
478 st.Sort2(d, vc, vd);
479 st.Sort2(d, ve, vf);
480
481 v0 = st.SortPairsReverse4(d, v0);
482 v1 = st.SortPairsReverse4(d, v1);
483 v2 = st.SortPairsReverse4(d, v2);
484 v3 = st.SortPairsReverse4(d, v3);
485 v4 = st.SortPairsReverse4(d, v4);
486 v5 = st.SortPairsReverse4(d, v5);
487 v6 = st.SortPairsReverse4(d, v6);
488 v7 = st.SortPairsReverse4(d, v7);
489 v8 = st.SortPairsReverse4(d, v8);
490 v9 = st.SortPairsReverse4(d, v9);
491 va = st.SortPairsReverse4(d, va);
492 vb = st.SortPairsReverse4(d, vb);
493 vc = st.SortPairsReverse4(d, vc);
494 vd = st.SortPairsReverse4(d, vd);
495 ve = st.SortPairsReverse4(d, ve);
496 vf = st.SortPairsReverse4(d, vf);
497
498 v0 = st.SortPairsDistance1(d, v0);
499 v1 = st.SortPairsDistance1(d, v1);
500 v2 = st.SortPairsDistance1(d, v2);
501 v3 = st.SortPairsDistance1(d, v3);
502 v4 = st.SortPairsDistance1(d, v4);
503 v5 = st.SortPairsDistance1(d, v5);
504 v6 = st.SortPairsDistance1(d, v6);
505 v7 = st.SortPairsDistance1(d, v7);
506 v8 = st.SortPairsDistance1(d, v8);
507 v9 = st.SortPairsDistance1(d, v9);
508 va = st.SortPairsDistance1(d, va);
509 vb = st.SortPairsDistance1(d, vb);
510 vc = st.SortPairsDistance1(d, vc);
511 vd = st.SortPairsDistance1(d, vd);
512 ve = st.SortPairsDistance1(d, ve);
513 vf = st.SortPairsDistance1(d, vf);
514}
515
516template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
517 HWY_IF_LANES_GT(kKeysPerVector, 4)>
518HWY_INLINE void Merge16x8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
519 V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
520 V& vc, V& vd, V& ve, V& vf) {
521 vf = st.ReverseKeys8(d, vf);
522 ve = st.ReverseKeys8(d, ve);
523 vd = st.ReverseKeys8(d, vd);
524 vc = st.ReverseKeys8(d, vc);
525 vb = st.ReverseKeys8(d, vb);
526 va = st.ReverseKeys8(d, va);
527 v9 = st.ReverseKeys8(d, v9);
528 v8 = st.ReverseKeys8(d, v8);
529 st.Sort2(d, v0, vf);
530 st.Sort2(d, v1, ve);
531 st.Sort2(d, v2, vd);
532 st.Sort2(d, v3, vc);
533 st.Sort2(d, v4, vb);
534 st.Sort2(d, v5, va);
535 st.Sort2(d, v6, v9);
536 st.Sort2(d, v7, v8);
537
538 v7 = st.ReverseKeys8(d, v7);
539 v6 = st.ReverseKeys8(d, v6);
540 v5 = st.ReverseKeys8(d, v5);
541 v4 = st.ReverseKeys8(d, v4);
542 vf = st.ReverseKeys8(d, vf);
543 ve = st.ReverseKeys8(d, ve);
544 vd = st.ReverseKeys8(d, vd);
545 vc = st.ReverseKeys8(d, vc);
546 st.Sort2(d, v0, v7);
547 st.Sort2(d, v1, v6);
548 st.Sort2(d, v2, v5);
549 st.Sort2(d, v3, v4);
550 st.Sort2(d, v8, vf);
551 st.Sort2(d, v9, ve);
552 st.Sort2(d, va, vd);
553 st.Sort2(d, vb, vc);
554
555 v3 = st.ReverseKeys8(d, v3);
556 v2 = st.ReverseKeys8(d, v2);
557 v7 = st.ReverseKeys8(d, v7);
558 v6 = st.ReverseKeys8(d, v6);
559 vb = st.ReverseKeys8(d, vb);
560 va = st.ReverseKeys8(d, va);
561 vf = st.ReverseKeys8(d, vf);
562 ve = st.ReverseKeys8(d, ve);
563 st.Sort2(d, v0, v3);
564 st.Sort2(d, v1, v2);
565 st.Sort2(d, v4, v7);
566 st.Sort2(d, v5, v6);
567 st.Sort2(d, v8, vb);
568 st.Sort2(d, v9, va);
569 st.Sort2(d, vc, vf);
570 st.Sort2(d, vd, ve);
571
572 v1 = st.ReverseKeys8(d, v1);
573 v3 = st.ReverseKeys8(d, v3);
574 v5 = st.ReverseKeys8(d, v5);
575 v7 = st.ReverseKeys8(d, v7);
576 v9 = st.ReverseKeys8(d, v9);
577 vb = st.ReverseKeys8(d, vb);
578 vd = st.ReverseKeys8(d, vd);
579 vf = st.ReverseKeys8(d, vf);
580 st.Sort2(d, v0, v1);
581 st.Sort2(d, v2, v3);
582 st.Sort2(d, v4, v5);
583 st.Sort2(d, v6, v7);
584 st.Sort2(d, v8, v9);
585 st.Sort2(d, va, vb);
586 st.Sort2(d, vc, vd);
587 st.Sort2(d, ve, vf);
588
589 v0 = st.SortPairsReverse8(d, v0);
590 v1 = st.SortPairsReverse8(d, v1);
591 v2 = st.SortPairsReverse8(d, v2);
592 v3 = st.SortPairsReverse8(d, v3);
593 v4 = st.SortPairsReverse8(d, v4);
594 v5 = st.SortPairsReverse8(d, v5);
595 v6 = st.SortPairsReverse8(d, v6);
596 v7 = st.SortPairsReverse8(d, v7);
597 v8 = st.SortPairsReverse8(d, v8);
598 v9 = st.SortPairsReverse8(d, v9);
599 va = st.SortPairsReverse8(d, va);
600 vb = st.SortPairsReverse8(d, vb);
601 vc = st.SortPairsReverse8(d, vc);
602 vd = st.SortPairsReverse8(d, vd);
603 ve = st.SortPairsReverse8(d, ve);
604 vf = st.SortPairsReverse8(d, vf);
605
606 v0 = st.SortPairsDistance2(d, v0);
607 v1 = st.SortPairsDistance2(d, v1);
608 v2 = st.SortPairsDistance2(d, v2);
609 v3 = st.SortPairsDistance2(d, v3);
610 v4 = st.SortPairsDistance2(d, v4);
611 v5 = st.SortPairsDistance2(d, v5);
612 v6 = st.SortPairsDistance2(d, v6);
613 v7 = st.SortPairsDistance2(d, v7);
614 v8 = st.SortPairsDistance2(d, v8);
615 v9 = st.SortPairsDistance2(d, v9);
616 va = st.SortPairsDistance2(d, va);
617 vb = st.SortPairsDistance2(d, vb);
618 vc = st.SortPairsDistance2(d, vc);
619 vd = st.SortPairsDistance2(d, vd);
620 ve = st.SortPairsDistance2(d, ve);
621 vf = st.SortPairsDistance2(d, vf);
622
623 v0 = st.SortPairsDistance1(d, v0);
624 v1 = st.SortPairsDistance1(d, v1);
625 v2 = st.SortPairsDistance1(d, v2);
626 v3 = st.SortPairsDistance1(d, v3);
627 v4 = st.SortPairsDistance1(d, v4);
628 v5 = st.SortPairsDistance1(d, v5);
629 v6 = st.SortPairsDistance1(d, v6);
630 v7 = st.SortPairsDistance1(d, v7);
631 v8 = st.SortPairsDistance1(d, v8);
632 v9 = st.SortPairsDistance1(d, v9);
633 va = st.SortPairsDistance1(d, va);
634 vb = st.SortPairsDistance1(d, vb);
635 vc = st.SortPairsDistance1(d, vc);
636 vd = st.SortPairsDistance1(d, vd);
637 ve = st.SortPairsDistance1(d, ve);
638 vf = st.SortPairsDistance1(d, vf);
639}
640
641// Unused on MSVC, see below
642#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
643
644template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
645 HWY_IF_LANES_GT(kKeysPerVector, 8)>
646HWY_INLINE void Merge16x16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
647 V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
648 V& vc, V& vd, V& ve, V& vf) {
649 vf = st.ReverseKeys16(d, vf);
650 ve = st.ReverseKeys16(d, ve);
651 vd = st.ReverseKeys16(d, vd);
652 vc = st.ReverseKeys16(d, vc);
653 vb = st.ReverseKeys16(d, vb);
654 va = st.ReverseKeys16(d, va);
655 v9 = st.ReverseKeys16(d, v9);
656 v8 = st.ReverseKeys16(d, v8);
657 st.Sort2(d, v0, vf);
658 st.Sort2(d, v1, ve);
659 st.Sort2(d, v2, vd);
660 st.Sort2(d, v3, vc);
661 st.Sort2(d, v4, vb);
662 st.Sort2(d, v5, va);
663 st.Sort2(d, v6, v9);
664 st.Sort2(d, v7, v8);
665
666 v7 = st.ReverseKeys16(d, v7);
667 v6 = st.ReverseKeys16(d, v6);
668 v5 = st.ReverseKeys16(d, v5);
669 v4 = st.ReverseKeys16(d, v4);
670 vf = st.ReverseKeys16(d, vf);
671 ve = st.ReverseKeys16(d, ve);
672 vd = st.ReverseKeys16(d, vd);
673 vc = st.ReverseKeys16(d, vc);
674 st.Sort2(d, v0, v7);
675 st.Sort2(d, v1, v6);
676 st.Sort2(d, v2, v5);
677 st.Sort2(d, v3, v4);
678 st.Sort2(d, v8, vf);
679 st.Sort2(d, v9, ve);
680 st.Sort2(d, va, vd);
681 st.Sort2(d, vb, vc);
682
683 v3 = st.ReverseKeys16(d, v3);
684 v2 = st.ReverseKeys16(d, v2);
685 v7 = st.ReverseKeys16(d, v7);
686 v6 = st.ReverseKeys16(d, v6);
687 vb = st.ReverseKeys16(d, vb);
688 va = st.ReverseKeys16(d, va);
689 vf = st.ReverseKeys16(d, vf);
690 ve = st.ReverseKeys16(d, ve);
691 st.Sort2(d, v0, v3);
692 st.Sort2(d, v1, v2);
693 st.Sort2(d, v4, v7);
694 st.Sort2(d, v5, v6);
695 st.Sort2(d, v8, vb);
696 st.Sort2(d, v9, va);
697 st.Sort2(d, vc, vf);
698 st.Sort2(d, vd, ve);
699
700 v1 = st.ReverseKeys16(d, v1);
701 v3 = st.ReverseKeys16(d, v3);
702 v5 = st.ReverseKeys16(d, v5);
703 v7 = st.ReverseKeys16(d, v7);
704 v9 = st.ReverseKeys16(d, v9);
705 vb = st.ReverseKeys16(d, vb);
706 vd = st.ReverseKeys16(d, vd);
707 vf = st.ReverseKeys16(d, vf);
708 st.Sort2(d, v0, v1);
709 st.Sort2(d, v2, v3);
710 st.Sort2(d, v4, v5);
711 st.Sort2(d, v6, v7);
712 st.Sort2(d, v8, v9);
713 st.Sort2(d, va, vb);
714 st.Sort2(d, vc, vd);
715 st.Sort2(d, ve, vf);
716
717 v0 = st.SortPairsReverse16(d, v0);
718 v1 = st.SortPairsReverse16(d, v1);
719 v2 = st.SortPairsReverse16(d, v2);
720 v3 = st.SortPairsReverse16(d, v3);
721 v4 = st.SortPairsReverse16(d, v4);
722 v5 = st.SortPairsReverse16(d, v5);
723 v6 = st.SortPairsReverse16(d, v6);
724 v7 = st.SortPairsReverse16(d, v7);
725 v8 = st.SortPairsReverse16(d, v8);
726 v9 = st.SortPairsReverse16(d, v9);
727 va = st.SortPairsReverse16(d, va);
728 vb = st.SortPairsReverse16(d, vb);
729 vc = st.SortPairsReverse16(d, vc);
730 vd = st.SortPairsReverse16(d, vd);
731 ve = st.SortPairsReverse16(d, ve);
732 vf = st.SortPairsReverse16(d, vf);
733
734 v0 = st.SortPairsDistance4(d, v0);
735 v1 = st.SortPairsDistance4(d, v1);
736 v2 = st.SortPairsDistance4(d, v2);
737 v3 = st.SortPairsDistance4(d, v3);
738 v4 = st.SortPairsDistance4(d, v4);
739 v5 = st.SortPairsDistance4(d, v5);
740 v6 = st.SortPairsDistance4(d, v6);
741 v7 = st.SortPairsDistance4(d, v7);
742 v8 = st.SortPairsDistance4(d, v8);
743 v9 = st.SortPairsDistance4(d, v9);
744 va = st.SortPairsDistance4(d, va);
745 vb = st.SortPairsDistance4(d, vb);
746 vc = st.SortPairsDistance4(d, vc);
747 vd = st.SortPairsDistance4(d, vd);
748 ve = st.SortPairsDistance4(d, ve);
749 vf = st.SortPairsDistance4(d, vf);
750
751 v0 = st.SortPairsDistance2(d, v0);
752 v1 = st.SortPairsDistance2(d, v1);
753 v2 = st.SortPairsDistance2(d, v2);
754 v3 = st.SortPairsDistance2(d, v3);
755 v4 = st.SortPairsDistance2(d, v4);
756 v5 = st.SortPairsDistance2(d, v5);
757 v6 = st.SortPairsDistance2(d, v6);
758 v7 = st.SortPairsDistance2(d, v7);
759 v8 = st.SortPairsDistance2(d, v8);
760 v9 = st.SortPairsDistance2(d, v9);
761 va = st.SortPairsDistance2(d, va);
762 vb = st.SortPairsDistance2(d, vb);
763 vc = st.SortPairsDistance2(d, vc);
764 vd = st.SortPairsDistance2(d, vd);
765 ve = st.SortPairsDistance2(d, ve);
766 vf = st.SortPairsDistance2(d, vf);
767
768 v0 = st.SortPairsDistance1(d, v0);
769 v1 = st.SortPairsDistance1(d, v1);
770 v2 = st.SortPairsDistance1(d, v2);
771 v3 = st.SortPairsDistance1(d, v3);
772 v4 = st.SortPairsDistance1(d, v4);
773 v5 = st.SortPairsDistance1(d, v5);
774 v6 = st.SortPairsDistance1(d, v6);
775 v7 = st.SortPairsDistance1(d, v7);
776 v8 = st.SortPairsDistance1(d, v8);
777 v9 = st.SortPairsDistance1(d, v9);
778 va = st.SortPairsDistance1(d, va);
779 vb = st.SortPairsDistance1(d, vb);
780 vc = st.SortPairsDistance1(d, vc);
781 vd = st.SortPairsDistance1(d, vd);
782 ve = st.SortPairsDistance1(d, ve);
783 vf = st.SortPairsDistance1(d, vf);
784}
785
786#endif // !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
787
788// Reshapes `buf` into a matrix, sorts columns independently, and then merges
789// into a sorted 1D array without transposing.
790//
791// DEPRECATED, use BaseCase() instead.
792template <class Traits, class V>
793HWY_INLINE void SortingNetwork(Traits st, size_t cols, V& v0, V& v1, V& v2,
794 V& v3, V& v4, V& v5, V& v6, V& v7, V& v8, V& v9,
795 V& va, V& vb, V& vc, V& vd, V& ve, V& vf) {
796 // traits*-inl assume 'full' vectors (but still capped to kMaxCols).
798
800
801 // The network width depends on the number of keys, not lanes.
802 constexpr size_t kLanesPerKey = st.LanesPerKey();
803 const size_t keys = cols / kLanesPerKey;
804 constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
805
806 Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
807
808 // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
809 // code paths: if MaxLanes < 2, then keys <= cols < 2.
810 if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
811 Merge16x2<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
812 vc, vd, ve, vf);
813
814 if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
815 Merge16x4<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
816 vc, vd, ve, vf);
817
818 if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
819 Merge16x8<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va,
820 vb, vc, vd, ve, vf);
821
822 // Avoids build timeout. Must match #if condition in kMaxCols.
823#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
824 if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
825 Merge16x16<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
826 va, vb, vc, vd, ve, vf);
827
828 static_assert(Constants::kMaxCols <= 16, "Add more branches");
829 }
830#endif
831 }
832 }
833 }
834}
835
836// As above, but loads from/stores to `buf`. This ensures full vectors are
837// aligned, and enables loads/stores without bounds checks.
838//
839// DEPRECATED, use BaseCase() instead.
840template <class Traits, typename T>
841HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
842 // traits*-inl assume 'full' vectors (but still capped to kMaxCols).
843 // However, for smaller arrays and sub-maximal `cols` we have overlapping
844 // loads where only the lowest `cols` are valid, and we skip Merge16 etc.
846 using V = decltype(Zero(d));
847
849
850 // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
851 // offsets to duplicating this code for every value of cols.
852 static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
853 V v0 = LoadU(d, buf + 0x0 * cols);
854 V v1 = LoadU(d, buf + 0x1 * cols);
855 V v2 = LoadU(d, buf + 0x2 * cols);
856 V v3 = LoadU(d, buf + 0x3 * cols);
857 V v4 = LoadU(d, buf + 0x4 * cols);
858 V v5 = LoadU(d, buf + 0x5 * cols);
859 V v6 = LoadU(d, buf + 0x6 * cols);
860 V v7 = LoadU(d, buf + 0x7 * cols);
861 V v8 = LoadU(d, buf + 0x8 * cols);
862 V v9 = LoadU(d, buf + 0x9 * cols);
863 V va = LoadU(d, buf + 0xa * cols);
864 V vb = LoadU(d, buf + 0xb * cols);
865 V vc = LoadU(d, buf + 0xc * cols);
866 V vd = LoadU(d, buf + 0xd * cols);
867 V ve = LoadU(d, buf + 0xe * cols);
868 V vf = LoadU(d, buf + 0xf * cols);
869
870 SortingNetwork(st, cols, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc,
871 vd, ve, vf);
872
873 StoreU(v0, d, buf + 0x0 * cols);
874 StoreU(v1, d, buf + 0x1 * cols);
875 StoreU(v2, d, buf + 0x2 * cols);
876 StoreU(v3, d, buf + 0x3 * cols);
877 StoreU(v4, d, buf + 0x4 * cols);
878 StoreU(v5, d, buf + 0x5 * cols);
879 StoreU(v6, d, buf + 0x6 * cols);
880 StoreU(v7, d, buf + 0x7 * cols);
881 StoreU(v8, d, buf + 0x8 * cols);
882 StoreU(v9, d, buf + 0x9 * cols);
883 StoreU(va, d, buf + 0xa * cols);
884 StoreU(vb, d, buf + 0xb * cols);
885 StoreU(vc, d, buf + 0xc * cols);
886 StoreU(vd, d, buf + 0xd * cols);
887 StoreU(ve, d, buf + 0xe * cols);
888 StoreU(vf, d, buf + 0xf * cols);
889}
890
891#else
892template <class Base>
893struct SharedTraits : public Base {};
894#endif // VQSORT_ENABLED
895
896} // namespace detail
897// NOLINTNEXTLINE(google-readability-namespace-comments)
898} // namespace HWY_NAMESPACE
899} // namespace hwy
901
902#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#define HWY_RESTRICT
Definition base.h:95
#define HWY_NOINLINE
Definition base.h:103
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_IF_LANES_LE(kN, lanes)
Definition base.h:617
#define HWY_LIKELY(expr)
Definition base.h:106
D d
Definition arm_sve-inl.h:1915
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
Definition abort.h:8
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
Definition sorting_networks-inl.h:893
Definition contrib/sort/shared-inl.h:28
static constexpr size_t kMaxCols
Definition contrib/sort/shared-inl.h:36
static constexpr size_t kMaxRows
Definition contrib/sort/shared-inl.h:43