#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

#ifndef VQSORT_ONLY_STATIC
#define VQSORT_ONLY_STATIC 0

#if !VQSORT_ONLY_STATIC

#if !VQSORT_ONLY_STATIC
  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
  uint64_t** seed_stack = &words;
  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
  const uint64_t bits_time = static_cast<uint64_t>(clock());
  words[0] = bits_stack ^ bits_time ^ 0xFEDCBA98;
  words[1] = bits_code ^ bits_time ^ 0x01234567;

  thread_local uint64_t state[3] = {0};

#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
                                   size_t start = 0, size_t max_lanes = 16) {
  Print(d, label, v, start, max_lanes);

template <class Traits, typename T>
  constexpr size_t N1 = st.LanesPerKey();
  while (start < num_lanes) {
    const size_t left = 2 * start + N1;
    const size_t right = 2 * start + 2 * N1;
    if (left >= num_lanes) break;
    size_t idx_larger = start;
    const auto key_j = st.SetKey(d, lanes + start);
    if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
    if (right < num_lanes &&
        AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
                              st.SetKey(d, lanes + right)))) {
    if (idx_larger == start) break;
    st.Swap(lanes + start, lanes + idx_larger);
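// --- Illustrative sketch, not part of this header: a scalar analogue of the
// SiftDown above, assuming N1 == 1 (one lane per key) and ascending order.
// The vector version instead compares whole keys via st.Compare/AllTrue and
// swaps key-sized groups of lanes with st.Swap.
#include <cstddef>
#include <utility>

inline void SiftDownScalarSketch(int* lanes, size_t num, size_t start) {
  while (start < num) {
    const size_t left = 2 * start + 1;
    const size_t right = 2 * start + 2;
    if (left >= num) break;
    size_t idx_larger = start;
    if (lanes[idx_larger] < lanes[left]) idx_larger = left;
    if (right < num && lanes[idx_larger] < lanes[right]) idx_larger = right;
    if (idx_larger == start) break;  // Heap property already holds here.
    std::swap(lanes[start], lanes[idx_larger]);
    start = idx_larger;
  }
}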
template <class Traits, typename T>
  constexpr size_t N1 = st.LanesPerKey();
  for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
  for (size_t i = num_lanes - N1; i != 0; i -= N1) {
    st.Swap(lanes + 0, lanes + i);

template <class Traits, typename T>
                           const size_t select) {
  constexpr size_t N1 = st.LanesPerKey();
  const size_t k = select + 1;
  HWY_ASSERT(k >= 2 * N1 && num_lanes >= 2 * N1);
  for (size_t i = ((k - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
  for (size_t i = k; i <= num_lanes - N1; i += N1) {
    if (AllTrue(d, st.Compare(d, st.SetKey(d, lanes + i),
                              st.SetKey(d, lanes + 0)))) {
      st.Swap(lanes + 0, lanes + i);
  st.Swap(lanes + 0, lanes + k - 1);

template <class Traits, typename T>
                                const size_t select) {

#if VQSORT_ENABLED || HWY_IDE
template <class Traits, typename T>
  constexpr size_t kLPK = st.LanesPerKey();
  const size_t num_keys = num_lanes / kLPK;
  using V = Vec<decltype(d)>;
  V v0 = LoadU(d, keys + 0x0 * kLPK);
  V v1 = LoadU(d, keys + 0x1 * kLPK);
  Sort2(d, st, v0, v1);
  StoreU(v0, d, keys + 0x0 * kLPK);
  StoreU(v1, d, keys + 0x1 * kLPK);

template <class Traits, typename T>
  constexpr size_t kLPK = st.LanesPerKey();
  const size_t num_keys = num_lanes / kLPK;
  const CappedTag<T, kLPK> d;
  using V = Vec<decltype(d)>;
  Store(st.LastValue(d), d, buf);
  T* in_out3 = num_keys == 3 ? buf : keys + 0x3 * kLPK;
  V v0 = LoadU(d, keys + 0x0 * kLPK);
  V v1 = LoadU(d, keys + 0x1 * kLPK);
  V v2 = LoadU(d, keys + 0x2 * kLPK);
  Sort4(d, st, v0, v1, v2, v3);
  StoreU(v0, d, keys + 0x0 * kLPK);
  StoreU(v1, d, keys + 0x1 * kLPK);
  StoreU(v2, d, keys + 0x2 * kLPK);
#if HWY_MEM_OPS_MIGHT_FAULT
template <size_t kRows, size_t kLanesPerRow, class D, class Traits,
          typename T = TFromD<D>>
  constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
  const CappedTag<T, kMinLanes> dmax;
  const size_t Nmax = Lanes(dmax);
  const Vec<decltype(dmax)> kPadding = st.LastValue(dmax);
  size_t i = num_lanes & ~(Nmax - 1);
    Store(kPadding, dmax, buf + i);
  } while (i < (kRows - 1) * kLanesPerRow + Lanes(d));
  ptrdiff_t end = static_cast<ptrdiff_t>(num_lanes);
    end -= static_cast<ptrdiff_t>(Nmax);
  } while (end > static_cast<ptrdiff_t>(kRows / 2 * kLanesPerRow));
template <size_t kKeysPerRow, class Traits, typename T>
  static_assert(kKeysPerRow <= 4, "");
  constexpr size_t kLPK = st.LanesPerKey();
  constexpr size_t kRows = 8;
  constexpr size_t kLanesPerRow = kKeysPerRow * kLPK;
  constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
  HWY_DASSERT(kMinLanes < num_lanes && num_lanes <= kRows * kLanesPerRow);
  const CappedTag<T, kLanesPerRow> d;
  using V = Vec<decltype(d)>;
  V v0 = LoadU(d, keys + 0x0 * kLanesPerRow);
  V v1 = LoadU(d, keys + 0x1 * kLanesPerRow);
  V v2 = LoadU(d, keys + 0x2 * kLanesPerRow);
  V v3 = LoadU(d, keys + 0x3 * kLanesPerRow);
#if HWY_MEM_OPS_MIGHT_FAULT
  CopyHalfToPaddedBuf<kRows, kLanesPerRow>(d, st, keys, num_lanes, buf);
  v4 = LoadU(d, buf + 0x4 * kLanesPerRow);
  v5 = LoadU(d, buf + 0x5 * kLanesPerRow);
  v6 = LoadU(d, buf + 0x6 * kLanesPerRow);
  v7 = LoadU(d, buf + 0x7 * kLanesPerRow);
#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
  const V vnum_lanes = Set(d, ConvertScalarTo<T>(num_lanes));
  const V kIota = Iota(d, static_cast<T>(kMinLanes));
  const V k1 = Set(d, static_cast<T>(kLanesPerRow));
  const V k2 = Add(k1, k1);
  using M = Mask<decltype(d)>;
  const M m4 = Gt(vnum_lanes, kIota);
  const M m5 = Gt(vnum_lanes, Add(kIota, k1));
  const M m6 = Gt(vnum_lanes, Add(kIota, k2));
  const M m7 = Gt(vnum_lanes, Add(kIota, Add(k2, k1)));
  const V kPadding = st.LastValue(d);
  v4 = MaskedLoadOr(kPadding, m4, d, keys + 0x4 * kLanesPerRow);
  v5 = MaskedLoadOr(kPadding, m5, d, keys + 0x5 * kLanesPerRow);
  v6 = MaskedLoadOr(kPadding, m6, d, keys + 0x6 * kLanesPerRow);
  v7 = MaskedLoadOr(kPadding, m7, d, keys + 0x7 * kLanesPerRow);
  Sort8(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
  Merge8x2<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
  Merge8x4<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
  StoreU(v0, d, keys + 0x0 * kLanesPerRow);
  StoreU(v1, d, keys + 0x1 * kLanesPerRow);
  StoreU(v2, d, keys + 0x2 * kLanesPerRow);
  StoreU(v3, d, keys + 0x3 * kLanesPerRow);
#if HWY_MEM_OPS_MIGHT_FAULT
  StoreU(v4, d, buf + 0x4 * kLanesPerRow);
  StoreU(v5, d, buf + 0x5 * kLanesPerRow);
  StoreU(v6, d, buf + 0x6 * kLanesPerRow);
  StoreU(v7, d, buf + 0x7 * kLanesPerRow);
  const ScalableTag<T> dmax;
  const size_t Nmax = Lanes(dmax);
  size_t i = kMinLanes;
  for (; i + Nmax <= num_lanes; i += Nmax) {
  const size_t remaining = num_lanes - i;
  SafeCopyN(remaining, dmax, buf + i, keys + i);
#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
template <size_t kKeysPerRow, class Traits, typename T>
  constexpr size_t kLPK = st.LanesPerKey();
  constexpr size_t kRows = 16;
  constexpr size_t kLanesPerRow = kKeysPerRow * kLPK;
  constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
  HWY_DASSERT(kMinLanes < num_lanes && num_lanes <= kRows * kLanesPerRow);
  const CappedTag<T, kLanesPerRow> d;
  using V = Vec<decltype(d)>;
  V v8, v9, va, vb, vc, vd, ve, vf;
  V v0 = LoadU(d, keys + 0x0 * kLanesPerRow);
  V v1 = LoadU(d, keys + 0x1 * kLanesPerRow);
  V v2 = LoadU(d, keys + 0x2 * kLanesPerRow);
  V v3 = LoadU(d, keys + 0x3 * kLanesPerRow);
  V v4 = LoadU(d, keys + 0x4 * kLanesPerRow);
  V v5 = LoadU(d, keys + 0x5 * kLanesPerRow);
  V v6 = LoadU(d, keys + 0x6 * kLanesPerRow);
  V v7 = LoadU(d, keys + 0x7 * kLanesPerRow);
#if HWY_MEM_OPS_MIGHT_FAULT
  CopyHalfToPaddedBuf<kRows, kLanesPerRow>(d, st, keys, num_lanes, buf);
  v8 = LoadU(d, buf + 0x8 * kLanesPerRow);
  v9 = LoadU(d, buf + 0x9 * kLanesPerRow);
  va = LoadU(d, buf + 0xa * kLanesPerRow);
  vb = LoadU(d, buf + 0xb * kLanesPerRow);
  vc = LoadU(d, buf + 0xc * kLanesPerRow);
  vd = LoadU(d, buf + 0xd * kLanesPerRow);
  ve = LoadU(d, buf + 0xe * kLanesPerRow);
  vf = LoadU(d, buf + 0xf * kLanesPerRow);
#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
  const V vnum_lanes = Set(d, ConvertScalarTo<T>(num_lanes));
  const V kIota = Iota(d, static_cast<T>(kMinLanes));
  const V k1 = Set(d, static_cast<T>(kLanesPerRow));
  const V k2 = Add(k1, k1);
  const V k4 = Add(k2, k2);
  const V k8 = Add(k4, k4);
  using M = Mask<decltype(d)>;
  const M m8 = Gt(vnum_lanes, kIota);
  const M m9 = Gt(vnum_lanes, Add(kIota, k1));
  const M ma = Gt(vnum_lanes, Add(kIota, k2));
  const M mb = Gt(vnum_lanes, Add(kIota, Sub(k4, k1)));
  const M mc = Gt(vnum_lanes, Add(kIota, k4));
  const M md = Gt(vnum_lanes, Add(kIota, Add(k4, k1)));
  const M me = Gt(vnum_lanes, Add(kIota, Add(k4, k2)));
  const M mf = Gt(vnum_lanes, Add(kIota, Sub(k8, k1)));
  const V kPadding = st.LastValue(d);
  v8 = MaskedLoadOr(kPadding, m8, d, keys + 0x8 * kLanesPerRow);
  v9 = MaskedLoadOr(kPadding, m9, d, keys + 0x9 * kLanesPerRow);
  va = MaskedLoadOr(kPadding, ma, d, keys + 0xa * kLanesPerRow);
  vb = MaskedLoadOr(kPadding, mb, d, keys + 0xb * kLanesPerRow);
  vc = MaskedLoadOr(kPadding, mc, d, keys + 0xc * kLanesPerRow);
  vd = MaskedLoadOr(kPadding, md, d, keys + 0xd * kLanesPerRow);
  ve = MaskedLoadOr(kPadding, me, d, keys + 0xe * kLanesPerRow);
  vf = MaskedLoadOr(kPadding, mf, d, keys + 0xf * kLanesPerRow);
  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
         vf);
  Merge16x2<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
                         vc, vd, ve, vf);
  Merge16x4<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
                         vc, vd, ve, vf);
  Merge16x8<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
                         vc, vd, ve, vf);
#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
  Merge16x16<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va,
                          vb, vc, vd, ve, vf);
  StoreU(v0, d, keys + 0x0 * kLanesPerRow);
  StoreU(v1, d, keys + 0x1 * kLanesPerRow);
  StoreU(v2, d, keys + 0x2 * kLanesPerRow);
  StoreU(v3, d, keys + 0x3 * kLanesPerRow);
  StoreU(v4, d, keys + 0x4 * kLanesPerRow);
  StoreU(v5, d, keys + 0x5 * kLanesPerRow);
  StoreU(v6, d, keys + 0x6 * kLanesPerRow);
  StoreU(v7, d, keys + 0x7 * kLanesPerRow);
#if HWY_MEM_OPS_MIGHT_FAULT
  StoreU(v8, d, buf + 0x8 * kLanesPerRow);
  StoreU(v9, d, buf + 0x9 * kLanesPerRow);
  StoreU(va, d, buf + 0xa * kLanesPerRow);
  StoreU(vb, d, buf + 0xb * kLanesPerRow);
  StoreU(vc, d, buf + 0xc * kLanesPerRow);
  StoreU(vd, d, buf + 0xd * kLanesPerRow);
  StoreU(ve, d, buf + 0xe * kLanesPerRow);
  StoreU(vf, d, buf + 0xf * kLanesPerRow);
  const ScalableTag<T> dmax;
  const size_t Nmax = Lanes(dmax);
  size_t i = kMinLanes;
  for (; i + Nmax <= num_lanes; i += Nmax) {
  const size_t remaining = num_lanes - i;
  SafeCopyN(remaining, dmax, buf + i, keys + i);
#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
template <class D, class TraitsKV, typename T>
                          size_t num_lanes, T* buf) {
  using Traits = typename TraitsKV::SharedTraitsForSortingNetwork;
  constexpr size_t kLPK = st.LanesPerKey();
  const size_t num_keys = num_lanes / kLPK;
  const size_t ceil_log2 =
  constexpr size_t kMaxKeysPerVector = MaxLanes(d) / kLPK;
  using FuncPtr = decltype(&Sort2To2<Traits, T>);
  const FuncPtr funcs[9] = {
      &Sort2To2<Traits, T>,
      &Sort3To4<Traits, T>,
      &Sort8Rows<1, Traits, T>,
      kMaxKeysPerVector >= 2 ? &Sort8Rows<2, Traits, T> : nullptr,
      kMaxKeysPerVector >= 4 ? &Sort8Rows<4, Traits, T> : nullptr,
      kMaxKeysPerVector >= 4 ? &Sort16Rows<4, Traits, T> : nullptr,
      kMaxKeysPerVector >= 8 ? &Sort16Rows<8, Traits, T> : nullptr,
#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
      kMaxKeysPerVector >= 16 ? &Sort16Rows<16, Traits, T> : nullptr,
  funcs[ceil_log2](st, keys, num_lanes, buf);
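// --- Illustrative sketch, not part of this header: the funcs[] table above is
// indexed by ceil(log2(num_keys)). The exact expression assigned to ceil_log2
// is not shown in this listing, so the helper below is only an assumption that
// demonstrates the mapping (2 keys -> 1, 3..4 -> 2, 5..8 -> 3, 9..16 -> 4).
#include <cstddef>
#include <cstdint>

inline size_t CeilLog2Sketch(uint32_t num_keys) {
  size_t bits = 0;
  while ((uint64_t{1} << bits) < num_keys) ++bits;
  return bits;
}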
template <class D, class Traits, class T>
HWY_INLINE size_t PartitionRightmost(D d, Traits st, T* const keys,
                                     const size_t num, const Vec<D> pivot,
                                     size_t& bufL, size_t& writeR,
  const size_t N = Lanes(d);
  const size_t remainder = num & (kUnroll * N - 1);
  const size_t min = remainder + (remainder < N ? kUnroll * N : 0);
  num_main = num - num_here;
  if (num_main < 2 * kUnroll * N) {
  const size_t max_buf = (3 * kUnroll + 1) * N;
  const T* pReadR = keys + num;
  size_t bufR = max_buf;
  for (; i <= num_here - N; i += N) {
    const Vec<D> v = LoadU(d, pReadR);
    const Mask<D> comp = st.Compare(d, pivot, v);
  const size_t remaining = num_here - i;
  const Mask<D> mask = FirstN(d, remaining);
  const Vec<D> v = LoadN(d, pReadR, remaining);
  const Mask<D> comp = st.Compare(d, pivot, v);
  bufR += (remaining - numL);
  const size_t numWrittenR = bufR - max_buf;
  writeR = num - numWrittenR;
  hwy::CopyBytes(buf + max_buf, keys + writeR, numWrittenR * sizeof(T));
template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
                               size_t& writeL, size_t& remaining) {
  const size_t N = Lanes(d);
  const Mask<D> comp = st.Compare(d, pivot, v);
    const Vec<D> lr = st.CompressKeys(v, comp);
    const size_t num_left = N - CountTrue(d, comp);
    StoreU(lr, d, keys + remaining + writeL);

template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
                                const Vec<D> v1, const Vec<D> v2,
                                const Vec<D> v3, const Vec<D> pivot,
  StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
  StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
  StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
  StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);

template <class D, class Traits, typename T>
HWY_INLINE void StoreRightAndBuf(D d, Traits st, const Vec<D> v,
  const size_t N = Lanes(d);
  const Mask<D> comp = st.Compare(d, pivot, v);
  writeR -= (N - numL);
template <class D, class Traits, typename T>
HWY_INLINE size_t Partition(D d, Traits st, T* const keys, const size_t num,
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);
  const size_t num_main =
      PartitionRightmost(d, st, keys, num, pivot, bufL, writeR, buf);
    fprintf(stderr, " num_main %zu bufL %zu writeR %zu\n", num_main, bufL,
  size_t remaining = writeR - writeL;
  const T* readL = keys;
  const T* readR = keys + num_main;
  const V vL0 = LoadU(d, readL + 0 * N);
  const V vL1 = LoadU(d, readL + 1 * N);
  const V vL2 = LoadU(d, readL + 2 * N);
  const V vL3 = LoadU(d, readL + 3 * N);
  readL += kUnroll * N;
  readR -= kUnroll * N;
  const V vR0 = LoadU(d, readR + 0 * N);
  const V vR1 = LoadU(d, readR + 1 * N);
  const V vR2 = LoadU(d, readR + 2 * N);
  const V vR3 = LoadU(d, readR + 3 * N);
  while (readL != readR) {
    const size_t capacityL =
        static_cast<size_t>((readL - keys) - static_cast<ptrdiff_t>(writeL));
    if (capacityL > kUnroll * N) {
      readR -= kUnroll * N;
      v0 = LoadU(d, readR + 0 * N);
      v1 = LoadU(d, readR + 1 * N);
      v2 = LoadU(d, readR + 2 * N);
      v3 = LoadU(d, readR + 3 * N);
      v0 = LoadU(d, readL + 0 * N);
      v1 = LoadU(d, readL + 1 * N);
      v2 = LoadU(d, readL + 2 * N);
      v3 = LoadU(d, readL + 3 * N);
      readL += kUnroll * N;
    StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
  StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
  StoreLeftRight(d, st, vR0, pivot, keys, writeL, remaining);
  StoreLeftRight(d, st, vR1, pivot, keys, writeL, remaining);
  writeR = writeL + remaining;
  StoreRightAndBuf(d, st, vR2, pivot, keys, writeR, buf, bufL);
  StoreRightAndBuf(d, st, vR3, pivot, keys, writeR, buf, bufL);
  CopyBytes(buf, keys + writeL, bufL * sizeof(T));
  return writeL + bufL;
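// --- Illustrative sketch, not part of this header: what Partition computes,
// assuming ascending order so that keys not greater than the pivot end up on
// the left. The returned value is the boundary index, i.e. the index of the
// first key greater than the pivot; the vectorized code above produces the
// same result while processing whole vectors from both ends.
#include <algorithm>
#include <cstddef>

inline size_t PartitionScalarSketch(int* keys, size_t num, int pivot) {
  int* const bound =
      std::partition(keys, keys + num, [pivot](int k) { return k <= pivot; });
  return static_cast<size_t>(bound - keys);
}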
template <class D, class Traits, typename T>
                                        size_t num, const Vec<D> valueL,
                                        const Vec<D> valueR, Vec<D>& third,
  const size_t N = Lanes(d);
  for (; i <= num - N; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    const Mask<D> eqL = st.EqualKeys(d, v, valueL);
    const Mask<D> eqR = st.EqualKeys(d, v, valueR);
      third = st.SetKey(d, keys + i + lane);
        fprintf(stderr, "found 3rd value at vec %zu; writeL %zu\n", i,
      for (; writeL <= i - N; writeL += N) {
        StoreU(valueR, d, keys + writeL);
      StoreU(valueL, d, keys + writeL);
  const size_t remaining = num - i;
  const Vec<D> v = Load(d, buf);
  const Mask<D> valid = FirstN(d, remaining);
  const Mask<D> eqL = And(st.EqualKeys(d, v, valueL), valid);
  const Mask<D> eqR = st.EqualKeys(d, v, valueR);
  const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
    third = st.SetKey(d, keys + i + lane);
      fprintf(stderr, "found 3rd value at partial vec %zu; writeL %zu\n", i,
    for (; writeL <= i - N; writeL += N) {
      StoreU(valueR, d, keys + writeL);
  for (; i <= num - N; i += N) {
    fprintf(stderr, "Successful MaybePartitionTwoValue\n");
template <class D, class Traits, typename T>
                                         size_t num, const Vec<D> valueL,
                                         const Vec<D> valueR, Vec<D>& third,
  const size_t N = Lanes(d);
  size_t pos = num - N;
  for (; pos < num; pos -= N) {
    const Vec<D> v = LoadU(d, keys + pos);
    const Mask<D> eqL = st.EqualKeys(d, v, valueL);
    const Mask<D> eqR = st.EqualKeys(d, v, valueR);
      third = st.SetKey(d, keys + pos + lane);
        fprintf(stderr, "found 3rd value at vec %zu; countR %zu\n", pos,
      const size_t endL = num - countR;
      for (; pos <= endL - N; pos += N) {
        StoreU(valueL, d, keys + pos);
      StoreU(valueR, d, keys + pos);
  const size_t remaining = pos + N;
  const Vec<D> v = LoadU(d, keys);
  const Mask<D> valid = FirstN(d, remaining);
  const Mask<D> eqL = st.EqualKeys(d, v, valueL);
  const Mask<D> eqR = And(st.EqualKeys(d, v, valueR), valid);
  const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
    third = st.SetKey(d, keys + lane);
      fprintf(stderr, "found 3rd value at partial vec %zu; writeR %zu\n", pos,
    const size_t endL = num - countR;
    for (; pos <= endL - N; pos += N) {
      StoreU(valueL, d, keys + pos);
  const size_t endL = num - countR;
  for (; i <= endL - N; i += N) {
          "MaybePartitionTwoValueR countR %zu pos %zu i %zu endL %zu\n",
          countR, pos, i, endL);
template <class D, class Traits, typename T>
HWY_INLINE bool PartitionIfTwoKeys(D d, Traits st, const Vec<D> pivot,
                                   const size_t idx_second,
                                   const Vec<D> second,
  const bool is_pivotR = AllFalse(d, st.Compare(d, pivot, second));
    fprintf(stderr, "Samples all equal, diff at %zu, isPivotR %d\n", idx_second,
  return is_pivotR ? MaybePartitionTwoValueR(d, st, keys, num, second, pivot,
                   : MaybePartitionTwoValue(d, st, keys + idx_second,
                                            num - idx_second, pivot, second,

template <class D, class Traits, typename T>
  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
  constexpr size_t N1 = st.LanesPerKey();
  const Vec<D> valueL = st.SetKey(d, samples);
  const Vec<D> valueR = st.SetKey(d, samples + kSampleLanes - N1);
  const Vec<D> prev = st.PrevValue(d, valueR);
  return MaybePartitionTwoValue(d, st, keys, num, valueL, valueR, third, buf);
template <class Traits, class V>
HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
  const V sum = Xor(Xor(v0, v1), v2);
  const V first = st.First(d, st.First(d, v0, v1), v2);
  const V last = st.Last(d, st.Last(d, v0, v1), v2);
  return Xor(Xor(sum, first), last);
  st.Sort2(d, v0, v2);
  v1 = st.Last(d, v0, v1);
  v1 = st.First(d, v1, v2);
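// --- Illustrative sketch, not part of this header: the integer path above
// relies on xor cancellation. Xor-ing all three keys and then xor-ing out the
// smallest and largest leaves exactly the median; the float path above instead
// falls back to st.Sort2/First/Last.
#include <algorithm>
#include <cstdint>

inline uint32_t MedianOf3XorSketch(uint32_t v0, uint32_t v1, uint32_t v2) {
  const uint32_t sum = v0 ^ v1 ^ v2;
  const uint32_t first = std::min(v0, std::min(v1, v2));
  const uint32_t last = std::max(v0, std::max(v1, v2));
  return sum ^ first ^ last;  // e.g. (3, 1, 2) -> 2
}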
  const uint64_t a = state[0];
  const uint64_t b = state[1];
  const uint64_t w = state[2] + 1;
  const uint64_t next = a ^ w;
  state[0] = (b + (b << 3)) ^ (b >> 11);
  const uint64_t rot = (b << 24) | (b >> 40);
  state[1] = rot + next;

HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
  const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
  return static_cast<size_t>(chunk_index);
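// --- Illustrative sketch, not part of this header: the multiply-shift above
// maps a 32-bit random value into [0, num_chunks) without a modulo, because
// (bits * num_chunks) / 2^32 scales bits proportionally into the range.
#include <cstddef>
#include <cstdint>

inline size_t RandomChunkIndexSketch(uint32_t num_chunks, uint32_t bits) {
  // e.g. bits = 0x80000000 (half of the 32-bit range), num_chunks = 10 -> 5.
  return static_cast<size_t>((static_cast<uint64_t>(bits) * num_chunks) >> 32);
}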
template <class D, class Traits, typename T>
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);
  const size_t misalign =
      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (kLanesPerChunk - 1);
  if (misalign != 0) {
    const size_t consume = kLanesPerChunk - misalign;
  for (size_t i = 0; i < 6; i += 2) {
    const uint64_t bits64 = RandomBits(state);
    CopyBytes<8>(&bits64, bits + i);
  const size_t num_chunks64 = num / kLanesPerChunk;
  const uint32_t num_chunks =
      static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));
  const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) * kLanesPerChunk;
  const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) * kLanesPerChunk;
  const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) * kLanesPerChunk;
  const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) * kLanesPerChunk;
  const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) * kLanesPerChunk;
  const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) * kLanesPerChunk;
  for (size_t i = 0; i < kLanesPerChunk; i += N) {
    const V v0 = Load(d, keys + offset0 + i);
    const V v1 = Load(d, keys + offset1 + i);
    const V v2 = Load(d, keys + offset2 + i);
    const V medians0 = MedianOf3(st, v0, v1, v2);
    Store(medians0, d, buf + i);
    const V v3 = Load(d, keys + offset3 + i);
    const V v4 = Load(d, keys + offset4 + i);
    const V v5 = Load(d, keys + offset5 + i);
    const V medians1 = MedianOf3(st, v3, v4, v5);
    Store(medians1, d, buf + i + kLanesPerChunk);
V OrXor(const V o, const V x1, const V x2) {
  return Or(o, Xor(x1, x2));

template <class D, class Traits>
HWY_INLINE bool UnsortedSampleEqual(D d, Traits st,
  constexpr size_t kSampleLanes = Constants::SampleLanes<TFromD<D>>();
  const size_t N = Lanes(d);
  const V first = st.SetKey(d, samples);
  for (size_t i = 0; i < kSampleLanes; i += N) {
    const V v = Load(d, samples + i);
    diff = OrXor(diff, first, v);
  return st.NoKeyDifference(d, diff);
  for (size_t i = 0; i < kSampleLanes; i += N) {
    const V v = Load(d, samples + i);
    if (!AllTrue(d, st.EqualKeys(d, v, first))) {
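// --- Illustrative sketch, not part of this header: the OrXor accumulation
// above exploits that Xor of two equal values is zero, so Or-ing together the
// Xor of every sample with the first sample is zero iff all samples are equal.
#include <cstddef>
#include <cstdint>

inline bool AllSamplesEqualSketch(const uint64_t* samples, size_t n) {
  uint64_t diff = 0;
  for (size_t i = 0; i < n; ++i) diff |= samples[0] ^ samples[i];
  return diff == 0;
}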
template <class D, class Traits, typename T>
  const size_t N = Lanes(d);
  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
  BaseCase(d, st, buf, kSampleLanes, buf + kSampleLanes);
    fprintf(stderr, "Samples:\n");
    for (size_t i = 0; i < kSampleLanes; i += N) {
enum class PivotResult {

HWY_INLINE const char* PivotResultString(PivotResult result) {
    case PivotResult::kDone:
    case PivotResult::kNormal:
    case PivotResult::kIsFirst:
    case PivotResult::kWasLast:

template <class Traits, typename T>
  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
  constexpr size_t N1 = st.LanesPerKey();
  constexpr size_t kRankMid = kSampleLanes / 2;
  static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");
  size_t rank_prev = kRankMid - N1;
  for (; st.Equal1(samples + rank_prev, samples + kRankMid); rank_prev -= N1) {
    if (rank_prev == 0) return 0;
  size_t rank_next = rank_prev + N1;
  for (; st.Equal1(samples + rank_next, samples + kRankMid); rank_next += N1) {
    if (rank_next == kSampleLanes - N1) return rank_prev;
  const size_t excess_if_median = rank_next - kRankMid;
  const size_t excess_if_prev = kRankMid - rank_prev;
  return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
template <class D, class Traits, typename T>
HWY_INLINE Vec<D> ChoosePivotByRank(D d, Traits st,
  const size_t pivot_rank = PivotRank(st, samples);
  const Vec<D> pivot = st.SetKey(d, samples + pivot_rank);
    fprintf(stderr, " Pivot rank %3zu\n", pivot_rank);
    Store(pivot, d, pivot_lanes);
    using KeyType = typename Traits::KeyType;
    CopyBytes<sizeof(KeyType)>(pivot_lanes, &key);
  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
  constexpr size_t N1 = st.LanesPerKey();
  const Vec<D> last = st.SetKey(d, samples + kSampleLanes - N1);
  const bool all_neq = AllTrue(d, st.NotEqualKeys(d, pivot, last));
template <class D, class Traits, typename T>
HWY_INLINE bool AllEqual(D d, Traits st, const Vec<D> pivot,
  const size_t N = Lanes(d);
  const Vec<D> zero = Zero(d);
  const size_t misalign =
      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
  const size_t consume = N - misalign;
  const Vec<D> v = LoadU(d, keys);
  const Mask<D> diff = And(FirstN(d, consume), st.NotEqualKeys(d, v, pivot));
    *first_mismatch = lane;
  HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
  if (!hwy::IsFloat<T>()) {
    Vec<D> diff0 = zero;
    Vec<D> diff1 = zero;
    constexpr size_t kLoops = 8;
    const size_t lanes_per_group = kLoops * 2 * N;
    if (num >= lanes_per_group) {
      for (; i <= num - lanes_per_group; i += lanes_per_group) {
        for (size_t loop = 0; loop < kLoops; ++loop) {
          const Vec<D> v0 = Load(d, keys + i + loop * 2 * N);
          const Vec<D> v1 = Load(d, keys + i + loop * 2 * N + N);
          diff0 = OrXor(diff0, v0, pivot);
          diff1 = OrXor(diff1, v1, pivot);
    const Vec<D> v = Load(d, keys + i);
    const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
      *first_mismatch = i + lane;
  for (; i <= num - N; i += N) {
    const Vec<D> v = Load(d, keys + i);
    const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
      *first_mismatch = i + lane;
  const Vec<D> v = LoadU(d, keys + i);
  const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
    *first_mismatch = i + lane;
    fprintf(stderr, "All keys equal\n");
template <class D, class Traits, typename T>
                                   size_t num, const Vec<D> pivot) {
  const size_t N = Lanes(d);
    fprintf(stderr, "Scanning for before\n");
  constexpr size_t kLoops = 16;
  const size_t lanes_per_group = kLoops * N;
  Vec<D> first = pivot;
  if (num >= lanes_per_group) {
    for (; i <= num - lanes_per_group; i += lanes_per_group) {
      for (size_t loop = 0; loop < kLoops; ++loop) {
        const Vec<D> curr = LoadU(d, keys + i + loop * N);
        first = st.First(d, first, curr);
          fprintf(stderr, "Stopped scanning at end of group %zu\n",
                  i + lanes_per_group);
  for (; i <= num - N; i += N) {
    const Vec<D> curr = LoadU(d, keys + i);
      fprintf(stderr, "Stopped scanning at %zu\n", i);
  const Vec<D> curr = LoadU(d, keys + num - N);
    fprintf(stderr, "Stopped scanning at last %zu\n", num - N);

template <class D, class Traits, typename T>
                                  size_t num, const Vec<D> pivot) {
  const size_t N = Lanes(d);
    fprintf(stderr, "Scanning for after\n");
  constexpr size_t kLoops = 16;
  const size_t lanes_per_group = kLoops * N;
  Vec<D> last = pivot;
  if (num >= lanes_per_group) {
    for (; i + lanes_per_group <= num; i += lanes_per_group) {
      for (size_t loop = 0; loop < kLoops; ++loop) {
        const Vec<D> curr = LoadU(d, keys + i + loop * N);
        last = st.Last(d, last, curr);
          fprintf(stderr, "Stopped scanning at end of group %zu\n",
                  i + lanes_per_group);
  for (; i <= num - N; i += N) {
    const Vec<D> curr = LoadU(d, keys + i);
      fprintf(stderr, "Stopped scanning at %zu\n", i);
  const Vec<D> curr = LoadU(d, keys + num - N);
    fprintf(stderr, "Stopped scanning at last %zu\n", num - N);
template <class D, class Traits, typename T>
HWY_INLINE Vec<D> ChoosePivotForEqualSamples(D d, Traits st,
                                             Vec<D> second, Vec<D> third,
                                             PivotResult& result) {
  const Vec<D> pivot = st.SetKey(d, samples);
    result = PivotResult::kIsFirst;
      fprintf(stderr, "Pivot equals first possible value\n");
      fprintf(stderr, "Pivot equals last possible value\n");
    result = PivotResult::kWasLast;
    return st.PrevValue(d, pivot);
    const bool before = !AllFalse(d, st.Compare(d, second, pivot));
    if (HWY_UNLIKELY(ExistsAnyAfter(d, st, keys, num, pivot))) {
      result = PivotResult::kNormal;
    result = PivotResult::kWasLast;
    return st.PrevValue(d, pivot);
    if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) {
      result = PivotResult::kNormal;
  st.Sort2(d, second, third);
  const bool before = !AllFalse(d, st.Compare(d, second, pivot));
  const bool after = !AllFalse(d, st.Compare(d, pivot, third));
    if (HWY_UNLIKELY(after || ExistsAnyAfter(d, st, keys, num, pivot))) {
      result = PivotResult::kNormal;
    result = PivotResult::kWasLast;
    return st.PrevValue(d, pivot);
    if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) {
      result = PivotResult::kNormal;
  result = PivotResult::kIsFirst;
enum class RecurseMode {

template <class D, class Traits, typename T>
  const size_t N = Lanes(d);
  if (num < N) return;
  Vec<D> first = st.LastValue(d);
  Vec<D> last = st.FirstValue(d);
  for (; i <= num - N; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  const Vec<D> v = LoadU(d, keys + num - N);
  first = st.First(d, v, first);
  last = st.Last(d, v, last);
  first = st.FirstOfLanes(d, first, buf);
  last = st.LastOfLanes(d, last, buf);
template <RecurseMode mode, class D, class Traits, typename T>
                         const size_t remaining_levels, const size_t k = 0) {
  const size_t N = Lanes(d);
  constexpr size_t kLPK = st.LanesPerKey();
  if (HWY_UNLIKELY(num <= Constants::BaseCaseNumLanes<kLPK>(N))) {
    BaseCase(d, st, keys, num, buf);
    fprintf(stderr, "\n\n=== Recurse depth=%zu len=%zu\n", remaining_levels,
    PrintMinMax(d, st, keys, num, buf);
  DrawSamples(d, st, keys, num, buf, state);
  PivotResult result = PivotResult::kNormal;
    pivot = st.SetKey(d, buf);
    size_t idx_second = 0;
    if (HWY_UNLIKELY(AllEqual(d, st, pivot, keys, num, &idx_second))) {
    const Vec<D> second = st.SetKey(d, keys + idx_second);
            PartitionIfTwoKeys(d, st, pivot, keys, num, idx_second,
                               second, third, buf))) {
    pivot = ChoosePivotForEqualSamples(d, st, keys, num, buf, second, third,
    SortSamples(d, st, buf);
            PartitionIfTwoSamples(d, st, keys, num, buf))) {
    pivot = ChoosePivotByRank(d, st, buf);
      fprintf(stderr, "HeapSort reached, size=%zu\n", num);
  const size_t bound = Partition(d, st, keys, num, pivot, buf);
    fprintf(stderr, "bound %zu num %zu result %s\n", bound, num,
            PivotResultString(result));
  HWY_DASSERT(bound != num || result == PivotResult::kWasLast);
    if (HWY_LIKELY(result != PivotResult::kIsFirst) && k < bound) {
      Recurse<RecurseMode::kSelect>(d, st, keys, bound, buf, state,
                                    remaining_levels - 1, k);
    } else if (HWY_LIKELY(result != PivotResult::kWasLast) && k >= bound) {
      Recurse<RecurseMode::kSelect>(d, st, keys + bound, num - bound, buf,
                                    state, remaining_levels - 1, k - bound);
    if (HWY_LIKELY(result != PivotResult::kIsFirst)) {
      Recurse<RecurseMode::kSort>(d, st, keys, bound, buf, state,
                                  remaining_levels - 1);
    if (HWY_LIKELY(result != PivotResult::kWasLast)) {
      Recurse<RecurseMode::kSort>(d, st, keys + bound, num - bound, buf, state,
                                  remaining_levels - 1);
template <class D, class Traits, typename T>
  const size_t N = Lanes(d);
  constexpr size_t kLPK = st.LanesPerKey();
  const size_t base_case_num = Constants::BaseCaseNumLanes<kLPK>(N);
      fprintf(stderr, "Special-casing small, %d lanes\n",
              static_cast<int>(num));
    BaseCase(d, st, keys, num, buf);
  const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
  constexpr bool kPotentiallyHuge =
  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
  if (partial_128 || huge_vec) {
      fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
              partial_128, huge_vec);

template <class D, class Traits, typename T, HWY_IF_FLOAT(T)>
  const size_t N = Lanes(d);
  const Vec<D> sentinel = st.LastValue(d);
  for (; i <= num - N; i += N) {
  const size_t remaining = num - i;

template <class D, class Traits, typename T, HWY_IF_NOT_FLOAT(T)>
template <class D, class Traits, typename T>
          "=============== Sort num %zu is128 %d isKV %d vec bytes %d\n", num,
          st.Is128(), st.IsKV(), static_cast<int>(sizeof(T) * Lanes(d)));
#if HWY_MAX_BYTES > 64
    return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
#if VQSORT_ENABLED || HWY_IDE
  if (!detail::HandleSpecialCases(d, st, keys, num, buf)) {
    const size_t max_levels = 50;
    detail::Recurse<detail::RecurseMode::kSort>(d, st, keys, num, buf, state,
    fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");

template <class D, class Traits, typename T>
    fprintf(stderr, "=============== Select num=%zu, vec bytes=%d\n", num,
            static_cast<int>(sizeof(T) * Lanes(d)));
#if HWY_MAX_BYTES > 64
#if VQSORT_ENABLED || HWY_IDE
  if (!detail::HandleSpecialCases(d, st, keys, num, buf)) {
    const size_t max_levels = 50;
    detail::Recurse<detail::RecurseMode::kSelect>(d, st, keys, num, buf, state,
    fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");

template <class D, class Traits, typename T>
    fprintf(stderr, "=============== PartialSort num=%zu, vec bytes=%d\n", num,
            static_cast<int>(sizeof(T) * Lanes(d)));
#if HWY_MAX_BYTES > 64
#if VQSORT_ENABLED || HWY_IDE
  if (!detail::HandleSpecialCases(d, st, keys, num, buf)) {
    const size_t max_levels = 50;
    detail::Recurse<detail::RecurseMode::kSelect>(d, st, keys, num, buf, state,
    detail::Recurse<detail::RecurseMode::kSort>(d, st, keys, k, buf, state,
    fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");
template <class D, class Traits, typename T>
  constexpr size_t kLPK = st.LanesPerKey();
  return Sort(d, st, keys, num, buf);

template <class D, class Traits, typename T>
  constexpr size_t kLPK = st.LanesPerKey();

template <class D, class Traits, typename T>
  constexpr size_t kLPK = st.LanesPerKey();
  Select(d, st, keys, num, k, buf);
template <typename T>
  using Ascending = OrderAscending<T>;
  using Descending = OrderDescending<T>;
  template <class Order>
  using Traits = TraitsLane<Order>;

struct KeyAdapter<hwy::uint128_t> {
  using Ascending = OrderAscending128;
  using Descending = OrderDescending128;
  template <class Order>
  using Traits = Traits128<Order>;

struct KeyAdapter<hwy::K64V64> {
  using Ascending = OrderAscendingKV128;
  using Descending = OrderDescendingKV128;
  template <class Order>
  using Traits = Traits128<Order>;

struct KeyAdapter<hwy::K32V32> {
  using Ascending = OrderAscendingKV64;
  using Descending = OrderDescendingKV64;
  template <class Order>
  using Traits = TraitsLane<Order>;
template <typename T>
  using Adapter = detail::KeyAdapter<T>;
  using Order = typename Adapter::Ascending;
  Sort(d, st, reinterpret_cast<LaneType*>(keys), num * st.LanesPerKey());
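// --- Illustrative usage sketch, not part of this header: calling the static
// (single-target) sorting entry point defined above. Assumptions: this header
// has already been included with the project's usual per-target setup, the
// call site is compiled with matching target attributes, and HWY_NAMESPACE
// refers to the statically chosen target namespace.
#include <cstddef>
#include <cstdint>

void SortAscendingExampleSketch(int32_t* keys, size_t num) {
  hwy::HWY_NAMESPACE::VQSortStatic(keys, num, hwy::SortAscending());
}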
template <typename T>
  using Adapter = detail::KeyAdapter<T>;
  using Order = typename Adapter::Descending;
  Sort(d, st, reinterpret_cast<LaneType*>(keys), num * st.LanesPerKey());

template <typename T>
  using Adapter = detail::KeyAdapter<T>;
  using Order = typename Adapter::Ascending;

template <typename T>
  using Adapter = detail::KeyAdapter<T>;
  using Order = typename Adapter::Descending;

template <typename T>
  using Adapter = detail::KeyAdapter<T>;
  using Order = typename Adapter::Ascending;
  Select(d, st, reinterpret_cast<LaneType*>(keys), num * st.LanesPerKey(), k);

template <typename T>
  using Adapter = detail::KeyAdapter<T>;
  using Order = typename Adapter::Descending;
  Select(d, st, reinterpret_cast<LaneType*>(keys), num * st.LanesPerKey(), k);
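// --- Illustrative usage sketch, not part of this header: the selection and
// partial-sort static entry points defined above, under the same assumptions
// as the sorting sketch. After VQSelectStatic, keys[k] holds the element a
// full sort would place there; VQPartialSortStatic additionally leaves the
// first k keys in sorted order.
#include <cstddef>

void SelectAndPartialSortExampleSketch(float* keys, size_t num, size_t k) {
  hwy::HWY_NAMESPACE::VQSelectStatic(keys, num, k, hwy::SortAscending());
  hwy::HWY_NAMESPACE::VQPartialSortStatic(keys, num, k, hwy::SortAscending());
}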