17#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
18#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
31#define HAVE_AVX2SORT 0
34#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
44#if HAVE_PARALLEL_IPS4O
53#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
54#include "third_party/ips4o/include/ips4o.hpp"
55#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
58#include "third_party/boost/allowed/sort/sort.hpp"
73#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
74 apply_to = any(function))
76#pragma clang attribute push(__attribute__((target("avx2"))), \
77 apply_to = any(function))
81#pragma GCC push_options
83#pragma GCC target("avx512f,avx512dq")
85#pragma GCC target("avx2")
91#include "vxsort/machine_traits.avx512.h"
93#include "vxsort/machine_traits.avx2.h"
95#include "vxsort/vxsort.h"
98#pragma clang attribute pop
100#pragma GCC pop_options
122 return "unreachable";
135 static_assert(
sizeof(T) <= 8,
"Expected a built-in type");
136 CopyBytes<sizeof(T)>(&value, &bits);
146 HWY_ABORT(
"Sort %s: count %d vs %d\n", type_name,
147 static_cast<int>(
count_),
static_cast<int>(other.
count_));
151 HWY_ABORT(
"Sort %s: minmax %f/%f vs %f/%f\n", type_name,
152 static_cast<double>(
min_),
static_cast<double>(
max_),
153 static_cast<double>(other.
min_),
154 static_cast<double>(other.
max_));
159 HWY_ABORT(
"Sort %s: Sum mismatch %g %g; min %g max %g\n", type_name,
160 static_cast<double>(
sum_),
static_cast<double>(other.
sum_),
161 static_cast<double>(
min_),
static_cast<double>(
max_));
168 T
min_ = hwy::HighestValue<T>();
169 T
max_ = hwy::LowestValue<T>();
184#if HAVE_PARALLEL_IPS4O
221#if HAVE_PARALLEL_IPS4O
222 case Algo::kParallelIPS4O:
251 return "unreachable";
258#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == defined(HWY_TARGET_TOGGLE)
259#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
260#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
262#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
273#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
275#include "avx512-32bit-qsort.hpp"
276#include "avx512-64bit-qsort.hpp"
282#if HAVE_INTEL || HAVE_VXSORT
292 z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
293 z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
294 return z ^ (z >> 31);
300 template <
class DU64>
303 for (
size_t i = 1; i < 2 *
Lanes(du64); ++i) {
309 template <
class VU64>
313 const VU64 bits =
Add(s1, s0);
315 s1 =
Xor(s1, ShiftLeft<23>(s1));
316 state1 =
Xor(s1,
Xor(s0,
Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
321template <
class D,
class VU64, HWY_IF_NOT_FLOAT_D(D)>
329template <
class DF,
class VU64, HWY_IF_FLOAT_D(DF)>
333 using VU =
Vec<
decltype(du)>;
337#if HWY_TARGET == HWY_SCALAR
339 const VU bits =
Set(du,
static_cast<TU
>(
GetLane(bits64) & LimitsMax<TU>()));
341 const VU bits =
BitCast(du, bits64);
346 const VU mantissa_mask =
Set(du, MantissaMask<TF>());
347 const VU representation =
OrAnd(k1, bits, mantissa_mask);
348 return BitCast(df, representation);
356 : 0xFFFFFFFFFFFFFFFFull);
360 : 0xFFFFFFFFFFFFFFFFull);
364 : 0x00000000FFFFFFFFull);
374 using VU64 =
Vec<
decltype(du64)>;
375 const size_t N64 =
Lanes(du64);
376 auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
378 VU64 s0 =
Load(du64, seeds.get());
379 VU64 s1 =
Load(du64, seeds.get() + N64);
381#if HWY_TARGET == HWY_SCALAR
386 using V =
Vec<
decltype(
d)>;
387 const size_t N =
Lanes(
d);
388 const VU64 mask =
MaskForDist(du64, dist,
sizeof(T));
389 auto buf = hwy::AllocateAligned<T>(N);
392 for (; i + N <= num; i += N) {
399 CopyBytes(buf.get(), v + i, (num - i) *
sizeof(T));
403 for (
size_t i = 0; i < num; ++i) {
410#if HAVE_PARALLEL_IPS4O
411 const unsigned max_threads = hwy::LimitsMax<unsigned>();
412 ips4o::StdThreadPool pool{
static_cast<int>(
413 HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
419template <
class Order,
typename KeyType, HWY_IF_NOT_T_SIZE(KeyType, 16)>
424 if (Order().IsAscending()) {
425 const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
428 const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
434template <
class Order>
436 const size_t num_keys,
const size_t k) {
437 using detail::SharedTraits;
438 using detail::Traits128;
439 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
440 const size_t num_lanes = num_keys * 2;
441 if (Order().IsAscending()) {
442 const SharedTraits<Traits128<detail::OrderAscending128>> st;
445 const SharedTraits<Traits128<detail::OrderDescending128>> st;
450template <
class Order>
453 using detail::SharedTraits;
454 using detail::Traits128;
455 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
456 const size_t num_lanes = num_keys * 2;
457 if (Order().IsAscending()) {
458 const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
461 const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
466template <
class Order>
469 using detail::SharedTraits;
470 using detail::TraitsLane;
471 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
472 const size_t num_lanes = num_keys;
473 if (Order().IsAscending()) {
474 const SharedTraits<TraitsLane<detail::OrderAscendingKV64>> st;
477 const SharedTraits<TraitsLane<detail::OrderDescendingKV64>> st;
486template <
class Order,
typename KeyType, HWY_IF_NOT_T_SIZE(KeyType, 16)>
491 if (Order().IsAscending()) {
492 const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
495 const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
501template <
class Order>
504 using detail::SharedTraits;
505 using detail::Traits128;
506 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
507 const size_t num_lanes = num_keys * 2;
508 if (Order().IsAscending()) {
509 const SharedTraits<Traits128<detail::OrderAscending128>> st;
512 const SharedTraits<Traits128<detail::OrderDescending128>> st;
517template <
class Order>
520 using detail::SharedTraits;
521 using detail::Traits128;
522 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
523 const size_t num_lanes = num_keys * 2;
524 if (Order().IsAscending()) {
525 const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
528 const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
533template <
class Order>
536 using detail::SharedTraits;
537 using detail::TraitsLane;
538 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
539 const size_t num_lanes = num_keys;
540 if (Order().IsAscending()) {
541 const SharedTraits<TraitsLane<detail::OrderAscendingKV64>> st;
544 const SharedTraits<TraitsLane<detail::OrderDescendingKV64>> st;
553template <
class Order,
typename KeyType, HWY_IF_NOT_T_SIZE(KeyType, 16)>
557 if (Order().IsAscending()) {
558 const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
561 const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
567template <
class Order>
569 using detail::SharedTraits;
570 using detail::Traits128;
571 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
572 const size_t num_lanes = num_keys * 2;
573 if (Order().IsAscending()) {
574 const SharedTraits<Traits128<detail::OrderAscending128>> st;
577 const SharedTraits<Traits128<detail::OrderDescending128>> st;
582template <
class Order>
584 using detail::SharedTraits;
585 using detail::Traits128;
586 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
587 const size_t num_lanes = num_keys * 2;
588 if (Order().IsAscending()) {
589 const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
592 const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
597template <
class Order>
599 using detail::SharedTraits;
600 using detail::TraitsLane;
601 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
602 const size_t num_lanes = num_keys;
603 if (Order().IsAscending()) {
604 const SharedTraits<TraitsLane<detail::OrderAscendingKV64>> st;
607 const SharedTraits<TraitsLane<detail::OrderDescendingKV64>> st;
614template <
class Order,
typename KeyType>
617 const std::less<KeyType> less;
618 const std::greater<KeyType> greater;
620#if !HAVE_PARALLEL_IPS4O
625#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
627 return avx512_qsort<KeyType>(inout,
static_cast<int64_t
>(num));
632 return avx2::quicksort(inout,
static_cast<int>(num));
637 if (Order().IsAscending()) {
638 return ips4o::sort(inout, inout + num, less);
640 return ips4o::sort(inout, inout + num, greater);
644#if HAVE_PARALLEL_IPS4O
645 case Algo::kParallelIPS4O:
646 if (Order().IsAscending()) {
647 return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
649 return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
661 if (Order().IsAscending()) {
662 return boost::sort::pdqsort_branchless(inout, inout + num, less);
664 return boost::sort::pdqsort_branchless(inout, inout + num, greater);
669 case Algo::kVXSort: {
670#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
671 (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
672 fprintf(stderr,
"Do not call for target %s\n",
677 vxsort::vxsort<KeyType, vxsort::AVX512> vx;
679 vxsort::vxsort<KeyType, vxsort::AVX2> vx;
681 if (Order().IsAscending()) {
682 return vx.sort(inout, inout + num - 1);
684 fprintf(stderr,
"Skipping VX - does not support descending order\n");
692 if (Order().IsAscending()) {
693 return std::sort(inout, inout + num, less);
695 return std::sort(inout, inout + num, greater);
699 if (Order().IsAscending()) {
700 return std::partial_sort(inout, inout + k, inout + num, less);
702 return std::partial_sort(inout, inout + k, inout + num, greater);
706 if (Order().IsAscending()) {
707 return std::nth_element(inout, inout + k, inout + num, less);
709 return std::nth_element(inout, inout + k, inout + num, greater);
713 return VQSort(inout, num, Order());
719 return VQSelect(inout, num, k, Order());
722 return CallHeapSort<Order>(inout, num);
725 return CallHeapPartialSort<Order>(inout, num, k);
728 return CallHeapSelect<Order>(inout, num, k);
#define HWY_RESTRICT
Definition base.h:95
#define HWY_POP_ATTRIBUTES
Definition base.h:165
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_ABORT(format,...)
Definition base.h:233
#define HWY_INLINE
Definition base.h:101
#define HWY_PUSH_ATTRIBUTES(targets_str)
Definition base.h:164
Definition algo-inl.h:290
static void GenerateSeeds(DU64 du64, TFromD< DU64 > *HWY_RESTRICT seeds)
Definition algo-inl.h:301
static HWY_INLINE uint64_t SplitMix64(uint64_t z)
Definition algo-inl.h:291
static VU64 RandomBits(VU64 &state0, VU64 &state1)
Definition algo-inl.h:310
#define HWY_TARGET
Definition detect_targets.h:543
void HeapSort(Traits st, T *HWY_RESTRICT lanes, const size_t num_lanes)
Definition vqsort-inl.h:159
void HeapSelect(Traits st, T *HWY_RESTRICT lanes, const size_t num_lanes, const size_t select)
Definition vqsort-inl.h:179
void HeapPartialSort(Traits st, T *HWY_RESTRICT lanes, const size_t num_lanes, const size_t select)
Definition vqsort-inl.h:208
InputStats< T > GenerateInput(const Dist dist, T *v, size_t num)
Definition algo-inl.h:372
void CallHeapSort(KeyType *HWY_RESTRICT keys, const size_t num_keys)
Definition algo-inl.h:554
D d
Definition arm_sve-inl.h:1915
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
Vec< DU64 > MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t)
Definition algo-inl.h:352
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
void Run(Algo algo, KeyType *HWY_RESTRICT inout, size_t num, SharedState &shared, size_t, size_t k=0)
Definition algo-inl.h:615
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
ScalableTag< T, -1 > SortTag
Definition contrib/sort/shared-inl.h:146
void CallHeapSelect(KeyType *HWY_RESTRICT keys, const size_t num_keys, const size_t k)
Definition algo-inl.h:487
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
detail::OrderDescending< T > OtherOrder
Definition algo-inl.h:287
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
Vec< D > RandomValues(D d, VU64 &s0, VU64 &s1, const VU64 mask)
Definition algo-inl.h:322
void CallHeapPartialSort(KeyType *HWY_RESTRICT keys, const size_t num_keys, const size_t k)
Definition algo-inl.h:420
HWY_DLLEXPORT void TypeName(const TypeInfo &info, size_t N, char *string100)
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
static const char * DistName(Dist dist)
Definition algo-inl.h:113
Dist
Definition algo-inl.h:107
static std::vector< Dist > AllDist()
Definition algo-inl.h:109
static const char * AlgoName(Algo algo)
Definition algo-inl.h:207
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
static HWY_MAYBE_UNUSED const char * TargetName(int64_t target)
Definition targets.h:85
HWY_CONTRIB_DLLEXPORT void VQPartialSort(uint16_t *HWY_RESTRICT keys, const size_t n, const size_t k, SortAscending)
HWY_CONTRIB_DLLEXPORT void VQSort(uint16_t *HWY_RESTRICT keys, const size_t n, SortAscending)
Algo
Definition algo-inl.h:174
HWY_CONTRIB_DLLEXPORT void VQSelect(uint16_t *HWY_RESTRICT keys, const size_t n, const size_t k, SortAscending)
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition algo-inl.h:409
Definition ops/shared-inl.h:198
Definition traits-inl.h:635
Definition sorting_networks-inl.h:893
Definition traits-inl.h:652