summaryrefslogtreecommitdiff
path: root/libc/src/stdlib/quick_sort.h
diff options
context:
space:
mode:
Diffstat (limited to 'libc/src/stdlib/quick_sort.h')
-rw-r--r--libc/src/stdlib/quick_sort.h203
1 files changed, 147 insertions, 56 deletions
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 82b90a7d511d..9ab283025001 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,84 +9,175 @@
#ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
#define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
-#include "src/__support/macros/attributes.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/cstddef.h"
#include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-// A simple quicksort implementation using the Hoare partition scheme.
-LIBC_INLINE size_t partition(const Array &array) {
- const size_t array_size = array.size();
- size_t pivot_index = array_size / 2;
- uint8_t *pivot = array.get(pivot_index);
- size_t i = 0;
- size_t j = array_size - 1;
+// Branchless Lomuto partition based on the implementation by Lukas
+// Bergdoll and Orson Peters
+// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
+// Simplified to avoid having to stack allocate.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_lomuto_branchless(const A &array,
+ const void *pivot,
+ const F &is_less) {
+ const size_t array_len = array.len();
+
+ size_t left = 0;
+ size_t right = 0;
+
+ while (right < array_len) {
+ const bool right_is_lt = is_less(array.get(right), pivot);
+ array.swap(left, right);
+ left += static_cast<size_t>(right_is_lt);
+ right += 1;
+ }
+
+ return left;
+}
+
+// Optimized for large types that are expensive to move. Not optimized
+// for integers. It's possible to use a cyclic permutation here for
+// large types as done in ipnsort but the advantages of this are limited
+// as `is_less` is a small wrapper around a call to a function pointer
+// and won't incur much binary-size overhead. The other reason to use
+// cyclic permutation is to have more efficient swapping, but we don't
+// know the element size so this isn't applicable here either.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot,
+ const F &is_less) {
+ const size_t array_len = array.len();
+
+ size_t left = 0;
+ size_t right = array_len;
while (true) {
- int compare_i, compare_j;
-
- while ((compare_i = array.elem_compare(i, pivot)) < 0)
- ++i;
- while ((compare_j = array.elem_compare(j, pivot)) > 0)
- --j;
-
- // At some point i will crossover j so we will definitely break out of
- // this while loop.
- if (i >= j)
- return j + 1;
-
- array.swap(i, j);
-
- // The pivot itself might have got swapped so we will update the pivot.
- if (i == pivot_index) {
- pivot = array.get(j);
- pivot_index = j;
- } else if (j == pivot_index) {
- pivot = array.get(i);
- pivot_index = i;
+ while (left < right && is_less(array.get(left), pivot))
+ ++left;
+
+ while (true) {
+ --right;
+ if (left >= right || is_less(array.get(right), pivot)) {
+ break;
+ }
}
- if (compare_i == 0 && compare_j == 0) {
- // If we do not move the pointers, we will end up with an
- // infinite loop as i and j will be stuck without advancing.
- ++i;
- --j;
- }
+ if (left >= right)
+ break;
+
+ array.swap(left, right);
+ ++left;
+ }
+
+ return left;
+}
+
+template <typename A, typename F>
+LIBC_INLINE size_t partition(const A &array, size_t pivot_index,
+ const F &is_less) {
+ // Place the pivot at the beginning of the array.
+ if (pivot_index != 0) {
+ array.swap(0, pivot_index);
}
+
+ const A array_without_pivot = array.make_array(1, array.len() - 1);
+ const void *pivot = array.get(0);
+
+ size_t num_lt;
+ if constexpr (A::has_fixed_size()) {
+ // Branchless Lomuto avoid branch misprediction penalties, but
+ // it also swaps more often which is only faster if the swap is a fast
+ // constant operation.
+ num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
+ } else {
+ num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+ }
+
+ // Place the pivot between the two partitions.
+ array.swap(0, num_lt);
+
+ return num_lt;
}
-LIBC_INLINE void quick_sort(Array array) {
+template <typename A, typename F>
+LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot,
+ size_t limit, const F &is_less) {
while (true) {
- const size_t array_size = array.size();
- if (array_size <= 1)
+ const size_t array_len = array.len();
+ if (array_len <= 1)
return;
- size_t split_index = partition(array);
- if (array_size == 2)
- // The partition operation sorts the two element array.
+
+ // If too many bad pivot choices were made, simply fall back to
+ // heapsort in order to guarantee `O(N x log(N))` worst-case.
+ if (limit == 0) {
+ heap_sort(array, is_less);
return;
+ }
- // Make Arrays describing the two sublists that still need sorting.
- Array left = array.make_array(0, split_index);
- Array right = array.make_array(split_index, array.size() - split_index);
-
- // Recurse to sort the smaller of the two, and then loop round within this
- // function to sort the larger. This way, recursive call depth is bounded
- // by log2 of the total array size, because every recursive call is sorting
- // a list at most half the length of the one in its caller.
- if (left.size() < right.size()) {
- quick_sort(left);
- array.reset_bounds(right.get(0), right.size());
- } else {
- quick_sort(right);
- array.reset_bounds(left.get(0), left.size());
+ limit -= 1;
+
+ const size_t pivot_index = choose_pivot(array, is_less);
+
+ // If the chosen pivot is equal to the predecessor, then it's the smallest
+ // element in the slice. Partition the slice into elements equal to and
+ // elements greater than the pivot. This case is usually hit when the slice
+ // contains many duplicate elements.
+ if (ancestor_pivot) {
+ if (!is_less(ancestor_pivot, array.get(pivot_index))) {
+ const size_t num_lt =
+ partition(array, pivot_index,
+ [is_less](const void *a, const void *b) -> bool {
+ return !is_less(b, a);
+ });
+
+ // Continue sorting elements greater than the pivot. We know that
+ // `num_lt` cont
+ array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
+ ancestor_pivot = nullptr;
+ continue;
+ }
}
+
+ size_t split_index = partition(array, pivot_index, is_less);
+
+ if (array_len == 2)
+ // The partition operation sorts the two element array.
+ return;
+
+ // Split the array into `left`, `pivot`, and `right`.
+ A left = array.make_array(0, split_index);
+ const void *pivot = array.get(split_index);
+ const size_t right_start = split_index + 1;
+ A right = array.make_array(right_start, array.len() - right_start);
+
+ // Recurse into the left side. We have a fixed recursion limit,
+ // testing shows no real benefit for recursing into the shorter
+ // side.
+ quick_sort_impl(left, ancestor_pivot, limit, is_less);
+
+ // Continue with the right side.
+ array = right;
+ ancestor_pivot = pivot;
}
}
+constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
+
+template <typename A, typename F>
+LIBC_INLINE void quick_sort(A &array, const F &is_less) {
+ const void *ancestor_pivot = nullptr;
+ // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+ // The binary OR by one is used to eliminate the zero-check in the logarithm.
+ const size_t limit = 2 * ilog2((array.len() | 1));
+ quick_sort_impl(array, ancestor_pivot, limit, is_less);
+}
+
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL