1 files changed, 147 insertions, 56 deletions
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 82b90a7d511d..9ab283025001 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,84 +9,175 @@
 #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 
-#include "src/__support/macros/attributes.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/cstddef.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-// A simple quicksort implementation using the Hoare partition scheme.
-LIBC_INLINE size_t partition(const Array &array) {
-  const size_t array_size = array.size();
-  size_t pivot_index = array_size / 2;
-  uint8_t *pivot = array.get(pivot_index);
-  size_t i = 0;
-  size_t j = array_size - 1;
+// Branchless Lomuto partition based on the implementation by Lukas
+// Bergdoll and Orson Peters
+// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
+// Simplified to avoid having to stack allocate.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_lomuto_branchless(const A &array,
+                                               const void *pivot,
+                                               const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = 0;
+
+  while (right < array_len) {
+    const bool right_is_lt = is_less(array.get(right), pivot);
+    array.swap(left, right);
+    left += static_cast<size_t>(right_is_lt);
+    right += 1;
+  }
+
+  return left;
+}
+
+// Optimized for large types that are expensive to move. Not optimized
+// for integers. It's possible to use a cyclic permutation here for
+// large types as done in ipnsort but the advantages of this are limited
+// as `is_less` is a small wrapper around a call to a function pointer
+// and won't incur much binary-size overhead. The other reason to use
+// cyclic permutation is to have more efficient swapping, but we don't
+// know the element size so this isn't applicable here either.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot,
+                                           const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = array_len;
 
   while (true) {
-    int compare_i, compare_j;
-
-    while ((compare_i = array.elem_compare(i, pivot)) < 0)
-      ++i;
-    while ((compare_j = array.elem_compare(j, pivot)) > 0)
-      --j;
-
-    // At some point i will crossover j so we will definitely break out of
-    // this while loop.
-    if (i >= j)
-      return j + 1;
-
-    array.swap(i, j);
-
-    // The pivot itself might have got swapped so we will update the pivot.
-    if (i == pivot_index) {
-      pivot = array.get(j);
-      pivot_index = j;
-    } else if (j == pivot_index) {
-      pivot = array.get(i);
-      pivot_index = i;
+    while (left < right && is_less(array.get(left), pivot))
+      ++left;
+
+    while (true) {
+      --right;
+      if (left >= right || is_less(array.get(right), pivot)) {
+        break;
+      }
     }
 
-    if (compare_i == 0 && compare_j == 0) {
-      // If we do not move the pointers, we will end up with an
-      // infinite loop as i and j will be stuck without advancing.
-      ++i;
-      --j;
-    }
+    if (left >= right)
+      break;
+
+    array.swap(left, right);
+    ++left;
+  }
+
+  return left;
+}
+
+template <typename A, typename F>
+LIBC_INLINE size_t partition(const A &array, size_t pivot_index,
+                             const F &is_less) {
+  // Place the pivot at the beginning of the array.
+  if (pivot_index != 0) {
+    array.swap(0, pivot_index);
   }
+
+  const A array_without_pivot = array.make_array(1, array.len() - 1);
+  const void *pivot = array.get(0);
+
+  size_t num_lt;
+  if constexpr (A::has_fixed_size()) {
+    // Branchless Lomuto avoid branch misprediction penalties, but
+    // it also swaps more often which is only faster if the swap is a fast
+    // constant operation.
+    num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
+  } else {
+    num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+  }
+
+  // Place the pivot between the two partitions.
+  array.swap(0, num_lt);
+
+  return num_lt;
 }
 
-LIBC_INLINE void quick_sort(Array array) {
+template <typename A, typename F>
+LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot,
+                                 size_t limit, const F &is_less) {
   while (true) {
-    const size_t array_size = array.size();
-    if (array_size <= 1)
+    const size_t array_len = array.len();
+    if (array_len <= 1)
       return;
-    size_t split_index = partition(array);
-    if (array_size == 2)
-      // The partition operation sorts the two element array.
+
+    // If too many bad pivot choices were made, simply fall back to
+    // heapsort in order to guarantee `O(N x log(N))` worst-case.
+    if (limit == 0) {
+      heap_sort(array, is_less);
       return;
+    }
 
-    // Make Arrays describing the two sublists that still need sorting.
-    Array left = array.make_array(0, split_index);
-    Array right = array.make_array(split_index, array.size() - split_index);
-
-    // Recurse to sort the smaller of the two, and then loop round within this
-    // function to sort the larger. This way, recursive call depth is bounded
-    // by log2 of the total array size, because every recursive call is sorting
-    // a list at most half the length of the one in its caller.
-    if (left.size() < right.size()) {
-      quick_sort(left);
-      array.reset_bounds(right.get(0), right.size());
-    } else {
-      quick_sort(right);
-      array.reset_bounds(left.get(0), left.size());
+    limit -= 1;
+
+    const size_t pivot_index = choose_pivot(array, is_less);
+
+    // If the chosen pivot is equal to the predecessor, then it's the smallest
+    // element in the slice. Partition the slice into elements equal to and
+    // elements greater than the pivot. This case is usually hit when the slice
+    // contains many duplicate elements.
+    if (ancestor_pivot) {
+      if (!is_less(ancestor_pivot, array.get(pivot_index))) {
+        const size_t num_lt =
+            partition(array, pivot_index,
+                      [is_less](const void *a, const void *b) -> bool {
+                        return !is_less(b, a);
+                      });
+
+        // Continue sorting elements greater than the pivot. We know that
+        // `num_lt` cont
+        array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
+        ancestor_pivot = nullptr;
+        continue;
+      }
     }
+
+    size_t split_index = partition(array, pivot_index, is_less);
+
+    if (array_len == 2)
+      // The partition operation sorts the two element array.
+      return;
+
+    // Split the array into `left`, `pivot`, and `right`.
+    A left = array.make_array(0, split_index);
+    const void *pivot = array.get(split_index);
+    const size_t right_start = split_index + 1;
+    A right = array.make_array(right_start, array.len() - right_start);
+
+    // Recurse into the left side. We have a fixed recursion limit,
+    // testing shows no real benefit for recursing into the shorter
+    // side.
+    quick_sort_impl(left, ancestor_pivot, limit, is_less);
+
+    // Continue with the right side.
+    array = right;
+    ancestor_pivot = pivot;
   }
 }
 
+constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
+
+template <typename A, typename F>
+LIBC_INLINE void quick_sort(A &array, const F &is_less) {
+  const void *ancestor_pivot = nullptr;
+  // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+  // The binary OR by one is used to eliminate the zero-check in the logarithm.
+  const size_t limit = 2 * ilog2((array.len() | 1));
+  quick_sort_impl(array, ancestor_pivot, limit, is_less);
+}
+
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL