summaryrefslogtreecommitdiff
path: root/libcxx/include/__algorithm
diff options
context:
space:
mode:
authorPeng Liu <winner245@hotmail.com>2025-03-04 17:15:36 -0500
committerGitHub <noreply@github.com>2025-03-04 17:15:36 -0500
commita12744ff05bbc2d0de711afb8b3a1c7a03a33914 (patch)
treef962ce00e4d60322b88e114d474b4f29cc5ca2ac /libcxx/include/__algorithm
parentb08769c3ec7ef486fd9b8a315856af53c5a58957 (diff)
[libc++] Optimize ranges::swap_ranges for vector<bool>::iterator (#121150)
This PR optimizes the performance of `std::ranges::swap_ranges` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to **611x** for aligned range swap and **78x** for unaligned range swap comparison. Additionally, comprehensive tests covering up to 4 storage words (256 bytes) with odd and even bit sizes are provided, which validate the proposed optimizations in this patch.
Diffstat (limited to 'libcxx/include/__algorithm')
-rw-r--r--libcxx/include/__algorithm/swap_ranges.h162
1 files changed, 162 insertions, 0 deletions
diff --git a/libcxx/include/__algorithm/swap_ranges.h b/libcxx/include/__algorithm/swap_ranges.h
index 54b453b72360..2731d4feaa63 100644
--- a/libcxx/include/__algorithm/swap_ranges.h
+++ b/libcxx/include/__algorithm/swap_ranges.h
@@ -10,9 +10,12 @@
#define _LIBCPP___ALGORITHM_SWAP_RANGES_H
#include <__algorithm/iterator_operations.h>
+#include <__algorithm/min.h>
#include <__config>
+#include <__fwd/bit_reference.h>
#include <__utility/move.h>
#include <__utility/pair.h>
+#include <__utility/swap.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -23,6 +26,165 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
+template <class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_aligned(
+ __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
+ using _I1 = __bit_iterator<_Cl, false>;
+ using difference_type = typename _I1::difference_type;
+ using __storage_type = typename _I1::__storage_type;
+
+ const int __bits_per_word = _I1::__bits_per_word;
+ difference_type __n = __last - __first;
+ if (__n > 0) {
+ // do first word
+ if (__first.__ctz_ != 0) {
+ unsigned __clz = __bits_per_word - __first.__ctz_;
+ difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
+ __n -= __dn;
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __b1 = *__first.__seg_ & __m;
+ *__first.__seg_ &= ~__m;
+ __storage_type __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b1;
+ *__first.__seg_ |= __b2;
+ __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+ __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+ ++__first.__seg_;
+ // __first.__ctz_ = 0;
+ }
+ // __first.__ctz_ == 0;
+ // do middle words
+ for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_)
+ swap(*__first.__seg_, *__result.__seg_);
+ // do last word
+ if (__n > 0) {
+ __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ __storage_type __b1 = *__first.__seg_ & __m;
+ *__first.__seg_ &= ~__m;
+ __storage_type __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b1;
+ *__first.__seg_ |= __b2;
+ __result.__ctz_ = static_cast<unsigned>(__n);
+ }
+ }
+ return __result;
+}
+
+template <class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_unaligned(
+ __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
+ using _I1 = __bit_iterator<_Cl, false>;
+ using difference_type = typename _I1::difference_type;
+ using __storage_type = typename _I1::__storage_type;
+
+ const int __bits_per_word = _I1::__bits_per_word;
+ difference_type __n = __last - __first;
+ if (__n > 0) {
+ // do first word
+ if (__first.__ctz_ != 0) {
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
+ difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
+ __n -= __dn;
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+ __storage_type __b1 = *__first.__seg_ & __m;
+ *__first.__seg_ &= ~__m;
+ unsigned __clz_r = __bits_per_word - __result.__ctz_;
+ __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
+ __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
+ __storage_type __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ if (__result.__ctz_ > __first.__ctz_) {
+ unsigned __s = __result.__ctz_ - __first.__ctz_;
+ *__result.__seg_ |= __b1 << __s;
+ *__first.__seg_ |= __b2 >> __s;
+ } else {
+ unsigned __s = __first.__ctz_ - __result.__ctz_;
+ *__result.__seg_ |= __b1 >> __s;
+ *__first.__seg_ |= __b2 << __s;
+ }
+ __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
+ __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
+ __dn -= __ddn;
+ if (__dn > 0) {
+ __m = ~__storage_type(0) >> (__bits_per_word - __dn);
+ __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ unsigned __s = __first.__ctz_ + __ddn;
+ *__result.__seg_ |= __b1 >> __s;
+ *__first.__seg_ |= __b2 << __s;
+ __result.__ctz_ = static_cast<unsigned>(__dn);
+ }
+ ++__first.__seg_;
+ // __first.__ctz_ = 0;
+ }
+ // __first.__ctz_ == 0;
+ // do middle words
+ __storage_type __m = ~__storage_type(0) << __result.__ctz_;
+ unsigned __clz_r = __bits_per_word - __result.__ctz_;
+ for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
+ __storage_type __b1 = *__first.__seg_;
+ __storage_type __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b1 << __result.__ctz_;
+ *__first.__seg_ = __b2 >> __result.__ctz_;
+ ++__result.__seg_;
+ __b2 = *__result.__seg_ & ~__m;
+ *__result.__seg_ &= __m;
+ *__result.__seg_ |= __b1 >> __clz_r;
+ *__first.__seg_ |= __b2 << __clz_r;
+ }
+ // do last word
+ if (__n > 0) {
+ __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ __storage_type __b1 = *__first.__seg_ & __m;
+ *__first.__seg_ &= ~__m;
+ __storage_type __dn = std::min<__storage_type>(__n, __clz_r);
+ __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
+ __storage_type __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b1 << __result.__ctz_;
+ *__first.__seg_ |= __b2 >> __result.__ctz_;
+ __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+ __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+ __n -= __dn;
+ if (__n > 0) {
+ __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ __b2 = *__result.__seg_ & __m;
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __b1 >> __dn;
+ *__first.__seg_ |= __b2 << __dn;
+ __result.__ctz_ = static_cast<unsigned>(__n);
+ }
+ }
+ }
+ return __result;
+}
+
+// 2+1 iterators: size2 >= size1; used by std::swap_ranges.
+template <class, class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
+__swap_ranges(__bit_iterator<_Cl, false> __first1,
+ __bit_iterator<_Cl, false> __last1,
+ __bit_iterator<_Cr, false> __first2) {
+ if (__first1.__ctz_ == __first2.__ctz_)
+ return std::make_pair(__last1, std::__swap_ranges_aligned(__first1, __last1, __first2));
+ return std::make_pair(__last1, std::__swap_ranges_unaligned(__first1, __last1, __first2));
+}
+
+// 2+2 iterators: used by std::ranges::swap_ranges.
+template <class _AlgPolicy, class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
+__swap_ranges(__bit_iterator<_Cl, false> __first1,
+ __bit_iterator<_Cl, false> __last1,
+ __bit_iterator<_Cr, false> __first2,
+ __bit_iterator<_Cr, false> __last2) {
+ if (__last1 - __first1 < __last2 - __first2)
+ return std::make_pair(__last1, std::__swap_ranges<_AlgPolicy>(__first1, __last1, __first2).second);
+ return std::make_pair(std::__swap_ranges<_AlgPolicy>(__first2, __last2, __first1).second, __last2);
+}
+
// 2+2 iterators: the shorter size will be used.
template <class _AlgPolicy, class _ForwardIterator1, class _Sentinel1, class _ForwardIterator2, class _Sentinel2>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator1, _ForwardIterator2>