diff options
| author | Peng Liu <winner245@hotmail.com> | 2025-03-04 17:15:36 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-03-04 17:15:36 -0500 |
| commit | a12744ff05bbc2d0de711afb8b3a1c7a03a33914 (patch) | |
| tree | f962ce00e4d60322b88e114d474b4f29cc5ca2ac /libcxx/include/__algorithm | |
| parent | b08769c3ec7ef486fd9b8a315856af53c5a58957 (diff) | |
[libc++] Optimize ranges::swap_ranges for vector<bool>::iterator (#121150)
This PR optimizes the performance of `std::ranges::swap_ranges` for
`vector<bool>::iterator`, addressing a subtask outlined in issue #64038.
The optimizations yield performance improvements of up to **611x** for
aligned range swap and **78x** for unaligned range swap comparison.
Additionally, comprehensive tests covering up to 4 storage words (256
bytes) with odd and even bit sizes are provided, which validate the
proposed optimizations in this patch.
Diffstat (limited to 'libcxx/include/__algorithm')
| -rw-r--r-- | libcxx/include/__algorithm/swap_ranges.h | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/libcxx/include/__algorithm/swap_ranges.h b/libcxx/include/__algorithm/swap_ranges.h index 54b453b72360..2731d4feaa63 100644 --- a/libcxx/include/__algorithm/swap_ranges.h +++ b/libcxx/include/__algorithm/swap_ranges.h @@ -10,9 +10,12 @@ #define _LIBCPP___ALGORITHM_SWAP_RANGES_H #include <__algorithm/iterator_operations.h> +#include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__utility/move.h> #include <__utility/pair.h> +#include <__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -23,6 +26,165 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD +template <class _Cl, class _Cr> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_aligned( + __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { + using _I1 = __bit_iterator<_Cl, false>; + using difference_type = typename _I1::difference_type; + using __storage_type = typename _I1::__storage_type; + + const int __bits_per_word = _I1::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) { + // do first word + if (__first.__ctz_ != 0) { + unsigned __clz = __bits_per_word - __first.__ctz_; + difference_type __dn = std::min(static_cast<difference_type>(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1; + *__first.__seg_ |= __b2; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_) + swap(*__first.__seg_, *__result.__seg_); + // do last word + if (__n > 0) { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1; + *__first.__seg_ |= __b2; + __result.__ctz_ = static_cast<unsigned>(__n); + } + } + return __result; +} + +template <class _Cl, class _Cr> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_unaligned( + __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { + using _I1 = __bit_iterator<_Cl, false>; + using difference_type = typename _I1::difference_type; + using __storage_type = typename _I1::__storage_type; + + const int __bits_per_word = _I1::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) { + // do first word + if (__first.__ctz_ != 0) { + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __first.__ctz_) { + unsigned __s = __result.__ctz_ - __first.__ctz_; + *__result.__seg_ |= __b1 << __s; + *__first.__seg_ |= __b2 >> __s; + } else { + unsigned __s = __first.__ctz_ - __result.__ctz_; + *__result.__seg_ |= __b1 >> __s; + *__first.__seg_ |= __b2 << __s; + } + __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + if (__dn > 0) { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + unsigned __s = __first.__ctz_ + __ddn; + *__result.__seg_ |= __b1 >> __s; + *__first.__seg_ |= __b2 << __s; + __result.__ctz_ = static_cast<unsigned>(__dn); + } + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + __storage_type __m = ~__storage_type(0) << __result.__ctz_; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) { + __storage_type __b1 = *__first.__seg_; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 << __result.__ctz_; + *__first.__seg_ = __b2 >> __result.__ctz_; + ++__result.__seg_; + __b2 = *__result.__seg_ & ~__m; + *__result.__seg_ &= __m; + *__result.__seg_ |= __b1 >> __clz_r; + *__first.__seg_ |= __b2 << __clz_r; + } + // do last word + if (__n > 0) { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __dn = std::min<__storage_type>(__n, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 << __result.__ctz_; + *__first.__seg_ |= __b2 >> __result.__ctz_; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 >> __dn; + *__first.__seg_ |= __b2 << __dn; + __result.__ctz_ = static_cast<unsigned>(__n); + } + } + } + return __result; +} + +// 2+1 iterators: size2 >= size1; used by std::swap_ranges. +template <class, class _Cl, class _Cr> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> > +__swap_ranges(__bit_iterator<_Cl, false> __first1, + __bit_iterator<_Cl, false> __last1, + __bit_iterator<_Cr, false> __first2) { + if (__first1.__ctz_ == __first2.__ctz_) + return std::make_pair(__last1, std::__swap_ranges_aligned(__first1, __last1, __first2)); + return std::make_pair(__last1, std::__swap_ranges_unaligned(__first1, __last1, __first2)); +} + +// 2+2 iterators: used by std::ranges::swap_ranges. +template <class _AlgPolicy, class _Cl, class _Cr> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> > +__swap_ranges(__bit_iterator<_Cl, false> __first1, + __bit_iterator<_Cl, false> __last1, + __bit_iterator<_Cr, false> __first2, + __bit_iterator<_Cr, false> __last2) { + if (__last1 - __first1 < __last2 - __first2) + return std::make_pair(__last1, std::__swap_ranges<_AlgPolicy>(__first1, __last1, __first2).second); + return std::make_pair(std::__swap_ranges<_AlgPolicy>(__first2, __last2, __first1).second, __last2); +} + // 2+2 iterators: the shorter size will be used. template <class _AlgPolicy, class _ForwardIterator1, class _Sentinel1, class _ForwardIterator2, class _Sentinel2> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator1, _ForwardIterator2> |
