diff options
| author | Peng Liu <winner245@hotmail.com> | 2025-03-04 17:15:36 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-03-04 17:15:36 -0500 |
| commit | a12744ff05bbc2d0de711afb8b3a1c7a03a33914 (patch) | |
| tree | f962ce00e4d60322b88e114d474b4f29cc5ca2ac /libcxx/include/__bit_reference | |
| parent | b08769c3ec7ef486fd9b8a315856af53c5a58957 (diff) | |
[libc++] Optimize ranges::swap_ranges for vector<bool>::iterator (#121150)
This PR optimizes the performance of `std::ranges::swap_ranges` for
`vector<bool>::iterator`, addressing a subtask outlined in issue #64038.
The optimizations yield performance improvements of up to **611x** for
aligned range swap and **78x** for unaligned range swap comparison.
Additionally, comprehensive tests covering up to 4 storage words (256
bytes) with odd and even bit sizes are provided, which validate the
proposed optimizations in this patch.
Diffstat (limited to 'libcxx/include/__bit_reference')
| -rw-r--r-- | libcxx/include/__bit_reference | 157 |
1 files changed, 6 insertions, 151 deletions
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index 079d1de9dd52..f91250c4a440 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -16,6 +16,7 @@ #include <__algorithm/copy_n.h> #include <__algorithm/equal.h> #include <__algorithm/min.h> +#include <__algorithm/swap_ranges.h> #include <__assert> #include <__bit/countr.h> #include <__compare/ordering.h> @@ -215,152 +216,6 @@ private: __mask_(__m) {} }; -// swap_ranges - -template <class _Cl, class _Cr> -_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> __swap_ranges_aligned( - __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { - using _I1 = __bit_iterator<_Cl, false>; - using difference_type = typename _I1::difference_type; - using __storage_type = typename _I1::__storage_type; - - const int __bits_per_word = _I1::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = std::min(static_cast<difference_type>(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1; - *__first.__seg_ |= __b2; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_) - swap(*__first.__seg_, *__result.__seg_); - // do last word - if (__n > 0) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1; - *__first.__seg_ |= __b2; - __result.__ctz_ = static_cast<unsigned>(__n); - } - } - return __result; -} - -template <class _Cl, class _Cr> -_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> __swap_ranges_unaligned( - __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) { - using _I1 = __bit_iterator<_Cl, false>; - using difference_type = typename _I1::difference_type; - using __storage_type = typename _I1::__storage_type; - - const int __bits_per_word = _I1::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) { - unsigned __clz_f = __bits_per_word - __first.__ctz_; - difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __first.__ctz_) { - unsigned __s = __result.__ctz_ - __first.__ctz_; - *__result.__seg_ |= __b1 << __s; - *__first.__seg_ |= __b2 >> __s; - } else { - unsigned __s = __first.__ctz_ - __result.__ctz_; - *__result.__seg_ |= __b1 >> __s; - *__first.__seg_ |= __b2 << __s; - } - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - unsigned __s = __first.__ctz_ + __ddn; - *__result.__seg_ |= __b1 >> __s; - *__first.__seg_ |= __b2 << __s; - __result.__ctz_ = static_cast<unsigned>(__dn); - } - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - __storage_type __m = ~__storage_type(0) << __result.__ctz_; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) { - __storage_type __b1 = *__first.__seg_; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 << __result.__ctz_; - *__first.__seg_ = __b2 >> __result.__ctz_; - ++__result.__seg_; - __b2 = *__result.__seg_ & ~__m; - *__result.__seg_ &= __m; - *__result.__seg_ |= __b1 >> __clz_r; - *__first.__seg_ |= __b2 << __clz_r; - } - // do last word - if (__n > 0) { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __dn = std::min<__storage_type>(__n, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 << __result.__ctz_; - *__first.__seg_ |= __b2 >> __result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 >> __dn; - *__first.__seg_ |= __b2 << __dn; - __result.__ctz_ = static_cast<unsigned>(__n); - } - } - } - return __result; -} - -template <class _Cl, class _Cr> -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> swap_ranges( - __bit_iterator<_Cl, false> __first1, __bit_iterator<_Cl, false> __last1, __bit_iterator<_Cr, false> __first2) { - if (__first1.__ctz_ == __first2.__ctz_) - return std::__swap_ranges_aligned(__first1, __last1, __first2); - return std::__swap_ranges_unaligned(__first1, __last1, __first2); -} - // rotate template <class _Cp> @@ -644,14 +499,14 @@ private: template <class _AlgPolicy> friend struct __copy_backward_impl; template <class _Cl, class _Cr> - friend __bit_iterator<_Cr, false> + _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Cr, false> __swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); template <class _Cl, class _Cr> - friend __bit_iterator<_Cr, false> + _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Cr, false> __swap_ranges_unaligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); - template <class _Cl, class _Cr> - friend __bit_iterator<_Cr, false> - swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); + template <class, class _Cl, class _Cr> + _LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> > + __swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); template <class _Dp> _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>); |
