summary | refs | log | tree | commit | diff
path: root/libcxx/include/__algorithm
diff options
context:
space:
mode:
author: Peng Liu <winner245@hotmail.com> 2025-10-16 19:41:24 -0400
committer: GitHub <noreply@github.com> 2025-10-17 07:41:24 +0800
commit: d0cee6939a1a889b14dcfc0bb049f38063c81f9b (patch)
tree: 5a9201593a8ba86c8adfefcfd79388881a64398f /libcxx/include/__algorithm
parent: 65c895dfe084860847e9e220ff9f1b283ebcb289 (diff)
[libc++] Optimize std::{,ranges}::{fill,fill_n} for segmented iterators (#132665)
This patch optimizes `std::fill`, `std::fill_n`, `std::ranges::fill`, and `std::ranges::fill_n` for segmented iterators, achieving substantial performance improvements. Specifically, for `deque<int>` iterators, the performance improvements are above 10x for all these algorithms. The optimization also enables filling segmented memory of `deque<int>` to approach the performance of filling contiguous memory of `vector<int>`.

Benchmark results comparing the before and after implementations are provided below. For additional context, we’ve included `vector<int>` results, which remain unchanged, as this patch specifically targets segmented iterators and leaves non-segmented iterator behavior untouched.

Fixes two subtasks outlined in #102817.

#### `fill_n`

```
-----------------------------------------------------------------------------
Benchmark                                 Before        After      Speedup
-----------------------------------------------------------------------------
std::fill_n(deque<int>)/32               11.4 ns      2.28 ns         5.0x
std::fill_n(deque<int>)/50               19.7 ns      3.40 ns         5.8x
std::fill_n(deque<int>)/1024              391 ns      37.3 ns        10.5x
std::fill_n(deque<int>)/8192             3174 ns       301 ns        10.5x
std::fill_n(deque<int>)/65536           26504 ns      2951 ns         9.0x
std::fill_n(deque<int>)/1048576        407960 ns     80658 ns         5.1x
rng::fill_n(deque<int>)/32               14.3 ns      2.15 ns         6.6x
rng::fill_n(deque<int>)/50               20.2 ns      3.22 ns         6.3x
rng::fill_n(deque<int>)/1024              381 ns      37.8 ns        10.1x
rng::fill_n(deque<int>)/8192             3101 ns       294 ns        10.5x
rng::fill_n(deque<int>)/65536           25098 ns      2926 ns         8.6x
rng::fill_n(deque<int>)/1048576        394342 ns     78874 ns         5.0x
std::fill_n(vector<int>)/32              1.76 ns      1.72 ns         1.0x
std::fill_n(vector<int>)/50              3.00 ns      2.73 ns         1.1x
std::fill_n(vector<int>)/1024            38.4 ns      37.9 ns         1.0x
std::fill_n(vector<int>)/8192             258 ns       252 ns         1.0x
std::fill_n(vector<int>)/65536           2993 ns      2889 ns         1.0x
std::fill_n(vector<int>)/1048576        80328 ns     80468 ns         1.0x
rng::fill_n(vector<int>)/32              1.99 ns      1.35 ns         1.5x
rng::fill_n(vector<int>)/50              2.66 ns      2.12 ns         1.3x
rng::fill_n(vector<int>)/1024            37.7 ns      35.8 ns         1.1x
rng::fill_n(vector<int>)/8192             253 ns       250 ns         1.0x
rng::fill_n(vector<int>)/65536           2922 ns      2930 ns         1.0x
rng::fill_n(vector<int>)/1048576        79739 ns     79742 ns         1.0x
```

#### `fill`

```
--------------------------------------------------------------------------
Benchmark                               Before        After      Speedup
--------------------------------------------------------------------------
std::fill(deque<int>)/32               13.7 ns      2.45 ns         5.6x
std::fill(deque<int>)/50               21.7 ns      4.57 ns         4.7x
std::fill(deque<int>)/1024              367 ns      38.5 ns         9.5x
std::fill(deque<int>)/8192             2896 ns       247 ns        11.7x
std::fill(deque<int>)/65536           23723 ns      2907 ns         8.2x
std::fill(deque<int>)/1048576        379043 ns     79885 ns         4.7x
rng::fill(deque<int>)/32               13.6 ns      2.70 ns         5.0x
rng::fill(deque<int>)/50               23.4 ns      3.94 ns         5.9x
rng::fill(deque<int>)/1024              377 ns      37.9 ns         9.9x
rng::fill(deque<int>)/8192             2914 ns       286 ns        10.2x
rng::fill(deque<int>)/65536           23612 ns      2939 ns         8.0x
rng::fill(deque<int>)/1048576        379841 ns     80079 ns         4.7x
std::fill(vector<int>)/32              1.99 ns      1.79 ns         1.1x
std::fill(vector<int>)/50              3.05 ns      3.06 ns         1.0x
std::fill(vector<int>)/1024            37.6 ns      38.0 ns         1.0x
std::fill(vector<int>)/8192             255 ns       257 ns         1.0x
std::fill(vector<int>)/65536           2966 ns      2981 ns         1.0x
std::fill(vector<int>)/1048576        78300 ns     80348 ns         1.0x
rng::fill(vector<int>)/32              1.77 ns      1.75 ns         1.0x
rng::fill(vector<int>)/50              4.85 ns      2.31 ns         2.1x
rng::fill(vector<int>)/1024            39.6 ns      36.1 ns         1.1x
rng::fill(vector<int>)/8192             238 ns       251 ns         0.9x
rng::fill(vector<int>)/65536           2941 ns      2918 ns         1.0x
rng::fill(vector<int>)/1048576        80497 ns     80442 ns         1.0x
```

---------

Co-authored-by: Louis Dionne <ldionne.2@gmail.com>
Co-authored-by: A. Jiang <de34@live.cn>
Diffstat (limited to 'libcxx/include/__algorithm')
-rw-r--r-- libcxx/include/__algorithm/fill.h | 36
-rw-r--r-- libcxx/include/__algorithm/fill_n.h | 48
-rw-r--r-- libcxx/include/__algorithm/ranges_fill.h | 13
3 files changed, 73 insertions, 24 deletions
diff --git a/libcxx/include/__algorithm/fill.h b/libcxx/include/__algorithm/fill.h
index 1ce3eadb013d..328ebb663376 100644
--- a/libcxx/include/__algorithm/fill.h
+++ b/libcxx/include/__algorithm/fill.h
@@ -10,8 +10,11 @@
#define _LIBCPP___ALGORITHM_FILL_H
#include <__algorithm/fill_n.h>
+#include <__algorithm/for_each_segment.h>
#include <__config>
#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -21,23 +24,40 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
-template <class _ForwardIterator, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, forward_iterator_tag) {
+template <class _ForwardIterator, class _Sentinel, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__fill(_ForwardIterator __first, _Sentinel __last, const _Tp& __value) {
for (; __first != __last; ++__first)
*__first = __value;
+ return __first;
}
-template <class _RandomAccessIterator, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value, random_access_iterator_tag) {
- std::fill_n(__first, __last - __first, __value);
+template <class _RandomAccessIterator,
+ class _Tp,
+ __enable_if_t<__has_random_access_iterator_category<_RandomAccessIterator>::value &&
+ !__is_segmented_iterator_v<_RandomAccessIterator>,
+ int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
+__fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value) {
+ return std::__fill_n(__first, __last - __first, __value);
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+template <class _SegmentedIterator, class _Tp, __enable_if_t<__is_segmented_iterator_v<_SegmentedIterator>, int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+_SegmentedIterator __fill(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value) {
+ using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
+ std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
+ std::__fill(__lfirst, __llast, __value);
+ });
+ return __last;
}
+#endif // !_LIBCPP_CXX03_LANG
template <class _ForwardIterator, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
- std::__fill(__first, __last, __value, typename iterator_traits<_ForwardIterator>::iterator_category());
+ std::__fill(__first, __last, __value);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 0da78e1f38c4..2bfacf3178c4 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -9,10 +9,17 @@
#ifndef _LIBCPP___ALGORITHM_FILL_N_H
#define _LIBCPP___ALGORITHM_FILL_N_H
+#include <__algorithm/for_each_n_segment.h>
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
+#include <__type_traits/conjunction.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/integral_constant.h>
+#include <__type_traits/negation.h>
#include <__utility/convert_to_integral.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,9 +33,38 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
-template <class _OutputIterator, class _Size, class _Tp>
+template <class _OutputIterator,
+ class _Size,
+ class _Tp
+#ifndef _LIBCPP_CXX03_LANG
+ ,
+ __enable_if_t<!_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>,
+ __has_random_access_local_iterator<_OutputIterator>>::value,
+ int> = 0
+#endif
+ >
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
+ for (; __n > 0; ++__first, (void)--__n)
+ *__first = __value;
+ return __first;
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+template < class _OutputIterator,
+ class _Size,
+ class _Tp,
+ __enable_if_t<_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>,
+ __has_random_access_local_iterator<_OutputIterator>>::value,
+ int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX14 _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
+ using __local_iterator_t = typename __segmented_iterator_traits<_OutputIterator>::__local_iterator;
+ return std::__for_each_n_segment(__first, __n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
+ std::__fill_n(__lfirst, __llast - __lfirst, __value);
+ });
+}
+#endif // !_LIBCPP_CXX03_LANG
template <bool _FillVal, class _Cp>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
@@ -70,14 +106,6 @@ __fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
template <class _OutputIterator, class _Size, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
- for (; __n > 0; ++__first, (void)--__n)
- *__first = __value;
- return __first;
-}
-
-template <class _OutputIterator, class _Size, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
return std::__fill_n(__first, std::__convert_to_integral(__n), __value);
}
diff --git a/libcxx/include/__algorithm/ranges_fill.h b/libcxx/include/__algorithm/ranges_fill.h
index c248009f98fe..814ae6363fcf 100644
--- a/libcxx/include/__algorithm/ranges_fill.h
+++ b/libcxx/include/__algorithm/ranges_fill.h
@@ -9,12 +9,14 @@
#ifndef _LIBCPP___ALGORITHM_RANGES_FILL_H
#define _LIBCPP___ALGORITHM_RANGES_FILL_H
-#include <__algorithm/ranges_fill_n.h>
+#include <__algorithm/fill.h>
+#include <__algorithm/fill_n.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__ranges/access.h>
#include <__ranges/concepts.h>
#include <__ranges/dangling.h>
+#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -31,12 +33,11 @@ namespace ranges {
struct __fill {
template <class _Type, output_iterator<const _Type&> _Iter, sentinel_for<_Iter> _Sent>
_LIBCPP_HIDE_FROM_ABI constexpr _Iter operator()(_Iter __first, _Sent __last, const _Type& __value) const {
- if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
- return ranges::fill_n(__first, __last - __first, __value);
+ if constexpr (sized_sentinel_for<_Sent, _Iter>) {
+ auto __n = __last - __first;
+ return std::__fill_n(std::move(__first), __n, __value);
} else {
- for (; __first != __last; ++__first)
- *__first = __value;
- return __first;
+ return std::__fill(std::move(__first), std::move(__last), __value);
}
}