diff options
| author | Schrodinger ZHU Yifan <yifanzhu@rochester.edu> | 2025-11-10 19:15:34 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-11-10 19:15:34 -0500 |
| commit | 8751f26a1bfac82703aaa36c61edf8771ef9e77f (patch) | |
| tree | fd70c2e66c7f5d3dfbbd737525acab57bcef6250 /libc/src/string | |
| parent | cd6c761b53435cfd1a1ead67b461fc07819602e6 (diff) | |
[libc] add an SVE implementation of strlen (#167259)
This PR creates an SVE-based implementation for strlen by translating
from the AOR code in tree. Microbenchmark shows improvements against
NEON when N>=64. Although both implementations fall behind glibc by a
large margin,
this may be a good start point to explore SVE implementations.
Together with the PR:
1. Added two more tests of strlen with special nul symbols.
2. Added strlen's fuzzer and fix a typo in previous heap fuzzer.
```
=== strlen(16 bytes) ===
libc: 1.56115 ns/call, 9.54499 GiB/s
neon: 1.59393 ns/call, 9.34867 GiB/s
sve: 1.66097 ns/call, 8.97134 GiB/s
=== strlen(64 bytes) ===
libc: 2.06967 ns/call, 28.7991 GiB/s
neon: 2.59914 ns/call, 22.9325 GiB/s
sve: 2.58628 ns/call, 23.0465 GiB/s
=== strlen(256 bytes) ===
libc: 3.74165 ns/call, 63.7202 GiB/s
neon: 8.98243 ns/call, 26.5428 GiB/s
sve: 7.36426 ns/call, 32.3751 GiB/s
=== strlen(1024 bytes) ===
libc: 10.5327 ns/call, 90.5438 GiB/s
neon: 34.363 ns/call, 27.7529 GiB/s
sve: 26.9329 ns/call, 35.4092 GiB/s
=== strlen(4096 bytes) ===
libc: 37.7304 ns/call, 101.104 GiB/s
neon: 145.911 ns/call, 26.144 GiB/s
sve: 103.208 ns/call, 36.9612 GiB/s
=== strlen(1048576 bytes) ===
libc: 9623.4 ns/call, 101.478 GiB/s
neon: 36138.2 ns/call, 27.023 GiB/s
sve: 26605.6 ns/call, 36.7051 GiB/s
```
Diffstat (limited to 'libc/src/string')
| -rw-r--r-- | libc/src/string/memory_utils/aarch64/inline_strlen.h | 63 |
1 files changed, 58 insertions, 5 deletions
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h index 87f5ccdd56e2..eafaca9776a4 100644 --- a/libc/src/string/memory_utils/aarch64/inline_strlen.h +++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h @@ -8,14 +8,13 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H +#include "src/__support/macros/properties/cpu_features.h" + #if defined(__ARM_NEON) #include "src/__support/CPP/bit.h" // countr_zero - #include <arm_neon.h> #include <stddef.h> // size_t - namespace LIBC_NAMESPACE_DECL { - namespace neon { [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t string_length(const char *src) { @@ -45,9 +44,63 @@ string_length(const char *src) { } } } // namespace neon +} // namespace LIBC_NAMESPACE_DECL +#endif // __ARM_NEON -namespace string_length_impl = neon; +#ifdef LIBC_TARGET_CPU_HAS_SVE +#include "src/__support/macros/optimization.h" +#include <arm_sve.h> +namespace LIBC_NAMESPACE_DECL { +namespace sve { +[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) { + const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src); + // Initialize the first-fault register to all true + svsetffr(); + const svbool_t all_true = svptrue_b8(); // all true predicate + svbool_t cmp_zero; + size_t len = 0; + for (;;) { + // Read a vector's worth of bytes, stopping on first fault. + svuint8_t data = svldff1_u8(all_true, &ptr[len]); + svbool_t fault_mask = svrdffr_z(all_true); + bool has_no_fault = svptest_last(all_true, fault_mask); + if (LIBC_LIKELY(has_no_fault)) { + // First fault did not fail: the whole vector is valid. + // Avoid depending on the contents of FFR beyond the branch. + len += svcntb(); // speculative increment + cmp_zero = svcmpeq_n_u8(all_true, data, 0); + bool has_no_zero = !svptest_any(all_true, cmp_zero); + if (LIBC_LIKELY(has_no_zero)) + continue; + len -= svcntb(); // undo speculative increment + break; + } else { + // First fault failed: only some of the vector is valid. + // Perform the comparison only on the valid bytes. + cmp_zero = svcmpeq_n_u8(fault_mask, data, 0); + bool has_zero = svptest_any(fault_mask, cmp_zero); + if (LIBC_LIKELY(has_zero)) + break; + svsetffr(); + len += svcntp_b8(all_true, fault_mask); + continue; + } + } + // Select the bytes before the first and count them. + svbool_t before_zero = svbrkb_z(all_true, cmp_zero); + len += svcntp_b8(all_true, before_zero); + return len; +} +} // namespace sve +} // namespace LIBC_NAMESPACE_DECL +#endif // LIBC_TARGET_CPU_HAS_SVE + +namespace LIBC_NAMESPACE_DECL { +#ifdef LIBC_TARGET_CPU_HAS_SVE +namespace string_length_impl = sve; +#elif defined(__ARM_NEON) +namespace string_length_impl = neon; +#endif } // namespace LIBC_NAMESPACE_DECL -#endif // __ARM_NEON #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H |
