//===-- Memset implementation for arm ---------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // The functions defined in this file give approximate code size. These sizes // assume the following configuration options: // - LIBC_CONF_KEEP_FRAME_POINTER = false // - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false // - LIBC_ADD_NULL_CHECKS = false #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H #include "src/__support/CPP/type_traits.h" // always_false #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align #include // size_t namespace LIBC_NAMESPACE_DECL { namespace { template LIBC_INLINE void set(void *dst, uint32_t value) { static_assert(bytes == 1 || bytes == 2 || bytes == 4); if constexpr (access == AssumeAccess::kAligned) { constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; memcpy_inline(assume_aligned(dst), &value); } else if constexpr (access == AssumeAccess::kUnknown) { memcpy_inline(dst, &value); } else { static_assert(cpp::always_false, "Invalid AssumeAccess"); } } template LIBC_INLINE void set_block_and_bump_pointers(Ptr &dst, uint32_t value) { if constexpr (bytes <= kWordSize) { set(dst, value); } else { static_assert(bytes % kWordSize == 0 && bytes >= kWordSize); LIBC_LOOP_UNROLL for (size_t offset = 0; offset < bytes; offset += kWordSize) { set(dst + offset, value); } } // In the 1, 2, 4 byte set case, the compiler can fold pointer offsetting // into the store instructions. // e.g., // strb r3, [r0], #1 dst += bytes; } template LIBC_INLINE void consume_by_block(Ptr &dst, uint32_t value, size_t &size) { LIBC_LOOP_NOUNROLL for (size_t i = 0; i < size / bytes; ++i) set_block_and_bump_pointers(dst, value); size %= bytes; } [[maybe_unused]] LIBC_INLINE void set_bytes_and_bump_pointers(Ptr &dst, uint32_t value, size_t size) { LIBC_LOOP_NOUNROLL for (size_t i = 0; i < size; ++i) { set<1, AssumeAccess::kUnknown>(dst++, value); } } } // namespace // Implementation for Cortex-M0, M0+, M1. It compiles down to 140 bytes when // used through `memset` that also needs to return the `dst` ptr. These cores do // not allow unaligned stores so all accesses are aligned. [[maybe_unused]] LIBC_INLINE void inline_memset_arm_low_end(Ptr dst, uint8_t value, size_t size) { if (size >= 8) LIBC_ATTR_LIKELY { // Align `dst` to word boundary. if (const size_t offset = distance_to_align_up(dst)) LIBC_ATTR_UNLIKELY { set_bytes_and_bump_pointers(dst, value, offset); size -= offset; } const uint32_t value32 = value * 0x01010101U; // splat value in each byte consume_by_block<64, AssumeAccess::kAligned>(dst, value32, size); consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); } set_bytes_and_bump_pointers(dst, value, size); } // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware // support for unaligned loads and stores. It compiles down to 186 bytes when // used through `memset` that also needs to return the `dst` ptr. [[maybe_unused]] LIBC_INLINE void inline_memset_arm_mid_end(Ptr dst, uint8_t value, size_t size) { const uint32_t value32 = value * 0x01010101U; // splat value in each byte if (misaligned(dst)) LIBC_ATTR_UNLIKELY { if (size < 8) LIBC_ATTR_UNLIKELY { if (size & 1) set_block_and_bump_pointers<1>(dst, value32); if (size & 2) set_block_and_bump_pointers<2>(dst, value32); if (size & 4) set_block_and_bump_pointers<4>(dst, value32); return; } const size_t offset = distance_to_align_up(dst); if (offset & 1) set_block_and_bump_pointers<1>(dst, value32); if (offset & 2) set_block_and_bump_pointers<2>(dst, value32); size -= offset; } // If we tell the compiler that the stores are aligned it will generate 8 x // STRD instructions. By not specifying alignment, the compiler conservatively // uses 16 x STR.W and is able to use the first one to prefetch the // destination in advance leading to better asymptotic performances. // str r12, [r3, #64]! <- prefetch next cache line // str.w r12, [r3, #0x4] // str.w r12, [r3, #0x8] // ... // str.w r12, [r3, #0x38] // str.w r12, [r3, #0x3c] consume_by_block<64, AssumeAccess::kUnknown>(dst, value32, size); // Prefetching does not matter anymore at this scale so using STRD yields // better results. consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); if (size & 1) set_block_and_bump_pointers<1>(dst, value32); if (size & 2) LIBC_ATTR_UNLIKELY set_block_and_bump_pointers<2>(dst, value32); } [[maybe_unused]] LIBC_INLINE void inline_memset_arm_dispatch(Ptr dst, uint8_t value, size_t size) { #ifdef __ARM_FEATURE_UNALIGNED return inline_memset_arm_mid_end(dst, value, size); #else return inline_memset_arm_low_end(dst, value, size); #endif } } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H