diff options
| author | NAKAMURA Takumi <geek4civic@gmail.com> | 2025-01-09 18:49:54 +0900 |
|---|---|---|
| committer | NAKAMURA Takumi <geek4civic@gmail.com> | 2025-01-09 18:49:54 +0900 |
| commit | e2810c9a248f4c7fbfae84bb32b6f7e01027458b (patch) | |
| tree | ae0b02a8491b969a1cee94ea16ffe42c559143c5 /libc/src/__support | |
| parent | fa04eb4af95c1ca7377279728cb004bcd2324d01 (diff) | |
| parent | bdcf47e4bcb92889665825654bb80a8bbe30379e (diff) | |
Merge branch 'users/chapuni/cov/single/base' into users/chapuni/cov/single/switchusers/chapuni/cov/single/switch
Diffstat (limited to 'libc/src/__support')
| -rw-r--r-- | libc/src/__support/CMakeLists.txt | 2 | ||||
| -rw-r--r-- | libc/src/__support/File/file.cpp | 68 | ||||
| -rw-r--r-- | libc/src/__support/File/file.h | 4 | ||||
| -rw-r--r-- | libc/src/__support/GPU/CMakeLists.txt | 10 | ||||
| -rw-r--r-- | libc/src/__support/GPU/amdgpu/CMakeLists.txt | 7 | ||||
| -rw-r--r-- | libc/src/__support/GPU/amdgpu/utils.h | 183 | ||||
| -rw-r--r-- | libc/src/__support/GPU/generic/CMakeLists.txt | 7 | ||||
| -rw-r--r-- | libc/src/__support/GPU/generic/utils.h | 84 | ||||
| -rw-r--r-- | libc/src/__support/GPU/nvptx/CMakeLists.txt | 7 | ||||
| -rw-r--r-- | libc/src/__support/GPU/nvptx/utils.h | 160 | ||||
| -rw-r--r-- | libc/src/__support/GPU/utils.h | 108 | ||||
| -rw-r--r-- | libc/src/__support/fixedvector.h | 54 | ||||
| -rw-r--r-- | libc/src/__support/threads/thread.cpp | 4 |
13 files changed, 190 insertions, 508 deletions
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 4e90aad9a45b..5090dc218cda 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -267,7 +267,9 @@ add_header_library( HDRS fixedvector.h DEPENDS + .libc_assert libc.src.__support.CPP.array + libc.src.string.memory_utils.inline_memset ) add_header_library( diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp index 972249fef96b..528542cccf32 100644 --- a/libc/src/__support/File/file.cpp +++ b/libc/src/__support/File/file.cpp @@ -42,7 +42,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) { if (pos > 0) { // If the buffer is not empty // Flush the buffer const size_t write_size = pos; - auto write_result = platform_write(this, buf, write_size); + FileIOResult write_result = platform_write(this, buf, write_size); pos = 0; // Buffer is now empty so reset pos to the beginning. // If less bytes were written than expected, then an error occurred. if (write_result < write_size) { @@ -52,7 +52,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) { } } - auto write_result = platform_write(this, data, len); + FileIOResult write_result = platform_write(this, data, len); if (write_result < len) err = true; return write_result; @@ -99,7 +99,7 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) { // is full. const size_t write_size = pos; - auto buf_result = platform_write(this, buf, write_size); + FileIOResult buf_result = platform_write(this, buf, write_size); size_t bytes_written = buf_result.value; pos = 0; // Buffer is now empty so reset pos to the beginning. @@ -121,7 +121,8 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) { pos = remainder.size(); } else { - auto result = platform_write(this, remainder.data(), remainder.size()); + FileIOResult result = + platform_write(this, remainder.data(), remainder.size()); size_t bytes_written = buf_result.value; // If less bytes were written than expected, then an error occurred. Return @@ -190,6 +191,17 @@ FileIOResult File::read_unlocked(void *data, size_t len) { prev_op = FileOp::READ; + if (bufmode == _IONBF) { // unbuffered. + return read_unlocked_nbf(static_cast<uint8_t *>(data), len); + } else if (bufmode == _IOFBF) { // fully buffered + return read_unlocked_fbf(static_cast<uint8_t *>(data), len); + } else /*if (bufmode == _IOLBF) */ { // line buffered + // There is no line buffered mode for read. Use fully buffered instead. + return read_unlocked_fbf(static_cast<uint8_t *>(data), len); + } +} + +size_t File::copy_data_from_buf(uint8_t *data, size_t len) { cpp::span<uint8_t> bufref(static_cast<uint8_t *>(buf), bufsize); cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data), len); @@ -209,32 +221,42 @@ FileIOResult File::read_unlocked(void *data, size_t len) { for (size_t i = 0; i < available_data; ++i) dataref[i] = bufref[i + pos]; read_limit = pos = 0; // Reset the pointers. + + return available_data; +} + +FileIOResult File::read_unlocked_fbf(uint8_t *data, size_t len) { + // Read data from the buffer first. + size_t available_data = copy_data_from_buf(data, len); + if (available_data == len) + return available_data; + // Update the dataref to reflect that fact that we have already // copied |available_data| into |data|. - dataref = cpp::span<uint8_t>(dataref.data() + available_data, - dataref.size() - available_data); - size_t to_fetch = len - available_data; + cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data, + to_fetch); + if (to_fetch > bufsize) { - auto result = platform_read(this, dataref.data(), to_fetch); + FileIOResult result = platform_read(this, dataref.data(), to_fetch); size_t fetched_size = result.value; if (result.has_error() || fetched_size < to_fetch) { if (!result.has_error()) eof = true; else err = true; - return {available_data + fetched_size, result.has_error()}; + return {available_data + fetched_size, result.error}; } return len; } // Fetch and buffer another buffer worth of data. - auto result = platform_read(this, buf, bufsize); + FileIOResult result = platform_read(this, buf, bufsize); size_t fetched_size = result.value; read_limit += fetched_size; size_t transfer_size = fetched_size >= to_fetch ? to_fetch : fetched_size; for (size_t i = 0; i < transfer_size; ++i) - dataref[i] = bufref[i]; + dataref[i] = buf[i]; pos += transfer_size; if (result.has_error() || fetched_size < to_fetch) { if (!result.has_error()) @@ -245,6 +267,26 @@ FileIOResult File::read_unlocked(void *data, size_t len) { return {transfer_size + available_data, result.error}; } +FileIOResult File::read_unlocked_nbf(uint8_t *data, size_t len) { + // Check whether there is a character in the ungetc buffer. + size_t available_data = copy_data_from_buf(data, len); + if (available_data == len) + return available_data; + + // Directly copy the data into |data|. + cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data, + len - available_data); + FileIOResult result = platform_read(this, dataref.data(), dataref.size()); + + if (result.has_error() || result < dataref.size()) { + if (!result.has_error()) + eof = true; + else + err = true; + } + return {result + available_data, result.error}; +} + int File::ungetc_unlocked(int c) { // There is no meaning to unget if: // 1. You are trying to push back EOF. @@ -287,7 +329,7 @@ ErrorOr<int> File::seek(off_t offset, int whence) { FileLock lock(this); if (prev_op == FileOp::WRITE && pos > 0) { - auto buf_result = platform_write(this, buf, pos); + FileIOResult buf_result = platform_write(this, buf, pos); if (buf_result.has_error() || buf_result.value < pos) { err = true; return Error(buf_result.error); @@ -325,7 +367,7 @@ ErrorOr<off_t> File::tell() { int File::flush_unlocked() { if (prev_op == FileOp::WRITE && pos > 0) { - auto buf_result = platform_write(this, buf, pos); + FileIOResult buf_result = platform_write(this, buf, pos); if (buf_result.has_error() || buf_result.value < pos) { err = true; return buf_result.error; diff --git a/libc/src/__support/File/file.h b/libc/src/__support/File/file.h index 42e1d11b4ab1..5c97a9c6419f 100644 --- a/libc/src/__support/File/file.h +++ b/libc/src/__support/File/file.h @@ -280,6 +280,10 @@ private: FileIOResult write_unlocked_fbf(const uint8_t *data, size_t len); FileIOResult write_unlocked_nbf(const uint8_t *data, size_t len); + FileIOResult read_unlocked_fbf(uint8_t *data, size_t len); + FileIOResult read_unlocked_nbf(uint8_t *data, size_t len); + size_t copy_data_from_buf(uint8_t *data, size_t len); + constexpr void adjust_buf() { if (read_allowed() && (buf == nullptr || bufsize == 0)) { // We should allow atleast one ungetc operation. diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt index 28fd9a1ebcc9..9b359f65cdb3 100644 --- a/libc/src/__support/GPU/CMakeLists.txt +++ b/libc/src/__support/GPU/CMakeLists.txt @@ -1,16 +1,12 @@ -if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) +# These utilities are GPU only. +if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() -add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) -set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils) - add_header_library( utils HDRS utils.h - DEPENDS - ${target_gpu_utils} ) add_object_library( @@ -21,6 +17,6 @@ add_object_library( allocator.h DEPENDS libc.src.__support.common - libc.src.__support.GPU.utils libc.src.__support.RPC.rpc_client + .utils ) diff --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt deleted file mode 100644 index f2b98fc03b21..000000000000 --- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - amdgpu_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h deleted file mode 100644 index 6ab95403ca38..000000000000 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ /dev/null @@ -1,183 +0,0 @@ -//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include <stdint.h> - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -/// Type aliases to the address spaces used by the AMDGPU backend. -template <typename T> using Private = [[clang::opencl_private]] T; -template <typename T> using Constant = [[clang::opencl_constant]] T; -template <typename T> using Local = [[clang::opencl_local]] T; -template <typename T> using Global = [[clang::opencl_global]] T; - -/// Returns the number of workgroups in the 'x' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_x() { - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); -} - -/// Returns the number of workgroups in the 'y' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_y() { - return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); -} - -/// Returns the number of workgroups in the 'z' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_z() { - return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); -} - -/// Returns the total number of workgruops in the grid. -LIBC_INLINE uint64_t get_num_blocks() { - return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); -} - -/// Returns the 'x' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_x() { - return __builtin_amdgcn_workgroup_id_x(); -} - -/// Returns the 'y' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_y() { - return __builtin_amdgcn_workgroup_id_y(); -} - -/// Returns the 'z' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_z() { - return __builtin_amdgcn_workgroup_id_z(); -} - -/// Returns the absolute id of the AMD workgroup. -LIBC_INLINE uint64_t get_block_id() { - return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + - get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); -} - -/// Returns the number of workitems in the 'x' dimension. -LIBC_INLINE uint32_t get_num_threads_x() { - return __builtin_amdgcn_workgroup_size_x(); -} - -/// Returns the number of workitems in the 'y' dimension. -LIBC_INLINE uint32_t get_num_threads_y() { - return __builtin_amdgcn_workgroup_size_y(); -} - -/// Returns the number of workitems in the 'z' dimension. -LIBC_INLINE uint32_t get_num_threads_z() { - return __builtin_amdgcn_workgroup_size_z(); -} - -/// Returns the total number of workitems in the workgroup. -LIBC_INLINE uint64_t get_num_threads() { - return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); -} - -/// Returns the 'x' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_x() { - return __builtin_amdgcn_workitem_id_x(); -} - -/// Returns the 'y' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_y() { - return __builtin_amdgcn_workitem_id_y(); -} - -/// Returns the 'z' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_z() { - return __builtin_amdgcn_workitem_id_z(); -} - -/// Returns the absolute id of the thread in the current AMD workgroup. -LIBC_INLINE uint64_t get_thread_id() { - return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + - get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); -} - -/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware -/// and compilation options. -LIBC_INLINE uint32_t get_lane_size() { - return __builtin_amdgcn_wavefrontsize(); -} - -/// Returns the id of the thread inside of an AMD wavefront executing together. -[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - -/// Returns the bit-mask of active threads in the current wavefront. -[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() { - return __builtin_amdgcn_read_exec(); -} - -/// Copies the value from the first active thread in the wavefront to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t, - uint32_t x) { - return __builtin_amdgcn_readfirstlane(x); -} - -/// Returns a bitmask of threads in the current lane for which \p x is true. -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - // the lane_mask & gives the nvptx semantics when lane_mask is a subset of - // the active threads - return lane_mask & __builtin_amdgcn_ballot_w64(x); -} - -/// Waits for all the threads in the block to converge and issues a fence. -[[clang::convergent]] LIBC_INLINE void sync_threads() { - __builtin_amdgcn_s_barrier(); - __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); -} - -/// Waits for all pending memory operations to complete in program order. -[[clang::convergent]] LIBC_INLINE void memory_fence() { - __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); -} - -/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. -[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) { - __builtin_amdgcn_wave_barrier(); -} - -/// Shuffles the the lanes inside the wavefront according to the given index. -[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx, - uint32_t x) { - return __builtin_amdgcn_ds_bpermute(idx << 2, x); -} - -/// Returns the current value of the GPU's processor clock. -/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter. -LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } - -/// Returns a fixed-frequency timestamp. The actual frequency is dependent on -/// the card and can only be queried via the driver. -LIBC_INLINE uint64_t fixed_frequency_clock() { - return __builtin_readsteadycounter(); -} - -/// Terminates execution of the associated wavefront. -[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); } - -/// Returns a unique identifier for the process cluster the current wavefront is -/// executing on. Here we use the identifier for the compute unit (CU) and -/// shader engine. -/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface -/// than the one at -/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899 -LIBC_INLINE uint32_t get_cluster_id() { return 0; } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif diff --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt deleted file mode 100644 index 68ba7d1ec80e..000000000000 --- a/libc/src/__support/GPU/generic/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - generic_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h deleted file mode 100644 index 9461ef0aa245..000000000000 --- a/libc/src/__support/GPU/generic/utils.h +++ /dev/null @@ -1,84 +0,0 @@ -//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include <stdint.h> - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -template <typename T> using Private = T; -template <typename T> using Constant = T; -template <typename T> using Shared = T; -template <typename T> using Global = T; - -LIBC_INLINE uint32_t get_num_blocks_x() { return 1; } - -LIBC_INLINE uint32_t get_num_blocks_y() { return 1; } - -LIBC_INLINE uint32_t get_num_blocks_z() { return 1; } - -LIBC_INLINE uint64_t get_num_blocks() { return 1; } - -LIBC_INLINE uint32_t get_block_id_x() { return 0; } - -LIBC_INLINE uint32_t get_block_id_y() { return 0; } - -LIBC_INLINE uint32_t get_block_id_z() { return 0; } - -LIBC_INLINE uint64_t get_block_id() { return 0; } - -LIBC_INLINE uint32_t get_num_threads_x() { return 1; } - -LIBC_INLINE uint32_t get_num_threads_y() { return 1; } - -LIBC_INLINE uint32_t get_num_threads_z() { return 1; } - -LIBC_INLINE uint64_t get_num_threads() { return 1; } - -LIBC_INLINE uint32_t get_thread_id_x() { return 0; } - -LIBC_INLINE uint32_t get_thread_id_y() { return 0; } - -LIBC_INLINE uint32_t get_thread_id_z() { return 0; } - -LIBC_INLINE uint64_t get_thread_id() { return 0; } - -LIBC_INLINE uint32_t get_lane_size() { return 1; } - -LIBC_INLINE uint32_t get_lane_id() { return 0; } - -LIBC_INLINE uint64_t get_lane_mask() { return 1; } - -LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; } - -LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; } - -LIBC_INLINE void sync_threads() {} - -LIBC_INLINE void sync_lane(uint64_t) {} - -LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; } - -LIBC_INLINE uint64_t processor_clock() { return 0; } - -LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; } - -[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); } - -LIBC_INLINE uint32_t get_cluster_id() { return 0; } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H diff --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt deleted file mode 100644 index 0d3f8c7933c8..000000000000 --- a/libc/src/__support/GPU/nvptx/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - nvptx_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h deleted file mode 100644 index 1a43a839a9ce..000000000000 --- a/libc/src/__support/GPU/nvptx/utils.h +++ /dev/null @@ -1,160 +0,0 @@ -//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-id: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include <stdint.h> - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -/// Type aliases to the address spaces used by the NVPTX backend. -template <typename T> using Private = [[clang::opencl_private]] T; -template <typename T> using Constant = [[clang::opencl_constant]] T; -template <typename T> using Local = [[clang::opencl_local]] T; -template <typename T> using Global = [[clang::opencl_global]] T; - -/// Returns the number of CUDA blocks in the 'x' dimension. -LIBC_INLINE uint32_t get_num_blocks_x() { - return __nvvm_read_ptx_sreg_nctaid_x(); -} - -/// Returns the number of CUDA blocks in the 'y' dimension. -LIBC_INLINE uint32_t get_num_blocks_y() { - return __nvvm_read_ptx_sreg_nctaid_y(); -} - -/// Returns the number of CUDA blocks in the 'z' dimension. -LIBC_INLINE uint32_t get_num_blocks_z() { - return __nvvm_read_ptx_sreg_nctaid_z(); -} - -/// Returns the total number of CUDA blocks. -LIBC_INLINE uint64_t get_num_blocks() { - return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); -} - -/// Returns the 'x' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); } - -/// Returns the 'y' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); } - -/// Returns the 'z' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); } - -/// Returns the absolute id of the CUDA block. -LIBC_INLINE uint64_t get_block_id() { - return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + - get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); -} - -/// Returns the number of CUDA threads in the 'x' dimension. -LIBC_INLINE uint32_t get_num_threads_x() { - return __nvvm_read_ptx_sreg_ntid_x(); -} - -/// Returns the number of CUDA threads in the 'y' dimension. -LIBC_INLINE uint32_t get_num_threads_y() { - return __nvvm_read_ptx_sreg_ntid_y(); -} - -/// Returns the number of CUDA threads in the 'z' dimension. -LIBC_INLINE uint32_t get_num_threads_z() { - return __nvvm_read_ptx_sreg_ntid_z(); -} - -/// Returns the total number of threads in the block. -LIBC_INLINE uint64_t get_num_threads() { - return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); -} - -/// Returns the 'x' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); } - -/// Returns the 'y' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); } - -/// Returns the 'z' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); } - -/// Returns the absolute id of the thread in the current CUDA block. -LIBC_INLINE uint64_t get_thread_id() { - return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + - get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); -} - -/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware. -LIBC_INLINE uint32_t get_lane_size() { return 32; } - -/// Returns the id of the thread inside of a CUDA warp executing together. -[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return __nvvm_read_ptx_sreg_laneid(); -} - -/// Returns the bit-mask of active threads in the current warp. -[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() { - return __nvvm_activemask(); -} - -/// Copies the value from the first active thread in the warp to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, - uint32_t x) { - uint32_t mask = static_cast<uint32_t>(lane_mask); - uint32_t id = __builtin_ffs(mask) - 1; - return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1); -} - -/// Returns a bitmask of threads in the current lane for which \p x is true. -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - uint32_t mask = static_cast<uint32_t>(lane_mask); - return __nvvm_vote_ballot_sync(mask, x); -} - -/// Waits for all the threads in the block to converge and issues a fence. -[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); } - -/// Waits for all pending memory operations to complete in program order. -[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); } - -/// Waits for all threads in the warp to reconverge for independent scheduling. -[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) { - __nvvm_bar_warp_sync(static_cast<uint32_t>(mask)); -} - -/// Shuffles the the lanes inside the warp according to the given index. -[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, - uint32_t idx, uint32_t x) { - uint32_t mask = static_cast<uint32_t>(lane_mask); - uint32_t bitmask = (mask >> idx) & 1; - return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1); -} - -/// Returns the current value of the GPU's processor clock. -LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } - -/// Returns a global fixed-frequency timer at nanosecond frequency. -LIBC_INLINE uint64_t fixed_frequency_clock() { - return __builtin_readsteadycounter(); -} - -/// Terminates execution of the calling thread. -[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); } - -/// Returns a unique identifier for the process cluster the current warp is -/// executing on. Here we use the identifier for the symmetric multiprocessor. -LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index ae52e7a088ad..e138c84c0cb2 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -9,48 +9,108 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H +#include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) -#include "amdgpu/utils.h" -#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) -#include "nvptx/utils.h" -#else -#include "generic/utils.h" +#if !__has_include(<gpuintrin.h>) +#error "Unsupported compiler" #endif +#include <gpuintrin.h> + namespace LIBC_NAMESPACE_DECL { namespace gpu { -/// Get the first active thread inside the lane. -LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) { - return __builtin_ffsll(lane_mask) - 1; + +template <typename T> using Private = __gpu_private T; +template <typename T> using Constant = __gpu_constant T; +template <typename T> using Local = __gpu_local T; +template <typename T> using Global = __gpu_local T; + +LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); } + +LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); } + +LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); } + +LIBC_INLINE uint64_t get_num_blocks() { + return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); +} + +LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); } + +LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); } + +LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); } + +LIBC_INLINE uint64_t get_block_id() { + return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + + get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); +} + +LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); } + +LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); } + +LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); } + +LIBC_INLINE uint64_t get_num_threads() { + return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); +} + +LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); } + +LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); } + +LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); } + +LIBC_INLINE uint64_t get_thread_id() { + return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + + get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); +} + +LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); } + +LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); } + +LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); } + +LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) { + return __gpu_read_first_lane_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { + return __gpu_ballot(lane_mask, x); +} + +LIBC_INLINE void sync_threads() { __gpu_sync_threads(); } + +LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); } + +LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) { + return __gpu_shuffle_idx_u32(lane_mask, idx, x); } -/// Conditional that is only true for a single thread in a lane. +[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } + LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { - return gpu::get_lane_id() == get_first_lane_id(lane_mask); + return __gpu_is_first_in_lane(lane_mask); } -/// Gets the sum of all lanes inside the warp or wavefront. LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) { - for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) { - uint32_t index = step + gpu::get_lane_id(); - x += gpu::shuffle(lane_mask, index, x); - } - return gpu::broadcast_value(lane_mask, x); + return __gpu_lane_sum_u32(lane_mask, x); } -/// Gets the accumulator scan of the threads in the warp or wavefront. LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) { - for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) { - uint32_t index = gpu::get_lane_id() - step; - uint32_t bitmask = gpu::get_lane_id() >= step; - x += -bitmask & gpu::shuffle(lane_mask, index, x); - } - return x; + return __gpu_lane_scan_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t fixed_frequency_clock() { + return __builtin_readsteadycounter(); } +LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } + } // namespace gpu } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h index 7ac0c230f9c5..34601f86dc01 100644 --- a/libc/src/__support/fixedvector.h +++ b/libc/src/__support/fixedvector.h @@ -10,9 +10,10 @@ #define LLVM_LIBC_SRC___SUPPORT_FIXEDVECTOR_H #include "src/__support/CPP/array.h" - #include "src/__support/CPP/iterator.h" +#include "src/__support/libc_assert.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memset.h" namespace LIBC_NAMESPACE_DECL { @@ -23,27 +24,32 @@ template <typename T, size_t CAPACITY> class FixedVector { size_t item_count = 0; public: - constexpr FixedVector() = default; + LIBC_INLINE constexpr FixedVector() = default; using iterator = typename cpp::array<T, CAPACITY>::iterator; - constexpr FixedVector(iterator begin, iterator end) : store{}, item_count{} { + LIBC_INLINE constexpr FixedVector(iterator begin, iterator end) + : store{}, item_count{} { + LIBC_ASSERT(begin + CAPACITY >= end); for (; begin != end; ++begin) push_back(*begin); } using const_iterator = typename cpp::array<T, CAPACITY>::const_iterator; - constexpr FixedVector(const_iterator begin, const_iterator end) + LIBC_INLINE constexpr FixedVector(const_iterator begin, const_iterator end) : store{}, item_count{} { + LIBC_ASSERT(begin + CAPACITY >= end); for (; begin != end; ++begin) push_back(*begin); } - constexpr FixedVector(size_t count, const T &value) : store{}, item_count{} { + LIBC_INLINE constexpr FixedVector(size_t count, const T &value) + : store{}, item_count{} { + LIBC_ASSERT(count <= CAPACITY); for (size_t i = 0; i < count; ++i) push_back(value); } - constexpr bool push_back(const T &obj) { + LIBC_INLINE constexpr bool push_back(const T &obj) { if (item_count == CAPACITY) return false; store[item_count] = obj; @@ -51,27 +57,43 @@ public: return true; } - constexpr const T &back() const { return store[item_count - 1]; } + LIBC_INLINE constexpr const T &back() const { + LIBC_ASSERT(!empty()); + return store[item_count - 1]; + } - constexpr T &back() { return store[item_count - 1]; } + LIBC_INLINE constexpr T &back() { + LIBC_ASSERT(!empty()); + return store[item_count - 1]; + } - constexpr bool pop_back() { + LIBC_INLINE constexpr bool pop_back() { if (item_count == 0) return false; + inline_memset(&store[item_count - 1], 0, sizeof(T)); --item_count; return true; } - constexpr T &operator[](size_t idx) { return store[idx]; } + LIBC_INLINE constexpr T &operator[](size_t idx) { + LIBC_ASSERT(idx < item_count); + return store[idx]; + } - constexpr const T &operator[](size_t idx) const { return store[idx]; } + LIBC_INLINE constexpr const T &operator[](size_t idx) const { + LIBC_ASSERT(idx < item_count); + return store[idx]; + } - constexpr bool empty() const { return item_count == 0; } + LIBC_INLINE constexpr bool empty() const { return item_count == 0; } - constexpr size_t size() const { return item_count; } + LIBC_INLINE constexpr size_t size() const { return item_count; } // Empties the store for all practical purposes. - constexpr void reset() { item_count = 0; } + LIBC_INLINE constexpr void reset() { + inline_memset(store.data(), 0, sizeof(T) * item_count); + item_count = 0; + } // This static method does not free up the resources held by |store|, // say by calling `free` or something similar. It just does the equivalent @@ -81,7 +103,9 @@ public: // dynamically allocated storate. So, the `destroy` method like this // matches the `destroy` API of those other data structures so that users // can easily swap one data structure for the other. - static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); } + LIBC_INLINE static void destroy(FixedVector<T, CAPACITY> *store) { + store->reset(); + } using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator; LIBC_INLINE constexpr reverse_iterator rbegin() { diff --git a/libc/src/__support/threads/thread.cpp b/libc/src/__support/threads/thread.cpp index dad4f75f092e..6f6b75be5766 100644 --- a/libc/src/__support/threads/thread.cpp +++ b/libc/src/__support/threads/thread.cpp @@ -117,7 +117,9 @@ public: int add_callback(AtExitCallback *callback, void *obj) { cpp::lock_guard lock(mtx); - return callback_list.push_back({callback, obj}); + if (callback_list.push_back({callback, obj})) + return 0; + return -1; } void call() { |
