diff options
Diffstat (limited to 'libc/src/__support/GPU/utils.h')
| -rw-r--r-- | libc/src/__support/GPU/utils.h | 143 |
1 files changed, 122 insertions, 21 deletions
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index ae52e7a088ad..e138c84c0cb2 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -9,48 +9,149 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/utils.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/utils.h"
-#else
-#include "generic/utils.h"
+#if !__has_include(<gpuintrin.h>)
+#error "Unsupported compiler"
 #endif
 
+#include <gpuintrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 namespace gpu {
-/// Get the first active thread inside the lane.
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsll(lane_mask) - 1;
+
+/// Aliases that apply the matching __gpu_* qualifier from <gpuintrin.h> to T.
+template <typename T> using Private = __gpu_private T;
+template <typename T> using Constant = __gpu_constant T;
+template <typename T> using Local = __gpu_local T;
+template <typename T> using Global = __gpu_global T;
+
+/// Returns the number of blocks in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
+
+/// Returns the number of blocks in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
+
+/// Returns the number of blocks in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
+
+/// Returns the total number of blocks across all three dimensions. The product
+/// is computed in 64 bits so it cannot wrap at 32 bits before being returned.
+LIBC_INLINE uint64_t get_num_blocks() {
+  return static_cast<uint64_t>(get_num_blocks_x()) * get_num_blocks_y() *
+         get_num_blocks_z();
+}
+
+/// Returns the 'x' dimension of the current block's id.
+LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
+
+/// Returns the 'y' dimension of the current block's id.
+LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
+
+/// Returns the 'z' dimension of the current block's id.
+LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
+
+/// Returns the linearized id of the current block, with 'x' varying fastest.
+/// The arithmetic is done in 64 bits to avoid wrapping at 32 bits.
+LIBC_INLINE uint64_t get_block_id() {
+  const uint64_t num_x = get_num_blocks_x();
+  const uint64_t num_y = get_num_blocks_y();
+  return get_block_id_x() + num_x * get_block_id_y() +
+         num_x * num_y * get_block_id_z();
+}
+
+/// Returns the number of threads in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
+
+/// Returns the number of threads in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
+
+/// Returns the number of threads in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
+
+/// Returns the total number of threads across all three dimensions, computed in
+/// 64 bits.
+LIBC_INLINE uint64_t get_num_threads() {
+  return static_cast<uint64_t>(get_num_threads_x()) * get_num_threads_y() *
+         get_num_threads_z();
+}
+
+/// Returns the 'x' dimension of the current thread's id.
+LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
+
+/// Returns the 'y' dimension of the current thread's id.
+LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
+
+/// Returns the 'z' dimension of the current thread's id.
+LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
+
+/// Returns the linearized id of the current thread, with 'x' varying fastest,
+/// computed in 64 bits.
+LIBC_INLINE uint64_t get_thread_id() {
+  const uint64_t num_x = get_num_threads_x();
+  const uint64_t num_y = get_num_threads_y();
+  return get_thread_id_x() + num_x * get_thread_id_y() +
+         num_x * num_y * get_thread_id_z();
+}
+
+/// Returns the value of __gpu_num_lanes().
+LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
+
+/// Returns the value of __gpu_lane_id().
+LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
+
+/// Returns the value of __gpu_lane_mask().
+LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
+
+/// Forwards to __gpu_read_first_lane_u32 with the given lane mask and value.
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+  return __gpu_read_first_lane_u32(lane_mask, x);
+}
+
+/// Forwards to __gpu_ballot with the given lane mask and predicate.
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  return __gpu_ballot(lane_mask, x);
+}
+
+/// Forwards to __gpu_sync_threads().
+LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
+
+/// Forwards to __gpu_sync_lane() with the given lane mask.
+LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
+
+/// Forwards to __gpu_shuffle_idx_u32 with the given lane mask, index and value.
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
 }
 
+/// Ends the program by calling __gpu_exit(); declared [[noreturn]].
+[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
+
 /// Conditional that is only true for a single thread in a lane.
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
-  return gpu::get_lane_id() == get_first_lane_id(lane_mask);
+  return __gpu_is_first_in_lane(lane_mask);
 }
 
 /// Gets the sum of all lanes inside the warp or wavefront.
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
-    uint32_t index = step + gpu::get_lane_id();
-    x += gpu::shuffle(lane_mask, index, x);
-  }
-  return gpu::broadcast_value(lane_mask, x);
+  return __gpu_lane_sum_u32(lane_mask, x);
 }
 
 /// Gets the accumulator scan of the threads in the warp or wavefront.
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
-    uint32_t index = gpu::get_lane_id() - step;
-    uint32_t bitmask = gpu::get_lane_id() >= step;
-    x += -bitmask & gpu::shuffle(lane_mask, index, x);
-  }
-  return x;
+  return __gpu_lane_scan_u32(lane_mask, x);
+}
+
+/// Returns the value of __builtin_readsteadycounter().
+LIBC_INLINE uint64_t fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
 }
 
+/// Returns the value of __builtin_readcyclecounter().
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
 
