diff options
Diffstat (limited to 'offload/DeviceRTL/src')
| -rw-r--r-- | offload/DeviceRTL/src/Allocator.cpp | 77 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Configuration.cpp | 85 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Debug.cpp | 44 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/DeviceUtils.cpp | 64 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Kernel.cpp | 162 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/LibC.cpp | 48 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Mapping.cpp | 212 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Misc.cpp | 138 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Parallelism.cpp | 311 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Profiling.cpp | 18 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Reduction.cpp | 316 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/State.cpp | 482 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Stub.cpp | 1 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Synchronization.cpp | 379 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Tasking.cpp | 103 | ||||
| -rw-r--r-- | offload/DeviceRTL/src/Workshare.cpp | 970 |
16 files changed, 0 insertions, 3410 deletions
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp deleted file mode 100644 index aac2a6005158..000000000000 --- a/offload/DeviceRTL/src/Allocator.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" -#include "Synchronization.h" - -using namespace ompx; - -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy - __omp_rtl_device_memory_pool_tracker; - -/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool -/// directly. -struct BumpAllocatorTy final { - - void *alloc(uint64_t Size) { - Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); - - if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { - atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, - atomic::seq_cst); - atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, - atomic::seq_cst); - atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, - atomic::seq_cst); - atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, - atomic::seq_cst); - } - - uint64_t *Data = - reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr); - uint64_t End = - reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size; - - uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); - if (OldData + Size > End) - __builtin_trap(); - - return reinterpret_cast<void *>(OldData); - } - - void free(void *) {} -}; - -BumpAllocatorTy BumpAllocator; - -/// allocator namespace implementation -/// -///{ - -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { - // TODO: Check KernelEnvironment for an allocator choice as soon as we have - // more than one. -} - -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } - -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } - -///} diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp deleted file mode 100644 index 0c31c66ab2de..000000000000 --- a/offload/DeviceRTL/src/Configuration.cpp +++ /dev/null @@ -1,85 +0,0 @@ -//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data object of the constant device environment and the -// query API. -// -//===----------------------------------------------------------------------===// - -#include "Configuration.h" -#include "DeviceTypes.h" -#include "State.h" - -using namespace ompx; - -// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled. -[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription = - 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0; - -// This variable should be visible to the plugin so we override the default -// hidden visibility. -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment; - -uint32_t config::getAssumeTeamsOversubscription() { - return __omp_rtl_assume_teams_oversubscription; -} - -uint32_t config::getAssumeThreadsOversubscription() { - return __omp_rtl_assume_threads_oversubscription; -} - -uint32_t config::getDebugKind() { - return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind; -} - -uint32_t config::getNumDevices() { - return __omp_rtl_device_environment.NumDevices; -} - -uint32_t config::getDeviceNum() { - return __omp_rtl_device_environment.DeviceNum; -} - -uint64_t config::getDynamicMemorySize() { - return __omp_rtl_device_environment.DynamicMemSize; -} - -uint64_t config::getClockFrequency() { - return __omp_rtl_device_environment.ClockFrequency; -} - -void *config::getIndirectCallTablePtr() { - return reinterpret_cast<void *>( - __omp_rtl_device_environment.IndirectCallTable); -} - -uint64_t config::getHardwareParallelism() { - return __omp_rtl_device_environment.HardwareParallelism; -} - -uint64_t config::getIndirectCallTableSize() { - return __omp_rtl_device_environment.IndirectCallTableSize; -} - -bool config::isDebugMode(DeviceDebugKind Kind) { - return config::getDebugKind() & uint32_t(Kind); -} - -bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } - -bool config::mayUseNestedParallelism() { - if (__omp_rtl_assume_no_nested_parallelism) - return false; - return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; -} diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp deleted file mode 100644 index 5b5482d766b1..000000000000 --- a/offload/DeviceRTL/src/Debug.cpp +++ /dev/null @@ -1,44 +0,0 @@ -//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug utilities -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -extern "C" { -void __assert_assume(bool condition) { __builtin_assume(condition); } - -#ifndef OMPTARGET_HAS_LIBC -[[gnu::weak]] void __assert_fail(const char *expr, const char *file, - unsigned line, const char *function) { - __assert_fail_internal(expr, nullptr, file, line, function); -} -#endif - -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function) { - if (msg) { - printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function, - msg, expr); - } else { - printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr); - } - __builtin_trap(); -} -} diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp deleted file mode 100644 index d6f8c499c890..000000000000 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ /dev/null @@ -1,64 +0,0 @@ -//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "DeviceUtils.h" - -#include "Debug.h" -#include "Interface.h" -#include "Mapping.h" -#include "gpuintrin.h" - -using namespace ompx; - -uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { - return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; -} - -void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { - static_assert(sizeof(unsigned long) == 8, ""); - LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL); - HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32); -} - -int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, - int32_t Width) { - return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width); -} - -int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { - return __gpu_ballot(Mask, Pred); -} - -bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); } - -extern "C" { -int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { - return utils::shuffleDown(lanes::All, Val, Delta, SrcLane); -} - -int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { - return utils::shuffleDown(lanes::All, Val, Delta, Width); -} -} diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp deleted file mode 100644 index 467e44a65276..000000000000 --- a/offload/DeviceRTL/src/Kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the kernel entry points for the device. -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" -#include "Workshare.h" - -using namespace ompx; - -// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and -// must be kept in-sync. -enum OMPTgtExecModeFlags : unsigned char { - OMP_TGT_EXEC_MODE_BARE = 0, - OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, - OMP_TGT_EXEC_MODE_SPMD = 1 << 1, - OMP_TGT_EXEC_MODE_GENERIC_SPMD = - OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD -}; - -static void -inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - // Order is important here. - synchronize::init(IsSPMD); - mapping::init(IsSPMD); - state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); - allocator::init(IsSPMD, KernelEnvironment); - workshare::init(IsSPMD); -} - -/// Simple generic state machine for worker threads. -static void genericStateMachine(IdentTy *Ident) { - uint32_t TId = mapping::getThreadIdInBlock(); - - do { - ParallelRegionFnTy WorkFn = nullptr; - - // Wait for the signal that we have a new work function. - synchronize::threads(atomic::seq_cst); - - // Retrieve the work function from the runtime. - bool IsActive = __kmpc_kernel_parallel(&WorkFn); - - // If there is nothing more to do, break out of the state machine by - // returning to the caller. - if (!WorkFn) - return; - - if (IsActive) { - ASSERT(!mapping::isSPMDMode(), nullptr); - ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); - __kmpc_kernel_end_parallel(); - } - - synchronize::threads(atomic::seq_cst); - - } while (true); -} - -extern "C" { - -/// Initialization -/// -/// \param Ident Source location identification, can be NULL. -/// -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; - bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; - bool UseGenericStateMachine = Configuration.UseGenericStateMachine; - if (IsSPMD) { - inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, - KernelLaunchEnvironment); - synchronize::threadsAligned(atomic::relaxed); - } else { - inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, - KernelLaunchEnvironment); - // No need to wait since only the main threads will execute user - // code and workers will run into a barrier right away. - } - - if (IsSPMD) { - state::assumeInitialState(IsSPMD); - - // Synchronize to ensure the assertions above are in an aligned region. - // The barrier is eliminated later. - synchronize::threadsAligned(atomic::relaxed); - return -1; - } - - if (mapping::isInitialThreadInLevel0(IsSPMD)) - return -1; - - // Enter the generic state machine if enabled and if this thread can possibly - // be an active worker thread. - // - // The latter check is important for NVIDIA Pascal (but not Volta) and AMD - // GPU. In those cases, a single thread can apparently satisfy a barrier on - // behalf of all threads in the same warp. Thus, it would not be safe for - // other threads in the main thread's warp to reach the first - // synchronize::threads call in genericStateMachine before the main thread - // reaches its corresponding synchronize::threads call: that would permit all - // active worker threads to proceed before the main thread has actually set - // state::ParallelRegionFn, and then they would immediately quit without - // doing any work. mapping::getMaxTeamThreads() does not include any of the - // main thread's warp, so none of its threads can ever be active worker - // threads. - if (UseGenericStateMachine && - mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) - genericStateMachine(KernelEnvironment.Ident); - - return mapping::getThreadIdInBlock(); -} - -/// De-Initialization -/// -/// In non-SPMD, this function releases the workers trapped in a state machine -/// and also any memory dynamically allocated by the runtime. -/// -/// \param Ident Source location identification, can be NULL. -/// -void __kmpc_target_deinit() { - bool IsSPMD = mapping::isSPMDMode(); - if (IsSPMD) - return; - - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - // Signal the workers to exit the state machine and exit the kernel. - state::ParallelRegionFn = nullptr; - } else if (!state::getKernelEnvironment() - .Configuration.UseGenericStateMachine) { - // Retrieve the work function just to ensure we always call - // __kmpc_kernel_parallel even if a custom state machine is used. - // TODO: this is not super pretty. The problem is we create the call to - // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it - // is not there yet. Thus, we assume we never reach it from - // __kmpc_target_deinit. That allows us to remove the store in there to - // ParallelRegionFn, which leads to bad results later on. - ParallelRegionFnTy WorkFn = nullptr; - __kmpc_kernel_parallel(&WorkFn); - ASSERT(WorkFn == nullptr, nullptr); - } -} - -int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } -} diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp deleted file mode 100644 index 83f9233d9480..000000000000 --- a/offload/DeviceRTL/src/LibC.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "LibC.h" - -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) -extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; } -#else -extern "C" int vprintf(const char *format, __builtin_va_list); -#endif - -extern "C" { -[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) { - auto *L = reinterpret_cast<const unsigned char *>(lhs); - auto *R = reinterpret_cast<const unsigned char *>(rhs); - - for (size_t I = 0; I < count; ++I) - if (L[I] != R[I]) - return (int)L[I] - (int)R[I]; - - return 0; -} - -[[gnu::weak]] void memset(void *dst, int C, size_t count) { - auto *dstc = reinterpret_cast<char *>(dst); - for (size_t I = 0; I < count; ++I) - dstc[I] = C; -} - -[[gnu::weak]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} - -namespace ompx { -[[clang::no_builtin("printf")]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} // namespace ompx diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp deleted file mode 100644 index b145892d1ece..000000000000 --- a/offload/DeviceRTL/src/Mapping.cpp +++ /dev/null @@ -1,212 +0,0 @@ -//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Mapping.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" -#include "gpuintrin.h" - -using namespace ompx; - -// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI -// is set to 'none'. We only support COV5+ but this can be removed when COV4 is -// fully deprecated. -#ifdef __AMDGPU__ -extern const inline uint32_t __oclc_ABI_version = 500; -[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__; -#endif - -static bool isInLastWarp() { - uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & - ~(mapping::getWarpSize() - 1); - return mapping::getThreadIdInBlock() == MainTId; -} - -bool mapping::isMainThreadInGenericMode(bool IsSPMD) { - if (IsSPMD || icv::Level) - return false; - - // Check if this is the last warp in the block. - return isInLastWarp(); -} - -bool mapping::isMainThreadInGenericMode() { - return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); -} - -bool mapping::isInitialThreadInLevel0(bool IsSPMD) { - if (IsSPMD) - return mapping::getThreadIdInBlock() == 0; - return isInLastWarp(); -} - -bool mapping::isLeaderInWarp() { - __kmpc_impl_lanemask_t Active = mapping::activemask(); - __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); - return utils::popc(Active & LaneMaskLT) == 0; -} - -LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } - -LaneMaskTy mapping::lanemaskLT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_lt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - int64_t Ballot = mapping::activemask(); - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; - return Mask & Ballot; -#endif -} - -LaneMaskTy mapping::lanemaskGT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_gt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - if (Lane == (mapping::getWarpSize() - 1)) - return 0; - int64_t Ballot = mapping::activemask(); - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); - return Mask & Ballot; -#endif -} - -uint32_t mapping::getThreadIdInWarp() { - uint32_t ThreadIdInWarp = __gpu_lane_id(); - ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); - return ThreadIdInWarp; -} - -uint32_t mapping::getThreadIdInBlock(int32_t Dim) { - uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); - return ThreadIdInBlock; -} - -uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } - -uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { - uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); - // If we are in SPMD mode, remove one warp. - return BlockSize - (!IsSPMD * mapping::getWarpSize()); -} -uint32_t mapping::getMaxTeamThreads() { - return mapping::getMaxTeamThreads(mapping::isSPMDMode()); -} - -uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { - return __gpu_num_threads(Dim); -} - -uint32_t mapping::getNumberOfThreadsInKernel() { - return mapping::getNumberOfThreadsInBlock(0) * - mapping::getNumberOfBlocksInKernel(0) * - mapping::getNumberOfThreadsInBlock(1) * - mapping::getNumberOfBlocksInKernel(1) * - mapping::getNumberOfThreadsInBlock(2) * - mapping::getNumberOfBlocksInKernel(2); -} - -uint32_t mapping::getWarpIdInBlock() { - uint32_t WarpID = - mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); - ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); - return WarpID; -} - -uint32_t mapping::getBlockIdInKernel(int32_t Dim) { - uint32_t BlockId = __gpu_block_id(Dim); - ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); - return BlockId; -} - -uint32_t mapping::getNumberOfWarpsInBlock() { - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); -} - -uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { - return __gpu_num_blocks(Dim); -} - -uint32_t mapping::getNumberOfProcessorElements() { - return static_cast<uint32_t>(config::getHardwareParallelism()); -} - -///} - -/// Execution mode -/// -///{ - -// TODO: This is a workaround for initialization coming from kernels outside of -// the TU. We will need to solve this more correctly in the future. -[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode; - -void mapping::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - IsSPMDMode = IsSPMD; -} - -bool mapping::isSPMDMode() { return IsSPMDMode; } - -bool mapping::isGenericMode() { return !isSPMDMode(); } -///} - -extern "C" { -[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() { - return mapping::getThreadIdInBlock(); -} - -[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { - return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); -} - -[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { - return mapping::getWarpSize(); -} -} - -#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \ - extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); } - -_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock) -_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) -_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) -_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) - -extern "C" { -uint64_t ompx_ballot_sync(uint64_t mask, int pred) { - return utils::ballotSync(mask, pred); -} - -int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) { - return utils::shuffleDown(mask, var, delta, width); -} - -float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, - int width) { - return utils::bitCast<float>( - utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width)); -} - -long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) { - return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width); -} - -double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, - int width) { - return utils::bitCast<double>( - utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width)); -} -} diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp deleted file mode 100644 index a89f8b2a7453..000000000000 --- a/offload/DeviceRTL/src/Misc.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "Shared/RPCOpcodes.h" -#include "shared/rpc.h" - -#include "Debug.h" - -namespace ompx { -namespace impl { - -/// Lookup a device-side function using a host pointer /p HstPtr using the table -/// provided by the device plugin. The table is an ordered pair of host and -/// device pointers sorted on the value of the host pointer. -void *indirectCallLookup(void *HstPtr) { - if (!HstPtr) - return nullptr; - - struct IndirectCallTable { - void *HstPtr; - void *DevPtr; - }; - IndirectCallTable *Table = - reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr()); - uint64_t TableSize = config::getIndirectCallTableSize(); - - // If the table is empty we assume this is device pointer. - if (!Table || !TableSize) - return HstPtr; - - uint32_t Left = 0; - uint32_t Right = TableSize; - - // If the pointer is definitely not contained in the table we exit early. - if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr) - return HstPtr; - - while (Left != Right) { - uint32_t Current = Left + (Right - Left) / 2; - if (Table[Current].HstPtr == HstPtr) - return Table[Current].DevPtr; - - if (HstPtr < Table[Current].HstPtr) - Right = Current; - else - Left = Current; - } - - // If we searched the whole table and found nothing this is a device pointer. - return HstPtr; -} - -/// The openmp client instance used to communicate with the server. -[[gnu::visibility("protected"), - gnu::weak]] rpc::Client Client asm("__llvm_rpc_client"); - -} // namespace impl -} // namespace ompx - -/// Interfaces -/// -///{ - -extern "C" { -int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } - -int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } - -double omp_get_wtick(void) { - // The number of ticks per second for the AMDGPU clock varies by card and can - // only be retrieved by querying the driver. We rely on the device environment - // to inform us what the proper frequency is. NVPTX uses a nanosecond - // resolution, we could omit the global read but this makes it consistent. - return 1.0 / ompx::config::getClockFrequency(); -} - -double omp_get_wtime(void) { - return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick(); -} - -void *__llvm_omp_indirect_call_lookup(void *HstPtr) { - return ompx::impl::indirectCallLookup(HstPtr); -} - -void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - return malloc(size); - default: - return nullptr; - } -} - -void omp_free(void *ptr, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - free(ptr); - case omp_null_allocator: - default: - return; - } -} - -unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) { - rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>(); - Port.send_n(data, size); - Port.send([=](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = reinterpret_cast<uintptr_t>(fn); - }); - unsigned long long Ret; - Port.recv([&](rpc::Buffer *Buffer, uint32_t) { - Ret = static_cast<unsigned long long>(Buffer->data[0]); - }); - Port.close(); - return Ret; -} -} - -///} diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp deleted file mode 100644 index 08ce616aee1c..000000000000 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ /dev/null @@ -1,311 +0,0 @@ -//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { - uint32_t NThreadsICV = - NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getMaxTeamThreads(); - - if (NThreadsICV != 0 && NThreadsICV < NumThreads) - NumThreads = NThreadsICV; - - // SPMD mode allows any number of threads, for generic mode we round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (mapping::isSPMDMode()) - return NumThreads; - - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); - - return NumThreads; -} - -// Invoke an outlined parallel function unwrapping arguments (up to 32). -[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, int64_t nargs) { - switch (nargs) { -#include "generated_microtask_cases.gen" - default: - printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); - __builtin_trap(); - } -} - -} // namespace - -extern "C" { - -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t PTeamSize = - NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; - // Avoid the race between the read of the `icv::Level` above and the write - // below by synchronizing all threads here. - synchronize::threadsAligned(atomic::seq_cst); - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. - state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - - // Synchronize all threads after the main thread (TId == 0) set up the - // team state properly. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(PTeamSize, ident, - /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize before we run user code to avoid invalidating the - // assumptions above. - synchronize::threadsAligned(atomic::relaxed); - - if (!PTeamSize || TId < PTeamSize) - invokeMicrotask(TId, 0, fn, args, nargs); - - // Synchronize all threads at the end of a parallel region. - synchronize::threadsAligned(atomic::seq_cst); - } - - // Synchronize all threads to make sure every thread exits the scope above; - // otherwise the following assertions and the assumption in - // __kmpc_target_deinit may not hold. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize to create an aligned region around the assumptions. - synchronize::threadsAligned(atomic::relaxed); - - return; -} - -[[clang::always_inline]] void -__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - - // Assert the parallelism level is zero if disabled by the user. - ASSERT((config::mayUseNestedParallelism() || icv::Level == 0), - "nested parallelism while disabled"); - - // Handle the serialized case first, same for SPMD/non-SPMD: - // 1) if-clause(0) - // 2) parallel in task or other thread state inducing construct - // 3) nested parallel regions - if (OMP_UNLIKELY(!if_expr || state::HasThreadState || - (config::mayUseNestedParallelism() && icv::Level))) { - state::DateEnvironmentRAII DERAII(ident); - ++icv::Level; - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - // From this point forward we know that there is no thread state used. - ASSERT(state::HasThreadState == false, nullptr); - - if (mapping::isSPMDMode()) { - // This was moved to its own routine so it could be called directly - // in certain situations to avoid resource consumption of unused - // logic in parallel_51. - __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs); - - return; - } - - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); - uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; - - // We do *not* create a new data environment because all threads in the team - // that are active are now running this parallel region. They share the - // TeamState, which has an increase level-var and potentially active-level - // set, but they do not have individual ThreadStates yet. If they ever - // modify the ICVs beyond this point a ThreadStates will be allocated. - - bool IsActiveParallelRegion = NumThreads > 1; - if (!IsActiveParallelRegion) { - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - void **GlobalArgs = nullptr; - if (nargs) { - __kmpc_begin_sharing_variables(&GlobalArgs, nargs); - switch (nargs) { - default: - for (int I = 0; I < nargs; I++) - GlobalArgs[I] = args[I]; - break; - case 16: - GlobalArgs[15] = args[15]; - [[fallthrough]]; - case 15: - GlobalArgs[14] = args[14]; - [[fallthrough]]; - case 14: - GlobalArgs[13] = args[13]; - [[fallthrough]]; - case 13: - GlobalArgs[12] = args[12]; - [[fallthrough]]; - case 12: - GlobalArgs[11] = args[11]; - [[fallthrough]]; - case 11: - GlobalArgs[10] = args[10]; - [[fallthrough]]; - case 10: - GlobalArgs[9] = args[9]; - [[fallthrough]]; - case 9: - GlobalArgs[8] = args[8]; - [[fallthrough]]; - case 8: - GlobalArgs[7] = args[7]; - [[fallthrough]]; - case 7: - GlobalArgs[6] = args[6]; - [[fallthrough]]; - case 6: - GlobalArgs[5] = args[5]; - [[fallthrough]]; - case 5: - GlobalArgs[4] = args[4]; - [[fallthrough]]; - case 4: - GlobalArgs[3] = args[3]; - [[fallthrough]]; - case 3: - GlobalArgs[2] = args[2]; - [[fallthrough]]; - case 2: - GlobalArgs[1] = args[1]; - [[fallthrough]]; - case 1: - GlobalArgs[0] = args[0]; - [[fallthrough]]; - case 0: - break; - } - } - - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. - state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, - (void *)nullptr, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - - // Master signals work to activate workers. - synchronize::threads(atomic::seq_cst); - // Master waits for workers to signal. - synchronize::threads(atomic::seq_cst); - } - - if (nargs) - __kmpc_end_sharing_variables(); -} - -[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { - // Work function and arguments for L1 parallel region. - *WorkFn = state::ParallelRegionFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) - return false; - - // Set to true for workers participating in the parallel region. - uint32_t TId = mapping::getThreadIdInBlock(); - bool ThreadIsActive = TId < state::getEffectivePTeamSize(); - return ThreadIsActive; -} - -[[clang::noinline]] void __kmpc_kernel_end_parallel() { - // In case we have modified an ICV for this thread before a ThreadState was - // created. We drop it now to not contaminate the next parallel region. - ASSERT(!mapping::isSPMDMode(), nullptr); - uint32_t TId = mapping::getThreadIdInBlock(); - state::resetStateForThread(TId); - ASSERT(!mapping::isSPMDMode(), nullptr); -} - -uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } - -int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } - -void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, - int32_t thread_limit) {} - -void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} -} diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp deleted file mode 100644 index df141af5ebee..000000000000 --- a/offload/DeviceRTL/src/Profiling.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===------- Profiling.cpp ---------------------------------------- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Profiling.h" - -extern "C" { - -// Provides empty implementations for certain functions in compiler-rt -// that are emitted by the PGO instrumentation. -void __llvm_profile_register_function(void *Ptr) {} -void __llvm_profile_register_names_function(void *Ptr, long int I) {} -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} -} diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp deleted file mode 100644 index fffd0063940c..000000000000 --- a/offload/DeviceRTL/src/Reduction.cpp +++ /dev/null @@ -1,316 +0,0 @@ -//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { - for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -static uint32_t gpu_irregular_simd_reduce(void *reduce_data, - ShuffleReductFnTy shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); - __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); - do { - Liveness = mapping::activemask(); - remote_id = utils::ffs(Liveness & lanemask_gt); - size = utils::popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} - -static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - uint32_t BlockThreadId = mapping::getThreadIdInBlock(); - if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false)) - BlockThreadId = 0; - uint32_t NumThreads = omp_get_num_threads(); - if (NumThreads == 1) - return 1; - - // - // This reduce function handles reduction within a team. It handles - // parallel regions in both L1 and L2 parallelism levels. It also - // supports Generic, SPMD, and NoOMP modes. - // - // 1. Reduce within a warp. - // 2. Warp master copies value to warp 0 via shared memory. - // 3. Warp 0 reduces to a single value. - // 4. The reduced value is available in the thread that returns 1. - // - -#if __has_builtin(__nvvm_reflect) - if (__nvvm_reflect("__CUDA_ARCH") >= 700) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpIdInBlock(); - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % mapping::getWarpSize() == 0) || - (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce( - reduce_data, shflFct, - /*LaneCount=*/NumThreads % mapping::getWarpSize(), - /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize()); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; - } -#endif - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - if (Liveness == lanes::All) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/utils::popc(Liveness), - /*LaneId=*/mapping::getThreadIdInBlock() % - mapping::getWarpSize()); - else { // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - } - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case - // of an L2 parallel region. - return BlockThreadId == 0; -} - -uint32_t roundToWarpsize(uint32_t s) { - if (s < mapping::getWarpSize()) - return 1; - return (s & ~(unsigned)(mapping::getWarpSize() - 1)); -} - -uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } - -} // namespace - -extern "C" { -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct); -} - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) { - // Terminate all threads in non-SPMD mode except for the master thread. - uint32_t ThreadId = mapping::getThreadIdInBlock(); - if (mapping::isGenericMode()) { - if (!mapping::isMainThreadInGenericMode()) - return 0; - ThreadId = 0; - } - - uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt; - uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt; - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = omp_get_num_threads(); - uint32_t TeamId = omp_get_team_num(); - uint32_t NumTeams = omp_get_num_teams(); - [[clang::loader_uninitialized]] static Local<unsigned> Bound; - [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount; - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. - bool IsMaster = (ThreadId == 0); - while (IsMaster) { - Bound = atomic::load(&IterCnt, atomic::acquire); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) { - lgcpyFct(GlobalBuffer, ModBockId, reduce_data); - } else - lgredFct(GlobalBuffer, ModBockId, reduce_data); - - // Propagate the memory writes above to the world. - fence::kernel(atomic::release); - - // Increment team counter. - // This counter is incremented by all teams in the current - // num_of_records chunk. - ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst, - atomic::MemScopeTy::device); - } - - // Synchronize in SPMD mode as in generic mode all but 1 threads are in the - // state machine. - if (mapping::isSPMDMode()) - synchronize::threadsAligned(atomic::acq_rel); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. - unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // Ensure we see the global memory writes by other teams - fence::kernel(atomic::acquire); - - // - // Last team processing. - // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(GlobalBuffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(GlobalBuffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); - if (ActiveThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst); - } - - return 0; -} -} - -void *__kmpc_reduction_get_fixed_buffer() { - return state::getKernelLaunchEnvironment().ReductionBuffer; -} diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp deleted file mode 100644 index 475395102f47..000000000000 --- a/offload/DeviceRTL/src/State.cpp +++ /dev/null @@ -1,482 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -/// Memory implementation -/// -///{ - -/// External symbol to access dynamic shared memory. -[[gnu::aligned( - allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[]; - -/// The kernel environment passed to the init method by the compiler. -[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *> - KernelEnvironmentPtr; - -/// The kernel launch environment passed as argument to the kernel by the -/// runtime. -[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> - KernelLaunchEnvironmentPtr; - -///} - -namespace { - -/// Fallback implementations are missing to trigger a link time error. -/// Implementations for new devices, including the host, should go into a -/// dedicated begin/end declare variant. -/// -///{ -extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) - -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } - -#else - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); - -#endif -} -///} - -/// A "smart" stack in shared memory. -/// -/// The stack exposes a malloc/free interface but works like a stack internally. -/// In fact, it is a separate stack *per warp*. That means, each warp must push -/// and pop symmetrically or this breaks, badly. The implementation will (aim -/// to) detect non-lock-step warps and fallback to malloc/free. The same will -/// happen if a warp runs out of memory. The master warp in generic memory is -/// special and is given more memory than the rest. -/// -struct SharedMemorySmartStackTy { - /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); - - /// Allocate \p Bytes on the stack for the encountering thread. Each thread - /// can call this function. - void *push(uint64_t Bytes); - - /// Deallocate the last allocation made by the encountering thread and pointed - /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint64_t Bytes); - -private: - /// Compute the size of the storage space reserved for a thread. - uint32_t computeThreadStorageTotal() { - uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock, - allocator::ALIGNMENT); - } - - /// Return the top address of the warp data stack, that is the first address - /// this warp will allocate memory at next. - void *getThreadDataTop(uint32_t TId) { - return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; - } - - /// The actual storage, shared among all warps. - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; -}; - -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - -/// The allocation of a single shared memory scratchpad. -[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy> - SharedMemorySmartStack; - -void SharedMemorySmartStackTy::init(bool IsSPMD) { - Usage[mapping::getThreadIdInBlock()] = 0; -} - -void *SharedMemorySmartStackTy::push(uint64_t Bytes) { - // First align the number of requested bytes. - /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to - /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - - uint32_t StorageTotal = computeThreadStorageTotal(); - - // The main thread in generic mode gets the space of its entire warp as the - // other threads do not participate in any computation at all. - if (mapping::isMainThreadInGenericMode()) - StorageTotal *= mapping::getWarpSize(); - - int TId = mapping::getThreadIdInBlock(); - if (Usage[TId] + AlignedBytes <= StorageTotal) { - void *Ptr = getThreadDataTop(TId); - Usage[TId] += AlignedBytes; - return Ptr; - } - - if (config::isDebugMode(DeviceDebugKind::CommonIssues)) - printf("Shared memory stack full, fallback to dynamic allocation of global " - "memory will negatively impact performance.\n"); - void *GlobalMemory = memory::allocGlobal( - AlignedBytes, "Slow path shared memory allocation, insufficient " - "shared memory stack memory!"); - ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!"); - - return GlobalMemory; -} - -void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - if (utils::isSharedMemPtr(Ptr)) { - int TId = mapping::getThreadIdInBlock(); - Usage[TId] -= AlignedBytes; - return; - } - memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); -} - -} // namespace - -void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } - -void *memory::allocShared(uint64_t Bytes, const char *Reason) { - return SharedMemorySmartStack.push(Bytes); -} - -void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { - SharedMemorySmartStack.pop(Ptr, Bytes); -} - -void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); - if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) - printf("nullptr returned by malloc!\n"); - return Ptr; -} - -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } - -///} - -bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { - return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & - (ActiveLevelVar == Other.ActiveLevelVar) & - (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & - (RunSchedVar == Other.RunSchedVar) & - (RunSchedChunkVar == Other.RunSchedChunkVar); -} - -void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { - ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); - ASSERT(LevelVar == Other.LevelVar, nullptr); - ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); - ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr); - ASSERT(RunSchedVar == Other.RunSchedVar, nullptr); - ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); -} - -void state::TeamStateTy::init(bool IsSPMD) { - ICVState.NThreadsVar = 0; - ICVState.LevelVar = 0; - ICVState.ActiveLevelVar = 0; - ICVState.Padding0Val = 0; - ICVState.MaxActiveLevelsVar = 1; - ICVState.RunSchedVar = omp_sched_static; - ICVState.RunSchedChunkVar = 1; - ParallelTeamSize = 1; - HasThreadState = false; - ParallelRegionFnVar = nullptr; -} - -bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { - return (ICVState == Other.ICVState) & - (HasThreadState == Other.HasThreadState) & - (ParallelTeamSize == Other.ParallelTeamSize); -} - -void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { - ICVState.assertEqual(Other.ICVState); - ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); - ASSERT(HasThreadState == Other.HasThreadState, nullptr); -} - -[[clang::loader_uninitialized]] Local<state::TeamStateTy> - ompx::state::TeamState; -[[clang::loader_uninitialized]] Local<state::ThreadStateTy **> - ompx::state::ThreadStates; - -namespace { - -int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, - int OutOfBoundsVal = -1) { - if (Level == 0) - return DefaultVal; - int LevelVar = omp_get_level(); - if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) - return OutOfBoundsVal; - int ActiveLevel = icv::ActiveLevel; - if (OMP_UNLIKELY(Level != ActiveLevel)) - return DefaultVal; - return Val; -} - -} // namespace - -void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - SharedMemorySmartStack.init(IsSPMD); - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - TeamState.init(IsSPMD); - ThreadStates = nullptr; - KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; - } -} - -KernelEnvironmentTy &state::getKernelEnvironment() { - return *KernelEnvironmentPtr; -} - -KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { - return *KernelLaunchEnvironmentPtr; -} - -void state::enterDataEnvironment(IdentTy *Ident) { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - if (!config::mayUseThreadStates()) - return; - - unsigned TId = mapping::getThreadIdInBlock(); - ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>( - memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc")); - uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); - if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { - uint32_t Bytes = - sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock(); - void *ThreadStatesPtr = - memory::allocGlobal(Bytes, "Thread state array allocation"); - __builtin_memset(ThreadStatesPtr, 0, Bytes); - if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), - reinterpret_cast<uintptr_t>(ThreadStatesPtr), - atomic::seq_cst, atomic::seq_cst)) - memory::freeGlobal(ThreadStatesPtr, - "Thread state array allocated multiple times"); - ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst), - "Expected valid thread states bit!"); - } - NewThreadState->init(ThreadStates[TId]); - TeamState.HasThreadState = true; - ThreadStates[TId] = NewThreadState; -} - -void state::exitDataEnvironment() { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - - unsigned TId = mapping::getThreadIdInBlock(); - resetStateForThread(TId); -} - -void state::resetStateForThread(uint32_t TId) { - if (!config::mayUseThreadStates()) - return; - if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) - return; - - ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; - memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc"); - ThreadStates[TId] = PreviousThreadState; -} - -void state::runAndCheckState(void(Func(void))) { - TeamStateTy OldTeamState = TeamState; - OldTeamState.assertEqual(TeamState); - - Func(); - - OldTeamState.assertEqual(TeamState); -} - -void state::assumeInitialState(bool IsSPMD) { - TeamStateTy InitialTeamState; - InitialTeamState.init(IsSPMD); - InitialTeamState.assertEqual(TeamState); - ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); -} - -int state::getEffectivePTeamSize() { - int PTeamSize = state::ParallelTeamSize; - return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); -} - -extern "C" { -void omp_set_dynamic(int V) {} - -int omp_get_dynamic(void) { return 0; } - -void omp_set_num_threads(int V) { icv::NThreads = V; } - -int omp_get_max_threads(void) { - int NT = icv::NThreads; - return NT > 0 ? NT : mapping::getMaxTeamThreads(); -} - -int omp_get_level(void) { - int LevelVar = icv::Level; - ASSERT(LevelVar >= 0, nullptr); - return LevelVar; -} - -int omp_get_active_level(void) { return !!icv::ActiveLevel; } - -int omp_in_parallel(void) { return !!icv::ActiveLevel; } - -void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { - *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched); - *ChunkSize = state::RunSchedChunk; -} - -void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { - icv::RunSched = (int)ScheduleKind; - state::RunSchedChunk = ChunkSize; -} - -int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); -} - -int omp_get_thread_num(void) { - return omp_get_ancestor_thread_num(omp_get_level()); -} - -int omp_get_team_size(int Level) { - return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); -} - -int omp_get_num_threads(void) { - return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); -} - -int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } - -int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } - -void omp_set_nested(int) {} - -int omp_get_nested(void) { return false; } - -void omp_set_max_active_levels(int Levels) { - icv::MaxActiveLevels = Levels > 0 ? 1 : 0; -} - -int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } - -omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } - -int omp_get_num_places(void) { return 0; } - -int omp_get_place_num_procs(int) { return omp_get_num_procs(); } - -void omp_get_place_proc_ids(int, int *) { - // TODO -} - -int omp_get_place_num(void) { return 0; } - -int omp_get_partition_num_places(void) { return 0; } - -void omp_get_partition_place_nums(int *) { - // TODO -} - -int omp_get_cancellation(void) { return 0; } - -void omp_set_default_device(int) {} - -int omp_get_default_device(void) { return -1; } - -int omp_get_num_devices(void) { return config::getNumDevices(); } - -int omp_get_device_num(void) { return config::getDeviceNum(); } - -int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } - -int omp_get_team_num() { return mapping::getBlockIdInKernel(); } - -int omp_get_initial_device(void) { return -1; } - -int omp_is_initial_device(void) { return 0; } -} - -extern "C" { -[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { - return memory::allocShared(Bytes, "Frontend alloc shared"); -} - -[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { - memory::freeShared(Ptr, Bytes, "Frontend free shared"); -} - -void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } - -void *llvm_omp_target_dynamic_shared_alloc() { - return __kmpc_get_dynamic_shared(); -} - -void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } - -/// Allocate storage in shared memory to communicate arguments from the main -/// thread to the workers in generic mode. If we exceed -/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. -constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; - -[[clang::loader_uninitialized]] static Local<void *> - SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -[[clang::loader_uninitialized]] static Local<void **> - SharedMemVariableSharingSpacePtr; - -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { - if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { - SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; - } else { - SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal( - nArgs * sizeof(void *), "new extended args"); - ASSERT(SharedMemVariableSharingSpacePtr != nullptr, - "Nullptr returned by malloc!"); - } - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -void __kmpc_end_sharing_variables() { - if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) - memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args"); -} - -void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} -} diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp deleted file mode 100644 index e833423eb265..000000000000 --- a/offload/DeviceRTL/src/Stub.cpp +++ /dev/null @@ -1 +0,0 @@ -// This is an empty file used to create a device fatbinary. diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp deleted file mode 100644 index 2f1ed34a3f6d..000000000000 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ /dev/null @@ -1,379 +0,0 @@ -//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. -// -//===----------------------------------------------------------------------===// - -#include "Synchronization.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -namespace impl { - -/// Atomics -/// -///{ -///} - -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - // builtin_amdgcn_atomic_inc32 should expand to this switch when - // passed a runtime value, but does not do so yet. Workaround here. - -#define ScopeSwitch(ORDER) \ - switch (MemScope) { \ - case atomic::MemScopeTy::system: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \ - case atomic::MemScopeTy::device: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \ - case atomic::MemScopeTy::workgroup: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \ - case atomic::MemScopeTy::wavefront: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront"); \ - case atomic::MemScopeTy::single: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread"); \ - } - -#define Case(ORDER) \ - case ORDER: \ - ScopeSwitch(ORDER) - - switch (Ordering) { - default: - __builtin_unreachable(); - Case(atomic::relaxed); - Case(atomic::acquire); - Case(atomic::release); - Case(atomic::acq_rel); - Case(atomic::seq_cst); -#undef Case -#undef ScopeSwitch - } -} - -[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker; - -void namedBarrierInit() { - // Don't have global ctors, and shared memory is not zero init - atomic::store(&namedBarrierTracker, 0u, atomic::release); -} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - // assert(NumThreads % 32 == 0); - - uint32_t WarpSize = mapping::getWarpSize(); - uint32_t NumWaves = NumThreads / WarpSize; - - fence::team(atomic::acquire); - - // named barrier implementation for amdgcn. - // Uses two 16 bit unsigned counters. One for the number of waves to have - // reached the barrier, and one to count how many times the barrier has been - // passed. These are packed in a single atomically accessed 32 bit integer. - // Low bits for the number of waves, assumed zero before this call. - // High bits to count the number of times the barrier has been passed. - - // precondition: NumWaves != 0; - // invariant: NumWaves * WarpSize == NumThreads; - // precondition: NumWaves < 0xffffu; - - // Increment the low 16 bits once, using the lowest active thread. - if (mapping::isLeaderInWarp()) { - uint32_t load = atomic::add(&namedBarrierTracker, 1, - atomic::relaxed); // commutative - - // Record the number of times the barrier has been passed - uint32_t generation = load & 0xffff0000u; - - if ((load & 0x0000ffffu) == (NumWaves - 1)) { - // Reached NumWaves in low bits so this is the last wave. - // Set low bits to zero and increment high bits - load += 0x00010000u; // wrap is safe - load &= 0xffff0000u; // because bits zeroed second - - // Reset the wave counter and release the waiting waves - atomic::store(&namedBarrierTracker, load, atomic::relaxed); - } else { - // more waves still to go, spin until generation counter changes - do { - __builtin_amdgcn_s_sleep(0); - load = atomic::load(&namedBarrierTracker, atomic::relaxed); - } while ((load & 0xffff0000u) == generation); - } - } - fence::team(atomic::release); -} - -void fenceTeam(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::workgroup); -} - -void fenceKernel(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::device); -} - -void fenceSystem(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::system); -} - -void syncWarp(__kmpc_impl_lanemask_t) { - // This is a no-op on current AMDGPU hardware but it is used by the optimizer - // to enforce convergent behaviour between control flow graphs. - __builtin_amdgcn_wave_barrier(); -} - -void syncThreads(atomic::OrderingTy Ordering) { - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst); - - __builtin_amdgcn_s_barrier(); - - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst); -} -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } - -// TODO: Don't have wavefront lane locks. Possibly can't have them. -void unsetLock(omp_lock_t *) { __builtin_trap(); } -int testLock(omp_lock_t *) { __builtin_trap(); } -void initLock(omp_lock_t *) { __builtin_trap(); } -void destroyLock(omp_lock_t *) { __builtin_trap(); } -void setLock(omp_lock_t *) { __builtin_trap(); } - -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -void unsetCriticalLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); -} - -void setCriticalLock(omp_lock_t *Lock) { - uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1; - if (mapping::getThreadIdInWarp() == LowestActiveThread) { - fenceKernel(atomic::release); - while ( - !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) { - __builtin_amdgcn_s_sleep(32); - } - fenceKernel(atomic::acquire); - } -} - -#endif -///} - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return __nvvm_atom_inc_gen_ui(Address, Val); -} - -void namedBarrierInit() {} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - ASSERT(NumThreads % 32 == 0, nullptr); - - // The named barrier for active parallel threads of a team in an L1 parallel - // region to synchronize with each other. - constexpr int BarrierNo = 7; - __nvvm_barrier_sync_cnt(BarrierNo, NumThreads); -} - -void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } - -void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } - -void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } - -void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } - -void syncThreads(atomic::OrderingTy Ordering) { - constexpr int BarrierNo = 8; - __nvvm_barrier_sync(BarrierNo); -} - -void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } - -constexpr uint32_t OMP_SPIN = 1000; -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -// TODO: This seems to hide a bug in the declare variant handling. If it is -// called before it is defined -// here the overload won't happen. Investigate lalter! -void unsetLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst); -} - -int testLock(omp_lock_t *Lock) { - return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst); -} - -void initLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setLock(omp_lock_t *Lock) { - // TODO: not sure spinning is a good idea here.. - while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, - atomic::seq_cst) != UNSET) { - int32_t start = __nvvm_read_ptx_sreg_clock(); - int32_t now; - for (;;) { - now = __nvvm_read_ptx_sreg_clock(); - int32_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } - -#endif -///} - -} // namespace impl - -void synchronize::init(bool IsSPMD) { - if (!IsSPMD) - impl::namedBarrierInit(); -} - -void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } - -void synchronize::threads(atomic::OrderingTy Ordering) { - impl::syncThreads(Ordering); -} - -void synchronize::threadsAligned(atomic::OrderingTy Ordering) { - impl::syncThreadsAligned(Ordering); -} - -void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); } - -void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); } - -void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); } - -uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return impl::atomicInc(Addr, V, Ordering, MemScope); -} - -void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } - -extern "C" { -void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { - __kmpc_barrier(Loc, TId); - return 0; -} - -void __kmpc_barrier(IdentTy *Loc, int32_t TId) { - if (mapping::isSPMDMode()) - return __kmpc_barrier_simple_spmd(Loc, TId); - - // Generic parallel regions are run with multiple of the warp size or single - // threaded, in the latter case we need to stop here. - if (omp_get_num_threads() == 1) - return __kmpc_flush(Loc); - - impl::namedBarrier(); -} - -[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { - synchronize::threadsAligned(atomic::OrderingTy::seq_cst); -} - -[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc, - int32_t TId) { - synchronize::threads(atomic::OrderingTy::seq_cst); -} - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { - return omp_get_thread_num() == 0; -} - -void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { - return omp_get_thread_num() == Filter; -} - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { - return __kmpc_master(Loc, TId); -} - -void __kmpc_end_single(IdentTy *Loc, int32_t TId) { - // The barrier is explicitly called. -} - -void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); } - -uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); } - -void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } - -void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } - -void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } - -void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } - -void ompx_sync_block(int Ordering) { - impl::syncThreadsAligned(atomic::OrderingTy(Ordering)); -} -void ompx_sync_block_acq_rel() { - impl::syncThreadsAligned(atomic::OrderingTy::acq_rel); -} -void ompx_sync_block_divergent(int Ordering) { - impl::syncThreads(atomic::OrderingTy(Ordering)); -} -} // extern "C" diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp deleted file mode 100644 index d0be0ace50df..000000000000 --- a/offload/DeviceRTL/src/Tasking.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. -// -// TODO: We should not allocate and execute the task in two steps. A new API is -// needed for that though. -// -//===----------------------------------------------------------------------===// - -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" - -using namespace ompx; - -extern "C" { - -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn) { - auto TaskSizeInclPrivateValuesPadded = - utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *)); - auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; - TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( - TaskSizeTotal, "explicit task descriptor"); - TaskDescriptor->Payload = - utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); - TaskDescriptor->TaskFn = TaskFn; - - return TaskDescriptor; -} - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *) { - state::DateEnvironmentRAII DERAII(Loc); - - TaskDescriptor->TaskFn(0, TaskDescriptor); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); - return 0; -} - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::enterDataEnvironment(Loc); -} - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::exitDataEnvironment(); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); -} - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *) {} - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *) { - // Skip task entirely if empty iteration space. - if (*LowerBound > *UpperBound) - return; - - // The compiler has already stored lb and ub in the TaskDescriptorTy structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. - return 1; -} - -int omp_get_max_task_priority(void) { return 0; } -} diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp deleted file mode 100644 index 59a2cc3f27ac..000000000000 --- a/offload/DeviceRTL/src/Workshare.cpp +++ /dev/null @@ -1,970 +0,0 @@ -//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. -// -//===----------------------------------------------------------------------===// - -#include "Workshare.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -// TODO: -struct DynamicScheduleTracker { - int64_t Chunk; - int64_t LoopUpperBound; - int64_t NextLowerBound; - int64_t Stride; - kmp_sched_t ScheduleType; - DynamicScheduleTracker *NextDST; -}; - -#define ASSERT0(...) - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -// TODO: This variable is a hack inherited from the old runtime. -[[clang::loader_uninitialized]] static Local<uint64_t> Cnt; - -template <typename T, typename ST> struct omptarget_nvptx_LoopSupport { - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! \brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, - T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the beginning of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, - T entityId, T numberOfEntities) { - // No chunk size specified. Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter, - T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - int32_t gtid = omp_get_thread_num(); - int numberOfActiveOMPThreads = omp_get_num_threads(); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * omp_get_team_num() + gtid, - omp_get_num_teams() * numberOfActiveOMPThreads); - break; - } - default: { - // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - static void dispatch_init(IdentTy *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, - DynamicScheduleTracker *DST) { - int tid = mapping::getThreadIdInBlock(); - T tnum = omp_get_num_threads(); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. - if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched; - int ChunkInt; - omp_get_schedule(&rtSched, &ChunkInt); - chunk = ChunkInt; - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - } else { - // ASSERT(LT_FUSSY, - // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - // "unknown schedule %d & chunk %lld\n", (int)schedule, - // (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - // round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - DST->ScheduleType = schedule; - if (chunk < 1) - chunk = 1; - DST->Chunk = chunk; - DST->LoopUpperBound = ub; - DST->NextLowerBound = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - Cnt = 0; - fence::team(atomic::seq_cst); - } - __kmpc_barrier(loc, threadId); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = mapping::activemask(); - uint32_t leader = utils::ffs(active) - 1; - uint32_t change = utils::popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT(); - unsigned int rank = utils::popc(active & lane_mask_lt); - uint64_t warp_res = 0; - if (rank == 0) { - warp_res = atomic::add(&Cnt, change, atomic::seq_cst); - } - warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); - return warp_res + rank; - } - - static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound, - T loopUpperBound) { - T N = NextIter(); - lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - return FINISHED; - } - - static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride, - DynamicScheduleTracker *DST) { - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = DST->ScheduleType; - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = DST->NextLowerBound; - T ub = DST->LoopUpperBound; - // finished? - if (myLb > ub) { - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = DST->Chunk; - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = DST->Stride; - DST->NextLowerBound = myLb + stride; - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound, - DST->LoopUpperBound); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - return DISPATCH_NOTFINISHED; - } - - static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// TODO: Expand the dispatch API to take a DST pointer which can then be -// allocated properly without malloc. -// For now, each team will contain an LDS pointer (ThreadDST) to a global array -// of references to the DST structs allocated (in global memory) for each thread -// in the team. The global memory array is allocated during the init phase if it -// was not allocated already and will be deallocated when the dispatch phase -// ends: -// -// __kmpc_dispatch_init -// -// ** Dispatch loop ** -// -// __kmpc_dispatch_deinit -// -[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **> - ThreadDST; - -// Create a new DST, link the current one, and define the new as current. -static DynamicScheduleTracker *pushDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - // Each block will allocate an array of pointers to DST structs. The array is - // equal in length to the number of threads in that block. - if (!ThreadDST) { - // Allocate global memory array of pointers to DST structs: - if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0) - ThreadDST = static_cast<DynamicScheduleTracker **>( - memory::allocGlobal(mapping::getNumberOfThreadsInBlock() * - sizeof(DynamicScheduleTracker *), - "new ThreadDST array")); - synchronize::threads(atomic::seq_cst); - - // Initialize the array pointers: - ThreadDST[ThreadIndex] = nullptr; - } - - // Create a DST struct for the current thread: - DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>( - memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST")); - *NewDST = DynamicScheduleTracker({0}); - - // Add the new DST struct to the array of DST structs: - NewDST->NextDST = ThreadDST[ThreadIndex]; - ThreadDST[ThreadIndex] = NewDST; - return NewDST; -} - -// Return the current DST. -static DynamicScheduleTracker *peekDST() { - return ThreadDST[mapping::getThreadIdInBlock()]; -} - -// Pop the current DST and restore the last one. -static void popDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex]; - DynamicScheduleTracker *OldDST = CurrentDST->NextDST; - memory::freeGlobal(CurrentDST, "remove DST"); - ThreadDST[ThreadIndex] = OldDST; - - // Check if we need to deallocate the global array. Ensure all threads - // in the block have finished deallocating the individual DSTs. - synchronize::threads(atomic::seq_cst); - if (!ThreadDST[ThreadIndex] && !ThreadIndex) { - memory::freeGlobal(ThreadDST, "remove ThreadDST array"); - ThreadDST = nullptr; - } - synchronize::threads(atomic::seq_cst); -} - -void workshare::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - ThreadDST = nullptr; -} - -extern "C" { - -// init -void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule, - int32_t lb, int32_t ub, int32_t st, int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule, - uint32_t lb, uint32_t ub, int32_t st, - int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule, - int64_t lb, int64_t ub, int64_t st, int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule, - uint64_t lb, uint64_t ub, int64_t st, - int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -// next -int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -// fini -void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini(); -} - -// deinit -void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} - -void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {} -} - -namespace ompx { - -/// Helper class to hide the generic loop nest and provide the template argument -/// throughout. -template <typename Ty> class StaticLoopChunker { - - /// Generic loop nest that handles block and/or thread distribution in the - /// absence of user specified chunk sizes. This implicitly picks a block chunk - /// size equal to the number of threads in the block and a thread chunk size - /// equal to one. In contrast to the chunked version we can get away with a - /// single loop in this case - static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg, - Ty NumBlocks, Ty BId, Ty NumThreads, - Ty TId, Ty NumIters, - uint8_t OneIterationPerThread) { - Ty KernelIteration = NumBlocks * NumThreads; - - // Start index in the normalized space. - Ty IV = BId * NumThreads + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - if (IV < NumIters) { - do { - - // Execute the loop body. - LoopBody(IV, Arg); - - // Every thread executed one block and thread chunk now. - IV += KernelIteration; - - if (OneIterationPerThread) - return; - - } while (IV < NumIters); - } - } - - /// Generic loop nest that handles block and/or thread distribution in the - /// presence of user specified chunk sizes (for at least one of them). - static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg, - Ty BlockChunk, Ty NumBlocks, Ty BId, - Ty ThreadChunk, Ty NumThreads, Ty TId, - Ty NumIters, - uint8_t OneIterationPerThread) { - Ty KernelIteration = NumBlocks * BlockChunk; - - // Start index in the chunked space. - Ty IV = BId * BlockChunk + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - do { - - Ty BlockChunkLeft = - BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0; - Ty ThreadChunkLeft = - ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft; - - while (ThreadChunkLeft--) { - - // Given the blocking it's hard to keep track of what to execute. - if (IV >= NumIters) - return; - - // Execute the loop body. - LoopBody(IV, Arg); - - if (OneIterationPerThread) - return; - - ++IV; - } - - IV += KernelIteration; - - } while (IV < NumIters); - } - -public: - /// Worksharing `for`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] NumThreads Number of GPU threads - /// \param[in] ThreadChunk Size of thread chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty NumThreads, Ty ThreadChunk, - uint8_t OneIterationPerThread) { - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but we don't know if we are in a - // parallel at all or if the user might have used a `num_threads` clause - // on the parallel and reduced the number compared to the block size. - // Since nested parallels are possible too we need to get the thread id - // from the `omp` getter and not the mapping directly. - Ty TId = omp_get_thread_num(); - - // There are no blocks involved here. - Ty BlockChunk = 0; - Ty NumBlocks = 1; - Ty BId = 0; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads than iterations we can indicate that to - // avoid an outer loop. - if (config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumThreads >= NumIters, "Broken assumption"); - - if (ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - } - - /// Worksharing `distribute`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] BlockChunk Size of block chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty BlockChunk, - uint8_t OneIterationPerThread) { - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - - // There are no threads involved here. - Ty ThreadChunk = 0; - Ty NumThreads = 1; - Ty TId = 0; - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If we know we have more blocks than iterations we can indicate that to - // avoid an outer loop. - if (config::getAssumeTeamsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumBlocks >= NumIters, "Broken assumption"); - - if (BlockChunk != NumThreads) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - } - - /// Worksharing `distribute parallel for`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] NumThreads Number of GPU threads - /// \param[in] BlockChunk Size of block chunk - /// \param[in] ThreadChunk Size of thread chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *), - void *Arg, Ty NumIters, Ty NumThreads, - Ty BlockChunk, Ty ThreadChunk, - uint8_t OneIterationPerThread) { - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but the user might have used a - // `num_threads` clause on the parallel and reduced the number compared to - // the block size. - Ty TId = mapping::getThreadIdInBlock(); - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads (across all blocks) than iterations we - // can indicate that to avoid an outer loop. - if (config::getAssumeTeamsOversubscription() & - config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); - - if (BlockChunk != NumThreads || ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - } -}; - -} // namespace ompx - -#define OMP_LOOP_ENTRY(BW, TY) \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY block_chunk, TY thread_chunk, \ - uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::DistributeFor( \ - loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \ - one_iteration_per_thread); \ - } \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ - void *arg, TY num_iters, TY block_chunk, \ - uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::Distribute( \ - loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \ - } \ - [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \ - thread_chunk, one_iteration_per_thread); \ - } - -extern "C" { -OMP_LOOP_ENTRY(_4, int32_t) -OMP_LOOP_ENTRY(_4u, uint32_t) -OMP_LOOP_ENTRY(_8, int64_t) -OMP_LOOP_ENTRY(_8u, uint64_t) -} |
