diff options
Diffstat (limited to 'offload/DeviceRTL/src/State.cpp')
| -rw-r--r-- | offload/DeviceRTL/src/State.cpp | 482 |
1 files changed, 0 insertions, 482 deletions
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp deleted file mode 100644 index 475395102f47..000000000000 --- a/offload/DeviceRTL/src/State.cpp +++ /dev/null @@ -1,482 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -/// Memory implementation -/// -///{ - -/// External symbol to access dynamic shared memory. -[[gnu::aligned( - allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[]; - -/// The kernel environment passed to the init method by the compiler. -[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *> - KernelEnvironmentPtr; - -/// The kernel launch environment passed as argument to the kernel by the -/// runtime. -[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> - KernelLaunchEnvironmentPtr; - -///} - -namespace { - -/// Fallback implementations are missing to trigger a link time error. -/// Implementations for new devices, including the host, should go into a -/// dedicated begin/end declare variant. -/// -///{ -extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) - -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } - -#else - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); - -#endif -} -///} - -/// A "smart" stack in shared memory. -/// -/// The stack exposes a malloc/free interface but works like a stack internally. -/// In fact, it is a separate stack *per warp*. That means, each warp must push -/// and pop symmetrically or this breaks, badly. The implementation will (aim -/// to) detect non-lock-step warps and fallback to malloc/free. The same will -/// happen if a warp runs out of memory. The master warp in generic memory is -/// special and is given more memory than the rest. -/// -struct SharedMemorySmartStackTy { - /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); - - /// Allocate \p Bytes on the stack for the encountering thread. Each thread - /// can call this function. - void *push(uint64_t Bytes); - - /// Deallocate the last allocation made by the encountering thread and pointed - /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint64_t Bytes); - -private: - /// Compute the size of the storage space reserved for a thread. - uint32_t computeThreadStorageTotal() { - uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock, - allocator::ALIGNMENT); - } - - /// Return the top address of the warp data stack, that is the first address - /// this warp will allocate memory at next. - void *getThreadDataTop(uint32_t TId) { - return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; - } - - /// The actual storage, shared among all warps. - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; -}; - -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - -/// The allocation of a single shared memory scratchpad. -[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy> - SharedMemorySmartStack; - -void SharedMemorySmartStackTy::init(bool IsSPMD) { - Usage[mapping::getThreadIdInBlock()] = 0; -} - -void *SharedMemorySmartStackTy::push(uint64_t Bytes) { - // First align the number of requested bytes. - /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to - /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - - uint32_t StorageTotal = computeThreadStorageTotal(); - - // The main thread in generic mode gets the space of its entire warp as the - // other threads do not participate in any computation at all. - if (mapping::isMainThreadInGenericMode()) - StorageTotal *= mapping::getWarpSize(); - - int TId = mapping::getThreadIdInBlock(); - if (Usage[TId] + AlignedBytes <= StorageTotal) { - void *Ptr = getThreadDataTop(TId); - Usage[TId] += AlignedBytes; - return Ptr; - } - - if (config::isDebugMode(DeviceDebugKind::CommonIssues)) - printf("Shared memory stack full, fallback to dynamic allocation of global " - "memory will negatively impact performance.\n"); - void *GlobalMemory = memory::allocGlobal( - AlignedBytes, "Slow path shared memory allocation, insufficient " - "shared memory stack memory!"); - ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!"); - - return GlobalMemory; -} - -void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - if (utils::isSharedMemPtr(Ptr)) { - int TId = mapping::getThreadIdInBlock(); - Usage[TId] -= AlignedBytes; - return; - } - memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); -} - -} // namespace - -void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } - -void *memory::allocShared(uint64_t Bytes, const char *Reason) { - return SharedMemorySmartStack.push(Bytes); -} - -void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { - SharedMemorySmartStack.pop(Ptr, Bytes); -} - -void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); - if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) - printf("nullptr returned by malloc!\n"); - return Ptr; -} - -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } - -///} - -bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { - return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & - (ActiveLevelVar == Other.ActiveLevelVar) & - (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & - (RunSchedVar == Other.RunSchedVar) & - (RunSchedChunkVar == Other.RunSchedChunkVar); -} - -void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { - ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); - ASSERT(LevelVar == Other.LevelVar, nullptr); - ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); - ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr); - ASSERT(RunSchedVar == Other.RunSchedVar, nullptr); - ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); -} - -void state::TeamStateTy::init(bool IsSPMD) { - ICVState.NThreadsVar = 0; - ICVState.LevelVar = 0; - ICVState.ActiveLevelVar = 0; - ICVState.Padding0Val = 0; - ICVState.MaxActiveLevelsVar = 1; - ICVState.RunSchedVar = omp_sched_static; - ICVState.RunSchedChunkVar = 1; - ParallelTeamSize = 1; - HasThreadState = false; - ParallelRegionFnVar = nullptr; -} - -bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { - return (ICVState == Other.ICVState) & - (HasThreadState == Other.HasThreadState) & - (ParallelTeamSize == Other.ParallelTeamSize); -} - -void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { - ICVState.assertEqual(Other.ICVState); - ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); - ASSERT(HasThreadState == Other.HasThreadState, nullptr); -} - -[[clang::loader_uninitialized]] Local<state::TeamStateTy> - ompx::state::TeamState; -[[clang::loader_uninitialized]] Local<state::ThreadStateTy **> - ompx::state::ThreadStates; - -namespace { - -int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, - int OutOfBoundsVal = -1) { - if (Level == 0) - return DefaultVal; - int LevelVar = omp_get_level(); - if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) - return OutOfBoundsVal; - int ActiveLevel = icv::ActiveLevel; - if (OMP_UNLIKELY(Level != ActiveLevel)) - return DefaultVal; - return Val; -} - -} // namespace - -void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - SharedMemorySmartStack.init(IsSPMD); - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - TeamState.init(IsSPMD); - ThreadStates = nullptr; - KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; - } -} - -KernelEnvironmentTy &state::getKernelEnvironment() { - return *KernelEnvironmentPtr; -} - -KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { - return *KernelLaunchEnvironmentPtr; -} - -void state::enterDataEnvironment(IdentTy *Ident) { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - if (!config::mayUseThreadStates()) - return; - - unsigned TId = mapping::getThreadIdInBlock(); - ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>( - memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc")); - uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); - if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { - uint32_t Bytes = - sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock(); - void *ThreadStatesPtr = - memory::allocGlobal(Bytes, "Thread state array allocation"); - __builtin_memset(ThreadStatesPtr, 0, Bytes); - if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), - reinterpret_cast<uintptr_t>(ThreadStatesPtr), - atomic::seq_cst, atomic::seq_cst)) - memory::freeGlobal(ThreadStatesPtr, - "Thread state array allocated multiple times"); - ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst), - "Expected valid thread states bit!"); - } - NewThreadState->init(ThreadStates[TId]); - TeamState.HasThreadState = true; - ThreadStates[TId] = NewThreadState; -} - -void state::exitDataEnvironment() { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - - unsigned TId = mapping::getThreadIdInBlock(); - resetStateForThread(TId); -} - -void state::resetStateForThread(uint32_t TId) { - if (!config::mayUseThreadStates()) - return; - if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) - return; - - ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; - memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc"); - ThreadStates[TId] = PreviousThreadState; -} - -void state::runAndCheckState(void(Func(void))) { - TeamStateTy OldTeamState = TeamState; - OldTeamState.assertEqual(TeamState); - - Func(); - - OldTeamState.assertEqual(TeamState); -} - -void state::assumeInitialState(bool IsSPMD) { - TeamStateTy InitialTeamState; - InitialTeamState.init(IsSPMD); - InitialTeamState.assertEqual(TeamState); - ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); -} - -int state::getEffectivePTeamSize() { - int PTeamSize = state::ParallelTeamSize; - return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); -} - -extern "C" { -void omp_set_dynamic(int V) {} - -int omp_get_dynamic(void) { return 0; } - -void omp_set_num_threads(int V) { icv::NThreads = V; } - -int omp_get_max_threads(void) { - int NT = icv::NThreads; - return NT > 0 ? NT : mapping::getMaxTeamThreads(); -} - -int omp_get_level(void) { - int LevelVar = icv::Level; - ASSERT(LevelVar >= 0, nullptr); - return LevelVar; -} - -int omp_get_active_level(void) { return !!icv::ActiveLevel; } - -int omp_in_parallel(void) { return !!icv::ActiveLevel; } - -void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { - *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched); - *ChunkSize = state::RunSchedChunk; -} - -void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { - icv::RunSched = (int)ScheduleKind; - state::RunSchedChunk = ChunkSize; -} - -int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); -} - -int omp_get_thread_num(void) { - return omp_get_ancestor_thread_num(omp_get_level()); -} - -int omp_get_team_size(int Level) { - return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); -} - -int omp_get_num_threads(void) { - return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); -} - -int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } - -int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } - -void omp_set_nested(int) {} - -int omp_get_nested(void) { return false; } - -void omp_set_max_active_levels(int Levels) { - icv::MaxActiveLevels = Levels > 0 ? 1 : 0; -} - -int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } - -omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } - -int omp_get_num_places(void) { return 0; } - -int omp_get_place_num_procs(int) { return omp_get_num_procs(); } - -void omp_get_place_proc_ids(int, int *) { - // TODO -} - -int omp_get_place_num(void) { return 0; } - -int omp_get_partition_num_places(void) { return 0; } - -void omp_get_partition_place_nums(int *) { - // TODO -} - -int omp_get_cancellation(void) { return 0; } - -void omp_set_default_device(int) {} - -int omp_get_default_device(void) { return -1; } - -int omp_get_num_devices(void) { return config::getNumDevices(); } - -int omp_get_device_num(void) { return config::getDeviceNum(); } - -int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } - -int omp_get_team_num() { return mapping::getBlockIdInKernel(); } - -int omp_get_initial_device(void) { return -1; } - -int omp_is_initial_device(void) { return 0; } -} - -extern "C" { -[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { - return memory::allocShared(Bytes, "Frontend alloc shared"); -} - -[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { - memory::freeShared(Ptr, Bytes, "Frontend free shared"); -} - -void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } - -void *llvm_omp_target_dynamic_shared_alloc() { - return __kmpc_get_dynamic_shared(); -} - -void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } - -/// Allocate storage in shared memory to communicate arguments from the main -/// thread to the workers in generic mode. If we exceed -/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. -constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; - -[[clang::loader_uninitialized]] static Local<void *> - SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -[[clang::loader_uninitialized]] static Local<void **> - SharedMemVariableSharingSpacePtr; - -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { - if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { - SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; - } else { - SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal( - nArgs * sizeof(void *), "new extended args"); - ASSERT(SharedMemVariableSharingSpacePtr != nullptr, - "Nullptr returned by malloc!"); - } - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -void __kmpc_end_sharing_variables() { - if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) - memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args"); -} - -void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} -} |
