diff options
Diffstat (limited to 'offload/DeviceRTL/src/Kernel.cpp')
| -rw-r--r-- | offload/DeviceRTL/src/Kernel.cpp | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp new file mode 100644 index 000000000000..95d4c728016d --- /dev/null +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -0,0 +1,157 @@ +//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the kernel entry points for the device. +// +//===----------------------------------------------------------------------===// + +#include "Shared/Environment.h" + +#include "Allocator.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" + +#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" + +using namespace ompx; + +#pragma omp begin declare target device_type(nohost) + +static void +inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { + // Order is important here. + synchronize::init(IsSPMD); + mapping::init(IsSPMD); + state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); + allocator::init(IsSPMD, KernelEnvironment); +} + +/// Simple generic state machine for worker threads. +static void genericStateMachine(IdentTy *Ident) { + uint32_t TId = mapping::getThreadIdInBlock(); + + do { + ParallelRegionFnTy WorkFn = nullptr; + + // Wait for the signal that we have a new work function. + synchronize::threads(atomic::seq_cst); + + // Retrieve the work function from the runtime. + bool IsActive = __kmpc_kernel_parallel(&WorkFn); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + ASSERT(!mapping::isSPMDMode(), nullptr); + ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); + __kmpc_kernel_end_parallel(); + } + + synchronize::threads(atomic::seq_cst); + + } while (true); +} + +extern "C" { + +/// Initialization +/// +/// \param Ident Source location identification, can be NULL. +/// +int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { + ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; + bool IsSPMD = Configuration.ExecMode & + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; + bool UseGenericStateMachine = Configuration.UseGenericStateMachine; + if (IsSPMD) { + inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, + KernelLaunchEnvironment); + synchronize::threadsAligned(atomic::relaxed); + } else { + inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, + KernelLaunchEnvironment); + // No need to wait since only the main threads will execute user + // code and workers will run into a barrier right away. + } + + if (IsSPMD) { + state::assumeInitialState(IsSPMD); + + // Synchronize to ensure the assertions above are in an aligned region. + // The barrier is eliminated later. + synchronize::threadsAligned(atomic::relaxed); + return -1; + } + + if (mapping::isInitialThreadInLevel0(IsSPMD)) + return -1; + + // Enter the generic state machine if enabled and if this thread can possibly + // be an active worker thread. + // + // The latter check is important for NVIDIA Pascal (but not Volta) and AMD + // GPU. In those cases, a single thread can apparently satisfy a barrier on + // behalf of all threads in the same warp. Thus, it would not be safe for + // other threads in the main thread's warp to reach the first + // synchronize::threads call in genericStateMachine before the main thread + // reaches its corresponding synchronize::threads call: that would permit all + // active worker threads to proceed before the main thread has actually set + // state::ParallelRegionFn, and then they would immediately quit without + // doing any work. mapping::getMaxTeamThreads() does not include any of the + // main thread's warp, so none of its threads can ever be active worker + // threads. + if (UseGenericStateMachine && + mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) + genericStateMachine(KernelEnvironment.Ident); + + return mapping::getThreadIdInBlock(); +} + +/// De-Initialization +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param Ident Source location identification, can be NULL. +/// +void __kmpc_target_deinit() { + bool IsSPMD = mapping::isSPMDMode(); + if (IsSPMD) + return; + + if (mapping::isInitialThreadInLevel0(IsSPMD)) { + // Signal the workers to exit the state machine and exit the kernel. + state::ParallelRegionFn = nullptr; + } else if (!state::getKernelEnvironment() + .Configuration.UseGenericStateMachine) { + // Retrieve the work function just to ensure we always call + // __kmpc_kernel_parallel even if a custom state machine is used. + // TODO: this is not super pretty. The problem is we create the call to + // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it + // is not there yet. Thus, we assume we never reach it from + // __kmpc_target_deinit. That allows us to remove the store in there to + // ParallelRegionFn, which leads to bad results later on. + ParallelRegionFnTy WorkFn = nullptr; + __kmpc_kernel_parallel(&WorkFn); + ASSERT(WorkFn == nullptr, nullptr); + } +} + +int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } +} + +#pragma omp end declare target |
