diff options
Diffstat (limited to 'offload/DeviceRTL/src/Parallelism.cpp')
| -rw-r--r-- | offload/DeviceRTL/src/Parallelism.cpp | 311 |
1 files changed, 0 insertions, 311 deletions
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp deleted file mode 100644 index 08ce616aee1c..000000000000 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ /dev/null @@ -1,311 +0,0 @@ -//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { - uint32_t NThreadsICV = - NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getMaxTeamThreads(); - - if (NThreadsICV != 0 && NThreadsICV < NumThreads) - NumThreads = NThreadsICV; - - // SPMD mode allows any number of threads, for generic mode we round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (mapping::isSPMDMode()) - return NumThreads; - - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); - - return NumThreads; -} - -// Invoke an outlined parallel function unwrapping arguments (up to 32). -[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, int64_t nargs) { - switch (nargs) { -#include "generated_microtask_cases.gen" - default: - printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); - __builtin_trap(); - } -} - -} // namespace - -extern "C" { - -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t PTeamSize = - NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; - // Avoid the race between the read of the `icv::Level` above and the write - // below by synchronizing all threads here. - synchronize::threadsAligned(atomic::seq_cst); - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. - state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - - // Synchronize all threads after the main thread (TId == 0) set up the - // team state properly. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(PTeamSize, ident, - /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize before we run user code to avoid invalidating the - // assumptions above. - synchronize::threadsAligned(atomic::relaxed); - - if (!PTeamSize || TId < PTeamSize) - invokeMicrotask(TId, 0, fn, args, nargs); - - // Synchronize all threads at the end of a parallel region. - synchronize::threadsAligned(atomic::seq_cst); - } - - // Synchronize all threads to make sure every thread exits the scope above; - // otherwise the following assertions and the assumption in - // __kmpc_target_deinit may not hold. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize to create an aligned region around the assumptions. - synchronize::threadsAligned(atomic::relaxed); - - return; -} - -[[clang::always_inline]] void -__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - - // Assert the parallelism level is zero if disabled by the user. - ASSERT((config::mayUseNestedParallelism() || icv::Level == 0), - "nested parallelism while disabled"); - - // Handle the serialized case first, same for SPMD/non-SPMD: - // 1) if-clause(0) - // 2) parallel in task or other thread state inducing construct - // 3) nested parallel regions - if (OMP_UNLIKELY(!if_expr || state::HasThreadState || - (config::mayUseNestedParallelism() && icv::Level))) { - state::DateEnvironmentRAII DERAII(ident); - ++icv::Level; - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - // From this point forward we know that there is no thread state used. - ASSERT(state::HasThreadState == false, nullptr); - - if (mapping::isSPMDMode()) { - // This was moved to its own routine so it could be called directly - // in certain situations to avoid resource consumption of unused - // logic in parallel_51. - __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs); - - return; - } - - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); - uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; - - // We do *not* create a new data environment because all threads in the team - // that are active are now running this parallel region. They share the - // TeamState, which has an increase level-var and potentially active-level - // set, but they do not have individual ThreadStates yet. If they ever - // modify the ICVs beyond this point a ThreadStates will be allocated. - - bool IsActiveParallelRegion = NumThreads > 1; - if (!IsActiveParallelRegion) { - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - void **GlobalArgs = nullptr; - if (nargs) { - __kmpc_begin_sharing_variables(&GlobalArgs, nargs); - switch (nargs) { - default: - for (int I = 0; I < nargs; I++) - GlobalArgs[I] = args[I]; - break; - case 16: - GlobalArgs[15] = args[15]; - [[fallthrough]]; - case 15: - GlobalArgs[14] = args[14]; - [[fallthrough]]; - case 14: - GlobalArgs[13] = args[13]; - [[fallthrough]]; - case 13: - GlobalArgs[12] = args[12]; - [[fallthrough]]; - case 12: - GlobalArgs[11] = args[11]; - [[fallthrough]]; - case 11: - GlobalArgs[10] = args[10]; - [[fallthrough]]; - case 10: - GlobalArgs[9] = args[9]; - [[fallthrough]]; - case 9: - GlobalArgs[8] = args[8]; - [[fallthrough]]; - case 8: - GlobalArgs[7] = args[7]; - [[fallthrough]]; - case 7: - GlobalArgs[6] = args[6]; - [[fallthrough]]; - case 6: - GlobalArgs[5] = args[5]; - [[fallthrough]]; - case 5: - GlobalArgs[4] = args[4]; - [[fallthrough]]; - case 4: - GlobalArgs[3] = args[3]; - [[fallthrough]]; - case 3: - GlobalArgs[2] = args[2]; - [[fallthrough]]; - case 2: - GlobalArgs[1] = args[1]; - [[fallthrough]]; - case 1: - GlobalArgs[0] = args[0]; - [[fallthrough]]; - case 0: - break; - } - } - - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. - state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, - (void *)nullptr, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - - // Master signals work to activate workers. - synchronize::threads(atomic::seq_cst); - // Master waits for workers to signal. - synchronize::threads(atomic::seq_cst); - } - - if (nargs) - __kmpc_end_sharing_variables(); -} - -[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { - // Work function and arguments for L1 parallel region. - *WorkFn = state::ParallelRegionFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) - return false; - - // Set to true for workers participating in the parallel region. - uint32_t TId = mapping::getThreadIdInBlock(); - bool ThreadIsActive = TId < state::getEffectivePTeamSize(); - return ThreadIsActive; -} - -[[clang::noinline]] void __kmpc_kernel_end_parallel() { - // In case we have modified an ICV for this thread before a ThreadState was - // created. We drop it now to not contaminate the next parallel region. - ASSERT(!mapping::isSPMDMode(), nullptr); - uint32_t TId = mapping::getThreadIdInBlock(); - state::resetStateForThread(TId); - ASSERT(!mapping::isSPMDMode(), nullptr); -} - -uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } - -int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } - -void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, - int32_t thread_limit) {} - -void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} -} |
