summaryrefslogtreecommitdiff
path: root/offload/DeviceRTL/src/Parallelism.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'offload/DeviceRTL/src/Parallelism.cpp')
-rw-r--r--offload/DeviceRTL/src/Parallelism.cpp311
1 files changed, 0 insertions, 311 deletions
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
deleted file mode 100644
index 08ce616aee1c..000000000000
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Parallel implementation in the GPU. Here is the pattern:
-//
-// while (not finished) {
-//
-// if (master) {
-// sequential code, decide which par loop to do, or if finished
-// __kmpc_kernel_prepare_parallel() // exec by master only
-// }
-// syncthreads // A
-// __kmpc_kernel_parallel() // exec by all
-// if (this thread is included in the parallel) {
-// switch () for all parallel loops
-// __kmpc_kernel_end_parallel() // exec only by threads in parallel
-// }
-//
-//
-// The reason we don't exec end_parallel for the threads not included
-// in the parallel loop is that for each barrier in the parallel
-// region, these non-included threads will cycle through the
-// syncthread A. Thus they must preserve their current threadId that
-// is larger than thread in team.
-//
-// To make a long story short...
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
- uint32_t NThreadsICV =
- NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
- uint32_t NumThreads = mapping::getMaxTeamThreads();
-
- if (NThreadsICV != 0 && NThreadsICV < NumThreads)
- NumThreads = NThreadsICV;
-
- // SPMD mode allows any number of threads, for generic mode we round down to a
- // multiple of WARPSIZE since it is legal to do so in OpenMP.
- if (mapping::isSPMDMode())
- return NumThreads;
-
- if (NumThreads < mapping::getWarpSize())
- NumThreads = 1;
- else
- NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
-
- return NumThreads;
-}
-
-// Invoke an outlined parallel function unwrapping arguments (up to 32).
-[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
- int32_t bound_tid, void *fn,
- void **args, int64_t nargs) {
- switch (nargs) {
-#include "generated_microtask_cases.gen"
- default:
- printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
- __builtin_trap();
- }
-}
-
-} // namespace
-
-extern "C" {
-
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
- int32_t num_threads,
- void *fn, void **args,
- const int64_t nargs) {
- uint32_t TId = mapping::getThreadIdInBlock();
- uint32_t NumThreads = determineNumberOfThreads(num_threads);
- uint32_t PTeamSize =
- NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
- // Avoid the race between the read of the `icv::Level` above and the write
- // below by synchronizing all threads here.
- synchronize::threadsAligned(atomic::seq_cst);
- {
- // Note that the order here is important. `icv::Level` has to be updated
- // last or the other updates will cause a thread specific state to be
- // created.
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
- 1u, TId == 0, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
- /*ForceTeamState=*/true);
-
- // Synchronize all threads after the main thread (TId == 0) set up the
- // team state properly.
- synchronize::threadsAligned(atomic::acq_rel);
-
- state::ParallelTeamSize.assert_eq(PTeamSize, ident,
- /*ForceTeamState=*/true);
- icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
- icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);
-
- // Ensure we synchronize before we run user code to avoid invalidating the
- // assumptions above.
- synchronize::threadsAligned(atomic::relaxed);
-
- if (!PTeamSize || TId < PTeamSize)
- invokeMicrotask(TId, 0, fn, args, nargs);
-
- // Synchronize all threads at the end of a parallel region.
- synchronize::threadsAligned(atomic::seq_cst);
- }
-
- // Synchronize all threads to make sure every thread exits the scope above;
- // otherwise the following assertions and the assumption in
- // __kmpc_target_deinit may not hold.
- synchronize::threadsAligned(atomic::acq_rel);
-
- state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
- icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
- icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);
-
- // Ensure we synchronize to create an aligned region around the assumptions.
- synchronize::threadsAligned(atomic::relaxed);
-
- return;
-}
-
-[[clang::always_inline]] void
-__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
- int32_t num_threads, int proc_bind, void *fn,
- void *wrapper_fn, void **args, int64_t nargs) {
- uint32_t TId = mapping::getThreadIdInBlock();
-
- // Assert the parallelism level is zero if disabled by the user.
- ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
- "nested parallelism while disabled");
-
- // Handle the serialized case first, same for SPMD/non-SPMD:
- // 1) if-clause(0)
- // 2) parallel in task or other thread state inducing construct
- // 3) nested parallel regions
- if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
- (config::mayUseNestedParallelism() && icv::Level))) {
- state::DateEnvironmentRAII DERAII(ident);
- ++icv::Level;
- invokeMicrotask(TId, 0, fn, args, nargs);
- return;
- }
-
- // From this point forward we know that there is no thread state used.
- ASSERT(state::HasThreadState == false, nullptr);
-
- if (mapping::isSPMDMode()) {
- // This was moved to its own routine so it could be called directly
- // in certain situations to avoid resource consumption of unused
- // logic in parallel_51.
- __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
-
- return;
- }
-
- uint32_t NumThreads = determineNumberOfThreads(num_threads);
- uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
- uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-
- // We do *not* create a new data environment because all threads in the team
- // that are active are now running this parallel region. They share the
- // TeamState, which has an increase level-var and potentially active-level
- // set, but they do not have individual ThreadStates yet. If they ever
- // modify the ICVs beyond this point a ThreadStates will be allocated.
-
- bool IsActiveParallelRegion = NumThreads > 1;
- if (!IsActiveParallelRegion) {
- state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
- invokeMicrotask(TId, 0, fn, args, nargs);
- return;
- }
-
- void **GlobalArgs = nullptr;
- if (nargs) {
- __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
- switch (nargs) {
- default:
- for (int I = 0; I < nargs; I++)
- GlobalArgs[I] = args[I];
- break;
- case 16:
- GlobalArgs[15] = args[15];
- [[fallthrough]];
- case 15:
- GlobalArgs[14] = args[14];
- [[fallthrough]];
- case 14:
- GlobalArgs[13] = args[13];
- [[fallthrough]];
- case 13:
- GlobalArgs[12] = args[12];
- [[fallthrough]];
- case 12:
- GlobalArgs[11] = args[11];
- [[fallthrough]];
- case 11:
- GlobalArgs[10] = args[10];
- [[fallthrough]];
- case 10:
- GlobalArgs[9] = args[9];
- [[fallthrough]];
- case 9:
- GlobalArgs[8] = args[8];
- [[fallthrough]];
- case 8:
- GlobalArgs[7] = args[7];
- [[fallthrough]];
- case 7:
- GlobalArgs[6] = args[6];
- [[fallthrough]];
- case 6:
- GlobalArgs[5] = args[5];
- [[fallthrough]];
- case 5:
- GlobalArgs[4] = args[4];
- [[fallthrough]];
- case 4:
- GlobalArgs[3] = args[3];
- [[fallthrough]];
- case 3:
- GlobalArgs[2] = args[2];
- [[fallthrough]];
- case 2:
- GlobalArgs[1] = args[1];
- [[fallthrough]];
- case 1:
- GlobalArgs[0] = args[0];
- [[fallthrough]];
- case 0:
- break;
- }
- }
-
- {
- // Note that the order here is important. `icv::Level` has to be updated
- // last or the other updates will cause a thread specific state to be
- // created.
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
- 1u, true, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
- (void *)nullptr, true, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
- /*ForceTeamState=*/true);
-
- // Master signals work to activate workers.
- synchronize::threads(atomic::seq_cst);
- // Master waits for workers to signal.
- synchronize::threads(atomic::seq_cst);
- }
-
- if (nargs)
- __kmpc_end_sharing_variables();
-}
-
-[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
- // Work function and arguments for L1 parallel region.
- *WorkFn = state::ParallelRegionFn;
-
- // If this is the termination signal from the master, quit early.
- if (!*WorkFn)
- return false;
-
- // Set to true for workers participating in the parallel region.
- uint32_t TId = mapping::getThreadIdInBlock();
- bool ThreadIsActive = TId < state::getEffectivePTeamSize();
- return ThreadIsActive;
-}
-
-[[clang::noinline]] void __kmpc_kernel_end_parallel() {
- // In case we have modified an ICV for this thread before a ThreadState was
- // created. We drop it now to not contaminate the next parallel region.
- ASSERT(!mapping::isSPMDMode(), nullptr);
- uint32_t TId = mapping::getThreadIdInBlock();
- state::resetStateForThread(TId);
- ASSERT(!mapping::isSPMDMode(), nullptr);
-}
-
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
-
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
-
-void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
- int32_t thread_limit) {}
-
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
-}