summaryrefslogtreecommitdiff
path: root/offload/DeviceRTL/src/Workshare.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'offload/DeviceRTL/src/Workshare.cpp')
-rw-r--r--offload/DeviceRTL/src/Workshare.cpp970
1 files changed, 0 insertions, 970 deletions
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
deleted file mode 100644
index 59a2cc3f27ac..000000000000
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ /dev/null
@@ -1,970 +0,0 @@
-//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the KMPC interface
-// for the loop construct plus other worksharing constructs that use the same
-// interface as loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Workshare.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-// TODO:
-struct DynamicScheduleTracker {
- int64_t Chunk;
- int64_t LoopUpperBound;
- int64_t NextLowerBound;
- int64_t Stride;
- kmp_sched_t ScheduleType;
- DynamicScheduleTracker *NextDST;
-};
-
-#define ASSERT0(...)
-
-// used by the library for the interface with the app
-#define DISPATCH_FINISHED 0
-#define DISPATCH_NOTFINISHED 1
-
-// used by dynamic scheduling
-#define FINISHED 0
-#define NOT_FINISHED 1
-#define LAST_CHUNK 2
-
-// TODO: This variable is a hack inherited from the old runtime.
-[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
-
-template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
- ////////////////////////////////////////////////////////////////////////////////
- // Loop with static scheduling with chunk
-
- // Generic implementation of OMP loop scheduling with static policy
- /*! \brief Calculate initial bounds for static loop and stride
- * @param[in] loc location in code of the call (not used here)
- * @param[in] global_tid global thread id
- * @param[in] schetype type of scheduling (see omptarget-nvptx.h)
- * @param[in] plastiter pointer to last iteration
- * @param[in,out] pointer to loop lower bound. it will contain value of
- * lower bound of first chunk
- * @param[in,out] pointer to loop upper bound. It will contain value of
- * upper bound of first chunk
- * @param[in,out] pointer to loop stride. It will contain value of stride
- * between two successive chunks executed by the same thread
- * @param[in] loop increment bump
- * @param[in] chunk size
- */
-
- // helper function for static chunk
- static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
- T entityId, T numberOfEntities) {
- // each thread executes multiple chunks all of the same size, except
- // the last one
- // distance between two successive chunks
- stride = numberOfEntities * chunk;
- lb = lb + entityId * chunk;
- T inputUb = ub;
- ub = lb + chunk - 1; // Clang uses i <= ub
- // Say ub' is the beginning of the last chunk. Then who ever has a
- // lower bound plus a multiple of the increment equal to ub' is
- // the last one.
- T beginingLastChunk = inputUb - (inputUb % chunk);
- last = ((beginingLastChunk - lb) % stride) == 0;
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Loop with static scheduling without chunk
-
- // helper function for static no chunk
- static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
- T entityId, T numberOfEntities) {
- // No chunk size specified. Each thread or warp gets at most one
- // chunk; chunks are all almost of equal size
- T loopSize = ub - lb + 1;
-
- chunk = loopSize / numberOfEntities;
- T leftOver = loopSize - chunk * numberOfEntities;
-
- if (entityId < leftOver) {
- chunk++;
- lb = lb + entityId * chunk;
- } else {
- lb = lb + entityId * chunk + leftOver;
- }
-
- T inputUb = ub;
- ub = lb + chunk - 1; // Clang uses i <= ub
- last = lb <= inputUb && inputUb <= ub;
- stride = loopSize; // make sure we only do 1 chunk per warp
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for Static Init
-
- static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
- T *plower, T *pupper, ST *pstride, ST chunk,
- bool IsSPMDExecutionMode) {
- int32_t gtid = omp_get_thread_num();
- int numberOfActiveOMPThreads = omp_get_num_threads();
-
- // All warps that are in excess of the maximum requested, do
- // not execute the loop
- ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
- "current thread is not needed here; error");
-
- // copy
- int lastiter = 0;
- T lb = *plower;
- T ub = *pupper;
- ST stride = *pstride;
-
- // init
- switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
- case kmp_sched_static_chunk: {
- if (chunk > 0) {
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- [[fallthrough]];
- } // note: if chunk <=0, use nochunk
- case kmp_sched_static_balanced_chunk: {
- if (chunk > 0) {
- // round up to make sure the chunk is enough to cover all iterations
- T tripCount = ub - lb + 1; // +1 because ub is inclusive
- T span = (tripCount + numberOfActiveOMPThreads - 1) /
- numberOfActiveOMPThreads;
- // perform chunk adjustment
- chunk = (span + chunk - 1) & ~(chunk - 1);
-
- ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
- T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- if (ub > oldUb)
- ub = oldUb;
- break;
- }
- [[fallthrough]];
- } // note: if chunk <=0, use nochunk
- case kmp_sched_static_nochunk: {
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- case kmp_sched_distr_static_chunk: {
- if (chunk > 0) {
- ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
- omp_get_num_teams());
- break;
- }
- [[fallthrough]];
- } // note: if chunk <=0, use nochunk
- case kmp_sched_distr_static_nochunk: {
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
- omp_get_num_teams());
- break;
- }
- case kmp_sched_distr_static_chunk_sched_static_chunkone: {
- ForStaticChunk(lastiter, lb, ub, stride, chunk,
- numberOfActiveOMPThreads * omp_get_team_num() + gtid,
- omp_get_num_teams() * numberOfActiveOMPThreads);
- break;
- }
- default: {
- // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- }
- // copy back
- *plastiter = lastiter;
- *plower = lb;
- *pupper = ub;
- *pstride = stride;
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for dispatch Init
-
- static int OrderedSchedule(kmp_sched_t schedule) {
- return schedule >= kmp_sched_ordered_first &&
- schedule <= kmp_sched_ordered_last;
- }
-
- static void dispatch_init(IdentTy *loc, int32_t threadId,
- kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
- DynamicScheduleTracker *DST) {
- int tid = mapping::getThreadIdInBlock();
- T tnum = omp_get_num_threads();
- T tripCount = ub - lb + 1; // +1 because ub is inclusive
- ASSERT0(LT_FUSSY, threadId < tnum,
- "current thread is not needed here; error");
-
- /* Currently just ignore the monotonic and non-monotonic modifiers
- * (the compiler isn't producing them * yet anyway).
- * When it is we'll want to look at them somewhere here and use that
- * information to add to our schedule choice. We shouldn't need to pass
- * them on, they merely affect which schedule we can legally choose for
- * various dynamic cases. (In particular, whether or not a stealing scheme
- * is legal).
- */
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
-
- // Process schedule.
- if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
- if (OrderedSchedule(schedule))
- __kmpc_barrier(loc, threadId);
- schedule = kmp_sched_static_chunk;
- chunk = tripCount; // one thread gets the whole loop
- } else if (schedule == kmp_sched_runtime) {
- // process runtime
- omp_sched_t rtSched;
- int ChunkInt;
- omp_get_schedule(&rtSched, &ChunkInt);
- chunk = ChunkInt;
- switch (rtSched) {
- case omp_sched_static: {
- if (chunk > 0)
- schedule = kmp_sched_static_chunk;
- else
- schedule = kmp_sched_static_nochunk;
- break;
- }
- case omp_sched_auto: {
- schedule = kmp_sched_static_chunk;
- chunk = 1;
- break;
- }
- case omp_sched_dynamic:
- case omp_sched_guided: {
- schedule = kmp_sched_dynamic;
- break;
- }
- }
- } else if (schedule == kmp_sched_auto) {
- schedule = kmp_sched_static_chunk;
- chunk = 1;
- } else {
- // ASSERT(LT_FUSSY,
- // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
- // "unknown schedule %d & chunk %lld\n", (int)schedule,
- // (long long)chunk);
- }
-
- // init schedules
- if (schedule == kmp_sched_static_chunk) {
- ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
- // save sched state
- DST->ScheduleType = schedule;
- // save ub
- DST->LoopUpperBound = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- // save computed params
- DST->Chunk = chunk;
- DST->NextLowerBound = lb;
- DST->Stride = stride;
- } else if (schedule == kmp_sched_static_balanced_chunk) {
- ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
- // save sched state
- DST->ScheduleType = schedule;
- // save ub
- DST->LoopUpperBound = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- // round up to make sure the chunk is enough to cover all iterations
- T span = (tripCount + tnum - 1) / tnum;
- // perform chunk adjustment
- chunk = (span + chunk - 1) & ~(chunk - 1);
-
- T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
- if (ub > oldUb)
- ub = oldUb;
- // save computed params
- DST->Chunk = chunk;
- DST->NextLowerBound = lb;
- DST->Stride = stride;
- } else if (schedule == kmp_sched_static_nochunk) {
- ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
- // save sched state
- DST->ScheduleType = schedule;
- // save ub
- DST->LoopUpperBound = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- // save computed params
- DST->Chunk = chunk;
- DST->NextLowerBound = lb;
- DST->Stride = stride;
- } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- // save data
- DST->ScheduleType = schedule;
- if (chunk < 1)
- chunk = 1;
- DST->Chunk = chunk;
- DST->LoopUpperBound = ub;
- DST->NextLowerBound = lb;
- __kmpc_barrier(loc, threadId);
- if (tid == 0) {
- Cnt = 0;
- fence::team(atomic::seq_cst);
- }
- __kmpc_barrier(loc, threadId);
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for dispatch next
-
- static uint64_t NextIter() {
- __kmpc_impl_lanemask_t active = mapping::activemask();
- uint32_t leader = utils::ffs(active) - 1;
- uint32_t change = utils::popc(active);
- __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
- unsigned int rank = utils::popc(active & lane_mask_lt);
- uint64_t warp_res = 0;
- if (rank == 0) {
- warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
- }
- warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
- return warp_res + rank;
- }
-
- static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
- T loopUpperBound) {
- T N = NextIter();
- lb = loopLowerBound + N * chunkSize;
- ub = lb + chunkSize - 1; // Clang uses i <= ub
-
- // 3 result cases:
- // a. lb and ub < loopUpperBound --> NOT_FINISHED
- // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
- // NOT_FINISHED
- // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
- // a.
- if (lb <= loopUpperBound && ub < loopUpperBound) {
- return NOT_FINISHED;
- }
- // b.
- if (lb <= loopUpperBound) {
- ub = loopUpperBound;
- return LAST_CHUNK;
- }
- // c. if we are here, we are in case 'c'
- lb = loopUpperBound + 2;
- ub = loopUpperBound + 1;
- return FINISHED;
- }
-
- static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
- T *plower, T *pupper, ST *pstride,
- DynamicScheduleTracker *DST) {
- // ID of a thread in its own warp
-
- // automatically selects thread or warp ID based on selected implementation
- ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
- "current thread is not needed here; error");
- // retrieve schedule
- kmp_sched_t schedule = DST->ScheduleType;
-
- // xxx reduce to one
- if (schedule == kmp_sched_static_chunk ||
- schedule == kmp_sched_static_nochunk) {
- T myLb = DST->NextLowerBound;
- T ub = DST->LoopUpperBound;
- // finished?
- if (myLb > ub) {
- return DISPATCH_FINISHED;
- }
- // not finished, save current bounds
- ST chunk = DST->Chunk;
- *plower = myLb;
- T myUb = myLb + chunk - 1; // Clang uses i <= ub
- if (myUb > ub)
- myUb = ub;
- *pupper = myUb;
- *plast = (int32_t)(myUb == ub);
-
- // increment next lower bound by the stride
- ST stride = DST->Stride;
- DST->NextLowerBound = myLb + stride;
- return DISPATCH_NOTFINISHED;
- }
- ASSERT0(LT_FUSSY,
- schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
- "bad sched");
- T myLb, myUb;
- int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
- DST->LoopUpperBound);
-
- if (finished == FINISHED)
- return DISPATCH_FINISHED;
-
- // not finished (either not finished or last chunk)
- *plast = (int32_t)(finished == LAST_CHUNK);
- *plower = myLb;
- *pupper = myUb;
- *pstride = 1;
-
- return DISPATCH_NOTFINISHED;
- }
-
- static void dispatch_fini() {
- // nothing
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // end of template class that encapsulate all the helper functions
- ////////////////////////////////////////////////////////////////////////////////
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (dyn loops)
-////////////////////////////////////////////////////////////////////////////////
-
-// TODO: Expand the dispatch API to take a DST pointer which can then be
-// allocated properly without malloc.
-// For now, each team will contain an LDS pointer (ThreadDST) to a global array
-// of references to the DST structs allocated (in global memory) for each thread
-// in the team. The global memory array is allocated during the init phase if it
-// was not allocated already and will be deallocated when the dispatch phase
-// ends:
-//
-// __kmpc_dispatch_init
-//
-// ** Dispatch loop **
-//
-// __kmpc_dispatch_deinit
-//
-[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
- ThreadDST;
-
-// Create a new DST, link the current one, and define the new as current.
-static DynamicScheduleTracker *pushDST() {
- int32_t ThreadIndex = mapping::getThreadIdInBlock();
- // Each block will allocate an array of pointers to DST structs. The array is
- // equal in length to the number of threads in that block.
- if (!ThreadDST) {
- // Allocate global memory array of pointers to DST structs:
- if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
- ThreadDST = static_cast<DynamicScheduleTracker **>(
- memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
- sizeof(DynamicScheduleTracker *),
- "new ThreadDST array"));
- synchronize::threads(atomic::seq_cst);
-
- // Initialize the array pointers:
- ThreadDST[ThreadIndex] = nullptr;
- }
-
- // Create a DST struct for the current thread:
- DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
- memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
- *NewDST = DynamicScheduleTracker({0});
-
- // Add the new DST struct to the array of DST structs:
- NewDST->NextDST = ThreadDST[ThreadIndex];
- ThreadDST[ThreadIndex] = NewDST;
- return NewDST;
-}
-
-// Return the current DST.
-static DynamicScheduleTracker *peekDST() {
- return ThreadDST[mapping::getThreadIdInBlock()];
-}
-
-// Pop the current DST and restore the last one.
-static void popDST() {
- int32_t ThreadIndex = mapping::getThreadIdInBlock();
- DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
- DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
- memory::freeGlobal(CurrentDST, "remove DST");
- ThreadDST[ThreadIndex] = OldDST;
-
- // Check if we need to deallocate the global array. Ensure all threads
- // in the block have finished deallocating the individual DSTs.
- synchronize::threads(atomic::seq_cst);
- if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
- memory::freeGlobal(ThreadDST, "remove ThreadDST array");
- ThreadDST = nullptr;
- }
- synchronize::threads(atomic::seq_cst);
-}
-
-void workshare::init(bool IsSPMD) {
- if (mapping::isInitialThreadInLevel0(IsSPMD))
- ThreadDST = nullptr;
-}
-
-extern "C" {
-
-// init
-void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
- int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
- uint32_t lb, uint32_t ub, int32_t st,
- int32_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
- int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
- uint64_t lb, uint64_t ub, int64_t st,
- int64_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-// next
-int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
- int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
- uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
- int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
- uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-// fini
-void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
-}
-
-// deinit
-void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (static loops)
-////////////////////////////////////////////////////////////////////////////////
-
-void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr, int32_t chunk) {
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr, int32_t chunk) {
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr, int64_t chunk) {
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr, int64_t chunk) {
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
-
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
-}
-
-namespace ompx {
-
-/// Helper class to hide the generic loop nest and provide the template argument
-/// throughout.
-template <typename Ty> class StaticLoopChunker {
-
- /// Generic loop nest that handles block and/or thread distribution in the
- /// absence of user specified chunk sizes. This implicitly picks a block chunk
- /// size equal to the number of threads in the block and a thread chunk size
- /// equal to one. In contrast to the chunked version we can get away with a
- /// single loop in this case
- static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumBlocks, Ty BId, Ty NumThreads,
- Ty TId, Ty NumIters,
- uint8_t OneIterationPerThread) {
- Ty KernelIteration = NumBlocks * NumThreads;
-
- // Start index in the normalized space.
- Ty IV = BId * NumThreads + TId;
- ASSERT(IV >= 0, "Bad index");
-
- // Cover the entire iteration space, assumptions in the caller might allow
- // to simplify this loop to a conditional.
- if (IV < NumIters) {
- do {
-
- // Execute the loop body.
- LoopBody(IV, Arg);
-
- // Every thread executed one block and thread chunk now.
- IV += KernelIteration;
-
- if (OneIterationPerThread)
- return;
-
- } while (IV < NumIters);
- }
- }
-
- /// Generic loop nest that handles block and/or thread distribution in the
- /// presence of user specified chunk sizes (for at least one of them).
- static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
- Ty BlockChunk, Ty NumBlocks, Ty BId,
- Ty ThreadChunk, Ty NumThreads, Ty TId,
- Ty NumIters,
- uint8_t OneIterationPerThread) {
- Ty KernelIteration = NumBlocks * BlockChunk;
-
- // Start index in the chunked space.
- Ty IV = BId * BlockChunk + TId;
- ASSERT(IV >= 0, "Bad index");
-
- // Cover the entire iteration space, assumptions in the caller might allow
- // to simplify this loop to a conditional.
- do {
-
- Ty BlockChunkLeft =
- BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
- Ty ThreadChunkLeft =
- ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-
- while (ThreadChunkLeft--) {
-
- // Given the blocking it's hard to keep track of what to execute.
- if (IV >= NumIters)
- return;
-
- // Execute the loop body.
- LoopBody(IV, Arg);
-
- if (OneIterationPerThread)
- return;
-
- ++IV;
- }
-
- IV += KernelIteration;
-
- } while (IV < NumIters);
- }
-
-public:
- /// Worksharing `for`-loop.
- /// \param[in] Loc Description of source location
- /// \param[in] LoopBody Function which corresponds to loop body
- /// \param[in] Arg Pointer to struct which contains loop body args
- /// \param[in] NumIters Number of loop iterations
- /// \param[in] NumThreads Number of GPU threads
- /// \param[in] ThreadChunk Size of thread chunk
- /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
- /// only one loop iteration or one thread chunk. This avoids an outer loop
- /// over all loop iterations/chunks.
- static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty NumThreads, Ty ThreadChunk,
- uint8_t OneIterationPerThread) {
- ASSERT(NumIters >= 0, "Bad iteration count");
- ASSERT(ThreadChunk >= 0, "Bad thread count");
-
- // All threads need to participate but we don't know if we are in a
- // parallel at all or if the user might have used a `num_threads` clause
- // on the parallel and reduced the number compared to the block size.
- // Since nested parallels are possible too we need to get the thread id
- // from the `omp` getter and not the mapping directly.
- Ty TId = omp_get_thread_num();
-
- // There are no blocks involved here.
- Ty BlockChunk = 0;
- Ty NumBlocks = 1;
- Ty BId = 0;
-
- // If the thread chunk is not specified we pick a default now.
- if (ThreadChunk == 0)
- ThreadChunk = 1;
-
- // If we know we have more threads than iterations we can indicate that to
- // avoid an outer loop.
- if (config::getAssumeThreadsOversubscription()) {
- OneIterationPerThread = true;
- }
-
- if (OneIterationPerThread)
- ASSERT(NumThreads >= NumIters, "Broken assumption");
-
- if (ThreadChunk != 1)
- NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
- ThreadChunk, NumThreads, TId, NumIters,
- OneIterationPerThread);
- else
- NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
- NumIters, OneIterationPerThread);
- }
-
- /// Worksharing `distribute`-loop.
- /// \param[in] Loc Description of source location
- /// \param[in] LoopBody Function which corresponds to loop body
- /// \param[in] Arg Pointer to struct which contains loop body args
- /// \param[in] NumIters Number of loop iterations
- /// \param[in] BlockChunk Size of block chunk
- /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
- /// only one loop iteration or one thread chunk. This avoids an outer loop
- /// over all loop iterations/chunks.
- static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty BlockChunk,
- uint8_t OneIterationPerThread) {
- ASSERT(icv::Level == 0, "Bad distribute");
- ASSERT(icv::ActiveLevel == 0, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
- ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-
- ASSERT(NumIters >= 0, "Bad iteration count");
- ASSERT(BlockChunk >= 0, "Bad block count");
-
- // There are no threads involved here.
- Ty ThreadChunk = 0;
- Ty NumThreads = 1;
- Ty TId = 0;
-
- // All teams need to participate.
- Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
- Ty BId = mapping::getBlockIdInKernel();
-
- // If the block chunk is not specified we pick a default now.
- if (BlockChunk == 0)
- BlockChunk = NumThreads;
-
- // If we know we have more blocks than iterations we can indicate that to
- // avoid an outer loop.
- if (config::getAssumeTeamsOversubscription()) {
- OneIterationPerThread = true;
- }
-
- if (OneIterationPerThread)
- ASSERT(NumBlocks >= NumIters, "Broken assumption");
-
- if (BlockChunk != NumThreads)
- NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
- ThreadChunk, NumThreads, TId, NumIters,
- OneIterationPerThread);
- else
- NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
- NumIters, OneIterationPerThread);
-
- ASSERT(icv::Level == 0, "Bad distribute");
- ASSERT(icv::ActiveLevel == 0, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
- ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
- }
-
- /// Worksharing `distribute parallel for`-loop.
- /// \param[in] Loc Description of source location
- /// \param[in] LoopBody Function which corresponds to loop body
- /// \param[in] Arg Pointer to struct which contains loop body args
- /// \param[in] NumIters Number of loop iterations
- /// \param[in] NumThreads Number of GPU threads
- /// \param[in] BlockChunk Size of block chunk
- /// \param[in] ThreadChunk Size of thread chunk
- /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
- /// only one loop iteration or one thread chunk. This avoids an outer loop
- /// over all loop iterations/chunks.
- static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
- void *Arg, Ty NumIters, Ty NumThreads,
- Ty BlockChunk, Ty ThreadChunk,
- uint8_t OneIterationPerThread) {
- ASSERT(icv::Level == 1, "Bad distribute");
- ASSERT(icv::ActiveLevel == 1, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-
- ASSERT(NumIters >= 0, "Bad iteration count");
- ASSERT(BlockChunk >= 0, "Bad block count");
- ASSERT(ThreadChunk >= 0, "Bad thread count");
-
- // All threads need to participate but the user might have used a
- // `num_threads` clause on the parallel and reduced the number compared to
- // the block size.
- Ty TId = mapping::getThreadIdInBlock();
-
- // All teams need to participate.
- Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
- Ty BId = mapping::getBlockIdInKernel();
-
- // If the block chunk is not specified we pick a default now.
- if (BlockChunk == 0)
- BlockChunk = NumThreads;
-
- // If the thread chunk is not specified we pick a default now.
- if (ThreadChunk == 0)
- ThreadChunk = 1;
-
- // If we know we have more threads (across all blocks) than iterations we
- // can indicate that to avoid an outer loop.
- if (config::getAssumeTeamsOversubscription() &
- config::getAssumeThreadsOversubscription()) {
- OneIterationPerThread = true;
- }
-
- if (OneIterationPerThread)
- ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
-
- if (BlockChunk != NumThreads || ThreadChunk != 1)
- NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
- ThreadChunk, NumThreads, TId, NumIters,
- OneIterationPerThread);
- else
- NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
- NumIters, OneIterationPerThread);
-
- ASSERT(icv::Level == 1, "Bad distribute");
- ASSERT(icv::ActiveLevel == 1, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
- }
-};
-
-} // namespace ompx
-
-#define OMP_LOOP_ENTRY(BW, TY) \
- [[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_for_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY block_chunk, TY thread_chunk, \
- uint8_t one_iteration_per_thread) { \
- ompx::StaticLoopChunker<TY>::DistributeFor( \
- loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
- one_iteration_per_thread); \
- } \
- [[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
- void *arg, TY num_iters, TY block_chunk, \
- uint8_t one_iteration_per_thread) { \
- ompx::StaticLoopChunker<TY>::Distribute( \
- loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
- } \
- [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \
- ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
- thread_chunk, one_iteration_per_thread); \
- }
-
-extern "C" {
-OMP_LOOP_ENTRY(_4, int32_t)
-OMP_LOOP_ENTRY(_4u, uint32_t)
-OMP_LOOP_ENTRY(_8, int64_t)
-OMP_LOOP_ENTRY(_8u, uint64_t)
-}