16 files changed, 0 insertions, 3410 deletions
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
deleted file mode 100644
index aac2a6005158..000000000000
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Mapping.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
-    __omp_rtl_device_memory_pool_tracker;
-
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
-struct BumpAllocatorTy final {
-
-  void *alloc(uint64_t Size) {
-    Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
-
-    if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
-                  atomic::seq_cst);
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
-                  atomic::seq_cst);
-      atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
-                  atomic::seq_cst);
-      atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
-                  atomic::seq_cst);
-    }
-
-    uint64_t *Data =
-        reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
-    uint64_t End =
-        reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
-    uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
-    if (OldData + Size > End)
-      __builtin_trap();
-
-    return reinterpret_cast<void *>(OldData);
-  }
-
-  void free(void *) {}
-};
-
-BumpAllocatorTy BumpAllocator;
-
-/// allocator namespace implementation
-///
-///{
-
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
-  // TODO: Check KernelEnvironment for an allocator choice as soon as we have
-  // more than one.
-}
-
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
-
-///}
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
deleted file mode 100644
index 0c31c66ab2de..000000000000
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the data object of the constant device environment and the
-// query API.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "State.h"
-
-using namespace ompx;
-
-// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
-[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription =
-    0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0;
-
-// This variable should be visible to the plugin so we override the default
-// hidden visibility.
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
-
-uint32_t config::getAssumeTeamsOversubscription() {
-  return __omp_rtl_assume_teams_oversubscription;
-}
-
-uint32_t config::getAssumeThreadsOversubscription() {
-  return __omp_rtl_assume_threads_oversubscription;
-}
-
-uint32_t config::getDebugKind() {
-  return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind;
-}
-
-uint32_t config::getNumDevices() {
-  return __omp_rtl_device_environment.NumDevices;
-}
-
-uint32_t config::getDeviceNum() {
-  return __omp_rtl_device_environment.DeviceNum;
-}
-
-uint64_t config::getDynamicMemorySize() {
-  return __omp_rtl_device_environment.DynamicMemSize;
-}
-
-uint64_t config::getClockFrequency() {
-  return __omp_rtl_device_environment.ClockFrequency;
-}
-
-void *config::getIndirectCallTablePtr() {
-  return reinterpret_cast<void *>(
-      __omp_rtl_device_environment.IndirectCallTable);
-}
-
-uint64_t config::getHardwareParallelism() {
-  return __omp_rtl_device_environment.HardwareParallelism;
-}
-
-uint64_t config::getIndirectCallTableSize() {
-  return __omp_rtl_device_environment.IndirectCallTableSize;
-}
-
-bool config::isDebugMode(DeviceDebugKind Kind) {
-  return config::getDebugKind() & uint32_t(Kind);
-}
-
-bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
-
-bool config::mayUseNestedParallelism() {
-  if (__omp_rtl_assume_no_nested_parallelism)
-    return false;
-  return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
-}
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
deleted file mode 100644
index 5b5482d766b1..000000000000
--- a/offload/DeviceRTL/src/Debug.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains debug utilities
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Configuration.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-
-using namespace ompx;
-
-extern "C" {
-void __assert_assume(bool condition) { __builtin_assume(condition); }
-
-#ifndef OMPTARGET_HAS_LIBC
-[[gnu::weak]] void __assert_fail(const char *expr, const char *file,
-                                 unsigned line, const char *function) {
-  __assert_fail_internal(expr, nullptr, file, line, function);
-}
-#endif
-
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
-                            unsigned line, const char *function) {
-  if (msg) {
-    printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
-           msg, expr);
-  } else {
-    printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
-  }
-  __builtin_trap();
-}
-}
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
deleted file mode 100644
index d6f8c499c890..000000000000
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceUtils.h"
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "gpuintrin.h"
-
-using namespace ompx;
-
-uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
-  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
-  static_assert(sizeof(unsigned long) == 8, "");
-  LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
-  HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
-                       int32_t Width) {
-  return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
-}
-
-int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
-                           int32_t Width) {
-  int32_t Self = mapping::getThreadIdInWarp();
-  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
-  return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
-}
-
-int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
-                           int32_t Width) {
-  int32_t Self = mapping::getThreadIdInWarp();
-  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
-  return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
-}
-
-uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
-  return __gpu_ballot(Mask, Pred);
-}
-
-bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
-
-extern "C" {
-int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
-  return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
-}
-
-int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
-  return utils::shuffleDown(lanes::All, Val, Delta, Width);
-}
-}
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
deleted file mode 100644
index 467e44a65276..000000000000
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the kernel entry points for the device.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Workshare.h"
-
-using namespace ompx;
-
-// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and
-// must be kept in-sync.
-enum OMPTgtExecModeFlags : unsigned char {
-  OMP_TGT_EXEC_MODE_BARE = 0,
-  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
-  OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
-  OMP_TGT_EXEC_MODE_GENERIC_SPMD =
-      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
-};
-
-static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-                    KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
-  allocator::init(IsSPMD, KernelEnvironment);
-  workshare::init(IsSPMD);
-}
-
-/// Simple generic state machine for worker threads.
-static void genericStateMachine(IdentTy *Ident) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-
-  do {
-    ParallelRegionFnTy WorkFn = nullptr;
-
-    // Wait for the signal that we have a new work function.
-    synchronize::threads(atomic::seq_cst);
-
-    // Retrieve the work function from the runtime.
-    bool IsActive = __kmpc_kernel_parallel(&WorkFn);
-
-    // If there is nothing more to do, break out of the state machine by
-    // returning to the caller.
-    if (!WorkFn)
-      return;
-
-    if (IsActive) {
-      ASSERT(!mapping::isSPMDMode(), nullptr);
-      ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
-      __kmpc_kernel_end_parallel();
-    }
-
-    synchronize::threads(atomic::seq_cst);
-
-  } while (true);
-}
-
-extern "C" {
-
-/// Initialization
-///
-/// \param Ident               Source location identification, can be NULL.
-///
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
-                           KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
-  bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
-  bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
-  if (IsSPMD) {
-    inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
-                        KernelLaunchEnvironment);
-    synchronize::threadsAligned(atomic::relaxed);
-  } else {
-    inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
-                        KernelLaunchEnvironment);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
-  }
-
-  if (IsSPMD) {
-    state::assumeInitialState(IsSPMD);
-
-    // Synchronize to ensure the assertions above are in an aligned region.
-    // The barrier is eliminated later.
-    synchronize::threadsAligned(atomic::relaxed);
-    return -1;
-  }
-
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    return -1;
-
-  // Enter the generic state machine if enabled and if this thread can possibly
-  // be an active worker thread.
-  //
-  // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
-  // GPU.  In those cases, a single thread can apparently satisfy a barrier on
-  // behalf of all threads in the same warp.  Thus, it would not be safe for
-  // other threads in the main thread's warp to reach the first
-  // synchronize::threads call in genericStateMachine before the main thread
-  // reaches its corresponding synchronize::threads call: that would permit all
-  // active worker threads to proceed before the main thread has actually set
-  // state::ParallelRegionFn, and then they would immediately quit without
-  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
-  // main thread's warp, so none of its threads can ever be active worker
-  // threads.
-  if (UseGenericStateMachine &&
-      mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
-    genericStateMachine(KernelEnvironment.Ident);
-
-  return mapping::getThreadIdInBlock();
-}
-
-/// De-Initialization
-///
-/// In non-SPMD, this function releases the workers trapped in a state machine
-/// and also any memory dynamically allocated by the runtime.
-///
-/// \param Ident Source location identification, can be NULL.
-///
-void __kmpc_target_deinit() {
-  bool IsSPMD = mapping::isSPMDMode();
-  if (IsSPMD)
-    return;
-
-  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
-    // Signal the workers to exit the state machine and exit the kernel.
-    state::ParallelRegionFn = nullptr;
-  } else if (!state::getKernelEnvironment()
-                  .Configuration.UseGenericStateMachine) {
-    // Retrieve the work function just to ensure we always call
-    // __kmpc_kernel_parallel even if a custom state machine is used.
-    // TODO: this is not super pretty. The problem is we create the call to
-    // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it
-    // is not there yet. Thus, we assume we never reach it from
-    // __kmpc_target_deinit. That allows us to remove the store in there to
-    // ParallelRegionFn, which leads to bad results later on.
-    ParallelRegionFnTy WorkFn = nullptr;
-    __kmpc_kernel_parallel(&WorkFn);
-    ASSERT(WorkFn == nullptr, nullptr);
-  }
-}
-
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
-}
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
deleted file mode 100644
index 83f9233d9480..000000000000
--- a/offload/DeviceRTL/src/LibC.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LibC.h"
-
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
-#else
-extern "C" int vprintf(const char *format, __builtin_va_list);
-#endif
-
-extern "C" {
-[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
-  auto *L = reinterpret_cast<const unsigned char *>(lhs);
-  auto *R = reinterpret_cast<const unsigned char *>(rhs);
-
-  for (size_t I = 0; I < count; ++I)
-    if (L[I] != R[I])
-      return (int)L[I] - (int)R[I];
-
-  return 0;
-}
-
-[[gnu::weak]] void memset(void *dst, int C, size_t count) {
-  auto *dstc = reinterpret_cast<char *>(dst);
-  for (size_t I = 0; I < count; ++I)
-    dstc[I] = C;
-}
-
-[[gnu::weak]] int printf(const char *Format, ...) {
-  __builtin_va_list vlist;
-  __builtin_va_start(vlist, Format);
-  return ::vprintf(Format, vlist);
-}
-}
-
-namespace ompx {
-[[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
-  __builtin_va_list vlist;
-  __builtin_va_start(vlist, Format);
-  return ::vprintf(Format, vlist);
-}
-} // namespace ompx
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
deleted file mode 100644
index b145892d1ece..000000000000
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Mapping.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "State.h"
-#include "gpuintrin.h"
-
-using namespace ompx;
-
-// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI
-// is set to 'none'. We only support COV5+ but this can be removed when COV4 is
-// fully deprecated.
-#ifdef __AMDGPU__
-extern const inline uint32_t __oclc_ABI_version = 500;
-[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__;
-#endif
-
-static bool isInLastWarp() {
-  uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
-                     ~(mapping::getWarpSize() - 1);
-  return mapping::getThreadIdInBlock() == MainTId;
-}
-
-bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
-  if (IsSPMD || icv::Level)
-    return false;
-
-  // Check if this is the last warp in the block.
-  return isInLastWarp();
-}
-
-bool mapping::isMainThreadInGenericMode() {
-  return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
-}
-
-bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
-  if (IsSPMD)
-    return mapping::getThreadIdInBlock() == 0;
-  return isInLastWarp();
-}
-
-bool mapping::isLeaderInWarp() {
-  __kmpc_impl_lanemask_t Active = mapping::activemask();
-  __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
-}
-
-LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-
-LaneMaskTy mapping::lanemaskLT() {
-#ifdef __NVPTX__
-  return __nvvm_read_ptx_sreg_lanemask_lt();
-#else
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
-  return Mask & Ballot;
-#endif
-}
-
-LaneMaskTy mapping::lanemaskGT() {
-#ifdef __NVPTX__
-  return __nvvm_read_ptx_sreg_lanemask_gt();
-#else
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  if (Lane == (mapping::getWarpSize() - 1))
-    return 0;
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
-  return Mask & Ballot;
-#endif
-}
-
-uint32_t mapping::getThreadIdInWarp() {
-  uint32_t ThreadIdInWarp = __gpu_lane_id();
-  ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
-  return ThreadIdInWarp;
-}
-
-uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
-  uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
-  return ThreadIdInBlock;
-}
-
-uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
-
-uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
-  uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
-  // If we are in SPMD mode, remove one warp.
-  return BlockSize - (!IsSPMD * mapping::getWarpSize());
-}
-uint32_t mapping::getMaxTeamThreads() {
-  return mapping::getMaxTeamThreads(mapping::isSPMDMode());
-}
-
-uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
-  return __gpu_num_threads(Dim);
-}
-
-uint32_t mapping::getNumberOfThreadsInKernel() {
-  return mapping::getNumberOfThreadsInBlock(0) *
-         mapping::getNumberOfBlocksInKernel(0) *
-         mapping::getNumberOfThreadsInBlock(1) *
-         mapping::getNumberOfBlocksInKernel(1) *
-         mapping::getNumberOfThreadsInBlock(2) *
-         mapping::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t mapping::getWarpIdInBlock() {
-  uint32_t WarpID =
-      mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-  ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
-  return WarpID;
-}
-
-uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
-  uint32_t BlockId = __gpu_block_id(Dim);
-  ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
-  return BlockId;
-}
-
-uint32_t mapping::getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
-}
-
-uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
-  return __gpu_num_blocks(Dim);
-}
-
-uint32_t mapping::getNumberOfProcessorElements() {
-  return static_cast<uint32_t>(config::getHardwareParallelism());
-}
-
-///}
-
-/// Execution mode
-///
-///{
-
-// TODO: This is a workaround for initialization coming from kernels outside of
-//       the TU. We will need to solve this more correctly in the future.
-[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
-
-void mapping::init(bool IsSPMD) {
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    IsSPMDMode = IsSPMD;
-}
-
-bool mapping::isSPMDMode() { return IsSPMDMode; }
-
-bool mapping::isGenericMode() { return !isSPMDMode(); }
-///}
-
-extern "C" {
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
-  return mapping::getThreadIdInBlock();
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
-  return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
-  return mapping::getWarpSize();
-}
-}
-
-#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME)                                \
-  extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
-
-_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
-_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
-_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
-_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
-
-extern "C" {
-uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
-  return utils::ballotSync(mask, pred);
-}
-
-int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
-  return utils::shuffleDown(mask, var, delta, width);
-}
-
-float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
-                            int width) {
-  return utils::bitCast<float>(
-      utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
-}
-
-long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
-  return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
-}
-
-double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
-                             int width) {
-  return utils::bitCast<double>(
-      utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
-}
-}
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
deleted file mode 100644
index a89f8b2a7453..000000000000
--- a/offload/DeviceRTL/src/Misc.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "Shared/RPCOpcodes.h"
-#include "shared/rpc.h"
-
-#include "Debug.h"
-
-namespace ompx {
-namespace impl {
-
-/// Lookup a device-side function using a host pointer /p HstPtr using the table
-/// provided by the device plugin. The table is an ordered pair of host and
-/// device pointers sorted on the value of the host pointer.
-void *indirectCallLookup(void *HstPtr) {
-  if (!HstPtr)
-    return nullptr;
-
-  struct IndirectCallTable {
-    void *HstPtr;
-    void *DevPtr;
-  };
-  IndirectCallTable *Table =
-      reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr());
-  uint64_t TableSize = config::getIndirectCallTableSize();
-
-  // If the table is empty we assume this is device pointer.
-  if (!Table || !TableSize)
-    return HstPtr;
-
-  uint32_t Left = 0;
-  uint32_t Right = TableSize;
-
-  // If the pointer is definitely not contained in the table we exit early.
-  if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr)
-    return HstPtr;
-
-  while (Left != Right) {
-    uint32_t Current = Left + (Right - Left) / 2;
-    if (Table[Current].HstPtr == HstPtr)
-      return Table[Current].DevPtr;
-
-    if (HstPtr < Table[Current].HstPtr)
-      Right = Current;
-    else
-      Left = Current;
-  }
-
-  // If we searched the whole table and found nothing this is a device pointer.
-  return HstPtr;
-}
-
-/// The openmp client instance used to communicate with the server.
-[[gnu::visibility("protected"),
-  gnu::weak]] rpc::Client Client asm("__llvm_rpc_client");
-
-} // namespace impl
-} // namespace ompx
-
-/// Interfaces
-///
-///{
-
-extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
-
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
-
-double omp_get_wtick(void) {
-  // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrieved by querying the driver. We rely on the device environment
-  // to inform us what the proper frequency is. NVPTX uses a nanosecond
-  // resolution, we could omit the global read but this makes it consistent.
-  return 1.0 / ompx::config::getClockFrequency();
-}
-
-double omp_get_wtime(void) {
-  return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
-}
-
-void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
-  return ompx::impl::indirectCallLookup(HstPtr);
-}
-
-void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
-  switch (allocator) {
-  case omp_default_mem_alloc:
-  case omp_large_cap_mem_alloc:
-  case omp_const_mem_alloc:
-  case omp_high_bw_mem_alloc:
-  case omp_low_lat_mem_alloc:
-    return malloc(size);
-  default:
-    return nullptr;
-  }
-}
-
-void omp_free(void *ptr, omp_allocator_handle_t allocator) {
-  switch (allocator) {
-  case omp_default_mem_alloc:
-  case omp_large_cap_mem_alloc:
-  case omp_const_mem_alloc:
-  case omp_high_bw_mem_alloc:
-  case omp_low_lat_mem_alloc:
-    free(ptr);
-  case omp_null_allocator:
-  default:
-    return;
-  }
-}
-
-unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
-  rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>();
-  Port.send_n(data, size);
-  Port.send([=](rpc::Buffer *buffer, uint32_t) {
-    buffer->data[0] = reinterpret_cast<uintptr_t>(fn);
-  });
-  unsigned long long Ret;
-  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
-    Ret = static_cast<unsigned long long>(Buffer->data[0]);
-  });
-  Port.close();
-  return Ret;
-}
-}
-
-///}
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
deleted file mode 100644
index 08ce616aee1c..000000000000
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Parallel implementation in the GPU. Here is the pattern:
-//
-//    while (not finished) {
-//
-//    if (master) {
-//      sequential code, decide which par loop to do, or if finished
-//     __kmpc_kernel_prepare_parallel() // exec by master only
-//    }
-//    syncthreads // A
-//    __kmpc_kernel_parallel() // exec by all
-//    if (this thread is included in the parallel) {
-//      switch () for all parallel loops
-//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
-//    }
-//
-//
-//    The reason we don't exec end_parallel for the threads not included
-//    in the parallel loop is that for each barrier in the parallel
-//    region, these non-included threads will cycle through the
-//    syncthread A. Thus they must preserve their current threadId that
-//    is larger than thread in team.
-//
-//    To make a long story short...
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
-  uint32_t NThreadsICV =
-      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getMaxTeamThreads();
-
-  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
-    NumThreads = NThreadsICV;
-
-  // SPMD mode allows any number of threads, for generic mode we round down to a
-  // multiple of WARPSIZE since it is legal to do so in OpenMP.
-  if (mapping::isSPMDMode())
-    return NumThreads;
-
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
-
-  return NumThreads;
-}
-
-// Invoke an outlined parallel function unwrapping arguments (up to 32).
-[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
-                                              void **args, int64_t nargs) {
-  switch (nargs) {
-#include "generated_microtask_cases.gen"
-  default:
-    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
-    __builtin_trap();
-  }
-}
-
-} // namespace
-
-extern "C" {
-
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
-                                                   int32_t num_threads,
-                                                   void *fn, void **args,
-                                                   const int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t PTeamSize =
-      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
-  // Avoid the race between the read of the `icv::Level` above and the write
-  // below by synchronizing all threads here.
-  synchronize::threadsAligned(atomic::seq_cst);
-  {
-    // Note that the order here is important. `icv::Level` has to be updated
-    // last or the other updates will cause a thread specific state to be
-    // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
-                                          1u, TId == 0, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
-                                     /*ForceTeamState=*/true);
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
-                               /*ForceTeamState=*/true);
-
-    // Synchronize all threads after the main thread (TId == 0) set up the
-    // team state properly.
-    synchronize::threadsAligned(atomic::acq_rel);
-
-    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
-                                      /*ForceTeamState=*/true);
-    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
-    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);
-
-    // Ensure we synchronize before we run user code to avoid invalidating the
-    // assumptions above.
-    synchronize::threadsAligned(atomic::relaxed);
-
-    if (!PTeamSize || TId < PTeamSize)
-      invokeMicrotask(TId, 0, fn, args, nargs);
-
-    // Synchronize all threads at the end of a parallel region.
-    synchronize::threadsAligned(atomic::seq_cst);
-  }
-
-  // Synchronize all threads to make sure every thread exits the scope above;
-  // otherwise the following assertions and the assumption in
-  // __kmpc_target_deinit may not hold.
-  synchronize::threadsAligned(atomic::acq_rel);
-
-  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
-  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
-  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);
-
-  // Ensure we synchronize to create an aligned region around the assumptions.
-  synchronize::threadsAligned(atomic::relaxed);
-
-  return;
-}
-
-[[clang::always_inline]] void
-__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-
-  // Assert the parallelism level is zero if disabled by the user.
-  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
-         "nested parallelism while disabled");
-
-  // Handle the serialized case first, same for SPMD/non-SPMD:
-  // 1) if-clause(0)
-  // 2) parallel in task or other thread state inducing construct
-  // 3) nested parallel regions
-  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
-                   (config::mayUseNestedParallelism() && icv::Level))) {
-    state::DateEnvironmentRAII DERAII(ident);
-    ++icv::Level;
-    invokeMicrotask(TId, 0, fn, args, nargs);
-    return;
-  }
-
-  // From this point forward we know that there is no thread state used.
-  ASSERT(state::HasThreadState == false, nullptr);
-
-  if (mapping::isSPMDMode()) {
-    // This was moved to its own routine so it could be called directly
-    // in certain situations to avoid resource consumption of unused
-    // logic in parallel_51.
-    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
-
-    return;
-  }
-
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
-  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-
-  // We do *not* create a new data environment because all threads in the team
-  // that are active are now running this parallel region. They share the
-  // TeamState, which has an increase level-var and potentially active-level
-  // set, but they do not have individual ThreadStates yet. If they ever
-  // modify the ICVs beyond this point a ThreadStates will be allocated.
-
-  bool IsActiveParallelRegion = NumThreads > 1;
-  if (!IsActiveParallelRegion) {
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
-    invokeMicrotask(TId, 0, fn, args, nargs);
-    return;
-  }
-
-  void **GlobalArgs = nullptr;
-  if (nargs) {
-    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
-    switch (nargs) {
-    default:
-      for (int I = 0; I < nargs; I++)
-        GlobalArgs[I] = args[I];
-      break;
-    case 16:
-      GlobalArgs[15] = args[15];
-      [[fallthrough]];
-    case 15:
-      GlobalArgs[14] = args[14];
-      [[fallthrough]];
-    case 14:
-      GlobalArgs[13] = args[13];
-      [[fallthrough]];
-    case 13:
-      GlobalArgs[12] = args[12];
-      [[fallthrough]];
-    case 12:
-      GlobalArgs[11] = args[11];
-      [[fallthrough]];
-    case 11:
-      GlobalArgs[10] = args[10];
-      [[fallthrough]];
-    case 10:
-      GlobalArgs[9] = args[9];
-      [[fallthrough]];
-    case 9:
-      GlobalArgs[8] = args[8];
-      [[fallthrough]];
-    case 8:
-      GlobalArgs[7] = args[7];
-      [[fallthrough]];
-    case 7:
-      GlobalArgs[6] = args[6];
-      [[fallthrough]];
-    case 6:
-      GlobalArgs[5] = args[5];
-      [[fallthrough]];
-    case 5:
-      GlobalArgs[4] = args[4];
-      [[fallthrough]];
-    case 4:
-      GlobalArgs[3] = args[3];
-      [[fallthrough]];
-    case 3:
-      GlobalArgs[2] = args[2];
-      [[fallthrough]];
-    case 2:
-      GlobalArgs[1] = args[1];
-      [[fallthrough]];
-    case 1:
-      GlobalArgs[0] = args[0];
-      [[fallthrough]];
-    case 0:
-      break;
-    }
-  }
-
-  {
-    // Note that the order here is important. `icv::Level` has to be updated
-    // last or the other updates will cause a thread specific state to be
-    // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
-                                          1u, true, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
-                                     /*ForceTeamState=*/true);
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
-                               /*ForceTeamState=*/true);
-
-    // Master signals work to activate workers.
-    synchronize::threads(atomic::seq_cst);
-    // Master waits for workers to signal.
-    synchronize::threads(atomic::seq_cst);
-  }
-
-  if (nargs)
-    __kmpc_end_sharing_variables();
-}
-
-[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
-  // Work function and arguments for L1 parallel region.
-  *WorkFn = state::ParallelRegionFn;
-
-  // If this is the termination signal from the master, quit early.
-  if (!*WorkFn)
-    return false;
-
-  // Set to true for workers participating in the parallel region.
-  uint32_t TId = mapping::getThreadIdInBlock();
-  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
-  return ThreadIsActive;
-}
-
-[[clang::noinline]] void __kmpc_kernel_end_parallel() {
-  // In case we have modified an ICV for this thread before a ThreadState was
-  // created. We drop it now to not contaminate the next parallel region.
-  ASSERT(!mapping::isSPMDMode(), nullptr);
-  uint32_t TId = mapping::getThreadIdInBlock();
-  state::resetStateForThread(TId);
-  ASSERT(!mapping::isSPMDMode(), nullptr);
-}
-
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
-
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
-
-void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
-                           int32_t thread_limit) {}
-
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
-}
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
deleted file mode 100644
index df141af5ebee..000000000000
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===------- Profiling.cpp ---------------------------------------- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Profiling.h"
-
-extern "C" {
-
-// Provides empty implementations for certain functions in compiler-rt
-// that are emitted by the PGO instrumentation.
-void __llvm_profile_register_function(void *Ptr) {}
-void __llvm_profile_register_names_function(void *Ptr, long int I) {}
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
-}
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
deleted file mode 100644
index fffd0063940c..000000000000
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of reduction with KMPC interface.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
-  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
-    shflFct(reduce_data, /*LaneId - not used= */ 0,
-            /*Offset = */ mask, /*AlgoVersion=*/0);
-  }
-}
-
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
-                               uint32_t size, uint32_t tid) {
-  uint32_t curr_size;
-  uint32_t mask;
-  curr_size = size;
-  mask = curr_size / 2;
-  while (mask > 0) {
-    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
-    curr_size = (curr_size + 1) / 2;
-    mask = curr_size / 2;
-  }
-}
-
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
-                                            ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
-    BlockThreadId = 0;
-  uint32_t NumThreads = omp_get_num_threads();
-  if (NumThreads == 1)
-    return 1;
-
-  //
-  // This reduce function handles reduction within a team. It handles
-  // parallel regions in both L1 and L2 parallelism levels. It also
-  // supports Generic, SPMD, and NoOMP modes.
-  //
-  // 1. Reduce within a warp.
-  // 2. Warp master copies value to warp 0 via shared memory.
-  // 3. Warp 0 reduces to a single value.
-  // 4. The reduced value is available in the thread that returns 1.
-  //
-
-#if __has_builtin(__nvvm_reflect)
-  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    uint32_t WarpId = mapping::getWarpIdInBlock();
-
-    // Volta execution model:
-    // For the Generic execution mode a parallel region either has 1 thread and
-    // beyond that, always a multiple of 32. For the SPMD execution mode we may
-    // have any number of threads.
-    if ((NumThreads % mapping::getWarpSize() == 0) ||
-        (WarpId < WarpsNeeded - 1))
-      gpu_regular_warp_reduce(reduce_data, shflFct);
-    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-      gpu_irregular_warp_reduce(
-          reduce_data, shflFct,
-          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
-
-    // When we have more than [mapping::getWarpSize()] number of threads
-    // a block reduction is performed here.
-    //
-    // Only L1 parallel region can enter this if condition.
-    if (NumThreads > mapping::getWarpSize()) {
-      // Gather all the reduced values from each warp
-      // to the first warp.
-      cpyFct(reduce_data, WarpsNeeded);
-
-      if (WarpId == 0)
-        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                  BlockThreadId);
-    }
-    return BlockThreadId == 0;
-  }
-#endif
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
-  }
-
-  // Get the OMP thread Id. This is different from BlockThreadId in the case
-  // of an L2 parallel region.
-  return BlockThreadId == 0;
-}
-
-uint32_t roundToWarpsize(uint32_t s) {
-  if (s < mapping::getWarpSize())
-    return 1;
-  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
-}
-
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-} // namespace
-
-extern "C" {
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
-                                               uint64_t reduce_data_size,
-                                               void *reduce_data,
-                                               ShuffleReductFnTy shflFct,
-                                               InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
-}
-
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // Terminate all threads in non-SPMD mode except for the master thread.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
-    if (!mapping::isMainThreadInGenericMode())
-      return 0;
-    ThreadId = 0;
-  }
-
-  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team master participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
-  uint32_t TeamId = omp_get_team_num();
-  uint32_t NumTeams = omp_get_num_teams();
-  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
-  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
-  // Block progress for teams greater than the current upper
-  // limit. We always only allow a number of teams less or equal
-  // to the number of slots in the buffer.
-  bool IsMaster = (ThreadId == 0);
-  while (IsMaster) {
-    Bound = atomic::load(&IterCnt, atomic::acquire);
-    if (TeamId < Bound + num_of_records)
-      break;
-  }
-
-  if (IsMaster) {
-    int ModBockId = TeamId % num_of_records;
-    if (TeamId < num_of_records) {
-      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
-    } else
-      lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // num_of_records chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
-                                 atomic::MemScopeTy::device);
-  }
-
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
-  if (mapping::isSPMDMode())
-    synchronize::threadsAligned(atomic::acq_rel);
-
-  // reduce_data is global or shared so before being reduced within the
-  // warp we need to bring it in local memory:
-  // local_reduce_data = reduce_data[i]
-  //
-  // Example for 3 reduction variables a, b, c (of potentially different
-  // types):
-  //
-  // buffer layout (struct of arrays):
-  // a, a, ..., a, b, b, ... b, c, c, ... c
-  // |__________|
-  //     num_of_records
-  //
-  // local_data_reduce layout (struct):
-  // a, b, c
-  //
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //      1. do reduction within each warp.
-  //      2. do reduction across warps.
-  //      3. write the final result to the main reduction variable
-  //         by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
-  if (ChunkTeamCount == NumTeams - Bound - 1) {
-    // Ensure we see the global memory writes by other teams
-    fence::kernel(atomic::acquire);
-
-    //
-    // Last team processing.
-    //
-    if (ThreadId >= NumRecs)
-      return 0;
-    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
-    if (ThreadId >= NumThreads)
-      return 0;
-
-    // Load from buffer and reduce.
-    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
-      glredFct(GlobalBuffer, i, reduce_data);
-
-    // Reduce across warps to the warp master.
-    if (NumThreads > 1) {
-      gpu_regular_warp_reduce(reduce_data, shflFct);
-
-      // When we have more than [mapping::getWarpSize()] number of threads
-      // a block reduction is performed here.
-      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
-      if (ActiveThreads > mapping::getWarpSize()) {
-        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
-                               mapping::getWarpSize();
-        // Gather all the reduced values from each warp
-        // to the first warp.
-        cpyFct(reduce_data, WarpsNeeded);
-
-        uint32_t WarpId = ThreadId / mapping::getWarpSize();
-        if (WarpId == 0)
-          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                    ThreadId);
-      }
-    }
-
-    if (IsMaster) {
-      Cnt = 0;
-      IterCnt = 0;
-      return 1;
-    }
-    return 0;
-  }
-  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
-    // Allow SIZE number of teams to proceed writing their
-    // intermediate results to the global buffer.
-    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
-  }
-
-  return 0;
-}
-}
-
-void *__kmpc_reduction_get_fixed_buffer() {
-  return state::getKernelLaunchEnvironment().ReductionBuffer;
-}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
deleted file mode 100644
index 475395102f47..000000000000
--- a/offload/DeviceRTL/src/State.cpp
+++ /dev/null
@@ -1,482 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-/// Memory implementation
-///
-///{
-
-/// External symbol to access dynamic shared memory.
-[[gnu::aligned(
-    allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
-
-/// The kernel environment passed to the init method by the compiler.
-[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
-    KernelEnvironmentPtr;
-
-/// The kernel launch environment passed as argument to the kernel by the
-/// runtime.
-[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
-    KernelLaunchEnvironmentPtr;
-
-///}
-
-namespace {
-
-/// Fallback implementations are missing to trigger a link time error.
-/// Implementations for new devices, including the host, should go into a
-/// dedicated begin/end declare variant.
-///
-///{
-extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-
-[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
-
-#else
-
-[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
-
-#endif
-}
-///}
-
-/// A "smart" stack in shared memory.
-///
-/// The stack exposes a malloc/free interface but works like a stack internally.
-/// In fact, it is a separate stack *per warp*. That means, each warp must push
-/// and pop symmetrically or this breaks, badly. The implementation will (aim
-/// to) detect non-lock-step warps and fallback to malloc/free. The same will
-/// happen if a warp runs out of memory. The master warp in generic memory is
-/// special and is given more memory than the rest.
-///
-struct SharedMemorySmartStackTy {
-  /// Initialize the stack. Must be called by all threads.
-  void init(bool IsSPMD);
-
-  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
-  /// can call this function.
-  void *push(uint64_t Bytes);
-
-  /// Deallocate the last allocation made by the encountering thread and pointed
-  /// to by \p Ptr from the stack. Each thread can call this function.
-  void pop(void *Ptr, uint64_t Bytes);
-
-private:
-  /// Compute the size of the storage space reserved for a thread.
-  uint32_t computeThreadStorageTotal() {
-    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
-    return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock,
-                                allocator::ALIGNMENT);
-  }
-
-  /// Return the top address of the warp data stack, that is the first address
-  /// this warp will allocate memory at next.
-  void *getThreadDataTop(uint32_t TId) {
-    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
-  }
-
-  /// The actual storage, shared among all warps.
-  [[gnu::aligned(
-      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
-  [[gnu::aligned(
-      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
-};
-
-static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
-              "Shared scratchpad of this size not supported yet.");
-
-/// The allocation of a single shared memory scratchpad.
-[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
-    SharedMemorySmartStack;
-
-void SharedMemorySmartStackTy::init(bool IsSPMD) {
-  Usage[mapping::getThreadIdInBlock()] = 0;
-}
-
-void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
-  // First align the number of requested bytes.
-  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
-  /// be passed in as an argument and the stack rewritten to support it.
-  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
-
-  uint32_t StorageTotal = computeThreadStorageTotal();
-
-  // The main thread in generic mode gets the space of its entire warp as the
-  // other threads do not participate in any computation at all.
-  if (mapping::isMainThreadInGenericMode())
-    StorageTotal *= mapping::getWarpSize();
-
-  int TId = mapping::getThreadIdInBlock();
-  if (Usage[TId] + AlignedBytes <= StorageTotal) {
-    void *Ptr = getThreadDataTop(TId);
-    Usage[TId] += AlignedBytes;
-    return Ptr;
-  }
-
-  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
-    printf("Shared memory stack full, fallback to dynamic allocation of global "
-           "memory will negatively impact performance.\n");
-  void *GlobalMemory = memory::allocGlobal(
-      AlignedBytes, "Slow path shared memory allocation, insufficient "
-                    "shared memory stack memory!");
-  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
-
-  return GlobalMemory;
-}
-
-void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
-  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
-  if (utils::isSharedMemPtr(Ptr)) {
-    int TId = mapping::getThreadIdInBlock();
-    Usage[TId] -= AlignedBytes;
-    return;
-  }
-  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
-}
-
-} // namespace
-
-void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
-
-void *memory::allocShared(uint64_t Bytes, const char *Reason) {
-  return SharedMemorySmartStack.push(Bytes);
-}
-
-void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
-  SharedMemorySmartStack.pop(Ptr, Bytes);
-}
-
-void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
-  void *Ptr = malloc(Bytes);
-  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
-    printf("nullptr returned by malloc!\n");
-  return Ptr;
-}
-
-void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
-
-///}
-
-bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
-  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
-         (ActiveLevelVar == Other.ActiveLevelVar) &
-         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
-         (RunSchedVar == Other.RunSchedVar) &
-         (RunSchedChunkVar == Other.RunSchedChunkVar);
-}
-
-void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
-  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
-  ASSERT(LevelVar == Other.LevelVar, nullptr);
-  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
-  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
-  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
-  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
-}
-
-void state::TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = 0;
-  ICVState.LevelVar = 0;
-  ICVState.ActiveLevelVar = 0;
-  ICVState.Padding0Val = 0;
-  ICVState.MaxActiveLevelsVar = 1;
-  ICVState.RunSchedVar = omp_sched_static;
-  ICVState.RunSchedChunkVar = 1;
-  ParallelTeamSize = 1;
-  HasThreadState = false;
-  ParallelRegionFnVar = nullptr;
-}
-
-bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
-  return (ICVState == Other.ICVState) &
-         (HasThreadState == Other.HasThreadState) &
-         (ParallelTeamSize == Other.ParallelTeamSize);
-}
-
-void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
-  ICVState.assertEqual(Other.ICVState);
-  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
-  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
-}
-
-[[clang::loader_uninitialized]] Local<state::TeamStateTy>
-    ompx::state::TeamState;
-[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
-    ompx::state::ThreadStates;
-
-namespace {
-
-int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
-                             int OutOfBoundsVal = -1) {
-  if (Level == 0)
-    return DefaultVal;
-  int LevelVar = omp_get_level();
-  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
-    return OutOfBoundsVal;
-  int ActiveLevel = icv::ActiveLevel;
-  if (OMP_UNLIKELY(Level != ActiveLevel))
-    return DefaultVal;
-  return Val;
-}
-
-} // namespace
-
-void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  SharedMemorySmartStack.init(IsSPMD);
-  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
-    TeamState.init(IsSPMD);
-    ThreadStates = nullptr;
-    KernelEnvironmentPtr = &KernelEnvironment;
-    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
-  }
-}
-
-KernelEnvironmentTy &state::getKernelEnvironment() {
-  return *KernelEnvironmentPtr;
-}
-
-KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
-  return *KernelLaunchEnvironmentPtr;
-}
-
-void state::enterDataEnvironment(IdentTy *Ident) {
-  ASSERT(config::mayUseThreadStates(),
-         "Thread state modified while explicitly disabled!");
-  if (!config::mayUseThreadStates())
-    return;
-
-  unsigned TId = mapping::getThreadIdInBlock();
-  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
-      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
-  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
-  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
-    uint32_t Bytes =
-        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
-    void *ThreadStatesPtr =
-        memory::allocGlobal(Bytes, "Thread state array allocation");
-    __builtin_memset(ThreadStatesPtr, 0, Bytes);
-    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
-                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
-                     atomic::seq_cst, atomic::seq_cst))
-      memory::freeGlobal(ThreadStatesPtr,
-                         "Thread state array allocated multiple times");
-    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
-           "Expected valid thread states bit!");
-  }
-  NewThreadState->init(ThreadStates[TId]);
-  TeamState.HasThreadState = true;
-  ThreadStates[TId] = NewThreadState;
-}
-
-void state::exitDataEnvironment() {
-  ASSERT(config::mayUseThreadStates(),
-         "Thread state modified while explicitly disabled!");
-
-  unsigned TId = mapping::getThreadIdInBlock();
-  resetStateForThread(TId);
-}
-
-void state::resetStateForThread(uint32_t TId) {
-  if (!config::mayUseThreadStates())
-    return;
-  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
-    return;
-
-  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
-  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
-  ThreadStates[TId] = PreviousThreadState;
-}
-
-void state::runAndCheckState(void(Func(void))) {
-  TeamStateTy OldTeamState = TeamState;
-  OldTeamState.assertEqual(TeamState);
-
-  Func();
-
-  OldTeamState.assertEqual(TeamState);
-}
-
-void state::assumeInitialState(bool IsSPMD) {
-  TeamStateTy InitialTeamState;
-  InitialTeamState.init(IsSPMD);
-  InitialTeamState.assertEqual(TeamState);
-  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
-}
-
-int state::getEffectivePTeamSize() {
-  int PTeamSize = state::ParallelTeamSize;
-  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
-}
-
-extern "C" {
-void omp_set_dynamic(int V) {}
-
-int omp_get_dynamic(void) { return 0; }
-
-void omp_set_num_threads(int V) { icv::NThreads = V; }
-
-int omp_get_max_threads(void) {
-  int NT = icv::NThreads;
-  return NT > 0 ? NT : mapping::getMaxTeamThreads();
-}
-
-int omp_get_level(void) {
-  int LevelVar = icv::Level;
-  ASSERT(LevelVar >= 0, nullptr);
-  return LevelVar;
-}
-
-int omp_get_active_level(void) { return !!icv::ActiveLevel; }
-
-int omp_in_parallel(void) { return !!icv::ActiveLevel; }
-
-void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
-  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
-  *ChunkSize = state::RunSchedChunk;
-}
-
-void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
-  icv::RunSched = (int)ScheduleKind;
-  state::RunSchedChunk = ChunkSize;
-}
-
-int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
-}
-
-int omp_get_thread_num(void) {
-  return omp_get_ancestor_thread_num(omp_get_level());
-}
-
-int omp_get_team_size(int Level) {
-  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
-}
-
-int omp_get_num_threads(void) {
-  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
-}
-
-int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
-
-int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
-
-void omp_set_nested(int) {}
-
-int omp_get_nested(void) { return false; }
-
-void omp_set_max_active_levels(int Levels) {
-  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
-}
-
-int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
-
-omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
-
-int omp_get_num_places(void) { return 0; }
-
-int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
-
-void omp_get_place_proc_ids(int, int *) {
-  // TODO
-}
-
-int omp_get_place_num(void) { return 0; }
-
-int omp_get_partition_num_places(void) { return 0; }
-
-void omp_get_partition_place_nums(int *) {
-  // TODO
-}
-
-int omp_get_cancellation(void) { return 0; }
-
-void omp_set_default_device(int) {}
-
-int omp_get_default_device(void) { return -1; }
-
-int omp_get_num_devices(void) { return config::getNumDevices(); }
-
-int omp_get_device_num(void) { return config::getDeviceNum(); }
-
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
-
-int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
-
-int omp_get_initial_device(void) { return -1; }
-
-int omp_is_initial_device(void) { return 0; }
-}
-
-extern "C" {
-[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
-  return memory::allocShared(Bytes, "Frontend alloc shared");
-}
-
-[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
-  memory::freeShared(Ptr, Bytes, "Frontend free shared");
-}
-
-void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
-
-void *llvm_omp_target_dynamic_shared_alloc() {
-  return __kmpc_get_dynamic_shared();
-}
-
-void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
-
-/// Allocate storage in shared memory to communicate arguments from the main
-/// thread to the workers in generic mode. If we exceed
-/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
-constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
-
-[[clang::loader_uninitialized]] static Local<void *>
-    SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-[[clang::loader_uninitialized]] static Local<void **>
-    SharedMemVariableSharingSpacePtr;
-
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
-  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
-    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
-  } else {
-    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
-        nArgs * sizeof(void *), "new extended args");
-    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
-           "Nullptr returned by malloc!");
-  }
-  *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-
-void __kmpc_end_sharing_variables() {
-  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
-    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
-}
-
-void __kmpc_get_shared_variables(void ***GlobalArgs) {
-  *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-}
diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp
deleted file mode 100644
index e833423eb265..000000000000
--- a/offload/DeviceRTL/src/Stub.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// This is an empty file used to create a device fatbinary.
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
deleted file mode 100644
index 2f1ed34a3f6d..000000000000
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Include all synchronization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Synchronization.h"
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-
-using namespace ompx;
-
-namespace impl {
-
-/// Atomics
-///
-///{
-///}
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope) {
-  // builtin_amdgcn_atomic_inc32 should expand to this switch when
-  // passed a runtime value, but does not do so yet. Workaround here.
-
-#define ScopeSwitch(ORDER)                                                     \
-  switch (MemScope) {                                                          \
-  case atomic::MemScopeTy::system:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
-  case atomic::MemScopeTy::device:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
-  case atomic::MemScopeTy::workgroup:                                          \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
-  case atomic::MemScopeTy::wavefront:                                          \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront");            \
-  case atomic::MemScopeTy::single:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread");         \
-  }
-
-#define Case(ORDER)                                                            \
-  case ORDER:                                                                  \
-    ScopeSwitch(ORDER)
-
-  switch (Ordering) {
-  default:
-    __builtin_unreachable();
-    Case(atomic::relaxed);
-    Case(atomic::acquire);
-    Case(atomic::release);
-    Case(atomic::acq_rel);
-    Case(atomic::seq_cst);
-#undef Case
-#undef ScopeSwitch
-  }
-}
-
-[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
-
-void namedBarrierInit() {
-  // Don't have global ctors, and shared memory is not zero init
-  atomic::store(&namedBarrierTracker, 0u, atomic::release);
-}
-
-void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
-  // assert(NumThreads % 32 == 0);
-
-  uint32_t WarpSize = mapping::getWarpSize();
-  uint32_t NumWaves = NumThreads / WarpSize;
-
-  fence::team(atomic::acquire);
-
-  // named barrier implementation for amdgcn.
-  // Uses two 16 bit unsigned counters. One for the number of waves to have
-  // reached the barrier, and one to count how many times the barrier has been
-  // passed. These are packed in a single atomically accessed 32 bit integer.
-  // Low bits for the number of waves, assumed zero before this call.
-  // High bits to count the number of times the barrier has been passed.
-
-  // precondition: NumWaves != 0;
-  // invariant: NumWaves * WarpSize == NumThreads;
-  // precondition: NumWaves < 0xffffu;
-
-  // Increment the low 16 bits once, using the lowest active thread.
-  if (mapping::isLeaderInWarp()) {
-    uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                                atomic::relaxed); // commutative
-
-    // Record the number of times the barrier has been passed
-    uint32_t generation = load & 0xffff0000u;
-
-    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
-      // Reached NumWaves in low bits so this is the last wave.
-      // Set low bits to zero and increment high bits
-      load += 0x00010000u; // wrap is safe
-      load &= 0xffff0000u; // because bits zeroed second
-
-      // Reset the wave counter and release the waiting waves
-      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
-    } else {
-      // more waves still to go, spin until generation counter changes
-      do {
-        __builtin_amdgcn_s_sleep(0);
-        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
-      } while ((load & 0xffff0000u) == generation);
-    }
-  }
-  fence::team(atomic::release);
-}
-
-void fenceTeam(atomic::OrderingTy Ordering) {
-  return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
-}
-
-void fenceKernel(atomic::OrderingTy Ordering) {
-  return __scoped_atomic_thread_fence(Ordering, atomic::device);
-}
-
-void fenceSystem(atomic::OrderingTy Ordering) {
-  return __scoped_atomic_thread_fence(Ordering, atomic::system);
-}
-
-void syncWarp(__kmpc_impl_lanemask_t) {
-  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
-  // to enforce convergent behaviour between control flow graphs.
-  __builtin_amdgcn_wave_barrier();
-}
-
-void syncThreads(atomic::OrderingTy Ordering) {
-  if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
-
-  __builtin_amdgcn_s_barrier();
-
-  if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
-}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-
-// TODO: Don't have wavefront lane locks. Possibly can't have them.
-void unsetLock(omp_lock_t *) { __builtin_trap(); }
-int testLock(omp_lock_t *) { __builtin_trap(); }
-void initLock(omp_lock_t *) { __builtin_trap(); }
-void destroyLock(omp_lock_t *) { __builtin_trap(); }
-void setLock(omp_lock_t *) { __builtin_trap(); }
-
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-void unsetCriticalLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
-}
-
-void setCriticalLock(omp_lock_t *Lock) {
-  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
-  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
-    fenceKernel(atomic::release);
-    while (
-        !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
-      __builtin_amdgcn_s_sleep(32);
-    }
-    fenceKernel(atomic::acquire);
-  }
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope) {
-  return __nvvm_atom_inc_gen_ui(Address, Val);
-}
-
-void namedBarrierInit() {}
-
-void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
-  ASSERT(NumThreads % 32 == 0, nullptr);
-
-  // The named barrier for active parallel threads of a team in an L1 parallel
-  // region to synchronize with each other.
-  constexpr int BarrierNo = 7;
-  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
-}
-
-void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
-
-void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }
-
-void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
-
-void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
-
-void syncThreads(atomic::OrderingTy Ordering) {
-  constexpr int BarrierNo = 8;
-  __nvvm_barrier_sync(BarrierNo);
-}
-
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
-
-constexpr uint32_t OMP_SPIN = 1000;
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-// TODO: This seems to hide a bug in the declare variant handling. If it is
-// called before it is defined
-//       here the overload won't happen. Investigate lalter!
-void unsetLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
-}
-
-int testLock(omp_lock_t *Lock) {
-  return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst);
-}
-
-void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void setLock(omp_lock_t *Lock) {
-  // TODO: not sure spinning is a good idea here..
-  while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
-                     atomic::seq_cst) != UNSET) {
-    int32_t start = __nvvm_read_ptx_sreg_clock();
-    int32_t now;
-    for (;;) {
-      now = __nvvm_read_ptx_sreg_clock();
-      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
-        break;
-      }
-    }
-  } // wait for 0 to be the read value
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
-
-#endif
-///}
-
-} // namespace impl
-
-void synchronize::init(bool IsSPMD) {
-  if (!IsSPMD)
-    impl::namedBarrierInit();
-}
-
-void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-
-void synchronize::threads(atomic::OrderingTy Ordering) {
-  impl::syncThreads(Ordering);
-}
-
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
-  impl::syncThreadsAligned(Ordering);
-}
-
-void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
-
-void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
-
-void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
-
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
-                     atomic::MemScopeTy MemScope) {
-  return impl::atomicInc(Addr, V, Ordering, MemScope);
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-extern "C" {
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
-
-int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
-  __kmpc_barrier(Loc, TId);
-  return 0;
-}
-
-void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
-  if (mapping::isSPMDMode())
-    return __kmpc_barrier_simple_spmd(Loc, TId);
-
-  // Generic parallel regions are run with multiple of the warp size or single
-  // threaded, in the latter case we need to stop here.
-  if (omp_get_num_threads() == 1)
-    return __kmpc_flush(Loc);
-
-  impl::namedBarrier();
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
-  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
-                                                       int32_t TId) {
-  synchronize::threads(atomic::OrderingTy::seq_cst);
-}
-
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
-  return omp_get_thread_num() == 0;
-}
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
-
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
-  return omp_get_thread_num() == Filter;
-}
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}
-
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
-  return __kmpc_master(Loc, TId);
-}
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
-  // The barrier is explicitly called.
-}
-
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }
-
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
-
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
-
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
-  impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
-  impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
-
-void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
-
-void ompx_sync_block(int Ordering) {
-  impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
-}
-void ompx_sync_block_acq_rel() {
-  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
-}
-void ompx_sync_block_divergent(int Ordering) {
-  impl::syncThreads(atomic::OrderingTy(Ordering));
-}
-} // extern "C"
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
deleted file mode 100644
index d0be0ace50df..000000000000
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Task implementation support.
-//
-// TODO: We should not allocate and execute the task in two steps. A new API is
-//       needed for that though.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "State.h"
-
-using namespace ompx;
-
-extern "C" {
-
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
-                                        size_t TaskSizeInclPrivateValues,
-                                        size_t SharedValuesSize,
-                                        TaskFnTy TaskFn) {
-  auto TaskSizeInclPrivateValuesPadded =
-      utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *));
-  auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
-  TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
-      TaskSizeTotal, "explicit task descriptor");
-  TaskDescriptor->Payload =
-      utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
-  TaskDescriptor->TaskFn = TaskFn;
-
-  return TaskDescriptor;
-}
-
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
-                        TaskDescriptorTy *TaskDescriptor) {
-  return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-}
-
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor, int32_t,
-                                  void *, int32_t, void *) {
-  state::DateEnvironmentRAII DERAII(Loc);
-
-  TaskDescriptor->TaskFn(0, TaskDescriptor);
-
-  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-  return 0;
-}
-
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
-                               TaskDescriptorTy *TaskDescriptor) {
-  state::enterDataEnvironment(Loc);
-}
-
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor) {
-  state::exitDataEnvironment();
-
-  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-}
-
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *) {}
-
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
-
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
-
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
-
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
-
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
-                     TaskDescriptorTy *TaskDescriptor, int,
-                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
-                     int32_t, uint64_t, void *) {
-  // Skip task entirely if empty iteration space.
-  if (*LowerBound > *UpperBound)
-    return;
-
-  // The compiler has already stored lb and ub in the TaskDescriptorTy structure
-  // as we are using a single task to execute the entire loop, we can leave
-  // the initial task_t untouched
-  __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-}
-
-int omp_in_final(void) {
-  // treat all tasks as final... Specs may expect runtime to keep
-  // track more precisely if a task was actively set by users... This
-  // is not explicitly specified; will treat as if runtime can
-  // actively decide to put a non-final task into a final one.
-  return 1;
-}
-
-int omp_get_max_task_priority(void) { return 0; }
-}
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
deleted file mode 100644
index 59a2cc3f27ac..000000000000
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ /dev/null
@@ -1,970 +0,0 @@
-//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the KMPC interface
-// for the loop construct plus other worksharing constructs that use the same
-// interface as loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Workshare.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-// TODO:
-struct DynamicScheduleTracker {
-  int64_t Chunk;
-  int64_t LoopUpperBound;
-  int64_t NextLowerBound;
-  int64_t Stride;
-  kmp_sched_t ScheduleType;
-  DynamicScheduleTracker *NextDST;
-};
-
-#define ASSERT0(...)
-
-// used by the library for the interface with the app
-#define DISPATCH_FINISHED 0
-#define DISPATCH_NOTFINISHED 1
-
-// used by dynamic scheduling
-#define FINISHED 0
-#define NOT_FINISHED 1
-#define LAST_CHUNK 2
-
-// TODO: This variable is a hack inherited from the old runtime.
-[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
-
-template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
-  ////////////////////////////////////////////////////////////////////////////////
-  // Loop with static scheduling with chunk
-
-  // Generic implementation of OMP loop scheduling with static policy
-  /*! \brief Calculate initial bounds for static loop and stride
-   *  @param[in] loc location in code of the call (not used here)
-   *  @param[in] global_tid global thread id
-   *  @param[in] schetype type of scheduling (see omptarget-nvptx.h)
-   *  @param[in] plastiter pointer to last iteration
-   *  @param[in,out] pointer to loop lower bound. it will contain value of
-   *  lower bound of first chunk
-   *  @param[in,out] pointer to loop upper bound. It will contain value of
-   *  upper bound of first chunk
-   *  @param[in,out] pointer to loop stride. It will contain value of stride
-   *  between two successive chunks executed by the same thread
-   *  @param[in] loop increment bump
-   *  @param[in] chunk size
-   */
-
-  // helper function for static chunk
-  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
-                             T entityId, T numberOfEntities) {
-    // each thread executes multiple chunks all of the same size, except
-    // the last one
-    // distance between two successive chunks
-    stride = numberOfEntities * chunk;
-    lb = lb + entityId * chunk;
-    T inputUb = ub;
-    ub = lb + chunk - 1; // Clang uses i <= ub
-    // Say ub' is the beginning of the last chunk. Then who ever has a
-    // lower bound plus a multiple of the increment equal to ub' is
-    // the last one.
-    T beginingLastChunk = inputUb - (inputUb % chunk);
-    last = ((beginingLastChunk - lb) % stride) == 0;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Loop with static scheduling without chunk
-
-  // helper function for static no chunk
-  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
-                               T entityId, T numberOfEntities) {
-    // No chunk size specified.  Each thread or warp gets at most one
-    // chunk; chunks are all almost of equal size
-    T loopSize = ub - lb + 1;
-
-    chunk = loopSize / numberOfEntities;
-    T leftOver = loopSize - chunk * numberOfEntities;
-
-    if (entityId < leftOver) {
-      chunk++;
-      lb = lb + entityId * chunk;
-    } else {
-      lb = lb + entityId * chunk + leftOver;
-    }
-
-    T inputUb = ub;
-    ub = lb + chunk - 1; // Clang uses i <= ub
-    last = lb <= inputUb && inputUb <= ub;
-    stride = loopSize; // make sure we only do 1 chunk per warp
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for Static Init
-
-  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
-                              T *plower, T *pupper, ST *pstride, ST chunk,
-                              bool IsSPMDExecutionMode) {
-    int32_t gtid = omp_get_thread_num();
-    int numberOfActiveOMPThreads = omp_get_num_threads();
-
-    // All warps that are in excess of the maximum requested, do
-    // not execute the loop
-    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
-            "current thread is not needed here; error");
-
-    // copy
-    int lastiter = 0;
-    T lb = *plower;
-    T ub = *pupper;
-    ST stride = *pstride;
-
-    // init
-    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
-    case kmp_sched_static_chunk: {
-      if (chunk > 0) {
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_static_balanced_chunk: {
-      if (chunk > 0) {
-        // round up to make sure the chunk is enough to cover all iterations
-        T tripCount = ub - lb + 1; // +1 because ub is inclusive
-        T span = (tripCount + numberOfActiveOMPThreads - 1) /
-                 numberOfActiveOMPThreads;
-        // perform chunk adjustment
-        chunk = (span + chunk - 1) & ~(chunk - 1);
-
-        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
-        T oldUb = ub;
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-        if (ub > oldUb)
-          ub = oldUb;
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_static_nochunk: {
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-      break;
-    }
-    case kmp_sched_distr_static_chunk: {
-      if (chunk > 0) {
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
-                       omp_get_num_teams());
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_distr_static_nochunk: {
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
-                       omp_get_num_teams());
-      break;
-    }
-    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
-      ForStaticChunk(lastiter, lb, ub, stride, chunk,
-                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
-                     omp_get_num_teams() * numberOfActiveOMPThreads);
-      break;
-    }
-    default: {
-      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                     numberOfActiveOMPThreads);
-      break;
-    }
-    }
-    // copy back
-    *plastiter = lastiter;
-    *plower = lb;
-    *pupper = ub;
-    *pstride = stride;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for dispatch Init
-
-  static int OrderedSchedule(kmp_sched_t schedule) {
-    return schedule >= kmp_sched_ordered_first &&
-           schedule <= kmp_sched_ordered_last;
-  }
-
-  static void dispatch_init(IdentTy *loc, int32_t threadId,
-                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
-                            DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
-    T tnum = omp_get_num_threads();
-    T tripCount = ub - lb + 1; // +1 because ub is inclusive
-    ASSERT0(LT_FUSSY, threadId < tnum,
-            "current thread is not needed here; error");
-
-    /* Currently just ignore the monotonic and non-monotonic modifiers
-     * (the compiler isn't producing them * yet anyway).
-     * When it is we'll want to look at them somewhere here and use that
-     * information to add to our schedule choice. We shouldn't need to pass
-     * them on, they merely affect which schedule we can legally choose for
-     * various dynamic cases. (In particular, whether or not a stealing scheme
-     * is legal).
-     */
-    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
-
-    // Process schedule.
-    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
-      if (OrderedSchedule(schedule))
-        __kmpc_barrier(loc, threadId);
-      schedule = kmp_sched_static_chunk;
-      chunk = tripCount; // one thread gets the whole loop
-    } else if (schedule == kmp_sched_runtime) {
-      // process runtime
-      omp_sched_t rtSched;
-      int ChunkInt;
-      omp_get_schedule(&rtSched, &ChunkInt);
-      chunk = ChunkInt;
-      switch (rtSched) {
-      case omp_sched_static: {
-        if (chunk > 0)
-          schedule = kmp_sched_static_chunk;
-        else
-          schedule = kmp_sched_static_nochunk;
-        break;
-      }
-      case omp_sched_auto: {
-        schedule = kmp_sched_static_chunk;
-        chunk = 1;
-        break;
-      }
-      case omp_sched_dynamic:
-      case omp_sched_guided: {
-        schedule = kmp_sched_dynamic;
-        break;
-      }
-      }
-    } else if (schedule == kmp_sched_auto) {
-      schedule = kmp_sched_static_chunk;
-      chunk = 1;
-    } else {
-      // ASSERT(LT_FUSSY,
-      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
-      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
-      //        (long long)chunk);
-    }
-
-    // init schedules
-    if (schedule == kmp_sched_static_chunk) {
-      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_static_balanced_chunk) {
-      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      // round up to make sure the chunk is enough to cover all iterations
-      T span = (tripCount + tnum - 1) / tnum;
-      // perform chunk adjustment
-      chunk = (span + chunk - 1) & ~(chunk - 1);
-
-      T oldUb = ub;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
-      if (ub > oldUb)
-        ub = oldUb;
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_static_nochunk) {
-      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      // save data
-      DST->ScheduleType = schedule;
-      if (chunk < 1)
-        chunk = 1;
-      DST->Chunk = chunk;
-      DST->LoopUpperBound = ub;
-      DST->NextLowerBound = lb;
-      __kmpc_barrier(loc, threadId);
-      if (tid == 0) {
-        Cnt = 0;
-        fence::team(atomic::seq_cst);
-      }
-      __kmpc_barrier(loc, threadId);
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for dispatch next
-
-  static uint64_t NextIter() {
-    __kmpc_impl_lanemask_t active = mapping::activemask();
-    uint32_t leader = utils::ffs(active) - 1;
-    uint32_t change = utils::popc(active);
-    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
-    unsigned int rank = utils::popc(active & lane_mask_lt);
-    uint64_t warp_res = 0;
-    if (rank == 0) {
-      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
-    }
-    warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
-    return warp_res + rank;
-  }
-
-  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
-                              T loopUpperBound) {
-    T N = NextIter();
-    lb = loopLowerBound + N * chunkSize;
-    ub = lb + chunkSize - 1; // Clang uses i <= ub
-
-    // 3 result cases:
-    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
-    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
-    //  NOT_FINISHED
-    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
-    // a.
-    if (lb <= loopUpperBound && ub < loopUpperBound) {
-      return NOT_FINISHED;
-    }
-    // b.
-    if (lb <= loopUpperBound) {
-      ub = loopUpperBound;
-      return LAST_CHUNK;
-    }
-    // c. if we are here, we are in case 'c'
-    lb = loopUpperBound + 2;
-    ub = loopUpperBound + 1;
-    return FINISHED;
-  }
-
-  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
-                           T *plower, T *pupper, ST *pstride,
-                           DynamicScheduleTracker *DST) {
-    // ID of a thread in its own warp
-
-    // automatically selects thread or warp ID based on selected implementation
-    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
-            "current thread is not needed here; error");
-    // retrieve schedule
-    kmp_sched_t schedule = DST->ScheduleType;
-
-    // xxx reduce to one
-    if (schedule == kmp_sched_static_chunk ||
-        schedule == kmp_sched_static_nochunk) {
-      T myLb = DST->NextLowerBound;
-      T ub = DST->LoopUpperBound;
-      // finished?
-      if (myLb > ub) {
-        return DISPATCH_FINISHED;
-      }
-      // not finished, save current bounds
-      ST chunk = DST->Chunk;
-      *plower = myLb;
-      T myUb = myLb + chunk - 1; // Clang uses i <= ub
-      if (myUb > ub)
-        myUb = ub;
-      *pupper = myUb;
-      *plast = (int32_t)(myUb == ub);
-
-      // increment next lower bound by the stride
-      ST stride = DST->Stride;
-      DST->NextLowerBound = myLb + stride;
-      return DISPATCH_NOTFINISHED;
-    }
-    ASSERT0(LT_FUSSY,
-            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
-            "bad sched");
-    T myLb, myUb;
-    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
-                                    DST->LoopUpperBound);
-
-    if (finished == FINISHED)
-      return DISPATCH_FINISHED;
-
-    // not finished (either not finished or last chunk)
-    *plast = (int32_t)(finished == LAST_CHUNK);
-    *plower = myLb;
-    *pupper = myUb;
-    *pstride = 1;
-
-    return DISPATCH_NOTFINISHED;
-  }
-
-  static void dispatch_fini() {
-    // nothing
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // end of template class that encapsulate all the helper functions
-  ////////////////////////////////////////////////////////////////////////////////
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (dyn loops)
-////////////////////////////////////////////////////////////////////////////////
-
-// TODO: Expand the dispatch API to take a DST pointer which can then be
-//       allocated properly without malloc.
-// For now, each team will contain an LDS pointer (ThreadDST) to a global array
-// of references to the DST structs allocated (in global memory) for each thread
-// in the team. The global memory array is allocated during the init phase if it
-// was not allocated already and will be deallocated when the dispatch phase
-// ends:
-//
-//  __kmpc_dispatch_init
-//
-//  ** Dispatch loop **
-//
-//  __kmpc_dispatch_deinit
-//
-[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
-    ThreadDST;
-
-// Create a new DST, link the current one, and define the new as current.
-static DynamicScheduleTracker *pushDST() {
-  int32_t ThreadIndex = mapping::getThreadIdInBlock();
-  // Each block will allocate an array of pointers to DST structs. The array is
-  // equal in length to the number of threads in that block.
-  if (!ThreadDST) {
-    // Allocate global memory array of pointers to DST structs:
-    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
-      ThreadDST = static_cast<DynamicScheduleTracker **>(
-          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
-                                  sizeof(DynamicScheduleTracker *),
-                              "new ThreadDST array"));
-    synchronize::threads(atomic::seq_cst);
-
-    // Initialize the array pointers:
-    ThreadDST[ThreadIndex] = nullptr;
-  }
-
-  // Create a DST struct for the current thread:
-  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
-      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
-  *NewDST = DynamicScheduleTracker({0});
-
-  // Add the new DST struct to the array of DST structs:
-  NewDST->NextDST = ThreadDST[ThreadIndex];
-  ThreadDST[ThreadIndex] = NewDST;
-  return NewDST;
-}
-
-// Return the current DST.
-static DynamicScheduleTracker *peekDST() {
-  return ThreadDST[mapping::getThreadIdInBlock()];
-}
-
-// Pop the current DST and restore the last one.
-static void popDST() {
-  int32_t ThreadIndex = mapping::getThreadIdInBlock();
-  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
-  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
-  memory::freeGlobal(CurrentDST, "remove DST");
-  ThreadDST[ThreadIndex] = OldDST;
-
-  // Check if we need to deallocate the global array. Ensure all threads
-  // in the block have finished deallocating the individual DSTs.
-  synchronize::threads(atomic::seq_cst);
-  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
-    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
-    ThreadDST = nullptr;
-  }
-  synchronize::threads(atomic::seq_cst);
-}
-
-void workshare::init(bool IsSPMD) {
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    ThreadDST = nullptr;
-}
-
-extern "C" {
-
-// init
-void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
-                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
-                             uint32_t lb, uint32_t ub, int32_t st,
-                             int32_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
-                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
-                             uint64_t lb, uint64_t ub, int64_t st,
-                             int64_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-// next
-int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
-                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
-                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
-                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
-                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-// fini
-void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
-}
-
-// deinit
-void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (static loops)
-////////////////////////////////////////////////////////////////////////////////
-
-void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
-                              int32_t schedtype, int32_t *plastiter,
-                              int32_t *plower, int32_t *pupper,
-                              int32_t *pstride, int32_t incr, int32_t chunk) {
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
-                               int32_t schedtype, int32_t *plastiter,
-                               uint32_t *plower, uint32_t *pupper,
-                               int32_t *pstride, int32_t incr, int32_t chunk) {
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
-                              int32_t schedtype, int32_t *plastiter,
-                              int64_t *plower, int64_t *pupper,
-                              int64_t *pstride, int64_t incr, int64_t chunk) {
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
-                               int32_t schedtype, int32_t *plastiter,
-                               uint64_t *plower, uint64_t *pupper,
-                               int64_t *pstride, int64_t incr, int64_t chunk) {
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
-                                     int32_t schedtype, int32_t *plastiter,
-                                     int32_t *plower, int32_t *pupper,
-                                     int32_t *pstride, int32_t incr,
-                                     int32_t chunk) {
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
-                                      int32_t schedtype, int32_t *plastiter,
-                                      uint32_t *plower, uint32_t *pupper,
-                                      int32_t *pstride, int32_t incr,
-                                      int32_t chunk) {
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
-                                     int32_t schedtype, int32_t *plastiter,
-                                     int64_t *plower, int64_t *pupper,
-                                     int64_t *pstride, int64_t incr,
-                                     int64_t chunk) {
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
-                                      int32_t schedtype, int32_t *plastiter,
-                                      uint64_t *plower, uint64_t *pupper,
-                                      int64_t *pstride, int64_t incr,
-                                      int64_t chunk) {
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
-
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
-}
-
-namespace ompx {
-
-/// Helper class to hide the generic loop nest and provide the template argument
-/// throughout.
-template <typename Ty> class StaticLoopChunker {
-
-  /// Generic loop nest that handles block and/or thread distribution in the
-  /// absence of user specified chunk sizes. This implicitly picks a block chunk
-  /// size equal to the number of threads in the block and a thread chunk size
-  /// equal to one. In contrast to the chunked version we can get away with a
-  /// single loop in this case
-  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
-                                        Ty NumBlocks, Ty BId, Ty NumThreads,
-                                        Ty TId, Ty NumIters,
-                                        uint8_t OneIterationPerThread) {
-    Ty KernelIteration = NumBlocks * NumThreads;
-
-    // Start index in the normalized space.
-    Ty IV = BId * NumThreads + TId;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Cover the entire iteration space, assumptions in the caller might allow
-    // to simplify this loop to a conditional.
-    if (IV < NumIters) {
-      do {
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        // Every thread executed one block and thread chunk now.
-        IV += KernelIteration;
-
-        if (OneIterationPerThread)
-          return;
-
-      } while (IV < NumIters);
-    }
-  }
-
-  /// Generic loop nest that handles block and/or thread distribution in the
-  /// presence of user specified chunk sizes (for at least one of them).
-  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
-                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
-                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
-                                        Ty NumIters,
-                                        uint8_t OneIterationPerThread) {
-    Ty KernelIteration = NumBlocks * BlockChunk;
-
-    // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Cover the entire iteration space, assumptions in the caller might allow
-    // to simplify this loop to a conditional.
-    do {
-
-      Ty BlockChunkLeft =
-          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
-          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-
-      while (ThreadChunkLeft--) {
-
-        // Given the blocking it's hard to keep track of what to execute.
-        if (IV >= NumIters)
-          return;
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        if (OneIterationPerThread)
-          return;
-
-        ++IV;
-      }
-
-      IV += KernelIteration;
-
-    } while (IV < NumIters);
-  }
-
-public:
-  /// Worksharing `for`-loop.
-  /// \param[in] Loc Description of source location
-  /// \param[in] LoopBody Function which corresponds to loop body
-  /// \param[in] Arg Pointer to struct which contains loop body args
-  /// \param[in] NumIters Number of loop iterations
-  /// \param[in] NumThreads Number of GPU threads
-  /// \param[in] ThreadChunk Size of thread chunk
-  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
-  /// only one loop iteration or one thread chunk. This avoids an outer loop
-  /// over all loop iterations/chunks.
-  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                  Ty NumIters, Ty NumThreads, Ty ThreadChunk,
-                  uint8_t OneIterationPerThread) {
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(ThreadChunk >= 0, "Bad thread count");
-
-    // All threads need to participate but we don't know if we are in a
-    // parallel at all or if the user might have used a `num_threads` clause
-    // on the parallel and reduced the number compared to the block size.
-    // Since nested parallels are possible too we need to get the thread id
-    // from the `omp` getter and not the mapping directly.
-    Ty TId = omp_get_thread_num();
-
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
-    Ty NumBlocks = 1;
-    Ty BId = 0;
-
-    // If the thread chunk is not specified we pick a default now.
-    if (ThreadChunk == 0)
-      ThreadChunk = 1;
-
-    // If we know we have more threads than iterations we can indicate that to
-    // avoid an outer loop.
-    if (config::getAssumeThreadsOversubscription()) {
-      OneIterationPerThread = true;
-    }
-
-    if (OneIterationPerThread)
-      ASSERT(NumThreads >= NumIters, "Broken assumption");
-
-    if (ThreadChunk != 1)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-  }
-
-  /// Worksharing `distribute`-loop.
-  /// \param[in] Loc Description of source location
-  /// \param[in] LoopBody Function which corresponds to loop body
-  /// \param[in] Arg Pointer to struct which contains loop body args
-  /// \param[in] NumIters Number of loop iterations
-  /// \param[in] BlockChunk Size of block chunk
-  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
-  /// only one loop iteration or one thread chunk. This avoids an outer loop
-  /// over all loop iterations/chunks.
-  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                         Ty NumIters, Ty BlockChunk,
-                         uint8_t OneIterationPerThread) {
-    ASSERT(icv::Level == 0, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(BlockChunk >= 0, "Bad block count");
-
-    // There are no threads involved here.
-    Ty ThreadChunk = 0;
-    Ty NumThreads = 1;
-    Ty TId = 0;
-
-    // All teams need to participate.
-    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
-    Ty BId = mapping::getBlockIdInKernel();
-
-    // If the block chunk is not specified we pick a default now.
-    if (BlockChunk == 0)
-      BlockChunk = NumThreads;
-
-    // If we know we have more blocks than iterations we can indicate that to
-    // avoid an outer loop.
-    if (config::getAssumeTeamsOversubscription()) {
-      OneIterationPerThread = true;
-    }
-
-    if (OneIterationPerThread)
-      ASSERT(NumBlocks >= NumIters, "Broken assumption");
-
-    if (BlockChunk != NumThreads)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-
-    ASSERT(icv::Level == 0, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-  }
-
-  /// Worksharing `distribute parallel for`-loop.
-  /// \param[in] Loc Description of source location
-  /// \param[in] LoopBody Function which corresponds to loop body
-  /// \param[in] Arg Pointer to struct which contains loop body args
-  /// \param[in] NumIters Number of loop iterations
-  /// \param[in] NumThreads Number of GPU threads
-  /// \param[in] BlockChunk Size of block chunk
-  /// \param[in] ThreadChunk Size of thread chunk
-  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
-  /// only one loop iteration or one thread chunk. This avoids an outer loop
-  /// over all loop iterations/chunks.
-  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
-                            void *Arg, Ty NumIters, Ty NumThreads,
-                            Ty BlockChunk, Ty ThreadChunk,
-                            uint8_t OneIterationPerThread) {
-    ASSERT(icv::Level == 1, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(BlockChunk >= 0, "Bad block count");
-    ASSERT(ThreadChunk >= 0, "Bad thread count");
-
-    // All threads need to participate but the user might have used a
-    // `num_threads` clause on the parallel and reduced the number compared to
-    // the block size.
-    Ty TId = mapping::getThreadIdInBlock();
-
-    // All teams need to participate.
-    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
-    Ty BId = mapping::getBlockIdInKernel();
-
-    // If the block chunk is not specified we pick a default now.
-    if (BlockChunk == 0)
-      BlockChunk = NumThreads;
-
-    // If the thread chunk is not specified we pick a default now.
-    if (ThreadChunk == 0)
-      ThreadChunk = 1;
-
-    // If we know we have more threads (across all blocks) than iterations we
-    // can indicate that to avoid an outer loop.
-    if (config::getAssumeTeamsOversubscription() &
-        config::getAssumeThreadsOversubscription()) {
-      OneIterationPerThread = true;
-    }
-
-    if (OneIterationPerThread)
-      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
-
-    if (BlockChunk != NumThreads || ThreadChunk != 1)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-
-    ASSERT(icv::Level == 1, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-  }
-};
-
-} // namespace ompx
-
-#define OMP_LOOP_ENTRY(BW, TY)                                                 \
-  [[gnu::flatten, clang::always_inline]] void                                  \
-  __kmpc_distribute_for_static_loop##BW(                                       \
-      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
-      TY num_threads, TY block_chunk, TY thread_chunk,                         \
-      uint8_t one_iteration_per_thread) {                                      \
-    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
-        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk,       \
-        one_iteration_per_thread);                                             \
-  }                                                                            \
-  [[gnu::flatten, clang::always_inline]] void                                  \
-  __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),      \
-                                    void *arg, TY num_iters, TY block_chunk,   \
-                                    uint8_t one_iteration_per_thread) {        \
-    ompx::StaticLoopChunker<TY>::Distribute(                                   \
-        loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread);       \
-  }                                                                            \
-  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
-      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
-      TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) {     \
-    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads,     \
-                                     thread_chunk, one_iteration_per_thread);  \
-  }
-
-extern "C" {
-OMP_LOOP_ENTRY(_4, int32_t)
-OMP_LOOP_ENTRY(_4u, uint32_t)
-OMP_LOOP_ENTRY(_8, int64_t)
-OMP_LOOP_ENTRY(_8u, uint64_t)
-}