[𝘀𝗽𝗿] changes introduced through rebaseusers/arichardson/spr/main.amdgpu-baseline-test-for-ptrtoaddr-code-generation

Created using spr 1.3.8-beta.1 [skip ci]
author: Guillaume Chatelet <gchatelet@google.com> 2025-10-14 09:02:30 -0700
committer: Alex Richardson <alexrichardson@google.com> 2025-10-14 09:02:30 -0700
commit: e2d7be24a8dc31bb36380abd088b7eb0da7ef6b4 (patch)
tree: 4811d025c12321c442695ad5aa4f511fa2fbd10b /offload
parent: 1be5a8430be58baa5754e6f046eeacf7ca2f1a54 (diff)
parent: 57726bdca274b152d2f36aaad7c961767bb1f91a (diff)
216 files changed, 10026 insertions, 7086 deletions
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 38fa77e41bb5..b27738078350 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -4,7 +4,8 @@
 cmake_minimum_required(VERSION 3.20.0)
 set(LLVM_SUBPROJECT_TITLE "liboffload")
 
-if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+# Permit redefining OPENMP_STANDALONE_BUILD when doing a runtimes build.
+if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
   set(OPENMP_STANDALONE_BUILD TRUE)
   project(offload C CXX ASM)
 else()
@@ -371,7 +372,6 @@ add_subdirectory(tools/offload-tblgen)
 
 # Build offloading plugins and device RTLs if they are available.
 add_subdirectory(plugins-nextgen)
-add_subdirectory(DeviceRTL)
 add_subdirectory(tools)
 add_subdirectory(docs)
 
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
deleted file mode 100644
index e4916f4d4975..000000000000
--- a/offload/DeviceRTL/CMakeLists.txt
+++ /dev/null
@@ -1,188 +0,0 @@
-set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL
-  "Can be set to false to disable building this library.")
-
-if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB)
-  message(STATUS "Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB")
-  return()
-endif()
-
-# Only 64-bit hosts are supported. The variable is named (not expanded) so if() stays well-formed when it is unset.
-if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-  message(STATUS "Not building DeviceRTL: Runtime does not support 32-bit hosts")
-  return()
-endif()
-
-if (LLVM_DIR)
-  # Builds that use pre-installed LLVM have LLVM_DIR set.
-  # A standalone or LLVM_ENABLE_RUNTIMES=openmp build takes this route
-  find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
-  # LLVM in-tree builds may use CMake target names to discover the tools.
-  # A LLVM_ENABLE_PROJECTS=openmp build takes this route
-  set(CLANG_TOOL $<TARGET_FILE:clang>)
-else()
-  message(STATUS "Not building DeviceRTL. No appropriate clang found")
-  return()
-endif()
-
-set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
-set(include_directory ${devicertl_base_directory}/include)
-set(source_directory ${devicertl_base_directory}/src)
-
-set(include_files
-  ${include_directory}/Allocator.h
-  ${include_directory}/Configuration.h
-  ${include_directory}/Debug.h
-  ${include_directory}/Interface.h
-  ${include_directory}/LibC.h
-  ${include_directory}/Mapping.h
-  ${include_directory}/Profiling.h
-  ${include_directory}/State.h
-  ${include_directory}/Synchronization.h
-  ${include_directory}/DeviceTypes.h
-  ${include_directory}/DeviceUtils.h
-  ${include_directory}/Workshare.h
-)
-
-set(src_files
-  ${source_directory}/Allocator.cpp
-  ${source_directory}/Configuration.cpp
-  ${source_directory}/Debug.cpp
-  ${source_directory}/Kernel.cpp
-  ${source_directory}/LibC.cpp
-  ${source_directory}/Mapping.cpp
-  ${source_directory}/Misc.cpp
-  ${source_directory}/Parallelism.cpp
-  ${source_directory}/Profiling.cpp
-  ${source_directory}/Reduction.cpp
-  ${source_directory}/State.cpp
-  ${source_directory}/Synchronization.cpp
-  ${source_directory}/Tasking.cpp
-  ${source_directory}/DeviceUtils.cpp
-  ${source_directory}/Workshare.cpp
-)
-
-# We disable the slp vectorizer during the runtime optimization to avoid
-# vectorized accesses to the shared state. Generally, those are "good" but
-# the optimizer pipeline (esp. Attributor) does not fully support vectorized
-# instructions yet and we end up missing out on way more important constant
-# propagation. That said, we will run the vectorizer again after the runtime
-# has been linked into the user program.
-set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
-
-# If the user built with the GPU C library enabled we will use that instead.
-if(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
-  list(APPEND clang_opt_flags -DOMPTARGET_HAS_LIBC)
-endif()
-
-# Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
-             ${clang_opt_flags} -nogpulib -nostdlibinc
-             -fno-rtti -fno-exceptions -fconvergent-functions
-             -Wno-unknown-cuda-version
-             -DOMPTARGET_DEVICE_RUNTIME
-             -I${include_directory}
-             -I${devicertl_base_directory}/../include
-             -I${devicertl_base_directory}/../../libc
-)
-
-# first create an object target
-function(compileDeviceRTLLibrary target_name target_triple)
-  set(target_bc_flags ${ARGN})
-
-  foreach(src ${src_files})
-    get_filename_component(infile ${src} ABSOLUTE)
-    get_filename_component(outfile ${src} NAME)
-    set(outfile "${outfile}-${target_name}.o")
-    set(depfile "${outfile}.d")
-
-    # Passing an empty CPU to -march= suppresses target specific metadata.
-    add_custom_command(OUTPUT ${outfile}
-      COMMAND ${CLANG_TOOL}
-      ${bc_flags}
-      --target=${target_triple}
-      ${target_bc_flags}
-      -MD -MF ${depfile}
-      ${infile} -o ${outfile}
-      DEPENDS ${infile}
-      DEPFILE ${depfile}
-      COMMENT "Building LLVM bitcode ${outfile}"
-      VERBATIM
-    )
-    if(TARGET clang)
-      # Add a file-level dependency to ensure that clang is up-to-date.
-      # By default, add_custom_command only builds clang if the
-      # executable is missing.
-      add_custom_command(OUTPUT ${outfile}
-        DEPENDS clang
-        APPEND
-      )
-    endif()
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-    list(APPEND obj_files ${CMAKE_CURRENT_BINARY_DIR}/${outfile})
-  endforeach()
-  # Trick to combine these into a bitcode file via the linker's LTO pass. This
-  # is used to provide the legacy `libomptarget-<name>.bc` files. Hack this
-  # through as an executable to get it to use the relocatable link.
-  add_executable(libomptarget-${target_name} ${obj_files})
-  set_target_properties(libomptarget-${target_name} PROPERTIES
-    RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}
-    LINKER_LANGUAGE CXX
-    BUILD_RPATH ""
-    INSTALL_RPATH ""
-    RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
-  target_compile_options(libomptarget-${target_name} PRIVATE
-    "--target=${target_triple}" "-fuse-ld=lld" "-march=" "-mcpu="
-    "-Wno-unused-command-line-argument")
-  target_link_options(libomptarget-${target_name} PRIVATE "--target=${target_triple}"
-                      "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm"
-                      "-fuse-ld=lld" "-march=" "-mcpu=")
-  install(TARGETS libomptarget-${target_name}
-          PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
-          DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}")
-
-  add_library(omptarget.${target_name}.all_objs OBJECT IMPORTED)
-  set_property(TARGET omptarget.${target_name}.all_objs APPEND PROPERTY IMPORTED_OBJECTS
-               ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/libomptarget-${target_name}.bc)
-  add_dependencies(omptarget.${target_name}.all_objs libomptarget-${target_name})
-
-  # Archive all the object files generated above into a static library
-  add_library(omptarget.${target_name} STATIC)
-  set_target_properties(omptarget.${target_name} PROPERTIES
-    ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/${target_triple}"
-    ARCHIVE_OUTPUT_NAME ompdevice
-    LINKER_LANGUAGE CXX
-  )
-  target_link_libraries(omptarget.${target_name} PRIVATE omptarget.${target_name}.all_objs)
-  target_link_options(omptarget.${target_name} PRIVATE "--target=${target_triple}"
-                      "-Wno-unused-command-line-argument" "-r" "-nostdlib" "-flto"
-                       "-Wl,--lto-emit-llvm" "-fuse-ld=lld" "-march=" "-mcpu=")
-
-  install(TARGETS omptarget.${target_name}
-          ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}")
-
-  if (CMAKE_EXPORT_COMPILE_COMMANDS)
-    set(ide_target_name omptarget-ide-${target_name})
-    add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
-    target_compile_options(${ide_target_name} PRIVATE
-      -fvisibility=hidden --target=${target_triple}
-      -nogpulib -nostdlibinc -Wno-unknown-cuda-version
-    )
-    target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-    target_include_directories(${ide_target_name} PRIVATE
-      ${include_directory}
-      ${devicertl_base_directory}/../../libc
-      ${devicertl_base_directory}/../include
-    )
-    install(TARGETS ${ide_target_name} EXCLUDE_FROM_ALL)
-  endif()
-endfunction()
-
-if(NOT LLVM_TARGETS_TO_BUILD OR "AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
-  compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-endif()
-
-if(NOT LLVM_TARGETS_TO_BUILD OR "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
-  compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-endif()
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
deleted file mode 100644
index dc4d029ed75f..000000000000
--- a/offload/DeviceRTL/include/Allocator.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_ALLOCATOR_H
-#define OMPTARGET_ALLOCATOR_H
-
-#include "DeviceTypes.h"
-
-// Forward declaration.
-struct KernelEnvironmentTy;
-
-namespace ompx {
-
-namespace allocator {
-
-static uint64_t constexpr ALIGNMENT = 16;
-
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
-/// Allocate \p Size bytes.
-[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
-alloc(uint64_t Size);
-
-/// Free the allocation pointed to by \p Ptr.
-void free(void *Ptr);
-
-} // namespace allocator
-
-} // namespace ompx
-
-extern "C" {
-void *malloc(size_t Size);
-void free(void *Ptr);
-}
-
-#endif
diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h
deleted file mode 100644
index 95408933dd86..000000000000
--- a/offload/DeviceRTL/include/Configuration.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// API to query the global (constant) device environment.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_CONFIGURATION_H
-#define OMPTARGET_CONFIGURATION_H
-
-#include "Shared/Environment.h"
-
-#include "DeviceTypes.h"
-
-namespace ompx {
-namespace config {
-
-/// Return the number of devices in the system, same number as returned on the
-/// host by omp_get_num_devices.
-uint32_t getNumDevices();
-
-/// Return the device number in the system for omp_get_device_num.
-uint32_t getDeviceNum();
-
-/// Return the user chosen debug level.
-uint32_t getDebugKind();
-
-/// Return if teams oversubscription is assumed
-uint32_t getAssumeTeamsOversubscription();
-
-/// Return if threads oversubscription is assumed
-uint32_t getAssumeThreadsOversubscription();
-
-/// Return the amount of dynamic shared memory that was allocated at launch.
-uint64_t getDynamicMemorySize();
-
-/// Returns the cycles per second of the device's fixed frequency clock.
-uint64_t getClockFrequency();
-
-/// Returns the pointer to the beginning of the indirect call table.
-void *getIndirectCallTablePtr();
-
-/// Returns the size of the indirect call table.
-uint64_t getIndirectCallTableSize();
-
-/// Returns the hardware parallelism of the device.
-uint64_t getHardwareParallelism();
-
-/// Return if debugging is enabled for the given debug kind.
-bool isDebugMode(DeviceDebugKind Level);
-
-/// Indicates if this kernel may require thread-specific states, or if it was
-/// explicitly disabled by the user.
-bool mayUseThreadStates();
-
-/// Indicates if this kernel may require data environments for nested
-/// parallelism, or if it was explicitly disabled by the user.
-bool mayUseNestedParallelism();
-
-} // namespace config
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h
deleted file mode 100644
index 98d0fa498d95..000000000000
--- a/offload/DeviceRTL/include/Debug.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_DEBUG_H
-#define OMPTARGET_DEVICERTL_DEBUG_H
-
-#include "Configuration.h"
-#include "LibC.h"
-
-/// Assertion
-///
-/// {
-extern "C" {
-void __assert_assume(bool condition);
-void __assert_fail(const char *expr, const char *file, unsigned line,
-                   const char *function);
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
-                            unsigned line, const char *function);
-}
-
-#define ASSERT(expr, msg)                                                      \
-  {                                                                            \
-    if (config::isDebugMode(DeviceDebugKind::Assertion) && !(expr))            \
-      __assert_fail_internal(#expr, msg, __FILE__, __LINE__,                   \
-                             __PRETTY_FUNCTION__);                             \
-    else                                                                       \
-      __assert_assume(expr);                                                   \
-  }
-#define UNREACHABLE(msg)                                                       \
-  {                                                                            \
-    printf(msg); __builtin_trap(); __builtin_unreachable();                    \
-  }
-
-///}
-
-#endif
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
deleted file mode 100644
index 2e5d92380f04..000000000000
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ /dev/null
@@ -1,166 +0,0 @@
-//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_TYPES_H
-#define OMPTARGET_TYPES_H
-
-#include <gpuintrin.h>
-#include <stddef.h>
-#include <stdint.h>
-
-template <typename T> using Private = __gpu_private T;
-template <typename T> using Constant = __gpu_constant T;
-template <typename T> using Local = __gpu_local T;
-template <typename T> using Global = __gpu_global T;
-
-enum omp_proc_bind_t {
-  omp_proc_bind_false = 0,
-  omp_proc_bind_true = 1,
-  omp_proc_bind_master = 2,
-  omp_proc_bind_close = 3,
-  omp_proc_bind_spread = 4
-};
-
-enum omp_sched_t {
-  omp_sched_static = 1,  /* chunkSize >0 */
-  omp_sched_dynamic = 2, /* chunkSize >0 */
-  omp_sched_guided = 3,  /* chunkSize >0 */
-  omp_sched_auto = 4,    /* no chunkSize */
-};
-
-enum kmp_sched_t {
-  kmp_sched_static_chunk = 33,
-  kmp_sched_static_nochunk = 34,
-  kmp_sched_dynamic = 35,
-  kmp_sched_guided = 36,
-  kmp_sched_runtime = 37,
-  kmp_sched_auto = 38,
-
-  kmp_sched_static_balanced_chunk = 45,
-
-  kmp_sched_static_ordered = 65,
-  kmp_sched_static_nochunk_ordered = 66,
-  kmp_sched_dynamic_ordered = 67,
-  kmp_sched_guided_ordered = 68,
-  kmp_sched_runtime_ordered = 69,
-  kmp_sched_auto_ordered = 70,
-
-  kmp_sched_distr_static_chunk = 91,
-  kmp_sched_distr_static_nochunk = 92,
-  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
-
-  kmp_sched_default = kmp_sched_static_nochunk,
-  kmp_sched_unordered_first = kmp_sched_static_chunk,
-  kmp_sched_unordered_last = kmp_sched_auto,
-  kmp_sched_ordered_first = kmp_sched_static_ordered,
-  kmp_sched_ordered_last = kmp_sched_auto_ordered,
-  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
-  kmp_sched_distribute_last =
-      kmp_sched_distr_static_chunk_sched_static_chunkone,
-
-  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
-   * Since we need to distinguish the three possible cases (no modifier,
-   * monotonic modifier, nonmonotonic modifier), we need separate bits for
-   * each modifier. The absence of monotonic does not imply nonmonotonic,
-   * especially since 4.5 says that the behaviour of the "no modifier" case
-   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
-   *
-   * Since we're passing a full 32 bit value, we can use a couple of high
-   * bits for these flags; out of paranoia we avoid the sign bit.
-   *
-   * These modifiers can be or-ed into non-static schedules by the compiler
-   * to pass the additional information. They will be stripped early in the
-   * processing in __kmp_dispatch_init when setting up schedules, so
-   * most of the code won't ever see schedules with these bits set.
-   */
-  kmp_sched_modifier_monotonic = (1 << 29),
-  /**< Set if the monotonic schedule modifier was present */
-  kmp_sched_modifier_nonmonotonic = (1 << 30),
-/**< Set if the nonmonotonic schedule modifier was present */
-
-#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
-  (enum kmp_sched_t)(                                                          \
-      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
-#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0)
-#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
-  (((s) & kmp_sched_modifier_nonmonotonic) != 0)
-#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
-  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
-   0)
-
-};
-
-struct TaskDescriptorTy;
-using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr);
-struct TaskDescriptorTy {
-  void *Payload;
-  TaskFnTy TaskFn;
-};
-
-using LaneMaskTy = uint64_t;
-
-namespace lanes {
-enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
-} // namespace lanes
-
-/// The ident structure that describes a source location. The struct is
-/// identical to the one in the kmp.h file. We maintain the same data structure
-/// for compatibility.
-struct IdentTy {
-  int32_t reserved_1;  /**<  might be used in Fortran; see above  */
-  int32_t flags;       /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
-                            identifies this union member  */
-  int32_t reserved_2;  /**<  not really used in Fortran any more; see above */
-  int32_t reserved_3;  /**<  source[4] in Fortran, do not use for C++  */
-  char const *psource; /**<  String describing the source location.
-                       The string is composed of semi-colon separated fields
-                       which describe the source file, the function and a pair
-                       of line numbers that delimit the construct. */
-};
-
-using __kmpc_impl_lanemask_t = LaneMaskTy;
-
-using ParallelRegionFnTy = void *;
-
-using CriticalNameTy = int32_t[8];
-
-struct omp_lock_t {
-  void *Lock;
-};
-
-using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num);
-using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id,
-                                   int16_t lane_offset, int16_t shortCircuit);
-using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data);
-
-/// Macros for allocating variables in different address spaces.
-///{
-
-// Follows the pattern in interface.h
-typedef enum omp_allocator_handle_t {
-  omp_null_allocator = 0,
-  omp_default_mem_alloc = 1,
-  omp_large_cap_mem_alloc = 2,
-  omp_const_mem_alloc = 3,
-  omp_high_bw_mem_alloc = 4,
-  omp_low_lat_mem_alloc = 5,
-  omp_cgroup_mem_alloc = 6,
-  omp_pteam_mem_alloc = 7,
-  omp_thread_mem_alloc = 8,
-  KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
-} omp_allocator_handle_t;
-
-#define __PRAGMA(STR) _Pragma(#STR)
-#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-
-///}
-
-#endif
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
deleted file mode 100644
index b92514ee9838..000000000000
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H
-#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H
-
-#include "DeviceTypes.h"
-#include "Shared/Utils.h"
-
-namespace utils {
-
-template <typename T> struct type_identity {
-  using type = T;
-};
-
-template <typename T, T v> struct integral_constant {
-  inline static constexpr T value = v;
-};
-
-/// Freestanding SFINAE helpers.
-template <class T> struct remove_cv : type_identity<T> {};
-template <class T> struct remove_cv<const T> : type_identity<T> {};
-template <class T> struct remove_cv<volatile T> : type_identity<T> {};
-template <class T> struct remove_cv<const volatile T> : type_identity<T> {};
-template <class T> using remove_cv_t = typename remove_cv<T>::type;
-
-using true_type = integral_constant<bool, true>;
-using false_type = integral_constant<bool, false>;
-
-template <typename T, typename U> struct is_same : false_type {};
-template <typename T> struct is_same<T, T> : true_type {};
-template <typename T, typename U>
-inline constexpr bool is_same_v = is_same<T, U>::value;
-
-template <typename T> struct is_floating_point {
-  inline static constexpr bool value =
-      is_same_v<remove_cv_t<T>, float> || is_same_v<remove_cv_t<T>, double>;
-};
-template <typename T>
-inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
-
-template <bool B, typename T = void> struct enable_if;
-template <typename T> struct enable_if<true, T> : type_identity<T> {};
-template <bool B, typename T = void>
-using enable_if_t = typename enable_if<B, T>::type;
-
-template <class T> struct remove_addrspace : type_identity<T> {};
-template <class T, int N>
-struct remove_addrspace<T [[clang::address_space(N)]]> : type_identity<T> {};
-template <class T>
-using remove_addrspace_t = typename remove_addrspace<T>::type;
-
-template <typename To, typename From> inline To bitCast(From V) {
-  static_assert(sizeof(To) == sizeof(From), "Bad conversion");
-  return __builtin_bit_cast(To, V);
-}
-
-/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
-/// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
-
-int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
-uint64_t pack(uint32_t LowBits, uint32_t HighBits);
-
-/// Unpack \p Val into \p LowBits and \p HighBits.
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
-
-/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
-bool isSharedMemPtr(void *Ptr);
-
-/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
-bool isThreadLocalMemPtr(void *Ptr);
-
-/// A pointer variable that has by design an `undef` value. Use with care.
-[[clang::loader_uninitialized]] static void *const UndefPtr;
-
-#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
-#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
-
-} // namespace utils
-
-#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
deleted file mode 100644
index c4bfaaa2404b..000000000000
--- a/offload/DeviceRTL/include/Interface.h
+++ /dev/null
@@ -1,366 +0,0 @@
-//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_INTERFACE_H
-#define OMPTARGET_DEVICERTL_INTERFACE_H
-
-#include "Shared/Environment.h"
-
-#include "DeviceTypes.h"
-
-/// External API
-///
-///{
-
-extern "C" {
-
-/// ICV: dyn-var, constant 0
-///
-/// setter: ignored.
-/// getter: returns 0.
-///
-///{
-void omp_set_dynamic(int);
-int omp_get_dynamic(void);
-///}
-
-/// ICV: nthreads-var, integer
-///
-/// scope: data environment
-///
-/// setter: ignored.
-/// getter: returns false.
-///
-/// implementation notes:
-///
-///
-///{
-void omp_set_num_threads(int);
-int omp_get_max_threads(void);
-///}
-
-/// ICV: thread-limit-var, computed
-///
-/// getter: returns the thread limit defined during launch.
-///
-///{
-int omp_get_thread_limit(void);
-///}
-
-/// ICV: max-active-level-var, constant 1
-///
-/// setter: ignored.
-/// getter: returns 1.
-///
-///{
-void omp_set_max_active_levels(int);
-int omp_get_max_active_levels(void);
-///}
-
-/// ICV: places-partition-var
-///
-///
-///{
-///}
-
-/// ICV: active-level-var, 0 or 1
-///
-/// getter: returns 0 or 1.
-///
-///{
-int omp_get_active_level(void);
-///}
-
-/// ICV: level-var
-///
-/// getter: returns parallel region nesting
-///
-///{
-int omp_get_level(void);
-///}
-
-/// ICV: run-sched-var
-///
-///
-///{
-void omp_set_schedule(omp_sched_t, int);
-void omp_get_schedule(omp_sched_t *, int *);
-///}
-
-/// TODO this is incomplete.
-int omp_get_num_threads(void);
-int omp_get_thread_num(void);
-void omp_set_nested(int);
-
-int omp_get_nested(void);
-
-void omp_set_max_active_levels(int Level);
-
-int omp_get_max_active_levels(void);
-
-omp_proc_bind_t omp_get_proc_bind(void);
-
-int omp_get_num_places(void);
-
-int omp_get_place_num_procs(int place_num);
-
-void omp_get_place_proc_ids(int place_num, int *ids);
-
-int omp_get_place_num(void);
-
-int omp_get_partition_num_places(void);
-
-void omp_get_partition_place_nums(int *place_nums);
-
-int omp_get_cancellation(void);
-
-void omp_set_default_device(int deviceId);
-
-int omp_get_default_device(void);
-
-int omp_get_num_devices(void);
-
-int omp_get_device_num(void);
-
-int omp_get_num_teams(void);
-
-int omp_get_team_num();
-
-int omp_get_initial_device(void);
-
-void *llvm_omp_target_dynamic_shared_alloc();
-
-/// Synchronization
-///
-///{
-void omp_init_lock(omp_lock_t *Lock);
-
-void omp_destroy_lock(omp_lock_t *Lock);
-
-void omp_set_lock(omp_lock_t *Lock);
-
-void omp_unset_lock(omp_lock_t *Lock);
-
-int omp_test_lock(omp_lock_t *Lock);
-///}
-
-/// Tasking
-///
-///{
-int omp_in_final(void);
-
-int omp_get_max_task_priority(void);
-///}
-
-/// Misc
-///
-///{
-double omp_get_wtick(void);
-
-double omp_get_wtime(void);
-///}
-}
-
-extern "C" {
-/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
-/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
-/// called by any thread, allocation happens *per thread*.
-void *__kmpc_alloc_shared(uint64_t Bytes);
-
-/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
-/// a stack (push/pop). Can be called by any thread. \p Ptr has to have been
-/// allocated by __kmpc_alloc_shared by the same thread.
-void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
-
-/// Get a pointer to the memory buffer containing dynamically allocated shared
-/// memory configured at launch.
-void *__kmpc_get_dynamic_shared();
-
-/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
-/// allocation address in \p GlobalArgs.
-///
-/// Called by the main thread prior to a parallel region.
-///
-/// We also remember it in GlobalArgsPtr to ensure the worker threads and
-/// deallocation function know the allocation address too.
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs);
-
-/// Deallocate the memory allocated by __kmpc_begin_sharing_variables.
-///
-/// Called by the main thread after a parallel region.
-void __kmpc_end_sharing_variables();
-
-/// Store the allocation address obtained via __kmpc_begin_sharing_variables in
-/// \p GlobalArgs.
-///
-/// Called by the worker threads in the parallel region (function).
-void __kmpc_get_shared_variables(void ***GlobalArgs);
-
-/// External interface to get the thread ID.
-uint32_t __kmpc_get_hardware_thread_id_in_block();
-
-/// External interface to get the number of threads.
-uint32_t __kmpc_get_hardware_num_threads_in_block();
-
-/// External interface to get the warp size.
-uint32_t __kmpc_get_warp_size();
-
-/// Kernel
-///
-///{
-// Forward declaration
-struct KernelEnvironmentTy;
-
-int8_t __kmpc_is_spmd_exec_mode();
-
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
-                           KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-
-void __kmpc_target_deinit();
-
-///}
-
-/// Reduction
-///
-///{
-void *__kmpc_reduction_get_fixed_buffer();
-
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
-                                               uint64_t reduce_data_size,
-                                               void *reduce_data,
-                                               ShuffleReductFnTy shflFct,
-                                               InterWarpCopyFnTy cpyFct);
-
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct);
-///}
-
-/// Synchronization
-///
-///{
-void __kmpc_ordered(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
-
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter);
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId);
-
-void __kmpc_flush(IdentTy *Loc);
-
-uint64_t __kmpc_warp_active_thread_mask(void);
-
-void __kmpc_syncwarp(uint64_t Mask);
-
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-///}
-
-/// Parallelism
-///
-///{
-/// TODO
-void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn);
-
-/// TODO
-bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn);
-
-/// TODO
-void __kmpc_kernel_end_parallel();
-
-/// TODO
-void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
-
-/// TODO
-void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams,
-                           int32_t ThreadLimit);
-
-/// TODO
-uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
-
-///}
-
-/// Tasking
-///
-///{
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
-                                        size_t TaskSizeInclPrivateValues,
-                                        size_t SharedValuesSize,
-                                        TaskFnTy TaskFn);
-
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
-                        TaskDescriptorTy *TaskDescriptor);
-
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor, int32_t,
-                                  void *, int32_t, void *);
-
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
-                               TaskDescriptorTy *TaskDescriptor);
-
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor);
-
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *);
-
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId);
-
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId);
-
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int);
-
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId);
-
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
-                     TaskDescriptorTy *TaskDescriptor, int,
-                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
-                     int32_t, uint64_t, void *);
-///}
-
-/// Misc
-///
-///{
-int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal);
-
-int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
-///}
-
-/// Shuffle
-///
-///{
-int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
-int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
-
-///}
-}
-
-#endif
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
deleted file mode 100644
index 94b5e6519606..000000000000
--- a/offload/DeviceRTL/include/LibC.h
+++ /dev/null
@@ -1,23 +0,0 @@
-//===--------- LibC.h - Simple implementation of libc functions --- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_LIBC_H
-#define OMPTARGET_LIBC_H
-
-#include "DeviceTypes.h"
-
-namespace ompx {
-
-int printf(const char *Format, ...);
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
deleted file mode 100644
index 8ba018b5314a..000000000000
--- a/offload/DeviceRTL/include/Mapping.h
+++ /dev/null
@@ -1,108 +0,0 @@
-//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_MAPPING_H
-#define OMPTARGET_MAPPING_H
-
-#include "DeviceTypes.h"
-
-namespace ompx {
-
-namespace mapping {
-
-enum {
-  DIM_X = __GPU_X_DIM,
-  DIM_Y = __GPU_Y_DIM,
-  DIM_Z = __GPU_Z_DIM,
-};
-
-inline constexpr uint32_t MaxThreadsPerTeam = 1024;
-
-/// Initialize the mapping machinery.
-void init(bool IsSPMD);
-
-/// Return true if the kernel is executed in SPMD mode.
-bool isSPMDMode();
-
-/// Return true if the kernel is executed in generic mode.
-bool isGenericMode();
-
-/// Return true if the executing thread is the main thread in generic mode.
-/// These functions look up state, so the lookup must be valid for the calling
-/// thread and location. See also `isInitialThreadInLevel0` for a stateless
-/// alternative for certain situations, e.g. during initialization.
-bool isMainThreadInGenericMode();
-bool isMainThreadInGenericMode(bool IsSPMD);
-
-/// Return true if this thread is the initial thread in parallel level 0.
-///
-/// The thread for which this returns true should be used for single threaded
-/// initialization tasks. We pick a special thread to ensure there are no
-/// races between the initialization and the first read of initialized state.
-bool isInitialThreadInLevel0(bool IsSPMD);
-
-/// Return true if the executing thread has the lowest Id of the active threads
-/// in the warp.
-bool isLeaderInWarp();
-
-/// Return a mask describing all active threads in the warp.
-LaneMaskTy activemask();
-
-/// Return a mask describing all threads with a smaller Id in the warp.
-LaneMaskTy lanemaskLT();
-
-/// Return a mask describing all threads with a larger Id in the warp.
-LaneMaskTy lanemaskGT();
-
-/// Return the thread Id in the warp, in [0, getWarpSize()).
-uint32_t getThreadIdInWarp();
-
-/// Return the warp size, thus number of threads in the warp.
-uint32_t getWarpSize();
-
-/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()).
-uint32_t getWarpIdInBlock();
-
-/// Return the number of warps in the block.
-uint32_t getNumberOfWarpsInBlock();
-
-/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)).
-uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
-
-/// Return the block size, thus number of threads in the block.
-uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
-
-/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)).
-uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
-
-/// Return the number of blocks in the kernel.
-uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
-
-/// Return the kernel size, thus number of threads in the kernel.
-uint32_t getNumberOfThreadsInKernel();
-
-/// Return the maximal number of threads in the block usable for a team (=
-/// parallel region).
-///
-/// Note: The version taking \p IsSPMD mode explicitly can be used during the
-/// initialization of the target region, that is before `mapping::isSPMDMode()`
-/// can be called by any thread other than the main one.
-uint32_t getMaxTeamThreads();
-uint32_t getMaxTeamThreads(bool IsSPMD);
-
-/// Return the number of processing elements on the device.
-uint32_t getNumberOfProcessorElements();
-
-} // namespace mapping
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h
deleted file mode 100644
index d99475225412..000000000000
--- a/offload/DeviceRTL/include/Profiling.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_PROFILING_H
-#define OMPTARGET_DEVICERTL_PROFILING_H
-
-extern "C" {
-void __llvm_profile_register_function(void *Ptr);
-void __llvm_profile_register_names_function(void *Ptr, long int I);
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
-}
-
-#endif
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
deleted file mode 100644
index db396dae6e44..000000000000
--- a/offload/DeviceRTL/include/State.h
+++ /dev/null
@@ -1,377 +0,0 @@
-//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_STATE_H
-#define OMPTARGET_STATE_H
-
-#include "Shared/Environment.h"
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Mapping.h"
-
-// Forward declaration.
-struct KernelEnvironmentTy;
-
-namespace ompx {
-
-namespace memory {
-
-/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
-void *allocShared(uint64_t Size, const char *Reason);
-
-/// Free \p Ptr, allocated via allocShared, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_free_shared for proper usage.
-void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
-
-/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
-void *allocGlobal(uint64_t Size, const char *Reason);
-
-/// Return a pointer to the dynamic shared memory buffer.
-void *getDynamicBuffer();
-
-/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
-void freeGlobal(void *Ptr, const char *Reason);
-
-} // namespace memory
-
-namespace state {
-
-inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
-
-struct ICVStateTy {
-  uint32_t NThreadsVar;
-  uint32_t LevelVar;
-  uint32_t ActiveLevelVar;
-  uint32_t Padding0Val;
-  uint32_t MaxActiveLevelsVar;
-  uint32_t RunSchedVar;
-  uint32_t RunSchedChunkVar;
-
-  bool operator==(const ICVStateTy &Other) const;
-
-  void assertEqual(const ICVStateTy &Other) const;
-};
-
-struct TeamStateTy {
-  void init(bool IsSPMD);
-
-  bool operator==(const TeamStateTy &) const;
-
-  void assertEqual(TeamStateTy &Other) const;
-
-  /// ICVs
-  ///
-  /// Preallocated storage for ICV values that are used if the threads have not
-  /// set a custom default. The latter is supported but unlikely and slow(er).
-  ///
-  ///{
-  ICVStateTy ICVState;
-  ///}
-
-  uint32_t ParallelTeamSize;
-  uint32_t HasThreadState;
-  ParallelRegionFnTy ParallelRegionFnVar;
-};
-
-extern Local<TeamStateTy> TeamState;
-
-struct ThreadStateTy {
-
-  /// ICVs have preallocated storage in the TeamStateTy which is used if a
-  /// thread has not set a custom value. The latter is supported but unlikely.
-  /// When it happens we will allocate dynamic memory to hold the values of all
-  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
-  /// ICV struct to hold them all. This is slower than alternatives but allows
-  /// users to pay only for what they use.
-  ///
-  state::ICVStateTy ICVState;
-
-  ThreadStateTy *PreviousThreadState;
-
-  void init() {
-    ICVState = TeamState.ICVState;
-    PreviousThreadState = nullptr;
-  }
-
-  void init(ThreadStateTy *PreviousTS) {
-    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
-    PreviousThreadState = PreviousTS;
-  }
-};
-
-extern Local<ThreadStateTy **> ThreadStates;
-
-/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-          KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-
-/// Return the kernel and kernel launch environment associated with the current
-/// kernel. The former is static and contains compile time information that
-/// holds for all instances of the kernel. The latter is dynamic and provides
-/// per-launch information.
-KernelEnvironmentTy &getKernelEnvironment();
-KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();
-
-/// Selects which ICV or team-state field `lookup32` / `lookupPtr` returns.
-enum ValueKind {
-  VK_NThreads,
-  VK_Level,
-  VK_ActiveLevel,
-  VK_MaxActiveLevels,
-  VK_RunSched,
-  // ---
-  VK_RunSchedChunk,
-  VK_ParallelRegionFn,
-  VK_ParallelTeamSize,
-  VK_HasThreadState,
-};
-
-/// TODO
-void enterDataEnvironment(IdentTy *Ident);
-
-/// TODO
-void exitDataEnvironment();
-
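-/// Scope guard: calls `enterDataEnvironment(Ident)` on construction and
-/// `exitDataEnvironment()` on destruction.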
-struct DateEnvironmentRAII {
-  DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
-  ~DateEnvironmentRAII() { exitDataEnvironment(); }
-};
-
-/// TODO
-void resetStateForThread(uint32_t TId);
-
-// FIXME: https://github.com/llvm/llvm-project/issues/123241.
-#define lookupForModify32Impl(Member, Ident, ForceTeamState)                   \
-  {                                                                            \
-    if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||          \
-                   !TeamState.HasThreadState))                                 \
-      return TeamState.ICVState.Member;                                        \
-    uint32_t TId = mapping::getThreadIdInBlock();                              \
-    if (OMP_UNLIKELY(!ThreadStates[TId])) {                                    \
-      ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(                   \
-          memory::allocGlobal(sizeof(ThreadStateTy),                           \
-                              "ICV modification outside data environment"));   \
-      ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");     \
-      TeamState.HasThreadState = true;                                         \
-      ThreadStates[TId]->init();                                               \
-    }                                                                          \
-    return ThreadStates[TId]->ICVState.Member;                                 \
-  }
-
-// FIXME: https://github.com/llvm/llvm-project/issues/123241.
-#define lookupImpl(Member, ForceTeamState)                                     \
-  {                                                                            \
-    auto TId = mapping::getThreadIdInBlock();                                  \
-    if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&        \
-                     TeamState.HasThreadState && ThreadStates[TId]))           \
-      return ThreadStates[TId]->ICVState.Member;                               \
-    return TeamState.ICVState.Member;                                          \
-  }
-
-[[gnu::always_inline, gnu::flatten]] inline uint32_t &
-lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
-  switch (Kind) {
-  case state::VK_NThreads:
-    if (IsReadonly)
-      lookupImpl(NThreadsVar, ForceTeamState);
-    lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState);
-  case state::VK_Level:
-    if (IsReadonly)
-      lookupImpl(LevelVar, ForceTeamState);
-    lookupForModify32Impl(LevelVar, Ident, ForceTeamState);
-  case state::VK_ActiveLevel:
-    if (IsReadonly)
-      lookupImpl(ActiveLevelVar, ForceTeamState);
-    lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState);
-  case state::VK_MaxActiveLevels:
-    if (IsReadonly)
-      lookupImpl(MaxActiveLevelsVar, ForceTeamState);
-    lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState);
-  case state::VK_RunSched:
-    if (IsReadonly)
-      lookupImpl(RunSchedVar, ForceTeamState);
-    lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState);
-  case state::VK_RunSchedChunk:
-    if (IsReadonly)
-      lookupImpl(RunSchedChunkVar, ForceTeamState);
-    lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState);
-  case state::VK_ParallelTeamSize:
-    return TeamState.ParallelTeamSize;
-  case state::VK_HasThreadState:
-    return TeamState.HasThreadState;
-  default:
-    break;
-  }
-  __builtin_unreachable();
-}
-
-[[gnu::always_inline, gnu::flatten]] inline void *&
-lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
-  switch (Kind) {
-  case state::VK_ParallelRegionFn:
-    return TeamState.ParallelRegionFnVar;
-  default:
-    break;
-  }
-  __builtin_unreachable();
-}
-
-/// A class without actual state used to provide a nice interface to lookup and
-/// update ICV values we can declare in global scope.
-template <typename Ty, ValueKind Kind> struct Value {
-  [[gnu::flatten, gnu::always_inline]] operator Ty() {
-    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
-                  /*ForceTeamState=*/false);
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
-    set(Other, /*IdentTy=*/nullptr);
-    return *this;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Value &operator++() {
-    inc(1, /*IdentTy=*/nullptr);
-    return *this;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Value &operator--() {
-    inc(-1, /*IdentTy=*/nullptr);
-    return *this;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] void
-  assert_eq(const Ty &V, IdentTy *Ident = nullptr,
-            bool ForceTeamState = false) {
-    ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
-  }
-
-private:
-  [[gnu::flatten, gnu::always_inline]] Ty &
-  lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
-    Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
-    return t;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
-    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
-            UpdateVal);
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
-    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
-                UpdateVal);
-  }
-
-  template <typename VTy, typename Ty2> friend struct ValueRAII;
-};
-
-/// A lookup class without actual state used to provide
-/// a nice interface to lookup and update ICV values
-/// we can declare in global scope.
-template <typename Ty, ValueKind Kind> struct PtrValue {
-  [[gnu::flatten, gnu::always_inline]] operator Ty() {
-    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
-                  /*ForceTeamState=*/false);
-  }
-
-  [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
-    set(Other);
-    return *this;
-  }
-
-private:
-  Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
-    return lookupPtr(Kind, IsReadonly, ForceTeamState);
-  }
-
-  Ty &set(Ty UpdateVal) {
-    return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
-                   /*ForceTeamState=*/false) = UpdateVal);
-  }
-
-  template <typename VTy, typename Ty2> friend struct ValueRAII;
-};
-
-template <typename VTy, typename Ty> struct ValueRAII {
-  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
-            bool ForceTeamState = false)
-      : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
-                   : (Ty *)utils::UndefPtr),
-        Val(OldValue), Active(Active) {
-    if (!Active)
-      return;
-    ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
-    *Ptr = NewValue;
-  }
-  ~ValueRAII() {
-    if (Active)
-      *Ptr = Val;
-  }
-
-private:
-  Ty *Ptr;
-  Ty Val;
-  bool Active;
-};
-
-/// TODO
-inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;
-
-/// TODO
-inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
-    ParallelRegionFn;
-
-void runAndCheckState(void(Func(void)));
-
-void assumeInitialState(bool IsSPMD);
-
-/// Return the value of the ParallelTeamSize ICV.
-int getEffectivePTeamSize();
-
-} // namespace state
-
-namespace icv {
-
-/// TODO
-inline state::Value<uint32_t, state::VK_NThreads> NThreads;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_Level> Level;
-
-/// The `active-level` describes which of the parallel levels counted with the
-/// `level-var` is active. There can only be one.
-///
-/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0.
-inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_RunSched> RunSched;
-
-} // namespace icv
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
deleted file mode 100644
index 7e7c8eacb917..000000000000
--- a/offload/DeviceRTL/include/Synchronization.h
+++ /dev/null
@@ -1,225 +0,0 @@
-//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-
-namespace ompx {
-namespace atomic {
-
-enum OrderingTy {
-  relaxed = __ATOMIC_RELAXED,
-  acquire = __ATOMIC_ACQUIRE,
-  release = __ATOMIC_RELEASE,
-  acq_rel = __ATOMIC_ACQ_REL,
-  seq_cst = __ATOMIC_SEQ_CST,
-};
-
-enum MemScopeTy {
-  system = __MEMORY_SCOPE_SYSTEM,
-  device = __MEMORY_SCOPE_DEVICE,
-  workgroup = __MEMORY_SCOPE_WRKGRP,
-  wavefront = __MEMORY_SCOPE_WVFRNT,
-  single = __MEMORY_SCOPE_SINGLE,
-};
-
-/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
-             MemScopeTy MemScope = MemScopeTy::device);
-
-/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
-/// result is stored in \p *Addr;
-/// {
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
-         atomic::OrderingTy OrderingFail,
-         MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
-                                          OrderingSucc, OrderingFail, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V add(Ty *Address, V Val, atomic::OrderingTy Ordering,
-      MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V load(Ty *Address, atomic::OrderingTy Ordering,
-       MemScopeTy MemScope = MemScopeTy::device) {
-#ifdef __NVPTX__
-  return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope);
-#else
-  return __scoped_atomic_load_n(Address, Ordering, MemScope);
-#endif
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-void store(Ty *Address, V Val, atomic::OrderingTy Ordering,
-           MemScopeTy MemScope = MemScopeTy::device) {
-  __scoped_atomic_store_n(Address, Val, Ordering, MemScope);
-}
-
-/// Atomically multiply \p *Address by \p Val using a compare-and-swap loop.
-///
-/// Returns the value that was stored at \p Address immediately before the
-/// successful update, mirroring the fetch-style operations (`add`, `max`,
-/// `bit_or`, ...) in this namespace. \p Ordering is used for the successful
-/// exchange; a failed exchange uses relaxed ordering and the loop retries.
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V mul(Ty *Address, V Val, atomic::OrderingTy Ordering,
-      MemScopeTy MemScope = MemScopeTy::device) {
-  Ty TypedCurrentVal, TypedNewVal;
-  bool Success;
-  do {
-    // Take the snapshot in the caller's memory scope rather than silently
-    // falling back to the default (device) scope.
-    TypedCurrentVal = atomic::load(Address, Ordering, MemScope);
-    TypedNewVal = TypedCurrentVal * Val;
-    Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
-                          atomic::relaxed, MemScope);
-  } while (!Success);
-  // After a successful exchange TypedCurrentVal is exactly the value that was
-  // replaced; returning it avoids handing back an uninitialized local.
-  return TypedCurrentVal;
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<!utils::is_floating_point_v<V>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering,
-    MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, float>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering,
-    MemScopeTy MemScope = MemScopeTy::device) {
-  if (Val >= 0)
-    return utils::bitCast<float>(max(
-        (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
-  return utils::bitCast<float>(min(
-      (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, double>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering,
-    MemScopeTy MemScope = MemScopeTy::device) {
-  if (Val >= 0)
-    return utils::bitCast<double>(max(
-        (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
-  return utils::bitCast<double>(min(
-      (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<!utils::is_floating_point_v<V>, V>
-min(Ty *Address, V Val, atomic::OrderingTy Ordering,
-    MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope);
-}
-
-// TODO: Implement this with __atomic_fetch_max and remove the duplication.
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, float>, V>
-min(Ty *Address, V Val, atomic::OrderingTy Ordering,
-    MemScopeTy MemScope = MemScopeTy::device) {
-  if (Val >= 0)
-    return utils::bitCast<float>(min(
-        (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
-  return utils::bitCast<float>(max(
-      (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
-}
-
-// TODO: Implement this with __atomic_fetch_max and remove the duplication.
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, double>, V>
-min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering,
-    MemScopeTy MemScope = MemScopeTy::device) {
-  if (Val >= 0)
-    return utils::bitCast<double>(min(
-        (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
-  return utils::bitCast<double>(max(
-      (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering,
-         MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering,
-          MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering,
-          MemScopeTy MemScope = MemScopeTy::device) {
-  return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope);
-}
-
-static inline uint32_t
-atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-               MemScopeTy MemScope = MemScopeTy::device) {
-  uint32_t R;
-  __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope);
-  return R;
-}
-
-///}
-
-} // namespace atomic
-
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block and perform a fence before and after the
-/// barrier according to \p Ordering. Note that the fence might be part of the
-/// barrier.
-void threads(atomic::OrderingTy Ordering);
-
-/// Synchronizing threads is allowed even if they all hit different instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-
-/// Synchronize all threads in a block, they are reaching the same instruction
-/// (hence all threads in the block are "aligned"). Also perform a fence before
-/// and after the barrier according to \p Ordering. Note that the
-/// fence might be part of the barrier if the target offers this.
-[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void
-threadsAligned(atomic::OrderingTy Ordering);
-
-///}
-
-} // namespace synchronize
-
-namespace fence {
-
-/// Memory fence with \p Ordering semantics for the team.
-void team(atomic::OrderingTy Ordering);
-
-/// Memory fence with \p Ordering semantics for the contention group.
-void kernel(atomic::OrderingTy Ordering);
-
-/// Memory fence with \p Ordering semantics for the system.
-void system(atomic::OrderingTy Ordering);
-
-} // namespace fence
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
deleted file mode 100644
index 554c3271c334..000000000000
--- a/offload/DeviceRTL/include/Workshare.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-------- Workshare.h - OpenMP Workshare interface ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_WORKSHARE_H
-#define OMPTARGET_WORKSHARE_H
-
-namespace ompx {
-
-namespace workshare {
-
-/// Initialize the worksharing machinery.
-void init(bool IsSPMD);
-
-} // namespace workshare
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/offload/DeviceRTL/include/generated_microtask_cases.gen
deleted file mode 100644
index a05f6da2f84f..000000000000
--- a/offload/DeviceRTL/include/generated_microtask_cases.gen
+++ /dev/null
@@ -1,797 +0,0 @@
-case 0:
-((void (*)(int32_t *, int32_t *))fn)(&global_tid, &bound_tid);
-break;
-case 1:
-((void (*)(int32_t *, int32_t *, void *))fn)(&global_tid, &bound_tid, args[0]);
-break;
-case 2:
-((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid,
-                                                     args[0], args[1]);
-break;
-case 3:
-((void (*)(int32_t *, int32_t *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2]);
-break;
-case 4:
-((void (*)(int32_t *, int32_t *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3]);
-break;
-case 5:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4]);
-break;
-case 6:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5]);
-break;
-case 7:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6]);
-break;
-case 8:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7]);
-break;
-case 9:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                       args[1], args[2], args[3], args[4],
-                                       args[5], args[6], args[7], args[8]);
-break;
-case 10:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                               args[1], args[2], args[3],
-                                               args[4], args[5], args[6],
-                                               args[7], args[8], args[9]);
-break;
-case 11:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10]);
-break;
-case 12:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11]);
-break;
-case 13:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12]);
-break;
-case 14:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13]);
-break;
-case 15:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14]);
-break;
-case 16:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15]);
-break;
-case 17:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                       args[1], args[2], args[3], args[4],
-                                       args[5], args[6], args[7], args[8],
-                                       args[9], args[10], args[11], args[12],
-                                       args[13], args[14], args[15], args[16]);
-break;
-case 18:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17]);
-break;
-case 19:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18]);
-break;
-case 20:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19]);
-break;
-case 21:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20]);
-break;
-case 22:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21]);
-break;
-case 23:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22]);
-break;
-case 24:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23]);
-break;
-case 25:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                       args[1], args[2], args[3], args[4],
-                                       args[5], args[6], args[7], args[8],
-                                       args[9], args[10], args[11], args[12],
-                                       args[13], args[14], args[15], args[16],
-                                       args[17], args[18], args[19], args[20],
-                                       args[21], args[22], args[23], args[24]);
-break;
-case 26:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25]);
-break;
-case 27:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26]);
-break;
-case 28:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27]);
-break;
-case 29:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28]);
-break;
-case 30:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29]);
-break;
-case 31:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30]);
-break;
-case 32:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23], args[24], args[25], args[26],
-                               args[27], args[28], args[29], args[30],
-                               args[31]);
-break;
-case 33:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32]);
-break;
-case 34:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33]);
-break;
-case 35:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34]);
-break;
-case 36:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35]);
-break;
-case 37:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36]);
-break;
-case 38:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37]);
-break;
-case 39:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30], args[31], args[32], args[33],
-                       args[34], args[35], args[36], args[37], args[38]);
-break;
-case 40:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23], args[24], args[25], args[26],
-                               args[27], args[28], args[29], args[30], args[31],
-                               args[32], args[33], args[34], args[35], args[36],
-                               args[37], args[38], args[39]);
-break;
-case 41:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40]);
-break;
-case 42:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41]);
-break;
-case 43:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42]);
-break;
-case 44:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43]);
-break;
-case 45:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44]);
-break;
-case 46:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45]);
-break;
-///  DONE TO HERE
-case 47:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30], args[31], args[32], args[33],
-                       args[34], args[35], args[36], args[37], args[38],
-                       args[39], args[40], args[41], args[42], args[43],
-                       args[44], args[45], args[46]);
-break;
-case 48:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47]);
-break;
-case 49:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48]);
-break;
-case 50:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49]);
-break;
-case 51:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50]);
-break;
-case 52:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51]);
-break;
-case 53:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52]);
-break;
-case 54:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53]);
-break;
-case 55:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54]);
-break;
-case 56:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23], args[24], args[25], args[26],
-                               args[27], args[28], args[29], args[30], args[31],
-                               args[32], args[33], args[34], args[35], args[36],
-                               args[37], args[38], args[39], args[40], args[41],
-                               args[42], args[43], args[44], args[45], args[46],
-                               args[47], args[48], args[49], args[50], args[51],
-                               args[52], args[53], args[54], args[55]);
-break;
-case 57:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56]);
-break;
-case 58:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57]);
-break;
-case 59:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58]);
-break;
-case 60:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59]);
-break;
-case 61:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59], args[60]);
-break;
-case 62:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59], args[60], args[61]);
-break;
-case 63:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30], args[31], args[32], args[33],
-                       args[34], args[35], args[36], args[37], args[38],
-                       args[39], args[40], args[41], args[42], args[43],
-                       args[44], args[45], args[46], args[47], args[48],
-                       args[49], args[50], args[51], args[52], args[53],
-                       args[54], args[55], args[56], args[57], args[58],
-                       args[59], args[60], args[61], args[62]);
-break;
-case 64:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59], args[60], args[61],
-    args[62], args[63]);
-break;
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
deleted file mode 100644
index aac2a6005158..000000000000
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===------ Allocator.cpp - OpenMP device memory allocator -------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Mapping.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-/// Descriptor of the device memory pool. The bump allocator in this file reads
-/// its `Ptr` and `Size` members. The symbol is weak, marked used/retained, and
-/// given protected visibility.
-/// NOTE(review): presumably filled in from outside the device runtime before
-/// kernels run -- confirm against the code that sets it up.
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-/// Allocation statistics (number of allocations, total, minimum and maximum
-/// size) that the bump allocator updates when the AllocationTracker debug kind
-/// is enabled. Declared with the same attributes as the pool descriptor.
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
-    __omp_rtl_device_memory_pool_tracker;
-
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
-struct BumpAllocatorTy final {
-
-  /// Reserve \p Size bytes, rounded up to allocator::ALIGNMENT, from the device
-  /// memory pool and return the start of the reservation. Traps when the
-  /// reservation would extend past the computed end of the pool.
-  void *alloc(uint64_t Size) {
-    Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
-
-    // Statistics are only gathered when allocation tracking is enabled; the
-    // rounded-up size is what gets recorded.
-    if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
-                  atomic::seq_cst);
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
-                  atomic::seq_cst);
-      atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
-                  atomic::seq_cst);
-      atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
-                  atomic::seq_cst);
-    }
-
-    // The pool's `Ptr` member itself serves as the bump cursor: it is viewed as
-    // a 64-bit integer and advanced in place.
-    uint64_t *Data =
-        reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
-    // NOTE(review): `End` is derived from the address of the `Ptr` member
-    // (`Data`), not from the pool address stored in it, whereas `OldData` below
-    // is the stored value. Confirm that this is the intended bound.
-    uint64_t End =
-        reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
-    // Advance the cursor by the rounded size. The value returned by atomic::add
-    // is used as the start of this allocation (presumably the pre-increment
-    // cursor value).
-    uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
-    if (OldData + Size > End)
-      __builtin_trap();
-
-    return reinterpret_cast<void *>(OldData);
-  }
-
-  /// Individual allocations are never released; this is a no-op.
-  void free(void *) {}
-};
-
-/// The single allocator instance that backs the allocator:: entry points below.
-BumpAllocatorTy BumpAllocator;
-
-/// allocator namespace implementation
-///
-///{
-
-/// Allocator initialization hook. Currently does nothing; both arguments are
-/// unused.
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
-  // TODO: Check KernelEnvironment for an allocator choice as soon as we have
-  // more than one.
-}
-
-/// Allocate \p Size bytes by forwarding to the bump allocator.
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-/// Forward \p Ptr to the bump allocator, whose free() is a no-op.
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
-
-///}
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
deleted file mode 100644
index 0c31c66ab2de..000000000000
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the data object of the constant device environment and the
-// query API.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "State.h"
-
-using namespace ompx;
-
-// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
-// Every flag defaults to 0 here. The config:: query functions in this file
-// read these values.
-[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription =
-    0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0;
-
-// This variable should be visible to the plugin so we override the default
-// hidden visibility. It is the data object behind the device-environment
-// queries (debug kind, device count/number, memory size, clock frequency,
-// indirect call table, hardware parallelism).
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
-
-/// Return the value of the __omp_rtl_assume_teams_oversubscription flag.
-uint32_t config::getAssumeTeamsOversubscription() {
-  return __omp_rtl_assume_teams_oversubscription;
-}
-
-/// Return the value of the __omp_rtl_assume_threads_oversubscription flag.
-uint32_t config::getAssumeThreadsOversubscription() {
-  return __omp_rtl_assume_threads_oversubscription;
-}
-
-/// Return the effective debug mask: only bits set in both the
-/// __omp_rtl_debug_kind flag and the device environment's DeviceDebugKind
-/// survive.
-uint32_t config::getDebugKind() {
-  return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind;
-}
-
-/// Return the NumDevices field of the device environment.
-uint32_t config::getNumDevices() {
-  return __omp_rtl_device_environment.NumDevices;
-}
-
-/// Return the DeviceNum field of the device environment.
-uint32_t config::getDeviceNum() {
-  return __omp_rtl_device_environment.DeviceNum;
-}
-
-/// Return the DynamicMemSize field of the device environment.
-uint64_t config::getDynamicMemorySize() {
-  return __omp_rtl_device_environment.DynamicMemSize;
-}
-
-/// Return the ClockFrequency field of the device environment.
-uint64_t config::getClockFrequency() {
-  return __omp_rtl_device_environment.ClockFrequency;
-}
-
-/// Return the device environment's IndirectCallTable value reinterpreted as a
-/// pointer.
-void *config::getIndirectCallTablePtr() {
-  return reinterpret_cast<void *>(
-      __omp_rtl_device_environment.IndirectCallTable);
-}
-
-/// Return the HardwareParallelism field of the device environment.
-uint64_t config::getHardwareParallelism() {
-  return __omp_rtl_device_environment.HardwareParallelism;
-}
-
-/// Return the IndirectCallTableSize field of the device environment.
-uint64_t config::getIndirectCallTableSize() {
-  return __omp_rtl_device_environment.IndirectCallTableSize;
-}
-
-/// Return true if any bit of \p Kind is set in the effective debug mask
-/// returned by getDebugKind().
-bool config::isDebugMode(DeviceDebugKind Kind) {
-  return config::getDebugKind() & uint32_t(Kind);
-}
-
-/// Return true unless the __omp_rtl_assume_no_thread_state flag is set.
-bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
-
-/// Return true only if the __omp_rtl_assume_no_nested_parallelism flag is clear
-/// and the kernel environment's configuration reports MayUseNestedParallelism.
-/// The kernel environment is consulted only when the flag is clear.
-bool config::mayUseNestedParallelism() {
-  return !__omp_rtl_assume_no_nested_parallelism &&
-         state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
-}
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
deleted file mode 100644
index 5b5482d766b1..000000000000
--- a/offload/DeviceRTL/src/Debug.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains debug utilities
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Configuration.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-
-using namespace ompx;
-
-extern "C" {
-/// Pass \p condition to the compiler as an assumption.
-void __assert_assume(bool condition) { __builtin_assume(condition); }
-
-#ifndef OMPTARGET_HAS_LIBC
-/// Weak assertion handler, compiled only when OMPTARGET_HAS_LIBC is not
-/// defined. Forwards to __assert_fail_internal without a custom message.
-[[gnu::weak]] void __assert_fail(const char *expr, const char *file,
-                                 unsigned line, const char *function) {
-  __assert_fail_internal(expr, /*msg=*/nullptr, file, line, function);
-}
-#endif
-
-/// Print an assertion-failure diagnostic and trap.
-///
-/// \param expr     Text of the failed expression.
-/// \param msg      Optional message; when null the shorter format is printed.
-/// \param file     Source file reported in the diagnostic.
-/// \param line     Source line reported in the diagnostic.
-/// \param function Function name reported in the diagnostic.
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
-                            unsigned line, const char *function) {
-  if (!msg)
-    printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
-  else
-    printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
-           msg, expr);
-  __builtin_trap();
-}
-}
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
deleted file mode 100644
index d6f8c499c890..000000000000
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//===-- DeviceUtils.cpp - OpenMP device runtime utility functions - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceUtils.h"
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "gpuintrin.h"
-
-using namespace ompx;
-
-/// Combine two 32-bit values into one 64-bit value: \p HighBits ends up in the
-/// upper 32 bits and \p LowBits in the lower 32 bits.
-uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
-  uint64_t Packed = static_cast<uint64_t>(HighBits);
-  Packed <<= 32;
-  Packed |= static_cast<uint64_t>(LowBits);
-  return Packed;
-}
-
-/// Split \p Val into its two 32-bit halves: the lower 32 bits are stored to
-/// \p LowBits first, then the upper 32 bits to \p HighBits.
-void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
-  // The mask literal below carries an `UL` suffix, so require a 64-bit
-  // `unsigned long`.
-  static_assert(sizeof(unsigned long) == 8, "");
-  const uint64_t Lower = Val & 0x00000000FFFFFFFFUL;
-  const uint64_t Upper = Val >> 32;
-  LowBits = static_cast<uint32_t>(Lower);
-  HighBits = static_cast<uint32_t>(Upper);
-}
-
-/// Forward to __gpu_shuffle_idx_u32. Note the argument order at the call site:
-/// \p SrcLane is passed before \p Var.
-/// NOTE(review): presumably returns the \p Var held by lane \p SrcLane within
-/// groups of \p Width lanes -- confirm against gpuintrin.h.
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
-                       int32_t Width) {
-  return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
-}
-
-/// Shuffle a 32-bit value "down" by \p Delta lanes.
-///
-/// Each lane reads \p Var from lane `Self + Delta`, unless
-/// `Delta + (Self & (Width - 1))` reaches \p Width, in which case it reads
-/// from itself.
-/// NOTE(review): the `Self & (Width - 1)` term presumably assumes \p Width is a
-/// power of two -- confirm with callers.
-///
-/// \param Mask  Lane mask forwarded to the shuffle intrinsic.
-/// \param Var   Value contributed by the calling lane.
-/// \param Delta Lane offset to read from.
-/// \param Width Width forwarded to the shuffle intrinsic.
-int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
-                           int32_t Width) {
-  int32_t Self = mapping::getThreadIdInWarp();
-  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
-  // The payload is 32 bits wide, so use the 32-bit shuffle (as utils::shuffle
-  // does) instead of widening to 64 bits and truncating the result.
-  return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
-}
-
-/// 64-bit variant of shuffleDown: each lane reads \p Var from lane
-/// `Lane + Delta`, or from itself when `Delta + (Lane & (Width - 1))` is not
-/// below \p Width.
-int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
-                           int32_t Width) {
-  int32_t Lane = mapping::getThreadIdInWarp();
-  // Default to reading our own value; only look further down when the offset
-  // stays below Width.
-  int32_t Source = Lane;
-  if ((Delta + (Lane & (Width - 1))) < Width)
-    Source = Lane + Delta;
-  return __gpu_shuffle_idx_u64(Mask, Source, Var, Width);
-}
-
-/// Forward \p Mask and \p Pred to __gpu_ballot and return its result.
-uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
-  return __gpu_ballot(Mask, Pred);
-}
-
-/// Return the result of __gpu_is_ptr_local for \p Ptr.
-bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
-
-extern "C" {
-/// C entry point for a 32-bit shuffle-down across all lanes (lanes::All).
-/// NOTE(review): the third parameter is named `SrcLane` but is forwarded as the
-/// width argument of utils::shuffleDown, matching the `Width` parameter of
-/// __kmpc_shuffle_int64 below -- the name looks like a misnomer; confirm.
-int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
-  return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
-}
-
-/// C entry point for a 64-bit shuffle-down across all lanes (lanes::All).
-int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
-  return utils::shuffleDown(lanes::All, Val, Delta, Width);
-}
-}
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
deleted file mode 100644
index 467e44a65276..000000000000
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the kernel entry points for the device.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Workshare.h"
-
-using namespace ompx;
-
-// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and
-// must be kept in-sync.
-// Execution-mode bit flags. __kmpc_target_init tests the SPMD bit of the
-// kernel's ExecMode to select the SPMD or non-SPMD initialization path.
-enum OMPTgtExecModeFlags : unsigned char {
-  OMP_TGT_EXEC_MODE_BARE = 0,         // No mode bit set.
-  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, // Bit 0.
-  OMP_TGT_EXEC_MODE_SPMD = 1 << 1,    // Bit 1.
-  // Both mode bits set.
-  OMP_TGT_EXEC_MODE_GENERIC_SPMD =
-      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
-};
-
-/// Initialize the runtime components for a kernel launch by calling, in this
-/// order, the init routines of synchronize, mapping, state, allocator and
-/// workshare.
-///
-/// \param IsSPMD                  Passed to every component's init.
-/// \param KernelEnvironment       Passed to state::init and allocator::init.
-/// \param KernelLaunchEnvironment Passed to state::init.
-static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-                    KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
-  allocator::init(IsSPMD, KernelEnvironment);
-  workshare::init(IsSPMD);
-}
-
-/// Simple generic state machine for worker threads.
-///
-/// Loops until __kmpc_kernel_parallel hands back a null work function. Each
-/// iteration waits at a barrier, fetches the work function, runs it if this
-/// thread is active, and waits at a second barrier.
-///
-/// \param Ident Not referenced in the body.
-static void genericStateMachine(IdentTy *Ident) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-
-  do {
-    ParallelRegionFnTy WorkFn = nullptr;
-
-    // Wait for the signal that we have a new work function.
-    synchronize::threads(atomic::seq_cst);
-
-    // Retrieve the work function from the runtime.
-    bool IsActive = __kmpc_kernel_parallel(&WorkFn);
-
-    // If there is nothing more to do, break out of the state machine by
-    // returning to the caller.
-    if (!WorkFn)
-      return;
-
-    // Only active threads run the work function; it is invoked with 0 and this
-    // thread's id in the block as its two arguments.
-    if (IsActive) {
-      ASSERT(!mapping::isSPMDMode(), nullptr);
-      ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
-      __kmpc_kernel_end_parallel();
-    }
-
-    // Barrier at the end of the iteration, reached by active and inactive
-    // threads alike.
-    synchronize::threads(atomic::seq_cst);
-
-  } while (true);
-}
-
-extern "C" {
-
-/// Initialization
-///
-/// Initializes the runtime for the kernel and decides what the calling thread
-/// does next.
-///
-/// \param KernelEnvironment       Kernel environment; its Configuration
-///                                supplies ExecMode and UseGenericStateMachine,
-///                                and its Ident is handed to the state machine.
-/// \param KernelLaunchEnvironment Forwarded to the runtime initialization.
-///
-/// \returns -1 in SPMD mode and for the initial thread in non-SPMD mode;
-///          otherwise the thread's id in the block, after the generic state
-///          machine (if entered) has returned.
-///
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
-                           KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
-  bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
-  bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
-  if (IsSPMD) {
-    inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
-                        KernelLaunchEnvironment);
-    synchronize::threadsAligned(atomic::relaxed);
-  } else {
-    inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
-                        KernelLaunchEnvironment);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
-  }
-
-  if (IsSPMD) {
-    state::assumeInitialState(IsSPMD);
-
-    // Synchronize to ensure the assertions above are in an aligned region.
-    // The barrier is eliminated later.
-    synchronize::threadsAligned(atomic::relaxed);
-    return -1;
-  }
-
-  // Non-SPMD from here on: the initial thread returns immediately.
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    return -1;
-
-  // Enter the generic state machine if enabled and if this thread can possibly
-  // be an active worker thread.
-  //
-  // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
-  // GPU.  In those cases, a single thread can apparently satisfy a barrier on
-  // behalf of all threads in the same warp.  Thus, it would not be safe for
-  // other threads in the main thread's warp to reach the first
-  // synchronize::threads call in genericStateMachine before the main thread
-  // reaches its corresponding synchronize::threads call: that would permit all
-  // active worker threads to proceed before the main thread has actually set
-  // state::ParallelRegionFn, and then they would immediately quit without
-  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
-  // main thread's warp, so none of its threads can ever be active worker
-  // threads.
-  if (UseGenericStateMachine &&
-      mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
-    genericStateMachine(KernelEnvironment.Ident);
-
-  return mapping::getThreadIdInBlock();
-}
-
-/// De-Initialization
-///
-/// In non-SPMD, this function releases the workers trapped in a state machine
-/// and also any memory dynamically allocated by the runtime.
-/// NOTE(review): no memory release is visible in this function; confirm
-/// whether that part of the description is still accurate.
-///
-/// Takes no parameters. In SPMD mode it returns immediately.
-///
-void __kmpc_target_deinit() {
-  bool IsSPMD = mapping::isSPMDMode();
-  if (IsSPMD)
-    return;
-
-  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
-    // Signal the workers to exit the state machine and exit the kernel.
-    state::ParallelRegionFn = nullptr;
-  } else if (!state::getKernelEnvironment()
-                  .Configuration.UseGenericStateMachine) {
-    // Retrieve the work function just to ensure we always call
-    // __kmpc_kernel_parallel even if a custom state machine is used.
-    // TODO: this is not super pretty. The problem is we create the call to
-    // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it
-    // is not there yet. Thus, we assume we never reach it from
-    // __kmpc_target_deinit. That allows us to remove the store in there to
-    // ParallelRegionFn, which leads to bad results later on.
-    ParallelRegionFnTy WorkFn = nullptr;
-    __kmpc_kernel_parallel(&WorkFn);
-    // The retrieved work function is expected to be null at this point.
-    ASSERT(WorkFn == nullptr, nullptr);
-  }
-}
-
-/// Return the result of mapping::isSPMDMode() converted to int8_t.
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
-}
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
deleted file mode 100644
index 83f9233d9480..000000000000
--- a/offload/DeviceRTL/src/LibC.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LibC.h"
-
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
-#else
-extern "C" int vprintf(const char *format, __builtin_va_list);
-#endif
-
-extern "C" {
-[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
-  auto *L = reinterpret_cast<const unsigned char *>(lhs);
-  auto *R = reinterpret_cast<const unsigned char *>(rhs);
-
-  for (size_t I = 0; I < count; ++I)
-    if (L[I] != R[I])
-      return (int)L[I] - (int)R[I];
-
-  return 0;
-}
-
-[[gnu::weak]] void memset(void *dst, int C, size_t count) {
-  auto *dstc = reinterpret_cast<char *>(dst);
-  for (size_t I = 0; I < count; ++I)
-    dstc[I] = C;
-}
-
-[[gnu::weak]] int printf(const char *Format, ...) {
-  __builtin_va_list vlist;
-  __builtin_va_start(vlist, Format);
-  return ::vprintf(Format, vlist);
-}
-}
-
-namespace ompx {
-[[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
-  __builtin_va_list vlist;
-  __builtin_va_start(vlist, Format);
-  return ::vprintf(Format, vlist);
-}
-} // namespace ompx
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
deleted file mode 100644
index b145892d1ece..000000000000
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Mapping.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "State.h"
-#include "gpuintrin.h"
-
-using namespace ompx;
-
-// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI
-// is set to 'none'. We only support COV5+ but this can be removed when COV4 is
-// fully deprecated.
-#ifdef __AMDGPU__
-extern const inline uint32_t __oclc_ABI_version = 500;
-[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__;
-#endif
-
-static bool isInLastWarp() {
-  uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
-                     ~(mapping::getWarpSize() - 1);
-  return mapping::getThreadIdInBlock() == MainTId;
-}
-
-bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
-  if (IsSPMD || icv::Level)
-    return false;
-
-  // Check if this is the last warp in the block.
-  return isInLastWarp();
-}
-
-bool mapping::isMainThreadInGenericMode() {
-  return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
-}
-
-bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
-  if (IsSPMD)
-    return mapping::getThreadIdInBlock() == 0;
-  return isInLastWarp();
-}
-
-bool mapping::isLeaderInWarp() {
-  __kmpc_impl_lanemask_t Active = mapping::activemask();
-  __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
-}
-
-LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-
-LaneMaskTy mapping::lanemaskLT() {
-#ifdef __NVPTX__
-  return __nvvm_read_ptx_sreg_lanemask_lt();
-#else
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
-  return Mask & Ballot;
-#endif
-}
-
-LaneMaskTy mapping::lanemaskGT() {
-#ifdef __NVPTX__
-  return __nvvm_read_ptx_sreg_lanemask_gt();
-#else
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  if (Lane == (mapping::getWarpSize() - 1))
-    return 0;
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
-  return Mask & Ballot;
-#endif
-}
-
-uint32_t mapping::getThreadIdInWarp() {
-  uint32_t ThreadIdInWarp = __gpu_lane_id();
-  ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
-  return ThreadIdInWarp;
-}
-
-uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
-  uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
-  return ThreadIdInBlock;
-}
-
-uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
-
-uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
-  uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
-  // If we are in SPMD mode, remove one warp.
-  return BlockSize - (!IsSPMD * mapping::getWarpSize());
-}
-uint32_t mapping::getMaxTeamThreads() {
-  return mapping::getMaxTeamThreads(mapping::isSPMDMode());
-}
-
-uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
-  return __gpu_num_threads(Dim);
-}
-
-uint32_t mapping::getNumberOfThreadsInKernel() {
-  return mapping::getNumberOfThreadsInBlock(0) *
-         mapping::getNumberOfBlocksInKernel(0) *
-         mapping::getNumberOfThreadsInBlock(1) *
-         mapping::getNumberOfBlocksInKernel(1) *
-         mapping::getNumberOfThreadsInBlock(2) *
-         mapping::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t mapping::getWarpIdInBlock() {
-  uint32_t WarpID =
-      mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-  ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
-  return WarpID;
-}
-
-uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
-  uint32_t BlockId = __gpu_block_id(Dim);
-  ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
-  return BlockId;
-}
-
-uint32_t mapping::getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
-}
-
-uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
-  return __gpu_num_blocks(Dim);
-}
-
-uint32_t mapping::getNumberOfProcessorElements() {
-  return static_cast<uint32_t>(config::getHardwareParallelism());
-}
-
-///}
-
-/// Execution mode
-///
-///{
-
-// TODO: This is a workaround for initialization coming from kernels outside of
-//       the TU. We will need to solve this more correctly in the future.
-[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
-
-void mapping::init(bool IsSPMD) {
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    IsSPMDMode = IsSPMD;
-}
-
-bool mapping::isSPMDMode() { return IsSPMDMode; }
-
-bool mapping::isGenericMode() { return !isSPMDMode(); }
-///}
-
-extern "C" {
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
-  return mapping::getThreadIdInBlock();
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
-  return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
-  return mapping::getWarpSize();
-}
-}
-
-#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME)                                \
-  extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
-
-_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
-_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
-_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
-_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
-
-extern "C" {
-uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
-  return utils::ballotSync(mask, pred);
-}
-
-int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
-  return utils::shuffleDown(mask, var, delta, width);
-}
-
-float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
-                            int width) {
-  return utils::bitCast<float>(
-      utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
-}
-
-long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
-  return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
-}
-
-double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
-                             int width) {
-  return utils::bitCast<double>(
-      utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
-}
-}
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
deleted file mode 100644
index a89f8b2a7453..000000000000
--- a/offload/DeviceRTL/src/Misc.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "Shared/RPCOpcodes.h"
-#include "shared/rpc.h"
-
-#include "Debug.h"
-
-namespace ompx {
-namespace impl {
-
-/// Lookup a device-side function using a host pointer /p HstPtr using the table
-/// provided by the device plugin. The table is an ordered pair of host and
-/// device pointers sorted on the value of the host pointer.
-void *indirectCallLookup(void *HstPtr) {
-  if (!HstPtr)
-    return nullptr;
-
-  struct IndirectCallTable {
-    void *HstPtr;
-    void *DevPtr;
-  };
-  IndirectCallTable *Table =
-      reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr());
-  uint64_t TableSize = config::getIndirectCallTableSize();
-
-  // If the table is empty we assume this is device pointer.
-  if (!Table || !TableSize)
-    return HstPtr;
-
-  uint32_t Left = 0;
-  uint32_t Right = TableSize;
-
-  // If the pointer is definitely not contained in the table we exit early.
-  if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr)
-    return HstPtr;
-
-  while (Left != Right) {
-    uint32_t Current = Left + (Right - Left) / 2;
-    if (Table[Current].HstPtr == HstPtr)
-      return Table[Current].DevPtr;
-
-    if (HstPtr < Table[Current].HstPtr)
-      Right = Current;
-    else
-      Left = Current;
-  }
-
-  // If we searched the whole table and found nothing this is a device pointer.
-  return HstPtr;
-}
-
-/// The openmp client instance used to communicate with the server.
-[[gnu::visibility("protected"),
-  gnu::weak]] rpc::Client Client asm("__llvm_rpc_client");
-
-} // namespace impl
-} // namespace ompx
-
-/// Interfaces
-///
-///{
-
-extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
-
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
-
-double omp_get_wtick(void) {
-  // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrieved by querying the driver. We rely on the device environment
-  // to inform us what the proper frequency is. NVPTX uses a nanosecond
-  // resolution, we could omit the global read but this makes it consistent.
-  return 1.0 / ompx::config::getClockFrequency();
-}
-
-double omp_get_wtime(void) {
-  return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
-}
-
-void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
-  return ompx::impl::indirectCallLookup(HstPtr);
-}
-
-void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
-  switch (allocator) {
-  case omp_default_mem_alloc:
-  case omp_large_cap_mem_alloc:
-  case omp_const_mem_alloc:
-  case omp_high_bw_mem_alloc:
-  case omp_low_lat_mem_alloc:
-    return malloc(size);
-  default:
-    return nullptr;
-  }
-}
-
-void omp_free(void *ptr, omp_allocator_handle_t allocator) {
-  switch (allocator) {
-  case omp_default_mem_alloc:
-  case omp_large_cap_mem_alloc:
-  case omp_const_mem_alloc:
-  case omp_high_bw_mem_alloc:
-  case omp_low_lat_mem_alloc:
-    free(ptr);
-  case omp_null_allocator:
-  default:
-    return;
-  }
-}
-
-unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
-  rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>();
-  Port.send_n(data, size);
-  Port.send([=](rpc::Buffer *buffer, uint32_t) {
-    buffer->data[0] = reinterpret_cast<uintptr_t>(fn);
-  });
-  unsigned long long Ret;
-  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
-    Ret = static_cast<unsigned long long>(Buffer->data[0]);
-  });
-  Port.close();
-  return Ret;
-}
-}
-
-///}
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
deleted file mode 100644
index 08ce616aee1c..000000000000
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Parallel implementation in the GPU. Here is the pattern:
-//
-//    while (not finished) {
-//
-//    if (master) {
-//      sequential code, decide which par loop to do, or if finished
-//     __kmpc_kernel_prepare_parallel() // exec by master only
-//    }
-//    syncthreads // A
-//    __kmpc_kernel_parallel() // exec by all
-//    if (this thread is included in the parallel) {
-//      switch () for all parallel loops
-//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
-//    }
-//
-//
-//    The reason we don't exec end_parallel for the threads not included
-//    in the parallel loop is that for each barrier in the parallel
-//    region, these non-included threads will cycle through the
-//    syncthread A. Thus they must preserve their current threadId that
-//    is larger than thread in team.
-//
-//    To make a long story short...
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
-  uint32_t NThreadsICV =
-      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getMaxTeamThreads();
-
-  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
-    NumThreads = NThreadsICV;
-
-  // SPMD mode allows any number of threads, for generic mode we round down to a
-  // multiple of WARPSIZE since it is legal to do so in OpenMP.
-  if (mapping::isSPMDMode())
-    return NumThreads;
-
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
-
-  return NumThreads;
-}
-
-// Invoke an outlined parallel function unwrapping arguments (up to 32).
-[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
-                                              void **args, int64_t nargs) {
-  switch (nargs) {
-#include "generated_microtask_cases.gen"
-  default:
-    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
-    __builtin_trap();
-  }
-}
-
-} // namespace
-
-extern "C" {
-
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
-                                                   int32_t num_threads,
-                                                   void *fn, void **args,
-                                                   const int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t PTeamSize =
-      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
-  // Avoid the race between the read of the `icv::Level` above and the write
-  // below by synchronizing all threads here.
-  synchronize::threadsAligned(atomic::seq_cst);
-  {
-    // Note that the order here is important. `icv::Level` has to be updated
-    // last or the other updates will cause a thread specific state to be
-    // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
-                                          1u, TId == 0, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
-                                     /*ForceTeamState=*/true);
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
-                               /*ForceTeamState=*/true);
-
-    // Synchronize all threads after the main thread (TId == 0) set up the
-    // team state properly.
-    synchronize::threadsAligned(atomic::acq_rel);
-
-    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
-                                      /*ForceTeamState=*/true);
-    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
-    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);
-
-    // Ensure we synchronize before we run user code to avoid invalidating the
-    // assumptions above.
-    synchronize::threadsAligned(atomic::relaxed);
-
-    if (!PTeamSize || TId < PTeamSize)
-      invokeMicrotask(TId, 0, fn, args, nargs);
-
-    // Synchronize all threads at the end of a parallel region.
-    synchronize::threadsAligned(atomic::seq_cst);
-  }
-
-  // Synchronize all threads to make sure every thread exits the scope above;
-  // otherwise the following assertions and the assumption in
-  // __kmpc_target_deinit may not hold.
-  synchronize::threadsAligned(atomic::acq_rel);
-
-  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
-  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
-  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);
-
-  // Ensure we synchronize to create an aligned region around the assumptions.
-  synchronize::threadsAligned(atomic::relaxed);
-
-  return;
-}
-
-[[clang::always_inline]] void
-__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-
-  // Assert the parallelism level is zero if disabled by the user.
-  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
-         "nested parallelism while disabled");
-
-  // Handle the serialized case first, same for SPMD/non-SPMD:
-  // 1) if-clause(0)
-  // 2) parallel in task or other thread state inducing construct
-  // 3) nested parallel regions
-  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
-                   (config::mayUseNestedParallelism() && icv::Level))) {
-    state::DateEnvironmentRAII DERAII(ident);
-    ++icv::Level;
-    invokeMicrotask(TId, 0, fn, args, nargs);
-    return;
-  }
-
-  // From this point forward we know that there is no thread state used.
-  ASSERT(state::HasThreadState == false, nullptr);
-
-  if (mapping::isSPMDMode()) {
-    // This was moved to its own routine so it could be called directly
-    // in certain situations to avoid resource consumption of unused
-    // logic in parallel_51.
-    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
-
-    return;
-  }
-
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
-  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-
-  // We do *not* create a new data environment because all threads in the team
-  // that are active are now running this parallel region. They share the
-  // TeamState, which has an increase level-var and potentially active-level
-  // set, but they do not have individual ThreadStates yet. If they ever
-  // modify the ICVs beyond this point a ThreadStates will be allocated.
-
-  bool IsActiveParallelRegion = NumThreads > 1;
-  if (!IsActiveParallelRegion) {
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
-    invokeMicrotask(TId, 0, fn, args, nargs);
-    return;
-  }
-
-  void **GlobalArgs = nullptr;
-  if (nargs) {
-    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
-    switch (nargs) {
-    default:
-      for (int I = 0; I < nargs; I++)
-        GlobalArgs[I] = args[I];
-      break;
-    case 16:
-      GlobalArgs[15] = args[15];
-      [[fallthrough]];
-    case 15:
-      GlobalArgs[14] = args[14];
-      [[fallthrough]];
-    case 14:
-      GlobalArgs[13] = args[13];
-      [[fallthrough]];
-    case 13:
-      GlobalArgs[12] = args[12];
-      [[fallthrough]];
-    case 12:
-      GlobalArgs[11] = args[11];
-      [[fallthrough]];
-    case 11:
-      GlobalArgs[10] = args[10];
-      [[fallthrough]];
-    case 10:
-      GlobalArgs[9] = args[9];
-      [[fallthrough]];
-    case 9:
-      GlobalArgs[8] = args[8];
-      [[fallthrough]];
-    case 8:
-      GlobalArgs[7] = args[7];
-      [[fallthrough]];
-    case 7:
-      GlobalArgs[6] = args[6];
-      [[fallthrough]];
-    case 6:
-      GlobalArgs[5] = args[5];
-      [[fallthrough]];
-    case 5:
-      GlobalArgs[4] = args[4];
-      [[fallthrough]];
-    case 4:
-      GlobalArgs[3] = args[3];
-      [[fallthrough]];
-    case 3:
-      GlobalArgs[2] = args[2];
-      [[fallthrough]];
-    case 2:
-      GlobalArgs[1] = args[1];
-      [[fallthrough]];
-    case 1:
-      GlobalArgs[0] = args[0];
-      [[fallthrough]];
-    case 0:
-      break;
-    }
-  }
-
-  {
-    // Note that the order here is important. `icv::Level` has to be updated
-    // last or the other updates will cause a thread specific state to be
-    // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
-                                          1u, true, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
-                                     /*ForceTeamState=*/true);
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
-                               /*ForceTeamState=*/true);
-
-    // Master signals work to activate workers.
-    synchronize::threads(atomic::seq_cst);
-    // Master waits for workers to signal.
-    synchronize::threads(atomic::seq_cst);
-  }
-
-  if (nargs)
-    __kmpc_end_sharing_variables();
-}
-
-[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
-  // Work function and arguments for L1 parallel region.
-  *WorkFn = state::ParallelRegionFn;
-
-  // If this is the termination signal from the master, quit early.
-  if (!*WorkFn)
-    return false;
-
-  // Set to true for workers participating in the parallel region.
-  uint32_t TId = mapping::getThreadIdInBlock();
-  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
-  return ThreadIsActive;
-}
-
-[[clang::noinline]] void __kmpc_kernel_end_parallel() {
-  // In case we have modified an ICV for this thread before a ThreadState was
-  // created. We drop it now to not contaminate the next parallel region.
-  ASSERT(!mapping::isSPMDMode(), nullptr);
-  uint32_t TId = mapping::getThreadIdInBlock();
-  state::resetStateForThread(TId);
-  ASSERT(!mapping::isSPMDMode(), nullptr);
-}
-
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
-
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
-
-void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
-                           int32_t thread_limit) {}
-
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
-}
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
deleted file mode 100644
index df141af5ebee..000000000000
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===------- Profiling.cpp ---------------------------------------- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Profiling.h"
-
-extern "C" {
-
-// Provides empty implementations for certain functions in compiler-rt
-// that are emitted by the PGO instrumentation.
-void __llvm_profile_register_function(void *Ptr) {}
-void __llvm_profile_register_names_function(void *Ptr, long int I) {}
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
-}
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
deleted file mode 100644
index fffd0063940c..000000000000
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of reduction with KMPC interface.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
-  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
-    shflFct(reduce_data, /*LaneId - not used= */ 0,
-            /*Offset = */ mask, /*AlgoVersion=*/0);
-  }
-}
-
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
-                               uint32_t size, uint32_t tid) {
-  uint32_t curr_size;
-  uint32_t mask;
-  curr_size = size;
-  mask = curr_size / 2;
-  while (mask > 0) {
-    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
-    curr_size = (curr_size + 1) / 2;
-    mask = curr_size / 2;
-  }
-}
-
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
-                                            ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
-    BlockThreadId = 0;
-  uint32_t NumThreads = omp_get_num_threads();
-  if (NumThreads == 1)
-    return 1;
-
-  //
-  // This reduce function handles reduction within a team. It handles
-  // parallel regions in both L1 and L2 parallelism levels. It also
-  // supports Generic, SPMD, and NoOMP modes.
-  //
-  // 1. Reduce within a warp.
-  // 2. Warp master copies value to warp 0 via shared memory.
-  // 3. Warp 0 reduces to a single value.
-  // 4. The reduced value is available in the thread that returns 1.
-  //
-
-#if __has_builtin(__nvvm_reflect)
-  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    uint32_t WarpId = mapping::getWarpIdInBlock();
-
-    // Volta execution model:
-    // For the Generic execution mode a parallel region either has 1 thread and
-    // beyond that, always a multiple of 32. For the SPMD execution mode we may
-    // have any number of threads.
-    if ((NumThreads % mapping::getWarpSize() == 0) ||
-        (WarpId < WarpsNeeded - 1))
-      gpu_regular_warp_reduce(reduce_data, shflFct);
-    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-      gpu_irregular_warp_reduce(
-          reduce_data, shflFct,
-          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
-
-    // When we have more than [mapping::getWarpSize()] number of threads
-    // a block reduction is performed here.
-    //
-    // Only L1 parallel region can enter this if condition.
-    if (NumThreads > mapping::getWarpSize()) {
-      // Gather all the reduced values from each warp
-      // to the first warp.
-      cpyFct(reduce_data, WarpsNeeded);
-
-      if (WarpId == 0)
-        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                  BlockThreadId);
-    }
-    return BlockThreadId == 0;
-  }
-#endif
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
-  }
-
-  // Get the OMP thread Id. This is different from BlockThreadId in the case
-  // of an L2 parallel region.
-  return BlockThreadId == 0;
-}
-
-uint32_t roundToWarpsize(uint32_t s) {
-  if (s < mapping::getWarpSize())
-    return 1;
-  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
-}
-
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-} // namespace
-
-extern "C" {
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
-                                               uint64_t reduce_data_size,
-                                               void *reduce_data,
-                                               ShuffleReductFnTy shflFct,
-                                               InterWarpCopyFnTy cpyFct) { // Loc, reduce_data_size unused
-  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
-}
-
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) { // returns 1 only in the thread holding the final result
-  // In generic (non-SPMD) mode only the main thread proceeds; others return 0.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
-    if (!mapping::isMainThreadInGenericMode())
-      return 0;
-    ThreadId = 0;
-  }
-
-  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team master participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
-  uint32_t TeamId = omp_get_team_num();
-  uint32_t NumTeams = omp_get_num_teams();
-  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
-  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
-  // Thread 0 spins until this team falls inside the current window, i.e.
-  // TeamId < IterCnt + num_of_records, so that no more teams than the buffer
-  // has slots write to it at the same time.
-  bool IsMaster = (ThreadId == 0);
-  while (IsMaster) {
-    Bound = atomic::load(&IterCnt, atomic::acquire);
-    if (TeamId < Bound + num_of_records)
-      break;
-  }
-
-  if (IsMaster) {
-    int ModBockId = TeamId % num_of_records; // buffer slot used by this team
-    if (TeamId < num_of_records) { // first team on this slot: copy into it
-      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
-    } else // later teams: reduce into the slot's existing value
-      lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // num_of_records chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
-                                 atomic::MemScopeTy::device);
-  }
-
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
-  if (mapping::isSPMDMode())
-    synchronize::threadsAligned(atomic::acq_rel);
-
-  // reduce_data is global or shared so before being reduced within the
-  // warp we need to bring it in local memory:
-  // local_reduce_data = reduce_data[i]
-  //
-  // Example for 3 reduction variables a, b, c (of potentially different
-  // types):
-  //
-  // buffer layout (struct of arrays):
-  // a, a, ..., a, b, b, ... b, c, c, ... c
-  // |__________|
-  //     num_of_records
-  //
-  // local_data_reduce layout (struct):
-  // a, b, c
-  //
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //      1. do reduction within each warp.
-  //      2. do reduction across warps.
-  //      3. write the final result to the main reduction variable
-  //         by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
-  if (ChunkTeamCount == NumTeams - Bound - 1) {
-    // Ensure we see the global memory writes by other teams
-    fence::kernel(atomic::acquire);
-
-    //
-    // Last team processing.
-    //
-    if (ThreadId >= NumRecs)
-      return 0;
-    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
-    if (ThreadId >= NumThreads)
-      return 0;
-
-    // Load from buffer and reduce.
-    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
-      glredFct(GlobalBuffer, i, reduce_data);
-
-    // Reduce across warps to the warp master.
-    if (NumThreads > 1) {
-      gpu_regular_warp_reduce(reduce_data, shflFct);
-
-      // When we have more than [mapping::getWarpSize()] number of threads
-      // a block reduction is performed here.
-      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
-      if (ActiveThreads > mapping::getWarpSize()) {
-        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
-                               mapping::getWarpSize();
-        // Gather all the reduced values from each warp
-        // to the first warp.
-        cpyFct(reduce_data, WarpsNeeded);
-
-        uint32_t WarpId = ThreadId / mapping::getWarpSize();
-        if (WarpId == 0)
-          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                    ThreadId);
-      }
-    }
-
-    if (IsMaster) { // reset both counters, then report the final result
-      Cnt = 0;
-      IterCnt = 0;
-      return 1;
-    }
-    return 0;
-  }
-  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
-    // This chunk is complete: advance IterCnt by num_of_records so the next
-    // teams may write their intermediate results to the global buffer.
-    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
-  }
-
-  return 0;
-}
-}
-
-void *__kmpc_reduction_get_fixed_buffer() { // reads the kernel launch environment
-  return state::getKernelLaunchEnvironment().ReductionBuffer;
-}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
deleted file mode 100644
index 475395102f47..000000000000
--- a/offload/DeviceRTL/src/State.cpp
+++ /dev/null
@@ -1,482 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-/// Memory implementation
-///
-///{
-
-/// External symbol to access dynamic shared memory.
-[[gnu::aligned(
-    allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
-
-/// The kernel environment passed to the init method by the compiler.
-[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
-    KernelEnvironmentPtr;
-
-/// The kernel launch environment passed as argument to the kernel by the
-/// runtime.
-[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
-    KernelLaunchEnvironmentPtr;
-
-///}
-
-namespace {
-
-/// Fallback implementations are missing to trigger a link time error.
-/// Implementations for new devices, including the host, should go into a
-/// dedicated begin/end declare variant.
-///
-///{
-extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-
-[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
-
-#else
-
-[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
-
-#endif
-}
-///}
-
-/// A "smart" stack in shared memory.
-///
-/// Exposes a malloc/free-like interface but works like a stack internally.
-/// Storage is split into one slice *per thread* (Usage is indexed by the
-/// thread id in the block), so each thread must push and pop symmetrically.
-/// If a slice is exhausted, push() falls back to global memory. The main
-/// thread in generic mode gets its slice limit multiplied by the warp size.
-/// NOTE(review): Usage entries are unsigned char; confirm usage stays <= 255.
-///
-struct SharedMemorySmartStackTy {
-  /// Initialize the stack. Must be called by all threads.
-  void init(bool IsSPMD);
-
-  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
-  /// can call this function.
-  void *push(uint64_t Bytes);
-
-  /// Deallocate the last allocation made by the encountering thread and pointed
-  /// to by \p Ptr from the stack. Each thread can call this function.
-  void pop(void *Ptr, uint64_t Bytes);
-
-private:
-  /// Compute the size of the storage space reserved for a thread.
-  uint32_t computeThreadStorageTotal() {
-    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
-    return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock,
-                                allocator::ALIGNMENT);
-  }
-
-  /// Return the top address of thread \p TId's data stack, that is the first
-  /// address this thread will allocate memory at next.
-  void *getThreadDataTop(uint32_t TId) {
-    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
-  }
-
-  /// The actual storage, carved into per-thread slices.
-  [[gnu::aligned(
-      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
-  [[gnu::aligned(
-      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; // bytes in use, per thread
-};
-
-static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
-              "Shared scratchpad of this size not supported yet.");
-
-/// The allocation of a single shared memory scratchpad.
-[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
-    SharedMemorySmartStack;
-
-void SharedMemorySmartStackTy::init(bool IsSPMD) { // IsSPMD is unused
-  Usage[mapping::getThreadIdInBlock()] = 0; // each thread clears its own counter
-}
-
-void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
-  // First align the number of requested bytes.
-  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
-  /// be passed in as an argument and the stack rewritten to support it.
-  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
-
-  uint32_t StorageTotal = computeThreadStorageTotal();
-
-  // The main thread in generic mode gets the space of its entire warp as the
-  // other threads do not participate in any computation at all.
-  if (mapping::isMainThreadInGenericMode())
-    StorageTotal *= mapping::getWarpSize();
-
-  int TId = mapping::getThreadIdInBlock();
-  if (Usage[TId] + AlignedBytes <= StorageTotal) { // fast path: fits in the slice
-    void *Ptr = getThreadDataTop(TId);
-    Usage[TId] += AlignedBytes;
-    return Ptr;
-  }
-
-  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
-    printf("Shared memory stack full, fallback to dynamic allocation of global "
-           "memory will negatively impact performance.\n");
-  void *GlobalMemory = memory::allocGlobal( // slow path: global memory instead
-      AlignedBytes, "Slow path shared memory allocation, insufficient "
-                    "shared memory stack memory!");
-  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
-
-  return GlobalMemory;
-}
-
-void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
-  // Pointers outside shared memory came from the global slow path in push().
-  if (!utils::isSharedMemPtr(Ptr))
-    return memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
-  // Shared-memory pointer: give the aligned byte count back to this thread.
-  uint64_t Released = __builtin_align_up(Bytes, allocator::ALIGNMENT);
-  int Lane = mapping::getThreadIdInBlock();
-  Usage[Lane] -= Released;
-}
-
-} // namespace
-
-void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
-
-void *memory::allocShared(uint64_t Bytes, const char *Reason) { // Reason is unused
-  return SharedMemorySmartStack.push(Bytes);
-}
-
-void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { // Reason is unused
-  SharedMemorySmartStack.pop(Ptr, Bytes);
-}
-
-void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { // Reason is unused
-  void *Ptr = malloc(Bytes);
-  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
-    printf("nullptr returned by malloc!\n");
-  return Ptr; // may be nullptr; the failure is only reported in debug mode
-}
-
-void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
-
-///}
-
-bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { // Padding0Val is not compared
-  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
-         (ActiveLevelVar == Other.ActiveLevelVar) &
-         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
-         (RunSchedVar == Other.RunSchedVar) &
-         (RunSchedChunkVar == Other.RunSchedChunkVar);
-}
-
-void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { // one ASSERT per compared field
-  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
-  ASSERT(LevelVar == Other.LevelVar, nullptr);
-  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
-  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
-  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
-  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
-}
-
-void state::TeamStateTy::init(bool IsSPMD) { // IsSPMD is unused; same defaults in both modes
-  ICVState.NThreadsVar = 0;
-  ICVState.LevelVar = 0;
-  ICVState.ActiveLevelVar = 0;
-  ICVState.Padding0Val = 0;
-  ICVState.MaxActiveLevelsVar = 1;
-  ICVState.RunSchedVar = omp_sched_static;
-  ICVState.RunSchedChunkVar = 1;
-  ParallelTeamSize = 1;
-  HasThreadState = false;
-  ParallelRegionFnVar = nullptr;
-}
-
-bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { // ParallelRegionFnVar is not compared
-  return (ICVState == Other.ICVState) &
-         (HasThreadState == Other.HasThreadState) &
-         (ParallelTeamSize == Other.ParallelTeamSize);
-}
-
-void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { // ParallelRegionFnVar is not checked
-  ICVState.assertEqual(Other.ICVState);
-  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
-  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
-}
-
-[[clang::loader_uninitialized]] Local<state::TeamStateTy>
-    ompx::state::TeamState;
-[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
-    ompx::state::ThreadStates;
-
-namespace {
-
-int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
-                             int OutOfBoundsVal = -1) {
-  if (Level == 0) // level 0 always maps to the default
-    return DefaultVal;
-  int LevelVar = omp_get_level();
-  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) // outside [0, current level]
-    return OutOfBoundsVal;
-  int ActiveLevel = icv::ActiveLevel;
-  if (OMP_UNLIKELY(Level != ActiveLevel)) // Val applies only when Level equals the ActiveLevel ICV
-    return DefaultVal;
-  return Val;
-}
-
-} // namespace
-
-void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  SharedMemorySmartStack.init(IsSPMD); // executed by every calling thread
-  if (mapping::isInitialThreadInLevel0(IsSPMD)) { // one thread sets the team-wide state
-    TeamState.init(IsSPMD);
-    ThreadStates = nullptr;
-    KernelEnvironmentPtr = &KernelEnvironment;
-    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
-  }
-}
-
-KernelEnvironmentTy &state::getKernelEnvironment() { // pointer is stored by state::init
-  return *KernelEnvironmentPtr;
-}
-
-KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { // pointer is stored by state::init
-  return *KernelLaunchEnvironmentPtr;
-}
-
-void state::enterDataEnvironment(IdentTy *Ident) { // Ident is unused
-  ASSERT(config::mayUseThreadStates(),
-         "Thread state modified while explicitly disabled!");
-  if (!config::mayUseThreadStates())
-    return;
-  // Push a new ThreadStateTy for this thread, allocating the array lazily.
-  unsigned TId = mapping::getThreadIdInBlock();
-  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
-      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc")); // NOTE(review): not null-checked
-  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
-  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { // array not published yet
-    uint32_t Bytes =
-        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
-    void *ThreadStatesPtr =
-        memory::allocGlobal(Bytes, "Thread state array allocation");
-    __builtin_memset(ThreadStatesPtr, 0, Bytes);
-    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), // publish only if still null
-                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
-                     atomic::seq_cst, atomic::seq_cst))
-      memory::freeGlobal(ThreadStatesPtr, // CAS failed: discard our copy
-                         "Thread state array allocated multiple times");
-    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
-           "Expected valid thread states bit!");
-  }
-  NewThreadState->init(ThreadStates[TId]); // receives this thread's current entry
-  TeamState.HasThreadState = true;
-  ThreadStates[TId] = NewThreadState;
-}
-
-void state::exitDataEnvironment() {
-  ASSERT(config::mayUseThreadStates(),
-         "Thread state modified while explicitly disabled!");
-  // Pop the calling thread's newest thread state, if it has one.
-  unsigned TId = mapping::getThreadIdInBlock();
-  resetStateForThread(TId);
-}
-
-void state::resetStateForThread(uint32_t TId) {
-  if (!config::mayUseThreadStates())
-    return;
-  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) // nothing to pop
-    return;
-  // Free the newest state and restore the one it recorded as its predecessor.
-  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
-  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
-  ThreadStates[TId] = PreviousThreadState;
-}
-
-void state::runAndCheckState(void(Func(void))) {
-  TeamStateTy OldTeamState = TeamState; // snapshot taken before the call
-  OldTeamState.assertEqual(TeamState);
-
-  Func();
-  // Func must leave every field checked by assertEqual unchanged.
-  OldTeamState.assertEqual(TeamState);
-}
-
-void state::assumeInitialState(bool IsSPMD) {
-  TeamStateTy InitialTeamState; // freshly initialized reference copy
-  InitialTeamState.init(IsSPMD);
-  InitialTeamState.assertEqual(TeamState);
-  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
-}
-
-int state::getEffectivePTeamSize() {
-  int PTeamSize = state::ParallelTeamSize;
-  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); // 0 selects the team maximum
-}
-
-extern "C" {
-void omp_set_dynamic(int V) {} // no-op
-
-int omp_get_dynamic(void) { return 0; }
-
-void omp_set_num_threads(int V) { icv::NThreads = V; }
-
-int omp_get_max_threads(void) {
-  int NT = icv::NThreads;
-  return NT > 0 ? NT : mapping::getMaxTeamThreads(); // unset (<= 0) selects the team maximum
-}
-
-int omp_get_level(void) {
-  int LevelVar = icv::Level;
-  ASSERT(LevelVar >= 0, nullptr);
-  return LevelVar;
-}
-
-int omp_get_active_level(void) { return !!icv::ActiveLevel; } // always 0 or 1
-
-int omp_in_parallel(void) { return !!icv::ActiveLevel; }
-
-void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
-  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
-  *ChunkSize = state::RunSchedChunk;
-}
-
-void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
-  icv::RunSched = (int)ScheduleKind;
-  state::RunSchedChunk = ChunkSize;
-}
-
-int omp_get_ancestor_thread_num(int Level) { // 0 by default, -1 when Level is out of range
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
-}
-
-int omp_get_thread_num(void) {
-  return omp_get_ancestor_thread_num(omp_get_level());
-}
-
-int omp_get_team_size(int Level) { // 1 by default, -1 when Level is out of range
-  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
-}
-
-int omp_get_num_threads(void) { // only nesting level 1 reports more than one thread
-  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
-}
-
-int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
-
-int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
-
-void omp_set_nested(int) {} // no-op
-
-int omp_get_nested(void) { return false; }
-
-void omp_set_max_active_levels(int Levels) {
-  icv::MaxActiveLevels = Levels > 0 ? 1 : 0; // clamped to 0 or 1
-}
-
-int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
-
-omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
-
-int omp_get_num_places(void) { return 0; }
-
-int omp_get_place_num_procs(int) { return omp_get_num_procs(); } // place argument ignored
-
-void omp_get_place_proc_ids(int, int *) {
-  // TODO
-}
-
-int omp_get_place_num(void) { return 0; }
-
-int omp_get_partition_num_places(void) { return 0; }
-
-void omp_get_partition_place_nums(int *) {
-  // TODO
-}
-
-int omp_get_cancellation(void) { return 0; }
-
-void omp_set_default_device(int) {} // no-op
-
-int omp_get_default_device(void) { return -1; }
-
-int omp_get_num_devices(void) { return config::getNumDevices(); }
-
-int omp_get_device_num(void) { return config::getDeviceNum(); }
-
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
-
-int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
-
-int omp_get_initial_device(void) { return -1; }
-
-int omp_is_initial_device(void) { return 0; }
-}
-
-extern "C" {
-[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
-  return memory::allocShared(Bytes, "Frontend alloc shared");
-}
-
-[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
-  memory::freeShared(Ptr, Bytes, "Frontend free shared");
-}
-
-void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
-
-void *llvm_omp_target_dynamic_shared_alloc() { // alias of __kmpc_get_dynamic_shared
-  return __kmpc_get_dynamic_shared();
-}
-
-void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
-
-/// Allocate storage in shared memory to communicate arguments from the main
-/// thread to the workers in generic mode. If we exceed
-/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
-constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
-
-[[clang::loader_uninitialized]] static Local<void *>
-    SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-[[clang::loader_uninitialized]] static Local<void **>
-    SharedMemVariableSharingSpacePtr;
-
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
-  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { // fits in the static array
-    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
-  } else { // too many arguments: use global memory
-    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
-        nArgs * sizeof(void *), "new extended args");
-    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
-           "Nullptr returned by malloc!");
-  }
-  *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-
-void __kmpc_end_sharing_variables() { // frees only a global-memory area
-  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
-    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
-}
-
-void __kmpc_get_shared_variables(void ***GlobalArgs) {
-  *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-}
diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp
deleted file mode 100644
index e833423eb265..000000000000
--- a/offload/DeviceRTL/src/Stub.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// This is an empty file used to create a device fatbinary.
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
deleted file mode 100644
index 2f1ed34a3f6d..000000000000
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Include all synchronization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Synchronization.h"
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-
-using namespace ompx;
-
-namespace impl {
-
-/// Atomics
-///
-///{
-///}
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope) {
-  // builtin_amdgcn_atomic_inc32 should expand to this switch when
-  // passed a runtime value, but does not do so yet. Workaround here.
-
-#define ScopeSwitch(ORDER)                                                     \
-  switch (MemScope) {                                                          \
-  case atomic::MemScopeTy::system:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
-  case atomic::MemScopeTy::device:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
-  case atomic::MemScopeTy::workgroup:                                          \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
-  case atomic::MemScopeTy::wavefront:                                          \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront");            \
-  case atomic::MemScopeTy::single:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread");         \
-  }
-
-#define Case(ORDER)                                                            \
-  case ORDER:                                                                  \
-    ScopeSwitch(ORDER)
-
-  switch (Ordering) { // outer switch on ordering, inner (macro) switch on scope
-  default:
-    __builtin_unreachable(); // any ordering not listed below is treated as impossible
-    Case(atomic::relaxed);
-    Case(atomic::acquire);
-    Case(atomic::release);
-    Case(atomic::acq_rel);
-    Case(atomic::seq_cst);
-#undef Case
-#undef ScopeSwitch
-  }
-}
-
-[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
-
-void namedBarrierInit() {
-  // Don't have global ctors, and shared memory is not zero init
-  atomic::store(&namedBarrierTracker, 0u, atomic::release); // both packed counters start at 0
-}
-
-void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
-  // assert(NumThreads % 32 == 0);
-
-  uint32_t WarpSize = mapping::getWarpSize();
-  uint32_t NumWaves = NumThreads / WarpSize;
-
-  fence::team(atomic::acquire);
-
-  // named barrier implementation for amdgcn.
-  // Uses two 16 bit unsigned counters. One for the number of waves to have
-  // reached the barrier, and one to count how many times the barrier has been
-  // passed. These are packed in a single atomically accessed 32 bit integer.
-  // Low bits for the number of waves, assumed zero before this call.
-  // High bits to count the number of times the barrier has been passed.
-
-  // precondition: NumWaves != 0;
-  // invariant: NumWaves * WarpSize == NumThreads;
-  // precondition: NumWaves < 0xffffu;
-
-  // Increment the low 16 bits once, using the lowest active thread.
-  if (mapping::isLeaderInWarp()) {
-    uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                                atomic::relaxed); // commutative
-
-    // Record the number of times the barrier has been passed
-    uint32_t generation = load & 0xffff0000u;
-
-    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
-      // Reached NumWaves in low bits so this is the last wave.
-      // Set low bits to zero and increment high bits
-      load += 0x00010000u; // wrap is safe
-      load &= 0xffff0000u; // because bits zeroed second
-
-      // Reset the wave counter and release the waiting waves
-      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
-    } else {
-      // more waves still to go, spin until generation counter changes
-      do {
-        __builtin_amdgcn_s_sleep(0);
-        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
-      } while ((load & 0xffff0000u) == generation);
-    }
-  }
-  fence::team(atomic::release); // reached by every thread, not only the warp leader
-}
-
-void fenceTeam(atomic::OrderingTy Ordering) { // fence at workgroup scope
-  return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
-}
-
-void fenceKernel(atomic::OrderingTy Ordering) { // fence at device scope
-  return __scoped_atomic_thread_fence(Ordering, atomic::device);
-}
-
-void fenceSystem(atomic::OrderingTy Ordering) { // fence at system scope
-  return __scoped_atomic_thread_fence(Ordering, atomic::system);
-}
-
-void syncWarp(__kmpc_impl_lanemask_t) {
-  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
-  // to enforce convergent behaviour between control flow graphs.
-  __builtin_amdgcn_wave_barrier();
-}
-
-void syncThreads(atomic::OrderingTy Ordering) {
-  if (Ordering != atomic::relaxed) // acq_rel: release before the barrier; otherwise seq_cst
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
-
-  __builtin_amdgcn_s_barrier();
-
-  if (Ordering != atomic::relaxed) // acq_rel: acquire after the barrier; otherwise seq_cst
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
-}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } // same as syncThreads here
-
-// TODO: Don't have wavefront lane locks. Possibly can't have them. Until then, each lock entry point below traps.
-void unsetLock(omp_lock_t *) { __builtin_trap(); }
-int testLock(omp_lock_t *) { __builtin_trap(); }
-void initLock(omp_lock_t *) { __builtin_trap(); }
-void destroyLock(omp_lock_t *) { __builtin_trap(); }
-void setLock(omp_lock_t *) { __builtin_trap(); }
-
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-void unsetCriticalLock(omp_lock_t *Lock) { // stores UNSET with acq_rel ordering
-  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
-}
-
-void setCriticalLock(omp_lock_t *Lock) {
-  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
-  if (mapping::getThreadIdInWarp() == LowestActiveThread) { // one lane per warp takes the lock
-    fenceKernel(atomic::release);
-    while ( // retry until the CAS from UNSET to SET succeeds
-        !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
-      __builtin_amdgcn_s_sleep(32);
-    }
-    fenceKernel(atomic::acquire);
-  }
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope) { // Ordering and MemScope are ignored here
-  return __nvvm_atom_inc_gen_ui(Address, Val);
-}
-
-void namedBarrierInit() {} // nothing to initialize on NVPTX
-
-void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
-  ASSERT(NumThreads % 32 == 0, nullptr);
-
-  // The named barrier for active parallel threads of a team in an L1 parallel
-  // region to synchronize with each other.
-  constexpr int BarrierNo = 7;
-  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
-}
-
-void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } // ordering argument ignored
-
-void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } // ordering argument ignored
-
-void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } // ordering argument ignored
-
-void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
-
-void syncThreads(atomic::OrderingTy Ordering) { // Ordering is unused
-  constexpr int BarrierNo = 8;
-  __nvvm_barrier_sync(BarrierNo);
-}
-
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } // Ordering is unused
-
-constexpr uint32_t OMP_SPIN = 1000;
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-// TODO: This seems to hide a bug in the declare variant handling. If it is
-// called before it is defined
-//       here the overload won't happen. Investigate later!
-void unsetLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
-}
-
-int testLock(omp_lock_t *Lock) { // one attempt, never blocks; nonzero iff this call took the lock
-  return atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, atomic::seq_cst);
-}
-
-void initLock(omp_lock_t *Lock) { unsetLock(Lock); } // init just stores UNSET
-
-void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } // destroy just stores UNSET
-
-void setLock(omp_lock_t *Lock) {
-  // atomic::cas reports success, so keep retrying while it fails.
-  while (!atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
-                      atomic::seq_cst)) {
-    int32_t start = __nvvm_read_ptx_sreg_clock();
-    int32_t now;
-    for (;;) { // back off for OMP_SPIN * block-id clock ticks before retrying
-      now = __nvvm_read_ptx_sreg_clock();
-      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
-        break;
-      }
-    }
-  } // loop exits only once this thread changed the lock from UNSET to SET
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } // critical sections reuse the plain lock
-
-void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } // critical sections reuse the plain lock
-
-#endif
-///}
-
-} // namespace impl
-
-void synchronize::init(bool IsSPMD) {
-  if (!IsSPMD) // the named-barrier state is only set up for non-SPMD kernels
-    impl::namedBarrierInit();
-}
-
-void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-
-void synchronize::threads(atomic::OrderingTy Ordering) { // forwards to impl::syncThreads
-  impl::syncThreads(Ordering);
-}
-
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) { // forwards to impl::syncThreadsAligned
-  impl::syncThreadsAligned(Ordering);
-}
-
-void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
-
-void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
-
-void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
-
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
-                     atomic::MemScopeTy MemScope) { // forwards to the target-specific impl::atomicInc
-  return impl::atomicInc(Addr, V, Ordering, MemScope);
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } // NOTE(review): calls impl::unsetLock, not impl::unsetCriticalLock — confirm
-
-void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } // NOTE(review): __kmpc_critical uses impl::setCriticalLock instead — confirm
-
-extern "C" {
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} // no-op
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} // no-op
-
-int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { // plain barrier; always returns 0
-  __kmpc_barrier(Loc, TId);
-  return 0;
-}
-
-void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
-  if (mapping::isSPMDMode())
-    return __kmpc_barrier_simple_spmd(Loc, TId);
-
-  // Generic parallel regions are run with multiple of the warp size or single
-  // threaded, in the latter case we need to stop here.
-  if (omp_get_num_threads() == 1)
-    return __kmpc_flush(Loc);
-
-  impl::namedBarrier();
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
-  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
-                                                       int32_t TId) {
-  synchronize::threads(atomic::OrderingTy::seq_cst);
-}
-
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { // true only for OpenMP thread 0
-  return omp_get_thread_num() == 0;
-}
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} // no-op
-
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { // true only for thread number Filter
-  return omp_get_thread_num() == Filter;
-}
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} // no-op
-
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { // same test as __kmpc_master
-  return __kmpc_master(Loc, TId);
-}
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
-  // The barrier is explicitly called.
-}
-
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }
-
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
-
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
-
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { // Name's storage is used as the lock
-  impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { // Name's storage is used as the lock
-  impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
-
-void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
-
-void ompx_sync_block(int Ordering) { // Ordering is cast to atomic::OrderingTy unchecked
-  impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
-}
-void ompx_sync_block_acq_rel() {
-  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
-}
-void ompx_sync_block_divergent(int Ordering) { // calls syncThreads rather than syncThreadsAligned
-  impl::syncThreads(atomic::OrderingTy(Ordering));
-}
-} // extern "C"
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
deleted file mode 100644
index d0be0ace50df..000000000000
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Task implementation support.
-//
-// TODO: We should not allocate and execute the task in two steps. A new API is
-//       needed for that though.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "State.h"
-
-using namespace ompx;
-
-extern "C" {
-
-/// Allocates a task descriptor followed by room for the shared values.
-///
-/// \p TaskSizeInclPrivateValues is passed through utils::roundUp with
-/// sizeof(void *), and \p SharedValuesSize is added to get the allocation
-/// size. The descriptor's Payload is set to the descriptor address advanced by
-/// the rounded size and its TaskFn is set to \p TaskFn. The three unnamed
-/// leading parameters are ignored.
-///
-/// \returns the descriptor obtained from memory::allocGlobal.
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
-                                        size_t TaskSizeInclPrivateValues,
-                                        size_t SharedValuesSize,
-                                        TaskFnTy TaskFn) {
-  auto PaddedPrivateSize =
-      utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *));
-  auto AllocationSize = PaddedPrivateSize + SharedValuesSize;
-
-  auto *Descriptor = (TaskDescriptorTy *)memory::allocGlobal(
-      AllocationSize, "explicit task descriptor");
-
-  // The payload starts right behind the padded private part.
-  Descriptor->Payload = utils::advancePtr(Descriptor, PaddedPrivateSize);
-  Descriptor->TaskFn = TaskFn;
-  return Descriptor;
-}
-
-/// Runs \p TaskDescriptor by forwarding to __kmpc_omp_task_with_deps with all
-/// dependence arguments set to zero, and returns that call's result.
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
-                        TaskDescriptorTy *TaskDescriptor) {
-  const int32_t Status =
-      __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-  return Status;
-}
-
-/// Executes the task immediately on the calling thread and releases its
-/// descriptor.
-///
-/// A state::DateEnvironmentRAII object built from \p Loc is alive for the
-/// whole body. The four unnamed dependence parameters are ignored.
-///
-/// \returns always 0.
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor, int32_t,
-                                  void *, int32_t, void *) {
-  state::DateEnvironmentRAII EnvironmentGuard(Loc);
-
-  // Invoke the task body; the first argument is always 0.
-  TaskDescriptor->TaskFn(0, TaskDescriptor);
-
-  // The descriptor is not needed once the task has run.
-  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-  return 0;
-}
-
-/// Starts an if(0) task by entering a data environment for \p Loc.
-/// \p TId and \p TaskDescriptor are not used.
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
-                               TaskDescriptorTy *TaskDescriptor) {
-  state::enterDataEnvironment(Loc);
-}
-
-/// Finishes an if(0) task: leaves the data environment first, then frees
-/// \p TaskDescriptor. \p Loc and \p TId are not used.
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor) {
-  state::exitDataEnvironment();
-  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-}
-
-/// No-op: all arguments are ignored.
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *) {
-}
-
-/// No-op: all arguments are ignored.
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {
-}
-
-/// No-op: all arguments are ignored.
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {
-}
-
-/// Does nothing and always returns 0.
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) {
-  return 0;
-}
-
-/// Does nothing and always returns 0.
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) {
-  return 0;
-}
-
-/// Runs a taskloop as one single task.
-///
-/// Nothing happens when \p *LowerBound is greater than \p *UpperBound.
-/// Otherwise \p TaskDescriptor is executed through
-/// __kmpc_omp_task_with_deps. The unnamed parameters are ignored.
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
-                     TaskDescriptorTy *TaskDescriptor, int,
-                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
-                     int32_t, uint64_t, void *) {
-  const bool IsEmptyIterationSpace = *LowerBound > *UpperBound;
-  if (!IsEmptyIterationSpace) {
-    // The compiler already stored lb and ub in the task descriptor. A single
-    // task executes the entire loop, so the initial descriptor can be used
-    // untouched.
-    __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-  }
-}
-
-/// Always returns 1: every task is treated as final. The specification may
-/// expect the runtime to track more precisely whether a task was made final by
-/// the user; since that is not explicitly specified, the runtime is assumed to
-/// be free to turn a non-final task into a final one.
-int omp_in_final(void) {
-  const int AlwaysFinal = 1;
-  return AlwaysFinal;
-}
-
-/// Always returns 0.
-int omp_get_max_task_priority(void) {
-  const int MaxPriority = 0;
-  return MaxPriority;
-}
-}
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
deleted file mode 100644
index a8759307b42b..000000000000
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ /dev/null
@@ -1,935 +0,0 @@
-//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the KMPC interface
-// for the loop construct plus other worksharing constructs that use the same
-// interface as loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Workshare.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-// TODO:
-/// State of one dispatch loop for one thread. It is filled in by
-/// dispatch_init, read (and for static schedules advanced) by dispatch_next,
-/// and linked into a per-thread stack by pushDST/popDST.
-struct DynamicScheduleTracker {
-  // Chunk size stored by dispatch_init.
-  int64_t Chunk;
-  // Inclusive upper bound of the whole loop.
-  int64_t LoopUpperBound;
-  // Static schedules: lower bound of the next chunk of this thread.
-  // Dynamic/guided schedules: lower bound of the whole loop.
-  int64_t NextLowerBound;
-  // Distance between two successive chunks of the same thread; only written
-  // for the static schedules.
-  int64_t Stride;
-  // Schedule kind chosen by dispatch_init.
-  kmp_sched_t ScheduleType;
-  // Tracker that was current before this one was pushed (see pushDST).
-  DynamicScheduleTracker *NextDST;
-};
-
-// ASSERT0 expands to nothing, so every ASSERT0(...) in this file is compiled
-// out and only documents the expected condition.
-#define ASSERT0(...)
-
-// used by the library for the interface with the app
-// (return values of dispatch_next: 0 = no chunk left, 1 = chunk handed out)
-#define DISPATCH_FINISHED 0
-#define DISPATCH_NOTFINISHED 1
-
-// used by dynamic scheduling
-// (result codes of DynamicNextChunk)
-#define FINISHED 0
-#define NOT_FINISHED 1
-#define LAST_CHUNK 2
-
-// TODO: This variable is a hack inherited from the old runtime.
-// Chunk counter for dynamic/guided schedules: NextIter advances it with an
-// atomic add and dispatch_init resets it to 0.
-[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
-
-template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
-  ////////////////////////////////////////////////////////////////////////////////
-  // Loop with static scheduling with chunk
-
-  // Generic implementation of OMP loop scheduling with static policy
-  /*! \brief Calculate initial bounds for static loop and stride
-   *  @param[in] loc location in code of the call (not used here)
-   *  @param[in] global_tid global thread id
-   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
-   *  @param[in] plastiter pointer to last iteration
-   *  @param[in,out] pointer to loop lower bound. it will contain value of
-   *  lower bound of first chunk
-   *  @param[in,out] pointer to loop upper bound. It will contain value of
-   *  upper bound of first chunk
-   *  @param[in,out] pointer to loop stride. It will contain value of stride
-   *  between two successive chunks executed by the same thread
-   *  @param[in] loop increment bump
-   *  @param[in] chunk size
-   */
-
-  // helper function for static chunk
-  /// Computes the first chunk and the stride of one entity for a chunked
-  /// static schedule.
-  ///
-  /// On entry \p lb and \p ub are the inclusive bounds of the loop; on exit
-  /// they bound the first chunk of \p entityId. \p stride receives
-  /// numberOfEntities * chunk, the distance between two successive chunks of
-  /// the same entity. \p last is set to 1 if the start of the last chunk is a
-  /// whole number of strides away from this entity's lower bound, else to 0.
-  /// \p chunk is used as a divisor and therefore must not be 0.
-  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
-                             T entityId, T numberOfEntities) {
-    // Remember the loop's upper bound before ub is overwritten.
-    const T loopUb = ub;
-
-    // Every entity runs several equally sized chunks (except possibly the
-    // last one); consecutive chunks of one entity are this far apart.
-    stride = numberOfEntities * chunk;
-
-    // Move to the first chunk owned by this entity.
-    lb += entityId * chunk;
-    ub = lb + chunk - 1; // inclusive, Clang uses i <= ub
-
-    // Beginning of the last chunk. Whoever reaches it from its lower bound in
-    // whole strides is the one executing the last chunk.
-    const T lastChunkBegin = loopUb - (loopUb % chunk);
-    last = ((lastChunkBegin - lb) % stride) == 0;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Loop with static scheduling without chunk
-
-  // helper function for static no chunk
-  /// Splits the inclusive range [\p lb, \p ub] into at most one chunk per
-  /// entity, with chunk sizes differing by at most one.
-  ///
-  /// The first `loopSize % numberOfEntities` entities receive one extra
-  /// iteration. On exit \p lb / \p ub bound the chunk of \p entityId, \p chunk
-  /// holds its size, \p stride holds the loop size and \p last is 1 if the
-  /// chunk contains the original upper bound.
-  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
-                               T entityId, T numberOfEntities) {
-    const T inputUb = ub;
-    const T loopSize = inputUb - lb + 1;
-
-    // Base size of every chunk and the iterations that do not divide evenly.
-    chunk = loopSize / numberOfEntities;
-    const T remainder = loopSize - chunk * numberOfEntities;
-
-    if (!(entityId < remainder)) {
-      // All larger chunks come first; skip their extra iterations.
-      lb = lb + entityId * chunk + remainder;
-    } else {
-      // This entity takes one of the left-over iterations.
-      ++chunk;
-      lb = lb + entityId * chunk;
-    }
-
-    ub = lb + chunk - 1; // inclusive, Clang uses i <= ub
-    last = (lb <= inputUb) && (inputUb <= ub);
-    // A stride of the loop size ensures only one chunk is executed.
-    stride = loopSize;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for Static Init
-
-  /// Computes the first chunk, the stride and the last-iteration flag of a
-  /// statically scheduled loop and stores them through the pointer arguments.
-  ///
-  /// \param schedtype  schedule kind; modifiers are removed with
-  ///                   SCHEDULE_WITHOUT_MODIFIERS before it is inspected.
-  /// \param plastiter  [out] last-iteration flag computed by the helper.
-  /// \param plower     [in,out] inclusive loop lower bound on entry, lower
-  ///                   bound of the first chunk on exit.
-  /// \param pupper     [in,out] inclusive loop upper bound on entry, upper
-  ///                   bound of the first chunk on exit.
-  /// \param pstride    [in,out] receives the stride computed by the helper.
-  /// \param chunk      requested chunk size; for the chunked kinds that test
-  ///                   it, a value <= 0 falls through to the next case.
-  ///
-  /// The leading unnamed parameter and \p IsSPMDExecutionMode are not used.
-  /// The thread-based kinds distribute over omp_get_thread_num() /
-  /// omp_get_num_threads(), the `distr` kinds over omp_get_team_num() /
-  /// omp_get_num_teams().
-  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
-                              T *plower, T *pupper, ST *pstride, ST chunk,
-                              bool IsSPMDExecutionMode) {
-    int32_t gtid = omp_get_thread_num();
-    int numberOfActiveOMPThreads = omp_get_num_threads();
-
-    // All warps that are in excess of the maximum requested, do
-    // not execute the loop
-    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
-            "current thread is not needed here; error");
-
-    // Work on local copies and write them back once at the end.
-    int lastiter = 0;
-    T lb = *plower;
-    T ub = *pupper;
-    ST stride = *pstride;
-
-    // init
-    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
-    case kmp_sched_static_chunk: {
-      if (chunk > 0) {
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_static_balanced_chunk: {
-      if (chunk > 0) {
-        // round up to make sure the chunk is enough to cover all iterations
-        T tripCount = ub - lb + 1; // +1 because ub is inclusive
-        T span = (tripCount + numberOfActiveOMPThreads - 1) /
-                 numberOfActiveOMPThreads;
-        // Chunk adjustment: round the per-thread span up to the next multiple
-        // of chunk. Dividing and multiplying (instead of masking with
-        // ~(chunk - 1)) yields a multiple of chunk for every positive chunk,
-        // not only for powers of two, and the same value for powers of two.
-        chunk = ((span + chunk - 1) / chunk) * chunk;
-
-        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
-        T oldUb = ub;
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-        // The enlarged chunk may reach past the loop; clamp it.
-        if (ub > oldUb)
-          ub = oldUb;
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_static_nochunk: {
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-      break;
-    }
-    case kmp_sched_distr_static_chunk: {
-      if (chunk > 0) {
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
-                       omp_get_num_teams());
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_distr_static_nochunk: {
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
-                       omp_get_num_teams());
-      break;
-    }
-    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
-      // Distribute over all threads of all teams at once: the entity id is
-      // the thread's index across the teams.
-      ForStaticChunk(lastiter, lb, ub, stride, chunk,
-                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
-                     omp_get_num_teams() * numberOfActiveOMPThreads);
-      break;
-    }
-    default: {
-      // Unknown schedule kinds are treated like a chunked static schedule.
-      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                     numberOfActiveOMPThreads);
-      break;
-    }
-    }
-    // copy back
-    *plastiter = lastiter;
-    *plower = lb;
-    *pupper = ub;
-    *pstride = stride;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for dispatch Init
-
-  /// Returns 1 if \p schedule lies in the inclusive range
-  /// [kmp_sched_ordered_first, kmp_sched_ordered_last] and 0 otherwise.
-  static int OrderedSchedule(kmp_sched_t schedule) {
-    if (schedule < kmp_sched_ordered_first)
-      return 0;
-    return schedule <= kmp_sched_ordered_last;
-  }
-
-  /// Initialises \p DST for a dispatch loop over the inclusive range
-  /// [\p lb, \p ub].
-  ///
-  /// Schedule modifiers are removed first. If only one thread is active, the
-  /// trip count is at most one, or the schedule is ordered, the loop becomes a
-  /// static schedule whose chunk is the whole trip count. Runtime and auto
-  /// schedules are mapped onto static or dynamic kinds. Static kinds store the
-  /// first chunk and stride of \p threadId in \p DST; dynamic and guided kinds
-  /// store the loop bounds and reset the chunk counter Cnt between two
-  /// barriers.
-  ///
-  /// \param loc       forwarded to __kmpc_barrier.
-  /// \param threadId  entity id used for the static chunk computation and
-  ///                  forwarded to __kmpc_barrier.
-  /// \param schedule  requested schedule kind, possibly with modifiers.
-  /// \param lb        inclusive lower bound of the loop.
-  /// \param ub        inclusive upper bound of the loop.
-  /// \param st        not used.
-  /// \param chunk     requested chunk size.
-  /// \param DST       tracker receiving the schedule state.
-  static void dispatch_init(IdentTy *loc, int32_t threadId,
-                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
-                            DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
-    T tnum = omp_get_num_threads();
-    T tripCount = ub - lb + 1; // +1 because ub is inclusive
-    ASSERT0(LT_FUSSY, threadId < tnum,
-            "current thread is not needed here; error");
-
-    /* Currently just ignore the monotonic and non-monotonic modifiers
-     * (the compiler isn't producing them * yet anyway).
-     * When it is we'll want to look at them somewhere here and use that
-     * information to add to our schedule choice. We shouldn't need to pass
-     * them on, they merely affect which schedule we can legally choose for
-     * various dynamic cases. (In particular, whether or not a stealing scheme
-     * is legal).
-     */
-    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
-
-    // Process schedule.
-    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
-      if (OrderedSchedule(schedule))
-        __kmpc_barrier(loc, threadId);
-      schedule = kmp_sched_static_chunk;
-      chunk = tripCount; // one thread gets the whole loop
-    } else if (schedule == kmp_sched_runtime) {
-      // process runtime
-      omp_sched_t rtSched;
-      int ChunkInt;
-      omp_get_schedule(&rtSched, &ChunkInt);
-      chunk = ChunkInt;
-      switch (rtSched) {
-      case omp_sched_static: {
-        if (chunk > 0)
-          schedule = kmp_sched_static_chunk;
-        else
-          schedule = kmp_sched_static_nochunk;
-        break;
-      }
-      case omp_sched_auto: {
-        schedule = kmp_sched_static_chunk;
-        chunk = 1;
-        break;
-      }
-      case omp_sched_dynamic:
-      case omp_sched_guided: {
-        // Guided is handled like dynamic.
-        schedule = kmp_sched_dynamic;
-        break;
-      }
-      }
-    } else if (schedule == kmp_sched_auto) {
-      schedule = kmp_sched_static_chunk;
-      chunk = 1;
-    } else {
-      // ASSERT(LT_FUSSY,
-      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
-      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
-      //        (long long)chunk);
-    }
-
-    // init schedules
-    if (schedule == kmp_sched_static_chunk) {
-      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_static_balanced_chunk) {
-      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      // round up to make sure the chunk is enough to cover all iterations
-      T span = (tripCount + tnum - 1) / tnum;
-      // Chunk adjustment: round the per-thread span up to the next multiple
-      // of chunk. Dividing and multiplying (instead of masking with
-      // ~(chunk - 1)) yields a multiple of chunk for every positive chunk,
-      // not only for powers of two, and the same value for powers of two.
-      chunk = ((span + chunk - 1) / chunk) * chunk;
-
-      T oldUb = ub;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
-      if (ub > oldUb)
-        ub = oldUb;
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_static_nochunk) {
-      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      // save data
-      DST->ScheduleType = schedule;
-      if (chunk < 1)
-        chunk = 1;
-      DST->Chunk = chunk;
-      DST->LoopUpperBound = ub;
-      DST->NextLowerBound = lb;
-      // Reset the chunk counter: the first barrier precedes the reset by
-      // thread 0 of the block, the second one follows it.
-      __kmpc_barrier(loc, threadId);
-      if (tid == 0) {
-        Cnt = 0;
-        fence::team(atomic::seq_cst);
-      }
-      __kmpc_barrier(loc, threadId);
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for dispatch next
-
-  /// Hands out the next chunk index to every active lane.
-  ///
-  /// The lane with no active lane below it adds the number of active lanes to
-  /// Cnt with a single atomic add. The value returned by that add is then
-  /// shuffled from the leader lane (utils::ffs(mask) - 1) to all active lanes,
-  /// and each lane adds its rank among the active lanes.
-  static uint64_t NextIter() {
-    __kmpc_impl_lanemask_t activeLanes = mapping::activemask();
-    uint32_t leaderLane = utils::ffs(activeLanes) - 1;
-    uint32_t activeCount = utils::popc(activeLanes);
-    // Rank of this lane: number of active lanes with a smaller lane id.
-    unsigned int laneRank = utils::popc(activeLanes & mapping::lanemaskLT());
-
-    uint64_t base = 0;
-    if (laneRank == 0)
-      base = atomic::add(&Cnt, activeCount, atomic::seq_cst);
-
-    // Distribute the leader's value to the other active lanes.
-    base = utils::shuffle(activeLanes, base, leaderLane,
-                          mapping::getWarpSize());
-    return base + laneRank;
-  }
-
-  /// Fetches the next chunk of a dynamically scheduled loop.
-  ///
-  /// The chunk index comes from NextIter(); \p lb and \p ub receive the
-  /// inclusive bounds of that chunk.
-  ///
-  /// \returns FINISHED if the chunk starts past \p loopUpperBound (\p lb and
-  /// \p ub are then set to an empty range), NOT_FINISHED if the chunk ends
-  /// before \p loopUpperBound, and LAST_CHUNK otherwise, in which case \p ub
-  /// is clamped to \p loopUpperBound.
-  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
-                              T loopUpperBound) {
-    const T chunkIndex = NextIter();
-    lb = loopLowerBound + chunkIndex * chunkSize;
-    ub = lb + chunkSize - 1; // Clang uses i <= ub
-
-    // The chunk starts past the loop: hand back an empty range.
-    if (lb > loopUpperBound) {
-      lb = loopUpperBound + 2;
-      ub = loopUpperBound + 1;
-      return FINISHED;
-    }
-
-    // The chunk ends before the loop's upper bound.
-    if (ub < loopUpperBound)
-      return NOT_FINISHED;
-
-    // The chunk reaches or crosses the upper bound: clamp it.
-    ub = loopUpperBound;
-    return LAST_CHUNK;
-  }
-
-  /// Hands the calling thread its next chunk of the loop described by \p DST.
-  ///
-  /// For kmp_sched_static_chunk and kmp_sched_static_nochunk the chunk is
-  /// taken from the tracker, which is then advanced by its stride. For every
-  /// other schedule kind the chunk comes from DynamicNextChunk.
-  ///
-  /// \param plast    [out] 1 if the returned chunk is the last one.
-  /// \param plower   [out] inclusive lower bound of the chunk.
-  /// \param pupper   [out] inclusive upper bound of the chunk.
-  /// \param pstride  [out] set to 1 on the dynamic path; the static path does
-  ///                 not write it.
-  /// \returns DISPATCH_NOTFINISHED if a chunk was handed out (the output
-  /// arguments are valid), DISPATCH_FINISHED otherwise (the output arguments
-  /// are left untouched). \p loc and \p gtid are only referenced by the
-  /// compiled-out ASSERT0.
-  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
-                           T *plower, T *pupper, ST *pstride,
-                           DynamicScheduleTracker *DST) {
-    // ID of a thread in its own warp
-
-    // automatically selects thread or warp ID based on selected implementation
-    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
-            "current thread is not needed here; error");
-    // retrieve schedule
-    kmp_sched_t schedule = DST->ScheduleType;
-
-    // xxx reduce to one
-    if (schedule == kmp_sched_static_chunk ||
-        schedule == kmp_sched_static_nochunk) {
-      T myLb = DST->NextLowerBound;
-      T ub = DST->LoopUpperBound;
-      // finished?
-      if (myLb > ub) {
-        return DISPATCH_FINISHED;
-      }
-      // not finished, save current bounds
-      ST chunk = DST->Chunk;
-      *plower = myLb;
-      T myUb = myLb + chunk - 1; // Clang uses i <= ub
-      // Clamp the chunk to the end of the loop.
-      if (myUb > ub)
-        myUb = ub;
-      *pupper = myUb;
-      *plast = (int32_t)(myUb == ub);
-
-      // increment next lower bound by the stride
-      ST stride = DST->Stride;
-      DST->NextLowerBound = myLb + stride;
-      return DISPATCH_NOTFINISHED;
-    }
-    ASSERT0(LT_FUSSY,
-            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
-            "bad sched");
-    // Dynamic path: DST->NextLowerBound holds the loop's lower bound here and
-    // is not advanced; the chunk index comes from the counter behind NextIter.
-    T myLb, myUb;
-    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
-                                    DST->LoopUpperBound);
-
-    if (finished == FINISHED)
-      return DISPATCH_FINISHED;
-
-    // not finished (either not finished or last chunk)
-    *plast = (int32_t)(finished == LAST_CHUNK);
-    *plower = myLb;
-    *pupper = myUb;
-    *pstride = 1;
-
-    return DISPATCH_NOTFINISHED;
-  }
-
-  /// Counterpart of dispatch_init/dispatch_next for the __kmpc_dispatch_fini_*
-  /// entry points; intentionally does nothing.
-  static void dispatch_fini() {
-    // nothing
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // end of template class that encapsulate all the helper functions
-  ////////////////////////////////////////////////////////////////////////////////
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (dyn loops)
-////////////////////////////////////////////////////////////////////////////////
-
-// TODO: Expand the dispatch API to take a DST pointer which can then be
-//       allocated properly without malloc.
-// For now, each team will contain an LDS pointer (ThreadDST) to a global array
-// of references to the DST structs allocated (in global memory) for each thread
-// in the team. The global memory array is allocated during the init phase if it
-// was not allocated already and will be deallocated when the dispatch phase
-// ends:
-//
-//  __kmpc_dispatch_init
-//
-//  ** Dispatch loop **
-//
-//  __kmpc_dispatch_deinit
-//
-// Pointer to the per-block array of tracker stacks, one slot per thread of the
-// block. Null while no array is allocated (see pushDST, popDST and
-// workshare::init).
-[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
-    ThreadDST;
-
-// Create a new DST, link the current one, and define the new as current.
-// Returns the new, zero-initialised tracker of the calling thread.
-static DynamicScheduleTracker *pushDST() {
-  int32_t ThreadIndex = mapping::getThreadIdInBlock();
-  // Each block will allocate an array of pointers to DST structs. The array is
-  // equal in length to the number of threads in that block.
-  // NOTE(review): every thread evaluates `!ThreadDST` on its own, while the
-  // allocation and the barrier sit inside the branch. Confirm that no thread
-  // can observe the pointer already set and thereby skip the barrier and the
-  // initialisation of its slot.
-  if (!ThreadDST) {
-    // Allocate global memory array of pointers to DST structs:
-    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
-      ThreadDST = static_cast<DynamicScheduleTracker **>(
-          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
-                                  sizeof(DynamicScheduleTracker *),
-                              "new ThreadDST array"));
-    // Wait until the array pointer has been published before using it.
-    synchronize::threads(atomic::seq_cst);
-
-    // Initialize the array pointers:
-    ThreadDST[ThreadIndex] = nullptr;
-  }
-
-  // Create a DST struct for the current thread:
-  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
-      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
-  *NewDST = DynamicScheduleTracker({0});
-
-  // Add the new DST struct to the array of DST structs:
-  // the previous top of this thread's stack becomes the successor.
-  NewDST->NextDST = ThreadDST[ThreadIndex];
-  ThreadDST[ThreadIndex] = NewDST;
-  return NewDST;
-}
-
-// Return the current DST.
-static DynamicScheduleTracker *peekDST() {
-  // The current tracker of a thread is the top entry of its slot.
-  const auto ThreadIndex = mapping::getThreadIdInBlock();
-  return ThreadDST[ThreadIndex];
-}
-
-// Pop the current DST and restore the last one.
-// Frees the calling thread's current tracker and makes the one linked through
-// NextDST current again. Afterwards thread 0 of the block frees the pointer
-// array if its own slot became empty.
-static void popDST() {
-  int32_t ThreadIndex = mapping::getThreadIdInBlock();
-  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
-  // Read the link before the tracker is freed.
-  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
-  memory::freeGlobal(CurrentDST, "remove DST");
-  ThreadDST[ThreadIndex] = OldDST;
-
-  // Check if we need to deallocate the global array. Ensure all threads
-  // in the block have finished deallocating the individual DSTs.
-  synchronize::threads(atomic::seq_cst);
-  // Only thread 0 frees the array, and only its own slot is inspected.
-  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
-    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
-    ThreadDST = nullptr;
-  }
-  // Keep the other threads from continuing until the array has been released.
-  synchronize::threads(atomic::seq_cst);
-}
-
-/// Resets ThreadDST to null. Only the thread for which
-/// mapping::isInitialThreadInLevel0(IsSPMD) holds performs the store.
-void workshare::init(bool IsSPMD) {
-  if (!mapping::isInitialThreadInLevel0(IsSPMD))
-    return;
-  ThreadDST = nullptr;
-}
-
-extern "C" {
-
-// init
-/// Dispatch init for signed 32-bit bounds: pushes a new tracker for the
-/// calling thread and initialises it via dispatch_init.
-void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
-                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int32_t, int32_t>;
-  DynamicScheduleTracker *Tracker = pushDST();
-  const auto Kind = static_cast<kmp_sched_t>(schedule);
-  LoopSupport::dispatch_init(loc, tid, Kind, lb, ub, st, chunk, Tracker);
-}
-
-/// Dispatch init for unsigned 32-bit bounds: pushes a new tracker for the
-/// calling thread and initialises it via dispatch_init.
-void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
-                             uint32_t lb, uint32_t ub, int32_t st,
-                             int32_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint32_t, int32_t>;
-  DynamicScheduleTracker *Tracker = pushDST();
-  const auto Kind = static_cast<kmp_sched_t>(schedule);
-  LoopSupport::dispatch_init(loc, tid, Kind, lb, ub, st, chunk, Tracker);
-}
-
-/// Dispatch init for signed 64-bit bounds: pushes a new tracker for the
-/// calling thread and initialises it via dispatch_init.
-void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
-                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int64_t, int64_t>;
-  DynamicScheduleTracker *Tracker = pushDST();
-  const auto Kind = static_cast<kmp_sched_t>(schedule);
-  LoopSupport::dispatch_init(loc, tid, Kind, lb, ub, st, chunk, Tracker);
-}
-
-/// Dispatch init for unsigned 64-bit bounds: pushes a new tracker for the
-/// calling thread and initialises it via dispatch_init.
-void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
-                             uint64_t lb, uint64_t ub, int64_t st,
-                             int64_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint64_t, int64_t>;
-  DynamicScheduleTracker *Tracker = pushDST();
-  const auto Kind = static_cast<kmp_sched_t>(schedule);
-  LoopSupport::dispatch_init(loc, tid, Kind, lb, ub, st, chunk, Tracker);
-}
-
-// next
-/// Dispatch next for signed 32-bit bounds: forwards the calling thread's
-/// current tracker to dispatch_next and returns its result.
-int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
-                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int32_t, int32_t>;
-  DynamicScheduleTracker *Tracker = peekDST();
-  return LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st,
-                                    Tracker);
-}
-
-/// Dispatch next for unsigned 32-bit bounds: forwards the calling thread's
-/// current tracker to dispatch_next and returns its result.
-int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
-                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint32_t, int32_t>;
-  DynamicScheduleTracker *Tracker = peekDST();
-  return LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st,
-                                    Tracker);
-}
-
-/// Dispatch next for signed 64-bit bounds: forwards the calling thread's
-/// current tracker to dispatch_next and returns its result.
-int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
-                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int64_t, int64_t>;
-  DynamicScheduleTracker *Tracker = peekDST();
-  return LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st,
-                                    Tracker);
-}
-
-/// Dispatch next for unsigned 64-bit bounds: forwards the calling thread's
-/// current tracker to dispatch_next and returns its result.
-int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
-                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint64_t, int64_t>;
-  DynamicScheduleTracker *Tracker = peekDST();
-  return LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st,
-                                    Tracker);
-}
-
-// fini
-/// Dispatch fini for signed 32-bit bounds; forwards to dispatch_fini.
-void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int32_t, int32_t>;
-  LoopSupport::dispatch_fini();
-}
-
-/// Dispatch fini for unsigned 32-bit bounds; forwards to dispatch_fini.
-void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint32_t, int32_t>;
-  LoopSupport::dispatch_fini();
-}
-
-/// Dispatch fini for signed 64-bit bounds; forwards to dispatch_fini.
-void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int64_t, int64_t>;
-  LoopSupport::dispatch_fini();
-}
-
-/// Dispatch fini for unsigned 64-bit bounds; forwards to dispatch_fini.
-void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint64_t, int64_t>;
-  LoopSupport::dispatch_fini();
-}
-
-// deinit
-/// Ends a dispatch loop by popping the calling thread's current tracker.
-void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) {
-  popDST();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (static loops)
-////////////////////////////////////////////////////////////////////////////////
-
-/// Static loop init for signed 32-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
-                              int32_t schedtype, int32_t *plastiter,
-                              int32_t *plower, int32_t *pupper,
-                              int32_t *pstride, int32_t incr, int32_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int32_t, int32_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Static loop init for unsigned 32-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
-                               int32_t schedtype, int32_t *plastiter,
-                               uint32_t *plower, uint32_t *pupper,
-                               int32_t *pstride, int32_t incr, int32_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint32_t, int32_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Static loop init for signed 64-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
-                              int32_t schedtype, int32_t *plastiter,
-                              int64_t *plower, int64_t *pupper,
-                              int64_t *pstride, int64_t incr, int64_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int64_t, int64_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Static loop init for unsigned 64-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
-                               int32_t schedtype, int32_t *plastiter,
-                               uint64_t *plower, uint64_t *pupper,
-                               int64_t *pstride, int64_t incr, int64_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint64_t, int64_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Distribute init for signed 32-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
-                                     int32_t schedtype, int32_t *plastiter,
-                                     int32_t *plower, int32_t *pupper,
-                                     int32_t *pstride, int32_t incr,
-                                     int32_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int32_t, int32_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Distribute init for unsigned 32-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
-                                      int32_t schedtype, int32_t *plastiter,
-                                      uint32_t *plower, uint32_t *pupper,
-                                      int32_t *pstride, int32_t incr,
-                                      int32_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint32_t, int32_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Distribute init for signed 64-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
-                                     int32_t schedtype, int32_t *plastiter,
-                                     int64_t *plower, int64_t *pupper,
-                                     int64_t *pstride, int64_t incr,
-                                     int64_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<int64_t, int64_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Distribute init for unsigned 64-bit bounds; forwards to for_static_init.
-/// \p loc and \p incr are not used.
-void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
-                                      int32_t schedtype, int32_t *plastiter,
-                                      uint64_t *plower, uint64_t *pupper,
-                                      int64_t *pstride, int64_t incr,
-                                      int64_t chunk) {
-  using LoopSupport = omptarget_nvptx_LoopSupport<uint64_t, int64_t>;
-  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
-                               pupper, pstride, chunk, mapping::isSPMDMode());
-}
-
-/// Ends a statically scheduled loop; there is nothing to do.
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
-}
-
-/// Ends a statically scheduled distribute loop; there is nothing to do.
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
-}
-}
-
-namespace ompx {
-
-/// Helper class to hide the generic loop nest and provide the template argument
-/// throughout.
-template <typename Ty> class StaticLoopChunker {
-
-  /// Generic loop nest that handles block and/or thread distribution in the
-  /// absence of user specified chunk sizes. This implicitly picks a block chunk
-  /// size equal to the number of threads in the block and a thread chunk size
-  /// equal to one. In contrast to the chunked version we can get away with a
-  /// single loop in this case
-  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
-                                        Ty NumBlocks, Ty BId, Ty NumThreads,
-                                        Ty TId, Ty NumIters,
-                                        bool OneIterationPerThread) {
-    // Number of iterations covered by all threads of all blocks in one round.
-    const Ty RoundSize = NumBlocks * NumThreads;
-
-    // First index of this thread in the normalized iteration space.
-    Ty IV = BId * NumThreads + TId;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Walk the iteration space round by round. Assumptions in the caller
-    // might allow this loop to collapse into a single conditional.
-    while (IV < NumIters) {
-      LoopBody(IV, Arg);
-
-      // One block chunk and one thread chunk are done now.
-      IV += RoundSize;
-
-      // The caller guarantees that a single iteration per thread suffices.
-      if (OneIterationPerThread)
-        return;
-    }
-  }
-
-  /// Generic loop nest that handles block and/or thread distribution in the
-  /// presence of user specified chunk sizes (for at least one of them).
-  /// \p LoopBody is called as LoopBody(IV, Arg) for every index this thread
-  /// executes. The function returns as soon as an index reaches \p NumIters,
-  /// or after the first executed iteration when \p OneIterationPerThread is
-  /// set. \p NumThreads is not used.
-  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
-                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
-                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
-                                        Ty NumIters,
-                                        bool OneIterationPerThread) {
-    // Iterations covered by all blocks in one round.
-    Ty KernelIteration = NumBlocks * BlockChunk;
-
-    // Start index in the chunked space.
-    // NOTE(review): the start offset uses TId while the remaining-chunk
-    // computation below uses TId * ThreadChunk; confirm this is intended.
-    Ty IV = BId * BlockChunk + TId;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Cover the entire iteration space, assumptions in the caller might allow
-    // to simplify this loop to a conditional.
-    do {
-
-      // Part of the block chunk that starts at this thread's offset, clamped
-      // at 0, and the share of it this thread runs (at most ThreadChunk).
-      Ty BlockChunkLeft =
-          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
-          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-
-      while (ThreadChunkLeft--) {
-
-        // Given the blocking it's hard to keep track of what to execute.
-        if (IV >= NumIters)
-          return;
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        if (OneIterationPerThread)
-          return;
-
-        ++IV;
-      }
-
-      // NOTE(review): IV has already been advanced by the iterations executed
-      // above when KernelIteration is added; confirm the resulting start of
-      // the next round is the intended one.
-      IV += KernelIteration;
-
-    } while (IV < NumIters);
-  }
-
-public:
-  /// Worksharing `for`-loop.
-  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(ThreadChunk >= 0, "Bad thread count");
-
-    // All threads need to participate but we don't know if we are in a
-    // parallel at all or if the user might have used a `num_threads` clause
-    // on the parallel and reduced the number compared to the block size.
-    // Since nested parallels are possible too we need to get the thread id
-    // from the `omp` getter and not the mapping directly.
-    Ty TId = omp_get_thread_num();
-
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
-    Ty NumBlocks = 1;
-    Ty BId = 0;
-
-    // If the thread chunk is not specified we pick a default now.
-    if (ThreadChunk == 0)
-      ThreadChunk = 1;
-
-    // If we know we have more threads than iterations we can indicate that to
-    // avoid an outer loop.
-    bool OneIterationPerThread = false;
-    if (config::getAssumeThreadsOversubscription()) {
-      ASSERT(NumThreads >= NumIters, "Broken assumption");
-      OneIterationPerThread = true;
-    }
-
-    if (ThreadChunk != 1)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-  }
-
-  /// Worksharing `distribute`-loop.
-  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                         Ty NumIters, Ty BlockChunk) {
-    ASSERT(icv::Level == 0, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(BlockChunk >= 0, "Bad block count");
-
-    // There are no threads involved here.
-    Ty ThreadChunk = 0;
-    Ty NumThreads = 1;
-    Ty TId = 0;
-
-    // All teams need to participate.
-    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
-    Ty BId = mapping::getBlockIdInKernel();
-
-    // If the block chunk is not specified we pick a default now.
-    if (BlockChunk == 0)
-      BlockChunk = NumThreads;
-
-    // If we know we have more blocks than iterations we can indicate that to
-    // avoid an outer loop.
-    bool OneIterationPerThread = false;
-    if (config::getAssumeTeamsOversubscription()) {
-      ASSERT(NumBlocks >= NumIters, "Broken assumption");
-      OneIterationPerThread = true;
-    }
-
-    if (BlockChunk != NumThreads)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-
-    ASSERT(icv::Level == 0, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-  }
-
-  /// Worksharing `distribute parallel for`-loop.
-  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
-                            void *Arg, Ty NumIters, Ty NumThreads,
-                            Ty BlockChunk, Ty ThreadChunk) {
-    ASSERT(icv::Level == 1, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(BlockChunk >= 0, "Bad block count");
-    ASSERT(ThreadChunk >= 0, "Bad thread count");
-
-    // All threads need to participate but the user might have used a
-    // `num_threads` clause on the parallel and reduced the number compared to
-    // the block size.
-    Ty TId = mapping::getThreadIdInBlock();
-
-    // All teams need to participate.
-    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
-    Ty BId = mapping::getBlockIdInKernel();
-
-    // If the block chunk is not specified we pick a default now.
-    if (BlockChunk == 0)
-      BlockChunk = NumThreads;
-
-    // If the thread chunk is not specified we pick a default now.
-    if (ThreadChunk == 0)
-      ThreadChunk = 1;
-
-    // If we know we have more threads (across all blocks) than iterations we
-    // can indicate that to avoid an outer loop.
-    bool OneIterationPerThread = false;
-    if (config::getAssumeTeamsOversubscription() &
-        config::getAssumeThreadsOversubscription()) {
-      OneIterationPerThread = true;
-      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
-    }
-
-    if (BlockChunk != NumThreads || ThreadChunk != 1)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-
-    ASSERT(icv::Level == 1, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-  }
-};
-
-} // namespace ompx
-
-#define OMP_LOOP_ENTRY(BW, TY)                                                 \
-  [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_for_static_loop##BW(                                   \
-          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,       \
-          TY num_threads, TY block_chunk, TY thread_chunk) {                   \
-    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
-        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk);      \
-  }                                                                            \
-  [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),  \
-                                        void *arg, TY num_iters,               \
-                                        TY block_chunk) {                      \
-    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters,           \
-                                            block_chunk);                      \
-  }                                                                            \
-  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
-      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
-      TY num_threads, TY thread_chunk) {                                       \
-    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads,     \
-                                     thread_chunk);                            \
-  }
-
-extern "C" {
-OMP_LOOP_ENTRY(_4, int32_t)
-OMP_LOOP_ENTRY(_4u, uint32_t)
-OMP_LOOP_ENTRY(_8, int64_t)
-OMP_LOOP_ENTRY(_8u, uint64_t)
-}
diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake
index 8e955ff39927..ef8cf34ba0c8 100644
--- a/offload/cmake/OpenMPTesting.cmake
+++ b/offload/cmake/OpenMPTesting.cmake
@@ -57,7 +57,7 @@ if (${OPENMP_STANDALONE_BUILD})
   if (MSVC OR XCODE)
     set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar")
   endif()
-  if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  if ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX")
     set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=1800")
   endif()
   set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.")
diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake
index 0236f5f0b698..5a27a81c736b 100644
--- a/offload/cmake/caches/AMDGPUBot.cmake
+++ b/offload/cmake/caches/AMDGPUBot.cmake
@@ -15,7 +15,10 @@ set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload;flang-rt" CACHE STRING "")
 set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
 set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
 set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "")
-set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "")
+set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 16" CACHE STRING "")
 
 set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
 set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
+
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "openmp" CACHE STRING "")
diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake
index a772043c7966..798f080a41ad 100644
--- a/offload/cmake/caches/AMDGPULibcBot.cmake
+++ b/offload/cmake/caches/AMDGPULibcBot.cmake
@@ -17,5 +17,6 @@ set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
 set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
 
 set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
-set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;libc;libcxxabi;libcxx" CACHE STRING "")
 set(RUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_JOBS 4 CACHE STRING "")
diff --git a/offload/cmake/caches/Offload.cmake b/offload/cmake/caches/Offload.cmake
index 5533a6508f5d..3747a1d3eb29 100644
--- a/offload/cmake/caches/Offload.cmake
+++ b/offload/cmake/caches/Offload.cmake
@@ -5,5 +5,5 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
 set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda CACHE STRING "") 
 set(RUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/NVPTX.cmake" CACHE STRING "")
 set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "")
-set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
-set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h
index b9f5c1658293..45bd9c6e7da8 100644
--- a/offload/include/OpenMP/Mapping.h
+++ b/offload/include/OpenMP/Mapping.h
@@ -49,9 +49,46 @@ public:
 /// Information about shadow pointers.
 struct ShadowPtrInfoTy {
   void **HstPtrAddr = nullptr;
-  void *HstPtrVal = nullptr;
   void **TgtPtrAddr = nullptr;
-  void *TgtPtrVal = nullptr;
+  int64_t PtrSize = sizeof(void *); // Size of the pointer/descriptor
+
+  // Store the complete contents for both host and target pointers/descriptors.
+  // 96 bytes is chosen as the "Small" size to cover simple Fortran
+  // descriptors of up to 3 dimensions.
+  llvm::SmallVector<char, 96> HstPtrContent;
+  llvm::SmallVector<char, 96> TgtPtrContent;
+
+  ShadowPtrInfoTy(void **HstPtrAddr, void **TgtPtrAddr, void *TgtPteeBase,
+                  int64_t PtrSize)
+      : HstPtrAddr(HstPtrAddr), TgtPtrAddr(TgtPtrAddr), PtrSize(PtrSize),
+        HstPtrContent(PtrSize), TgtPtrContent(PtrSize) {
+    constexpr int64_t VoidPtrSize = sizeof(void *);
+    assert(HstPtrAddr != nullptr && "HstPtrAddr is nullptr");
+    assert(TgtPtrAddr != nullptr && "TgtPtrAddr is nullptr");
+    assert(PtrSize >= VoidPtrSize && "PtrSize is less than sizeof(void *)");
+
+    void *HstPteeBase = *HstPtrAddr;
+    // The first VoidPtrSize bytes for HstPtrContent/TgtPtrContent are from
+    // HstPteeBase/TgtPteeBase.
+    std::memcpy(HstPtrContent.data(), &HstPteeBase, VoidPtrSize);
+    std::memcpy(TgtPtrContent.data(), &TgtPteeBase, VoidPtrSize);
+
+    // If we are not dealing with a Fortran descriptor (i.e. the pointer is no
+    // larger than VoidPtrSize), there is nothing more to copy.
+    if (PtrSize <= VoidPtrSize)
+      return;
+
+    // For larger pointers, i.e. Fortran descriptors, the remaining contents of
+    // the descriptor come from the host descriptor, i.e. HstPtrAddr.
+    std::memcpy(HstPtrContent.data() + VoidPtrSize,
+                reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize,
+                PtrSize - VoidPtrSize);
+    std::memcpy(TgtPtrContent.data() + VoidPtrSize,
+                reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize,
+                PtrSize - VoidPtrSize);
+  }
+
+  ShadowPtrInfoTy() = delete;
 
   bool operator==(const ShadowPtrInfoTy &Other) const {
     return HstPtrAddr == Other.HstPtrAddr;
@@ -243,9 +280,25 @@ public:
     auto Pair = States->ShadowPtrInfos.insert(ShadowPtrInfo);
     if (Pair.second)
       return true;
+
     // Check for a stale entry, if found, replace the old one.
-    if ((*Pair.first).TgtPtrVal == ShadowPtrInfo.TgtPtrVal)
+
+    // For Fortran descriptors, we need to compare their full contents,
+    // as the starting address may be the same while other fields have
+    // been updated. e.g.
+    //
+    //   !$omp target enter data map(x(1:100)) !             (1)
+    //   p => x(10: 19)
+    //   !$omp target enter data map(p, p(:)) !              (2)
+    //   p => x(5: 9)
+    //   !$omp target enter data map(attach(always): p(:)) ! (3)
+    //
+    // While &desc_p and &p(1) (TgtPtrAddr and the first "sizeof(void*)" bytes
+    // of TgtPtrContent) are the same for (2) and (3), the pointer attachment
+    // for (3) must update the bounds information in p's device-side descriptor.
+    if ((*Pair.first).TgtPtrContent == ShadowPtrInfo.TgtPtrContent)
       return false;
+
     States->ShadowPtrInfos.erase(ShadowPtrInfo);
     return addShadowPointer(ShadowPtrInfo);
   }
@@ -417,12 +470,42 @@ struct MapperComponentsTy {
 typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
                                 void *);
 
+/// Structure to store information about a single ATTACH map entry.
+struct AttachMapInfo {
+  void *PointerBase;
+  void *PointeeBegin;
+  int64_t PointerSize;
+  int64_t MapType;
+  map_var_info_t Pointername;
+
+  AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size,
+                int64_t Type, map_var_info_t Name)
+      : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size),
+        MapType(Type), Pointername(Name) {}
+};
+
+/// Structure to track ATTACH entries and new allocations across recursive calls
+/// (for handling mappers) to targetDataBegin for a given construct.
+struct AttachInfoTy {
+  /// ATTACH map entries for deferred processing.
+  llvm::SmallVector<AttachMapInfo> AttachEntries;
+
+  /// Key: host pointer, Value: allocation size.
+  llvm::DenseMap<void *, int64_t> NewAllocations;
+
+  AttachInfoTy() = default;
+
+  // Delete copy constructor and copy assignment operator to prevent copying
+  AttachInfoTy(const AttachInfoTy &) = delete;
+  AttachInfoTy &operator=(const AttachInfoTy &) = delete;
+};
+
 // Function pointer type for targetData* functions (targetDataBegin,
 // targetDataEnd and targetDataUpdate).
 typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
                                    void **, int64_t *, int64_t *,
                                    map_var_info_t *, void **, AsyncInfoTy &,
-                                   bool);
+                                   AttachInfoTy *, bool);
 
 void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
                                bool toStdOut = false);
@@ -431,20 +514,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                    AttachInfoTy *AttachInfo = nullptr,
                     bool FromMapper = false);
 
 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                  bool FromMapper = false);
+                  AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false);
 
 int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                      void **ArgsBase, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
                      void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                     AttachInfoTy *AttachInfo = nullptr,
                      bool FromMapper = false);
 
+// Process deferred ATTACH map entries collected during targetDataBegin.
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
+                         AsyncInfoTy &AsyncInfo);
+
 struct MappingInfoTy {
   MappingInfoTy(DeviceTy &Device) : Device(Device) {}
 
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3f..bf93ce0460ae 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -33,7 +33,9 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 
+#include "GlobalHandler.h"
 #include "PluginInterface.h"
+
 using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
 
 // Forward declarations.
@@ -98,6 +100,10 @@ struct DeviceTy {
   int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                        int64_t Size, AsyncInfoTy &AsyncInfo);
 
+  // Insert a data fence between previous data operations and the following
+  // operations if necessary for the device.
+  int32_t dataFence(AsyncInfoTy &AsyncInfo);
+
   /// Notify the plugin about a new mapping starting at the host address
   /// \p HstPtr and \p Size bytes.
   int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 625bbaa0db85..794b79e07674 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -77,6 +77,9 @@ enum tgt_map_type {
   // the structured region
   // This is an OpenMP extension for the sake of OpenACC support.
   OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000,
+  // Attach pointer and pointee, after processing all other maps.
+  // Applicable to map-entering directives. Does not change ref-count.
+  OMP_TGT_MAPTYPE_ATTACH = 0x4000,
   // descriptor for non-contiguous target-update
   OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000,
   // member of struct, member given by [16 MSBs] - 1
@@ -98,8 +101,6 @@ enum TargetAllocTy : int32_t {
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
   TARGET_ALLOC_DEFAULT,
-  /// The allocation will not block on other streams.
-  TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };
 
 inline KernelArgsTy CTorDTorKernelArgs = {
diff --git a/offload/liboffload/API/APIDefs.td b/offload/liboffload/API/APIDefs.td
index 640932dcf846..ea3896fc3103 100644
--- a/offload/liboffload/API/APIDefs.td
+++ b/offload/liboffload/API/APIDefs.td
@@ -31,6 +31,13 @@ class IsHandleType<string Type> {
                 !ne(!find(Type, "_handle_t", !sub(!size(Type), 9)), -1));
 }
 
+// Does the type end with '_cb_t'?
+class IsCallbackType<string Type> {
+  // size("_cb_t") == 5
+  bit ret = !if(!lt(!size(Type), 5), 0,
+                !ne(!find(Type, "_cb_t", !sub(!size(Type), 5)), -1));
+}
+
 // Does the type end with '*'?
 class IsPointerType<string Type> {
   bit ret = !ne(!find(Type, "*", !sub(!size(Type), 1)), -1);
@@ -58,6 +65,7 @@ class Param<string Type, string Name, string Desc, bits<3> Flags = 0> {
   TypeInfo type_info = TypeInfo<"", "">;
   bit IsHandle = IsHandleType<type>.ret;
   bit IsPointer = IsPointerType<type>.ret;
+  bit IsCallback = IsCallbackType<type>.ret;
 }
 
 // A parameter whose range is described by other parameters in the function.
@@ -81,7 +89,7 @@ class ShouldCheckHandle<Param P> {
 }
 
 class ShouldCheckPointer<Param P> {
-  bit ret = !and(P.IsPointer, !eq(!and(PARAM_OPTIONAL, P.flags), 0));
+  bit ret = !and(!or(P.IsPointer, P.IsCallback), !eq(!and(PARAM_OPTIONAL, P.flags), 0));
 }
 
 // For a list of returns that contains a specific return code, find and append
@@ -137,7 +145,6 @@ defvar DefaultReturns = [Return<PREFIX#"_RESULT_SUCCESS">,
                          Return<PREFIX#"_ERRC_DEVICE_LOST">];
 
 class APIObject {
-  string name;
   string desc;
 }
 
@@ -168,6 +175,10 @@ class Enum : APIObject {
   // all Etor values must be TaggedEtor records
   bit is_typed = 0;
 
+  // This refers to whether the enumerator is used to name bits of a bit field,
+  // where consecutive values are bit-shifted rather than incremented.
+  bit is_bit_field = 0;
+
   list<Etor> etors = [];
 }
 
diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 6eaf604c8ebb..ac27d85b6c96 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -10,77 +10,64 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Macro {
-  let name = "OL_VERSION_MAJOR";
+def OL_VERSION_MAJOR : Macro {
   let desc = "Major version of the Offload API";
   let value = "0";
 }
 
-def : Macro {
-  let name = "OL_VERSION_MINOR";
+def OL_VERSION_MINOR : Macro {
   let desc = "Minor version of the Offload API";
   let value = "0";
 }
 
-def : Macro {
-  let name = "OL_VERSION_PATCH";
+def OL_VERSION_PATCH : Macro {
   let desc = "Patch version of the Offload API";
   let value = "1";
 }
 
-def : Macro {
-  let name = "OL_APICALL";
+def OL_APICALL : Macro {
   let desc = "Calling convention for all API functions";
   let condition = "defined(_WIN32)";
   let value = "__cdecl";
   let alt_value = "";
 }
 
-def : Macro {
-  let name = "OL_APIEXPORT";
+def OL_APIEXPORT : Macro {
   let desc = "Microsoft-specific dllexport storage-class attribute";
   let condition = "defined(_WIN32)";
   let value = "__declspec(dllexport)";
   let alt_value = "";
 }
 
-def : Handle {
-  let name = "ol_platform_handle_t";
+def ol_platform_handle_t : Handle {
   let desc = "Handle of a platform instance";
 }
 
-def : Handle {
-  let name = "ol_device_handle_t";
+def ol_device_handle_t : Handle {
   let desc = "Handle of platform's device object";
 }
 
-def : Handle {
-  let name = "ol_context_handle_t";
+def ol_context_handle_t : Handle {
   let desc = "Handle of context object";
 }
 
-def : Handle {
-  let name = "ol_queue_handle_t";
+def ol_queue_handle_t : Handle {
   let desc = "Handle of queue object";
 }
 
-def : Handle {
-  let name = "ol_event_handle_t";
+def ol_event_handle_t : Handle {
   let desc = "Handle of event object";
 }
 
-def : Handle {
-  let name = "ol_program_handle_t";
+def ol_program_handle_t : Handle {
   let desc = "Handle of program object";
 }
 
-def : Handle {
-  let name = "ol_symbol_handle_t";
+def ol_symbol_handle_t : Handle {
   let desc = "Handle of an object in a device's memory for a specific program";
 }
 
-def ErrorCode : Enum {
-  let name = "ol_errc_t";
+def ol_errc_t : Enum {
   let desc = "Defines Return/Error codes";
   let etors =[
     Etor<"SUCCESS", "success">,
@@ -115,8 +102,7 @@ def ErrorCode : Enum {
   ];
 }
 
-def : Struct {
-  let name = "ol_error_struct_t";
+def ol_error_struct_t : Struct {
   let desc = "Details of the error condition returned by an API call";
   let members = [
     StructMember<"ol_errc_t", "Code", "The error code">,
@@ -124,20 +110,17 @@ def : Struct {
   ];
 }
 
-def : Typedef {
-  let name = "ol_result_t";
+def ol_result_t : Typedef {
   let desc = "Result type returned by all entry points.";
-  let value = "const ol_error_struct_t*";
+  let value = "const struct ol_error_struct_t*";
 }
 
-def : Macro {
-  let name = "OL_SUCCESS";
+def OL_SUCCESS : Macro {
   let desc = "Success condition";
   let value = "NULL";
 }
 
-def : Struct {
-  let name = "ol_code_location_t";
+def ol_code_location_t : Struct {
   let desc = "Code location information that can optionally be associated with an API call";
   let members = [
     StructMember<"const char*", "FunctionName", "Function name">,
@@ -147,8 +130,7 @@ def : Struct {
   ];
 }
 
-def : Struct {
-  let name = "ol_dimensions_t";
+def ol_dimensions_t : Struct {
   let desc = "A three element vector";
   let members = [
     StructMember<"uint32_t", "x", "X">,
@@ -157,8 +139,7 @@ def : Struct {
   ];
 }
 
-def : Function {
-  let name = "olInit";
+def olInit : Function {
   let desc = "Perform initialization of the Offload library and plugins";
   let details = [
     "This must be the first API call made by a user of the Offload library",
@@ -168,8 +149,7 @@ def : Function {
   let returns = [];
 }
 
-def : Function {
-  let name = "olShutDown";
+def olShutDown : Function {
   let desc = "Release the resources in use by Offload";
   let details = [
     "This decrements an internal reference count. When this reaches 0, all resources will be released",
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index 857c596124b2..5b54c79d83f9 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_device_type_t";
+def ol_device_type_t : Enum {
   let desc = "Supported device types.";
   let etors =[
     Etor<"DEFAULT", "The default device type as preferred by the runtime">,
@@ -22,23 +21,54 @@ def : Enum {
   ];
 }
 
-def DeviceInfo : Enum {
-  let name = "ol_device_info_t";
+def ol_device_info_t : Enum {
   let desc = "Supported device info.";
   let is_typed = 1;
-  let etors =[
+  list<TaggedEtor> basic_etors =[
     TaggedEtor<"TYPE", "ol_device_type_t", "type of the device">,
     TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">,
     TaggedEtor<"NAME", "char[]", "Device name">,
+    TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">,
     TaggedEtor<"VENDOR", "char[]", "Device vendor">,
     TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">,
     TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">,
     TaggedEtor<"MAX_WORK_GROUP_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work group size in each dimension">,
+    TaggedEtor<"MAX_WORK_SIZE", "uint32_t", "Maximum total work items">,
+    TaggedEtor<"MAX_WORK_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work items in each dimension">,
+    TaggedEtor<"VENDOR_ID", "uint32_t", "A unique vendor device identifier assigned by PCI-SIG">,
+    TaggedEtor<"NUM_COMPUTE_UNITS", "uint32_t", "The number of parallel compute units available to the device">,
+    TaggedEtor<"MAX_CLOCK_FREQUENCY", "uint32_t", "The maximum configured clock frequency of this device in MHz">,
+    TaggedEtor<"MEMORY_CLOCK_RATE", "uint32_t", "Memory clock frequency in MHz">,
+    TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">,
+    TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">,
+    TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">,
   ];
+  list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
+  list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
+  let etors = !listconcat(basic_etors, fp_configs, native_vec_widths);
+}
+
+def ol_device_fp_capability_flag_t : Enum {
+  let desc = "Device floating-point capability flags";
+  let is_bit_field = 1;
+  let etors =[
+    Etor<"CORRECTLY_ROUNDED_DIVIDE_SQRT", "Support correctly rounded divide and sqrt">,
+    Etor<"ROUND_TO_NEAREST", "Support round to nearest">,
+    Etor<"ROUND_TO_ZERO", "Support round to zero">,
+    Etor<"ROUND_TO_INF", "Support round to infinity">,
+    Etor<"INF_NAN", "Support INF to NAN">,
+    Etor<"DENORM", "Support denorm">,
+    Etor<"FMA", "Support fused multiply-add">,
+    Etor<"SOFT_FLOAT", "Basic floating point operations implemented in software">,
+  ];
+}
+
+def ol_device_fp_capability_flags_t : Typedef {
+  let desc = "Device floating-point capability flags";
+  let value = "uint32_t";
 }
 
-def : FptrTypedef {
-  let name = "ol_device_iterate_cb_t";
+def ol_device_iterate_cb_t : FptrTypedef {
   let desc = "User-provided function to be used with `olIterateDevices`";
   let params = [
     Param<"ol_device_handle_t", "Device", "the device handle of the current iteration", PARAM_IN>,
@@ -47,8 +77,7 @@ def : FptrTypedef {
   let return = "bool";
 }
 
-def : Function {
-  let name = "olIterateDevices";
+def olIterateDevices : Function {
   let desc = "Iterates over all available devices, calling the callback for each device.";
   let details = [
     "If the user-provided callback returns `false`, the iteration is stopped."
@@ -62,8 +91,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetDeviceInfo";
+def olGetDeviceInfo : Function {
   let desc = "Queries the given property of the device.";
   let details = [];
   let params = [
@@ -86,8 +114,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetDeviceInfoSize";
+def olGetDeviceInfoSize : Function {
   let desc = "Returns the storage size of the given device query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 9d217ae23038..075bf5bafaa6 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Function {
-    let name = "olCreateEvent";
+def olCreateEvent : Function {
     let desc = "Enqueue an event to `Queue` and return it.";
     let details = [
       "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
@@ -23,8 +22,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olDestroyEvent";
+def olDestroyEvent : Function {
     let desc = "Destroy the event and free all underlying resources.";
     let details = [];
     let params = [
@@ -33,8 +31,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olSyncEvent";
+def olSyncEvent : Function {
     let desc = "Block the calling thread until the event is complete.";
     let details = [];
     let params = [
@@ -43,17 +40,16 @@ def : Function {
     let returns = [];
 }
 
-def : Enum {
-  let name = "ol_event_info_t";
+def ol_event_info_t : Enum {
   let desc = "Supported event info.";
   let is_typed = 1;
   let etors = [
-    TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device.">
+    TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device.">,
+    TaggedEtor<"IS_COMPLETE", "bool", "True if and only if the event is complete.">,
   ];
 }
 
-def : Function {
-  let name = "olGetEventInfo";
+def olGetEventInfo : Function {
   let desc = "Queries the given property of the event.";
   let details = [
     "`olGetEventInfoSize` can be used to query the storage size "
@@ -77,8 +73,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetEventInfoSize";
+def olGetEventInfoSize : Function {
   let desc = "Returns the storage size of the given event query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 502fb36467db..2f5692a19d71 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains Offload API definitions related to launching kernels
+// This file contains Offload API definitions related to kernels
 //
 //===----------------------------------------------------------------------===//
 
-def : Struct {
-    let name = "ol_kernel_launch_size_args_t";
+def ol_kernel_launch_size_args_t : Struct {
     let desc = "Size-related arguments for a kernel launch.";
     let members = [
         StructMember<"size_t", "Dimensions", "Number of work dimensions">,
@@ -21,8 +20,7 @@ def : Struct {
     ];
 }
 
-def : Function {
-    let name = "olLaunchKernel";
+def olLaunchKernel : Function {
     let desc = "Enqueue a kernel launch with the specified size and parameters.";
     let details = [
         "If a queue is not specified, kernel execution happens synchronously",
@@ -42,3 +40,20 @@ def : Function {
         Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
     ];
 }
+
+def olCalculateOptimalOccupancy : Function {
+    let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
+    let details = [
+        "For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
+    ];
+    let params = [
+        Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+        Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
+        Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
+    ];
+    let returns = [
+        Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+        Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
+    ];
+}
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 5f7158588bc7..79e803833004 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_alloc_type_t";
+def ol_alloc_type_t : Enum {
   let desc = "Represents the type of allocation made with olMemAlloc.";
   let etors = [
     Etor<"HOST", "Host allocation">,
@@ -20,9 +19,11 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olMemAlloc";
+def olMemAlloc : Function {
   let desc = "Creates a memory allocation on the specified device.";
+  let details = [
+      "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory."
+  ];
   let params = [
     Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
     Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>,
@@ -36,8 +37,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olMemFree";
+def olMemFree : Function {
   let desc = "Frees a memory allocation previously made by olMemAlloc.";
   let params = [
     Param<"void*", "Address", "address of the allocation to free", PARAM_IN>,
@@ -45,8 +45,57 @@ def : Function {
   let returns = [];
 }
 
-def : Function {
-    let name = "olMemcpy";
+def ol_mem_info_t : Enum {
+  let desc = "Supported memory info.";
+  let is_typed = 1;
+  let etors = [
+    TaggedEtor<"DEVICE", "ol_device_handle_t", "The handle of the device associated with the allocation.">,
+    TaggedEtor<"BASE", "void *", "Base address of this allocation.">,
+    TaggedEtor<"SIZE", "size_t", "Size of this allocation in bytes.">,
+    TaggedEtor<"TYPE", "ol_alloc_type_t", "Type of this allocation.">,
+  ];
+}
+
+def olGetMemInfo : Function {
+  let desc = "Queries the given property of a memory allocation allocated with olMemAlloc.";
+  let details = [
+    "`olGetMemInfoSize` can be used to query the storage size required for the given query.",
+    "The provided pointer can point to any location inside the allocation.",
+  ];
+  let params = [
+    Param<"const void *", "Ptr", "pointer to the allocated memory", PARAM_IN>,
+    Param<"ol_mem_info_t", "PropName", "type of the info to retrieve", PARAM_IN>,
+    Param<"size_t", "PropSize", "the number of bytes pointed to by PropValue.", PARAM_IN>,
+    TypeTaggedParam<"void*", "PropValue", "array of bytes holding the info. "
+      "If PropSize is not equal to or greater than the real number of bytes needed to return the info "
+      "then the OL_ERRC_INVALID_SIZE error is returned and PropValue is not used.", PARAM_OUT,
+      TypeInfo<"PropName" , "PropSize">>
+  ];
+  let returns = [
+    Return<"OL_ERRC_INVALID_SIZE", [
+      "`PropSize == 0`",
+      "If `PropSize` is less than the real number of bytes needed to return the info."
+    ]>,
+    Return<"OL_ERRC_NOT_FOUND", ["memory was not allocated by liboffload"]>
+  ];
+}
+
+def olGetMemInfoSize : Function {
+  let desc = "Returns the storage size of the given memory info query.";
+  let details = [
+    "The provided pointer can point to any location inside the allocation.",
+  ];
+  let params = [
+    Param<"const void *", "Ptr", "pointer to the allocated memory", PARAM_IN>,
+    Param<"ol_mem_info_t", "PropName", "type of the info to query", PARAM_IN>,
+    Param<"size_t*", "PropSizeRet", "pointer to the number of bytes required to store the query", PARAM_OUT>
+  ];
+  let returns = [
+    Return<"OL_ERRC_NOT_FOUND", ["memory was not allocated by liboffload"]>
+  ];
+}
+
+def olMemcpy : Function {
     let desc = "Enqueue a memcpy operation.";
     let details = [
         "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
@@ -63,3 +112,22 @@ def : Function {
     ];
     let returns = [];
 }
+
+def olMemFill : Function {
+  let desc = "Fill memory with copies of the given pattern";
+  let details = [
+    "Filling with patterns larger than 4 bytes may be less performant",
+    "The destination pointer and queue must be associated with the same device",
+    "The fill size must be a multiple of the pattern size",
+  ];
+  let params = [
+      Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+      Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
+      Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
+      Param<"const void*", "PatternPtr", "pointer to the pattern to fill the destination with", PARAM_IN>,
+      Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
+  ];
+  let returns = [
+    Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
+  ];
+}
diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td
index 97c2cc2d0570..906f899076a8 100644
--- a/offload/liboffload/API/Platform.td
+++ b/offload/liboffload/API/Platform.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_platform_info_t";
+def ol_platform_info_t : Enum {
   let desc = "Supported platform info.";
   let is_typed = 1;
   let etors = [
@@ -22,8 +21,7 @@ def : Enum {
   ];
 }
 
-def : Enum {
-  let name = "ol_platform_backend_t";
+def ol_platform_backend_t : Enum {
   let desc = "Identifies the native backend of the platform.";
   let etors =[
     Etor<"UNKNOWN", "The backend is not recognized">,
@@ -33,8 +31,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olGetPlatformInfo";
+def olGetPlatformInfo : Function {
   let desc = "Queries the given property of the platform.";
   let details = [
     "`olGetPlatformInfoSize` can be used to query the storage size "
@@ -61,8 +58,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetPlatformInfoSize";
+def olGetPlatformInfoSize : Function {
   let desc = "Returns the storage size of the given platform query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index 0476fa1f7c27..7e11b3d8e331 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Function {
-    let name = "olCreateProgram";
+def olCreateProgram : Function {
     let desc = "Create a program for the device from the binary image pointed to by `ProgData`.";
     let details = [
         "The provided `ProgData` will be copied and need not outlive the returned handle",
@@ -25,8 +24,19 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olDestroyProgram";
+def olIsValidBinary : Function {
+    let desc = "Validate if the binary image pointed to by `ProgData` is compatible with the device.";
+    let details = ["The provided `ProgData` will not be loaded onto the device"];
+    let params = [
+        Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
+        Param<"const void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
+        Param<"size_t", "ProgDataSize", "size of the program binary in bytes", PARAM_IN>,
+        Param<"bool*", "Valid", "output is true if the image is compatible", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
+def olDestroyProgram : Function {
     let desc = "Destroy the program and free all underlying resources.";
     let details = [];
     let params = [
diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td
index 1d9f6f2d11c9..ededa9cc92fe 100644
--- a/offload/liboffload/API/Queue.td
+++ b/offload/liboffload/API/Queue.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Function {
-    let name = "olCreateQueue";
+def olCreateQueue : Function {
     let desc = "Create a queue for the given device.";
     let details = [];
     let params = [
@@ -21,8 +20,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olDestroyQueue";
+def olDestroyQueue : Function {
     let desc = "Destroy the queue and free all underlying resources.";
     let details = [
       "Any work previously enqueued to the queue is still performed and any events generated for this queue remain valid."
@@ -33,8 +31,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olSyncQueue";
+def olSyncQueue : Function {
     let desc = "Block the calling thread until the enqueued work on a queue is complete.";
     let details = [];
     let params = [
@@ -43,8 +40,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olWaitEvents";
+def olWaitEvents : Function {
     let desc = "Make any future work submitted to this queue wait until the provided events are complete.";
     let details = [
       "All events in `Events` must complete before the queue is unblocked.",
@@ -60,8 +56,7 @@ def : Function {
     ];
 }
 
-def : Enum {
-  let name = "ol_queue_info_t";
+def ol_queue_info_t : Enum {
   let desc = "Supported queue info.";
   let is_typed = 1;
   let etors = [
@@ -70,8 +65,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olGetQueueInfo";
+def olGetQueueInfo : Function {
   let desc = "Queries the given property of the queue.";
   let details = [
     "`olGetQueueInfoSize` can be used to query the storage size "
@@ -95,8 +89,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetQueueInfoSize";
+def olGetQueueInfoSize : Function {
   let desc = "Returns the storage size of the given queue query.";
   let details = [];
   let params = [
@@ -108,3 +101,27 @@ def : Function {
     Return<"OL_ERRC_INVALID_QUEUE">
   ];
 }
+
+def ol_host_function_cb_t : FptrTypedef {
+  let desc = "Host function for use by `olLaunchHostFunction`.";
+  let params = [
+    Param<"void *", "UserData", "user specified data passed into `olLaunchHostFunction`.", PARAM_IN>,
+  ];
+  let return = "void";
+}
+
+def olLaunchHostFunction : Function {
+  let desc = "Enqueue a callback function on the host.";
+  let details = [
+    "The provided function will be called from the same process as the one that called `olLaunchHostFunction`.",
+    "The callback will not run until all previous work submitted to the queue has completed.",
+    "The callback must return before any work submitted to the queue after it is started.",
+    "The callback must not call any liboffload API functions or any backend specific functions (such as Cuda or HSA library functions).",
+  ];
+  let params = [
+    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+    Param<"ol_host_function_cb_t", "Callback", "the callback function to call on the host", PARAM_IN>,
+    Param<"void *", "UserData", "a pointer that will be passed verbatim to the callback function", PARAM_IN_OPTIONAL>,
+  ];
+  let returns = [];
+}
diff --git a/offload/liboffload/API/Symbol.td b/offload/liboffload/API/Symbol.td
index 2e94d703809e..c57a2e1b8363 100644
--- a/offload/liboffload/API/Symbol.td
+++ b/offload/liboffload/API/Symbol.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_symbol_kind_t";
+def ol_symbol_kind_t : Enum {
   let desc = "The kind of a symbol";
   let etors =[
     Etor<"KERNEL", "a kernel object">,
@@ -19,8 +18,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-    let name = "olGetSymbol";
+def olGetSymbol : Function {
     let desc = "Get a symbol (kernel or global variable) identified by `Name` in the given program.";
     let details = [
         "Symbol handles are owned by the program and do not need to be manually destroyed."
@@ -34,8 +32,7 @@ def : Function {
     let returns = [];
 }
 
-def : Enum {
-  let name = "ol_symbol_info_t";
+def ol_symbol_info_t : Enum {
   let desc = "Supported symbol info.";
   let is_typed = 1;
   let etors = [
@@ -45,8 +42,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olGetSymbolInfo";
+def olGetSymbolInfo : Function {
   let desc = "Queries the given property of the symbol.";
   let details = [
     "`olGetSymbolInfoSize` can be used to query the storage size "
@@ -73,8 +69,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetSymbolInfoSize";
+def olGetSymbolInfoSize : Function {
   let desc = "Returns the storage size of the given symbol query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index f5365ca27430..c549ae04361d 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -39,55 +39,131 @@ using namespace llvm::omp::target;
 using namespace llvm::omp::target::plugin;
 using namespace error;
 
+struct ol_platform_impl_t {
+  ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
+                     ol_platform_backend_t BackendType)
+      : Plugin(std::move(Plugin)), BackendType(BackendType) {}
+  std::unique_ptr<GenericPluginTy> Plugin;
+  llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
+  ol_platform_backend_t BackendType;
+
+  /// Complete all pending work for this platform and perform any needed
+  /// cleanup.
+  ///
+  /// After calling this function, no liboffload functions should be called with
+  /// this platform handle.
+  llvm::Error destroy();
+};
+
 // Handle type definitions. Ideally these would be 1:1 with the plugins, but
 // we add some additional data here for now to avoid churn in the plugin
 // interface.
 struct ol_device_impl_t {
   ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device,
-                   ol_platform_handle_t Platform, InfoTreeNode &&DevInfo)
+                   ol_platform_impl_t &Platform, InfoTreeNode &&DevInfo)
       : DeviceNum(DeviceNum), Device(Device), Platform(Platform),
         Info(std::forward<InfoTreeNode>(DevInfo)) {}
+
+  ~ol_device_impl_t() {
+    assert(!OutstandingQueues.size() &&
+           "Device object dropped with outstanding queues");
+  }
+
   int DeviceNum;
   GenericDeviceTy *Device;
-  ol_platform_handle_t Platform;
+  ol_platform_impl_t &Platform;
   InfoTreeNode Info;
-};
 
-struct ol_platform_impl_t {
-  ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
-                     ol_platform_backend_t BackendType)
-      : Plugin(std::move(Plugin)), BackendType(BackendType) {}
-  std::unique_ptr<GenericPluginTy> Plugin;
-  std::vector<ol_device_impl_t> Devices;
-  ol_platform_backend_t BackendType;
+  llvm::SmallVector<__tgt_async_info *> OutstandingQueues;
+  std::mutex OutstandingQueuesMutex;
+
+  /// If the device has any outstanding queues that are now complete, remove it
+  /// from the list and return it.
+  ///
+  /// Queues may be added to the outstanding queue list by olDestroyQueue if
+  /// they are destroyed but not completed.
+  __tgt_async_info *getOutstandingQueue() {
+    // Not locking the `size()` access is fine here - In the worst case we
+    // either miss a queue that exists or loop through an empty array after
+    // taking the lock. Both are sub-optimal but not that bad.
+    if (OutstandingQueues.size()) {
+      std::lock_guard<std::mutex> Lock(OutstandingQueuesMutex);
+
+      // As queues are pulled and popped from this list, longer running queues
+      // naturally bubble to the start of the array. Hence looping backwards.
+      for (auto Q = OutstandingQueues.rbegin(); Q != OutstandingQueues.rend();
+           Q++) {
+        if (!Device->hasPendingWork(*Q)) {
+          auto OutstandingQueue = *Q;
+          *Q = OutstandingQueues.back();
+          OutstandingQueues.pop_back();
+          return OutstandingQueue;
+        }
+      }
+    }
+    return nullptr;
+  }
+
+  /// Complete all pending work for this device and perform any needed cleanup.
+  ///
+  /// After calling this function, no liboffload functions should be called with
+  /// this device handle.
+  llvm::Error destroy() {
+    llvm::Error Result = Plugin::success();
+    for (auto Q : OutstandingQueues)
+      if (auto Err = Device->synchronize(Q, /*Release=*/true))
+        Result = llvm::joinErrors(std::move(Result), std::move(Err));
+    OutstandingQueues.clear();
+    return Result;
+  }
 };
 
+llvm::Error ol_platform_impl_t::destroy() {
+  llvm::Error Result = Plugin::success();
+  for (auto &D : Devices)
+    if (auto Err = D->destroy())
+      Result = llvm::joinErrors(std::move(Result), std::move(Err));
+  // The host platform is created with a null plugin: nothing to deinit then.
+  if (Plugin)
+    if (auto Res = Plugin->deinit())
+      Result = llvm::joinErrors(std::move(Result), std::move(Res));
+  return Result;
+}
+
 struct ol_queue_impl_t {
   ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device)
-      : AsyncInfo(AsyncInfo), Device(Device) {}
+      : AsyncInfo(AsyncInfo), Device(Device), Id(IdCounter++) {}
   __tgt_async_info *AsyncInfo;
   ol_device_handle_t Device;
+  // A unique identifier for the queue
+  size_t Id;
+  static std::atomic<size_t> IdCounter;
 };
+std::atomic<size_t> ol_queue_impl_t::IdCounter(0);
 
 struct ol_event_impl_t {
-  ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue)
-      : EventInfo(EventInfo), Queue(Queue) {}
+  ol_event_impl_t(void *EventInfo, ol_device_handle_t Device,
+                  ol_queue_handle_t Queue)
+      : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) {
+  }
   // EventInfo may be null, in which case the event should be considered always
   // complete
   void *EventInfo;
+  ol_device_handle_t Device;
+  size_t QueueId;
+  // Events may outlive the queue - don't assume this is always valid.
+  // It is provided only to implement OL_EVENT_INFO_QUEUE. Use QueueId to check
+  // for queue equality instead.
   ol_queue_handle_t Queue;
 };
 
 struct ol_program_impl_t {
   ol_program_impl_t(plugin::DeviceImageTy *Image,
-                    std::unique_ptr<llvm::MemoryBuffer> ImageData,
-                    const __tgt_device_image &DeviceImage)
-      : Image(Image), ImageData(std::move(ImageData)),
-        DeviceImage(DeviceImage) {}
+                    llvm::MemoryBufferRef DeviceImage)
+      : Image(Image), DeviceImage(DeviceImage) {}
   plugin::DeviceImageTy *Image;
-  std::unique_ptr<llvm::MemoryBuffer> ImageData;
   std::mutex SymbolListMutex;
-  __tgt_device_image DeviceImage;
+  llvm::MemoryBufferRef DeviceImage;
   llvm::StringMap<std::unique_ptr<ol_symbol_impl_t>> KernelSymbols;
   llvm::StringMap<std::unique_ptr<ol_symbol_impl_t>> GlobalSymbols;
 };
@@ -108,6 +184,9 @@ namespace offload {
 struct AllocInfo {
   ol_device_handle_t Device;
   ol_alloc_type_t Type;
+  void *Start;
+  // One byte past the end
+  void *End;
 };
 
 // Global shared state for liboffload
@@ -125,12 +204,16 @@ struct OffloadContext {
   bool TracingEnabled = false;
   bool ValidationEnabled = true;
   DenseMap<void *, AllocInfo> AllocInfoMap{};
-  SmallVector<ol_platform_impl_t, 4> Platforms{};
+  std::mutex AllocInfoMapMutex{};
+  // Partitioned list of memory base addresses. Each element in this list is a
+  // key in AllocInfoMap
+  llvm::SmallVector<void *> AllocBases{};
+  SmallVector<std::unique_ptr<ol_platform_impl_t>, 4> Platforms{};
   size_t RefCount;
 
   ol_device_handle_t HostDevice() {
     // The host platform is always inserted last
-    return &Platforms.back().Devices[0];
+    return Platforms.back()->Devices[0].get();
   }
 
   static OffloadContext &get() {
@@ -169,37 +252,35 @@ Error initPlugins(OffloadContext &Context) {
   // Attempt to create an instance of each supported plugin.
 #define PLUGIN_TARGET(Name)                                                    \
   do {                                                                         \
-    Context.Platforms.emplace_back(ol_platform_impl_t{                         \
-        std::unique_ptr<GenericPluginTy>(createPlugin_##Name()),               \
-        pluginNameToBackend(#Name)});                                          \
+    if (StringRef(#Name) != "host")                                            \
+      Context.Platforms.emplace_back(std::make_unique<ol_platform_impl_t>(     \
+          std::unique_ptr<GenericPluginTy>(createPlugin_##Name()),             \
+          pluginNameToBackend(#Name)));                                        \
   } while (false);
 #include "Shared/Targets.def"
 
   // Preemptively initialize all devices in the plugin
   for (auto &Platform : Context.Platforms) {
-    // Do not use the host plugin - it isn't supported.
-    if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN)
-      continue;
-    auto Err = Platform.Plugin->init();
+    auto Err = Platform->Plugin->init();
     [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-    for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices();
+    for (auto DevNum = 0; DevNum < Platform->Plugin->number_of_devices();
          DevNum++) {
-      if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
-        auto Device = &Platform.Plugin->getDevice(DevNum);
+      if (Platform->Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
+        auto Device = &Platform->Plugin->getDevice(DevNum);
         auto Info = Device->obtainInfoImpl();
         if (auto Err = Info.takeError())
           return Err;
-        Platform.Devices.emplace_back(DevNum, Device, &Platform,
-                                      std::move(*Info));
+        Platform->Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+            DevNum, Device, *Platform, std::move(*Info)));
       }
     }
   }
 
   // Add the special host device
   auto &HostPlatform = Context.Platforms.emplace_back(
-      ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST});
-  HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{});
-  Context.HostDevice()->Platform = &HostPlatform;
+      std::make_unique<ol_platform_impl_t>(nullptr, OL_PLATFORM_BACKEND_HOST));
+  HostPlatform->Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+      -1, nullptr, *HostPlatform, InfoTreeNode{}));
 
   Context.TracingEnabled = std::getenv("OFFLOAD_TRACE");
   Context.ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION");
@@ -236,10 +317,10 @@ Error olShutDown_impl() {
 
   for (auto &P : OldContext->Platforms) {
     // Host plugin is nullptr and has no deinit
-    if (!P.Plugin || !P.Plugin->is_initialized())
+    if (!P->Plugin || !P->Plugin->is_initialized())
       continue;
 
-    if (auto Res = P.Plugin->deinit())
+    if (auto Res = P->destroy())
       Result = llvm::joinErrors(std::move(Result), std::move(Res));
   }
 
@@ -302,10 +383,57 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
   };
 
   // These are not implemented by the plugin interface
-  if (PropName == OL_DEVICE_INFO_PLATFORM)
-    return Info.write<void *>(Device->Platform);
-  if (PropName == OL_DEVICE_INFO_TYPE)
+  switch (PropName) {
+  case OL_DEVICE_INFO_PLATFORM:
+    return Info.write<void *>(&Device->Platform);
+
+  case OL_DEVICE_INFO_TYPE:
     return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_GPU);
+
+  case OL_DEVICE_INFO_SINGLE_FP_CONFIG:
+  case OL_DEVICE_INFO_DOUBLE_FP_CONFIG: {
+    ol_device_fp_capability_flags_t flags{0};
+    flags |= OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT |
+             OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
+             OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
+             OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
+             OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
+             OL_DEVICE_FP_CAPABILITY_FLAG_DENORM |
+             OL_DEVICE_FP_CAPABILITY_FLAG_FMA;
+    return Info.write(flags);
+  }
+
+  case OL_DEVICE_INFO_HALF_FP_CONFIG:
+    return Info.write<ol_device_fp_capability_flags_t>(0);
+
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
+    return Info.write<uint32_t>(1);
+
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
+    return Info.write<uint32_t>(0);
+
+  // None of the existing plugins specify a limit on a single allocation,
+  // so return the global memory size instead
+  case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
+    [[fallthrough]];
+  // AMD doesn't provide the global memory size (trivially) with the device info
+  // struct, so use the plugin interface
+  case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: {
+    uint64_t Mem;
+    if (auto Err = Device->Device->getDeviceMemorySize(Mem))
+      return Err;
+    return Info.write<uint64_t>(Mem);
+  } break;
+
+  default:
+    break;
+  }
+
   if (PropName >= OL_DEVICE_INFO_LAST)
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "getDeviceInfo enum '%i' is invalid", PropName);
@@ -316,8 +444,10 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
                      "plugin did not provide a response for this information");
   auto Entry = *EntryOpt;
 
+  // Retrieve properties from the plugin interface
   switch (PropName) {
   case OL_DEVICE_INFO_NAME:
+  case OL_DEVICE_INFO_PRODUCT_NAME:
   case OL_DEVICE_INFO_VENDOR:
   case OL_DEVICE_INFO_DRIVER_VERSION: {
     // String values
@@ -327,7 +457,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
     return Info.writeString(std::get<std::string>(Entry->Value).c_str());
   }
 
-  case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: {
+  case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
+  case OL_DEVICE_INFO_MAX_WORK_SIZE:
+  case OL_DEVICE_INFO_VENDOR_ID:
+  case OL_DEVICE_INFO_NUM_COMPUTE_UNITS:
+  case OL_DEVICE_INFO_ADDRESS_BITS:
+  case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY:
+  case OL_DEVICE_INFO_MEMORY_CLOCK_RATE: {
     // Uint32 values
     if (!std::holds_alternative<uint64_t>(Entry->Value))
       return makeError(ErrorCode::BACKEND_FAILURE,
@@ -339,6 +475,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
     return Info.write(static_cast<uint32_t>(Value));
   }
 
+  case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
     // {x, y, z} triples
     ol_dimensions_t Out{0, 0, 0};
@@ -377,21 +514,61 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
   assert(Device == OffloadContext::get().HostDevice());
   InfoWriter Info(PropSize, PropValue, PropSizeRet);
 
+  constexpr auto uint32_max = std::numeric_limits<uint32_t>::max();
+
   switch (PropName) {
   case OL_DEVICE_INFO_PLATFORM:
-    return Info.write<void *>(Device->Platform);
+    return Info.write<void *>(&Device->Platform);
   case OL_DEVICE_INFO_TYPE:
     return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST);
   case OL_DEVICE_INFO_NAME:
     return Info.writeString("Virtual Host Device");
+  case OL_DEVICE_INFO_PRODUCT_NAME:
+    return Info.writeString("Virtual Host Device");
   case OL_DEVICE_INFO_VENDOR:
     return Info.writeString("Liboffload");
   case OL_DEVICE_INFO_DRIVER_VERSION:
     return Info.writeString(LLVM_VERSION_STRING);
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
-    return Info.write<uint64_t>(1);
+    return Info.write<uint32_t>(1);
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION:
     return Info.write<ol_dimensions_t>(ol_dimensions_t{1, 1, 1});
+  case OL_DEVICE_INFO_MAX_WORK_SIZE:
+    return Info.write<uint32_t>(uint32_max);
+  case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
+    return Info.write<ol_dimensions_t>(
+        ol_dimensions_t{uint32_max, uint32_max, uint32_max});
+  case OL_DEVICE_INFO_VENDOR_ID:
+    return Info.write<uint32_t>(0);
+  case OL_DEVICE_INFO_NUM_COMPUTE_UNITS:
+    return Info.write<uint32_t>(1);
+  case OL_DEVICE_INFO_SINGLE_FP_CONFIG:
+  case OL_DEVICE_INFO_DOUBLE_FP_CONFIG:
+    return Info.write<ol_device_fp_capability_flags_t>(
+        OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT |
+        OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
+        OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
+        OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
+        OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
+        OL_DEVICE_FP_CAPABILITY_FLAG_DENORM | OL_DEVICE_FP_CAPABILITY_FLAG_FMA);
+  case OL_DEVICE_INFO_HALF_FP_CONFIG:
+    return Info.write<ol_device_fp_capability_flags_t>(0);
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
+    return Info.write<uint32_t>(1);
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
+    return Info.write<uint32_t>(0);
+  case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY:
+  case OL_DEVICE_INFO_MEMORY_CLOCK_RATE:
+  case OL_DEVICE_INFO_ADDRESS_BITS:
+    return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits);
+  case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
+  case OL_DEVICE_INFO_GLOBAL_MEM_SIZE:
+    return Info.write<uint64_t>(0);
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "getDeviceInfo enum '%i' is invalid", PropName);
@@ -419,8 +596,8 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device,
 
 Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) {
   for (auto &Platform : OffloadContext::get().Platforms) {
-    for (auto &Device : Platform.Devices) {
-      if (!Callback(&Device, UserData)) {
+    for (auto &Device : Platform->Devices) {
+      if (!Callback(Device.get(), UserData)) {
         break;
       }
     }
@@ -441,47 +618,210 @@ TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) {
   }
 }
 
+/// Upper bound on the number of allocation attempts made by olMemAlloc_impl
+/// before it gives up on finding a non-overlapping region.
+constexpr size_t MAX_ALLOC_TRIES = 50;
+
+/// Allocate \p Size bytes of \p Type memory on \p Device and record the
+/// allocation in OffloadContext's AllocInfoMap and AllocBases.
+///
+/// An allocation whose address range overlaps an already recorded allocation
+/// is rejected and the request is retried, up to MAX_ALLOC_TRIES times.
+/// Rejected allocations are kept until the function finishes and are handed
+/// back to the plugin on every exit path, including the error paths.
+///
+/// \returns success with the base pointer stored in \p AllocationOut; the
+/// plugin's error if allocating (or releasing a reject) fails; or
+/// BACKEND_FAILURE if every attempt produced an overlapping region.
 Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
                       size_t Size, void **AllocationOut) {
-  auto Alloc =
-      Device->Device->dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
-  if (!Alloc)
-    return Alloc.takeError();
-
-  *AllocationOut = *Alloc;
-  OffloadContext::get().AllocInfoMap.insert_or_assign(*Alloc,
-                                                      AllocInfo{Device, Type});
-  return Error::success();
+  SmallVector<void *> Rejects;
+
+  // Hand every rejected allocation back to the plugin. Every reject is
+  // attempted even if an earlier deallocation fails; failures are joined.
+  auto ReleaseRejects = [&]() -> Error {
+    Error Result = Error::success();
+    for (void *R : Rejects)
+      Result = joinErrors(
+          std::move(Result),
+          Device->Device->dataDelete(R, convertOlToPluginAllocTy(Type)));
+    Rejects.clear();
+    return Result;
+  };
+
+  // Repeat the allocation up to a certain number of times. If it happens to
+  // already be allocated (e.g. by a device from another vendor) throw it away
+  // and try again.
+  for (size_t Count = 0; Count < MAX_ALLOC_TRIES; Count++) {
+    auto NewAlloc = Device->Device->dataAlloc(Size, nullptr,
+                                              convertOlToPluginAllocTy(Type));
+    // On failure, do not leak the rejects collected by earlier attempts.
+    if (!NewAlloc)
+      return joinErrors(NewAlloc.takeError(), ReleaseRejects());
+
+    void *NewEnd = &static_cast<char *>(*NewAlloc)[Size];
+    auto &AllocBases = OffloadContext::get().AllocBases;
+    auto &AllocInfoMap = OffloadContext::get().AllocInfoMap;
+    {
+      std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
+
+      // Check that this memory region doesn't overlap another one
+      // That is, the start of this allocation needs to be after another
+      // allocation's end point, and the end of this allocation needs to be
+      // before the next one's start.
+      // `Gap` is the first alloc who ends after the new alloc's start point.
+      auto Gap =
+          std::lower_bound(AllocBases.begin(), AllocBases.end(), *NewAlloc,
+                           [&](const void *Iter, const void *Val) {
+                             return AllocInfoMap.at(Iter).End <= Val;
+                           });
+      if (Gap == AllocBases.end() || NewEnd <= AllocInfoMap.at(*Gap).Start) {
+        // Success, no conflict
+        AllocInfoMap.insert_or_assign(
+            *NewAlloc, AllocInfo{Device, Type, *NewAlloc, NewEnd});
+        AllocBases.insert(
+            std::lower_bound(AllocBases.begin(), AllocBases.end(), *NewAlloc),
+            *NewAlloc);
+        *AllocationOut = *NewAlloc;
+
+        return ReleaseRejects();
+      }
+
+      // To avoid the next attempt allocating the same memory we just freed, we
+      // hold onto it until we complete the allocation
+      Rejects.push_back(*NewAlloc);
+    }
+  }
+
+  // We've tried multiple times, and can't allocate a non-overlapping region.
+  // Release everything we were holding on to before reporting the failure.
+  return joinErrors(
+      createOffloadError(ErrorCode::BACKEND_FAILURE,
+                         "failed to allocate non-overlapping memory"),
+      ReleaseRejects());
 }
 
 Error olMemFree_impl(void *Address) {
-  if (!OffloadContext::get().AllocInfoMap.contains(Address))
-    return createOffloadError(ErrorCode::INVALID_ARGUMENT,
-                              "address is not a known allocation");
-
-  auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address);
-  auto Device = AllocInfo.Device;
-  auto Type = AllocInfo.Type;
+  ol_device_handle_t Device;
+  ol_alloc_type_t Type;
+  {
+    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
+    if (!OffloadContext::get().AllocInfoMap.contains(Address))
+      return createOffloadError(ErrorCode::INVALID_ARGUMENT,
+                                "address is not a known allocation");
+
+    auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address);
+    Device = AllocInfo.Device;
+    Type = AllocInfo.Type;
+    OffloadContext::get().AllocInfoMap.erase(Address);
+
+    auto &Bases = OffloadContext::get().AllocBases;
+    Bases.erase(std::lower_bound(Bases.begin(), Bases.end(), Address));
+  }
 
   if (auto Res =
           Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type)))
     return Res;
 
-  OffloadContext::get().AllocInfoMap.erase(Address);
+  return Error::success();
+}
+
+Error olGetMemInfoImplDetail(const void *Ptr, ol_mem_info_t PropName,
+                             size_t PropSize, void *PropValue,
+                             size_t *PropSizeRet) {
+  InfoWriter Info(PropSize, PropValue, PropSizeRet);
+  std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
+
+  auto &AllocBases = OffloadContext::get().AllocBases;
+  auto &AllocInfoMap = OffloadContext::get().AllocInfoMap;
+  const AllocInfo *Alloc = nullptr;
+  if (AllocInfoMap.contains(Ptr)) {
+    // Fast case, we have been given the base pointer directly
+    Alloc = &AllocInfoMap.at(Ptr);
+  } else {
+    // Slower case, we need to look up the base pointer first
+    // Find the first memory allocation whose end is after the target pointer,
+    // and then check to see if it is in range
+    auto Loc = std::lower_bound(AllocBases.begin(), AllocBases.end(), Ptr,
+                                [&](const void *Iter, const void *Val) {
+                                  return AllocInfoMap.at(Iter).End <= Val;
+                                });
+    if (Loc == AllocBases.end() || Ptr < AllocInfoMap.at(*Loc).Start)
+      return Plugin::error(ErrorCode::NOT_FOUND,
+                           "allocated memory information not found");
+    Alloc = &AllocInfoMap.at(*Loc);
+  }
+
+  switch (PropName) {
+  case OL_MEM_INFO_DEVICE:
+    return Info.write<ol_device_handle_t>(Alloc->Device);
+  case OL_MEM_INFO_BASE:
+    return Info.write<void *>(Alloc->Start);
+  case OL_MEM_INFO_SIZE:
+    return Info.write<size_t>(static_cast<char *>(Alloc->End) -
+                              static_cast<char *>(Alloc->Start));
+  case OL_MEM_INFO_TYPE:
+    return Info.write<ol_alloc_type_t>(Alloc->Type);
+  default:
+    return createOffloadError(ErrorCode::INVALID_ENUMERATION,
+                              "olGetMemInfo enum '%i' is invalid", PropName);
+  }
 
   return Error::success();
 }
 
+Error olGetMemInfo_impl(const void *Ptr, ol_mem_info_t PropName,
+                        size_t PropSize, void *PropValue) {
+  return olGetMemInfoImplDetail(Ptr, PropName, PropSize, PropValue, nullptr);
+}
+
+Error olGetMemInfoSize_impl(const void *Ptr, ol_mem_info_t PropName,
+                            size_t *PropSizeRet) {
+  return olGetMemInfoImplDetail(Ptr, PropName, 0, nullptr, PropSizeRet);
+}
+
 Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) {
   auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device);
-  if (auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo)))
+
+  auto OutstandingQueue = Device->getOutstandingQueue();
+  if (OutstandingQueue) {
+    // The queue is empty, but we still need to sync it to release any temporary
+    // memory allocations or do other cleanup.
+    if (auto Err =
+            Device->Device->synchronize(OutstandingQueue, /*Release=*/false))
+      return Err;
+    CreatedQueue->AsyncInfo = OutstandingQueue;
+  } else if (auto Err =
+                 Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) {
     return Err;
+  }
 
   *Queue = CreatedQueue.release();
   return Error::success();
 }
 
-Error olDestroyQueue_impl(ol_queue_handle_t Queue) { return olDestroy(Queue); }
+Error olDestroyQueue_impl(ol_queue_handle_t Queue) {
+  auto *Device = Queue->Device;
+  // This is safe; as soon as olDestroyQueue is called it is not possible to add
+  // any more work to the queue, so if it's finished now it will remain finished
+  // forever.
+  auto Res = Device->Device->hasPendingWork(Queue->AsyncInfo);
+  if (!Res)
+    return Res.takeError();
+
+  if (!*Res) {
+    // The queue is complete, so sync it and throw it back into the pool.
+    if (auto Err = Device->Device->synchronize(Queue->AsyncInfo,
+                                               /*Release=*/true))
+      return Err;
+  } else {
+    // The queue still has outstanding work. Store it so we can check it later.
+    std::lock_guard<std::mutex> Lock(Device->OutstandingQueuesMutex);
+    Device->OutstandingQueues.push_back(Queue->AsyncInfo);
+  }
+
+  return olDestroy(Queue);
+}
 
 Error olSyncQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
@@ -509,7 +823,7 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events,
                            "olWaitEvents asked to wait on a NULL event");
 
     // Do nothing if the event is for this queue or the event is always complete
-    if (Event->Queue == Queue || !Event->EventInfo)
+    if (Event->QueueId == Queue->Id || !Event->EventInfo)
       continue;
 
     if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo))
@@ -553,11 +867,11 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
 }
 
 Error olSyncEvent_impl(ol_event_handle_t Event) {
+  // No event info means that this event was complete on creation
   if (!Event->EventInfo)
-    // Event always complete
     return Plugin::success();
 
-  if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo))
+  if (auto Res = Event->Device->Device->syncEvent(Event->EventInfo))
     return Res;
 
   return Error::success();
@@ -565,7 +879,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
 
 Error olDestroyEvent_impl(ol_event_handle_t Event) {
   if (Event->EventInfo)
-    if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo))
+    if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo))
       return Res;
 
   return olDestroy(Event);
@@ -575,10 +889,22 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
                                ol_event_info_t PropName, size_t PropSize,
                                void *PropValue, size_t *PropSizeRet) {
   InfoWriter Info(PropSize, PropValue, PropSizeRet);
+  auto Queue = Event->Queue;
 
   switch (PropName) {
   case OL_EVENT_INFO_QUEUE:
-    return Info.write<ol_queue_handle_t>(Event->Queue);
+    return Info.write<ol_queue_handle_t>(Queue);
+  case OL_EVENT_INFO_IS_COMPLETE: {
+    // No event info means that this event was complete on creation
+    if (!Event->EventInfo)
+      return Info.write<bool>(true);
+
+    auto Res = Queue->Device->Device->isEventComplete(Event->EventInfo,
+                                                      Queue->AsyncInfo);
+    if (auto Err = Res.takeError())
+      return Err;
+    return Info.write<bool>(*Res);
+  }
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "olGetEventInfo enum '%i' is invalid", PropName);
@@ -604,7 +930,7 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
   if (auto Err = Pending.takeError())
     return Err;
 
-  *EventOut = new ol_event_impl_t(nullptr, Queue);
+  *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue);
   if (!*Pending)
     // Queue is empty, don't record an event and consider the event always
     // complete
@@ -656,31 +982,31 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   return Error::success();
 }
 
+Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
+                     const void *PatternPtr, size_t FillSize) {
+  return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
+                                         Queue->AsyncInfo);
+}
+
 Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
                            size_t ProgDataSize, ol_program_handle_t *Program) {
-  // Make a copy of the program binary in case it is released by the caller.
-  auto ImageData = MemoryBuffer::getMemBufferCopy(
-      StringRef(reinterpret_cast<const char *>(ProgData), ProgDataSize));
-
-  auto DeviceImage = __tgt_device_image{
-      const_cast<char *>(ImageData->getBuffer().data()),
-      const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize, nullptr,
-      nullptr};
-
-  ol_program_handle_t Prog =
-      new ol_program_impl_t(nullptr, std::move(ImageData), DeviceImage);
-
-  auto Res =
-      Device->Device->loadBinary(Device->Device->Plugin, &Prog->DeviceImage);
-  if (!Res) {
-    delete Prog;
+  StringRef Buffer(reinterpret_cast<const char *>(ProgData), ProgDataSize);
+  Expected<plugin::DeviceImageTy *> Res =
+      Device->Device->loadBinary(Device->Device->Plugin, Buffer);
+  if (!Res)
     return Res.takeError();
-  }
-  assert(*Res != nullptr && "loadBinary returned nullptr");
+  assert(*Res && "loadBinary returned nullptr");
 
-  Prog->Image = *Res;
-  *Program = Prog;
+  *Program = new ol_program_impl_t(*Res, (*Res)->getMemoryBuffer());
+  return Error::success();
+}
 
+Error olIsValidBinary_impl(ol_device_handle_t Device, const void *ProgData,
+                           size_t ProgDataSize, bool *IsValid) {
+  StringRef Buffer(reinterpret_cast<const char *>(ProgData), ProgDataSize);
+  *IsValid = Device->Device ? Device->Device->Plugin.isDeviceCompatible(
+                                  Device->Device->getDeviceId(), Buffer)
+                            : false;
   return Error::success();
 }
 
@@ -696,6 +1022,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
+                                       ol_symbol_handle_t Kernel,
+                                       size_t DynamicMemSize,
+                                       size_t *GroupSize) {
+  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
+    return createOffloadError(ErrorCode::SYMBOL_KIND,
+                              "provided symbol is not a kernel");
+  auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
+
+  auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+  if (auto Err = Res.takeError())
+    return Err;
+
+  *GroupSize = *Res;
+
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_symbol_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,
@@ -765,7 +1109,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name,
     return Error::success();
   }
   case OL_SYMBOL_KIND_GLOBAL_VARIABLE: {
-    auto &Global = Program->KernelSymbols[Name];
+    auto &Global = Program->GlobalSymbols[Name];
     if (!Global) {
       GlobalTy GlobalObj{Name};
       if (auto Res =
@@ -833,5 +1177,12 @@ Error olGetSymbolInfoSize_impl(ol_symbol_handle_t Symbol,
   return olGetSymbolInfoImplDetail(Symbol, PropName, 0, nullptr, PropSizeRet);
 }
 
+Error olLaunchHostFunction_impl(ol_queue_handle_t Queue,
+                                ol_host_function_cb_t Callback,
+                                void *UserData) {
+  return Queue->Device->Device->enqueueHostCall(Callback, UserData,
+                                                Queue->AsyncInfo);
+}
+
 } // namespace offload
 } // namespace llvm
diff --git a/offload/libomptarget/OpenMP/InteropAPI.cpp b/offload/libomptarget/OpenMP/InteropAPI.cpp
index eb5425ecbf06..c55ef2c2e672 100644
--- a/offload/libomptarget/OpenMP/InteropAPI.cpp
+++ b/offload/libomptarget/OpenMP/InteropAPI.cpp
@@ -124,7 +124,7 @@ void *getProperty<void *>(omp_interop_val_t &InteropVal,
   case omp_ipr_device_context:
     return InteropVal.device_info.Context;
   case omp_ipr_targetsync:
-    return InteropVal.async_info->Queue;
+    return InteropVal.async_info ? InteropVal.async_info->Queue : nullptr;
   default:;
   }
   getTypeMismatch(Property, Err);
@@ -167,7 +167,6 @@ bool getPropertyCheck(omp_interop_val_t **InteropPtr,
                                        omp_interop_property_t property_id,     \
                                        int *err) {                             \
     omp_interop_val_t *interop_val = (omp_interop_val_t *)interop;             \
-    assert((interop_val)->interop_type == kmp_interop_type_targetsync);        \
     if (!getPropertyCheck(&interop_val, property_id, err)) {                   \
       return (RETURN_TYPE)(0);                                                 \
     }                                                                          \
@@ -275,8 +274,8 @@ omp_interop_val_t *__tgt_interop_get(ident_t *LocRef, int32_t InteropType,
   return Interop;
 }
 
-int __tgt_interop_use(ident_t *LocRef, omp_interop_val_t *Interop,
-                      interop_ctx_t *Ctx, dep_pack_t *Deps) {
+int __tgt_interop_use60(ident_t *LocRef, omp_interop_val_t *Interop,
+                        interop_ctx_t *Ctx, dep_pack_t *Deps) {
   bool Nowait = Ctx->flags.nowait;
   DP("Call to %s with interop " DPxMOD ", nowait %" PRId32 "\n", __func__,
      DPxPTR(Interop), Nowait);
@@ -359,6 +358,40 @@ EXTERN int ompx_interop_add_completion_callback(omp_interop_val_t *Interop,
   return omp_irc_success;
 }
 
+// Backwards compatibility wrappers
+void __tgt_interop_init(ident_t *LocRef, int32_t Gtid,
+                        omp_interop_val_t *&InteropPtr, int32_t InteropType,
+                        int32_t DeviceId, int32_t Ndeps,
+                        kmp_depend_info_t *DepList, int32_t HaveNowait) {
+  constexpr int32_t old_kmp_interop_type_targetsync = 2;
+  interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid};
+  dep_pack_t Deps = {Ndeps, 0, DepList, nullptr};
+  InteropPtr =
+      __tgt_interop_get(LocRef,
+                        InteropType == old_kmp_interop_type_targetsync
+                            ? kmp_interop_type_targetsync
+                            : kmp_interop_type_target,
+                        DeviceId, 0, nullptr, &Ctx, Ndeps ? &Deps : nullptr);
+}
+
+void __tgt_interop_use(ident_t *LocRef, int32_t Gtid,
+                       omp_interop_val_t *&InteropPtr, int32_t DeviceId,
+                       int32_t Ndeps, kmp_depend_info_t *DepList,
+                       int32_t HaveNowait) {
+  interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid};
+  dep_pack_t Deps = {Ndeps, 0, DepList, nullptr};
+  __tgt_interop_use60(LocRef, InteropPtr, &Ctx, Ndeps ? &Deps : nullptr);
+}
+
+void __tgt_interop_destroy(ident_t *LocRef, int32_t Gtid,
+                           omp_interop_val_t *&InteropPtr, int32_t DeviceId,
+                           int32_t Ndeps, kmp_depend_info_t *DepList,
+                           int32_t HaveNowait) {
+  interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid};
+  dep_pack_t Deps = {Ndeps, 0, DepList, nullptr};
+  __tgt_interop_release(LocRef, InteropPtr, &Ctx, Ndeps ? &Deps : nullptr);
+}
+
 } // extern "C"
 
 llvm::Expected<DeviceTy &> omp_interop_val_t::getDevice() const {
diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp
index b57a2f815cba..c8d6b42114d0 100644
--- a/offload/libomptarget/PluginManager.cpp
+++ b/offload/libomptarget/PluginManager.cpp
@@ -219,7 +219,10 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
     // Scan the RTLs that have associated images until we find one that supports
     // the current image.
     for (auto &R : plugins()) {
-      if (!R.is_plugin_compatible(Img))
+      StringRef Buffer(reinterpret_cast<const char *>(Img->ImageStart),
+                       utils::getPtrDiff(Img->ImageEnd, Img->ImageStart));
+
+      if (!R.isPluginCompatible(Buffer))
         continue;
 
       if (!initializePlugin(R))
@@ -242,7 +245,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
           continue;
         }
 
-        if (!R.is_device_compatible(DeviceId, Img))
+        if (!R.isDeviceCompatible(DeviceId, Buffer))
           continue;
 
         DP("Image " DPxMOD " is compatible with RTL %s device %d!\n",
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76..71423ae0c94d 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -37,6 +37,8 @@
 using namespace llvm::omp::target::ompt;
 #endif
 
+using namespace llvm::omp::target::plugin;
+
 int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
                                             AsyncInfoTy &AsyncInfo) const {
   // First, check if the user disabled atomic map transfer/malloc/dealloc.
@@ -97,7 +99,73 @@ llvm::Error DeviceTy::init() {
   return llvm::Error::success();
 }
 
-// Load binary to device.
+// Extract the mapping of host function pointers to device function pointers
+// from the entry table. Functions marked as 'indirect' in OpenMP will have
+// offloading entries generated for them which map the host's function pointer
+// to a global containing the corresponding function pointer on the device.
+//
+// On success returns the device address of a table of (host pointer, device
+// pointer) pairs sorted by host pointer, together with the number of pairs;
+// returns {nullptr, 0} if the image has no indirect entries.
+static llvm::Expected<std::pair<void *, uint64_t>>
+setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image,
+                       __tgt_device_binary Binary) {
+  AsyncInfoTy AsyncInfo(Device);
+  llvm::ArrayRef<llvm::offloading::EntryTy> Entries(Image->EntriesBegin,
+                                                    Image->EntriesEnd);
+  llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
+  for (const auto &Entry : Entries) {
+    if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP ||
+        Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
+      continue;
+
+    assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
+
+    void *Ptr;
+    if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr))
+      return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+                                       "failed to load %s", Entry.SymbolName);
+
+    // Read the device pointer into a local and wait for the transfer before
+    // using it: the copy is queued on AsyncInfo and may still be in flight
+    // when retrieveData returns, so its destination must stay valid (a table
+    // element could be moved by a later emplace_back) and must not be read
+    // (e.g. by the sort below) until the queue has been synchronized.
+    void *DevPtr = nullptr;
+    if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo) ||
+        AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+      return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+                                       "failed to load %s", Entry.SymbolName);
+    IndirectCallTable.emplace_back(Entry.Address, DevPtr);
+  }
+
+  // If we do not have any indirect globals we exit early.
+  if (IndirectCallTable.empty())
+    return std::pair{nullptr, 0};
+
+  // Sort the array to allow for more efficient lookup of device pointers.
+  llvm::sort(IndirectCallTable,
+             [](const auto &x, const auto &y) { return x.first < y.first; });
+
+  uint64_t TableSize =
+      IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
+  void *DevicePtr = Device.allocData(TableSize, nullptr, TARGET_ALLOC_DEVICE);
+  if (!DevicePtr)
+    return error::createOffloadError(
+        error::ErrorCode::OUT_OF_RESOURCES,
+        "failed to allocate the indirect call table");
+
+  // Wait for the upload to finish before returning: its source is the local
+  // table, which is destroyed when this function returns.
+  if (Device.submitData(DevicePtr, IndirectCallTable.data(), TableSize,
+                        AsyncInfo) ||
+      AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+    return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+                                     "failed to copy data");
+  return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
+}
+
+// Load binary to device and perform global initialization if needed.
 llvm::Expected<__tgt_device_binary>
 DeviceTy::loadBinary(__tgt_device_image *Img) {
   __tgt_device_binary Binary;
@@ -105,6 +155,38 @@ DeviceTy::loadBinary(__tgt_device_image *Img) {
   if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
     return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                      "failed to load binary %p", Img);
+
+  // This symbol is optional.
+  void *DeviceEnvironmentPtr;
+  if (RTL->get_global(Binary, sizeof(DeviceEnvironmentTy),
+                      "__omp_rtl_device_environment", &DeviceEnvironmentPtr))
+    return Binary;
+
+  // Obtain a table mapping host function pointers to device function pointers.
+  auto CallTablePairOrErr = setupIndirectCallTable(*this, Img, Binary);
+  if (!CallTablePairOrErr)
+    return CallTablePairOrErr.takeError();
+
+  GenericDeviceTy &GenericDevice = RTL->getDevice(RTLDeviceID);
+  DeviceEnvironmentTy DeviceEnvironment;
+  DeviceEnvironment.DeviceDebugKind = GenericDevice.getDebugKind();
+  DeviceEnvironment.NumDevices = RTL->getNumDevices();
+  // TODO: The device ID used here is not the real device ID used by OpenMP.
+  DeviceEnvironment.DeviceNum = RTLDeviceID;
+  DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize();
+  DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
+  DeviceEnvironment.IndirectCallTable =
+      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
+  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
+  DeviceEnvironment.HardwareParallelism =
+      GenericDevice.getHardwareParallelism();
+
+  AsyncInfoTy AsyncInfo(*this);
+  if (submitData(DeviceEnvironmentPtr, &DeviceEnvironment,
+                 sizeof(DeviceEnvironment), AsyncInfo))
+    return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+                                     "failed to copy data");
+
   return Binary;
 }
 
@@ -191,6 +273,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                   DstPtr, Size, AsyncInfo);
 }
 
+int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) {
+  return RTL->data_fence(RTLDeviceID, AsyncInfo);
+}
+
 int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
   DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
      DPxPTR(HstPtr), Size);
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 8e2db6ba8bba..1374bfea8151 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -68,8 +68,11 @@ VERS1.0 {
     omp_get_interop_int;
     omp_get_interop_name;
     omp_get_interop_type_desc;
-    __tgt_interop_get;
+    __tgt_interop_init;
     __tgt_interop_use;
+    __tgt_interop_destroy;
+    __tgt_interop_get;
+    __tgt_interop_use60;
     __tgt_interop_release;
     __tgt_target_sync;
     __llvmPushCallConfiguration;
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index e9b148d8a260..fe1828976590 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -30,6 +30,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <memory>
 
 #ifdef OMPT_SUPPORT
 using namespace llvm::omp::target::ompt;
@@ -165,12 +166,24 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                                              OMPT_GET_RETURN_ADDRESS);)
 
   int Rc = OFFLOAD_SUCCESS;
+
+  // Only allocate AttachInfo for targetDataBegin
+  std::unique_ptr<AttachInfoTy> AttachInfo;
+  if (TargetDataFunction == targetDataBegin)
+    AttachInfo = std::make_unique<AttachInfoTy>();
+
   Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
                           ArgTypes, ArgNames, ArgMappers, AsyncInfo,
-                          false /*FromMapper=*/);
+                          AttachInfo.get(), /*FromMapper=*/false);
 
-  if (Rc == OFFLOAD_SUCCESS)
-    Rc = AsyncInfo.synchronize();
+  if (Rc == OFFLOAD_SUCCESS) {
+    // Process deferred ATTACH entries BEFORE synchronization
+    if (AttachInfo && !AttachInfo->AttachEntries.empty())
+      Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo);
+
+    if (Rc == OFFLOAD_SUCCESS)
+      Rc = AsyncInfo.synchronize();
+  }
 
   handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
 }
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 5b25d955dd32..69725e77bae0 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
 int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                      int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                      void *ArgMapper, AsyncInfoTy &AsyncInfo,
-                     TargetDataFuncPtrTy TargetDataFunction) {
+                     TargetDataFuncPtrTy TargetDataFunction,
+                     AttachInfoTy *AttachInfo = nullptr) {
   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
 
   // The mapper function fills up Components.
@@ -324,17 +325,193 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                               MapperArgsBase.data(), MapperArgs.data(),
                               MapperArgSizes.data(), MapperArgTypes.data(),
                               MapperArgNames.data(), /*arg_mappers*/ nullptr,
-                              AsyncInfo, /*FromMapper=*/true);
+                              AsyncInfo, AttachInfo, /*FromMapper=*/true);
 
   return Rc;
 }
 
+/// Returns a buffer of the requested \p Size, to be used as the source for
+/// `submitData`.
+///
+/// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's
+/// getVoidPtrLocation().
+/// For larger buffers, creates a dynamic buffer which will be eventually
+/// deleted by \p AsyncInfo's post-processing callback.
+static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo,
+                                                  int64_t Size) {
+  constexpr int64_t VoidPtrSize = sizeof(void *);
+
+  if (Size <= VoidPtrSize) {
+    void *&BufferElement = AsyncInfo.getVoidPtrLocation();
+    return reinterpret_cast<char *>(&BufferElement);
+  }
+
+  // Create a dynamic buffer for larger data and schedule its deletion.
+  char *DataBuffer = new char[Size];
+  AsyncInfo.addPostProcessingFunction([DataBuffer]() {
+    delete[] DataBuffer;
+    return OFFLOAD_SUCCESS;
+  });
+  return DataBuffer;
+}
+
+/// Calculates the target pointee base by applying the host
+/// pointee begin/base delta to the target pointee begin.
+///
+/// ```
+/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)
+/// ```
+static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin,
+                                        void *TgtPteeBegin) {
+  uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) -
+                   reinterpret_cast<uint64_t>(HstPteeBase);
+  void *TgtPteeBase = reinterpret_cast<void *>(
+      reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta);
+
+  DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
+     ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
+     DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
+  DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
+     "\n",
+     DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
+
+  return TgtPteeBase;
+}
+
+/// Utility function to perform a pointer attachment operation.
+///
+/// For something like:
+/// ```cpp
+///  int *p;
+///  ...
+///  #pragma omp target enter data map(to:p[10:10])
+/// ```
+///
+/// for which the attachment operation gets represented using:
+/// ```
+///   &p, &p[10], sizeof(p), ATTACH
+/// ```
+///
+/// (Hst|Tgt)PtrAddr   represents &p
+/// (Hst|Tgt)PteeBase  represents &p[0]
+/// (Hst|Tgt)PteeBegin represents &p[10]
+///
+/// This function first computes the expected TgtPteeBase using:
+///   `TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)`
+///
+/// and then attaches TgtPteeBase to TgtPtrAddr.
+///
+/// \p HstPtrSize represents the size of the pointer p. For C/C++, this
+/// should be the same as "sizeof(void*)" (say 8).
+///
+/// However, for Fortran, pointers/allocatables, which are also eligible for
+/// "pointer-attachment", may be implemented using descriptors that contain the
+/// address of the pointee in the first 8 bytes, but also contain other
+/// information such as lower-bound/upper-bound etc in their subsequent fields.
+///
+/// For example, for the following:
+/// ```fortran
+///   integer, allocatable :: x(:)
+///   integer, pointer :: p(:)
+///   ...
+///   p => x(10: 19)
+///   ...
+///   !$omp target enter data map(to:p(:))
+/// ```
+///
+/// The map should trigger a pointer-attachment (assuming the pointer-attachment
+/// conditions as noted on processAttachEntries are met) between the descriptor
+/// for p, and its pointee data.
+///
+/// Since only the first 8 bytes of the descriptor contain the address of the
+/// pointee, an attachment operation on device descriptors involves:
+/// * Setting the first 8 bytes of the device descriptor to point to the device
+/// address of the pointee.
+/// * Copying the remaining information about bounds/offset etc. from the host
+/// descriptor to the device descriptor.
+///
+/// The function also handles pointer-attachment portion of PTR_AND_OBJ maps,
+/// like:
+/// ```
+///   &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ
+/// ```
+/// by using `sizeof(void*)` as \p HstPtrSize.
+static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
+                                    void **HstPtrAddr, void *HstPteeBase,
+                                    void *HstPteeBegin, void **TgtPtrAddr,
+                                    void *TgtPteeBegin, int64_t HstPtrSize,
+                                    TargetPointerResultTy &PtrTPR) {
+  assert(PtrTPR.getEntry() &&
+         "Need a valid pointer entry to perform pointer-attachment");
+
+  constexpr int64_t VoidPtrSize = sizeof(void *);
+  assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small");
+
+  void *TgtPteeBase =
+      calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, TgtPteeBegin);
+
+  // Add shadow pointer tracking
+  if (!PtrTPR.getEntry()->addShadowPointer(
+          ShadowPtrInfoTy{HstPtrAddr, TgtPtrAddr, TgtPteeBase, HstPtrSize})) {
+    DP("Pointer " DPxMOD " is already attached to " DPxMOD "\n",
+       DPxPTR(TgtPtrAddr), DPxPTR(TgtPteeBase));
+    return OFFLOAD_SUCCESS;
+  }
+
+  DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr),
+     DPxPTR(TgtPteeBase));
+
+  // Lambda to handle submitData result and perform final steps.
+  auto HandleSubmitResult = [&](int SubmitResult) -> int {
+    if (SubmitResult != OFFLOAD_SUCCESS) {
+      REPORT("Failed to update pointer on device.\n");
+      return OFFLOAD_FAIL;
+    }
+
+    if (PtrTPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
+        OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
+
+    return OFFLOAD_SUCCESS;
+  };
+
+  // Get a buffer to be used as the source for data submission.
+  char *SrcBuffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize);
+
+  // The pointee's address should occupy the first VoidPtrSize bytes
+  // irrespective of HstPtrSize.
+  std::memcpy(SrcBuffer, &TgtPteeBase, VoidPtrSize);
+
+  // For larger "pointers" (e.g., Fortran descriptors), copy remaining
+  // descriptor fields from the host descriptor into the buffer.
+  if (HstPtrSize > VoidPtrSize) {
+    uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+    void *HstDescriptorFieldsAddr =
+        reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
+    std::memcpy(SrcBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
+                HstDescriptorFieldsSize);
+
+    DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD
+       ") (pointer + %" PRId64 " additional bytes from host descriptor " DPxMOD
+       ")\n",
+       HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
+       DPxPTR(HstDescriptorFieldsAddr));
+  }
+
+  // Submit the populated source buffer to device.
+  int SubmitResult = Device.submitData(TgtPtrAddr, SrcBuffer, HstPtrSize,
+                                       AsyncInfo, PtrTPR.getEntry());
+  return HandleSubmitResult(SubmitResult);
+}
+
 /// Internal function to do the mapping and transfer the data to the device
 int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                    bool FromMapper) {
+                    AttachInfoTy *AttachInfo, bool FromMapper) {
+  assert(AttachInfo && "AttachInfo must be available for targetDataBegin for "
+                       "handling ATTACH map-types.");
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     // Ignore private variables and arrays - there is no mapping for them.
@@ -352,7 +529,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
       int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
-                                targetDataBegin);
+                                targetDataBegin, AttachInfo);
 
       if (Rc != OFFLOAD_SUCCESS) {
         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
@@ -369,6 +546,25 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     int64_t DataSize = ArgSizes[I];
     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
 
+    // ATTACH map-types are supposed to be handled after all mapping for the
+    // construct is done. Defer their processing.
+    if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
+      const bool IsCorrespondingPointerInit =
+          (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE);
+      // We don't need to keep track of PRIVATE | ATTACH entries. They
+      // represent corresponding-pointer-initialization, and are handled
+      // similar to firstprivate (PRIVATE | TO) entries by
+      // PrivateArgumentManager.
+      if (!IsCorrespondingPointerInit)
+        AttachInfo->AttachEntries.emplace_back(
+            /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
+            /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
+            /*PointeeName=*/HstPtrName);
+
+      DP("Deferring ATTACH map-type processing for argument %d\n", I);
+      continue;
+    }
+
     // Adjust for proper alignment if this is a combined entry (for structs).
     // Look at the next argument - if that is MEMBER_OF this one, then this one
     // is a combined entry.
@@ -434,13 +630,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                                   : "device failure or illegal mapping");
         return OFFLOAD_FAIL;
       }
+
+      // Track new allocation, for eventual use in attachment decision-making.
+      if (PointerTpr.Flags.IsNewEntry && !IsHostPtr)
+        AttachInfo->NewAllocations[HstPtrBase] = sizeof(void *);
+
       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
          "\n",
          sizeof(void *), DPxPTR(PointerTgtPtrBegin),
          (PointerTpr.Flags.IsNewEntry ? "" : " not"));
       PointerHstPtrBegin = HstPtrBase;
       // modify current entry.
-      HstPtrBase = *(void **)HstPtrBase;
+      HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
       // No need to update pointee ref count for the first element of the
       // subelement that comes from mapper.
       UpdateRef =
@@ -464,6 +665,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                                 : "device failure or illegal mapping");
       return OFFLOAD_FAIL;
     }
+
+    // Track new allocation, for eventual use in attachment decision-making.
+    if (TPR.Flags.IsNewEntry && !IsHostPtr && TgtPtrBegin)
+      AttachInfo->NewAllocations[HstPtrBegin] = DataSize;
+
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
        " - is%s new\n",
        DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
@@ -476,30 +682,13 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     }
 
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
-
-      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
-      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
-
-      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
-              (void **)PointerHstPtrBegin, HstPtrBase,
-              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
-        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
-           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
-
-        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
-        TgtPtrBase = ExpectedTgtPtrBase;
-
-        int Ret =
-            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
-                              AsyncInfo, PointerTpr.getEntry());
-        if (Ret != OFFLOAD_SUCCESS) {
-          REPORT("Copying data to device failed.\n");
-          return OFFLOAD_FAIL;
-        }
-        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
-            OFFLOAD_SUCCESS)
-          return OFFLOAD_FAIL;
-      }
+      int Ret = performPointerAttachment(
+          Device, AsyncInfo, reinterpret_cast<void **>(PointerHstPtrBegin),
+          HstPtrBase, HstPtrBegin,
+          reinterpret_cast<void **>(PointerTgtPtrBegin), TgtPtrBegin,
+          sizeof(void *), PointerTpr);
+      if (Ret != OFFLOAD_SUCCESS)
+        return OFFLOAD_FAIL;
     }
 
     // Check if variable can be used on the device:
@@ -515,6 +704,189 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
   return OFFLOAD_SUCCESS;
 }
 
+/// Process deferred ATTACH map entries collected during targetDataBegin.
+///
+/// From OpenMP's perspective, when mapping something that has a base pointer,
+/// such as:
+/// ```cpp
+///   int *p;
+///   #pragma omp target enter data map(to: p[10:20])
+/// ```
+///
+/// a pointer-attachment between p and &p[10] should occur if both p and
+/// p[10] are present on the device after doing all allocations for all maps
+/// on the construct, and one of the following is true:
+///
+/// * The pointer p was newly allocated while handling the construct
+/// * The pointee p[10:20] was newly allocated while handling the construct
+/// * attach(always) map-type modifier was specified (OpenMP 6.1)
+///
+/// That's why we collect all attach entries and new memory allocations during
+/// targetDataBegin, and use that information to make the decision of whether
+/// to perform a pointer-attachment or not here, after maps have been handled.
+///
+/// Additionally, once we decide that a pointer-attachment should be performed,
+/// we need to make sure that it happens after any previously submitted data
+/// transfers have completed, to avoid the possibility of the pending transfers
+/// clobbering the attachment. For example:
+///
+/// ```cpp
+///   int *p = ...;
+///   int **pp = &p;
+///   map(to: pp[0], p[0])
+/// ```
+///
+/// Which would be represented by:
+/// ```
+/// &pp[0], &pp[0], sizeof(pp[0]), TO (1)
+/// &p[0], &p[0], sizeof(p[0]), TO    (2)
+///
+/// &pp, &pp[0], sizeof(pp), ATTACH   (3)
+/// &p, &p[0], sizeof(p), ATTACH      (4)
+/// ```
+///
+/// (4) and (1) are both trying to modify the device memory corresponding to
+/// `&p`. So, if we decide that (4) should do an attachment, we also need to
+/// ensure that (4) happens after (1) is complete.
+///
+/// For this purpose, we insert a data_fence before the first
+/// pointer-attachment, (3), to ensure that all pending transfers finish first.
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
+                         AsyncInfoTy &AsyncInfo) {
+  // Report all tracked allocations from both main loop and ATTACH processing
+  if (!AttachInfo.NewAllocations.empty()) {
+    DP("Tracked %u total new allocations:\n",
+       (unsigned)AttachInfo.NewAllocations.size());
+    for ([[maybe_unused]] const auto &Alloc : AttachInfo.NewAllocations) {
+      DP("  Host ptr: " DPxMOD ", Size: %" PRId64 " bytes\n",
+         DPxPTR(Alloc.first), Alloc.second);
+    }
+  }
+
+  if (AttachInfo.AttachEntries.empty())
+    return OFFLOAD_SUCCESS;
+
+  DP("Processing %zu deferred ATTACH map entries\n",
+     AttachInfo.AttachEntries.size());
+
+  int Ret = OFFLOAD_SUCCESS;
+  bool IsFirstPointerAttachment = true;
+  for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size();
+       ++EntryIdx) {
+    const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx];
+
+    void **HstPtr = reinterpret_cast<void **>(AttachEntry.PointerBase);
+
+    void *HstPteeBase = *HstPtr;
+    void *HstPteeBegin = AttachEntry.PointeeBegin;
+
+    int64_t PtrSize = AttachEntry.PointerSize;
+    int64_t MapType = AttachEntry.MapType;
+
+    DP("Processing ATTACH entry %zu: HstPtr=" DPxMOD ", HstPteeBegin=" DPxMOD
+       ", Size=%" PRId64 ", Type=0x%" PRIx64 "\n",
+       EntryIdx, DPxPTR(HstPtr), DPxPTR(HstPteeBegin), PtrSize, MapType);
+
+    const bool IsAttachAlways = MapType & OMP_TGT_MAPTYPE_ALWAYS;
+
+    // Lambda to check if a pointer was newly allocated
+    auto WasNewlyAllocated = [&](void *Ptr, const char *PtrName) {
+      bool IsNewlyAllocated =
+          llvm::any_of(AttachInfo.NewAllocations, [&](const auto &Alloc) {
+            void *AllocPtr = Alloc.first;
+            int64_t AllocSize = Alloc.second;
+            return Ptr >= AllocPtr &&
+                   Ptr < reinterpret_cast<void *>(
+                             reinterpret_cast<char *>(AllocPtr) + AllocSize);
+          });
+      DP("Attach %s " DPxMOD " was newly allocated: %s\n", PtrName, DPxPTR(Ptr),
+         IsNewlyAllocated ? "yes" : "no");
+      return IsNewlyAllocated;
+    };
+
+    // Only process ATTACH if either the pointee or the pointer was newly
+    // allocated, or the ALWAYS flag is set.
+    if (!IsAttachAlways && !WasNewlyAllocated(HstPteeBegin, "pointee") &&
+        !WasNewlyAllocated(HstPtr, "pointer")) {
+      DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly "
+         "allocated and no ALWAYS flag\n",
+         EntryIdx);
+      continue;
+    }
+
+    // Lambda to perform target pointer lookup and validation
+    auto LookupTargetPointer =
+        [&](void *Ptr, int64_t Size,
+            const char *PtrType) -> std::optional<TargetPointerResultTy> {
+      // ATTACH map-type does not change ref-count, or do any allocation
+      // We just need to do a lookup for the pointer/pointee.
+      TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
+          Ptr, Size, /*UpdateRefCount=*/false,
+          /*UseHoldRefCount=*/false, /*MustContain=*/true);
+
+      DP("Attach %s lookup - IsPresent=%s, IsHostPtr=%s\n", PtrType,
+         TPR.isPresent() ? "yes" : "no",
+         TPR.Flags.IsHostPointer ? "yes" : "no");
+
+      if (!TPR.isPresent()) {
+        DP("Skipping ATTACH entry %zu: %s not present on device\n", EntryIdx,
+           PtrType);
+        return std::nullopt;
+      }
+      if (TPR.Flags.IsHostPointer) {
+        DP("Skipping ATTACH entry %zu: device version of the %s is a host "
+           "pointer.\n",
+           EntryIdx, PtrType);
+        return std::nullopt;
+      }
+
+      return TPR;
+    };
+
+    // Get device version of the pointee (e.g., &p[10]) first, as we can
+    // release its TPR after extracting the pointer value.
+    void *TgtPteeBegin = [&]() -> void * {
+      if (auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee"))
+        return PteeTPROpt->TargetPointer;
+      return nullptr;
+    }();
+
+    if (!TgtPteeBegin)
+      continue;
+
+    // Get device version of the pointer (e.g., &p) next. We need to keep its
+    // TPR for use in shadow-pointer handling during pointer-attachment.
+    auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer");
+    if (!PtrTPROpt)
+      continue;
+    TargetPointerResultTy &PtrTPR = *PtrTPROpt;
+    void **TgtPtrBase = reinterpret_cast<void **>(PtrTPR.TargetPointer);
+
+    // Insert a data-fence before the first pointer-attachment.
+    if (IsFirstPointerAttachment) {
+      IsFirstPointerAttachment = false;
+      DP("Inserting a data fence before the first pointer attachment.\n");
+      Ret = Device.dataFence(AsyncInfo);
+      if (Ret != OFFLOAD_SUCCESS) {
+        REPORT("Failed to insert data fence.\n");
+        return OFFLOAD_FAIL;
+      }
+    }
+
+    // Do the pointer-attachment, i.e. update the device pointer to point to
+    // device pointee.
+    Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase,
+                                   HstPteeBegin, TgtPtrBase, TgtPteeBegin,
+                                   PtrSize, PtrTPR);
+    if (Ret != OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
+
+    DP("ATTACH entry %zu processed successfully\n", EntryIdx);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 namespace {
 /// This structure contains information to deallocate a target pointer, aka.
 /// used to fix up the shadow map and potentially delete the entry from the
@@ -584,17 +956,29 @@ postProcessingTargetDataEnd(DeviceTy *Device,
       DelEntry = false;
     }
 
-    // If we copied back to the host a struct/array containing pointers,
-    // we need to restore the original host pointer values from their
-    // shadow copies. If the struct is going to be deallocated, remove any
-    // remaining shadow pointer entries for this struct.
+    // If we copied back to the host a struct/array containing pointers, or
+    // Fortran descriptors (which are larger than a "void *"), we need to
+    // restore the original host pointer/descriptor values from their shadow
+    // copies. If the struct is going to be deallocated, remove any remaining
+    // shadow pointer entries for this struct.
     const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
     if (HasFrom) {
       Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
-        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
-        DP("Restoring original host pointer value " DPxMOD " for host "
-           "pointer " DPxMOD "\n",
-           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
+        constexpr int64_t VoidPtrSize = sizeof(void *);
+        if (ShadowPtr.PtrSize > VoidPtrSize) {
+          DP("Restoring host descriptor " DPxMOD
+             " to its original content (%" PRId64
+             " bytes), containing pointee address " DPxMOD "\n",
+             DPxPTR(ShadowPtr.HstPtrAddr), ShadowPtr.PtrSize,
+             DPxPTR(ShadowPtr.HstPtrContent.data()));
+        } else {
+          DP("Restoring host pointer " DPxMOD " to its original value " DPxMOD
+             "\n",
+             DPxPTR(ShadowPtr.HstPtrAddr),
+             DPxPTR(ShadowPtr.HstPtrContent.data()));
+        }
+        std::memcpy(ShadowPtr.HstPtrAddr, ShadowPtr.HstPtrContent.data(),
+                    ShadowPtr.PtrSize);
         return OFFLOAD_SUCCESS;
       });
     }
@@ -624,7 +1008,8 @@ postProcessingTargetDataEnd(DeviceTy *Device,
 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
-                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+                  void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                  AttachInfoTy *AttachInfo, bool FromMapper) {
   int Ret = OFFLOAD_SUCCESS;
   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
   // process each input.
@@ -635,6 +1020,14 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
 
+    // Ignore ATTACH entries - they should only be honored on map-entering
+    // directives. They may be encountered here while handling the "end" part of
+    // "#pragma omp target".
+    if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
+      DP("Ignoring ATTACH entry %d in targetDataEnd\n", I);
+      continue;
+    }
+
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataEnd, call the
       // targetDataMapper variant which will call targetDataEnd again
@@ -798,12 +1191,22 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
     if (TPR.getEntry()) {
       int Ret = TPR.getEntry()->foreachShadowPointerInfo(
           [&](ShadowPtrInfoTy &ShadowPtr) {
-            DP("Restoring original target pointer value " DPxMOD " for target "
-               "pointer " DPxMOD "\n",
-               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
+            constexpr int64_t VoidPtrSize = sizeof(void *);
+            if (ShadowPtr.PtrSize > VoidPtrSize) {
+              DP("Restoring target descriptor " DPxMOD
+                 " to its original content (%" PRId64
+                 " bytes), containing pointee address " DPxMOD "\n",
+                 DPxPTR(ShadowPtr.TgtPtrAddr), ShadowPtr.PtrSize,
+                 DPxPTR(ShadowPtr.TgtPtrContent.data()));
+            } else {
+              DP("Restoring target pointer " DPxMOD
+                 " to its original value " DPxMOD "\n",
+                 DPxPTR(ShadowPtr.TgtPtrAddr),
+                 DPxPTR(ShadowPtr.TgtPtrContent.data()));
+            }
             Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
-                                    (void *)&ShadowPtr.TgtPtrVal,
-                                    sizeof(void *), AsyncInfo);
+                                    ShadowPtr.TgtPtrContent.data(),
+                                    ShadowPtr.PtrSize, AsyncInfo);
             if (Ret != OFFLOAD_SUCCESS) {
               REPORT("Copying data to device failed.\n");
               return OFFLOAD_FAIL;
@@ -828,15 +1231,26 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
     }
 
     // Wait for device-to-host memcopies for whole struct to complete,
-    // before restoring the correct host pointer.
+    // before restoring the correct host pointer/descriptor.
     if (auto *Entry = TPR.getEntry()) {
       AsyncInfo.addPostProcessingFunction([=]() -> int {
         int Ret = Entry->foreachShadowPointerInfo(
             [&](const ShadowPtrInfoTy &ShadowPtr) {
-              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
-              DP("Restoring original host pointer value " DPxMOD
-                 " for host pointer " DPxMOD "\n",
-                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
+              constexpr int64_t VoidPtrSize = sizeof(void *);
+              if (ShadowPtr.PtrSize > VoidPtrSize) {
+                DP("Restoring host descriptor " DPxMOD
+                   " to its original content (%" PRId64
+                   " bytes), containing pointee address " DPxMOD "\n",
+                   DPxPTR(ShadowPtr.HstPtrAddr), ShadowPtr.PtrSize,
+                   DPxPTR(ShadowPtr.HstPtrContent.data()));
+              } else {
+                DP("Restoring host pointer " DPxMOD
+                   " to its original value " DPxMOD "\n",
+                   DPxPTR(ShadowPtr.HstPtrAddr),
+                   DPxPTR(ShadowPtr.HstPtrContent.data()));
+              }
+              std::memcpy(ShadowPtr.HstPtrAddr, ShadowPtr.HstPtrContent.data(),
+                          ShadowPtr.PtrSize);
               return OFFLOAD_SUCCESS;
             });
         Entry->unlock();
@@ -900,7 +1314,8 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
 int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                      void **ArgsBase, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
-                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
+                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                     AttachInfoTy *AttachInfo, bool FromMapper) {
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -1013,13 +1428,24 @@ class PrivateArgumentManagerTy {
     uint32_t Padding;
     /// Host pointer name
     map_var_info_t HstPtrName = nullptr;
+    /// For corresponding-pointer-initialization: host pointee base address.
+    void *HstPteeBase = nullptr;
+    /// For corresponding-pointer-initialization: host pointee begin address.
+    void *HstPteeBegin = nullptr;
+    /// Whether this argument needs corresponding-pointer-initialization.
+    bool IsCorrespondingPointerInit = false;
 
     FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                           uint32_t Alignment, uint32_t Padding,
-                          map_var_info_t HstPtrName = nullptr)
+                          map_var_info_t HstPtrName = nullptr,
+                          void *HstPteeBase = nullptr,
+                          void *HstPteeBegin = nullptr,
+                          bool IsCorrespondingPointerInit = false)
         : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
           HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
-          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
+          Size(Size), Padding(Padding), HstPtrName(HstPtrName),
+          HstPteeBase(HstPteeBase), HstPteeBegin(HstPteeBegin),
+          IsCorrespondingPointerInit(IsCorrespondingPointerInit) {}
   };
 
   /// A vector of target pointers for all private arguments
@@ -1037,6 +1463,153 @@ class PrivateArgumentManagerTy {
   /// A pointer to a \p AsyncInfoTy object
   AsyncInfoTy &AsyncInfo;
 
+  /// \returns the pointee base to store in the private pointer: the target
+  /// pointee base if \p HstPteeBegin is mapped, else \p HstPteeBase unchanged.
+  void *getTargetPointeeBaseForCorrespondingPointerInitialization(
+      void *HstPteeBase, void *HstPteeBegin) {
+    // Look up the pointee's device address without updating reference counts.
+    void *TgtPteeBegin = [&]() -> void * {
+      if (!HstPteeBegin) {
+        DP("Corresponding-pointer-initialization: pointee begin address is "
+           "null\n");
+        return nullptr;
+      }
+
+      return Device.getMappingInfo()
+          .getTgtPtrBegin(HstPteeBegin, /*Size=*/0, /*UpdateRefCount=*/false,
+                          /*UseHoldRefCount=*/false)
+          .TargetPointer;
+    }();
+
+    // If the lookup succeeds, compute the target pointee base from it.
+    // Otherwise, we retain the host pointee's base as the target pointee base
+    // of the initialized pointer. It's the user's responsibility to ensure
+    // that if a lookup fails, the host pointee is accessible on the device.
+    return TgtPteeBegin ? calculateTargetPointeeBase(HstPteeBase, HstPteeBegin,
+                                                     TgtPteeBegin)
+                        : HstPteeBase;
+  }
+
+  /// Initialize the source buffer for corresponding-pointer-initialization.
+  ///
+  /// It computes and stores the target pointee base address (or the host
+  /// pointee's base address, if lookup of target pointee fails) to the first
+  /// `sizeof(void*)` bytes of \p Buffer, and for larger pointers
+  /// (Fortran descriptors), the remaining fields of the host descriptor
+  /// \p HstPtr after those `sizeof(void*)` bytes.
+  ///
+  /// Corresponding-pointer-initialization represents the initialization of the
+  /// private version of a base-pointer/referring-pointer on a target construct.
+  ///
+  /// For example, for the following test:
+  /// ```cpp
+  ///   int x[10];
+  ///   int *px = &x[0];
+  ///   ...
+  ///   #pragma omp target data map(tofrom:px)
+  ///   {
+  ///     int **ppx = omp_get_mapped_ptr(&px, omp_get_default_device());
+  ///     #pragma omp target map(tofrom:px[1]) is_device_ptr(ppx)
+  ///     {
+  ///        foo(px, ppx);
+  ///     }
+  ///   }
+  /// ```
+  /// The following shows a possible way to implement the mapping of `px`,
+  /// which is pre-determined firstprivate and should get initialized
+  /// via corresponding-pointer-initialization:
+  ///
+  /// (A) Possible way to implement the above with PRIVATE | ATTACH:
+  /// ```llvm
+  ///  ; maps for px:
+  ///  ; &px[0], &px[1], sizeof(px[1]), TO | FROM                // (1)
+  ///  ; &px,    &px[1], sizeof(px),    ATTACH                   // (2)
+  ///  ; &px,    &px[1], sizeof(px),    PRIVATE | ATTACH | PARAM // (3)
+  ///  call... @__omp_outlined...(ptr %px, ptr %ppx)
+  ///  define ... @__omp_outlined(ptr %px, ptr %ppx) {...
+  ///    foo(%px, %ppx)
+  ///  ...}
+  /// ```
+  /// `(1)` maps the pointee `px[1]`.
+  /// `(2)` attaches it to the mapped version of `px`. It can be controlled by
+  /// the user based on the `attach(auto/always/never)` map-type modifier.
+  /// `(3)` privatizes and initializes the private pointer `px`, and passes it
+  /// into the kernel as the argument `%px`. Can be skipped if `px` is not
+  /// referenced in the target construct.
+  ///
+  /// (A) is not much better than initializing in the kernel body, like:
+  /// (B) Possible way to implement the above without PRIVATE | ATTACH:
+  /// ```llvm
+  ///  ; maps for px:
+  ///  ; &px[0], &px[1], sizeof(px[1]), TO | FROM | PARAM        // (4)
+  ///  ; &px,    &px[1], sizeof(px),    ATTACH                   // (5)
+  ///  call... @__omp_outlined...(ptr %px0, ptr %ppx)
+  ///  define ... __omp_outlined...(ptr %px0, ptr %ppx) {
+  ///    %px = alloca ptr;
+  ///    store ptr %px0, ptr %px
+  ///    foo(%px, %ppx)
+  ///  }
+  /// ```
+  ///
+  /// However, (B) is not so convenient for Fortran descriptors, because in
+  /// addition to the lookup, the remaining fields of the descriptor have
+  /// to be passed into the kernel to initialize the private copy, which
+  /// makes (A) a cleaner option for them. e.g.
+  /// ```f90
+  /// integer, pointer :: p(:)
+  /// !$omp target map(p(1))
+  /// ```
+  ///
+  /// (C) Possible mapping for the above Fortran test using PRIVATE | ATTACH:
+  /// ```llvm
+  ///  ; maps for p:
+  ///  ; &p(1),       &p(1), sizeof(p(1)),       TO | FROM
+  ///  ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH
+  ///  ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE | ATTACH | PARAM
+  ///  call... @__omp_outlined...(ptr %ref_ptr_of_p)
+  /// ```
+  void initBufferForCorrespondingPointerInitialization(char *Buffer,
+                                                       void *HstPtr,
+                                                       int64_t HstPtrSize,
+                                                       void *HstPteeBase,
+                                                       void *HstPteeBegin) {
+    constexpr int64_t VoidPtrSize = sizeof(void *);
+    assert(HstPtrSize >= VoidPtrSize &&
+           "corresponding-pointer-initialization: pointer size is too small");
+
+    void *TgtPteeBase =
+        getTargetPointeeBaseForCorrespondingPointerInitialization(HstPteeBase,
+                                                                  HstPteeBegin);
+
+    // Store the target pointee base address to the first VoidPtrSize bytes.
+    DP("Initializing corresponding-pointer-initialization source buffer "
+       "for " DPxMOD ", with pointee base " DPxMOD "\n",
+       DPxPTR(HstPtr), DPxPTR(TgtPteeBase));
+    std::memcpy(Buffer, &TgtPteeBase, VoidPtrSize);
+    if (HstPtrSize <= VoidPtrSize)
+      return;
+
+    // For Fortran descriptors, copy the remaining descriptor fields from host.
+    uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+    void *HstDescriptorFieldsAddr = static_cast<char *>(HstPtr) + VoidPtrSize;
+    DP("Copying %" PRIu64
+       " bytes of descriptor fields into corresponding-pointer-initialization "
+       "buffer at offset %" PRId64 ", from " DPxMOD "\n",
+       HstDescriptorFieldsSize, VoidPtrSize, DPxPTR(HstDescriptorFieldsAddr));
+    std::memcpy(Buffer + VoidPtrSize, HstDescriptorFieldsAddr,
+                HstDescriptorFieldsSize);
+  }
+
+  /// Obtain a submit-data source buffer, fill it with the initial value for
+  /// corresponding-pointer-initialization of \p HstPtr, and return it.
+  void *createAndInitSourceBufferForCorrespondingPointerInitialization(
+      void *HstPtr, int64_t HstPtrSize, void *HstPteeBase, void *HstPteeBegin) {
+    char *Buffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize);
+    initBufferForCorrespondingPointerInitialization(Buffer, HstPtr, HstPtrSize,
+                                                    HstPteeBase, HstPteeBegin);
+    return Buffer;
+  }
+
   // TODO: What would be the best value here? Should we make it configurable?
   // If the size is larger than this threshold, we will allocate and transfer it
   // immediately instead of packing it.
@@ -1051,7 +1624,9 @@ public:
   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
              map_var_info_t HstPtrName = nullptr,
-             const bool AllocImmediately = false) {
+             const bool AllocImmediately = false, void *HstPteeBase = nullptr,
+             void *HstPteeBegin = nullptr,
+             bool IsCorrespondingPointerInit = false) {
     // If the argument is not first-private, or its size is greater than a
     // predefined threshold, we will allocate memory and issue the transfer
     // immediately.
@@ -1074,9 +1649,19 @@ public:
       // If first-private, copy data from host
       if (IsFirstPrivate) {
         DP("Submitting firstprivate data to the device.\n");
-        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
+
+        // The source value used for corresponding-pointer-initialization
+        // is different vs regular firstprivates.
+        void *DataSource =
+            IsCorrespondingPointerInit
+                ? createAndInitSourceBufferForCorrespondingPointerInitialization(
+                      HstPtr, ArgSize, HstPteeBase, HstPteeBegin)
+                : HstPtr;
+        int Ret = Device.submitData(TgtPtr, DataSource, ArgSize, AsyncInfo);
         if (Ret != OFFLOAD_SUCCESS) {
-          DP("Copying data to device failed, failed.\n");
+          DP("Copying %s data to device failed.\n",
+             IsCorrespondingPointerInit ? "corresponding-pointer-initialization"
+                                        : "firstprivate");
           return OFFLOAD_FAIL;
         }
       }
@@ -1122,8 +1707,10 @@ public:
         }
       }
 
-      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
-                                       StartAlignment, Padding, HstPtrName);
+      FirstPrivateArgInfo.emplace_back(
+          TgtArgsIndex, HstPtr, ArgSize, StartAlignment, Padding, HstPtrName,
+          HstPteeBase, HstPteeBegin, IsCorrespondingPointerInit);
+
       FirstPrivateArgSize += Padding + ArgSize;
     }
 
@@ -1142,7 +1729,13 @@ public:
       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
         // First pad the pointer as we (have to) pad it on the device too.
         Itr = std::next(Itr, Info.Padding);
-        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
+
+        if (Info.IsCorrespondingPointerInit)
+          initBufferForCorrespondingPointerInitialization(
+              &*Itr, Info.HstPtrBegin, Info.Size, Info.HstPteeBase,
+              Info.HstPteeBegin);
+        else
+          std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
         Itr = std::next(Itr, Info.Size);
       }
       // Allocate target memory
@@ -1213,13 +1806,27 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
 
+  // Create AttachInfo for tracking any ATTACH entries, or new-allocations
+  // when handling the "begin" mapping for a target construct.
+  AttachInfoTy AttachInfo;
+
   int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
-                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
+                            ArgTypes, ArgNames, ArgMappers, AsyncInfo,
+                            &AttachInfo, false /*FromMapper=*/);
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT("Call to targetDataBegin failed, abort target.\n");
     return OFFLOAD_FAIL;
   }
 
+  // Process collected ATTACH entries
+  if (!AttachInfo.AttachEntries.empty()) {
+    Ret = processAttachEntries(*DeviceOrErr, AttachInfo, AsyncInfo);
+    if (Ret != OFFLOAD_SUCCESS) {
+      REPORT("Failed to process ATTACH entries.\n");
+      return OFFLOAD_FAIL;
+    }
+  }
+
   // List of (first-)private arrays allocated for this target region
   SmallVector<int> TgtArgsPositions(ArgNum, -1);
 
@@ -1284,8 +1891,40 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
       TgtPtrBegin = HstPtrBase;
       TgtBaseOffset = 0;
     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
+      // For cases like:
+      // ```
+      // int *p = ...;
+      // #pragma omp target map(p[0:10])
+      // ```
+      // `p` is predetermined firstprivate on the target construct, and the
+      // method to determine the initial value of the private copy on the
+      // device is called "corresponding-pointer-initialization".
+      //
+      // Such firstprivate pointers that need
+      // corresponding-pointer-initialization are represented using the
+      // `PRIVATE | ATTACH` map-types, in contrast to regular firstprivate
+      // entries, which use `PRIVATE | TO`. The structure of these
+      // `PRIVATE | ATTACH` entries is the same as the non-private
+      // `ATTACH` entries used to represent pointer-attachments, i.e.:
+      // ```
+      //  &hst_ptr_base/begin, &hst_ptee_begin, sizeof(hst_ptr)
+      // ```
+      const bool IsAttach = (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH);
+      void *HstPteeBase = nullptr;
+      void *HstPteeBegin = nullptr;
+      if (IsAttach) {
+        // For corresponding-pointer-initialization, Args[I] is HstPteeBegin,
+        // and ArgBases[I] is both HstPtrBase/HstPtrBegin.
+        HstPteeBase = *reinterpret_cast<void **>(HstPtrBase);
+        HstPteeBegin = Args[I];
+        HstPtrBegin = ArgBases[I];
+      }
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
-      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
+      // Corresponding-pointer-initialization is a special case of firstprivate,
+      // since it also involves initializing the private pointer.
+      const bool IsFirstPrivate =
+          (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) || IsAttach;
+
       // If there is a next argument and it depends on the current one, we need
       // to allocate the private memory immediately. If this is not the case,
       // then the argument can be marked for optimization and packed with the
@@ -1294,9 +1933,11 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
           (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
       Ret = PrivateArgumentManager.addArg(
           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
-          TgtArgs.size(), HstPtrName, AllocImmediately);
+          /*TgtArgsIndex=*/TgtArgs.size(), HstPtrName, AllocImmediately,
+          HstPteeBase, HstPteeBegin, /*IsCorrespondingPointerInit=*/IsAttach);
       if (Ret != OFFLOAD_SUCCESS) {
-        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
+        REPORT("Failed to process %s%sprivate argument " DPxMOD "\n",
+               IsAttach ? "corresponding-pointer-initialization " : "",
                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
         return OFFLOAD_FAIL;
       }
diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h
index 0b3d54599048..90e5e1780e66 100644
--- a/offload/libomptarget/private.h
+++ b/offload/libomptarget/private.h
@@ -55,7 +55,14 @@ printKernelArguments(const ident_t *Loc, const int64_t DeviceId,
     const char *Type = nullptr;
     const char *Implicit =
         (ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? "(implicit)" : "";
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
+
+    if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH &&
+        ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS)
+      Type = "attach:always";
+    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH)
+      Type = "attach";
+    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO &&
+             ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
       Type = "tofrom";
     else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO)
       Type = "to";
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 61f680bab3a0..ad135f72fff1 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -71,9 +71,15 @@ typedef enum {
 } hsa_isa_info_t;
 
 typedef enum {
+  HSA_MACHINE_MODEL_SMALL = 0,
+  HSA_MACHINE_MODEL_LARGE = 1
+} hsa_machine_model_t;
+
+typedef enum {
   HSA_AGENT_INFO_NAME = 0,
   HSA_AGENT_INFO_VENDOR_NAME = 1,
   HSA_AGENT_INFO_FEATURE = 2,
+  HSA_AGENT_INFO_MACHINE_MODEL = 3,
   HSA_AGENT_INFO_PROFILE = 4,
   HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
   HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e3589..29cfe78082db 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -67,6 +67,7 @@ typedef enum hsa_amd_agent_info_s {
   HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
   HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
   HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+  HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008,
   HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
   HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
   HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 796182075ff3..a7723b859881 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -99,7 +99,7 @@ namespace hsa_utils {
 /// Iterate elements using an HSA iterate function. Do not use this function
 /// directly but the specialized ones below instead.
 template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
-hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
+static hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
   auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
     CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
     return (*Unwrapped)(Elem);
@@ -111,7 +111,8 @@ hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
 /// use this function directly but the specialized ones below instead.
 template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
           typename CallbackTy>
-hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
+static hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg,
+                            CallbackTy Cb) {
   auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
     CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
     return (*Unwrapped)(Elem);
@@ -123,7 +124,8 @@ hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
 /// use this function directly but the specialized ones below instead.
 template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
           typename IterFuncArgTy, typename CallbackTy>
-hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
+static hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg,
+                            CallbackTy Cb) {
   auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
     CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
     return (*Unwrapped)(Elem1, Elem2);
@@ -132,21 +134,21 @@ hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
 }
 
 /// Iterate agents.
-template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
+template <typename CallbackTy> static Error iterateAgents(CallbackTy Callback) {
   hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
   return Plugin::check(Status, "error in hsa_iterate_agents: %s");
 }
 
 /// Iterate ISAs of an agent.
 template <typename CallbackTy>
-Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
+static Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
   hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
   return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s");
 }
 
 /// Iterate memory pools of an agent.
 template <typename CallbackTy>
-Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
+static Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
   hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
       hsa_amd_agent_iterate_memory_pools, Agent, Cb);
   return Plugin::check(Status,
@@ -155,10 +157,12 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
 
 /// Dispatches an asynchronous memory copy.
 /// Enables different SDMA engines for the dispatch in a round-robin fashion.
-Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
-                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
-                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
-                   hsa_signal_t CompletionSignal) {
+static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst,
+                          hsa_agent_t DstAgent, const void *Src,
+                          hsa_agent_t SrcAgent, size_t Size,
+                          uint32_t NumDepSignals,
+                          const hsa_signal_t *DepSignals,
+                          hsa_signal_t CompletionSignal) {
   if (!UseMultipleSdmaEngines) {
     hsa_status_t S =
         hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
@@ -193,8 +197,8 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
 #endif
 }
 
-Error getTargetTripleAndFeatures(hsa_agent_t Agent,
-                                 SmallVector<SmallString<32>> &Targets) {
+static Error getTargetTripleAndFeatures(hsa_agent_t Agent,
+                                        SmallVector<SmallString<32>> &Targets) {
   auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
     uint32_t Length;
     hsa_status_t Status;
@@ -419,7 +423,11 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
     assert(MemoryManager && "Invalid memory manager");
     assert(PtrStorage && "Invalid pointer storage");
 
-    *PtrStorage = MemoryManager->allocate(Size, nullptr);
+    auto PtrStorageOrErr = MemoryManager->allocate(Size, nullptr);
+    if (!PtrStorageOrErr)
+      return PtrStorageOrErr.takeError();
+
+    *PtrStorage = *PtrStorageOrErr;
     if (Size && *PtrStorage == nullptr)
       return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                            "failure to allocate from AMDGPU memory manager");
@@ -439,15 +447,12 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 private:
   /// Allocation callback that will be called once the memory manager does not
   /// have more previously allocated buffers.
-  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
+  Expected<void *> allocate(size_t Size, void *HstPtr,
+                            TargetAllocTy Kind) override;
 
   /// Deallocation callback that will be called by the memory manager.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
-    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
-      consumeError(std::move(Err));
-      return OFFLOAD_FAIL;
-    }
-    return OFFLOAD_SUCCESS;
+  Error free(void *TgtPtr, TargetAllocTy Kind) override {
+    return MemoryPool->deallocate(TgtPtr);
   }
 
   /// The underlying plugin that owns this memory manager.
@@ -464,8 +469,8 @@ private:
 struct AMDGPUDeviceImageTy : public DeviceImageTy {
   /// Create the AMDGPU image with the id and the target image pointer.
   AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
-                      const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, Device, TgtImage) {}
+                      std::unique_ptr<MemoryBuffer> &&TgtImage)
+      : DeviceImageTy(ImageId, Device, std::move(TgtImage)) {}
 
   /// Prepare and load the executable corresponding to the image.
   Error loadExecutable(const AMDGPUDeviceTy &Device);
@@ -570,6 +575,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return the maximum block size for maximum occupancy. Not implemented on
+  /// AMDGPU yet: this always returns an ErrorCode::UNSUPPORTED error.
+  /// TODO: This needs to be implemented for amdgpu
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations for AMDGPU are not yet implemented");
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
@@ -914,6 +929,7 @@ private:
     void *Dst;
     const void *Src;
     size_t Size;
+    size_t NumTimes;
   };
 
   /// Utility struct holding arguments for freeing buffers to memory managers.
@@ -964,9 +980,14 @@ private:
     StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
 
     /// Schedule a host memory copy action on the slot.
-    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+    ///
+    /// \p NumTimes repeats the copy that many times, sequentially in the dest
+    /// buffer.
+    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+                              size_t NumTimes = 1) {
       Callbacks.emplace_back(memcpyAction);
-      ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+      ActionArgs.emplace_back().MemcpyArgs =
+          MemcpyArgsTy{Dst, Src, Size, NumTimes};
       return Plugin::success();
     }
 
@@ -1063,6 +1084,20 @@ private:
   /// Indicate to spread data transfers across all available SDMAs
   bool UseMultipleSdmaEngines;
 
+  /// Runs a host callback: waits on the input, calls it, signals the output.
+  static void CallbackWrapper(AMDGPUSignalTy *InputSignal,
+                              AMDGPUSignalTy *OutputSignal,
+                              void (*Callback)(void *), void *UserData) {
+    // The wait is not expected to fail here; treat any error as fatal.
+    if (InputSignal)
+      if (auto Err = InputSignal->wait())
+        reportFatalInternalError(std::move(Err));
+
+    Callback(UserData);
+
+    OutputSignal->signal();
+  }
+
   /// Return the current number of asynchronous operations on the stream.
   uint32_t size() const { return NextSlot; }
 
@@ -1192,7 +1227,11 @@ private:
     assert(Args->Dst && "Invalid destination buffer");
     assert(Args->Src && "Invalid source buffer");
 
-    std::memcpy(Args->Dst, Args->Src, Args->Size);
+    auto *BasePtr = Args->Dst;
+    for (size_t I = 0; I < Args->NumTimes; I++) {
+      std::memcpy(BasePtr, Args->Src, Args->Size);
+      BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size;
+    }
 
     return Plugin::success();
   }
@@ -1397,7 +1436,8 @@ public:
   /// manager once the operation completes.
   Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                                uint64_t CopySize,
-                               AMDGPUMemoryManagerTy &MemoryManager) {
+                               AMDGPUMemoryManagerTy &MemoryManager,
+                               size_t NumTimes = 1) {
     // Retrieve available signals for the operation's outputs.
     AMDGPUSignalTy *OutputSignals[2] = {};
     if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1419,7 +1459,8 @@ public:
       // The std::memcpy is done asynchronously using an async handler. We store
       // the function's information in the action but it is not actually a
       // post action.
-      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+      if (auto Err =
+              Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
         return Err;
 
       // Make changes on this slot visible to the async handler's thread.
@@ -1440,7 +1481,11 @@ public:
       std::tie(Curr, InputSignal) = consume(OutputSignal);
     } else {
       // All preceding operations completed, copy the memory synchronously.
-      std::memcpy(Inter, Src, CopySize);
+      auto *InterPtr = Inter;
+      for (size_t I = 0; I < NumTimes; I++) {
+        std::memcpy(InterPtr, Src, CopySize);
+        InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize;
+      }
 
       // Return the second signal because it will not be used.
       OutputSignals[1]->decreaseUseCount();
@@ -1457,11 +1502,11 @@ public:
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                     Agent, CopySize, 1, &InputSignalRaw,
-                                     OutputSignal->get());
+                                     Agent, CopySize * NumTimes, 1,
+                                     &InputSignalRaw, OutputSignal->get());
     }
     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                   Agent, CopySize, 0, nullptr,
+                                   Agent, CopySize * NumTimes, 0, nullptr,
                                    OutputSignal->get());
   }
 
@@ -1495,6 +1540,31 @@ public:
                                    OutputSignal->get());
   }
 
+  Error pushHostCallback(void (*Callback)(void *), void *UserData) {
+    // Schedule Callback(UserData) on this stream; get a signal for its output.
+    AMDGPUSignalTy *OutputSignal = nullptr;
+    if (auto Err = SignalManager.getResource(OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    AMDGPUSignalTy *InputSignal;
+    {
+      std::lock_guard<std::mutex> Lock(Mutex);
+
+      // Consume a stream slot; InputSignal is the dependency to wait on, if any.
+      InputSignal = consume(OutputSignal).second;
+    }
+
+    // Run the callback on a detached thread. "Leaking" the thread here is
+    // consistent with other work added to the queue. The input and output
+    // signals will remain valid until the output is signaled.
+    std::thread(CallbackWrapper, InputSignal, OutputSignal, Callback, UserData)
+        .detach();
+
+    return Plugin::success();
+  }
+
   /// Synchronize with the stream. The current thread waits until all operations
   /// are finalized and it performs the pending post actions (i.e., releasing
   /// intermediate buffers).
@@ -1519,6 +1589,9 @@ public:
   /// actions for that and prior events.
   Error synchronizeOn(AMDGPUEventTy &Event);
 
+  /// Return true if the event from this queue is complete
+  Expected<bool> isEventComplete(const AMDGPUEventTy &Event);
+
   /// Query the stream and complete pending post actions if operations finished.
   /// Return whether all the operations completed. This operation does not block
   /// the calling thread.
@@ -1683,6 +1756,18 @@ Error AMDGPUStreamTy::synchronizeOn(AMDGPUEventTy &Event) {
   return completeUntil(Event.RecordedSlot);
 }
 
+Expected<bool> AMDGPUStreamTy::isEventComplete(const AMDGPUEventTy &Event) {
+  std::lock_guard<std::mutex> Lock(Mutex);
+  assert(Event.RecordedStream == this && "event is for a different stream");
+
+  if (Event.RecordedSyncCycle < SyncCycle) {
+    return true; // Recorded in an earlier sync cycle.
+  }
+  assert(Event.RecordedSyncCycle == SyncCycle && "event is from the future?");
+
+  return !Slots[Event.RecordedSlot].Signal->load(); // Zero means complete.
+}
+
 struct AMDGPUStreamManagerTy final
     : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
   using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
@@ -2080,7 +2165,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(*Image);
 
     // Unload the executable of the image.
-    return AMDImage.unloadExecutable();
+    if (auto Err = AMDImage.unloadExecutable())
+      return Err;
+
+    // Destroy the associated memory and invalidate the object.
+    Plugin.free(Image);
+    return Error::success();
   }
 
   /// Deinitialize the device and release its resources.
@@ -2103,18 +2193,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
                                        DeviceImageTy &Image) override {
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
-      Image.setPendingGlobalDtors();
-
     return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
   }
 
   virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) override {
-    if (Image.hasPendingGlobalDtors())
-      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-    return Plugin::success();
+    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
   }
 
   uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
@@ -2241,11 +2325,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Load the binary image into the device and allocate an image object.
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
-                                           int32_t ImageId) override {
+  Expected<DeviceImageTy *>
+  loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
+                 int32_t ImageId) override {
     // Allocate and initialize the image object.
     AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
-    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
+    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, std::move(TgtImage));
 
     // Load the HSA executable.
     if (Error Err = AMDImage->loadExecutable(*this))
@@ -2255,18 +2340,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
+  Expected<void *> allocate(size_t Size, void *, TargetAllocTy Kind) override;
 
   /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  Error free(void *TgtPtr, TargetAllocTy Kind) override {
     if (TgtPtr == nullptr)
-      return OFFLOAD_SUCCESS;
+      return Plugin::success();
 
     AMDGPUMemoryPoolTy *MemoryPool = nullptr;
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemoryPool = CoarseGrainedMemoryPools[0];
       break;
     case TARGET_ALLOC_HOST:
@@ -2277,17 +2361,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       break;
     }
 
-    if (!MemoryPool) {
-      REPORT("No memory pool for the specified allocation kind\n");
-      return OFFLOAD_FAIL;
-    }
+    if (!MemoryPool)
+      return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
+                           "no memory pool for the specified allocation kind");
 
-    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
-      REPORT("%s\n", toString(std::move(Err)).data());
-      return OFFLOAD_FAIL;
-    }
+    if (auto Err = MemoryPool->deallocate(TgtPtr))
+      return Err;
 
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
 
   /// Synchronize current thread with the pending operations on the async info.
@@ -2537,22 +2618,130 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           getAgent(), (uint64_t)Size);
   }
 
-  /// Initialize the async info for interoperability purposes.
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for AMDGPU devices as operations inserted into
+  /// a queue are in-order.
+  ///
+  /// \p Async is not inspected; the call always reports success.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
+
+  /// Fill \p Size bytes of device memory at \p TgtPtr by repeating the
+  /// \p PatternSize byte pattern read from \p PatternPtr.
+  ///
+  /// When \p Size is a multiple of four and the pattern is 1, 2 or 4 bytes
+  /// wide, the pattern is widened to 32 bits and written with
+  /// hsa_amd_memory_fill: through a host callback on the stream if the async
+  /// info has pending work, synchronously otherwise. Every other combination
+  /// is handled by enqueueing host-to-device copies of the pattern.
+  ///
+  /// \returns success, or the error reported by the pending-work query, the
+  /// stream, the pinned memory manager or HSA.
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    // Fast case, where we can use the 4 byte hsa_amd_memory_fill
+    if (Size % 4 == 0 &&
+        (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+      // Replicate the pattern across a 32-bit word. The loaded value is
+      // widened to uint32_t first so that no shift is done on a promoted int.
+      uint32_t Pattern;
+      if (PatternSize == 1) {
+        uint32_t Byte = *reinterpret_cast<const uint8_t *>(PatternPtr);
+        Pattern = Byte | Byte << 8 | Byte << 16 | Byte << 24;
+      } else if (PatternSize == 2) {
+        uint32_t Word = *reinterpret_cast<const uint16_t *>(PatternPtr);
+        Pattern = Word | (Word << 16);
+      } else if (PatternSize == 4) {
+        Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
+      } else {
+        // Shouldn't be here if the pattern size is outwith those values
+        llvm_unreachable("Invalid pattern size");
+      }
+
+      // hasPendingWorkImpl returns an Expected<bool>. Converting that object
+      // to bool only says whether the query succeeded, so unwrap it and branch
+      // on the value it carries; a failed query is propagated to the caller.
+      auto HasPendingWorkOrErr = hasPendingWorkImpl(AsyncInfoWrapper);
+      if (!HasPendingWorkOrErr)
+        return HasPendingWorkOrErr.takeError();
+
+      if (*HasPendingWorkOrErr) {
+        AMDGPUStreamTy *Stream = nullptr;
+        if (auto Err = getStream(AsyncInfoWrapper, Stream))
+          return Err;
+
+        // Heap-allocated so the arguments outlive this call; the callback
+        // below deletes them once the fill has been issued.
+        struct MemFillArgsTy {
+          void *Dst;
+          uint32_t Pattern;
+          int64_t Size;
+        };
+        auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+        auto Fill = [](void *Data) {
+          MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data);
+          assert(Args && "Invalid arguments");
+
+          auto Status =
+              hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+          delete Args;
+          auto Err =
+              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+          if (Err) {
+            FATAL_MESSAGE(1, "error performing async fill: %s",
+                          toString(std::move(Err)).data());
+          }
+        };
+
+        // hsa_amd_memory_fill doesn't signal completion using a signal, so use
+        // the existing host callback logic to handle that instead
+        return Stream->pushHostCallback(Fill, Args);
+      }
+      // If there is no pending work, do the fill synchronously. The count
+      // passed to hsa_amd_memory_fill is in 32-bit words, hence Size / 4.
+      auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+      return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+    }
+
+    // Slow case; allocate an appropriate memory size and enqueue copies
+    void *PinnedPtr = nullptr;
+    AMDGPUMemoryManagerTy &PinnedMemoryManager =
+        HostDevice.getPinnedMemoryManager();
+    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+      return Err;
+
+    // NOTE(review): if getStream fails here, PinnedPtr is not handed to the
+    // stream and nothing in this function releases it -- confirm whether the
+    // pinned buffer should be returned to PinnedMemoryManager on this path.
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    // NOTE(review): Size / PatternSize truncates, so this presumably relies on
+    // Size being a multiple of PatternSize -- confirm against the callers.
+    return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+                                          PatternSize, PinnedMemoryManager,
+                                          Size / PatternSize);
+  }
+
+  /// Initialize the async info
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     // TODO: Implement this function.
     return Plugin::success();
   }
 
-  /// Initialize the device info for interoperability purposes.
-  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
-    DeviceInfo->Context = nullptr;
+  /// Choose the interop specification for \p InteropType. A request of type
+  /// kmp_interop_type_target gets the HSA foreign runtime id; every other type
+  /// gets tgt_fr_none. \p NumPrefers and \p Prefers are not consulted.
+  interop_spec_t selectInteropPreference(int32_t InteropType,
+                                         int32_t NumPrefers,
+                                         interop_spec_t *Prefers) override {
+    // TODO: update once targetsync is supported
+    if (InteropType == kmp_interop_type_target)
+      return interop_spec_t{tgt_fr_hsa, {false, 0}, 0};
+    return interop_spec_t{tgt_fr_none, {false, 0}, 0};
+  }
+
+  /// Create an interop object of type \p InteropType for this device. The
+  /// object is allocated with new, tagged with the HSA foreign runtime id and
+  /// the AMD vendor id, and its device info carries the agent handle as the
+  /// device while platform and context are left null. \p InteropSpec is not
+  /// consulted.
+  Expected<omp_interop_val_t *>
+  createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override {
+    auto *Ret = new omp_interop_val_t(
+        DeviceId, static_cast<kmp_interop_type_t>(InteropType));
+    Ret->fr_id = tgt_fr_hsa;
+    Ret->vendor_id = omp_vendor_amd;
+
+    // TODO: implement targetsync support
 
-    if (!DeviceInfo->Device)
-      DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
+    Ret->device_info.Platform = nullptr;
+    Ret->device_info.Device = reinterpret_cast<void *>(Agent.handle);
+    Ret->device_info.Context = nullptr;
+
+    return Ret;
+  }
 
+  /// Destroy \p Interop with delete. A null pointer is accepted; the call
+  /// always reports success.
+  Error releaseInterop(omp_interop_val_t *Interop) override {
+    // Deleting a null pointer has no effect, so no guard is needed.
+    delete Interop;
     return Plugin::success();
   }
 
+  /// Push \p Callback, to be invoked with \p UserData, as a host callback onto
+  /// the stream obtained for \p AsyncInfo.
+  ///
+  /// \returns the error from obtaining the stream, or the result of pushing
+  /// the callback.
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+
+    return Stream->pushHostCallback(Callback, UserData);
+  }
+
   /// Create an event.
   Error createEventImpl(void **EventPtrStorage) override {
     AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
@@ -2591,7 +2780,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
-    auto Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>();
+    auto *Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>();
     if (!Stream)
       return false;
 
@@ -2601,6 +2790,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Query.takeError();
   }
 
+  /// Query whether the event in \p EventPtr has completed on the stream held
+  /// by \p AsyncInfo. Reports false when \p AsyncInfo holds no stream.
+  Expected<bool> isEventCompleteImpl(void *EventPtr,
+                                     AsyncInfoWrapperTy &AsyncInfo) override {
+    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+    auto *Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>();
+    if (!Stream)
+      return false;
+
+    // Forward the Expected<bool> untouched. Using it as an operand of && would
+    // convert it to bool, which yields "no error" rather than the value it
+    // carries, so a still-pending event would be reported as complete.
+    return Stream->isEventComplete(*Event);
+  }
+
   /// Synchronize the current thread with the event.
   Error syncEventImpl(void *EventPtr) override {
     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
@@ -2632,7 +2828,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Product Name", TmpChar);
+      Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
     if (Status == HSA_STATUS_SUCCESS)
@@ -2642,10 +2838,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR);
 
+    Info.add("Vendor ID", uint64_t{4130}, "", DeviceInfo::VENDOR_ID);
+
+    hsa_machine_model_t MachineModel;
+    Status = getDeviceAttrRaw(HSA_AGENT_INFO_MACHINE_MODEL, MachineModel);
+    if (Status == HSA_STATUS_SUCCESS)
+      Info.add("Memory Address Size",
+               uint64_t{MachineModel == HSA_MACHINE_MODEL_SMALL ? 32u : 64u},
+               "bits", DeviceInfo::ADDRESS_BITS);
+
     hsa_device_type_t DevType;
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
     if (Status == HSA_STATUS_SUCCESS) {
-      switch (DevType) {
+      switch (static_cast<int>(DevType)) {
       case HSA_DEVICE_TYPE_CPU:
         TmpCharPtr = "CPU";
         break;
@@ -2692,11 +2897,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Max Clock Freq", TmpUInt, "MHz");
+      Info.add("Max Clock Freq", TmpUInt, "MHz",
+               DeviceInfo::MAX_CLOCK_FREQUENCY);
+
+    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY, TmpUInt);
+    if (Status == HSA_STATUS_SUCCESS)
+      Info.add("Max Memory Clock Freq", TmpUInt, "MHz",
+               DeviceInfo::MEMORY_CLOCK_RATE);
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Compute Units", TmpUInt);
+      Info.add("Compute Units", TmpUInt, "", DeviceInfo::NUM_COMPUTE_UNITS);
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
@@ -2734,11 +2945,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Grid Max Size", TmpUInt);
+      Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE);
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
     if (Status == HSA_STATUS_SUCCESS) {
-      auto &MaxDim = *Info.add("Grid Max Size per Dimension");
+      auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{},
+                               "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
       MaxDim.add("x", GridMaxDim.x);
       MaxDim.add("y", GridMaxDim.y);
       MaxDim.add("z", GridMaxDim.z);
@@ -2778,7 +2990,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        PoolNode.add("Size", TmpSt, "bytes");
+        PoolNode.add(
+            "Size", TmpSt, "bytes",
+            (Pool->isGlobal() && Pool->isCoarseGrained())
+                ? std::optional<DeviceInfo>{DeviceInfo::GLOBAL_MEM_SIZE}
+                : std::nullopt);
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                                 TmpBool);
@@ -2910,7 +3126,7 @@ private:
     // Perform a quick check for the named kernel in the image. The kernel
     // should be created by the 'amdgpu-lower-ctor-dtor' pass.
     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
+    if (!Handler.isSymbolInImage(*this, Image, KernelName))
       return Plugin::success();
 
     // Allocate and construct the AMDGPU kernel.
@@ -3461,11 +3677,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                  KernelArgsTy &KernelArgs,
                                  KernelLaunchParamsTy LaunchParams,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-  if (ArgsSize != LaunchParams.Size &&
-      ArgsSize > LaunchParams.Size + getImplicitArgsSize())
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "invalid kernel arguments size");
-
   AMDGPUPluginTy &AMDGPUPlugin =
       static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
   AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
@@ -3551,8 +3762,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
     return Plugin::success();
 
   // General Info
-  auto NumGroups = NumBlocks;
-  auto ThreadsPerGroup = NumThreads;
+  auto *NumGroups = NumBlocks;
+  auto *ThreadsPerGroup = NumThreads;
 
   // Kernel Arguments Info
   auto ArgNum = KernelArgs.NumArgs;
@@ -3616,14 +3827,13 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
   return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc);
 }
 
-void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
-                                      TargetAllocTy Kind) {
+Expected<void *> AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
+                                                 TargetAllocTy Kind) {
   // Allocate memory from the pool.
   void *Ptr = nullptr;
-  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
-    consumeError(std::move(Err));
-    return nullptr;
-  }
+  if (auto Err = MemoryPool->allocate(Size, &Ptr))
+    return std::move(Err);
+
   assert(Ptr && "Invalid pointer");
 
   // Get a list of agents that can access this memory pool.
@@ -3633,14 +3843,13 @@ void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
       [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); });
 
   // Allow all valid kernel agents to access the allocation.
-  if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) {
-    REPORT("%s\n", toString(std::move(Err)).data());
-    return nullptr;
-  }
+  if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents))
+    return std::move(Err);
   return Ptr;
 }
 
-void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
+Expected<void *> AMDGPUDeviceTy::allocate(size_t Size, void *,
+                                          TargetAllocTy Kind) {
   if (Size == 0)
     return nullptr;
 
@@ -3649,7 +3858,6 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     MemoryPool = CoarseGrainedMemoryPools[0];
     break;
   case TARGET_ALLOC_HOST:
@@ -3660,17 +3868,14 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
     break;
   }
 
-  if (!MemoryPool) {
-    REPORT("No memory pool for the specified allocation kind\n");
-    return nullptr;
-  }
+  if (!MemoryPool)
+    return Plugin::error(ErrorCode::UNSUPPORTED,
+                         "no memory pool for the specified allocation kind");
 
   // Allocate from the corresponding memory pool.
   void *Alloc = nullptr;
-  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
-    REPORT("%s\n", toString(std::move(Err)).data());
-    return nullptr;
-  }
+  if (auto Err = MemoryPool->allocate(Size, &Alloc))
+    return std::move(Err);
 
   if (Alloc) {
     // Get a list of agents that can access this memory pool. Inherently
@@ -3683,10 +3888,8 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
                   });
 
     // Enable all valid kernel agents to access the buffer.
-    if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
-      REPORT("%s\n", toString(std::move(Err)).data());
-      return nullptr;
-    }
+    if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents))
+      return std::move(Err);
   }
 
   return Alloc;
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
index 2ad0f2b7dd6c..68d82cbea0f3 100644
--- a/offload/plugins-nextgen/common/include/ErrorReporting.h
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -61,7 +61,6 @@ class ErrorReporter {
   /// Return a nice name for an TargetAllocTy.
   static StringRef getAllocTyName(TargetAllocTy Kind) {
     switch (Kind) {
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
       return "device memory";
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
index d62516d20764..b4e3712d9c98 100644
--- a/offload/plugins-nextgen/common/include/JIT.h
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -51,27 +51,22 @@ struct JITEngine {
   /// Run jit compilation if \p Image is a bitcode image, otherwise simply
   /// return \p Image. It is expected to return a memory buffer containing the
   /// generated device image that could be loaded to the device directly.
-  Expected<const __tgt_device_image *>
-  process(const __tgt_device_image &Image,
-          target::plugin::GenericDeviceTy &Device);
-
-  /// Remove \p Image from the jit engine's cache
-  void erase(const __tgt_device_image &Image,
-             target::plugin::GenericDeviceTy &Device);
+  Expected<std::unique_ptr<MemoryBuffer>>
+  process(StringRef Image, target::plugin::GenericDeviceTy &Device);
 
 private:
   /// Compile the bitcode image \p Image and generate the binary image that can
   /// be loaded to the target device of the triple \p Triple architecture \p
   /// MCpu. \p PostProcessing will be called after codegen to handle cases such
   /// as assembler as an external tool.
-  Expected<const __tgt_device_image *>
-  compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind,
+  Expected<std::unique_ptr<MemoryBuffer>>
+  compile(StringRef Image, const std::string &ComputeUnitKind,
           PostProcessingFn PostProcessing);
 
   /// Create or retrieve the object image file from the file system or via
   /// compilation of the \p Image.
   Expected<std::unique_ptr<MemoryBuffer>>
-  getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx,
+  getOrCreateObjFile(StringRef Image, LLVMContext &Ctx,
                      const std::string &ComputeUnitKind);
 
   /// Run backend, which contains optimization and code generation.
@@ -92,14 +87,6 @@ private:
   struct ComputeUnitInfo {
     /// LLVM Context in which the modules will be constructed.
     LLVMContext Context;
-
-    /// A map of embedded IR images to the buffer used to store JITed code
-    DenseMap<const __tgt_device_image *, std::unique_ptr<MemoryBuffer>>
-        JITImages;
-
-    /// A map of embedded IR images to JITed images.
-    DenseMap<const __tgt_device_image *, std::unique_ptr<__tgt_device_image>>
-        TgtImageMap;
   };
 
   /// Map from (march) "CPUs" (e.g., sm_80, or gfx90a), which we call compute
diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h
index a4f6e628c403..8f6c1adcdaa5 100644
--- a/offload/plugins-nextgen/common/include/MemoryManager.h
+++ b/offload/plugins-nextgen/common/include/MemoryManager.h
@@ -25,6 +25,10 @@
 #include "Shared/Utils.h"
 #include "omptarget.h"
 
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
 /// Base class of per-device allocator.
 class DeviceAllocatorTy {
 public:
@@ -32,11 +36,13 @@ public:
 
   /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
   /// allocation.
-  virtual void *allocate(size_t Size, void *HstPtr,
-                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+  virtual Expected<void *>
+  allocate(size_t Size, void *HstPtr,
+           TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
 
   /// Delete the pointer \p TgtPtr on the device
-  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+  virtual Error free(void *TgtPtr,
+                     TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
 };
 
 /// Class of memory manager. The memory manager is per-device by using
@@ -134,17 +140,17 @@ class MemoryManagerTy {
   size_t SizeThreshold = 1U << 13;
 
   /// Request memory from target device
-  void *allocateOnDevice(size_t Size, void *HstPtr) const {
+  Expected<void *> allocateOnDevice(size_t Size, void *HstPtr) const {
     return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
   }
 
   /// Deallocate data on device
-  int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
+  Error deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
 
   /// This function is called when it tries to allocate memory on device but the
   /// device returns out of memory. It will first free all memory in the
   /// FreeList and try to allocate again.
-  void *freeAndAllocate(size_t Size, void *HstPtr) {
+  Expected<void *> freeAndAllocate(size_t Size, void *HstPtr) {
     std::vector<void *> RemoveList;
 
     // Deallocate all memory in FreeList
@@ -154,7 +160,8 @@ class MemoryManagerTy {
       if (List.empty())
         continue;
       for (const NodeTy &N : List) {
-        deleteOnDevice(N.Ptr);
+        if (auto Err = deleteOnDevice(N.Ptr))
+          return Err;
         RemoveList.push_back(N.Ptr);
       }
       FreeLists[I].clear();
@@ -175,14 +182,22 @@ class MemoryManagerTy {
   /// allocate directly on the device. If a \p nullptr is returned, it might
   /// be because the device is OOM. In that case, it will free all unused
   /// memory and then try again.
-  void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) {
-    void *TgtPtr = allocateOnDevice(Size, HstPtr);
+  Expected<void *> allocateOrFreeAndAllocateOnDevice(size_t Size,
+                                                     void *HstPtr) {
+    auto TgtPtrOrErr = allocateOnDevice(Size, HstPtr);
+    if (!TgtPtrOrErr)
+      return TgtPtrOrErr.takeError();
+
+    void *TgtPtr = *TgtPtrOrErr;
     // We cannot get memory from the device. It might be due to OOM. Let's
     // free all memory in FreeLists and try again.
     if (TgtPtr == nullptr) {
       DP("Failed to get memory on device. Free all memory in FreeLists and "
          "try again.\n");
-      TgtPtr = freeAndAllocate(Size, HstPtr);
+      TgtPtrOrErr = freeAndAllocate(Size, HstPtr);
+      if (!TgtPtrOrErr)
+        return TgtPtrOrErr.takeError();
+      TgtPtr = *TgtPtrOrErr;
     }
 
     if (TgtPtr == nullptr)
@@ -204,16 +219,17 @@ public:
 
   /// Destructor
   ~MemoryManagerTy() {
-    for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end();
-         ++Itr) {
-      assert(Itr->second.Ptr && "nullptr in map table");
-      deleteOnDevice(Itr->second.Ptr);
+    for (auto &PtrToNode : PtrToNodeTable) {
+      assert(PtrToNode.second.Ptr && "nullptr in map table");
+      if (auto Err = deleteOnDevice(PtrToNode.second.Ptr))
+        REPORT("Failure to delete memory: %s\n",
+               toString(std::move(Err)).data());
     }
   }
 
   /// Allocate memory of size \p Size from target device. \p HstPtr is used to
   /// assist the allocation.
-  void *allocate(size_t Size, void *HstPtr) {
+  Expected<void *> allocate(size_t Size, void *HstPtr) {
     // If the size is zero, we will not bother the target device. Just return
     // nullptr directly.
     if (Size == 0)
@@ -228,11 +244,14 @@ public:
       DP("%zu is greater than the threshold %zu. Allocate it directly from "
          "device\n",
          Size, SizeThreshold);
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+      auto TgtPtrOrErr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+      if (!TgtPtrOrErr)
+        return TgtPtrOrErr.takeError();
 
-      DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
+      DP("Got target pointer " DPxMOD ". Return directly.\n",
+         DPxPTR(*TgtPtrOrErr));
 
-      return TgtPtr;
+      return *TgtPtrOrErr;
     }
 
     NodeTy *NodePtr = nullptr;
@@ -260,8 +279,11 @@ public:
     if (NodePtr == nullptr) {
       DP("Cannot find a node in the FreeLists. Allocate on device.\n");
       // Allocate one on device
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+      auto TgtPtrOrErr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+      if (!TgtPtrOrErr)
+        return TgtPtrOrErr.takeError();
 
+      void *TgtPtr = *TgtPtrOrErr;
       if (TgtPtr == nullptr)
         return nullptr;
 
@@ -282,7 +304,7 @@ public:
   }
 
   /// Deallocate memory pointed by \p TgtPtr
-  int free(void *TgtPtr) {
+  Error free(void *TgtPtr) {
     DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr));
 
     NodeTy *P = nullptr;
@@ -314,7 +336,7 @@ public:
       FreeLists[B].insert(*P);
     }
 
-    return OFFLOAD_SUCCESS;
+    return Error::success();
   }
 
   /// Get the size threshold from the environment variable
@@ -344,4 +366,6 @@ public:
 constexpr const size_t MemoryManagerTy::BucketSize[];
 constexpr const int MemoryManagerTy::NumBuckets;
 
+} // namespace llvm
+
 #endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index c9ab34b024b7..8c530bba3882 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -193,7 +193,7 @@ struct InfoTreeNode {
 
   InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {}
   InfoTreeNode(std::string Key, VariantType Value, std::string Units)
-      : Key(Key), Value(Value), Units(Units) {}
+      : Key(std::move(Key)), Value(Value), Units(std::move(Units)) {}
 
   /// Add a new info entry as a child of this node. The entry requires at least
   /// a key string in \p Key. The value in \p Value is optional and can be any
@@ -202,7 +202,7 @@ struct InfoTreeNode {
   /// use that value for an appropriate olGetDeviceInfo query
   template <typename T = std::monostate>
   InfoTreeNode *add(std::string Key, T Value = T(),
-                    const std::string &Units = std::string(),
+                    std::string Units = std::string(),
                     std::optional<DeviceInfo> DeviceInfoKey = std::nullopt) {
     assert(!Key.empty() && "Invalid info key");
 
@@ -217,7 +217,8 @@ struct InfoTreeNode {
     else
       ValueVariant = std::string{Value};
 
-    auto Ptr = &Children->emplace_back(Key, ValueVariant, Units);
+    auto Ptr =
+        &Children->emplace_back(std::move(Key), ValueVariant, std::move(Units));
 
     if (DeviceInfoKey)
       DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1;
@@ -306,26 +307,18 @@ class DeviceImageTy {
   /// not unique between different device; they may overlap.
   int32_t ImageId;
 
-  /// The pointer to the raw __tgt_device_image.
-  const __tgt_device_image *TgtImage;
-  const __tgt_device_image *TgtImageBitcode;
+  /// The managed image data.
+  std::unique_ptr<MemoryBuffer> Image;
 
   /// Reference to the device this image is loaded on.
   GenericDeviceTy &Device;
 
-  /// If this image has any global destructors that much be called.
-  /// FIXME: This is only required because we currently have no invariants
-  ///        towards the lifetime of the underlying image. We should either copy
-  ///        the image into memory locally or erase the pointers after init.
-  bool PendingGlobalDtors;
-
 public:
+  virtual ~DeviceImageTy() = default;
+
   DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
-                const __tgt_device_image *Image)
-      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
-        PendingGlobalDtors(false) {
-    assert(TgtImage && "Invalid target image");
-  }
+                std::unique_ptr<MemoryBuffer> &&Image)
+      : ImageId(Id), Image(std::move(Image)), Device(Device) {}
 
   /// Get the image identifier within the device.
   int32_t getId() const { return ImageId; }
@@ -333,33 +326,17 @@ public:
   /// Get the device that this image is loaded onto.
   GenericDeviceTy &getDevice() const { return Device; }
 
-  /// Get the pointer to the raw __tgt_device_image.
-  const __tgt_device_image *getTgtImage() const { return TgtImage; }
-
-  void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
-    this->TgtImageBitcode = TgtImageBitcode;
-  }
-
-  const __tgt_device_image *getTgtImageBitcode() const {
-    return TgtImageBitcode;
-  }
-
   /// Get the image starting address.
-  void *getStart() const { return TgtImage->ImageStart; }
+  const void *getStart() const { return Image->getBufferStart(); }
 
   /// Get the image size.
-  size_t getSize() const {
-    return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
-  }
+  size_t getSize() const { return Image->getBufferSize(); }
 
   /// Get a memory buffer reference to the whole image.
   MemoryBufferRef getMemoryBuffer() const {
     return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
                            "Image");
   }
-  /// Accessors to the boolean value
-  bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
-  bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
 };
 
 /// Class implementing common functionalities of offload kernels. Each plugin
@@ -388,6 +365,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  /// Pure virtual query that each concrete kernel type must implement.
+  /// NOTE(review): presumably returns the largest group size this kernel can
+  /// be launched with on \p GenericDevice when \p DynamicMemSize bytes of
+  /// dynamic memory are requested -- confirm against the implementations.
+  virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                          uint64_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name.c_str(); }
 
@@ -414,6 +394,7 @@ struct GenericKernelTy {
     case OMP_TGT_EXEC_MODE_SPMD:
     case OMP_TGT_EXEC_MODE_GENERIC:
     case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+    case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
       return true;
     }
     return false;
@@ -431,6 +412,8 @@ protected:
       return "Generic";
     case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
       return "Generic-SPMD";
+    case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+      return "SPMD-No-Loop";
     }
     llvm_unreachable("Unknown execution mode!");
   }
@@ -468,7 +451,8 @@ private:
                         uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
                         uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
 
-  /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+  /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+  /// or SPMD mode.
   bool isGenericSPMDMode() const {
     return KernelEnvironment.Configuration.ExecMode ==
            OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -483,6 +467,10 @@ private:
   bool isBareMode() const {
     return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
   }
+  bool isNoLoopMode() const {
+    return KernelEnvironment.Configuration.ExecMode ==
+           OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+  }
 
   /// The kernel name.
   std::string Name;
@@ -820,19 +808,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Load the binary image into the device and return the target table.
   Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
-                                       const __tgt_device_image *TgtImage);
+                                       StringRef TgtImage);
   virtual Expected<DeviceImageTy *>
-  loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
+  loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage, int32_t ImageId) = 0;
 
   /// Unload a previously loaded Image from the device
   Error unloadBinary(DeviceImageTy *Image);
   virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
 
-  /// Setup the device environment if needed. Notice this setup may not be run
-  /// on some plugins. By default, it will be executed, but plugins can change
-  /// this behavior by overriding the shouldSetupDeviceEnvironment function.
-  Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
   /// Setup the global device memory pool, if the plugin requires one.
   Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
                               uint64_t PoolSize);
@@ -944,6 +927,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Insert a data fence between previous data operations and the following
+  /// operations if necessary for the device
+  virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
+
   /// Exchange data between devices (device to device transfer). Calling this
   /// function is only valid if GenericPlugin::isDataExchangable() passing the
   /// two devices returns true.
@@ -953,17 +940,26 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Fill data on the device with a pattern from the host
+  Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                 int64_t Size, __tgt_async_info *AsyncInfo);
+  virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
+                             int64_t PatternSize, int64_t Size,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
 
-  /// Initialize a __tgt_async_info structure. Related to interop features.
+  /// Initialize a __tgt_async_info structure.
   Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
   virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
-  /// Initialize a __tgt_device_info structure. Related to interop features.
-  Error initDeviceInfo(__tgt_device_info *DeviceInfo);
-  virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
+  /// Enqueue a host call to AsyncInfo
+  Error enqueueHostCall(void (*Callback)(void *), void *UserData,
+                        __tgt_async_info *AsyncInfo);
+  virtual Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                                    AsyncInfoWrapperTy &AsyncInfo) = 0;
 
   /// Create an event.
   Error createEvent(void **EventPtrStorage);
@@ -984,6 +980,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error waitEventImpl(void *EventPtr,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Check if the event enqueued to AsyncInfo is complete
+  Expected<bool> isEventComplete(void *Event, __tgt_async_info *AsyncInfo);
+  virtual Expected<bool>
+  isEventCompleteImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Synchronize the current thread with the event.
   Error syncEvent(void *EventPtr);
   virtual Error syncEventImpl(void *EventPtr) = 0;
@@ -1010,6 +1011,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   uint32_t getDefaultNumBlocks() const {
     return GridValues.GV_Default_Num_Teams;
   }
+  uint32_t getDebugKind() const { return OMPX_DebugKind; }
   uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
   virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
 
@@ -1150,11 +1152,6 @@ private:
   virtual Error getDeviceHeapSize(uint64_t &V) = 0;
   virtual Error setDeviceHeapSize(uint64_t V) = 0;
 
-  /// Indicate whether the device should setup the device environment. Notice
-  /// that returning false in this function will change the behavior of the
-  /// setupDeviceEnvironment() function.
-  virtual bool shouldSetupDeviceEnvironment() const { return true; }
-
   /// Indicate whether the device should setup the global device memory pool. If
   /// false is return the value on the device will be uninitialized.
   virtual bool shouldSetupDeviceMemoryPool() const { return true; }
@@ -1210,7 +1207,7 @@ protected:
   enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
 
   /// Array of peer access states with the rest of devices. This means that if
-  /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
+  /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
   /// the device I can access device J's memory directly. However, notice this
   /// does not mean that device J can access device I's memory directly.
   llvm::SmallVector<PeerAccessState> PeerAccesses;
@@ -1378,10 +1375,10 @@ public:
 
   /// Returns non-zero if the \p Image is compatible with the plugin. This
   /// function does not require the plugin to be initialized before use.
-  int32_t is_plugin_compatible(__tgt_device_image *Image);
+  int32_t isPluginCompatible(StringRef Image);
 
   /// Returns non-zero if the \p Image is compatible with the device.
-  int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image);
+  int32_t isDeviceCompatible(int32_t DeviceId, StringRef Image);
 
   /// Returns non-zero if the plugin device has been initialized.
   int32_t is_device_initialized(int32_t DeviceId) const;
@@ -1448,6 +1445,10 @@ public:
                               int DstDeviceId, void *DstPtr, int64_t Size,
                               __tgt_async_info *AsyncInfo);
 
+  /// Places a fence between previous data movements and following data
+  /// movements if necessary on the device
+  int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo);
+
   /// Begin executing a kernel on the given device.
   int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
@@ -1485,10 +1486,6 @@ public:
   /// Creates an asynchronous queue for the given plugin.
   int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
 
-  /// Creates device information to be used for diagnostics.
-  int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
-                           const char **ErrStr);
-
   /// Sets the offset into the devices for use by OMPT.
   int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
 
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 00720fa2d810..881e27dad384 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -49,13 +49,6 @@ using namespace omp::target;
 
 namespace {
 
-bool isImageBitcode(const __tgt_device_image &Image) {
-  StringRef Binary(reinterpret_cast<const char *>(Image.ImageStart),
-                   utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
-
-  return identify_magic(Binary) == file_magic::bitcode;
-}
-
 Expected<std::unique_ptr<Module>>
 createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB,
                              LLVMContext &Context) {
@@ -66,12 +59,10 @@ createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB,
                                      "failed to create module");
   return std::move(Mod);
 }
-Expected<std::unique_ptr<Module>>
-createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) {
-  StringRef Data((const char *)Image.ImageStart,
-                 utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+Expected<std::unique_ptr<Module>> createModuleFromImage(StringRef Image,
+                                                        LLVMContext &Context) {
   std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(
-      Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false);
+      Image, /*BufferName=*/"", /*RequiresNullTerminator=*/false);
   return createModuleFromMemoryBuffer(MB, Context);
 }
 
@@ -189,9 +180,10 @@ Expected<std::unique_ptr<MemoryBuffer>>
 JITEngine::backend(Module &M, const std::string &ComputeUnitKind,
                    unsigned OptLevel) {
 
-  auto RemarksFileOrErr = setupLLVMOptimizationRemarks(
-      M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"",
-      /*RemarksFormat=*/"", /*RemarksWithHotness=*/false);
+  Expected<LLVMRemarkFileHandle> RemarksFileOrErr =
+      setupLLVMOptimizationRemarks(
+          M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"",
+          /*RemarksFormat=*/"", /*RemarksWithHotness=*/false);
   if (Error E = RemarksFileOrErr.takeError())
     return std::move(E);
   if (*RemarksFileOrErr)
@@ -238,7 +230,7 @@ JITEngine::backend(Module &M, const std::string &ComputeUnitKind,
 }
 
 Expected<std::unique_ptr<MemoryBuffer>>
-JITEngine::getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx,
+JITEngine::getOrCreateObjFile(StringRef Image, LLVMContext &Ctx,
                               const std::string &ComputeUnitKind) {
 
   // Check if the user replaces the module at runtime with a finished object.
@@ -277,58 +269,28 @@ JITEngine::getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx,
   return backend(*Mod, ComputeUnitKind, JITOptLevel);
 }
 
-Expected<const __tgt_device_image *>
-JITEngine::compile(const __tgt_device_image &Image,
-                   const std::string &ComputeUnitKind,
+Expected<std::unique_ptr<MemoryBuffer>>
+JITEngine::compile(StringRef Image, const std::string &ComputeUnitKind,
                    PostProcessingFn PostProcessing) {
   std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex);
 
-  // Check if we JITed this image for the given compute unit kind before.
-  ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind];
-  if (CUI.TgtImageMap.contains(&Image))
-    return CUI.TgtImageMap[&Image].get();
-
-  auto ObjMBOrErr = getOrCreateObjFile(Image, CUI.Context, ComputeUnitKind);
+  LLVMContext Ctz;
+  auto ObjMBOrErr = getOrCreateObjFile(Image, Ctz, ComputeUnitKind);
   if (!ObjMBOrErr)
     return ObjMBOrErr.takeError();
 
-  auto ImageMBOrErr = PostProcessing(std::move(*ObjMBOrErr));
-  if (!ImageMBOrErr)
-    return ImageMBOrErr.takeError();
-
-  CUI.JITImages.insert({&Image, std::move(*ImageMBOrErr)});
-  auto &ImageMB = CUI.JITImages[&Image];
-  CUI.TgtImageMap.insert({&Image, std::make_unique<__tgt_device_image>()});
-  auto &JITedImage = CUI.TgtImageMap[&Image];
-  *JITedImage = Image;
-  JITedImage->ImageStart = const_cast<char *>(ImageMB->getBufferStart());
-  JITedImage->ImageEnd = const_cast<char *>(ImageMB->getBufferEnd());
-
-  return JITedImage.get();
+  return PostProcessing(std::move(*ObjMBOrErr));
 }
 
-Expected<const __tgt_device_image *>
-JITEngine::process(const __tgt_device_image &Image,
-                   target::plugin::GenericDeviceTy &Device) {
-  const std::string &ComputeUnitKind = Device.getComputeUnitKind();
+Expected<std::unique_ptr<MemoryBuffer>>
+JITEngine::process(StringRef Image, target::plugin::GenericDeviceTy &Device) {
+  assert(identify_magic(Image) == file_magic::bitcode && "Image not LLVM-IR");
 
+  const std::string &ComputeUnitKind = Device.getComputeUnitKind();
   PostProcessingFn PostProcessing = [&Device](std::unique_ptr<MemoryBuffer> MB)
       -> Expected<std::unique_ptr<MemoryBuffer>> {
     return Device.doJITPostProcessing(std::move(MB));
   };
 
-  if (isImageBitcode(Image))
-    return compile(Image, ComputeUnitKind, PostProcessing);
-
-  return &Image;
-}
-
-void JITEngine::erase(const __tgt_device_image &Image,
-                      target::plugin::GenericDeviceTy &Device) {
-  std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex);
-  const std::string &ComputeUnitKind = Device.getComputeUnitKind();
-  ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind];
-
-  CUI.TgtImageMap.erase(&Image);
-  CUI.JITImages.erase(&Image);
+  return compile(Image, ComputeUnitKind, PostProcessing);
 }
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 083d41659a46..db43cbe49cc2 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -73,11 +73,17 @@ private:
   };
   llvm::SmallVector<GlobalEntry> GlobalEntries{};
 
-  void *suggestAddress(uint64_t MaxMemoryAllocation) {
+  Expected<void *> suggestAddress(uint64_t MaxMemoryAllocation) {
     // Get a valid pointer address for this system
-    void *Addr =
+    auto AddrOrErr =
         Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
-    Device->free(Addr);
+    if (!AddrOrErr)
+      return AddrOrErr.takeError();
+
+    void *Addr = *AddrOrErr;
+    if (auto Err = Device->free(Addr))
+      return std::move(Err);
+
     // Align Address to MaxMemoryAllocation
     Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
     return Addr;
@@ -86,8 +92,12 @@ private:
   Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
     size_t ASize = MaxMemoryAllocation;
 
-    if (!VAddr && isRecording())
-      VAddr = suggestAddress(MaxMemoryAllocation);
+    if (!VAddr && isRecording()) {
+      auto VAddrOrErr = suggestAddress(MaxMemoryAllocation);
+      if (!VAddrOrErr)
+        return VAddrOrErr.takeError();
+      VAddr = *VAddrOrErr;
+    }
 
     DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
 
@@ -117,8 +127,11 @@ private:
     constexpr size_t STEP = 1024 * 1024 * 1024ULL;
     MemoryStart = nullptr;
     for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
-      MemoryStart =
+      auto MemoryStartOrErr =
           Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
+      if (!MemoryStartOrErr)
+        return MemoryStartOrErr.takeError();
+      MemoryStart = *MemoryStartOrErr;
       if (MemoryStart)
         break;
     }
@@ -214,15 +227,7 @@ public:
     raw_fd_ostream OS(ImageName, EC);
     if (EC)
       report_fatal_error("Error saving image : " + StringRef(EC.message()));
-    if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
-      size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
-                                      TgtImageBitcode->ImageStart);
-      MemoryBufferRef MBR = MemoryBufferRef(
-          StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
-      OS << MBR.getBuffer();
-    } else {
-      OS << Image.getMemoryBuffer().getBuffer();
-    }
+    OS << Image.getMemoryBuffer().getBuffer();
     OS.close();
   }
 
@@ -360,65 +365,19 @@ public:
     return Plugin::success();
   }
 
-  void deinit() {
+  Error deinit() {
     if (UsedVAMap) {
       if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
-        report_fatal_error("Error on releasing virtual memory space");
+        return Err;
     } else {
-      Device->free(MemoryStart);
+      if (auto Err = Device->free(MemoryStart))
+        return Err;
     }
+    return Plugin::success();
   }
 };
 } // namespace llvm::omp::target::plugin
 
-// Extract the mapping of host function pointers to device function pointers
-// from the entry table. Functions marked as 'indirect' in OpenMP will have
-// offloading entries generated for them which map the host's function pointer
-// to a global containing the corresponding function pointer on the device.
-static Expected<std::pair<void *, uint64_t>>
-setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
-                       DeviceImageTy &Image) {
-  GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-
-  llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
-      Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
-  llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
-  for (const auto &Entry : Entries) {
-    if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
-        !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
-      continue;
-
-    assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
-    auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
-
-    GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
-    if (auto Err =
-            Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
-      return std::move(Err);
-
-    HstPtr = Entry.Address;
-    if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
-                                       Entry.Size, nullptr))
-      return std::move(Err);
-  }
-
-  // If we do not have any indirect globals we exit early.
-  if (IndirectCallTable.empty())
-    return std::pair{nullptr, 0};
-
-  // Sort the array to allow for more efficient lookup of device pointers.
-  llvm::sort(IndirectCallTable,
-             [](const auto &x, const auto &y) { return x.first < y.first; });
-
-  uint64_t TableSize =
-      IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
-  void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
-  if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
-                                   TableSize, nullptr))
-    return std::move(Err);
-  return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
-}
-
 AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
                                        __tgt_async_info *AsyncInfoPtr)
     : Device(Device),
@@ -662,6 +621,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
     return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
   }
 
+  // Return the number of teams required to cover the loop iterations.
+  if (isNoLoopMode())
+    return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1;
+
   uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
   uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
   if (LoopTripCount > 0) {
@@ -857,9 +820,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
       return Err;
   }
 
-  if (Image->getTgtImageBitcode())
-    Plugin.getJIT().erase(*Image->getTgtImageBitcode(), Image->getDevice());
-
   return unloadBinaryImpl(Image);
 }
 
@@ -893,7 +853,8 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
 
   RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
   if (RecordReplay.isRecordingOrReplaying())
-    RecordReplay.deinit();
+    if (auto Err = RecordReplay.deinit())
+      return Err;
 
   if (RPCServer)
     if (auto Err = RPCServer->deinitDevice(*this))
@@ -909,40 +870,33 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
 
   return deinitImpl();
 }
-Expected<DeviceImageTy *>
-GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
-                            const __tgt_device_image *InputTgtImage) {
-  assert(InputTgtImage && "Expected non-null target image");
-  DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));
-
-  auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
-  if (!PostJITImageOrErr) {
-    auto Err = PostJITImageOrErr.takeError();
-    REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
-           DeviceId, toStringWithoutConsuming(Err).data());
-    return Plugin::error(ErrorCode::COMPILE_FAILURE, std::move(Err),
-                         "failure to jit IR image");
+Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
+                                                      StringRef InputTgtImage) {
+  DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage.bytes_begin()));
+
+  std::unique_ptr<MemoryBuffer> Buffer;
+  if (identify_magic(InputTgtImage) == file_magic::bitcode) {
+    auto CompiledImageOrErr = Plugin.getJIT().process(InputTgtImage, *this);
+    if (!CompiledImageOrErr) {
+      return Plugin::error(ErrorCode::COMPILE_FAILURE,
+                           CompiledImageOrErr.takeError(),
+                           "failure to jit IR image");
+    }
+    Buffer = std::move(*CompiledImageOrErr);
+  } else {
+    Buffer = MemoryBuffer::getMemBufferCopy(InputTgtImage);
   }
 
   // Load the binary and allocate the image object. Use the next available id
   // for the image id, which is the number of previously loaded images.
-  auto ImageOrErr =
-      loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
+  auto ImageOrErr = loadBinaryImpl(std::move(Buffer), LoadedImages.size());
   if (!ImageOrErr)
     return ImageOrErr.takeError();
-
   DeviceImageTy *Image = *ImageOrErr;
-  assert(Image != nullptr && "Invalid image");
-  if (InputTgtImage != PostJITImageOrErr.get())
-    Image->setTgtImageBitcode(InputTgtImage);
 
   // Add the image to list.
   LoadedImages.push_back(Image);
 
-  // Setup the device environment if needed.
-  if (auto Err = setupDeviceEnvironment(Plugin, *Image))
-    return std::move(Err);
-
   // Setup the global device memory pool if needed.
   if (!Plugin.getRecordReplay().isReplaying() &&
       shouldSetupDeviceMemoryPool()) {
@@ -960,12 +914,12 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
 
 #ifdef OMPT_SUPPORT
   if (ompt::Initialized) {
-    size_t Bytes =
-        utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
+    size_t Bytes = InputTgtImage.size();
     performOmptCallback(
         device_load, Plugin.getUserId(DeviceId),
         /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
-        /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
+        /*ImgSize=*/Bytes,
+        /*HostAddr=*/const_cast<unsigned char *>(InputTgtImage.bytes_begin()),
         /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
   }
 #endif
@@ -978,43 +932,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   return Image;
 }
 
-Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
-                                              DeviceImageTy &Image) {
-  // There are some plugins that do not need this step.
-  if (!shouldSetupDeviceEnvironment())
-    return Plugin::success();
-
-  // Obtain a table mapping host function pointers to device function pointers.
-  auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
-  if (!CallTablePairOrErr)
-    return CallTablePairOrErr.takeError();
-
-  DeviceEnvironmentTy DeviceEnvironment;
-  DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
-  DeviceEnvironment.NumDevices = Plugin.getNumDevices();
-  // TODO: The device ID used here is not the real device ID used by OpenMP.
-  DeviceEnvironment.DeviceNum = DeviceId;
-  DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
-  DeviceEnvironment.ClockFrequency = getClockFrequency();
-  DeviceEnvironment.IndirectCallTable =
-      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
-  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
-  DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
-
-  // Create the metainfo of the device environment global.
-  GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
-                        sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
-
-  // Write device environment values to the device.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
-    DP("Missing symbol %s, continue execution anyway.\n",
-       DevEnvGlobal.getName().data());
-    consumeError(std::move(Err));
-  }
-  return Plugin::success();
-}
-
 Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
                                              DeviceImageTy &Image,
                                              uint64_t PoolSize) {
@@ -1337,16 +1254,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool ReleaseQueue) {
+  if (!AsyncInfo)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "invalid async info queue");
+
   SmallVector<void *> AllocsToDelete{};
   {
     std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
 
-    if (!AsyncInfo || !AsyncInfo->Queue)
-      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                           "invalid async info queue");
-
-    if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
-      return Err;
+    // The queue can be unset when no work has been added to the AsyncInfo, in
+    // which case the device has nothing to synchronize.
+    if (AsyncInfo->Queue)
+      if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
+        return Err;
 
     std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
   }
@@ -1391,10 +1311,12 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
 
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
   case TARGET_ALLOC_DEVICE:
     if (MemoryManager) {
-      Alloc = MemoryManager->allocate(Size, HostPtr);
+      auto AllocOrErr = MemoryManager->allocate(Size, HostPtr);
+      if (!AllocOrErr)
+        return AllocOrErr.takeError();
+      Alloc = *AllocOrErr;
       if (!Alloc)
         return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                              "failed to allocate from memory manager");
@@ -1402,12 +1324,16 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
     }
     [[fallthrough]];
   case TARGET_ALLOC_HOST:
-  case TARGET_ALLOC_SHARED:
-    Alloc = allocate(Size, HostPtr, Kind);
+  case TARGET_ALLOC_SHARED: {
+    auto AllocOrErr = allocate(Size, HostPtr, Kind);
+    if (!AllocOrErr)
+      return AllocOrErr.takeError();
+    Alloc = *AllocOrErr;
     if (!Alloc)
       return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                            "failed to allocate from device allocator");
   }
+  }
 
   // Report error if the memory manager or the device allocator did not return
   // any memory buffer.
@@ -1479,29 +1405,19 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
 #undef DEALLOCATION_ERROR
   }
 
-  int Res;
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
   case TARGET_ALLOC_DEVICE:
     if (MemoryManager) {
-      Res = MemoryManager->free(TgtPtr);
-      if (Res)
-        return Plugin::error(
-            ErrorCode::OUT_OF_RESOURCES,
-            "failure to deallocate device pointer %p via memory manager",
-            TgtPtr);
+      if (auto Err = MemoryManager->free(TgtPtr))
+        return Err;
       break;
     }
     [[fallthrough]];
   case TARGET_ALLOC_HOST:
   case TARGET_ALLOC_SHARED:
-    Res = free(TgtPtr, Kind);
-    if (Res)
-      return Plugin::error(
-          ErrorCode::UNKNOWN,
-          "failure to deallocate device pointer %p via device deallocator",
-          TgtPtr);
+    if (auto Err = free(TgtPtr, Kind))
+      return Err;
   }
 
   // Unregister deallocated pinned memory buffer if the type is host memory.
@@ -1540,6 +1456,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
+                                int64_t PatternSize, int64_t Size,
+                                __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Err =
+      dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
@@ -1589,10 +1515,13 @@ Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
   return Err;
 }
 
-Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
-  assert(DeviceInfo && "Invalid device info");
+Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData,
+                                       __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
 
-  return initDeviceInfoImpl(DeviceInfo);
+  auto Err = enqueueHostCallImpl(Callback, UserData, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
 }
 
 Error GenericDeviceTy::printInfo() {
@@ -1648,6 +1577,22 @@ Expected<bool> GenericDeviceTy::hasPendingWork(__tgt_async_info *AsyncInfo) {
   return Res;
 }
 
+Expected<bool> GenericDeviceTy::isEventComplete(void *Event,
+                                                __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Res = isEventCompleteImpl(Event, AsyncInfoWrapper);
+  if (auto Err = Res.takeError()) {
+    AsyncInfoWrapper.finalize(Err);
+    return Err;
+  }
+
+  auto Err = Plugin::success();
+  AsyncInfoWrapper.finalize(Err);
+  if (Err)
+    return Err;
+  return Res;
+}
+
 Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
@@ -1774,28 +1719,26 @@ Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
 
 int32_t GenericPluginTy::is_initialized() const { return Initialized; }
 
-int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
-  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
-                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
-
+int32_t GenericPluginTy::isPluginCompatible(StringRef Image) {
   auto HandleError = [&](Error Err) -> bool {
     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
-    DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
+    DP("Failure to check validity of image %p: %s", Image.data(),
+       ErrStr.c_str());
     return false;
   };
-  switch (identify_magic(Buffer)) {
+  switch (identify_magic(Image)) {
   case file_magic::elf:
   case file_magic::elf_relocatable:
   case file_magic::elf_executable:
   case file_magic::elf_shared_object:
   case file_magic::elf_core: {
-    auto MatchOrErr = checkELFImage(Buffer);
+    auto MatchOrErr = checkELFImage(Image);
     if (Error Err = MatchOrErr.takeError())
       return HandleError(std::move(Err));
     return *MatchOrErr;
   }
   case file_magic::bitcode: {
-    auto MatchOrErr = checkBitcodeImage(Buffer);
+    auto MatchOrErr = checkBitcodeImage(Image);
     if (Error Err = MatchOrErr.takeError())
       return HandleError(std::move(Err));
     return *MatchOrErr;
@@ -1805,36 +1748,33 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
   }
 }
 
-int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
-                                              __tgt_device_image *Image) {
-  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
-                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
-
+int32_t GenericPluginTy::isDeviceCompatible(int32_t DeviceId, StringRef Image) {
   auto HandleError = [&](Error Err) -> bool {
     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
-    DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
+    DP("Failure to check validity of image %p: %s", Image.data(),
+       ErrStr.c_str());
     return false;
   };
-  switch (identify_magic(Buffer)) {
+  switch (identify_magic(Image)) {
   case file_magic::elf:
   case file_magic::elf_relocatable:
   case file_magic::elf_executable:
   case file_magic::elf_shared_object:
   case file_magic::elf_core: {
-    auto MatchOrErr = checkELFImage(Buffer);
+    auto MatchOrErr = checkELFImage(Image);
     if (Error Err = MatchOrErr.takeError())
       return HandleError(std::move(Err));
     if (!*MatchOrErr)
       return false;
 
     // Perform plugin-dependent checks for the specific architecture if needed.
-    auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer);
+    auto CompatibleOrErr = isELFCompatible(DeviceId, Image);
     if (Error Err = CompatibleOrErr.takeError())
       return HandleError(std::move(Err));
     return *CompatibleOrErr;
   }
   case file_magic::bitcode: {
-    auto MatchOrErr = checkBitcodeImage(Buffer);
+    auto MatchOrErr = checkBitcodeImage(Image);
     if (Error Err = MatchOrErr.takeError())
       return HandleError(std::move(Err));
     return *MatchOrErr;
@@ -1895,7 +1835,9 @@ int32_t GenericPluginTy::load_binary(int32_t DeviceId,
                                      __tgt_device_binary *Binary) {
   GenericDeviceTy &Device = getDevice(DeviceId);
 
-  auto ImageOrErr = Device.loadBinary(*this, TgtImage);
+  StringRef Buffer(reinterpret_cast<const char *>(TgtImage->ImageStart),
+                   utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart));
+  auto ImageOrErr = Device.loadBinary(*this, Buffer);
   if (!ImageOrErr) {
     auto Err = ImageOrErr.takeError();
     REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
@@ -2180,21 +2122,6 @@ int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
   return OFFLOAD_SUCCESS;
 }
 
-int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
-                                          __tgt_device_info *DeviceInfo,
-                                          const char **ErrStr) {
-  *ErrStr = "";
-
-  auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
-  if (Err) {
-    REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
-           DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
 int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
                                                int32_t DeviceId) {
   UserDeviceIds[DeviceId] = UserId;
@@ -2217,8 +2144,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
   GenericGlobalHandlerTy &GHandler = getGlobalHandler();
   if (auto Err =
           GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
-    REPORT("Failure to look up global address: %s\n",
-           toString(std::move(Err)).data());
+    consumeError(std::move(Err));
     return OFFLOAD_FAIL;
   }
 
@@ -2324,3 +2250,15 @@ int32_t GenericPluginTy::async_barrier(omp_interop_val_t *Interop) {
   }
   return OFFLOAD_SUCCESS;
 }
+
+/// Plugin entry point that places a data fence on device \p DeviceId.
+/// Reports the error and returns OFFLOAD_FAIL if the device call fails.
+int32_t GenericPluginTy::data_fence(int32_t DeviceId,
+                                    __tgt_async_info *AsyncInfo) {
+  if (auto FenceErr = getDevice(DeviceId).dataFence(AsyncInfo)) {
+    REPORT("failure to place data fence on device %d: %s\n", DeviceId,
+           toString(std::move(FenceErr)).data());
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 678be78b56af..e19f2ef94de6 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -28,15 +28,22 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
   switch (Port.get_opcode()) {
   case LIBC_MALLOC: {
     Port.recv_and_send([&](rpc::Buffer *Buffer, uint32_t) {
-      Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
-          Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
+      auto PtrOrErr =
+          Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE);
+      void *Ptr = nullptr;
+      if (!PtrOrErr)
+        llvm::consumeError(PtrOrErr.takeError());
+      else
+        Ptr = *PtrOrErr;
+      Buffer->data[0] = reinterpret_cast<uintptr_t>(Ptr);
     });
     break;
   }
   case LIBC_FREE: {
     Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
-      Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                  TARGET_ALLOC_DEVICE_NON_BLOCKING);
+      if (auto Err = Device.free(reinterpret_cast<void *>(Buffer->data[0]),
+                                 TARGET_ALLOC_DEVICE))
+        llvm::consumeError(std::move(Err));
     });
     break;
   }
@@ -171,9 +178,13 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
                               plugin::DeviceImageTy &Image) {
   uint64_t NumPorts =
       std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
-  void *RPCBuffer = Device.allocate(
+  auto RPCBufferOrErr = Device.allocate(
       rpc::Server::allocation_size(Device.getWarpSize(), NumPorts), nullptr,
       TARGET_ALLOC_HOST);
+  if (!RPCBufferOrErr)
+    return RPCBufferOrErr.takeError();
+
+  void *RPCBuffer = *RPCBufferOrErr;
   if (!RPCBuffer)
     return plugin::Plugin::error(
         error::ErrorCode::UNKNOWN,
@@ -198,7 +209,8 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
 
 Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
   std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
-  Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+  if (auto Err = Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST))
+    return Err;
   Buffers[Device.getDeviceId()] = nullptr;
   Devices[Device.getDeviceId()] = nullptr;
   return Error::success();
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 361a781e8f9b..f5b2d074a47e 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
 DLWRAP(cuMemcpyHtoD, 3)
 DLWRAP(cuMemcpyHtoDAsync, 4)
 
+DLWRAP(cuMemsetD8Async, 4)
+DLWRAP(cuMemsetD16Async, 4)
+DLWRAP(cuMemsetD32Async, 4)
+DLWRAP(cuMemsetD2D8Async, 6)
+DLWRAP(cuMemsetD2D16Async, 6)
+DLWRAP(cuMemsetD2D32Async, 6)
+
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
 DLWRAP(cuMemFreeAsync, 2)
@@ -72,6 +79,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
@@ -82,6 +90,7 @@ DLWRAP(cuCtxSetLimit, 2)
 
 DLWRAP(cuEventCreate, 2)
 DLWRAP(cuEventRecord, 2)
+DLWRAP(cuEventQuery, 1)
 DLWRAP(cuStreamWaitEvent, 3)
 DLWRAP(cuEventSynchronize, 1)
 DLWRAP(cuEventDestroy, 1)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8..dec4e33508c6 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -321,6 +322,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
 CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
 
+CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                           CUstream);
+CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
 CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
@@ -352,6 +363,7 @@ CUresult cuCtxSetLimit(CUlimit, size_t);
 
 CUresult cuEventCreate(CUevent *, unsigned int);
 CUresult cuEventRecord(CUevent, CUstream);
+CUresult cuEventQuery(CUevent);
 CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
 CUresult cuEventSynchronize(CUevent);
 CUresult cuEventDestroy(CUevent);
@@ -372,5 +384,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index e94f3f6af7dd..db94f7f2dd99 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -81,8 +81,8 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
 struct CUDADeviceImageTy : public DeviceImageTy {
   /// Create the CUDA image with the id and the target image pointer.
   CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
-                    const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
+                    std::unique_ptr<MemoryBuffer> &&TgtImage)
+      : DeviceImageTy(ImageId, Device, std::move(TgtImage)), Module(nullptr) {}
 
   /// Load the image as a CUDA module.
   Error loadModule() {
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Ask cuOccupancyMaxPotentialBlockSize for the block size it reports for
+  /// this kernel when it uses \p DynamicMemSize bytes of dynamic memory.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
+                                  uint64_t DynamicMemSize) const override {
+    int GridSize;
+    int BlockSize;
+    CUresult Res = cuOccupancyMaxPotentialBlockSize(
+        &GridSize, &BlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s"))
+      return std::move(Err);
+    return BlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
@@ -371,6 +385,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (auto Err = CUDAImage.unloadModule())
       return Err;
 
+    // Destroy the associated memory and invalidate the object.
+    Plugin.free(Image);
     return Plugin::success();
   }
 
@@ -404,20 +420,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
                                        DeviceImageTy &Image) override {
-    // Check for the presence of global destructors at initialization time. This
-    // is required when the image may be deallocated before destructors are run.
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
-      Image.setPendingGlobalDtors();
-
     return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
   }
 
   virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) override {
-    if (Image.hasPendingGlobalDtors())
-      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-    return Plugin::success();
+    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
   }
 
   Expected<std::unique_ptr<MemoryBuffer>>
@@ -535,14 +543,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
   CUdevice getCUDADevice() const { return Device; }
 
   /// Load the binary image into the device and allocate an image object.
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
-                                           int32_t ImageId) override {
+  Expected<DeviceImageTy *>
+  loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
+                 int32_t ImageId) override {
     if (auto Err = setContext())
       return std::move(Err);
 
     // Allocate and initialize the image object.
     CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
-    new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
+    new (CUDAImage) CUDADeviceImageTy(ImageId, *this, std::move(TgtImage));
 
     // Load the CUDA module.
     if (auto Err = CUDAImage->loadModule())
@@ -552,14 +561,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+  Expected<void *> allocate(size_t Size, void *, TargetAllocTy Kind) override {
     if (Size == 0)
       return nullptr;
 
-    if (auto Err = setContext()) {
-      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
-      return nullptr;
-    }
+    if (auto Err = setContext())
+      return std::move(Err);
 
     void *MemAlloc = nullptr;
     CUdeviceptr DevicePtr;
@@ -578,35 +585,20 @@ struct CUDADeviceTy : public GenericDeviceTy {
       Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
       MemAlloc = (void *)DevicePtr;
       break;
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
-      CUstream Stream;
-      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
-        break;
-      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
-        break;
-      cuStreamSynchronize(Stream);
-      Res = cuStreamDestroy(Stream);
-      MemAlloc = (void *)DevicePtr;
-    }
     }
 
-    if (auto Err =
-            Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) {
-      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
-      return nullptr;
-    }
+    if (auto Err = Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s"))
+      return std::move(Err);
     return MemAlloc;
   }
 
   /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  Error free(void *TgtPtr, TargetAllocTy Kind) override {
     if (TgtPtr == nullptr)
-      return OFFLOAD_SUCCESS;
+      return Plugin::success();
 
-    if (auto Err = setContext()) {
-      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
-      return OFFLOAD_FAIL;
-    }
+    if (auto Err = setContext())
+      return Err;
 
     CUresult Res;
     switch (Kind) {
@@ -618,22 +610,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_HOST:
       Res = cuMemFreeHost(TgtPtr);
       break;
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
-      CUstream Stream;
-      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
-        break;
-      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
-      cuStreamSynchronize(Stream);
-      if ((Res = cuStreamDestroy(Stream)))
-        break;
-    }
     }
 
-    if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
-      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
-      return OFFLOAD_FAIL;
-    }
-    return OFFLOAD_SUCCESS;
+    return Plugin::check(Res, "error in cuMemFree[Host]: %s");
   }
 
   /// Synchronize current thread with the pending operations on the async info.
@@ -844,6 +823,64 @@ struct CUDADeviceTy : public GenericDeviceTy {
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  /// Fill \p Size bytes at \p TgtPtr by repeating the \p PatternSize pattern.
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    CUresult Res = CUDA_SUCCESS;
+    size_t N = Size / PatternSize;
+    if (PatternSize == 1) {
+      Res = cuMemsetD8Async((CUdeviceptr)TgtPtr,
+                            *(static_cast<const uint8_t *>(PatternPtr)), N,
+                            Stream);
+    } else if (PatternSize == 2) {
+      Res = cuMemsetD16Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint16_t *>(PatternPtr)), N,
+                             Stream);
+    } else if (PatternSize == 4) {
+      Res = cuMemsetD32Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint32_t *>(PatternPtr)), N,
+                             Stream);
+    } else {
+      // For larger patterns we can do a series of strided fills to copy the
+      // pattern efficiently, stopping at the first step that fails.
+      int64_t MemsetSize = PatternSize % 4u == 0u   ? 4u
+                           : PatternSize % 2u == 0u ? 2u
+                                                    : 1u;
+      int64_t NumberOfSteps = PatternSize / MemsetSize;
+      int64_t Pitch = NumberOfSteps * MemsetSize;
+      int64_t Height = Size / PatternSize;
+      for (int64_t Step = 0; Step < NumberOfSteps && Res == CUDA_SUCCESS;
+           ++Step) {
+        if (MemsetSize == 4) {
+          Res = cuMemsetD2D32Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        } else if (MemsetSize == 2) {
+          Res = cuMemsetD2D16Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        } else {
+          Res = cuMemsetD2D8Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        }
+      }
+    }
+
+    return Plugin::check(Res, "error in cuMemset: %s");
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     if (auto Err = setContext())
@@ -856,23 +893,70 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
-  /// Initialize the device info for interoperability purposes.
-  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
-    assert(Context && "Context is null");
-    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for CUDA devices as operations inserted into
+  /// a queue are in-order.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
 
-    if (auto Err = setContext())
-      return Err;
+  interop_spec_t selectInteropPreference(int32_t InteropType,
+                                         int32_t NumPrefers,
+                                         interop_spec_t *Prefers) override {
+    return interop_spec_t{tgt_fr_cuda, {true, 0}, 0};
+  }
+
+  Expected<omp_interop_val_t *>
+  createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override {
+    // Acquire the stream before allocating so a failure cannot leak memory.
+    CUstream Stream = nullptr;
+    if (InteropType == kmp_interop_type_targetsync) {
+      if (auto Err = setContext())
+        return std::move(Err);
+      if (auto Err = CUDAStreamManager.getResource(Stream))
+        return std::move(Err);
+    }
+    auto *Ret = new omp_interop_val_t(
+        DeviceId, static_cast<kmp_interop_type_t>(InteropType));
+    Ret->fr_id = tgt_fr_cuda;
+    Ret->vendor_id = omp_vendor_nvidia;
+    if (InteropType == kmp_interop_type_target ||
+        InteropType == kmp_interop_type_targetsync) {
+      Ret->device_info.Platform = nullptr;
+      Ret->device_info.Device = reinterpret_cast<void *>(Device);
+      Ret->device_info.Context = Context;
+    }
+    if (InteropType == kmp_interop_type_targetsync) {
+      Ret->async_info = new __tgt_async_info();
+      Ret->async_info->Queue = Stream;
+    }
+    return Ret;
+  }
 
-    if (!DeviceInfo->Context)
-      DeviceInfo->Context = Context;
+  Error releaseInterop(omp_interop_val_t *Interop) override {
+    if (!Interop)
+      return Plugin::success();
 
-    if (!DeviceInfo->Device)
-      DeviceInfo->Device = reinterpret_cast<void *>(Device);
+    if (Interop->async_info)
+      delete Interop->async_info;
 
+    delete Interop;
     return Plugin::success();
   }
 
+  /// Enqueue \p Callback with \p UserData as a host function on the stream.
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+    CUresult Res = cuLaunchHostFunc(Stream, Callback, UserData);
+    return Plugin::check(Res, "error in cuLaunchHostFunc: %s");
+  }
+
   /// Create an event.
   Error createEventImpl(void **EventPtrStorage) override {
     CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
@@ -914,9 +998,33 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
   }
 
-  // TODO: This should be implementable on CUDA
   Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
-    return true;
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+
+    CUresult Ret = cuStreamQuery(Stream);
+    if (Ret == CUDA_SUCCESS)
+      return false;
+
+    if (Ret == CUDA_ERROR_NOT_READY)
+      return true;
+
+    return Plugin::check(Ret, "error in cuStreamQuery: %s");
+  }
+
+  /// True if cuEventQuery reports success, false if not ready, else an error.
+  Expected<bool> isEventCompleteImpl(void *EventPtr,
+                                     AsyncInfoWrapperTy &) override {
+    CUresult Status = cuEventQuery(reinterpret_cast<CUevent>(EventPtr));
+    switch (Status) {
+    case CUDA_SUCCESS:
+      return true;
+    case CUDA_ERROR_NOT_READY:
+      return false;
+    default:
+      return Plugin::check(Status, "error in cuEventQuery: %s");
+    }
   }
 
   /// Synchronize the current thread with the event.
@@ -944,18 +1052,27 @@ struct CUDADeviceTy : public GenericDeviceTy {
     Info.add("CUDA OpenMP Device Number", DeviceId);
 
     Res = cuDeviceGetName(TmpChar, 1000, Device);
-    if (Res == CUDA_SUCCESS)
+    if (Res == CUDA_SUCCESS) {
       Info.add("Device Name", TmpChar, "", DeviceInfo::NAME);
+      Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
+    }
 
     Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR);
 
+    Info.add("Vendor ID", uint64_t{4318}, "", DeviceInfo::VENDOR_ID);
+
+    Info.add("Memory Address Size", std::numeric_limits<CUdeviceptr>::digits,
+             "bits", DeviceInfo::ADDRESS_BITS);
+
     Res = cuDeviceTotalMem(&TmpSt, Device);
     if (Res == CUDA_SUCCESS)
-      Info.add("Global Memory Size", TmpSt, "bytes");
+      Info.add("Global Memory Size", TmpSt, "bytes",
+               DeviceInfo::GLOBAL_MEM_SIZE);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add("Number of Multiprocessors", TmpInt);
+      Info.add("Number of Multiprocessors", TmpInt, "",
+               DeviceInfo::NUM_COMPUTE_UNITS);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -995,7 +1112,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       MaxBlock.add("z", TmpInt);
 
-    auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
+    // TODO: I assume CUDA devices have no limit on the amount of threads,
+    // verify this
+    Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "",
+             DeviceInfo::MAX_WORK_SIZE);
+
+    auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "",
+                              DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
     if (Res == CUDA_SUCCESS)
       MaxGrid.add("x", TmpInt);
@@ -1016,7 +1139,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add("Clock Rate", TmpInt, "kHz");
+      Info.add("Clock Rate", TmpInt / 1000, "MHz",
+               DeviceInfo::MAX_CLOCK_FREQUENCY);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -1053,7 +1177,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add("Memory Clock Rate", TmpInt, "kHz");
+      Info.add("Memory Clock Rate", TmpInt / 1000, "MHz",
+               DeviceInfo::MEMORY_CLOCK_RATE);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -1166,7 +1291,7 @@ private:
     // Perform a quick check for the named kernel in the image. The kernel
     // should be created by the 'nvptx-lower-ctor-dtor' pass.
     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
+    if (!Handler.isSymbolInImage(*this, Image, KernelName))
       return Plugin::success();
 
     // The Nvidia backend cannot handle creating the ctor / dtor array
@@ -1201,8 +1326,12 @@ private:
 
     // Allocate a buffer to store all of the known constructor / destructor
     // functions in so we can iterate them on the device.
-    void *Buffer =
+    auto BufferOrErr =
         allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
+    if (!BufferOrErr)
+      return BufferOrErr.takeError();
+
+    void *Buffer = *BufferOrErr;
     if (!Buffer)
       return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                            "failed to allocate memory for global buffer");
@@ -1251,12 +1380,10 @@ private:
 
     Error Err = Plugin::success();
     AsyncInfoWrapper.finalize(Err);
+    if (Err)
+      return Err;
 
-    if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
-      return Plugin::error(ErrorCode::UNKNOWN,
-                           "failed to free memory for global buffer");
-
-    return Err;
+    return free(Buffer, TARGET_ALLOC_DEVICE);
   }
 
   /// Stream manager for CUDA streams.
@@ -1319,7 +1446,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
         Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
     if (auto Err = Plugin::check(
             AttrResult,
-            "Error in cuLaunchKernel while setting the memory limits: %s"))
+            "error in cuFuncSetAttribute while setting the memory limits: %s"))
       return Err;
     MaxDynCGroupMemLimit = MaxDynCGroupMem;
   }
@@ -1453,7 +1580,7 @@ struct CUDAPluginTy final : public GenericPluginTy {
     unsigned SM =
         Header.e_ident[ELF::EI_ABIVERSION] == ELF::ELFABIVERSION_CUDA_V1
             ? Header.e_flags & ELF::EF_CUDA_SM
-            : (Header.e_flags & ELF::EF_CUDA_SM_MASK) >> 8;
+            : (Header.e_flags & ELF::EF_CUDA_SM_MASK) >> ELF::EF_CUDA_SM_OFFSET;
 
     CUdevice Device;
     CUresult Res = cuDeviceGet(&Device, DeviceId);
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index ed5213531999..eb4ecac9907a 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Return maximum block size for maximum occupancy
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations are not implemented for the host device");
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);
@@ -123,8 +131,8 @@ private:
 struct GenELF64DeviceImageTy : public DeviceImageTy {
   /// Create the GenELF64 image with the id and the target image pointer.
   GenELF64DeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
-                        const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, Device, TgtImage), DynLib() {}
+                        std::unique_ptr<MemoryBuffer> &&TgtImage)
+      : DeviceImageTy(ImageId, Device, std::move(TgtImage)), DynLib() {}
 
   /// Getter and setter for the dynamic library.
   DynamicLibrary &getDynamicLibrary() { return DynLib; }
@@ -181,11 +189,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error setContext() override { return Plugin::success(); }
 
   /// Load the binary image into the device and allocate an image object.
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
-                                           int32_t ImageId) override {
+  Expected<DeviceImageTy *>
+  loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
+                 int32_t ImageId) override {
     // Allocate and initialize the image object.
     GenELF64DeviceImageTy *Image = Plugin.allocate<GenELF64DeviceImageTy>();
-    new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage);
+    new (Image) GenELF64DeviceImageTy(ImageId, *this, std::move(TgtImage));
 
     // Create a temporary file.
     char TmpFileName[] = "/tmp/tmpfile_XXXXXX";
@@ -231,7 +240,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate memory. Use std::malloc in all cases.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+  Expected<void *> allocate(size_t Size, void *, TargetAllocTy Kind) override {
     if (Size == 0)
       return nullptr;
 
@@ -241,7 +250,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_HOST:
     case TARGET_ALLOC_SHARED:
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemAlloc = std::malloc(Size);
       break;
     }
@@ -249,9 +257,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
 
   /// Free the memory. Use std::free in all cases.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  Error free(void *TgtPtr, TargetAllocTy Kind) override {
     std::free(TgtPtr);
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
 
   /// This plugin does nothing to lock buffers. Do not return an error, just
@@ -295,6 +303,28 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
                          "dataExchangeImpl not supported");
   }
 
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for Host devices as operations inserted into
+  /// a queue are in-order.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
+
+  /// Fill memory at TgtPtr with the PatternSize-byte pattern, repeated every
+  /// PatternSize bytes while the offset is below Size (whole patterns only).
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (PatternSize == 1) {
+      std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
+      return Plugin::success();
+    }
+    // Step is 64-bit like Size; a 32-bit counter wraps once Size >= 4 GiB.
+    for (int64_t Step = 0; Step < Size; Step += PatternSize)
+      std::memcpy(static_cast<char *>(TgtPtr) + Step, PatternPtr, PatternSize);
+    return Plugin::success();
+  }
+
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
@@ -314,11 +344,11 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
                          "initAsyncInfoImpl not supported");
   }
 
-  /// This plugin does not support interoperability
-  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
-    return Plugin::error(ErrorCode::UNSUPPORTED,
-                         "initDeviceInfoImpl not supported");
-  }
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    Callback(UserData);
+    return Plugin::success();
+  };
 
   /// This plugin does not support the event API. Do nothing without failing.
   Error createEventImpl(void **EventPtrStorage) override {
@@ -337,6 +367,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
     return true;
   }
+  Expected<bool> isEventCompleteImpl(void *Event,
+                                     AsyncInfoWrapperTy &AsyncInfo) override {
+    return true;
+  }
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
 
   /// Print information about the device.
@@ -347,7 +381,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
 
   /// This plugin should not setup the device environment or memory pool.
-  virtual bool shouldSetupDeviceEnvironment() const override { return false; };
   virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
 
   /// Getters and setters for stack size and heap size not relevant.
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index f3e8e9a66685..c0290bfdab3f 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -83,6 +83,7 @@ config.test_format = lit.formats.ShTest()
 config.test_flags = " -I " + config.test_source_root + \
     " -I " + config.omp_header_directory + \
     " -L " + config.library_dir + \
+    " -L " + config.llvm_library_intdir + \
     " -L " + config.llvm_lib_directory
 
 # compiler specific flags
@@ -165,11 +166,12 @@ else: # Unices
         config.test_flags += " -nogpulib"
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
+    config.test_flags += " -Wl,-rpath," + config.llvm_library_intdir
     config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
     if config.cuda_libdir:
         config.test_flags += " -Wl,-rpath," + config.cuda_libdir
     if config.libomptarget_current_target.startswith('nvptx'):
-        config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir
+        config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir + "/nvptx64-nvidia-cuda"
     if config.libomptarget_current_target.endswith('-LTO'):
         config.test_flags += " -foffload-lto"
     if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env(
diff --git a/offload/test/mapping/chained_containing_structs_1.cc b/offload/test/mapping/chained_containing_structs_1.cc
new file mode 100644
index 000000000000..4dbb17140de1
--- /dev/null
+++ b/offload/test/mapping/chained_containing_structs_1.cc
@@ -0,0 +1,58 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// XFAIL: *
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+
+struct S {
+  int a;
+  int b;
+  int c;
+};
+
+struct T {
+  S *s0;
+  S *s1;
+  S *s2;
+};
+
+int main() {
+  T *v = (T *) malloc (sizeof(T));
+  v->s0 = (S *) malloc (sizeof(S));
+  v->s1 = (S *) malloc (sizeof(S));
+  v->s2 = (S *) malloc (sizeof(S));
+  v->s0->a = 10;
+  v->s0->b = 10;
+  v->s0->c = 10;
+  v->s1->a = 20;
+  v->s1->b = 20;
+  v->s1->c = 20;
+  v->s2->a = 30;
+  v->s2->b = 30;
+  v->s2->c = 30;
+
+#pragma omp target map(to: v[:1]) map(tofrom: v->s1->b, v->s1->c, v->s2->b)
+  {
+    v->s1->b += 3;
+    v->s1->c += 5;
+    v->s2->b += 7;
+  }
+
+  printf ("%d\n", v->s0->a); // CHECK: 10
+  printf ("%d\n", v->s0->b); // CHECK: 10
+  printf ("%d\n", v->s0->c); // CHECK: 10
+  printf ("%d\n", v->s1->a); // CHECK: 20
+  printf ("%d\n", v->s1->b); // CHECK: 23
+  printf ("%d\n", v->s1->c); // CHECK: 25
+  printf ("%d\n", v->s2->a); // CHECK: 30
+  printf ("%d\n", v->s2->b); // CHECK: 37
+  printf ("%d\n", v->s2->c); // CHECK: 30
+
+  free(v->s0);
+  free(v->s1);
+  free(v->s2);
+  free(v);
+
+  return 0;
+}
diff --git a/offload/test/mapping/chained_containing_structs_2.cc b/offload/test/mapping/chained_containing_structs_2.cc
new file mode 100644
index 000000000000..29c4c8b7fedf
--- /dev/null
+++ b/offload/test/mapping/chained_containing_structs_2.cc
@@ -0,0 +1,76 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// XFAIL: *
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+
+struct R {
+  int d;
+  int e;
+  int f;
+};
+
+struct S {
+  R *r0;
+  R *r1;
+  R *r2;
+};
+
+struct T {
+  S *s0;
+  S *s1;
+  S *s2;
+};
+
+int main() {
+  T *v = (T *) malloc (sizeof(T));
+
+  v->s0 = (S *) malloc (sizeof(S));
+  v->s1 = (S *) malloc (sizeof(S));
+  v->s2 = (S *) malloc (sizeof(S));
+
+  v->s0->r0 = (R *) calloc (1, sizeof(R));
+  v->s0->r1 = (R *) calloc (1, sizeof(R));
+  v->s0->r2 = (R *) calloc (1, sizeof(R));
+
+  v->s1->r0 = (R *) calloc (1, sizeof(R));
+  v->s1->r1 = (R *) calloc (1, sizeof(R));
+  v->s1->r2 = (R *) calloc (1, sizeof(R));
+
+  v->s2->r0 = (R *) calloc (1, sizeof(R));
+  v->s2->r1 = (R *) calloc (1, sizeof(R));
+  v->s2->r2 = (R *) calloc (1, sizeof(R));
+
+  #pragma omp target map(to: v->s1, v->s2, *v->s1, v->s1->r1, *v->s2, v->s2->r0) \
+                     map(tofrom: v->s1->r1->d, v->s1->r1->e, v->s1->r2->d, v->s1->r2->f, v->s2->r0->e)
+  {
+    v->s1->r1->d += 3;
+    v->s1->r1->e += 5;
+    v->s1->r2->d += 7;
+    v->s1->r2->f += 9;
+    v->s2->r0->e += 11;
+  }
+
+  printf ("%d\n", v->s1->r1->d); // CHECK: 3
+  printf ("%d\n", v->s1->r1->e); // CHECK: 5
+  printf ("%d\n", v->s1->r2->d); // CHECK: 7
+  printf ("%d\n", v->s1->r2->f); // CHECK: 9
+  printf ("%d\n", v->s2->r0->e); // CHECK: 11
+
+  free(v->s0->r0);
+  free(v->s0->r1);
+  free(v->s0->r2);
+  free(v->s1->r0);
+  free(v->s1->r1);
+  free(v->s1->r2);
+  free(v->s2->r0);
+  free(v->s2->r1);
+  free(v->s2->r2);
+  free(v->s0);
+  free(v->s1);
+  free(v->s2);
+  free(v);
+
+  return 0;
+}
diff --git a/offload/test/mapping/chained_containing_structs_3.cc b/offload/test/mapping/chained_containing_structs_3.cc
new file mode 100644
index 000000000000..23555bf69110
--- /dev/null
+++ b/offload/test/mapping/chained_containing_structs_3.cc
@@ -0,0 +1,217 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+#include <cstring>
+
+#include <omp.h>
+
+struct R {
+  int d;
+  int e;
+  int f;
+};
+
+struct S {
+  int a;
+  int b;
+  struct {
+    int c;
+    R r;
+    R *rp;
+  } sub;
+  int g;
+};
+
+struct T {
+  int a;
+  int *ptr;
+  int b;
+};
+
+int main() {
+  R r;
+  R *rp = new R;
+  S s;
+  S *sp = new S;
+  T t;
+  T *tp = new T;
+
+  memset(&r, 0, sizeof(R));
+  memset(rp, 0, sizeof(R));
+  memset(&s, 0, sizeof(S));
+  memset(sp, 0, sizeof(S));
+  memset(&t, 0, sizeof(T));
+  memset(tp, 0, sizeof(T));
+
+  s.sub.rp = new R;
+  sp->sub.rp = new R;
+
+  memset(s.sub.rp, 0, sizeof(R));
+  memset(sp->sub.rp, 0, sizeof(R));
+
+  t.ptr = new int[10];
+  tp->ptr = new int[10];
+
+  memset(t.ptr, 0, sizeof(int)*10);
+  memset(tp->ptr, 0, sizeof(int)*10);
+
+#pragma omp target map(tofrom: r) map(tofrom: r.e)
+{
+  r.d++;
+  r.e += 2;
+  r.f += 3;
+}
+  printf ("%d\n", r.d); // CHECK: 1
+  printf ("%d\n", r.e); // CHECK-NEXT: 2
+  printf ("%d\n", r.f); // CHECK-NEXT: 3
+
+#pragma omp target map(tofrom: rp[:1]) map(tofrom: rp->e)
+{
+  rp->d++;
+  rp->e += 2;
+  rp->f += 3;
+}
+
+  printf ("%d\n", rp->d); // CHECK-NEXT: 1
+  printf ("%d\n", rp->e); // CHECK-NEXT: 2
+  printf ("%d\n", rp->f); // CHECK-NEXT: 3
+
+  int v;
+  int *orig_addr_v = &v;
+  bool separate_memory_space;
+
+#pragma omp target data map(v)
+  {
+    void *mapped_ptr_v =
+        omp_get_mapped_ptr(orig_addr_v, omp_get_default_device());
+    separate_memory_space = mapped_ptr_v != (void*) orig_addr_v;
+  }
+
+  const char *mapping_flavour = separate_memory_space ? "separate" : "unified";
+
+#pragma omp target map(to: s) map(tofrom: s.sub.r.e)
+{
+  s.b++;
+  s.sub.r.d+=2;
+  s.sub.r.e+=3;
+  s.sub.r.f+=4;
+}
+
+  printf ("%d/%s\n", s.b, mapping_flavour);
+  printf ("%d/%s\n", s.sub.r.d, mapping_flavour);
+  printf ("%d/%s\n", s.sub.r.e, mapping_flavour);
+  printf ("%d/%s\n", s.sub.r.f, mapping_flavour);
+
+  // CHECK: {{0/separate|1/unified}}
+  // CHECK-NEXT: {{0/separate|2/unified}}
+  // CHECK-NEXT: 3
+  // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(to: s, s.b) map(to: s.sub.rp[:1]) map(tofrom: s.sub.rp->e)
+{
+  s.b++;
+  s.sub.rp->d+=2;
+  s.sub.rp->e+=3;
+  s.sub.rp->f+=4;
+}
+
+  printf ("%d/%s\n", s.b, mapping_flavour);
+  printf ("%d/%s\n", s.sub.rp->d, mapping_flavour);
+  printf ("%d/%s\n", s.sub.rp->e, mapping_flavour);
+  printf ("%d/%s\n", s.sub.rp->f, mapping_flavour);
+
+  // CHECK-NEXT: {{0/separate|2/unified}}
+  // CHECK-NEXT: {{0/separate|2/unified}}
+  // CHECK-NEXT: 3
+  // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(to: sp[:1]) map(tofrom: sp->sub.r.e)
+{
+  sp->b++;
+  sp->sub.r.d+=2;
+  sp->sub.r.e+=3;
+  sp->sub.r.f+=4;
+}
+
+  printf ("%d/%s\n", sp->b, mapping_flavour);
+  printf ("%d/%s\n", sp->sub.r.d, mapping_flavour);
+  printf ("%d/%s\n", sp->sub.r.e, mapping_flavour);
+  printf ("%d/%s\n", sp->sub.r.f, mapping_flavour);
+
+  // CHECK-NEXT: {{0/separate|1/unified}}
+  // CHECK-NEXT: {{0/separate|2/unified}}
+  // CHECK-NEXT: 3
+  // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(to: sp[:1]) map(to: sp->sub.rp[:1]) map(tofrom: sp->sub.rp->e)
+{
+  sp->b++;
+  sp->sub.rp->d+=2;
+  sp->sub.rp->e+=3;
+  sp->sub.rp->f+=4;
+}
+
+  printf ("%d/%s\n", sp->b, mapping_flavour);
+  printf ("%d/%s\n", sp->sub.rp->d, mapping_flavour);
+  printf ("%d/%s\n", sp->sub.rp->e, mapping_flavour);
+  printf ("%d/%s\n", sp->sub.rp->f, mapping_flavour);
+
+  // CHECK-NEXT: {{0/separate|2/unified}}
+  // CHECK-NEXT: {{0/separate|2/unified}}
+  // CHECK-NEXT: 3
+  // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1])
+{
+  t.a++;
+  t.ptr[2]+=2;
+  t.b+=3;
+}
+
+  printf ("%d\n", t.a); // CHECK-NEXT: 1
+  printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 2
+  printf ("%d\n", t.b); // CHECK-NEXT: 3
+
+#pragma omp target map(tofrom: t) map(tofrom: t.a)
+{
+  t.b++;
+}
+
+  printf ("%d\n", t.b); // CHECK-NEXT: 4
+
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a)
+{
+  t.a++;
+  t.ptr[2]+=2;
+  t.b+=3;
+}
+
+  printf ("%d\n", t.a); // CHECK-NEXT: 2
+  printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4
+  printf ("%d\n", t.b); // CHECK-NEXT: 7
+
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a)
+{
+  /* Empty */
+}
+
+  printf ("%d\n", t.a); // CHECK-NEXT: 2
+  printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4
+  printf ("%d\n", t.b); // CHECK-NEXT: 7
+
+  delete s.sub.rp;
+  delete sp->sub.rp;
+
+  delete[] t.ptr;
+  delete[] tp->ptr;
+
+  delete rp;
+  delete sp;
+  delete tp;
+
+  return 0;
+}
diff --git a/offload/test/mapping/data_member_ref.cpp b/offload/test/mapping/data_member_ref.cpp
index fdb8abcaa650..7947a62c169f 100644
--- a/offload/test/mapping/data_member_ref.cpp
+++ b/offload/test/mapping/data_member_ref.cpp
@@ -60,7 +60,8 @@ int main() {
   printf("Host %d %d.\n", Bar.VRef.Data, V.Data);
   // CHECK: Host 123456.
   printf("Host %d.\n", *Baz.VRef.Data);
-#pragma omp target map(*Baz.VRef.Data) map(from : D1, D2)
+#pragma omp target map(Baz.VRef.Data) map(*Baz.VRef.Data) map(V1.Data[0 : 0])  \
+    map(from : D1, D2)
   {
     // CHECK: Device 123456.
     D1 = *Baz.VRef.Data;
diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
index c6c5657ae616..45fd042aedb0 100644
--- a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
@@ -44,8 +44,8 @@ int main() {
 
   int spp00fa = -1, spp00fca = -1, spp00fb_r = -1;
   __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-#pragma omp target map(tofrom: spp[0][0]) firstprivate(p)                           \
-                   map(from: spp00fa, spp00fca, spp00fb_r)
+#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) firstprivate(p) \
+    map(from : spp00fa, spp00fca, spp00fb_r)
   {
     spp00fa = spp[0][0].f.a;
     spp00fca = spp[0][0].f.c.a;
diff --git a/offload/test/mapping/declare_mapper_nested_mappers.cpp b/offload/test/mapping/declare_mapper_nested_mappers.cpp
index a9e3f05e0f5f..a59ed6980ec4 100644
--- a/offload/test/mapping/declare_mapper_nested_mappers.cpp
+++ b/offload/test/mapping/declare_mapper_nested_mappers.cpp
@@ -42,8 +42,8 @@ int main() {
   int spp00fa = -1, spp00fb_r = -1, spp00fg1 = -1, spp00fg_r = -1;
   __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]),
              p1 = reinterpret_cast<__intptr_t>(&y[0]);
-#pragma omp target map(tofrom : spp[0][0]) firstprivate(p, p1)                  \
-                   map(from: spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
+#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0])                 \
+    firstprivate(p, p1) map(from : spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
   {
     spp00fa = spp[0][0].f.a;
     spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 1 : 0;
diff --git a/offload/test/mapping/lambda_by_value.cpp b/offload/test/mapping/lambda_by_value.cpp
index 5516dedd72a9..4c0278d40592 100644
--- a/offload/test/mapping/lambda_by_value.cpp
+++ b/offload/test/mapping/lambda_by_value.cpp
@@ -1,4 +1,5 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileopt-generic -fno-exceptions
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
 
 #include <stdint.h>
 #include <stdio.h>
diff --git a/offload/test/mapping/map_back_race.cpp b/offload/test/mapping/map_back_race.cpp
index 8a988d3be3b4..49bbe87e2449 100644
--- a/offload/test/mapping/map_back_race.cpp
+++ b/offload/test/mapping/map_back_race.cpp
@@ -2,6 +2,9 @@
 
 // Taken from https://github.com/llvm/llvm-project/issues/54216
 
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
+
 #include <algorithm>
 #include <cstdlib>
 #include <iostream>
diff --git a/offload/test/mapping/map_both_pointer_pointee.c b/offload/test/mapping/map_both_pointer_pointee.c
index 7be1ba465e7d..1934b702dbba 100644
--- a/offload/test/mapping/map_both_pointer_pointee.c
+++ b/offload/test/mapping/map_both_pointer_pointee.c
@@ -1,11 +1,10 @@
-// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-x86_64-unknown-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// RUN: %libomptarget-compile-run-and-check-generic
 
 // REQUIRES: unified_shared_memory
 // UNSUPPORTED: amdgcn-amd-amdhsa
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: nvidiagpu
 
 #pragma omp declare target
 int *ptr1;
diff --git a/offload/test/mapping/map_ptr_and_star_global.c b/offload/test/mapping/map_ptr_and_star_global.c
index c3b0dd2f49e6..869fb8ca9bc2 100644
--- a/offload/test/mapping/map_ptr_and_star_global.c
+++ b/offload/test/mapping/map_ptr_and_star_global.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c
index f0ca84d1cc4d..97fa7cd53715 100644
--- a/offload/test/mapping/map_ptr_and_star_local.c
+++ b/offload/test/mapping/map_ptr_and_star_local.c
@@ -1,4 +1,9 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/mapping/map_ptr_and_subscript_global.c b/offload/test/mapping/map_ptr_and_subscript_global.c
index a3a10b6c9b21..839db068aa90 100644
--- a/offload/test/mapping/map_ptr_and_subscript_global.c
+++ b/offload/test/mapping/map_ptr_and_subscript_global.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_ptr_and_subscript_local.c b/offload/test/mapping/map_ptr_and_subscript_local.c
index bb44999541a7..68ac9dc0917f 100644
--- a/offload/test/mapping/map_ptr_and_subscript_local.c
+++ b/offload/test/mapping/map_ptr_and_subscript_local.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c
index 10e72e070dbc..f855e87d7218 100644
--- a/offload/test/mapping/map_structptr_and_member_global.c
+++ b/offload/test/mapping/map_structptr_and_member_global.c
@@ -1,4 +1,9 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c
index 9e59551ad3d6..bd9e2a89eb6f 100644
--- a/offload/test/mapping/map_structptr_and_member_local.c
+++ b/offload/test/mapping/map_structptr_and_member_local.c
@@ -1,4 +1,9 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/mapping/ptr_and_obj_motion.c b/offload/test/mapping/ptr_and_obj_motion.c
index 8fa2c9865b4a..a94c07aadc1b 100644
--- a/offload/test/mapping/ptr_and_obj_motion.c
+++ b/offload/test/mapping/ptr_and_obj_motion.c
@@ -17,7 +17,7 @@ void init(double vertexx[]) {
 }
 
 void change(DV *dvptr) {
-#pragma omp target map(dvptr->dataptr[0 : 100])
+#pragma omp target map(dvptr->dataptr[0 : 100]) map(alloc : dvptr -> dataptr)
   {
     printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]);
     dvptr->dataptr[77] += 1.0;
diff --git a/offload/test/mapping/target_derefence_array_pointrs.cpp b/offload/test/mapping/target_derefence_array_pointrs.cpp
index a6dd4069a8f5..d213c8744363 100644
--- a/offload/test/mapping/target_derefence_array_pointrs.cpp
+++ b/offload/test/mapping/target_derefence_array_pointrs.cpp
@@ -18,23 +18,24 @@ void foo(int **t1d) {
 
   for (j = 0; j < 3; j++)
     (*t1d)[j] = 0;
-#pragma omp target map(tofrom : (*t1d)[0 : 3])
+#pragma omp target map(tofrom : (*t1d)[0 : 3]) map(alloc : *t1d)
   { (*t1d)[1] = 1; }
   // CHECK: 1
   printf("%d\n", (*t1d)[1]);
-#pragma omp target map(tofrom : (**t2d)[0 : 3])
+#pragma omp target map(tofrom : (**t2d)[0 : 3]) map(alloc : **t2d, *t2d)
   { (**t2d)[1] = 2; }
   // CHECK: 2
   printf("%d\n", (**t2d)[1]);
-#pragma omp target map(tofrom : (***t3d)[0 : 3])
+#pragma omp target map(tofrom : (***t3d)[0 : 3])                               \
+    map(alloc : ***t3d, **t3d, *t3d)
   { (***t3d)[1] = 3; }
   // CHECK: 3
   printf("%d\n", (***t3d)[1]);
-#pragma omp target map(tofrom : (**t1d))
+#pragma omp target map(tofrom : (**t1d)) map(alloc : *t1d)
   { (*t1d)[0] = 4; }
   // CHECK: 4
   printf("%d\n", (*t1d)[0]);
-#pragma omp target map(tofrom : (*(*(t1d + a) + b)))
+#pragma omp target map(tofrom : (*(*(t1d + a) + b))) map(to : *(t1d + a))
   { *(*(t1d + a) + b) = 5; }
   // CHECK: 5
   printf("%d\n", *(*(t1d + a) + b));
@@ -49,7 +50,7 @@ void bar() {
   for (int i = 0; i < 3; i++) {
     (**a)[1] = i;
   }
-#pragma omp target map((**a)[ : 3])
+#pragma omp target map((**a)[ : 3]) map(alloc : **a, *a)
   {
     (**a)[1] = 6;
     // CHECK: 6
@@ -73,7 +74,8 @@ void zoo(int **f, SSA *sa) {
   *(f + sa->i + 1) = t;
   *(sa->sa->i + *(f + sa->i + 1)) = 4;
   printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
-#pragma omp target map(sa, *(sa->sa->i + *(1 + sa->i + f)))
+#pragma omp target map(*(sa->sa->i + *(1 + sa->i + f))) map(alloc : sa->sa)    \
+    map(to : sa->i) map(to : sa->sa->i) map(to : *(1 + sa->i + f))
   { *(sa->sa->i + *(1 + sa->i + f)) = 7; }
   // CHECK: 7
   printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
@@ -87,13 +89,13 @@ void xoo() {
 
 void yoo(int **x) {
   *x = (int *)malloc(2 * sizeof(int));
-#pragma omp target map(**x)
+#pragma omp target map(**x) map(alloc : *x)
   {
     **x = 8;
     // CHECK: 8
     printf("%d\n", **x);
   }
-#pragma omp target map(*(*x + 1))
+#pragma omp target map(*(*x + 1)) map(alloc : *x)
   {
     *(*x + 1) = 9;
     // CHECK: 9
diff --git a/offload/test/mapping/target_has_device_addr.c b/offload/test/mapping/target_has_device_addr.c
index e8bfff868c7e..f238832c4405 100644
--- a/offload/test/mapping/target_has_device_addr.c
+++ b/offload/test/mapping/target_has_device_addr.c
@@ -66,8 +66,9 @@ void zoo() {
   short **xpp = &xp[0];
 
   x[1] = 111;
-#pragma omp target data map(tofrom : xpp[1][1]) use_device_addr(xpp[1][1])
-#pragma omp target has_device_addr(xpp[1][1])
+#pragma omp target data map(tofrom : xpp[1][1]) map(xpp[1])                    \
+    use_device_addr(xpp[1])
+#pragma omp target has_device_addr(xpp[1])
   {
     xpp[1][1] = 222;
     // CHECK: 222
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp
new file mode 100644
index 000000000000..3b1a8192bf2c
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp
@@ -0,0 +1,85 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5])
+    int *mapped_ptr_ph3 =
+        (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    int **mapped_ptr_paa02 =
+        (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa02 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa02 != mapped_ptr_paa02);
+
+// (A) use_device_addr operand within mapped address range.
+// CHECK: A: 1
+#pragma omp target data use_device_addr(ph[3 : 4])
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_addr operand in extended address range, but not
+// mapped address range.
+// CHECK: B: 1
+#pragma omp target data use_device_addr(ph[2])
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) use_device_addr/map: same base-array, different first-location.
+// CHECK: C: 1
+#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1])
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) use_device_addr/map: different base-array/pointers.
+// CHECK: D: 1
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) use_device_addr operand within mapped range of previous map.
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa[0])
+    printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (F) use_device_addr/map: different operands, same base-array.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2])
+    printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (G) use_device_addr/map: different base-array/pointers.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2])
+    printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp
new file mode 100644
index 000000000000..b9ebde431e7b
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp
@@ -0,0 +1,143 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+// (A) No corresponding map, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (B) use_device_addr/map: different operands, same base-pointer.
+// use_device_addr operand within mapped address range.
+// CHECK: B: 1 1 1
+#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1])
+    {
+      int *mapped_ptr_ph4 =
+          (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr,
+             mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4);
+    }
+
+// (C) use_device_addr/map: different base-pointers.
+// No corresponding storage, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (D) use_device_addr/map: one of two maps with matching base-pointer.
+// use_device_addr operand within mapped address range of second map,
+// lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding map, lookup should fail
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == (int **)nullptr + 2);
+    }
+
+// (F) use_device_addr/map: different operands, same base-array.
+// use_device_addr within mapped address range. Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+
+// (G) use_device_addr/map: different operands, same base-array.
+// use_device_addr extends beyond existing mapping. Not spec compliant.
+// But the lookup succeeds because we use the base-address for translation.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[0][4]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr(
+          original_paa02 + 2, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr,
+             mapped_ptr_paa04 != original_paa02 + 2,
+             &paa[0][4] == mapped_ptr_paa04);
+    }
+
+    int *original_paa020 = &paa[0][2][0];
+    int **original_paa0 = (int **)&paa[0];
+
+// (H) use_device_addr/map: different base-pointers.
+// No corresponding storage for use_device_addr opnd, lookup should fail.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa020 =
+          (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device());
+      int **mapped_ptr_paa0 =
+          (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr,
+             mapped_ptr_paa0 == nullptr, &paa[0] == nullptr);
+    }
+
+// (I) use_device_addr/map: one map with different, one with same base-ptr.
+// Lookup should succeed.
+// CHECK: I: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp
new file mode 100644
index 000000000000..e9a1124bc461
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp
@@ -0,0 +1,98 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section on a reference.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5])
+    int *mapped_ptr_ph3 =
+        (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    int **mapped_ptr_paa02 =
+        (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa02 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa02 != mapped_ptr_paa02);
+
+// (A) use_device_addr operand within mapped address range.
+// EXPECTED: A: 1
+// CHECK:    A: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[3 : 4])
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_addr operand in extended address range, but not
+// mapped address range.
+// EXPECTED: B: 1
+// CHECK:    B: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[2])
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) use_device_addr/map: same base-pointer, different first-location.
+// EXPECTED: C: 1
+// CHECK:    C: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1])
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) use_device_addr/map: different base-array/pointers.
+// EXPECTED: D: 1
+// CHECK:    D: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) use_device_addr operand within mapped range of previous map.
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa[0])
+    printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (F) use_device_addr/map: different operands, same base-array.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2])
+    printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (G) use_device_addr/map: different base-array/pointers.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2])
+    printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp
new file mode 100644
index 000000000000..0090cdb09536
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp
@@ -0,0 +1,158 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section on a reference.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+// (A) No corresponding map, lookup should fail.
+// EXPECTED: A: 1 1 1
+// CHECK:    A: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (B) use_device_addr/map: different operands, same base-pointer.
+// use_device_addr operand within mapped address range.
+// EXPECTED: B: 1 1 1
+// CHECK:    B: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1])
+    {
+      int *mapped_ptr_ph4 =
+          (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr,
+             mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4);
+    }
+
+// (C) use_device_addr/map: different base-pointers.
+// No corresponding storage, lookup should fail.
+// EXPECTED: C: 1 1 1
+// CHECK:    C: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (D) use_device_addr/map: one of two maps with matching base-pointer.
+// use_device_addr operand within mapped address range of second map,
+// lookup should succeed.
+// EXPECTED: D: 1 1 1
+// CHECK:    D: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding map, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == (int **)nullptr + 2);
+    }
+
+// (F) use_device_addr/map: different operands, same base-array.
+// use_device_addr within mapped address range. Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+
+// (G) use_device_addr/map: different operands, same base-array.
+// use_device_addr extends beyond existing mapping. Not spec compliant.
+// But the lookup succeeds because we use the base-address for translation.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[0][4]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr(
+          original_paa02 + 2, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr,
+             mapped_ptr_paa04 != original_paa02 + 2,
+             &paa[0][4] == mapped_ptr_paa04);
+    }
+
+    int *original_paa020 = &paa[0][2][0];
+    int **original_paa0 = (int **)&paa[0];
+
+// (H) use_device_addr/map: different base-pointers.
+// No corresponding storage for use_device_addr operand, lookup should fail.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa020 =
+          (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device());
+      int **mapped_ptr_paa0 =
+          (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr,
+             mapped_ptr_paa0 == nullptr, &paa[0] == nullptr);
+    }
+
+// (I) use_device_addr/map: one map with different, one with same base-ptr.
+// Lookup should succeed.
+// CHECK: I: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp
new file mode 100644
index 000000000000..883297f7e90c
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp
@@ -0,0 +1,93 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a variable (not a section).
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+#pragma omp target enter data map(to : g, h, ph, paa)
+    void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device());
+    void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device());
+    void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device());
+    void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device());
+
+    // CHECK-COUNT-8: 1
+    printf("%d\n", mapped_ptr_g != nullptr);
+    printf("%d\n", mapped_ptr_h != nullptr);
+    printf("%d\n", mapped_ptr_ph != nullptr);
+    printf("%d\n", mapped_ptr_paa != nullptr);
+    printf("%d\n", original_addr_g != mapped_ptr_g);
+    printf("%d\n", original_addr_h != mapped_ptr_h);
+    printf("%d\n", original_addr_ph != mapped_ptr_ph);
+    printf("%d\n", original_addr_paa != mapped_ptr_paa);
+
+// (A)
+// CHECK: A: 1
+#pragma omp target data use_device_addr(g)
+    printf("A: %d\n", mapped_ptr_g == &g);
+
+// (B)
+// CHECK: B: 1
+#pragma omp target data use_device_addr(h)
+    printf("B: %d\n", mapped_ptr_h == &h);
+
+// (C)
+// CHECK: C: 1
+#pragma omp target data use_device_addr(ph)
+    printf("C: %d\n", mapped_ptr_ph == &ph);
+
+// (D) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &ph, not &ph[0/1].
+// CHECK: D: 1
+#pragma omp target data map(ph[1 : 2]) use_device_addr(ph)
+    printf("D: %d\n", mapped_ptr_ph == &ph);
+
+// (E)
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa)
+    printf("E: %d\n", mapped_ptr_paa == &paa);
+
+// (F) use_device_addr/map with same base-array, paa.
+// Address translation should happen for &paa.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][2]) use_device_addr(paa)
+    printf("F: %d\n", mapped_ptr_paa == &paa);
+
+// (G) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &paa.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    printf("G: %d\n", mapped_ptr_paa == &paa);
+
+#pragma omp target exit data map(release : g, h, ph, paa)
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp
new file mode 100644
index 000000000000..79c6f69edba8
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp
@@ -0,0 +1,159 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a variable (not a section).
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_g == nullptr,
+             mapped_ptr_g != original_addr_g, (void *)&g == nullptr);
+    }
+
+// (B) Lookup should succeed.
+// CHECK: B: 1 1 1
+#pragma omp target data map(g) use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_g != nullptr,
+             mapped_ptr_g != original_addr_g, &g == mapped_ptr_g);
+    }
+
+// (C) No corresponding item, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_h == nullptr,
+             mapped_ptr_h != original_addr_h, (void *)&h == nullptr);
+    }
+
+// (D) Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(h) use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_h != nullptr,
+             mapped_ptr_h != original_addr_h, &h == mapped_ptr_h);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (F) Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (G) Maps pointee only, but use_device_addr operand is pointer.
+// Lookup should fail.
+// CHECK: G: 1 1 1
+#pragma omp target data map(ph[0 : 1]) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (H) Maps both pointee and pointer. Lookup for pointer should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (I) No corresponding item, lookup should fail.
+// CHECK: I: 1 1 1
+#pragma omp target data use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (J) Maps pointee only, but use_device_addr operand is the array itself.
+// Lookup should fail.
+// CHECK: J: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("J: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (K) Lookup should succeed.
+// CHECK: K: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("K: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+
+// (L) Maps both pointee and array. Lookup for the array should succeed.
+// CHECK: L: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("L: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp
new file mode 100644
index 000000000000..f018c65f36ec
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a reference variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+#pragma omp target enter data map(to : g, h, ph, paa)
+    void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device());
+    void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device());
+    void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device());
+    void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device());
+
+    // CHECK-COUNT-8: 1
+    printf("%d\n", mapped_ptr_g != nullptr);
+    printf("%d\n", mapped_ptr_h != nullptr);
+    printf("%d\n", mapped_ptr_ph != nullptr);
+    printf("%d\n", mapped_ptr_paa != nullptr);
+    printf("%d\n", original_addr_g != mapped_ptr_g);
+    printf("%d\n", original_addr_h != mapped_ptr_h);
+    printf("%d\n", original_addr_ph != mapped_ptr_ph);
+    printf("%d\n", original_addr_paa != mapped_ptr_paa);
+
+// (A)
+// CHECK: A: 1
+#pragma omp target data use_device_addr(g)
+    printf("A: %d\n", mapped_ptr_g == &g);
+
+// (B)
+// CHECK: B: 1
+#pragma omp target data use_device_addr(h)
+    printf("B: %d\n", mapped_ptr_h == &h);
+
+// (C)
+// CHECK: C: 1
+#pragma omp target data use_device_addr(ph)
+    printf("C: %d\n", mapped_ptr_ph == &ph);
+
+// (D) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &ph, not &ph[0/1].
+// CHECK: D: 1
+#pragma omp target data map(ph[1 : 2]) use_device_addr(ph)
+    printf("D: %d\n", mapped_ptr_ph == &ph);
+
+// (E)
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa)
+    printf("E: %d\n", mapped_ptr_paa == &paa);
+
+// (F) use_device_addr/map with same base-array, paa.
+// Address translation should happen for &paa.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][2]) use_device_addr(paa)
+    printf("F: %d\n", mapped_ptr_paa == &paa);
+
+// (G) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &paa.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    printf("G: %d\n", mapped_ptr_paa == &paa);
+
+#pragma omp target exit data map(release : g, h, ph, paa)
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp
new file mode 100644
index 000000000000..9360db419504
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp
@@ -0,0 +1,166 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a reference variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_g == nullptr,
+             mapped_ptr_g != original_addr_g, (void *)&g == nullptr);
+    }
+
+// (B) Lookup should succeed.
+// CHECK: B: 1 1 1
+#pragma omp target data map(g) use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_g != nullptr,
+             mapped_ptr_g != original_addr_g, &g == mapped_ptr_g);
+    }
+
+// (C) No corresponding item, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_h == nullptr,
+             mapped_ptr_h != original_addr_h, (void *)&h == nullptr);
+    }
+
+// (D) Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(h) use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_h != nullptr,
+             mapped_ptr_h != original_addr_h, &h == mapped_ptr_h);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (F) Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (G) Maps pointee only, but use_device_addr operand is pointer.
+// Lookup should fail.
+// CHECK: G: 1 1 1
+#pragma omp target data map(ph[0 : 1]) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (H) Maps both pointee and pointer. Lookup for pointer should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (I) No corresponding item, lookup should fail.
+// CHECK: I: 1 1 1
+#pragma omp target data use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (J) Maps pointee only, but use_device_addr operand is the array itself.
+// Lookup should fail.
+// CHECK: J: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("J: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (K) Lookup should succeed.
+// CHECK: K: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("K: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+
+// (L) Maps both pointee and array. Lookup for the array should succeed.
+// CHECK: L: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("L: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/target_use_device_addr.c b/offload/test/mapping/use_device_addr/target_use_device_addr.c
index 5c2bb8a48f6e..4a9dbe252f76 100644
--- a/offload/test/mapping/target_use_device_addr.c
+++ b/offload/test/mapping/use_device_addr/target_use_device_addr.c
@@ -12,7 +12,9 @@ int main() {
   printf("%d, %p\n", xp[1], &xp[1]);
 #pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x)
 #pragma omp target is_device_ptr(xp)
-  { xp[1] = 222; }
+  {
+    xp[1] = 222;
+  }
   // CHECK: 222
   printf("%d, %p\n", xp[1], &xp[1]);
 }
diff --git a/offload/test/mapping/target_wrong_use_device_addr.c b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c
index 7a5babd69253..28ec6857fa1a 100644
--- a/offload/test/mapping/target_wrong_use_device_addr.c
+++ b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c
@@ -14,7 +14,7 @@ int main() {
   // CHECK: host addr=0x[[#%x,HOST_ADDR:]]
   fprintf(stderr, "host addr=%p\n", x);
 
-#pragma omp target data map(to : x [0:10])
+#pragma omp target data map(to : x[0 : 10])
   {
 // CHECK: omptarget device 0 info: variable x does not have a valid device
 // counterpart
@@ -27,4 +27,3 @@ int main() {
 
   return 0;
 }
-
diff --git a/offload/test/mapping/array_section_use_device_ptr.c b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c
index 86e2875c35c4..4cfcce28c112 100644
--- a/offload/test/mapping/array_section_use_device_ptr.c
+++ b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c
@@ -20,7 +20,9 @@ int main() {
 
   float *A_dev = NULL;
 #pragma omp target data use_device_ptr(A)
-  { A_dev = A; }
+  {
+    A_dev = A;
+  }
 #pragma omp target exit data map(delete : A[FROM : LENGTH])
 
   // CHECK: Success
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp
new file mode 100644
index 000000000000..a7745de53298
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int h[10];
+int *ph = &h[0];
+
+struct S {
+  int (*paa)[10][10] = &aa;
+
+  void f1(int i) {
+    paa--;
+    void *original_ph3 = &ph[3];
+    void *original_paa102 = &paa[1][0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5])
+    void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    void *mapped_ptr_paa102 =
+        omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa102 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa102 != mapped_ptr_paa102);
+
+// (A) Mapped data is within extended address range. Lookup should succeed.
+// CHECK: A: 1
+#pragma omp target data use_device_ptr(ph)
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_ptr/map on pointer, and pointee already exists.
+// Lookup should succeed.
+// CHECK: B: 1
+#pragma omp target data map(ph) use_device_ptr(ph)
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: C: 1
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: D: 1
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) Mapped data is within extended address range.
+// Lookup should succeed.
+// CHECK: E: 1
+#pragma omp target data use_device_ptr(paa)
+    printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (F) use_device_ptr/map on pointer, and pointee already exists.
+// &paa[0] should be in extended address-range of the existing paa[1][...]
+// Lookup should succeed.
+// FIXME: However, it currently does not. Might need an RT fix.
+// EXPECTED: F: 1
+// CHECK:    F: 0
+#pragma omp target data map(paa) use_device_ptr(paa)
+    printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp
new file mode 100644
index 000000000000..fe3cdb56e4ba
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp
@@ -0,0 +1,125 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int h[10];
+int *ph = &h[0];
+
+struct S {
+  int (*paa)[10][10] = &aa;
+
+  void f1(int i) {
+    paa--;
+    void *original_addr_ph3 = &ph[3];
+    void *original_addr_paa102 = &paa[1][0][2];
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (B) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: B: 1 1 1
+#pragma omp target data map(ph) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: C: 1 1 1
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (F) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp
new file mode 100644
index 000000000000..66e65de4195a
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp
@@ -0,0 +1,111 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a reference variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int (*paa_ptee)[10][10] = &aa;
+
+int h[10];
+int *ph_ptee = &h[0];
+int *&ph = ph_ptee;
+
+struct S {
+  int (*&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa--;
+    void *original_ph3 = &ph[3];
+    void *original_paa102 = &paa[1][0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5])
+    void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    void *mapped_ptr_paa102 =
+        omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa102 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa102 != mapped_ptr_paa102);
+
+// (A) Mapped data is within extended address range. Lookup should succeed.
+// EXPECTED: A: 1
+// CHECK:    A: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_ptr(ph)
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_ptr/map on pointer, and pointee already exists.
+// Lookup should succeed.
+// EXPECTED: B: 1
+// CHECK:    B: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_ptr(ph)
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: C: 1
+// CHECK:    C: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: D: 1
+// CHECK:    D: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) Mapped data is within extended address range.
+// Lookup should succeed.
+// CHECK: E: 1
+#pragma omp target data use_device_ptr(paa)
+    printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (F) use_device_ptr/map on pointer, and pointee already exists.
+// &paa[0] should be in extended address-range of the existing paa[1][...]
+// Lookup should succeed.
+// FIXME: However, it currently does not. Might need an RT fix.
+// EXPECTED: F: 1
+// CHECK:    F: 0
+#pragma omp target data map(paa) use_device_ptr(paa)
+    printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp
new file mode 100644
index 000000000000..419ab3eb33d4
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp
@@ -0,0 +1,136 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a reference variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int (*paa_ptee)[10][10] = &aa;
+
+int h[10];
+int *ph_ptee = &h[0];
+int *&ph = ph_ptee;
+
+struct S {
+  int (*&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa--;
+    void *original_addr_ph3 = &ph[3];
+    void *original_addr_paa102 = &paa[1][0][2];
+
+// (A) No corresponding item, lookup should fail.
+// EXPECTED: A: 1 1 1
+// CHECK:    A: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (B) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// EXPECTED: B: 1 1 1
+// CHECK:    B: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: C: 1 1 1
+// CHECK:    C: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: D: 1 1 1
+// CHECK:    D: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (F) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 1f84a0e1288d..b2e1edf51e17 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -5,10 +5,10 @@
 // RUN: %t | %fcheck-generic
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
+// REQUIRES: gpu
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
 
 #include <stdio.h>
 
diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c
index 2f30b035afbb..ff7fa51aafc2 100644
--- a/offload/test/offloading/bug51781.c
+++ b/offload/test/offloading/bug51781.c
@@ -16,6 +16,7 @@
 // the generic state machine.
 //
 // RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \
+// RUN:   -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \
 // RUN:   -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
 // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
@@ -24,7 +25,9 @@
 // Repeat with reduction clause, which has managed to break the custom state
 // machine in the past.
 //
-// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt -DADD_REDUCTION \
+// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \
+// RUN:   -DADD_REDUCTION \
+// RUN:   -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \
 // RUN:   -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
 // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
diff --git a/offload/test/offloading/force-usm.cpp b/offload/test/offloading/force-usm.cpp
index a043ba47f54a..9988c3dc4e9e 100644
--- a/offload/test/offloading/force-usm.cpp
+++ b/offload/test/offloading/force-usm.cpp
@@ -48,7 +48,7 @@ int main(void) {
 
 // clang-format off
 // NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
-// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12
+// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12
 // NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
 // NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
 // NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
diff --git a/offload/test/offloading/fortran/declare-target-automap.f90 b/offload/test/offloading/fortran/declare-target-automap.f90
new file mode 100644
index 000000000000..b44c0b281527
--- /dev/null
+++ b/offload/test/offloading/fortran/declare-target-automap.f90
@@ -0,0 +1,40 @@
+!Offloading test for AUTOMAP modifier in declare target enter
+! REQUIRES: flang, amdgpu
+
+! FIXME: https://github.com/llvm/llvm-project/issues/161265
+! XFAIL: amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program automap_program
+   use iso_c_binding, only: c_loc
+   use omp_lib, only: omp_get_default_device, omp_target_is_present
+   integer, parameter :: N = 10
+   integer :: i
+   integer, allocatable, target :: automap_array(:)
+   !$omp declare target enter(automap:automap_array)
+
+   ! false since the storage is not present even though the descriptor is present
+   write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device())
+   ! CHECK: 0
+
+   allocate (automap_array(N))
+   ! true since the storage should be allocated and reference count incremented by the allocate
+   write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device())
+   ! CHECK: 1
+
+   ! since storage is present this should not be a runtime error
+   !$omp target teams loop
+   do i = 1, N
+      automap_array(i) = i
+   end do
+
+   !$omp target update from(automap_array)
+   write (*, *) automap_array
+   ! CHECK: 1 2 3 4 5 6 7 8 9 10
+
+   deallocate (automap_array)
+
+   ! automap_array should have its storage unmapped on device here
+   write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device())
+   ! CHECK: 0
+end program
diff --git a/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90 b/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90
new file mode 100644
index 000000000000..45a18b7f38ed
--- /dev/null
+++ b/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90
@@ -0,0 +1,101 @@
+! This test doesn't expect any results, the pass condition is running to completion
+! without any memory access errors on device or mapping issues from descriptor
+! collisions due to local descriptors being placed on device and not being unmapped
+! before a subsequent local descriptor residing at the same address is mapped to
+! device.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module test
+contains
+    subroutine kernel_1d(array)
+        implicit none
+        real, dimension(:) :: array
+        integer :: i
+
+        !$omp target enter data map(alloc:array)
+        !$omp target teams distribute parallel do
+        do i=1, ubound(array, 1)
+            array(i) = 42.0
+        end do
+        !$omp target update from(array)
+    end subroutine
+
+    subroutine kernel_2d(array)
+        implicit none
+        real, dimension(:,:) :: array
+        integer :: i, j
+
+        !$omp target enter data map(alloc:array)
+        !$omp target teams distribute parallel do collapse(2)
+        do j=1, ubound(array, 2)
+            do i=1, ubound(array, 1)
+                array(i,j) = 42.0
+            end do
+        end do
+        !$omp target update from(array)
+    end subroutine
+
+    subroutine kernel_3d(array)
+        implicit none
+        real, dimension(:,:,:) :: array
+        integer :: i, j, k
+
+        !$omp target enter data map(alloc:array)
+        !$omp target teams distribute parallel do collapse(3)
+        do k=1, ubound(array, 3)
+            do j=1, ubound(array, 2)
+                do i=1, ubound(array, 1)
+                    array(i,j,k) = 42.0
+                end do
+            end do
+        end do
+        !$omp target update from(array)
+    end subroutine
+
+    subroutine kernel_4d(array)
+        implicit none
+        real, dimension(:,:,:,:) :: array
+        integer :: i, j, k, l
+
+        !$omp target enter data map(alloc:array)
+        !$omp target teams distribute parallel do collapse(4)
+        do l=1, ubound(array, 4)
+            do k=1, ubound(array, 3)
+                do j=1, ubound(array, 2)
+                    do i=1, ubound(array, 1)
+                        array(i,j,k,l) = 42.0
+                    end do
+                end do
+            end do
+        enddo
+        !$omp target update from(array)
+    end subroutine
+end module
+
+program main
+    use test
+    implicit none
+    integer, parameter :: n = 2
+    real :: array1(n)
+    real :: array2(n,n)
+    real :: array3(n,n,n)
+    real :: array4(n,n,n,n)
+
+    call kernel_1d(array1)
+    call kernel_2d(array2)
+    call kernel_3d(array3)
+    call kernel_4d(array4)
+
+    print *, array1
+    print *, array2
+    print *, array3
+    print *, array4
+    print *, "PASS"
+end program
+
+! CHECK: 42. 42.
+! CHECK: 42. 42. 42. 42.
+! CHECK: 42. 42. 42. 42. 42. 42. 42. 42.
+! CHECK: 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42.
+! CHECK: PASS
diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90
new file mode 100644
index 000000000000..c6f576acb90b
--- /dev/null
+++ b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90
@@ -0,0 +1,53 @@
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+module saxpymod
+   use iso_fortran_env
+   public :: saxpy
+contains
+
+subroutine saxpy(a, x, y, n, m)
+   use iso_fortran_env
+   implicit none
+   integer,intent(in) :: n, m
+   real(kind=real32),intent(in) :: a
+   real(kind=real32), dimension(:,:),intent(in) :: x
+   real(kind=real32), dimension(:,:),intent(inout) :: y
+   integer :: i, j
+
+   do concurrent(i=1:n, j=1:m)
+       y(i,j) = a * x(i,j) + y(i,j)
+   end do
+
+   write(*,*) "plausibility check:"
+   write(*,'("y(1,1) ",f8.6)') y(1,1)
+   write(*,'("y(n,m) ",f8.6)') y(n,m)
+end subroutine saxpy
+
+end module saxpymod
+
+program main
+   use iso_fortran_env
+   use saxpymod, ONLY:saxpy
+   implicit none
+
+   integer,parameter :: n = 1000, m=10000
+   real(kind=real32), allocatable, dimension(:,:) :: x, y
+   real(kind=real32) :: a
+   integer :: i
+
+   allocate(x(1:n,1:m), y(1:n,1:m))
+   a = 2.0_real32
+   x(:,:) = 1.0_real32
+   y(:,:) = 2.0_real32
+
+   call saxpy(a, x, y, n, m)
+
+   deallocate(x,y)
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  plausibility check:
+! CHECK:  y(1,1) 4.0
+! CHECK:  y(n,m) 4.0
diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90
new file mode 100644
index 000000000000..e094a1d7459e
--- /dev/null
+++ b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90
@@ -0,0 +1,53 @@
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+module saxpymod
+   use iso_fortran_env
+   public :: saxpy
+contains
+
+subroutine saxpy(a, x, y, n)
+   use iso_fortran_env
+   implicit none
+   integer,intent(in) :: n
+   real(kind=real32),intent(in) :: a
+   real(kind=real32), dimension(:),intent(in) :: x
+   real(kind=real32), dimension(:),intent(inout) :: y
+   integer :: i
+
+   do concurrent(i=1:n)
+       y(i) = a * x(i) + y(i)
+   end do
+
+   write(*,*) "plausibility check:"
+   write(*,'("y(1) ",f8.6)') y(1)
+   write(*,'("y(n) ",f8.6)') y(n)
+end subroutine saxpy
+
+end module saxpymod
+
+program main
+   use iso_fortran_env
+   use saxpymod, ONLY:saxpy
+   implicit none
+
+   integer,parameter :: n = 10000000
+   real(kind=real32), allocatable, dimension(:) :: x, y
+   real(kind=real32) :: a
+   integer :: i
+
+   allocate(x(1:n), y(1:n))
+   a = 2.0_real32
+   x(:) = 1.0_real32
+   y(:) = 2.0_real32
+
+   call saxpy(a, x, y, n)
+
+   deallocate(x,y)
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  plausibility check:
+! CHECK:  y(1) 4.0
+! CHECK:  y(n) 4.0
diff --git a/offload/test/offloading/fortran/dtype-char-array-map-2.f90 b/offload/test/offloading/fortran/dtype-char-array-map-2.f90
new file mode 100644
index 000000000000..f17ea9e53853
--- /dev/null
+++ b/offload/test/offloading/fortran/dtype-char-array-map-2.f90
@@ -0,0 +1,25 @@
+! Offloading test that verifies certain types of character string arrays
+! map to and from device without problem.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  implicit none
+  type char_t
+    CHARACTER(LEN=16), dimension(10,10) :: char_arr
+  end type char_t
+  type(char_t) :: dtype_char
+
+!$omp target enter data map(alloc:dtype_char%char_arr)
+
+!$omp target
+    dtype_char%char_arr(2,2) = 'c'
+!$omp end target
+
+!$omp target update from(dtype_char%char_arr)
+
+
+ print *, dtype_char%char_arr(2,2)
+end program
+
+!CHECK: c
diff --git a/offload/test/offloading/fortran/dtype-char-array-map.f90 b/offload/test/offloading/fortran/dtype-char-array-map.f90
new file mode 100644
index 000000000000..6b72c9e95101
--- /dev/null
+++ b/offload/test/offloading/fortran/dtype-char-array-map.f90
@@ -0,0 +1,27 @@
+! Offloading test that verifies certain types of character string arrays
+! (in this case allocatable) map to and from device without problem.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  implicit none
+  type char_t
+    CHARACTER(LEN=16), dimension(:,:), allocatable :: char_arr
+  end type char_t
+  type(char_t) :: dtype_char
+
+  allocate(dtype_char%char_arr(10,10))
+
+!$omp target enter data map(alloc:dtype_char%char_arr)
+
+!$omp target
+    dtype_char%char_arr(2,2) = 'c'
+!$omp end target
+
+!$omp target update from(dtype_char%char_arr)
+
+
+ print *, dtype_char%char_arr(2,2)
+end program
+
+!CHECK: c
diff --git a/offload/test/offloading/fortran/target-declare-mapper-allocatable.f90 b/offload/test/offloading/fortran/target-declare-mapper-allocatable.f90
new file mode 100644
index 000000000000..d8d5e1b5631a
--- /dev/null
+++ b/offload/test/offloading/fortran/target-declare-mapper-allocatable.f90
@@ -0,0 +1,48 @@
+! This test validates that declare mapper for a derived type with an
+! allocatable component preserves TO/FROM semantics for the component,
+! ensuring the payload is copied back to the host on target exit.
+
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+program target_declare_mapper_allocatable
+  implicit none
+
+  type :: real_t
+    real, allocatable :: real_arr(:)
+  end type real_t
+
+  ! Map the allocatable array payload via a named mapper.
+  !$omp declare mapper (xyz : real_t :: t) map(tofrom: t%real_arr)
+
+  type(real_t) :: r
+  integer :: i
+  logical :: ok
+
+  allocate(r%real_arr(10))
+  r%real_arr = 1.0
+
+  !$omp target map(mapper(xyz), tofrom: r)
+    do i = 1, size(r%real_arr)
+      r%real_arr(i) = 3.0
+    end do
+  !$omp end target
+
+  ok = .true.
+  do i = 1, size(r%real_arr)
+    if (r%real_arr(i) /= 3.0) ok = .false.
+  end do
+  if (ok) then
+    print *, "Test passed!"
+  else
+    print *, "Test failed!"
+    do i = 1, size(r%real_arr)
+      print *, r%real_arr(i)
+    end do
+  end if
+
+  deallocate(r%real_arr)
+end program target_declare_mapper_allocatable
+
+! CHECK: Test passed!
diff --git a/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 b/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90
new file mode 100644
index 000000000000..65e04af66e02
--- /dev/null
+++ b/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90
@@ -0,0 +1,43 @@
+! This test validates that declare mapper for a derived type that extends
+! a parent type with an allocatable component correctly maps the nested
+! allocatable payload via the mapper when the whole object is mapped on
+! target.
+
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+program target_declare_mapper_parent_allocatable
+  implicit none
+
+  type, abstract :: base_t
+    real, allocatable :: base_arr(:)
+  end type base_t
+
+  type, extends(base_t) :: real_t
+    real, allocatable :: real_arr(:)
+  end type real_t
+  !$omp declare mapper(custommapper: real_t :: t) map(t%base_arr, t%real_arr)
+
+  type(real_t) :: r
+  integer :: i
+  allocate(r%base_arr(10), source=1.0)
+  allocate(r%real_arr(10), source=1.0)
+
+  !$omp target map(mapper(custommapper), tofrom: r)
+  do i = 1, size(r%base_arr)
+    r%base_arr(i) = 2.0
+    r%real_arr(i) = 3.0
+    r%real_arr(i) = r%base_arr(1)
+  end do
+  !$omp end target
+
+
+  !CHECK: base_arr:  2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
+  print*, "base_arr: ", r%base_arr
+  !CHECK: real_arr:  2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
+  print*, "real_arr: ", r%real_arr
+
+  deallocate(r%real_arr)
+  deallocate(r%base_arr)
+end program target_declare_mapper_parent_allocatable
diff --git a/offload/test/offloading/fortran/target-no-loop.f90 b/offload/test/offloading/fortran/target-no-loop.f90
new file mode 100644
index 000000000000..3c88b00a5354
--- /dev/null
+++ b/offload/test/offloading/fortran/target-no-loop.f90
@@ -0,0 +1,97 @@
+! REQUIRES: flang
+! REQUIRES: gpu
+
+! RUN: %libomptarget-compile-fortran-generic -O3  -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription
+! RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+function check_errors(array) result (errors)
+   integer, intent(in) :: array(1024)
+   integer :: errors
+   integer :: i
+   errors = 0
+   do i = 1, 1024
+      if ( array( i) .ne. (i) ) then
+         errors = errors + 1
+      end if
+   end do
+end function
+
+program main
+   use omp_lib
+   implicit none
+   integer :: i,j,red
+   integer :: array(1024), errors = 0
+   array = 1
+
+   ! No-loop kernel
+   !$omp target teams distribute parallel do
+   do i = 1, 1024
+      array(i) = i
+   end do
+   errors = errors + check_errors(array)
+
+   ! SPMD kernel (num_teams clause blocks promotion to no-loop)
+   array = 1
+   !$omp target teams distribute parallel do num_teams(3)
+   do i = 1, 1024
+      array(i) = i
+   end do
+
+   errors = errors + check_errors(array)
+
+   ! No-loop kernel
+   array = 1
+   !$omp target teams distribute parallel do num_threads(64)
+   do i = 1, 1024
+      array(i) = i
+    end do
+
+   errors = errors + check_errors(array)
+
+   ! SPMD kernel
+   array = 1
+   !$omp target parallel do
+   do i = 1, 1024
+      array(i) = i
+   end do
+
+   errors = errors + check_errors(array)
+
+   ! Generic kernel
+   array = 1
+   !$omp target teams distribute
+   do i = 1, 1024
+      array(i) = i
+   end do
+
+   errors = errors + check_errors(array)
+
+   ! SPMD kernel (reduction clause blocks promotion to no-loop)
+   array = 1
+   red =0
+   !$omp target teams distribute parallel do reduction(+:red)
+   do i = 1, 1024
+      red = red + array(i)
+   end do
+
+   if (red .ne. 1024) then
+      errors = errors + 1
+   end if
+
+   print *,"number of errors: ", errors
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode
+! CHECK:  info: #Args: 3 Teams x Thrds:   64x  16
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
+! CHECK:  info: #Args: 3 Teams x Thrds:   3x  16 {{.*}}
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode
+! CHECK:  info: #Args: 3 Teams x Thrds:   64x  16 {{.*}}
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
+! CHECK:  info: #Args: 3 Teams x Thrds:   1x  16
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} Generic mode
+! CHECK:  info: #Args: 3 Teams x Thrds:   16x  16 {{.*}}
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode
+! CHECK:  info: #Args: 4 Teams x Thrds:   16x  16 {{.*}}
+! CHECK:  number of errors: 0
+
diff --git a/offload/test/offloading/interop.c b/offload/test/offloading/interop.c
index 26287e3ec533..d9fa2ef883b9 100644
--- a/offload/test/offloading/interop.c
+++ b/offload/test/offloading/interop.c
@@ -1,5 +1,6 @@
 // RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: nvptx64-nvidia-cuda
+
+// XFAIL: *
 
 #include <assert.h>
 #include <omp.h>
diff --git a/offload/test/offloading/mandatory_but_no_devices.c b/offload/test/offloading/mandatory_but_no_devices.c
index ecdee72acad0..df8a5f3b9278 100644
--- a/offload/test/offloading/mandatory_but_no_devices.c
+++ b/offload/test/offloading/mandatory_but_no_devices.c
@@ -3,6 +3,47 @@
 // device.  This behavior is proposed for OpenMP 5.2 in OpenMP spec github
 // issue 2669.
 
+// AMD Tests
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR=target
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target teams'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target data map(X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN:   -DDIR='target enter data map(to:X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN:   -DDIR='target exit data map(from:X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN:   -DDIR='target update to(X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN:   -DDIR='target update from(X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN:   %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa
+
+// Nvidia Tests
 // RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR=target
 // RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
 // RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
@@ -42,8 +83,6 @@
 // RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
 // RUN:   %fcheck-nvptx64-nvidia-cuda
 
-// REQUIRES: nvptx64-nvidia-cuda
-
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/offloading/memory_manager.cpp b/offload/test/offloading/memory_manager.cpp
index fba1e4a54012..d6d8697fcdec 100644
--- a/offload/test/offloading/memory_manager.cpp
+++ b/offload/test/offloading/memory_manager.cpp
@@ -1,7 +1,5 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
-// REQUIRES: nvidiagpu
-
 #include <omp.h>
 
 #include <cassert>
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_1.c b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
index 8ee6b51fb681..a007521a5c74 100644
--- a/offload/test/offloading/single_threaded_for_barrier_hang_1.c
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
@@ -1,6 +1,9 @@
 // RUN: %libomptarget-compile-run-and-check-generic
 // RUN: %libomptarget-compileopt-run-and-check-generic
 
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_2.c b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
index a98abd6922da..cabd2ed3dde7 100644
--- a/offload/test/offloading/single_threaded_for_barrier_hang_2.c
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
@@ -1,6 +1,7 @@
 // RUN: %libomptarget-compile-run-and-check-generic
-// FIXME: This fails with optimization enabled and prints b: 0
-// FIXME: RUN: %libomptarget-compileopt-run-and-check-generic
+
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/offloading/spmdization.c b/offload/test/offloading/spmdization.c
index 7f3f47d9ef32..48627cd7dae1 100644
--- a/offload/test/offloading/spmdization.c
+++ b/offload/test/offloading/spmdization.c
@@ -2,7 +2,8 @@
 // RUN: %libomptarget-compileopt-generic
 // RUN: env LIBOMPTARGET_INFO=16 \
 // RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,SPMD
-// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization
+// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization \
+// RUN:   -Xoffload-linker -mllvm=--openmp-opt-disable-spmdization
 // RUN: env LIBOMPTARGET_INFO=16 \
 // RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,GENERIC
 // clang-format on
diff --git a/offload/test/offloading/strided_multiple_update.c b/offload/test/offloading/strided_multiple_update.c
new file mode 100644
index 000000000000..a3e8d10863ae
--- /dev/null
+++ b/offload/test/offloading/strided_multiple_update.c
@@ -0,0 +1,62 @@
+// This test checks that #pragma omp target update from(data1[0:3:4],
+// data2[0:2:5]) correctly updates disjoint strided sections of multiple arrays
+// from the device to the host.
+
+// RUN: %libomptarget-compile-run-and-check-generic
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int len = 12;
+  double data1[len], data2[len];
+
+// Initial values
+#pragma omp target map(tofrom : data1[0 : len], data2[0 : len])
+  {
+    for (int i = 0; i < len; i++) {
+      data1[i] = i;
+      data2[i] = i * 10;
+    }
+  }
+
+  printf("original host array values:\n");
+  printf("data1: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data1[i]);
+  printf("\ndata2: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data2[i]);
+  printf("\n\n");
+
+#pragma omp target data map(to : data1[0 : len], data2[0 : len])
+  {
+// Modify arrays on device
+#pragma omp target
+    {
+      for (int i = 0; i < len; i++)
+        data1[i] += i;
+      for (int i = 0; i < len; i++)
+        data2[i] += 100;
+    }
+
+// data1[0:3:4]  // indices 0,4,8
+// data2[0:2:5]  // indices 0,5
+#pragma omp target update from(data1[0 : 3 : 4], data2[0 : 2 : 5])
+  }
+
+  printf("device array values after update from:\n");
+  printf("data1: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data1[i]);
+  printf("\ndata2: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data2[i]);
+  printf("\n\n");
+
+  // CHECK: data1: 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0
+  // CHECK: data2: 0.0 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 100.0 110.0
+
+  // CHECK: data1: 0.0 1.0 2.0 3.0 8.0 5.0 6.0 7.0 16.0 9.0 10.0 11.0
+  // CHECK: data2: 100.0 10.0 20.0 30.0 40.0 150.0 60.0 70.0 80.0 90.0 100.0
+  // CHECK-SAME: 110.0
+}
diff --git a/offload/test/offloading/strided_partial_update.c b/offload/test/offloading/strided_partial_update.c
new file mode 100644
index 000000000000..15d477f2b9b7
--- /dev/null
+++ b/offload/test/offloading/strided_partial_update.c
@@ -0,0 +1,63 @@
+// This test checks that #pragma omp target update from(data[0:4:3]) correctly
+// updates every third element (stride 3) from the device to the host, partially
+// across the array
+
+// RUN: %libomptarget-compile-run-and-check-generic
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int len = 11;
+  double data[len];
+
+#pragma omp target map(tofrom : data[0 : len])
+  {
+    for (int i = 0; i < len; i++)
+      data[i] = i;
+  }
+
+  // Initial values
+  printf("original host array values:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+#pragma omp target data map(to : data[0 : len])
+  {
+// Modify arrays on device
+#pragma omp target
+    for (int i = 0; i < len; i++)
+      data[i] += i;
+
+#pragma omp target update from(data[0 : 4 : 3]) // indices 0,3,6,9
+  }
+
+  printf("device array values after update from:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+  // CHECK: 0.000000
+  // CHECK: 1.000000
+  // CHECK: 2.000000
+  // CHECK: 3.000000
+  // CHECK: 4.000000
+  // CHECK: 5.000000
+  // CHECK: 6.000000
+  // CHECK: 7.000000
+  // CHECK: 8.000000
+  // CHECK: 9.000000
+  // CHECK: 10.000000
+
+  // CHECK: 0.000000
+  // CHECK: 1.000000
+  // CHECK: 2.000000
+  // CHECK: 6.000000
+  // CHECK: 4.000000
+  // CHECK: 5.000000
+  // CHECK: 12.000000
+  // CHECK: 7.000000
+  // CHECK: 8.000000
+  // CHECK: 18.000000
+  // CHECK: 10.000000
+}
diff --git a/offload/test/offloading/strided_update.c b/offload/test/offloading/strided_update.c
new file mode 100644
index 000000000000..fe875b7fd55c
--- /dev/null
+++ b/offload/test/offloading/strided_update.c
@@ -0,0 +1,54 @@
+// This test checks that "update from" clause in OpenMP is supported when the
+// elements are updated in a non-contiguous manner. This test checks that
+// #pragma omp target update from(data[0:4:2]) correctly updates only every
+// other element (stride 2) from the device to the host
+
+// RUN: %libomptarget-compile-run-and-check-generic
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int len = 8;
+  double data[len];
+#pragma omp target map(tofrom : len, data[0 : len])
+  {
+    for (int i = 0; i < len; i++) {
+      data[i] = i;
+    }
+  }
+  // Initial values
+  printf("original host array values:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+#pragma omp target data map(to : len, data[0 : len])
+  {
+// Modify arrays on device
+#pragma omp target
+    for (int i = 0; i < len; i++) {
+      data[i] += i;
+    }
+
+#pragma omp target update from(data[0 : 4 : 2])
+  }
+  // CHECK: 0.000000
+  // CHECK: 1.000000
+  // CHECK: 4.000000
+  // CHECK: 3.000000
+  // CHECK: 8.000000
+  // CHECK: 5.000000
+  // CHECK: 12.000000
+  // CHECK: 7.000000
+  // CHECK-NOT: 2.000000
+  // CHECK-NOT: 6.000000
+  // CHECK-NOT: 10.000000
+  // CHECK-NOT: 14.000000
+
+  printf("from target array results:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+  return 0;
+}
diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c
index bdd028352e40..b30ce12ef1ea 100644
--- a/offload/test/sanitizer/ptr_outside_alloc_1.c
+++ b/offload/test/sanitizer/ptr_outside_alloc_1.c
@@ -5,12 +5,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 #include <omp.h>
 
diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c
index 6a67962f9eb3..3bb8bdaca8b4 100644
--- a/offload/test/sanitizer/ptr_outside_alloc_2.c
+++ b/offload/test/sanitizer/ptr_outside_alloc_2.c
@@ -3,12 +3,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 #include <omp.h>
 
diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c
index c4783c5c36df..acc1de373f9e 100644
--- a/offload/test/sanitizer/use_after_free_1.c
+++ b/offload/test/sanitizer/use_after_free_1.c
@@ -5,12 +5,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 #include <omp.h>
 
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index 1c1e09744a75..3d70fb7b3a3f 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -3,12 +3,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 // If offload memory pooling is enabled for a large allocation, reuse error is
 // not detected. UNSUPPORTED: large_allocation_memory_pool
diff --git a/offload/test/tools/llvm-omp-device-info.c b/offload/test/tools/llvm-omp-device-info.c
index 6f497309df2f..1ce8d4ac07f6 100644
--- a/offload/test/tools/llvm-omp-device-info.c
+++ b/offload/test/tools/llvm-omp-device-info.c
@@ -2,5 +2,5 @@
 //
 // Just check any device was found and something is printed
 //
-// CHECK: Found {{[1-9].*}} devices:
-// CHECK: Device 0:
+// CHECK: Num Devices: {{[1-9].*}}
+// CHECK: [{{[1-9A-Za-z].*}}]
diff --git a/offload/test/tools/offload-tblgen/default_returns.td b/offload/test/tools/offload-tblgen/default_returns.td
index e919492cc5bf..41949db7226a 100644
--- a/offload/test/tools/offload-tblgen/default_returns.td
+++ b/offload/test/tools/offload-tblgen/default_returns.td
@@ -6,13 +6,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "ol_foo_handle_t";
+def ol_foo_handle_t : Handle {
     let desc = "Example handle type";
 }
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td
index c66d5b488b46..94ea820d453e 100644
--- a/offload/test/tools/offload-tblgen/entry_points.td
+++ b/offload/test/tools/offload-tblgen/entry_points.td
@@ -4,8 +4,7 @@
 
 include "APIDefs.td"
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_basic.td b/offload/test/tools/offload-tblgen/functions_basic.td
index dec93577b57e..2802c78a2947 100644
--- a/offload/test/tools/offload-tblgen/functions_basic.td
+++ b/offload/test/tools/offload-tblgen/functions_basic.td
@@ -6,8 +6,7 @@
 
 include "APIDefs.td"
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_code_loc.td b/offload/test/tools/offload-tblgen/functions_code_loc.td
index aec20129343f..8d7aa00c5f15 100644
--- a/offload/test/tools/offload-tblgen/functions_code_loc.td
+++ b/offload/test/tools/offload-tblgen/functions_code_loc.td
@@ -7,8 +7,7 @@
 
 include "APIDefs.td"
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_ranged_param.td b/offload/test/tools/offload-tblgen/functions_ranged_param.td
index d0996b231973..1ce8b394b157 100644
--- a/offload/test/tools/offload-tblgen/functions_ranged_param.td
+++ b/offload/test/tools/offload-tblgen/functions_ranged_param.td
@@ -8,13 +8,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "some_handle_t";
+def some_handle_t : Handle {
     let desc = "An example handle type";
 }
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
   let params = [
diff --git a/offload/test/tools/offload-tblgen/print_enum.td b/offload/test/tools/offload-tblgen/print_enum.td
index 97f869689293..c7573a9a415c 100644
--- a/offload/test/tools/offload-tblgen/print_enum.td
+++ b/offload/test/tools/offload-tblgen/print_enum.td
@@ -4,8 +4,7 @@
 
 include "APIDefs.td"
 
-def : Enum {
-  let name = "my_enum_t";
+def my_enum_t : Enum {
   let desc = "An example enum";
   let etors =[
     Etor<"VALUE_ONE", "The first enum value">,
diff --git a/offload/test/tools/offload-tblgen/print_function.td b/offload/test/tools/offload-tblgen/print_function.td
index ce1fe4c52760..74b39f145a40 100644
--- a/offload/test/tools/offload-tblgen/print_function.td
+++ b/offload/test/tools/offload-tblgen/print_function.td
@@ -5,13 +5,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "ol_foo_handle_t";
+def ol_foo_handle_t : Handle {
     let desc = "Example handle type";
 }
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/type_tagged_enum.td b/offload/test/tools/offload-tblgen/type_tagged_enum.td
index 95964e32f0c9..b32531aac9c8 100644
--- a/offload/test/tools/offload-tblgen/type_tagged_enum.td
+++ b/offload/test/tools/offload-tblgen/type_tagged_enum.td
@@ -9,13 +9,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "some_handle_t";
+def some_handle_t: Handle {
     let desc = "An example handle type";
 }
 
-def : Enum {
-  let name = "my_type_tagged_enum_t";
+def my_type_tagged_enum_t : Enum {
   let desc = "Example type tagged enum";
   let is_typed = 1;
   let etors = [
@@ -34,8 +32,7 @@ def : Enum {
 // CHECK-API-NEXT: [some_handle_t] Value three.
 // CHECK-API-NEXT: MY_TYPE_TAGGED_ENUM_VALUE_THREE = 2,
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
   let params = [
diff --git a/offload/tools/deviceinfo/CMakeLists.txt b/offload/tools/deviceinfo/CMakeLists.txt
index 3787c12f940a..cc2d0a6add8b 100644
--- a/offload/tools/deviceinfo/CMakeLists.txt
+++ b/offload/tools/deviceinfo/CMakeLists.txt
@@ -4,10 +4,6 @@ add_openmp_tool(llvm-offload-device-info llvm-offload-device-info.cpp)
 
 llvm_update_compile_flags(llvm-offload-device-info)
 
-target_include_directories(llvm-offload-device-info PRIVATE
-  ${LIBOMPTARGET_INCLUDE_DIR}
-)
 target_link_libraries(llvm-offload-device-info PRIVATE
-  omp
-  omptarget
+  LLVMOffload
 )
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 2228fbf3ec17..9b58d67f017c 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -1,4 +1,4 @@
-//===- llvm-offload-device-info.cpp - Device info as seen by LLVM/Offload -===//
+//===- llvm-offload-device-info.cpp - Print liboffload properties ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,27 +6,272 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a command line utility that, by using LLVM/Offload, and the device
-// plugins, list devices information as seen by the runtime.
+// This is a command line utility that, by using the new liboffload API, prints
+// all devices and properties
 //
 //===----------------------------------------------------------------------===//
 
-#include "omptarget.h"
-#include <cstdio>
+#include <OffloadAPI.h>
+#include <iostream>
+#include <vector>
 
-int main(int argc, char **argv) {
-  __tgt_bin_desc EmptyDesc = {0, nullptr, nullptr, nullptr};
-  __tgt_register_lib(&EmptyDesc);
-  __tgt_init_all_rtls();
+#define OFFLOAD_ERR(X)                                                         \
+  if (auto Err = X) {                                                          \
+    return Err;                                                                \
+  }
+
+enum class PrintKind {
+  NORMAL,
+  FP_FLAGS,
+};
+
+template <typename T, PrintKind PK = PrintKind::NORMAL>
+void doWrite(std::ostream &S, T &&Val) {
+  S << Val;
+}
+
+template <>
+void doWrite<ol_platform_backend_t>(std::ostream &S,
+                                    ol_platform_backend_t &&Val) {
+  switch (Val) {
+  case OL_PLATFORM_BACKEND_UNKNOWN:
+    S << "UNKNOWN";
+    break;
+  case OL_PLATFORM_BACKEND_CUDA:
+    S << "CUDA";
+    break;
+  case OL_PLATFORM_BACKEND_AMDGPU:
+    S << "AMDGPU";
+    break;
+  case OL_PLATFORM_BACKEND_HOST:
+    S << "HOST";
+    break;
+  default:
+    S << "<< INVALID >>";
+    break;
+  }
+}
+template <>
+void doWrite<ol_device_type_t>(std::ostream &S, ol_device_type_t &&Val) {
+  switch (Val) {
+  case OL_DEVICE_TYPE_GPU:
+    S << "GPU";
+    break;
+  case OL_DEVICE_TYPE_CPU:
+    S << "CPU";
+    break;
+  case OL_DEVICE_TYPE_HOST:
+    S << "HOST";
+    break;
+  default:
+    S << "<< INVALID >>";
+    break;
+  }
+}
+template <>
+void doWrite<ol_dimensions_t>(std::ostream &S, ol_dimensions_t &&Val) {
+  S << "{x: " << Val.x << ", y: " << Val.y << ", z: " << Val.z << "}";
+}
+template <>
+void doWrite<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+    std::ostream &S, ol_device_fp_capability_flags_t &&Val) {
+  S << Val << " {";
+  // After the raw value, list the name of every capability bit set in Val.
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT) {
+    S << " CORRECTLY_ROUNDED_DIVIDE_SQRT";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST) {
+    S << " ROUND_TO_NEAREST";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO) {
+    S << " ROUND_TO_ZERO";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF) {
+    S << " ROUND_TO_INF";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN) {
+    S << " INF_NAN";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_DENORM) {
+    S << " DENORM";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_FMA) {
+    S << " FMA";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_SOFT_FLOAT) {
+    S << " SOFT_FLOAT";
+  }
+
+  S << " }";
+}
 
-  printf("Found %d devices:\n", omp_get_num_devices());
-  for (int Dev = 0; Dev < omp_get_num_devices(); Dev++) {
-    printf("  Device %d:\n", Dev);
-    if (!__tgt_print_device_info(Dev))
-      printf("    print_device_info not implemented\n");
-    printf("\n");
+template <typename T>
+ol_result_t printPlatformValue(std::ostream &S, ol_platform_handle_t Plat,
+                               ol_platform_info_t Info, const char *Desc) {
+  S << Desc << ": ";
+  // Pointer-typed values are fetched into a buffer of the queried size.
+  if constexpr (std::is_pointer_v<T>) {
+    std::vector<uint8_t> Val;
+    size_t Size;
+    OFFLOAD_ERR(olGetPlatformInfoSize(Plat, Info, &Size));
+    Val.resize(Size);
+    OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, Size, Val.data()));
+    doWrite(S, reinterpret_cast<T>(Val.data()));
+  } else {
+    T Val;
+    OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, sizeof(Val), &Val));
+    doWrite(S, std::move(Val));
+  }
+  S << "\n";
+  return OL_SUCCESS;
+}
+
+template <typename T, PrintKind PK = PrintKind::NORMAL>
+ol_result_t printDeviceValue(std::ostream &S, ol_device_handle_t Dev,
+                             ol_device_info_t Info, const char *Desc,
+                             const char *Units = nullptr) {
+  S << Desc << ": ";
+  // Pointer-typed values are fetched into a buffer of the queried size.
+  if constexpr (std::is_pointer_v<T>) {
+    std::vector<uint8_t> Val;
+    size_t Size;
+    OFFLOAD_ERR(olGetDeviceInfoSize(Dev, Info, &Size));
+    Val.resize(Size);
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, Size, Val.data()));
+    doWrite<T, PK>(S, reinterpret_cast<T>(Val.data()));
+  } else {
+    T Val;
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, sizeof(Val), &Val));
+    doWrite<T, PK>(S, std::move(Val));
+  }
+  if (Units)
+    S << " " << Units;
+  S << "\n";
+  return OL_SUCCESS;
+}
+
+/// Prints a "[<product name>]" header for device \p D to \p S, followed by one
+/// line per platform property and per device property queried below.
+///
+/// \returns OL_SUCCESS once every property has been written. Failing queries
+/// are handled by OFFLOAD_ERR (defined outside this hunk; presumably it
+/// returns the error to the caller -- confirm).
+ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
+  ol_platform_handle_t Platform;
+  OFFLOAD_ERR(
+      olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), &Platform));
+
+  // The product name is variable-sized: query its size, then its contents.
+  std::vector<char> Name;
+  size_t NameSize;
+  OFFLOAD_ERR(olGetDeviceInfoSize(D, OL_DEVICE_INFO_PRODUCT_NAME, &NameSize));
+  Name.resize(NameSize);
+  OFFLOAD_ERR(
+      olGetDeviceInfo(D, OL_DEVICE_INFO_PRODUCT_NAME, NameSize, Name.data()));
+  // An empty vector may hand back a null data() pointer, which must not be
+  // streamed as a C string; print an empty name in that case.
+  S << "[" << (Name.empty() ? "" : Name.data()) << "]\n";
+
+  // Platform properties.
+  OFFLOAD_ERR(printPlatformValue<const char *>(
+      S, Platform, OL_PLATFORM_INFO_NAME, "Platform Name"));
+  OFFLOAD_ERR(printPlatformValue<const char *>(
+      S, Platform, OL_PLATFORM_INFO_VENDOR_NAME, "Platform Vendor Name"));
+  OFFLOAD_ERR(printPlatformValue<const char *>(
+      S, Platform, OL_PLATFORM_INFO_VERSION, "Platform Version"));
+  OFFLOAD_ERR(printPlatformValue<ol_platform_backend_t>(
+      S, Platform, OL_PLATFORM_INFO_BACKEND, "Platform Backend"));
+
+  // Device properties.
+  OFFLOAD_ERR(
+      printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name"));
+  OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_PRODUCT_NAME,
+                                             "Product Name"));
+  OFFLOAD_ERR(
+      printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type"));
+  OFFLOAD_ERR(printDeviceValue<const char *>(
+      S, D, OL_DEVICE_INFO_DRIVER_VERSION, "Driver Version"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(
+      S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, "Max Work Group Size"));
+  OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>(
+      S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION,
+      "Max Work Group Size Per Dimension"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MAX_WORK_SIZE,
+                                         "Max Work Size"));
+  OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>(
+      S, D, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION,
+      "Max Work Size Per Dimension"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_VENDOR_ID, "Vendor ID"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NUM_COMPUTE_UNITS,
+                                         "Num Compute Units"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(
+      S, D, OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, "Max Clock Frequency", "MHz"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MEMORY_CLOCK_RATE,
+                                         "Memory Clock Rate", "MHz"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_ADDRESS_BITS,
+                                         "Address Bits"));
+  OFFLOAD_ERR(printDeviceValue<uint64_t>(
+      S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
+  OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
+                                         "Global Mem Size", "B"));
+  // The extra parentheses keep the comma inside the template argument list
+  // from being treated as a macro argument separator.
+  OFFLOAD_ERR(
+      (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+          S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,
+          "Single Precision Floating Point Capability")));
+  OFFLOAD_ERR(
+      (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+          S, D, OL_DEVICE_INFO_DOUBLE_FP_CONFIG,
+          "Double Precision Floating Point Capability")));
+  OFFLOAD_ERR(
+      (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+          S, D, OL_DEVICE_INFO_HALF_FP_CONFIG,
+          "Half Precision Floating Point Capability")));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR,
+                                 "Native Vector Width For Char"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT,
+                                 "Native Vector Width For Short"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D,
+                                         OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT,
+                                         "Native Vector Width For Int"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG,
+                                 "Native Vector Width For Long"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT,
+                                 "Native Vector Width For Float"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(
+      S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE,
+      "Native Vector Width For Double"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF,
+                                 "Native Vector Width For Half"));
+
+  return OL_SUCCESS;
+}
+
+/// Initializes liboffload, prints the library version, the number of devices
+/// and a description of every device to \p S, then shuts liboffload down.
+///
+/// \returns OL_SUCCESS when everything was printed and shutdown succeeded.
+/// NOTE(review): assuming OFFLOAD_ERR returns early on failure (its definition
+/// is outside this hunk), a failing printDevice leaves this function without
+/// calling olShutDown -- confirm that this is intended.
+ol_result_t printRoot(std::ostream &S) {
+  OFFLOAD_ERR(olInit());
+  S << "Liboffload Version: " << OL_VERSION_MAJOR << "." << OL_VERSION_MINOR
+    << "." << OL_VERSION_PATCH << "\n";
+
+  // Collect every device handle reported by the iteration callback.
+  std::vector<ol_device_handle_t> Devices;
+  OFFLOAD_ERR(olIterateDevices(
+      [](ol_device_handle_t Device, void *UserData) {
+        // UserData is the address of Devices passed below; static_cast is the
+        // appropriate cast for converting void * back to an object pointer.
+        static_cast<decltype(Devices) *>(UserData)->push_back(Device);
+        return true;
+      },
+      &Devices));
+
+  S << "Num Devices: " << Devices.size() << "\n";
+
+  for (ol_device_handle_t D : Devices) {
+    S << "\n";
+    OFFLOAD_ERR(printDevice(S, D));
   }
 
-  __tgt_unregister_lib(&EmptyDesc);
+  OFFLOAD_ERR(olShutDown());
+  return OL_SUCCESS;
+}
+
+/// Tool entry point: prints the device report to standard output. On failure
+/// the error code and details are written to standard error and the process
+/// exits with status 1.
+int main(int argc, char **argv) {
+  if (ol_result_t Result = printRoot(std::cout)) {
+    std::cerr << "[Liboffload error " << Result->Code
+              << "]: " << Result->Details << "\n";
+    return 1;
+  }
   return 0;
 }
diff --git a/offload/tools/offload-tblgen/APIGen.cpp b/offload/tools/offload-tblgen/APIGen.cpp
index 8c61d1f12de7..1e79c00ae06c 100644
--- a/offload/tools/offload-tblgen/APIGen.cpp
+++ b/offload/tools/offload-tblgen/APIGen.cpp
@@ -131,7 +131,8 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) {
   OS << formatv("/// @brief {0}\n", Enum.getDesc());
   OS << formatv("typedef enum {0} {{\n", Enum.getName());
 
-  uint32_t EtorVal = 0;
+  // Bitfields start from 1, other enums from 0
+  uint32_t EtorVal = Enum.isBitField();
   for (const auto &EnumVal : Enum.getValues()) {
     if (Enum.isTyped()) {
       OS << MakeComment(
@@ -141,7 +142,12 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) {
       OS << MakeComment(EnumVal.getDesc());
     }
     OS << formatv(TAB_1 "{0}_{1} = {2},\n", Enum.getEnumValNamePrefix(),
-                  EnumVal.getName(), EtorVal++);
+                  EnumVal.getName(), EtorVal);
+    if (Enum.isBitField()) {
+      EtorVal <<= 1u;
+    } else {
+      ++EtorVal;
+    }
   }
 
   // Add last_element/force uint32 val
@@ -220,31 +226,23 @@ OL_APIEXPORT ol_result_t OL_APICALL {0}WithCodeLoc(
 void EmitOffloadAPI(const RecordKeeper &Records, raw_ostream &OS) {
   OS << GenericHeader;
   OS << FileHeader;
-  // Generate main API definitions
-  for (auto *R : Records.getAllDerivedDefinitions("APIObject")) {
-    if (R->isSubClassOf("Macro")) {
-      ProcessMacro(MacroRec{R}, OS);
-    } else if (R->isSubClassOf("Typedef")) {
-      ProcessTypedef(TypedefRec{R}, OS);
-    } else if (R->isSubClassOf("Handle")) {
-      ProcessHandle(HandleRec{R}, OS);
-    } else if (R->isSubClassOf("Function")) {
-      ProcessFunction(FunctionRec{R}, OS);
-    } else if (R->isSubClassOf("Enum")) {
-      ProcessEnum(EnumRec{R}, OS);
-    } else if (R->isSubClassOf("Struct")) {
-      ProcessStruct(StructRec{R}, OS);
-    } else if (R->isSubClassOf("FptrTypedef")) {
-      ProcessFptrTypedef(FptrTypedefRec{R}, OS);
-    }
-  }
 
-  // Generate auxiliary definitions (func param structs etc)
+  // Generate main API definitions
+  for (auto *R : Records.getAllDerivedDefinitions("Macro"))
+    ProcessMacro(MacroRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Handle"))
+    ProcessHandle(HandleRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Enum"))
+    ProcessEnum(EnumRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Typedef"))
+    ProcessTypedef(TypedefRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("FptrTypedef"))
+    ProcessFptrTypedef(FptrTypedefRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Struct"))
+    ProcessStruct(StructRec{R}, OS);
   for (auto *R : Records.getAllDerivedDefinitions("Function")) {
     ProcessFuncParamStruct(FunctionRec{R}, OS);
-  }
-
-  for (auto *R : Records.getAllDerivedDefinitions("Function")) {
+    ProcessFunction(FunctionRec{R}, OS);
     ProcessFuncWithCodeLocVariant(FunctionRec{R}, OS);
   }
 
diff --git a/offload/tools/offload-tblgen/MiscGen.cpp b/offload/tools/offload-tblgen/MiscGen.cpp
index b90e5cfdec8b..8a8b9caf2348 100644
--- a/offload/tools/offload-tblgen/MiscGen.cpp
+++ b/offload/tools/offload-tblgen/MiscGen.cpp
@@ -86,7 +86,7 @@ void EmitOffloadErrcodes(const RecordKeeper &Records, raw_ostream &OS) {
 
 )";
 
-  auto ErrorCodeEnum = EnumRec{Records.getDef("ErrorCode")};
+  auto ErrorCodeEnum = EnumRec{Records.getDef("ol_errc_t")};
   uint32_t EtorVal = 0;
   for (const auto &EnumVal : ErrorCodeEnum.getValues()) {
     OS << formatv(TAB_1 "OFFLOAD_ERRC({0}, \"{1}\", {2})\n", EnumVal.getName(),
@@ -107,10 +107,16 @@ void EmitOffloadInfo(const RecordKeeper &Records, raw_ostream &OS) {
 
 )";
 
-  auto ErrorCodeEnum = EnumRec{Records.getDef("DeviceInfo")};
-  uint32_t EtorVal = 0;
-  for (const auto &EnumVal : ErrorCodeEnum.getValues()) {
+  auto Enum = EnumRec{Records.getDef("ol_device_info_t")};
+  // Bitfields start from 1, other enums from 0
+  uint32_t EtorVal = Enum.isBitField();
+  for (const auto &EnumVal : Enum.getValues()) {
     OS << formatv(TAB_1 "OFFLOAD_DEVINFO({0}, \"{1}\", {2})\n",
-                  EnumVal.getName(), EnumVal.getDesc(), EtorVal++);
+                  EnumVal.getName(), EnumVal.getDesc(), EtorVal);
+    if (Enum.isBitField()) {
+      EtorVal <<= 1u;
+    } else {
+      ++EtorVal;
+    }
   }
 }
diff --git a/offload/tools/offload-tblgen/RecordTypes.hpp b/offload/tools/offload-tblgen/RecordTypes.hpp
index 65c0a4ce4a2c..2abd9e10f0f9 100644
--- a/offload/tools/offload-tblgen/RecordTypes.hpp
+++ b/offload/tools/offload-tblgen/RecordTypes.hpp
@@ -16,25 +16,30 @@ namespace llvm {
 namespace offload {
 namespace tblgen {
 
-class HandleRec {
+/// Common base for the record wrappers in this header. It stores the wrapped
+/// TableGen record and provides the accessors shared by all of them.
+class APIObject {
 public:
-  explicit HandleRec(const Record *rec) : rec(rec) {}
-  StringRef getName() const { return rec->getValueAsString("name"); }
+  /// Returns the name of the wrapped record itself.
+  StringRef getName() const { return rec->getName(); }
+  /// Returns the record's "desc" string field.
   StringRef getDesc() const { return rec->getValueAsString("desc"); }
 
-private:
+protected:
+  // Protected so that only derived wrapper classes can construct the base.
+  APIObject(const Record *rec) : rec(rec) {}
   const Record *rec;
 };
 
-class MacroRec {
+/// Record wrapper for a handle definition. It adds nothing beyond the name
+/// and description accessors inherited from APIObject.
+class HandleRec : public APIObject {
 public:
-  explicit MacroRec(const Record *rec) : rec(rec) {
-    auto Name = rec->getValueAsString("name");
+  explicit HandleRec(const Record *rec) : APIObject(rec) {}
+};
+
+class MacroRec : public APIObject {
+public:
+  explicit MacroRec(const Record *rec) : APIObject(rec) {
+    auto Name = rec->getName();
     auto OpenBrace = Name.find_first_of("(");
     nameWithoutArgs = Name.substr(0, OpenBrace);
   }
   StringRef getName() const { return nameWithoutArgs; }
-  StringRef getNameWithArgs() const { return rec->getValueAsString("name"); }
+  StringRef getNameWithArgs() const { return rec->getName(); }
   StringRef getDesc() const { return rec->getValueAsString("desc"); }
 
   std::optional<StringRef> getCondition() const {
@@ -46,19 +51,15 @@ public:
   }
 
 private:
-  const Record *rec;
   std::string nameWithoutArgs;
 };
 
-class TypedefRec {
+/// Record wrapper for a typedef definition: the inherited name/description
+/// accessors plus the record's "value" string field.
+class TypedefRec : public APIObject {
 public:
-  explicit TypedefRec(const Record *rec) : rec(rec) {}
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
-  StringRef getValue() const { return rec->getValueAsString("value"); }
+  explicit TypedefRec(const Record *rec) : APIObject(rec) {}
 
-private:
-  const Record *rec;
+  /// Returns the record's "value" string field.
+  StringRef getValue() const { return rec->getValueAsString("value"); }
 };
 
 class EnumValueRec {
@@ -74,15 +75,13 @@ private:
   const Record *rec;
 };
 
-class EnumRec {
+class EnumRec : public APIObject {
 public:
-  explicit EnumRec(const Record *rec) : rec(rec) {
+  explicit EnumRec(const Record *rec) : APIObject(rec) {
     for (const auto *Val : rec->getValueAsListOfDefs("etors")) {
       vals.emplace_back(EnumValueRec{Val});
     }
   }
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
   const std::vector<EnumValueRec> &getValues() const { return vals; }
 
   std::string getEnumValNamePrefix() const {
@@ -92,8 +91,9 @@ public:
 
   bool isTyped() const { return rec->getValueAsBit("is_typed"); }
 
+  bool isBitField() const { return rec->getValueAsBit("is_bit_field"); }
+
 private:
-  const Record *rec;
   std::vector<EnumValueRec> vals;
 };
 
@@ -110,22 +110,19 @@ private:
   const Record *rec;
 };
 
-class StructRec {
+/// Record wrapper for a struct definition. The member list is built once in
+/// the constructor from the record's "all_members" field.
+class StructRec : public APIObject {
 public:
-  explicit StructRec(const Record *rec) : rec(rec) {
+  explicit StructRec(const Record *rec) : APIObject(rec) {
     for (auto *Member : rec->getValueAsListOfDefs("all_members")) {
       members.emplace_back(StructMemberRec(Member));
     }
   }
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
+  /// Returns the optional "base_class" string field of the record.
   std::optional<StringRef> getBaseClass() const {
     return rec->getValueAsOptionalString("base_class");
   }
+  /// Returns the members collected by the constructor.
   const std::vector<StructMemberRec> &getMembers() const { return members; }
 
 private:
-  const Record *rec;
   std::vector<StructMemberRec> members;
 };
 
@@ -205,9 +202,9 @@ private:
   const Record *rec;
 };
 
-class FunctionRec {
+class FunctionRec : public APIObject {
 public:
-  FunctionRec(const Record *rec) : rec(rec) {
+  FunctionRec(const Record *rec) : APIObject(rec) {
     for (auto &Ret : rec->getValueAsListOfDefs("all_returns"))
       rets.emplace_back(Ret);
     for (auto &Param : rec->getValueAsListOfDefs("params"))
@@ -219,11 +216,9 @@ public:
                          llvm::convertToSnakeFromCamelCase(getName()));
   }
 
-  StringRef getName() const { return rec->getValueAsString("name"); }
   StringRef getClass() const { return rec->getValueAsString("api_class"); }
   const std::vector<ReturnRec> &getReturns() const { return rets; }
   const std::vector<ParamRec> &getParams() const { return params; }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
   std::vector<StringRef> getDetails() const {
     return rec->getValueAsListOfStrings("details");
   }
@@ -234,25 +229,19 @@ public:
 private:
   std::vector<ReturnRec> rets;
   std::vector<ParamRec> params;
-
-  const Record *rec;
 };
 
-class FptrTypedefRec {
+/// Record wrapper for a function-pointer typedef definition. The parameter
+/// list is built once in the constructor from the record's "params" field.
+class FptrTypedefRec : public APIObject {
 public:
-  explicit FptrTypedefRec(const Record *rec) : rec(rec) {
+  explicit FptrTypedefRec(const Record *rec) : APIObject(rec) {
     for (auto &Param : rec->getValueAsListOfDefs("params"))
       params.emplace_back(Param);
   }
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
+  /// Returns the record's "return" string field.
   StringRef getReturn() const { return rec->getValueAsString("return"); }
+  /// Returns the parameters collected by the constructor.
   const std::vector<ParamRec> &getParams() const { return params; }
 
 private:
   std::vector<ParamRec> params;
-
-  const Record *rec;
 };
 
 } // namespace tblgen
diff --git a/offload/unittests/Conformance/README.md b/offload/unittests/Conformance/README.md
new file mode 100644
index 000000000000..0202242c99a0
--- /dev/null
+++ b/offload/unittests/Conformance/README.md
@@ -0,0 +1,83 @@
+# GPU Math Conformance Tests
+
+## Overview
+
+This test suite provides a framework to systematically measure the accuracy of math functions on GPUs and verify their conformance with standards like OpenCL.
+
+While the primary focus is validating the implementations in the C standard math library (LLVM-libm), these tests can also be executed against other math library providers, such as CUDA Math and HIP Math, for comparison.
+
+The goals of this project are to empower LLVM-libm contributors with a robust tool for validating their implementations and to build trust with end-users by providing transparent accuracy data.
+
+### Table of Contents
+
+- [Getting Started](#getting-started)
+- [Running the Tests](#running-the-tests)
+- [Adding New Tests](#adding-new-tests)
+
+## Getting Started
+
+This guide covers how to build the necessary dependencies, which include the new Offload API and the C standard library for both host and GPU targets.
+
+### System Requirements
+
+Before you begin, ensure your system meets the following requirements:
+
+- A system with an AMD or NVIDIA GPU.
+- The latest proprietary GPU drivers installed.
+- The corresponding development SDK for your hardware:
+  - **AMD:** [ROCm SDK](https://rocm.docs.amd.com)
+  - **NVIDIA:** [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
+
+### Building the Dependencies
+
+The official documentation for building LLVM-libc for GPUs provides a detailed guide and should be considered the primary reference. Please follow the instructions in the **"Standard runtimes build"** section of that guide:
+
+- [Building the GPU C library (Official Documentation)](https://libc.llvm.org/gpu/building.html)
+
+> [!IMPORTANT]
+> For the conformance tests, the standard `cmake` command from the official documentation must be adapted slightly. You must also add `libc` to the main `-DLLVM_ENABLE_RUNTIMES` list. This is a crucial step because the tests need a host-side build of `libc` to use as the reference oracle for validating GPU results.
+
+## Running the Tests
+
+### Default Test
+
+To build and run the conformance test for a given function (e.g., `logf`) against the default provider, `llvm-libm` (the C standard math library), use the following command. This will execute the test on all available and supported platforms.
+
+```bash
+ninja -C build/runtimes/runtimes-bins offload.conformance.logf
+```
+
+### Testing Other Providers
+
+Once the test binary has been built, you can run it against other math library providers using the `--test-configs` flag.
+
+- **For `cuda-math` on an NVIDIA GPU:**
+
+  ```bash
+  ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=cuda-math:cuda
+  ```
+
+- **For `hip-math` on an AMD GPU:**
+
+  ```bash
+  ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=hip-math:amdgpu
+  ```
+
+You can also run all available configurations for a test with:
+
+```bash
+./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=all
+```
+
+## Adding New Tests
+
+To add a conformance test for a new math function, follow these steps:
+
+1. **Implement the Device Kernels**: Create a kernel wrapper for the new function in each provider's source file. For CUDA Math and HIP Math, you must also add a forward declaration for the vendor function in `/device_code/DeviceAPIs.hpp`.
+
+2. **Implement the Host Test**: Create a new `.cpp` file in `/tests`. This file defines the `FunctionConfig` (function and kernel names, as well as ULP tolerance) and the input generation strategy.
+
+    - Use **exhaustive testing** (`ExhaustiveGenerator`) for functions with small input spaces (e.g., half-precision functions and single-precision univariate functions). This strategy iterates over every representable point in the input space, ensuring complete coverage.
+    - Use **randomized testing** (`RandomGenerator`) for functions with large input spaces (e.g., single-precision bivariate and double-precision functions), where exhaustive testing is computationally infeasible. Although not exhaustive, this strategy is deterministic, using a fixed seed to sample a large, reproducible subset of points from the input space.
+
+3. **Add the Build Target**: Add a new `add_conformance_test(...)` entry to `/tests/CMakeLists.txt` to make the test buildable.
diff --git a/offload/unittests/Conformance/device_code/CUDAMath.cpp b/offload/unittests/Conformance/device_code/CUDAMath.cpp
index a351e924b8f8..d80660b2e3c7 100644
--- a/offload/unittests/Conformance/device_code/CUDAMath.cpp
+++ b/offload/unittests/Conformance/device_code/CUDAMath.cpp
@@ -26,6 +26,22 @@ using namespace kernels;
 // Helpers
 //===----------------------------------------------------------------------===//
 
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return __nv_powf(Base, __nv_roundf(Exponent));
+}
+
+static inline double sincosSin(double X) {
+  double SinX, CosX;
+  __nv_sincos(X, &SinX, &CosX);
+  return SinX;
+}
+
+static inline double sincosCos(double X) {
+  double SinX, CosX;
+  __nv_sincos(X, &SinX, &CosX);
+  return CosX;
+}
+
 static inline float sincosfSin(float X) {
   float SinX, CosX;
   __nv_sincosf(X, &SinX, &CosX);
@@ -44,6 +60,11 @@ static inline float sincosfCos(float X) {
 
 extern "C" {
 
+__gpu_kernel void acosKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_acos>(NumElements, Out, X);
+}
+
 __gpu_kernel void acosfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_acosf>(NumElements, Out, X);
@@ -54,6 +75,11 @@ __gpu_kernel void acoshfKernel(const float *X, float *Out,
   runKernelBody<__nv_acoshf>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_asin>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_asinf>(NumElements, Out, X);
@@ -69,16 +95,31 @@ __gpu_kernel void atanfKernel(const float *X, float *Out,
   runKernelBody<__nv_atanf>(NumElements, Out, X);
 }
 
+__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_atan2f>(NumElements, Out, X, Y);
+}
+
 __gpu_kernel void atanhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_atanhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cbrtKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_cbrt>(NumElements, Out, X);
+}
+
 __gpu_kernel void cbrtfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_cbrtf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_cos>(NumElements, Out, X);
+}
+
 __gpu_kernel void cosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_cosf>(NumElements, Out, X);
@@ -99,51 +140,127 @@ __gpu_kernel void erffKernel(const float *X, float *Out,
   runKernelBody<__nv_erff>(NumElements, Out, X);
 }
 
+__gpu_kernel void expKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_exp>(NumElements, Out, X);
+}
+
 __gpu_kernel void expfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_expf>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_exp10>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_exp10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_exp2>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_exp2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void expm1Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_expm1>(NumElements, Out, X);
+}
+
 __gpu_kernel void expm1fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_expm1f>(NumElements, Out, X);
 }
 
+__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_hypot>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_hypotf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void logKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_log>(NumElements, Out, X);
+}
+
 __gpu_kernel void logfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_logf>(NumElements, Out, X);
 }
 
+__gpu_kernel void log10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_log10>(NumElements, Out, X);
+}
+
 __gpu_kernel void log10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_log10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void log1pKernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_log1p>(NumElements, Out, X);
+}
+
 __gpu_kernel void log1pfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_log1pf>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_log2>(NumElements, Out, X);
+}
+
 __gpu_kernel void log2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_log2f>(NumElements, Out, X);
 }
 
+// Applies __nv_powf to NumElements pairs from X and Y via runKernelBody,
+// with Out as the output buffer. Y is an input, so it is taken as a pointer
+// to const like the inputs of the other two-argument kernels in this file.
+__gpu_kernel void powfKernel(const float *X, const float *Y, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_powf>(NumElements, Out, X, Y);
+}
+
+// Applies powfRoundedExponent to NumElements pairs from X and Y via
+// runKernelBody, with Out as the output buffer. Y is an input, so it is taken
+// as a pointer to const like the inputs of the other two-argument kernels in
+// this file.
+__gpu_kernel void powfRoundedExponentKernel(const float *X, const float *Y,
+                                            float *Out,
+                                            size_t NumElements) noexcept {
+  runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void sinKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_sin>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_sinf>(NumElements, Out, X);
 }
 
+__gpu_kernel void sincosSinKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosCosKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosCos>(NumElements, Out, X);
+}
+
 __gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                    size_t NumElements) noexcept {
   runKernelBody<sincosfSin>(NumElements, Out, X);
@@ -164,6 +281,11 @@ __gpu_kernel void sinpifKernel(const float *X, float *Out,
   runKernelBody<__nv_sinpif>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_tan>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_tanf>(NumElements, Out, X);
diff --git a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
index 8476dcbeff0c..894652a8e1af 100644
--- a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
+++ b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
@@ -48,29 +48,49 @@ extern const inline uint32_t __oclc_ISA_version = 9000;
 
 extern "C" {
 
+double __nv_acos(double);
 float __nv_acosf(float);
 float __nv_acoshf(float);
+double __nv_asin(double);
 float __nv_asinf(float);
 float __nv_asinhf(float);
 float __nv_atanf(float);
+float __nv_atan2f(float, float);
 float __nv_atanhf(float);
+double __nv_cbrt(double);
 float __nv_cbrtf(float);
+double __nv_cos(double);
 float __nv_cosf(float);
 float __nv_coshf(float);
 float __nv_cospif(float);
 float __nv_erff(float);
+double __nv_exp(double);
 float __nv_expf(float);
+double __nv_exp10(double);
 float __nv_exp10f(float);
+double __nv_exp2(double);
 float __nv_exp2f(float);
+double __nv_expm1(double);
 float __nv_expm1f(float);
+double __nv_hypot(double, double);
+float __nv_hypotf(float, float);
+double __nv_log(double);
 float __nv_logf(float);
+double __nv_log10(double);
 float __nv_log10f(float);
+double __nv_log1p(double);
 float __nv_log1pf(float);
+double __nv_log2(double);
 float __nv_log2f(float);
+float __nv_powf(float, float);
+float __nv_roundf(float);
+double __nv_sin(double);
 float __nv_sinf(float);
+void __nv_sincos(double, double *, double *);
 void __nv_sincosf(float, float *, float *);
 float __nv_sinhf(float);
 float __nv_sinpif(float);
+double __nv_tan(double);
 float __nv_tanf(float);
 float __nv_tanhf(float);
 } // extern "C"
@@ -81,31 +101,70 @@ float __nv_tanhf(float);
 
 extern "C" {
 
+double __ocml_acos_f64(double);
 float __ocml_acos_f32(float);
+float16 __ocml_acos_f16(float16);
 float __ocml_acosh_f32(float);
+float16 __ocml_acosh_f16(float16);
+double __ocml_asin_f64(double);
 float __ocml_asin_f32(float);
+float16 __ocml_asin_f16(float16);
 float __ocml_asinh_f32(float);
+float16 __ocml_asinh_f16(float16);
 float __ocml_atan_f32(float);
+float16 __ocml_atan_f16(float16);
+float __ocml_atan2_f32(float, float);
 float __ocml_atanh_f32(float);
+float16 __ocml_atanh_f16(float16);
+double __ocml_cbrt_f64(double);
 float __ocml_cbrt_f32(float);
+double __ocml_cos_f64(double);
 float __ocml_cos_f32(float);
+float16 __ocml_cos_f16(float16);
 float __ocml_cosh_f32(float);
+float16 __ocml_cosh_f16(float16);
 float __ocml_cospi_f32(float);
 float __ocml_erf_f32(float);
+double __ocml_exp_f64(double);
 float __ocml_exp_f32(float);
+float16 __ocml_exp_f16(float16);
+double __ocml_exp10_f64(double);
 float __ocml_exp10_f32(float);
+float16 __ocml_exp10_f16(float16);
+double __ocml_exp2_f64(double);
 float __ocml_exp2_f32(float);
+float16 __ocml_exp2_f16(float16);
+double __ocml_expm1_f64(double);
 float __ocml_expm1_f32(float);
+float16 __ocml_expm1_f16(float16);
+double __ocml_hypot_f64(double, double);
+float __ocml_hypot_f32(float, float);
+double __ocml_log_f64(double);
 float __ocml_log_f32(float);
+float16 __ocml_log_f16(float16);
+double __ocml_log10_f64(double);
 float __ocml_log10_f32(float);
+float16 __ocml_log10_f16(float16);
+double __ocml_log1p_f64(double);
 float __ocml_log1p_f32(float);
+double __ocml_log2_f64(double);
 float __ocml_log2_f32(float);
+float16 __ocml_log2_f16(float16);
+float __ocml_pow_f32(float, float);
+float __ocml_round_f32(float);
+double __ocml_sin_f64(double);
 float __ocml_sin_f32(float);
+float16 __ocml_sin_f16(float16);
+double __ocml_sincos_f64(double, double *);
 float __ocml_sincos_f32(float, float *);
 float __ocml_sinh_f32(float);
+float16 __ocml_sinh_f16(float16);
 float __ocml_sinpi_f32(float);
+double __ocml_tan_f64(double);
 float __ocml_tan_f32(float);
+float16 __ocml_tan_f16(float16);
 float __ocml_tanh_f32(float);
+float16 __ocml_tanh_f16(float16);
 } // extern "C"
 
 #endif // HIP_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/HIPMath.cpp b/offload/unittests/Conformance/device_code/HIPMath.cpp
index 36efe6b2696a..7cc0ad5d9142 100644
--- a/offload/unittests/Conformance/device_code/HIPMath.cpp
+++ b/offload/unittests/Conformance/device_code/HIPMath.cpp
@@ -26,6 +26,22 @@ using namespace kernels;
 // Helpers
 //===----------------------------------------------------------------------===//
 
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return __ocml_pow_f32(Base, __ocml_round_f32(Exponent));
+}
+
+static inline double sincosSin(double X) {
+  double CosX;
+  double SinX = __ocml_sincos_f64(X, &CosX);
+  return SinX;
+}
+
+static inline double sincosCos(double X) {
+  double CosX;
+  (void)__ocml_sincos_f64(X, &CosX);
+  return CosX;
+}
+
 static inline float sincosfSin(float X) {
   float CosX;
   float SinX = __ocml_sincos_f32(X, &CosX);
@@ -44,51 +60,116 @@ static inline float sincosfCos(float X) {
 
 extern "C" {
 
+__gpu_kernel void acosKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_acos_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void acosfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_acos_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_acos_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void acoshfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_acosh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_acosh_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_asin_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_asin_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_asin_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_asinh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_asinh_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void atanfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_atan_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_atan_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_atan2_f32>(NumElements, Out, X, Y);
+}
+
 __gpu_kernel void atanhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_atanh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_atanh_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_cbrt_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void cbrtfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_cbrt_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_cos_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void cosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_cos_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_cos_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void coshfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_cosh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_cosh_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void cospifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_cospi_f32>(NumElements, Out, X);
@@ -99,51 +180,167 @@ __gpu_kernel void erffKernel(const float *X, float *Out,
   runKernelBody<__ocml_erf_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void expKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void expfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_exp_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp10_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_exp10_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp10_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp2_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_exp2_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp2_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_expm1_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void expm1fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_expm1_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_expm1_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_hypot_f64>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_hypot_f32>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void logKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_log_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void logfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_log_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_log_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_log10_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void log10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_log10_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_log10_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pKernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_log1p_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void log1pfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_log1p_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_log2_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void log2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_log2_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_log2_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void powfKernel(const float *X, const float *Y, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_pow_f32>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void powfRoundedExponentKernel(const float *X, const float *Y,
+                                            float *Out,
+                                            size_t NumElements) noexcept {
+  runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void sinKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_sin_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_sin_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_sin_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosSinKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosCosKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosCos>(NumElements, Out, X);
+}
+
 __gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                    size_t NumElements) noexcept {
   runKernelBody<sincosfSin>(NumElements, Out, X);
@@ -159,20 +356,40 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out,
   runKernelBody<__ocml_sinh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_sinh_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinpifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_sinpi_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_tan_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_tan_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_tan_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_tanh_f32>(NumElements, Out, X);
 }
+
+__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_tanh_f16>(NumElements, Out, X);
+}
 } // extern "C"
 
 #endif // HIP_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.cpp b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
index 8869d8701748..8673d809fd0a 100644
--- a/offload/unittests/Conformance/device_code/LLVMLibm.cpp
+++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
@@ -25,6 +25,22 @@ using namespace kernels;
 // Helpers
 //===----------------------------------------------------------------------===//
 
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return powf(Base, roundf(Exponent));
+}
+
+static inline double sincosSin(double X) {
+  double SinX, CosX;
+  sincos(X, &SinX, &CosX);
+  return SinX;
+}
+
+static inline double sincosCos(double X) {
+  double SinX, CosX;
+  sincos(X, &SinX, &CosX);
+  return CosX;
+}
+
 static inline float sincosfSin(float X) {
   float SinX, CosX;
   sincosf(X, &SinX, &CosX);
@@ -43,111 +59,302 @@ static inline float sincosfCos(float X) {
 
 extern "C" {
 
+__gpu_kernel void acosKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<acos>(NumElements, Out, X);
+}
+
 __gpu_kernel void acosfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<acosf>(NumElements, Out, X);
 }
 
+__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<acosf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void acoshfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<acoshf>(NumElements, Out, X);
 }
 
+__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<acoshf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void acospif16Kernel(const float16 *X, float16 *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<acospif16>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<asin>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<asinf>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<asinf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<asinhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<asinhf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void atanfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<atanf>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<atanf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<atan2f>(NumElements, Out, X, Y);
+}
+
 __gpu_kernel void atanhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<atanhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<atanhf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<cbrt>(NumElements, Out, X);
+}
+
 __gpu_kernel void cbrtfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<cbrtf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<cos>(NumElements, Out, X);
+}
+
 __gpu_kernel void cosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<cosf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<cosf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void coshfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<coshf>(NumElements, Out, X);
 }
 
+__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<coshf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void cospifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<cospif>(NumElements, Out, X);
 }
 
+__gpu_kernel void cospif16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<cospif16>(NumElements, Out, X);
+}
+
 __gpu_kernel void erffKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<erff>(NumElements, Out, X);
 }
 
+__gpu_kernel void expKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<exp>(NumElements, Out, X);
+}
+
 __gpu_kernel void expfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<expf>(NumElements, Out, X);
 }
 
+__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<expf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<exp10>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<exp10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<exp10f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<exp2>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<exp2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<exp2f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<expm1>(NumElements, Out, X);
+}
+
 __gpu_kernel void expm1fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<expm1f>(NumElements, Out, X);
 }
 
-__gpu_kernel void hypotf16Kernel(const float16 *X, float16 *Y, float16 *Out,
+__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out,
                                  size_t NumElements) noexcept {
+  runKernelBody<expm1f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<hypot>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<hypotf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotf16Kernel(const float16 *X, const float16 *Y,
+                                 float16 *Out, size_t NumElements) noexcept {
   runKernelBody<hypotf16>(NumElements, Out, X, Y);
 }
 
+__gpu_kernel void logKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<log>(NumElements, Out, X);
+}
+
 __gpu_kernel void logfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<logf>(NumElements, Out, X);
 }
 
+__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<logf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<log10>(NumElements, Out, X);
+}
+
 __gpu_kernel void log10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<log10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<log10f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pKernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<log1p>(NumElements, Out, X);
+}
+
 __gpu_kernel void log1pfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<log1pf>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<log2>(NumElements, Out, X);
+}
+
 __gpu_kernel void log2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<log2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<log2f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void powfKernel(const float *X, const float *Y, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<powf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void powfRoundedExponentKernel(const float *X, const float *Y,
+                                            float *Out,
+                                            size_t NumElements) noexcept {
+  runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void sinKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<sin>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<sinf>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<sinf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosSinKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosCosKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosCos>(NumElements, Out, X);
+}
+
 __gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                    size_t NumElements) noexcept {
   runKernelBody<sincosfSin>(NumElements, Out, X);
@@ -163,23 +370,53 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out,
   runKernelBody<sinhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<sinhf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinpifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<sinpif>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinpif16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<sinpif16>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<tan>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<tanf>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<tanf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<tanhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<tanhf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanpifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<tanpif>(NumElements, Out, X);
 }
+
+__gpu_kernel void tanpif16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<tanpif16>(NumElements, Out, X);
+}
 } // extern "C"
diff --git a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp
index 6f7f7a9b665d..39c6838eecf7 100644
--- a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp
+++ b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp
@@ -8,8 +8,8 @@
 ///
 /// \file
 /// This file contains the definition of the ExhaustiveGenerator class, a
-/// concrete input generator that exhaustively creates inputs from a given
-/// sequence of ranges.
+/// concrete range-based generator that exhaustively creates inputs from a
+/// given sequence of ranges.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -17,89 +17,62 @@
 #define MATHTEST_EXHAUSTIVEGENERATOR_HPP
 
 #include "mathtest/IndexedRange.hpp"
-#include "mathtest/InputGenerator.hpp"
+#include "mathtest/RangeBasedGenerator.hpp"
 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/Parallel.h"
-
-#include <algorithm>
 #include <array>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <optional>
 #include <tuple>
 
 namespace mathtest {
 
 template <typename... InTypes>
 class [[nodiscard]] ExhaustiveGenerator final
-    : public InputGenerator<InTypes...> {
-  static constexpr std::size_t NumInputs = sizeof...(InTypes);
-  static_assert(NumInputs > 0, "The number of inputs must be at least 1");
+    : public RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...> {
+
+  friend class RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>;
+
+  using Base = RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>;
+  using IndexArrayType = std::array<uint64_t, Base::NumInputs>;
+
+  using Base::RangesTuple;
+  using Base::Size;
 
 public:
   explicit constexpr ExhaustiveGenerator(
       const IndexedRange<InTypes> &...Ranges) noexcept
-      : RangesTuple(Ranges...) {
-    bool Overflowed = getSizeWithOverflow(Ranges..., Size);
+      : Base(Ranges...) {
+    const auto MaybeSize = getInputSpaceSize(Ranges...);
+
+    assert(MaybeSize.has_value() && "The size is too large");
+    Size = *MaybeSize;
 
-    assert(!Overflowed && "The input space size is too large");
-    assert((Size > 0) && "The input space size must be at least 1");
+    assert((Size > 0) && "The size must be at least 1");
 
     IndexArrayType DimSizes = {};
     std::size_t DimIndex = 0;
     ((DimSizes[DimIndex++] = Ranges.getSize()), ...);
 
-    Strides[NumInputs - 1] = 1;
-    if constexpr (NumInputs > 1)
-      for (int Index = static_cast<int>(NumInputs) - 2; Index >= 0; --Index)
+    Strides[Base::NumInputs - 1] = 1;
+    if constexpr (Base::NumInputs > 1)
+      for (int Index = static_cast<int>(Base::NumInputs) - 2; Index >= 0;
+           --Index)
         Strides[Index] = Strides[Index + 1] * DimSizes[Index + 1];
   }
 
-  void reset() noexcept override { NextFlatIndex = 0; }
-
-  [[nodiscard]] std::size_t
-  fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override {
-    const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...};
-    const std::size_t BufferSize = BufferSizes[0];
-    assert((BufferSize != 0) && "Buffer size cannot be zero");
-    assert(std::all_of(BufferSizes.begin(), BufferSizes.end(),
-                       [&](std::size_t Size) { return Size == BufferSize; }) &&
-           "All input buffers must have the same size");
-
-    if (NextFlatIndex >= Size)
-      return 0;
-
-    const auto BatchSize = std::min<uint64_t>(BufferSize, Size - NextFlatIndex);
-    const auto CurrentFlatIndex = NextFlatIndex;
-    NextFlatIndex += BatchSize;
-
-    auto BufferPtrsTuple = std::make_tuple(Buffers.data()...);
-
-    llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) {
-      writeInputs(CurrentFlatIndex, Offset, BufferPtrsTuple);
-    });
-
-    return static_cast<std::size_t>(BatchSize);
-  }
-
 private:
-  using RangesTupleType = std::tuple<IndexedRange<InTypes>...>;
-  using IndexArrayType = std::array<uint64_t, NumInputs>;
-
-  static bool getSizeWithOverflow(const IndexedRange<InTypes> &...Ranges,
-                                  uint64_t &Size) noexcept {
-    Size = 1;
-    bool Overflowed = false;
-
-    auto Multiplier = [&](const uint64_t RangeSize) {
-      if (!Overflowed)
-        Overflowed = __builtin_mul_overflow(Size, RangeSize, &Size);
-    };
+  [[nodiscard]] constexpr IndexArrayType
+  getNDIndex(uint64_t FlatIndex) const noexcept {
+    IndexArrayType NDIndex;
 
-    (Multiplier(Ranges.getSize()), ...);
+    for (std::size_t Index = 0; Index < Base::NumInputs; ++Index) {
+      NDIndex[Index] = FlatIndex / Strides[Index];
+      FlatIndex -= NDIndex[Index] * Strides[Index];
+    }
 
-    return Overflowed;
+    return NDIndex;
   }
 
   template <typename BufferPtrsTupleType>
@@ -109,31 +82,37 @@ private:
     writeInputsImpl<0>(NDIndex, Offset, BufferPtrsTuple);
   }
 
-  constexpr IndexArrayType getNDIndex(uint64_t FlatIndex) const noexcept {
-    IndexArrayType NDIndex;
-
-    for (std::size_t Index = 0; Index < NumInputs; ++Index) {
-      NDIndex[Index] = FlatIndex / Strides[Index];
-      FlatIndex -= NDIndex[Index] * Strides[Index];
-    }
-
-    return NDIndex;
-  }
-
   template <std::size_t Index, typename BufferPtrsTupleType>
   void writeInputsImpl(IndexArrayType NDIndex, uint64_t Offset,
                        BufferPtrsTupleType BufferPtrsTuple) const noexcept {
-    if constexpr (Index < NumInputs) {
+    if constexpr (Index < Base::NumInputs) {
       const auto &Range = std::get<Index>(RangesTuple);
       std::get<Index>(BufferPtrsTuple)[Offset] = Range[NDIndex[Index]];
+
       writeInputsImpl<Index + 1>(NDIndex, Offset, BufferPtrsTuple);
     }
   }
 
-  uint64_t Size = 1;
-  RangesTupleType RangesTuple;
+  // Product of all range sizes, or std::nullopt if it overflows 64 bits.
+  [[nodiscard]] static constexpr std::optional<uint64_t>
+  getInputSpaceSize(const IndexedRange<InTypes> &...Ranges) noexcept {
+    uint64_t Product = 1;
+    bool HasOverflow = false;
+
+    // After the first overflow the remaining factors are ignored.
+    const auto Accumulate = [&](const uint64_t Factor) {
+      if (HasOverflow)
+        return;
+      HasOverflow = __builtin_mul_overflow(Product, Factor, &Product);
+    };
+    (Accumulate(Ranges.getSize()), ...);
+
+    if (HasOverflow)
+      return std::nullopt;
+    return Product;
+  }
+
   IndexArrayType Strides = {};
-  uint64_t NextFlatIndex = 0;
 };
 } // namespace mathtest
 
diff --git a/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp
new file mode 100644
index 000000000000..436cd05f0a3d
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RandomGenerator class, a concrete
+/// range-based generator that randomly creates inputs from a given sequence of
+/// ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANDOMGENERATOR_HPP
+#define MATHTEST_RANDOMGENERATOR_HPP
+
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/RangeBasedGenerator.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace mathtest {
+
+template <typename... InTypes>
+class [[nodiscard]] RandomGenerator final
+    : public RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...> {
+  // CRTP base, befriended so that it can call the private writeInputs.
+  using Base = RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>;
+  friend Base;
+
+  using Base::RangesTuple;
+  using Base::Size;
+
+public:
+  // Generates Size inputs from Ranges, with randomness derived from BaseSeed.
+  explicit constexpr RandomGenerator(
+      SeedTy BaseSeed, uint64_t Size,
+      const IndexedRange<InTypes> &...Ranges) noexcept
+      : Base(Size, Ranges...), BaseSeed(BaseSeed) {}
+
+private:
+  // Returns an index in [0, RangeSize) without modulo bias, or 0 if it is 0.
+  [[nodiscard]] static uint64_t getRandomIndex(RandomState &RNG,
+                                               uint64_t RangeSize) noexcept {
+    if (RangeSize == 0)
+      return 0;
+
+    // Equals 2^64 mod RangeSize; draws below this bound are discarded.
+    const uint64_t RejectBelow = (0 - RangeSize) % RangeSize;
+    uint64_t Draw = RNG.next();
+    while (Draw < RejectBelow)
+      Draw = RNG.next();
+    return Draw % RangeSize;
+  }
+
+  // Fills slot Offset of every buffer using an RNG seeded from BaseSeed and
+  // the absolute input index (CurrentFlatIndex + Offset).
+  template <typename BufferPtrsTupleType>
+  void writeInputs(uint64_t CurrentFlatIndex, uint64_t Offset,
+                   BufferPtrsTupleType BufferPtrsTuple) const noexcept {
+    RandomState RNG(SeedTy{BaseSeed.Value ^ (CurrentFlatIndex + Offset)});
+    writeInputsImpl<0>(RNG, Offset, BufferPtrsTuple);
+  }
+
+  // Writes input number Index, then recurses to Index + 1 until all are done.
+  template <std::size_t Index, typename BufferPtrsTupleType>
+  void writeInputsImpl(RandomState &RNG, uint64_t Offset,
+                       BufferPtrsTupleType BufferPtrsTuple) const noexcept {
+    if constexpr (Index < Base::NumInputs) {
+      const auto &Source = std::get<Index>(RangesTuple);
+      auto &Slot = std::get<Index>(BufferPtrsTuple)[Offset];
+      Slot = Source[getRandomIndex(RNG, Source.getSize())];
+      writeInputsImpl<Index + 1>(RNG, Offset, BufferPtrsTuple);
+    }
+  }
+
+  SeedTy BaseSeed;
+};
+} // namespace mathtest
+
+#endif // MATHTEST_RANDOMGENERATOR_HPP
diff --git a/offload/unittests/Conformance/include/mathtest/RandomState.hpp b/offload/unittests/Conformance/include/mathtest/RandomState.hpp
new file mode 100644
index 000000000000..322d53175236
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RandomState.hpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RandomState class, a fast and
+/// lightweight pseudo-random number generator.
+///
+/// The implementation is based on the xorshift* generator, seeded using the
+/// SplitMix64 generator for robust initialization. For more details on the
+/// algorithm, see: https://en.wikipedia.org/wiki/Xorshift
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANDOMSTATE_HPP
+#define MATHTEST_RANDOMSTATE_HPP
+
+#include <cstdint>
+
+struct SeedTy { // Strongly typed wrapper around a raw 64-bit seed.
+  uint64_t Value;
+};
+
+class [[nodiscard]] RandomState {
+  uint64_t State;
+
+  // SplitMix64 mixing step; never returns 0, a state next() could not leave.
+  [[nodiscard]] static constexpr uint64_t splitMix64(uint64_t X) noexcept {
+    X += 0x9E3779B97F4A7C15ULL;
+    X = (X ^ (X >> 30)) * 0xBF58476D1CE4E5B9ULL;
+    X = (X ^ (X >> 27)) * 0x94D049BB133111EBULL;
+    X ^= X >> 31;
+    return X != 0 ? X : 0x9E3779B97F4A7C15ULL;
+  }
+
+public:
+  explicit constexpr RandomState(SeedTy Seed) noexcept
+      : State(splitMix64(Seed.Value)) {}
+
+  // Advances the xorshift state and returns the scrambled (xorshift*) output.
+  inline uint64_t next() noexcept {
+    State ^= State >> 12;
+    State ^= State << 25;
+    State ^= State >> 27;
+    return State * 0x2545F4914F6CDD1DULL;
+  }
+};
+
+#endif // MATHTEST_RANDOMSTATE_HPP
diff --git a/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp
new file mode 100644
index 000000000000..5e1e1139aba9
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RangeBasedGenerator class, a base
+/// class for input generators that operate on a sequence of ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANGEBASEDGENERATOR_HPP
+#define MATHTEST_RANGEBASEDGENERATOR_HPP
+
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/InputGenerator.hpp"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Parallel.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace mathtest {
+
+template <typename Derived, typename... InTypes>
+class [[nodiscard]] RangeBasedGenerator : public InputGenerator<InTypes...> {
+public:
+  void reset() noexcept override { NextFlatIndex = 0; }
+
+  // Fills Buffers with the next batch; returns its size (0 once exhausted).
+  [[nodiscard]] std::size_t
+  fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override {
+    const std::array<std::size_t, NumInputs> Lengths = {Buffers.size()...};
+    const std::size_t Capacity = Lengths.front();
+    assert((Capacity != 0) && "Buffer size cannot be zero");
+    assert(std::all_of(Lengths.begin(), Lengths.end(),
+                       [&](std::size_t Len) { return Len == Capacity; }) &&
+           "All input buffers must have the same size");
+
+    if (NextFlatIndex >= Size)
+      return 0;
+
+    const uint64_t FirstFlatIndex = NextFlatIndex;
+    const uint64_t Count = std::min<uint64_t>(Capacity, Size - FirstFlatIndex);
+    NextFlatIndex = FirstFlatIndex + Count;
+
+    // Every slot is written independently by the derived class, in parallel.
+    auto BufferPtrsTuple = std::make_tuple(Buffers.data()...);
+    auto *Self = static_cast<Derived *>(this);
+    llvm::parallelFor(0, Count, [&](std::size_t Offset) {
+      Self->writeInputs(FirstFlatIndex, Offset, BufferPtrsTuple);
+    });
+    return static_cast<std::size_t>(Count);
+  }
+
+protected:
+  using RangesTupleType = std::tuple<IndexedRange<InTypes>...>;
+
+  static constexpr std::size_t NumInputs = sizeof...(InTypes);
+  static_assert(NumInputs > 0, "The number of inputs must be at least 1");
+
+  explicit constexpr RangeBasedGenerator(
+      const IndexedRange<InTypes> &...Ranges) noexcept
+      : RangesTuple(Ranges...) {}
+
+  explicit constexpr RangeBasedGenerator(
+      uint64_t Size, const IndexedRange<InTypes> &...Ranges) noexcept
+      : RangesTuple(Ranges...), Size(Size) {}
+
+  RangesTupleType RangesTuple;
+  uint64_t Size = 0; // Total number of inputs to produce.
+
+private:
+  uint64_t NextFlatIndex = 0; // Flat index where the next batch starts.
+};
+} // namespace mathtest
+
+#endif // MATHTEST_RANGEBASEDGENERATOR_HPP
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index a0068c3cb59c..6c3425f1e17c 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -55,13 +55,14 @@ static OffloadInitWrapper Wrapper{};
 
 [[nodiscard]] std::string getDeviceName(ol_device_handle_t DeviceHandle) {
   std::size_t PropSize = 0;
-  OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_NAME, &PropSize));
+  OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME,
+                               &PropSize));
 
   if (PropSize == 0)
     return "";
 
   std::string PropValue(PropSize, '\0');
-  OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_NAME, PropSize,
+  OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, PropSize,
                            PropValue.data()));
   PropValue.pop_back(); // Remove the null terminator
 
diff --git a/offload/unittests/Conformance/tests/AcosTest.cpp b/offload/unittests/Conformance/tests/AcosTest.cpp
new file mode 100644
index 000000000000..bc0d1d2b7280
--- /dev/null
+++ b/offload/unittests/Conformance/tests/AcosTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Disambiguate the overloaded 'acos' function to select the double version
+constexpr auto acosd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(acos);
+} // namespace
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acosd> {
+  static constexpr llvm::StringRef Name = "acos";
+  static constexpr llvm::StringRef KernelName = "acosKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the acos function");
+
+  using namespace mathtest;
+  // Inputs: 2^32 randomly drawn doubles from the range below, base seed 42.
+  uint64_t Seed = 42;
+  uint64_t Size = 1ULL << 32;
+  IndexedRange<double> Range(/*Begin=*/-1.0,
+                             /*End=*/1.0,
+                             /*Inclusive=*/true);
+  RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed = runTests<acosd>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Acosf16Test.cpp b/offload/unittests/Conformance/tests/Acosf16Test.cpp
new file mode 100644
index 000000000000..ce11cc2aa1e8
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acosf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acosf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acosf16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acosf16> {
+  static constexpr llvm::StringRef Name = "acosf16";
+  static constexpr llvm::StringRef KernelName = "acosf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the acosf16 function");
+
+  using namespace mathtest;
+  // Inputs come from ExhaustiveGenerator over the float16 range built below.
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed =
+      runTests<acosf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/AcosfTest.cpp b/offload/unittests/Conformance/tests/AcosfTest.cpp
index e69ee3b7d1fd..65b2d18d7728 100644
--- a/offload/unittests/Conformance/tests/AcosfTest.cpp
+++ b/offload/unittests/Conformance/tests/AcosfTest.cpp
@@ -40,7 +40,9 @@ int main(int argc, const char **argv) {
 
   using namespace mathtest;
 
-  IndexedRange<float> Range;
+  IndexedRange<float> Range(/*Begin=*/-1.0f,
+                            /*End=*/1.0f,
+                            /*Inclusive=*/true);
   ExhaustiveGenerator<float> Generator(Range);
 
   const auto Configs = cl::getTestConfigs();
diff --git a/offload/unittests/Conformance/tests/Acoshf16Test.cpp b/offload/unittests/Conformance/tests/Acoshf16Test.cpp
new file mode 100644
index 000000000000..80434477aa43
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acoshf16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acoshf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acoshf16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acoshf16> {
+  static constexpr llvm::StringRef Name = "acoshf16";
+  static constexpr llvm::StringRef KernelName = "acoshf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the acoshf16 function");
+
+  using namespace mathtest;
+  // Inputs come from ExhaustiveGenerator over the float16 range built below.
+  IndexedRange<float16> Range(/*Begin=*/float16(1.0),
+                              /*End=*/getMaxOrInf<float16>(),
+                              /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed =
+      runTests<acoshf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Acospif16Test.cpp b/offload/unittests/Conformance/tests/Acospif16Test.cpp
new file mode 100644
index 000000000000..c5871e27cafc
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acospif16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acospif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acospif16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acospif16> {
+  static constexpr llvm::StringRef Name = "acospif16";
+  static constexpr llvm::StringRef KernelName = "acospif16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the acospif16 function");
+
+  using namespace mathtest;
+  // Inputs come from ExhaustiveGenerator over the float16 range built below.
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed =
+      runTests<acospif16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/AsinTest.cpp b/offload/unittests/Conformance/tests/AsinTest.cpp
new file mode 100644
index 000000000000..aaaa37af02bc
--- /dev/null
+++ b/offload/unittests/Conformance/tests/AsinTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asin function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Disambiguate the overloaded 'asin' function to select the double version
+constexpr auto asind // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(asin);
+} // namespace
+
+namespace mathtest {
+
+template <> struct FunctionConfig<asind> {
+  static constexpr llvm::StringRef Name = "asin";
+  static constexpr llvm::StringRef KernelName = "asinKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the asin function");
+
+  using namespace mathtest;
+  // Inputs: 2^32 randomly drawn doubles from the range below, base seed 42.
+  uint64_t Seed = 42;
+  uint64_t Size = 1ULL << 32;
+  IndexedRange<double> Range(/*Begin=*/-1.0,
+                             /*End=*/1.0,
+                             /*Inclusive=*/true);
+  RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed = runTests<asind>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Asinf16Test.cpp b/offload/unittests/Conformance/tests/Asinf16Test.cpp
new file mode 100644
index 000000000000..5784d6bfe08e
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Asinf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asinf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 asinf16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<asinf16> {
+  static constexpr llvm::StringRef Name = "asinf16";
+  static constexpr llvm::StringRef KernelName = "asinf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the asinf16 function");
+
+  using namespace mathtest;
+  // Inputs come from ExhaustiveGenerator over the float16 range built below.
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed =
+      runTests<asinf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/AsinfTest.cpp b/offload/unittests/Conformance/tests/AsinfTest.cpp
index 991f79b111ef..aeee648b5fa0 100644
--- a/offload/unittests/Conformance/tests/AsinfTest.cpp
+++ b/offload/unittests/Conformance/tests/AsinfTest.cpp
@@ -40,7 +40,9 @@ int main(int argc, const char **argv) {
 
   using namespace mathtest;
 
-  IndexedRange<float> Range;
+  IndexedRange<float> Range(/*Begin=*/-1.0f,
+                            /*End=*/1.0f,
+                            /*Inclusive=*/true);
   ExhaustiveGenerator<float> Generator(Range);
 
   const auto Configs = cl::getTestConfigs();
diff --git a/offload/unittests/Conformance/tests/Asinhf16Test.cpp b/offload/unittests/Conformance/tests/Asinhf16Test.cpp
new file mode 100644
index 000000000000..0af9bcb06fef
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Asinhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asinhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 asinhf16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<asinhf16> {
+  static constexpr llvm::StringRef Name = "asinhf16";
+  static constexpr llvm::StringRef KernelName = "asinhf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the asinhf16 function");
+
+  using namespace mathtest;
+  // NOTE(review): default IndexedRange presumably spans all float16; confirm.
+  IndexedRange<float16> Range;
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed =
+      runTests<asinhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Atan2fTest.cpp b/offload/unittests/Conformance/tests/Atan2fTest.cpp
new file mode 100644
index 000000000000..4a46f9a61540
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atan2fTest.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atan2f function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace mathtest {
+
+template <> struct FunctionConfig<atan2f> {
+  static constexpr llvm::StringRef Name = "atan2f";
+  static constexpr llvm::StringRef KernelName = "atan2fKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 6;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the atan2f function");
+
+  using namespace mathtest;
+  // Inputs: 2^32 random float pairs from two default ranges, base seed 42.
+  uint64_t Seed = 42;
+  uint64_t Size = 1ULL << 32;
+  IndexedRange<float> RangeX;
+  IndexedRange<float> RangeY;
+  RandomGenerator<float, float> Generator(SeedTy{Seed}, Size, RangeX, RangeY);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+  // The process exit status mirrors the boolean returned by runTests.
+  bool Passed =
+      runTests<atan2f>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Atanf16Test.cpp b/offload/unittests/Conformance/tests/Atanf16Test.cpp
new file mode 100644
index 000000000000..3d3fa384e84d
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atanf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atanf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 atanf16(float16);
+
+namespace mathtest {
+// Conformance-test settings for atanf16: names and ULP tolerance.
+template <> struct FunctionConfig<atanf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"atanf16Kernel"};
+  static constexpr llvm::StringRef Name{"atanf16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the atanf16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<atanf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Atanhf16Test.cpp b/offload/unittests/Conformance/tests/Atanhf16Test.cpp
new file mode 100644
index 000000000000..86a0f82ce376
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atanhf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atanhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 atanhf16(float16);
+
+namespace mathtest {
+// Conformance-test settings for atanhf16: names and ULP tolerance.
+template <> struct FunctionConfig<atanhf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"atanhf16Kernel"};
+  static constexpr llvm::StringRef Name{"atanhf16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the atanhf16 function");
+  using namespace mathtest;
+
+  // Inputs come from the inclusive float16 range [-1, 1].
+  IndexedRange<float16> InputRange(/*Begin=*/float16(-1.0),
+                                   /*End=*/float16(1.0),
+                                   /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<atanhf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/CMakeLists.txt b/offload/unittests/Conformance/tests/CMakeLists.txt
index 8c0109ba62ce..ad94df8e7978 100644
--- a/offload/unittests/Conformance/tests/CMakeLists.txt
+++ b/offload/unittests/Conformance/tests/CMakeLists.txt
@@ -3,30 +3,72 @@ if(NOT TARGET libc)
     return()
 endif()
 
+add_conformance_test(acos AcosTest.cpp) # <name> <test source>; NOTE(review): <name> appears to be the function under test - confirm
 add_conformance_test(acosf AcosfTest.cpp)
+add_conformance_test(acosf16 Acosf16Test.cpp)
 add_conformance_test(acoshf AcoshfTest.cpp)
+add_conformance_test(acoshf16 Acoshf16Test.cpp)
+add_conformance_test(acospif16 Acospif16Test.cpp)
+add_conformance_test(asin AsinTest.cpp)
 add_conformance_test(asinf AsinfTest.cpp)
+add_conformance_test(asinf16 Asinf16Test.cpp)
 add_conformance_test(asinhf AsinhfTest.cpp)
+add_conformance_test(asinhf16 Asinhf16Test.cpp)
 add_conformance_test(atanf AtanfTest.cpp)
+add_conformance_test(atanf16 Atanf16Test.cpp)
+add_conformance_test(atan2f Atan2fTest.cpp)
 add_conformance_test(atanhf AtanhfTest.cpp)
+add_conformance_test(atanhf16 Atanhf16Test.cpp)
+add_conformance_test(cbrt CbrtTest.cpp)
 add_conformance_test(cbrtf CbrtfTest.cpp)
+add_conformance_test(cos CosTest.cpp)
 add_conformance_test(cosf CosfTest.cpp)
+add_conformance_test(cosf16 Cosf16Test.cpp)
 add_conformance_test(coshf CoshfTest.cpp)
+add_conformance_test(coshf16 Coshf16Test.cpp)
 add_conformance_test(cospif CospifTest.cpp)
+add_conformance_test(cospif16 Cospif16Test.cpp)
 add_conformance_test(erff ErffTest.cpp)
+add_conformance_test(exp ExpTest.cpp)
 add_conformance_test(expf ExpfTest.cpp)
+add_conformance_test(expf16 Expf16Test.cpp)
+add_conformance_test(exp10 Exp10Test.cpp)
 add_conformance_test(exp10f Exp10fTest.cpp)
+add_conformance_test(exp10f16 Exp10f16Test.cpp)
+add_conformance_test(exp2 Exp2Test.cpp)
 add_conformance_test(exp2f Exp2fTest.cpp)
+add_conformance_test(exp2f16 Exp2f16Test.cpp)
+add_conformance_test(expm1 Expm1Test.cpp)
 add_conformance_test(expm1f Expm1fTest.cpp)
+add_conformance_test(expm1f16 Expm1f16Test.cpp)
+add_conformance_test(hypot HypotTest.cpp)
+add_conformance_test(hypotf HypotfTest.cpp)
 add_conformance_test(hypotf16 Hypotf16Test.cpp)
+add_conformance_test(log LogTest.cpp)
 add_conformance_test(logf LogfTest.cpp)
+add_conformance_test(logf16 Logf16Test.cpp)
+add_conformance_test(log10 Log10Test.cpp)
 add_conformance_test(log10f Log10fTest.cpp)
+add_conformance_test(log10f16 Log10f16Test.cpp)
+add_conformance_test(log1p Log1pTest.cpp)
 add_conformance_test(log1pf Log1pfTest.cpp)
+add_conformance_test(log2 Log2Test.cpp)
 add_conformance_test(log2f Log2fTest.cpp)
+add_conformance_test(log2f16 Log2f16Test.cpp)
+add_conformance_test(powf PowfTest.cpp)
+add_conformance_test(sin SinTest.cpp)
 add_conformance_test(sinf SinfTest.cpp)
+add_conformance_test(sinf16 Sinf16Test.cpp)
+add_conformance_test(sincos SincosTest.cpp)
 add_conformance_test(sincosf SincosfTest.cpp)
 add_conformance_test(sinhf SinhfTest.cpp)
+add_conformance_test(sinhf16 Sinhf16Test.cpp)
 add_conformance_test(sinpif SinpifTest.cpp)
+add_conformance_test(sinpif16 Sinpif16Test.cpp)
+add_conformance_test(tan TanTest.cpp)
 add_conformance_test(tanf TanfTest.cpp)
+add_conformance_test(tanf16 Tanf16Test.cpp)
 add_conformance_test(tanhf TanhfTest.cpp)
+add_conformance_test(tanhf16 Tanhf16Test.cpp)
 add_conformance_test(tanpif TanpifTest.cpp)
+add_conformance_test(tanpif16 Tanpif16Test.cpp)
diff --git a/offload/unittests/Conformance/tests/CbrtTest.cpp b/offload/unittests/Conformance/tests/CbrtTest.cpp
new file mode 100644
index 000000000000..3a6523b66ad8
--- /dev/null
+++ b/offload/unittests/Conformance/tests/CbrtTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cbrt function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// 'cbrt' is overloaded: bind its double overload to a distinct name so that
+// it can be used as a template argument (e.g. for FunctionConfig).
+using CbrtFn = double (*)(double);
+constexpr CbrtFn cbrtd = cbrt; // NOLINT(readability-identifier-naming)
+} // namespace
+
+namespace mathtest {
+// Conformance-test settings for cbrt (double): names and ULP tolerance.
+template <> struct FunctionConfig<cbrtd> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"cbrtKernel"};
+  static constexpr llvm::StringRef Name{"cbrt"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cbrt function");
+  using namespace mathtest;
+
+  // Seed, size and default-constructed range for the random input generator.
+  uint64_t SeedValue = 42;
+  uint64_t SizeValue = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{SeedValue}, SizeValue, InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool Ok = runTests<cbrtd>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return Ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/CosTest.cpp b/offload/unittests/Conformance/tests/CosTest.cpp
new file mode 100644
index 000000000000..e3d3d3da8180
--- /dev/null
+++ b/offload/unittests/Conformance/tests/CosTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// 'cos' is overloaded: bind its double overload to a distinct name so that
+// it can be used as a template argument (e.g. for FunctionConfig).
+using CosFn = double (*)(double);
+constexpr CosFn cosd = cos; // NOLINT(readability-identifier-naming)
+} // namespace
+
+namespace mathtest {
+// Conformance-test settings for cos (double): names and ULP tolerance.
+template <> struct FunctionConfig<cosd> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+
+  static constexpr llvm::StringRef KernelName{"cosKernel"};
+  static constexpr llvm::StringRef Name{"cos"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cos function");
+  using namespace mathtest;
+
+  // Seed, size and default-constructed range for the random input generator.
+  uint64_t SeedValue = 42;
+  uint64_t SizeValue = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{SeedValue}, SizeValue, InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool Ok = runTests<cosd>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return Ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Cosf16Test.cpp b/offload/unittests/Conformance/tests/Cosf16Test.cpp
new file mode 100644
index 000000000000..680e4b99c549
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Cosf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cosf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 cosf16(float16);
+
+namespace mathtest {
+// Conformance-test settings for cosf16: names and ULP tolerance.
+template <> struct FunctionConfig<cosf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"cosf16Kernel"};
+  static constexpr llvm::StringRef Name{"cosf16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cosf16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<cosf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Coshf16Test.cpp b/offload/unittests/Conformance/tests/Coshf16Test.cpp
new file mode 100644
index 000000000000..1b378b5a9401
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Coshf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the coshf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 coshf16(float16);
+
+namespace mathtest {
+// Conformance-test settings for coshf16: names and ULP tolerance.
+template <> struct FunctionConfig<coshf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"coshf16Kernel"};
+  static constexpr llvm::StringRef Name{"coshf16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the coshf16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<coshf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Cospif16Test.cpp b/offload/unittests/Conformance/tests/Cospif16Test.cpp
new file mode 100644
index 000000000000..84aa682b4884
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Cospif16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cospif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 cospif16(float16);
+
+namespace mathtest {
+// Conformance-test settings for cospif16: names and ULP tolerance.
+template <> struct FunctionConfig<cospif16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"cospif16Kernel"};
+  static constexpr llvm::StringRef Name{"cospif16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the cospif16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<cospif16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Exp10Test.cpp b/offload/unittests/Conformance/tests/Exp10Test.cpp
new file mode 100644
index 000000000000..05af4780213b
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp10Test.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp10 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// 'exp10' is overloaded: bind its double overload to a distinct name so that
+// it can be used as a template argument (e.g. for FunctionConfig).
+using Exp10Fn = double (*)(double);
+constexpr Exp10Fn exp10d = exp10; // NOLINT(readability-identifier-naming)
+} // namespace
+
+namespace mathtest {
+// Conformance-test settings for exp10 (double): names and ULP tolerance.
+template <> struct FunctionConfig<exp10d> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+
+  static constexpr llvm::StringRef KernelName{"exp10Kernel"};
+  static constexpr llvm::StringRef Name{"exp10"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp10 function");
+  using namespace mathtest;
+
+  // Seed, size and default-constructed range for the random input generator.
+  uint64_t SeedValue = 42;
+  uint64_t SizeValue = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{SeedValue}, SizeValue, InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<exp10d>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Exp10f16Test.cpp b/offload/unittests/Conformance/tests/Exp10f16Test.cpp
new file mode 100644
index 000000000000..7d61ad0c6aef
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp10f16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp10f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 exp10f16(float16);
+
+namespace mathtest {
+// Conformance-test settings for exp10f16: names and ULP tolerance.
+template <> struct FunctionConfig<exp10f16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"exp10f16Kernel"};
+  static constexpr llvm::StringRef Name{"exp10f16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the exp10f16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<exp10f16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Exp2Test.cpp b/offload/unittests/Conformance/tests/Exp2Test.cpp
new file mode 100644
index 000000000000..bb2fa10a0dfa
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp2Test.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp2 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// 'exp2' is overloaded: bind its double overload to a distinct name so that
+// it can be used as a template argument (e.g. for FunctionConfig).
+using Exp2Fn = double (*)(double);
+constexpr Exp2Fn exp2d = exp2; // NOLINT(readability-identifier-naming)
+} // namespace
+
+namespace mathtest {
+// Conformance-test settings for exp2 (double): names and ULP tolerance.
+template <> struct FunctionConfig<exp2d> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+
+  static constexpr llvm::StringRef KernelName{"exp2Kernel"};
+  static constexpr llvm::StringRef Name{"exp2"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp2 function");
+  using namespace mathtest;
+
+  // Seed, size and default-constructed range for the random input generator.
+  uint64_t SeedValue = 42;
+  uint64_t SizeValue = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{SeedValue}, SizeValue, InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool Ok = runTests<exp2d>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return Ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Exp2f16Test.cpp b/offload/unittests/Conformance/tests/Exp2f16Test.cpp
new file mode 100644
index 000000000000..9ea92564e738
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp2f16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp2f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 exp2f16(float16);
+
+namespace mathtest {
+// Conformance-test settings for exp2f16: names and ULP tolerance.
+template <> struct FunctionConfig<exp2f16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"exp2f16Kernel"};
+  static constexpr llvm::StringRef Name{"exp2f16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp2f16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<exp2f16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/ExpTest.cpp b/offload/unittests/Conformance/tests/ExpTest.cpp
new file mode 100644
index 000000000000..9aa52b17905e
--- /dev/null
+++ b/offload/unittests/Conformance/tests/ExpTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// 'exp' is overloaded: bind its double overload to a distinct name so that
+// it can be used as a template argument (e.g. for FunctionConfig).
+using ExpFn = double (*)(double);
+constexpr ExpFn expd = exp; // NOLINT(readability-identifier-naming)
+} // namespace
+
+namespace mathtest {
+// Conformance-test settings for exp (double): names and ULP tolerance.
+template <> struct FunctionConfig<expd> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+
+  static constexpr llvm::StringRef KernelName{"expKernel"};
+  static constexpr llvm::StringRef Name{"exp"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp function");
+  using namespace mathtest;
+
+  // Seed, size and default-constructed range for the random input generator.
+  uint64_t SeedValue = 42;
+  uint64_t SizeValue = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{SeedValue}, SizeValue, InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool Ok = runTests<expd>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return Ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Expf16Test.cpp b/offload/unittests/Conformance/tests/Expf16Test.cpp
new file mode 100644
index 000000000000..8938815e26a8
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Expf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the expf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 expf16(float16);
+
+namespace mathtest {
+// Conformance-test settings for expf16: names and ULP tolerance.
+template <> struct FunctionConfig<expf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName{"expf16Kernel"};
+  static constexpr llvm::StringRef Name{"expf16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the expf16 function");
+  using namespace mathtest;
+
+  // Inputs come from a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  // Collect the run settings and hand them to the test runner.
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+  const bool AllPassed =
+      runTests<expf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+
+  return AllPassed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Expm1Test.cpp b/offload/unittests/Conformance/tests/Expm1Test.cpp
new file mode 100644
index 000000000000..a27944bf722f
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Expm1Test.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the expm1 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'expm1' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*expm1d)(double) = expm1;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'expm1' (double overload).
+template <> struct FunctionConfig<expm1d> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{3};
+  static constexpr llvm::StringRef KernelName{"expm1Kernel"};
+  static constexpr llvm::StringRef Name{"expm1"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the expm1 function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, default-constructed range.
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  IndexedRange<double> Domain;
+  RandomGenerator<double> Inputs(SeedTy{Seed}, Count, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<expm1d>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Expm1f16Test.cpp b/offload/unittests/Conformance/tests/Expm1f16Test.cpp
new file mode 100644
index 000000000000..447196bb8ea3
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Expm1f16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the expm1f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 expm1f16(float16);
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'expm1f16'.
+template <> struct FunctionConfig<expm1f16> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef KernelName{"expm1f16Kernel"};
+  static constexpr llvm::StringRef Name{"expm1f16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the expm1f16 function");
+
+  using namespace mathtest;
+
+  // Exhaustive generator over a default-constructed 'float16' range.
+  IndexedRange<float16> Domain;
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<expm1f16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/HypotTest.cpp b/offload/unittests/Conformance/tests/HypotTest.cpp
new file mode 100644
index 000000000000..0417ad901d5e
--- /dev/null
+++ b/offload/unittests/Conformance/tests/HypotTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the hypot function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'hypot' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*hypotd)(double, double) = hypot;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'hypot' (double overload).
+template <> struct FunctionConfig<hypotd> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{4};
+  static constexpr llvm::StringRef KernelName{"hypotKernel"};
+  static constexpr llvm::StringRef Name{"hypot"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the hypot function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, two default-constructed ranges.
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  IndexedRange<double> XRange;
+  IndexedRange<double> YRange;
+  RandomGenerator<double, double> Inputs(SeedTy{Seed}, Count, XRange, YRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<hypotd>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/HypotfTest.cpp b/offload/unittests/Conformance/tests/HypotfTest.cpp
new file mode 100644
index 000000000000..98a4e906920d
--- /dev/null
+++ b/offload/unittests/Conformance/tests/HypotfTest.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the hypotf function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'hypotf'.
+template <> struct FunctionConfig<hypotf> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{4};
+  static constexpr llvm::StringRef KernelName{"hypotfKernel"};
+  static constexpr llvm::StringRef Name{"hypotf"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the hypotf function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, two default-constructed ranges.
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  IndexedRange<float> XRange;
+  IndexedRange<float> YRange;
+  RandomGenerator<float, float> Inputs(SeedTy{Seed}, Count, XRange, YRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<hypotf>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log10Test.cpp b/offload/unittests/Conformance/tests/Log10Test.cpp
new file mode 100644
index 000000000000..bf46f11e960b
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log10Test.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log10 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'log10' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*log10d)(double) = log10;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'log10' (double overload).
+template <> struct FunctionConfig<log10d> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{3};
+  static constexpr llvm::StringRef KernelName{"log10Kernel"};
+  static constexpr llvm::StringRef Name{"log10"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log10 function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, range from 0.0 to +infinity
+  // (constructed with Inclusive=true).
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  const double Inf = std::numeric_limits<double>::infinity();
+  IndexedRange<double> Domain(/*Begin=*/0.0, /*End=*/Inf, /*Inclusive=*/true);
+  RandomGenerator<double> Inputs(SeedTy{Seed}, Count, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log10d>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log10f16Test.cpp b/offload/unittests/Conformance/tests/Log10f16Test.cpp
new file mode 100644
index 000000000000..605e1ae49077
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log10f16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log10f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 log10f16(float16);
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'log10f16'.
+template <> struct FunctionConfig<log10f16> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef KernelName{"log10f16Kernel"};
+  static constexpr llvm::StringRef Name{"log10f16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the log10f16 function");
+
+  using namespace mathtest;
+
+  // Exhaustive generator over the range 0.0 to getMaxOrInf<float16>().
+  const auto Lo = float16(0.0);
+  const auto Hi = getMaxOrInf<float16>();
+  IndexedRange<float16> Domain(Lo, Hi, /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log10f16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log1pTest.cpp b/offload/unittests/Conformance/tests/Log1pTest.cpp
new file mode 100644
index 000000000000..023b67e770de
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log1pTest.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log1p function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'log1p' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*log1pd)(double) = log1p;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'log1p' (double overload).
+template <> struct FunctionConfig<log1pd> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef KernelName{"log1pKernel"};
+  static constexpr llvm::StringRef Name{"log1p"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log1p function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, range from -1.0 to +infinity
+  // (constructed with Inclusive=true).
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  const double Inf = std::numeric_limits<double>::infinity();
+  IndexedRange<double> Domain(/*Begin=*/-1.0, /*End=*/Inf, /*Inclusive=*/true);
+  RandomGenerator<double> Inputs(SeedTy{Seed}, Count, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log1pd>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log2Test.cpp b/offload/unittests/Conformance/tests/Log2Test.cpp
new file mode 100644
index 000000000000..2ae7e5c23292
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log2Test.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log2 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'log2' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*log2d)(double) = log2;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'log2' (double overload).
+template <> struct FunctionConfig<log2d> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{3};
+  static constexpr llvm::StringRef KernelName{"log2Kernel"};
+  static constexpr llvm::StringRef Name{"log2"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log2 function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, range 0.0 to +infinity.
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  const double Inf = std::numeric_limits<double>::infinity();
+  IndexedRange<double> Domain(/*Begin=*/0.0, /*End=*/Inf, /*Inclusive=*/true);
+  RandomGenerator<double> Inputs(SeedTy{Seed}, Count, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log2d>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log2f16Test.cpp b/offload/unittests/Conformance/tests/Log2f16Test.cpp
new file mode 100644
index 000000000000..5ce46960774a
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log2f16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log2f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 log2f16(float16);
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'log2f16'.
+template <> struct FunctionConfig<log2f16> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef KernelName{"log2f16Kernel"};
+  static constexpr llvm::StringRef Name{"log2f16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log2f16 function");
+
+  using namespace mathtest;
+
+  // Exhaustive generator over the range 0.0 to getMaxOrInf<float16>().
+  const auto Lo = float16(0.0);
+  const auto Hi = getMaxOrInf<float16>();
+  IndexedRange<float16> Domain(Lo, Hi, /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log2f16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/LogTest.cpp b/offload/unittests/Conformance/tests/LogTest.cpp
new file mode 100644
index 000000000000..ae568e2c4740
--- /dev/null
+++ b/offload/unittests/Conformance/tests/LogTest.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'log' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*logd)(double) = log;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'log' (double overload).
+template <> struct FunctionConfig<logd> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{3};
+  static constexpr llvm::StringRef KernelName{"logKernel"};
+  static constexpr llvm::StringRef Name{"log"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, range 0.0 to +infinity.
+  uint64_t Seed{42};
+  uint64_t Count{uint64_t{1} << 32};
+  const double Inf = std::numeric_limits<double>::infinity();
+  IndexedRange<double> Domain(/*Begin=*/0.0, /*End=*/Inf, /*Inclusive=*/true);
+  RandomGenerator<double> Inputs(SeedTy{Seed}, Count, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<logd>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Logf16Test.cpp b/offload/unittests/Conformance/tests/Logf16Test.cpp
new file mode 100644
index 000000000000..372dccb2ebb9
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Logf16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the logf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 logf16(float16);
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'logf16'.
+template <> struct FunctionConfig<logf16> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef KernelName{"logf16Kernel"};
+  static constexpr llvm::StringRef Name{"logf16"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the logf16 function");
+
+  using namespace mathtest;
+
+  // Exhaustive generator over the range 0.0 to getMaxOrInf<float16>().
+  const auto Lo = float16(0.0);
+  const auto Hi = getMaxOrInf<float16>();
+  IndexedRange<float16> Domain(Lo, Hi, /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<logf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/PowfTest.cpp b/offload/unittests/Conformance/tests/PowfTest.cpp
new file mode 100644
index 000000000000..246801e390ae
--- /dev/null
+++ b/offload/unittests/Conformance/tests/PowfTest.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the powf function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return ::powf(Base, ::roundf(Exponent)); // roundf is applied to Exponent first
+}
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'powf' called directly.
+template <> struct FunctionConfig<powf> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{16};
+  static constexpr llvm::StringRef KernelName{"powfKernel"};
+  static constexpr llvm::StringRef Name{"powf (real exponents)"};
+};
+
+/// Configuration of the conformance test for 'powfRoundedExponent'.
+template <> struct FunctionConfig<powfRoundedExponent> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{16};
+  static constexpr llvm::StringRef KernelName{"powfRoundedExponentKernel"};
+  static constexpr llvm::StringRef Name{"powf (integer exponents)"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the powf function");
+
+  using namespace mathtest;
+
+  // Two random generators (seeds 42 and 51) built over the same two ranges.
+  uint64_t Count{uint64_t{1} << 32};
+  IndexedRange<float> XRange;
+  IndexedRange<float> YRange;
+  RandomGenerator<float, float> RealGen(SeedTy{42}, Count, XRange, YRange);
+  RandomGenerator<float, float> IntGen(SeedTy{51}, Count, XRange, YRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Both suites always run; the second is not skipped if the first fails.
+  const bool RealOk = runTests<powf>(RealGen, TestConfigs, BinaryDir, Verbose);
+  const bool IntOk =
+      runTests<powfRoundedExponent>(IntGen, TestConfigs, BinaryDir, Verbose);
+
+  return (RealOk && IntOk) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/SinTest.cpp b/offload/unittests/Conformance/tests/SinTest.cpp
new file mode 100644
index 000000000000..36897d74c96a
--- /dev/null
+++ b/offload/unittests/Conformance/tests/SinTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sin function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Pick the 'double' overload of 'sin' through the pointer's target type.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*sind)(double) = sin;
+} // namespace
+
+namespace mathtest {
+
+/// Configuration of the conformance test for 'sin' (double overload).
+template <> struct FunctionConfig<sind> {
+  // ULP bound source: The Khronos Group, The OpenCL C Specification v3.0.19,
+  // Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{4};
+  static constexpr llvm::StringRef KernelName{"sinKernel"};
+  static constexpr llvm::StringRef Name{"sin"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sin function");
+
+  using namespace mathtest;
+
+  // Random generator: seed 42, size 2^32, default-constructed range.
+  uint64_t Count{uint64_t{1} << 32};
+  IndexedRange<double> Domain;
+  RandomGenerator<double> Inputs(SeedTy{42}, Count, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<sind>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/SincosTest.cpp b/offload/unittests/Conformance/tests/SincosTest.cpp
new file mode 100644
index 000000000000..a3d1650c54e4
--- /dev/null
+++ b/offload/unittests/Conformance/tests/SincosTest.cpp
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sincos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+/// Returns the sine output of sincos(X); the cosine output is discarded.
+static inline double sincosSin(double X) {
+  double Sine;
+  double UnusedCosine;
+  sincos(X, &Sine, &UnusedCosine);
+  return Sine;
+}
+
+/// Returns the cosine output of sincos(X); the sine output is discarded.
+static inline double sincosCos(double X) {
+  double UnusedSine;
+  double Cosine;
+  sincos(X, &UnusedSine, &Cosine);
+  return Cosine;
+}
+
+namespace mathtest {
+
+/// Settings for the sine half of the 'sincos' conformance test.
+template <> struct FunctionConfig<sincosSin> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+
+  static constexpr llvm::StringRef KernelName = "sincosSinKernel";
+  static constexpr llvm::StringRef Name = "sincos (sin part)";
+};
+
+/// Settings for the cosine half of the 'sincos' conformance test.
+template <> struct FunctionConfig<sincosCos> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+
+  static constexpr llvm::StringRef KernelName = "sincosCosKernel";
+  static constexpr llvm::StringRef Name = "sincos (cos part)";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sincos function");
+
+  // Seeded random generator over a default-constructed double range.
+  uint64_t BaseSeed = 42;
+  uint64_t SampleCount = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGenerator(SeedTy{BaseSeed}, SampleCount,
+                                         InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Both halves are always run; the results are only combined afterwards.
+  const bool SinOk =
+      runTests<sincosSin>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  const bool CosOk =
+      runTests<sincosCos>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+
+  if (SinOk && CosOk)
+    return EXIT_SUCCESS;
+
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Sinf16Test.cpp b/offload/unittests/Conformance/tests/Sinf16Test.cpp
new file mode 100644
index 000000000000..4c5fb2226288
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Sinf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sinf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 sinf16(float16);
+
+namespace mathtest {
+
+/// Settings for the 'sinf16' conformance test.
+template <> struct FunctionConfig<sinf16> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 69 (Full Profile), Khronos Registry
+  // [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName = "sinf16Kernel";
+  static constexpr llvm::StringRef Name = "sinf16";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sinf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGenerator(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<sinf16>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Sinhf16Test.cpp b/offload/unittests/Conformance/tests/Sinhf16Test.cpp
new file mode 100644
index 000000000000..fe6f7dd4a49c
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Sinhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sinhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 sinhf16(float16);
+
+namespace mathtest {
+
+/// Settings for the 'sinhf16' conformance test.
+template <> struct FunctionConfig<sinhf16> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 69 (Full Profile), Khronos Registry
+  // [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName = "sinhf16Kernel";
+  static constexpr llvm::StringRef Name = "sinhf16";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sinhf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGenerator(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<sinhf16>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Sinpif16Test.cpp b/offload/unittests/Conformance/tests/Sinpif16Test.cpp
new file mode 100644
index 000000000000..ff9c93c0bb5b
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Sinpif16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sinpif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 sinpif16(float16);
+
+namespace mathtest {
+
+/// Settings for the 'sinpif16' conformance test.
+template <> struct FunctionConfig<sinpif16> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 69 (Full Profile), Khronos Registry
+  // [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName = "sinpif16Kernel";
+  static constexpr llvm::StringRef Name = "sinpif16";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the sinpif16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGenerator(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<sinpif16>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/TanTest.cpp b/offload/unittests/Conformance/tests/TanTest.cpp
new file mode 100644
index 000000000000..3a9a05874450
--- /dev/null
+++ b/offload/unittests/Conformance/tests/TanTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tan function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'tan' names an overload set; giving the pointer an explicit type binds the
+// double(double) overload.
+using UnaryDoubleFn = double (*)(double);
+constexpr UnaryDoubleFn tand = tan; // NOLINT(readability-identifier-naming)
+} // namespace
+
+namespace mathtest {
+
+/// Settings for the double-precision 'tan' conformance test.
+template <> struct FunctionConfig<tand> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 5;
+
+  static constexpr llvm::StringRef KernelName = "tanKernel";
+  static constexpr llvm::StringRef Name = "tan";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the tan function");
+
+  // Seeded random generator over a default-constructed double range.
+  uint64_t BaseSeed = 42;
+  uint64_t SampleCount = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGenerator(SeedTy{BaseSeed}, SampleCount,
+                                         InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<tand>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Tanf16Test.cpp b/offload/unittests/Conformance/tests/Tanf16Test.cpp
new file mode 100644
index 000000000000..eae9818830a2
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Tanf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tanf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 tanf16(float16);
+
+namespace mathtest {
+
+/// Settings for the 'tanf16' conformance test.
+template <> struct FunctionConfig<tanf16> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 69 (Full Profile), Khronos Registry
+  // [July 10, 2025].
+  // Note: the minimum accuracy at the source is 2.5 ULP; it is rounded down
+  // here to ensure conformance.
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName = "tanf16Kernel";
+  static constexpr llvm::StringRef Name = "tanf16";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the tanf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGenerator(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<tanf16>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Tanhf16Test.cpp b/offload/unittests/Conformance/tests/Tanhf16Test.cpp
new file mode 100644
index 000000000000..1a11f3da7f09
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Tanhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tanhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 tanhf16(float16);
+
+namespace mathtest {
+
+/// Settings for the 'tanhf16' conformance test.
+template <> struct FunctionConfig<tanhf16> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 69 (Full Profile), Khronos Registry
+  // [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName = "tanhf16Kernel";
+  static constexpr llvm::StringRef Name = "tanhf16";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the tanhf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGenerator(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<tanhf16>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Tanpif16Test.cpp b/offload/unittests/Conformance/tests/Tanpif16Test.cpp
new file mode 100644
index 000000000000..76374807b92f
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Tanpif16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tanpif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 tanpif16(float16);
+
+namespace mathtest {
+
+/// Settings for the 'tanpif16' conformance test.
+template <> struct FunctionConfig<tanpif16> {
+  // ULP tolerance. Source: The Khronos Group, The OpenCL C Specification
+  // v3.0.19, Sec. 7.4, Table 69 (Full Profile), Khronos Registry
+  // [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+
+  static constexpr llvm::StringRef KernelName = "tanpif16Kernel";
+  static constexpr llvm::StringRef Name = "tanpif16";
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the tanpif16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGenerator(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool AllPassed =
+      runTests<tanpif16>(InputGenerator, TestConfigs, BinaryDir, Verbose);
+  if (!AllPassed)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 8f0267eb39bd..50c99a5d5b63 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -20,12 +20,16 @@ add_offload_unittest("init"
 target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
 
 add_offload_unittest("kernel"
+    kernel/olCalculateOptimalOccupancy.cpp
     kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"
     memory/olMemAlloc.cpp
+    memory/olMemFill.cpp
     memory/olMemFree.cpp
-    memory/olMemcpy.cpp)
+    memory/olMemcpy.cpp
+    memory/olGetMemInfo.cpp
+    memory/olGetMemInfoSize.cpp)
 
 add_offload_unittest("platform"
     platform/olGetPlatformInfo.cpp
@@ -33,6 +37,7 @@ add_offload_unittest("platform"
 
 add_offload_unittest("program"
     program/olCreateProgram.cpp
+    program/olIsValidBinary.cpp
     program/olDestroyProgram.cpp)
 
 add_offload_unittest("queue"
@@ -41,7 +46,8 @@ add_offload_unittest("queue"
     queue/olDestroyQueue.cpp
     queue/olGetQueueInfo.cpp
     queue/olGetQueueInfoSize.cpp
-    queue/olWaitEvents.cpp)
+    queue/olWaitEvents.cpp
+    queue/olLaunchHostFunction.cpp)
 
 add_offload_unittest("symbol"
     symbol/olGetSymbol.cpp
diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp
index ef092cd4187d..c9da6ef9be7c 100644
--- a/offload/unittests/OffloadAPI/common/Environment.cpp
+++ b/offload/unittests/OffloadAPI/common/Environment.cpp
@@ -41,9 +41,9 @@ raw_ostream &operator<<(raw_ostream &Out,
 
 raw_ostream &operator<<(raw_ostream &Out, const ol_device_handle_t &Device) {
-  size_t Size;
-  olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size);
-  std::vector<char> Name(Size);
-  olGetDeviceInfo(Device, OL_DEVICE_INFO_NAME, Size, Name.data());
+  // Prints the device's product name. Both queries return a non-null result
+  // on failure; in that case a placeholder is printed instead of reading an
+  // uninitialized size or streaming a buffer that was never filled.
+  size_t Size = 0;
+  if (olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size) ||
+      Size == 0)
+    return Out << "<unknown device>";
+  std::vector<char> Name(Size);
+  if (olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()))
+    return Out << "<unknown device>";
   Out << Name.data();
   return Out;
 }
@@ -129,6 +129,9 @@ const std::vector<TestEnvironment::Device> &TestEnvironment::getDevices() {
     }
   }
 
+  if (Devices.size() == 0)
+    errs() << "Warning: No devices found for OffloadAPI tests.\n";
+
   return Devices;
 }
 
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 43240fa3c4a0..6f9961e2c6d5 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -26,12 +26,30 @@
   } while (0)
 #endif
 
-// TODO: rework this so the EXPECTED/ACTUAL results are readable
+// Evaluates ACTUAL once. A result whose code is OL_ERRC_UNSUPPORTED skips the
+// current test; any other non-null result with a non-success code fails it.
+#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
+#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL)                                  \
+  do {                                                                         \
+    ol_result_t Res = ACTUAL;                                                  \
+    if (Res) {                                                                 \
+      if (Res->Code == OL_ERRC_UNSUPPORTED) {                                  \
+        GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test";        \
+      } else if (Res->Code != OL_ERRC_SUCCESS) {                               \
+        GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": "              \
+                     << Res->Details;                                          \
+      }                                                                        \
+    }                                                                          \
+  } while (0)
+#endif
+
 #ifndef ASSERT_ERROR
 #define ASSERT_ERROR(EXPECTED, ACTUAL)                                         \
   do {                                                                         \
     ol_result_t Res = ACTUAL;                                                  \
-    ASSERT_TRUE(Res && (Res->Code == EXPECTED));                               \
+    if (!Res)                                                                  \
+      GTEST_FAIL() << #ACTUAL " succeeded when we expected it to fail";        \
+    if (Res->Code != EXPECTED)                                                 \
+      GTEST_FAIL() << #ACTUAL " was expected to return "                       \
+                   << #EXPECTED " but instead returned " << Res->Code << ": "  \
+                   << Res->Details;                                            \
   } while (0)
 #endif
 
@@ -75,6 +93,40 @@ template <typename Fn> inline void threadify(Fn body) {
   }
 }
 
+/// Enqueues a host-function task on a queue that blocks until `trigger` is
+/// called, or until its one-second wait times out.
+struct ManuallyTriggeredTask {
+  std::mutex M;
+  std::condition_variable CV;
+  // Guarded by M: written by trigger(), read by the predicate in wait().
+  bool Flag = false;
+  // Created by enqueue(). Initialized so that a failed enqueue() does not
+  // leave an indeterminate handle behind.
+  ol_event_handle_t CompleteEvent = nullptr;
+
+  /// Launches the blocking host function on \p Queue, then creates
+  /// CompleteEvent on the same queue. Returns the first error encountered.
+  ol_result_t enqueue(ol_queue_handle_t Queue) {
+    if (auto Err = olLaunchHostFunction(
+            Queue,
+            [](void *That) {
+              static_cast<ManuallyTriggeredTask *>(That)->wait();
+            },
+            this))
+      return Err;
+
+    return olCreateEvent(Queue, &CompleteEvent);
+  }
+
+  /// Body of the enqueued host function: waits up to one second for Flag to
+  /// become true and records a test failure if it did not.
+  void wait() {
+    std::unique_lock<std::mutex> lk(M);
+    CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; });
+    EXPECT_TRUE(Flag);
+  }
+
+  /// Releases the blocked host function, then syncs on CompleteEvent and
+  /// returns that result.
+  ol_result_t trigger() {
+    {
+      // Write the flag while holding the mutex so the store cannot race with
+      // the predicate read in wait().
+      std::lock_guard<std::mutex> Lock(M);
+      Flag = true;
+    }
+    CV.notify_one();
+
+    return olSyncEvent(CompleteEvent);
+  }
+};
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
@@ -202,9 +254,13 @@ struct OffloadEventTest : OffloadQueueTest {
   ol_event_handle_t Event = nullptr;
 };
 
+// Instantiates FIXTURE for each entry of TestEnvironment::getDevices(). That
+// list is empty when no device is available, so uninstantiated suites are
+// allowed: the device tests then do not run instead of raising an error.
 #define OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(FIXTURE)                      \
   INSTANTIATE_TEST_SUITE_P(                                                    \
       , FIXTURE, ::testing::ValuesIn(TestEnvironment::getDevices()),           \
       [](const ::testing::TestParamInfo<TestEnvironment::Device> &info) {      \
         return SanitizeString(info.param.Name);                                \
-      })
+      });                                                                      \
+  GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FIXTURE)
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 5657320a33a2..8cb0b8065c33 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -13,6 +13,38 @@
 using olGetDeviceInfoTest = OffloadDeviceTest;
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoTest);
 
+// Defines TEST_P(olGetDeviceInfoTest, Test<Dev><TestName>). The test queries
+// PropName on Dev (the fixture's Device or Host) into a PropType local named
+// Value, asserts that the query succeeds, and then runs Expr, which may refer
+// to Value.
+#define OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Dev,   \
+                                          Expr)                                \
+  TEST_P(olGetDeviceInfoTest, Test##Dev##TestName) {                           \
+    PropType Value;                                                            \
+    ASSERT_SUCCESS(olGetDeviceInfo(Dev, PropName, sizeof(Value), &Value));     \
+    Expr;                                                                      \
+  }
+
+// Success-only variants: the returned value is not inspected.
+#define OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName)       \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device, {})
+
+#define OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName)         \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host, {})
+
+#define OL_DEVICE_INFO_TEST_SUCCESS(TestName, PropType, PropName)              \
+  OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName)             \
+  OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName)
+
+// Lower-bound variants: additionally assert Value > LowBound. LowBound is
+// converted to PropType so ASSERT_GT compares two operands of the same type;
+// a bare literal such as 0 would otherwise be compared as int against an
+// unsigned property value.
+#define OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName,      \
+                                            LowBound)                          \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(                                           \
+      TestName, PropType, PropName, Device,                                    \
+      ASSERT_GT(Value, static_cast<PropType>(LowBound)))
+
+#define OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName,        \
+                                          LowBound)                            \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(                                           \
+      TestName, PropType, PropName, Host,                                      \
+      ASSERT_GT(Value, static_cast<PropType>(LowBound)))
+
+#define OL_DEVICE_INFO_TEST_VALUE_GT(TestName, PropType, PropName, LowBound)   \
+  OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName, LowBound)  \
+  OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName, LowBound)
+
 TEST_P(olGetDeviceInfoTest, SuccessType) {
   ol_device_type_t DeviceType;
   ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_TYPE,
@@ -54,6 +86,29 @@ TEST_P(olGetDeviceInfoTest, HostName) {
   ASSERT_EQ(std::strlen(Name.data()), Size - 1);
 }
 
+TEST_P(olGetDeviceInfoTest, SuccessProductName) {
+  // Ask for the required buffer size first; it must be non-zero.
+  size_t Size = 0;
+  ASSERT_SUCCESS(
+      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size));
+  ASSERT_GT(Size, 0ul);
+
+  // Fetch the name into a buffer of exactly that size; the reported size must
+  // equal the string length plus one.
+  std::vector<char> Name(Size);
+  ASSERT_SUCCESS(
+      olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()));
+  ASSERT_EQ(std::strlen(Name.data()), Size - 1);
+}
+
+TEST_P(olGetDeviceInfoTest, HostProductName) {
+  // Ask the host device for the required buffer size; it must be non-zero.
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size));
+  ASSERT_GT(Size, 0ul);
+
+  // Fetch the name into a buffer of exactly that size; the reported size must
+  // equal the string length plus one.
+  std::vector<char> Name(Size);
+  ASSERT_SUCCESS(
+      olGetDeviceInfo(Host, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()));
+  ASSERT_EQ(std::strlen(Name.data()), Size - 1);
+}
+
 TEST_P(olGetDeviceInfoTest, SuccessVendor) {
   size_t Size = 0;
   ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size));
@@ -77,12 +132,8 @@ TEST_P(olGetDeviceInfoTest, SuccessDriverVersion) {
   ASSERT_EQ(std::strlen(DriverVersion.data()), Size - 1);
 }
 
-TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSize) {
-  uint32_t Value;
-  ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
-                                 sizeof(Value), &Value));
-  ASSERT_GT(Value, 0u);
-}
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkGroupSize, uint32_t,
+                             OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, 0);
 
 TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) {
   ol_dimensions_t Value{0, 0, 0};
@@ -94,6 +145,59 @@ TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) {
   ASSERT_GT(Value.z, 0u);
 }
 
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkSize, uint32_t,
+                             OL_DEVICE_INFO_MAX_WORK_SIZE, 0);
+
+TEST_P(olGetDeviceInfoTest, SuccessMaxWorkSizePerDimension) {
+  ol_dimensions_t Value{0, 0, 0}; // zeroed: an unwritten field fails below
+  ASSERT_SUCCESS(olGetDeviceInfo(Device,
+                                 OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION,
+                                 sizeof(Value), &Value));
+  ASSERT_GT(Value.x, 0u); // each of x, y and z must come back non-zero
+  ASSERT_GT(Value.y, 0u);
+  ASSERT_GT(Value.z, 0u);
+}
+
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(VendorId, uint32_t,
+                                    OL_DEVICE_INFO_VENDOR_ID, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID);
+OL_DEVICE_INFO_TEST_VALUE_GT(NumComputeUnits, uint32_t,
+                             OL_DEVICE_INFO_NUM_COMPUTE_UNITS, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(SingleFPConfig, ol_device_fp_capability_flags_t,
+                             OL_DEVICE_INFO_SINGLE_FP_CONFIG, 0);
+OL_DEVICE_INFO_TEST_SUCCESS(HalfFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_HALF_FP_CONFIG);
+OL_DEVICE_INFO_TEST_VALUE_GT(DoubleFPConfig, ol_device_fp_capability_flags_t,
+                             OL_DEVICE_INFO_DOUBLE_FP_CONFIG, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthChar, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthShort, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthInt, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthLong, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthFloat, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthDouble, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE, 0);
+OL_DEVICE_INFO_TEST_SUCCESS(NativeVectorWidthHalf, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF);
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxClockFrequency, uint32_t,
+                             OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(MemoryClockRate, uint32_t,
+                             OL_DEVICE_INFO_MEMORY_CLOCK_RATE, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS,
+                             0);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(MaxMemAllocSize, uint64_t,
+                                    OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(MaxMemAllocSize, uint64_t,
+                                 OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
+                                    OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
+                                 OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+
 TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
   ol_device_type_t DeviceType;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index 4e29978fc20f..c4a3c2d5e3c7 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -13,48 +13,76 @@
 using olGetDeviceInfoSizeTest = OffloadDeviceTest;
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoSizeTest);
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessType) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_TYPE, &Size));
-  ASSERT_EQ(Size, sizeof(ol_device_type_t));
-}
+#define OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, Expr)                     \
+  TEST_P(olGetDeviceInfoSizeTest, Success##TestName) {                         \
+    size_t Size = 0;                                                           \
+    ASSERT_SUCCESS(olGetDeviceInfoSize(Device, PropName, &Size));              \
+    Expr;                                                                      \
+  }
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessPlatform) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PLATFORM, &Size));
-  ASSERT_EQ(Size, sizeof(ol_platform_handle_t));
-}
+#define OL_DEVICE_INFO_SIZE_TEST_EQ(TestName, PropType, PropName)              \
+  OL_DEVICE_INFO_SIZE_TEST(TestName, PropName,                                 \
+                           ASSERT_EQ(Size, sizeof(PropType)));
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessName) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size));
-  ASSERT_NE(Size, 0ul);
-}
-
-TEST_P(olGetDeviceInfoSizeTest, SuccessVendor) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size));
-  ASSERT_NE(Size, 0ul);
-}
+#define OL_DEVICE_INFO_SIZE_TEST_NONZERO(TestName, PropName)                   \
+  OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, ASSERT_NE(Size, 0ul));
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessDriverVersion) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(
-      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_DRIVER_VERSION, &Size));
-  ASSERT_NE(Size, 0ul);
-}
+OL_DEVICE_INFO_SIZE_TEST_EQ(Type, ol_device_type_t, OL_DEVICE_INFO_TYPE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t,
+                            OL_DEVICE_INFO_PLATFORM);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t,
+                            OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkSize, uint32_t,
+                            OL_DEVICE_INFO_MAX_WORK_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NumComputeUnits, uint32_t,
+                            OL_DEVICE_INFO_NUM_COMPUTE_UNITS);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SingleFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_SINGLE_FP_CONFIG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(HalfFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_HALF_FP_CONFIG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(DoubleFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_DOUBLE_FP_CONFIG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthChar, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthShort, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthInt, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthLong, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthFloat, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthDouble, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthHalf, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxClockFrequency, uint32_t,
+                            OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MemoryClockRate, uint32_t,
+                            OL_DEVICE_INFO_MEMORY_CLOCK_RATE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
+                            OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
+                            OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSize) {
+TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
   size_t Size = 0;
-  ASSERT_SUCCESS(
-      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, &Size));
-  ASSERT_EQ(Size, sizeof(uint32_t));
+  ASSERT_SUCCESS(olGetDeviceInfoSize(
+      Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size));
+  ASSERT_EQ(Size, sizeof(ol_dimensions_t));
+  ASSERT_EQ(Size, sizeof(uint32_t) * 3);
 }
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
+TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkSizePerDimension) {
   size_t Size = 0;
   ASSERT_SUCCESS(olGetDeviceInfoSize(
-      Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size));
+      Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, &Size));
   ASSERT_EQ(Size, sizeof(ol_dimensions_t));
   ASSERT_EQ(Size, sizeof(uint32_t) * 3);
 }
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 50e430597e64..1a042e1b3831 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
 add_offload_test_device_code(bar.cpp bar)
 # Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.cpp noargs -O3)
+add_offload_test_device_code(byte.cpp byte)
 add_offload_test_device_code(localmem.cpp localmem)
 add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
 add_offload_test_device_code(localmem_static.cpp localmem_static)
@@ -14,6 +15,7 @@ add_custom_target(offload_device_binaries DEPENDS
     foo.bin
     bar.bin
     noargs.bin
+    byte.bin
     localmem.bin
     localmem_reduction.bin
     localmem_static.bin
diff --git a/offload/unittests/OffloadAPI/device_code/byte.cpp b/offload/unittests/OffloadAPI/device_code/byte.cpp
new file mode 100644
index 000000000000..779d120fefca
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/byte.cpp
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+extern "C" __gpu_kernel void byte(unsigned char c) { (void)c; }
diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp
index 908d2dcb6df5..b86d15f045eb 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp
@@ -13,13 +13,22 @@
 using olGetEventInfoTest = OffloadEventTest;
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventInfoTest);
 
-TEST_P(olGetEventInfoTest, SuccessDevice) {
+TEST_P(olGetEventInfoTest, SuccessQueue) {
   ol_queue_handle_t RetrievedQueue;
   ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_QUEUE,
                                 sizeof(ol_queue_handle_t), &RetrievedQueue));
   ASSERT_EQ(Queue, RetrievedQueue);
 }
 
+TEST_P(olGetEventInfoTest, SuccessIsComplete) {
+  bool Complete = false;
+  while (!Complete) {
+    ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_IS_COMPLETE,
+                                  sizeof(Complete), &Complete));
+  }
+  ASSERT_EQ(Complete, true);
+}
+
 TEST_P(olGetEventInfoTest, InvalidNullHandle) {
   ol_queue_handle_t RetrievedQueue;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp
index d7dee58e35e8..36f36c3a187f 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp
@@ -19,6 +19,12 @@ TEST_P(olGetEventInfoSizeTest, SuccessQueue) {
   ASSERT_EQ(Size, sizeof(ol_queue_handle_t));
 }
 
+TEST_P(olGetEventInfoSizeTest, SuccessIsComplete) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetEventInfoSize(Event, OL_EVENT_INFO_IS_COMPLETE, &Size));
+  ASSERT_EQ(Size, sizeof(bool));
+}
+
 TEST_P(olGetEventInfoSizeTest, InvalidNullHandle) {
   size_t Size = 0;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp
new file mode 100644
index 000000000000..17fa383cac3f
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp
@@ -0,0 +1,45 @@
+//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olCalculateOptimalOccupancyTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);
+
+TEST_P(olCalculateOptimalOccupancyTest, Success) {
+  size_t Size{0};
+  ASSERT_SUCCESS_OR_UNSUPPORTED(
+      olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
+  size_t Size{0};
+  ASSERT_SUCCESS_OR_UNSUPPORTED(
+      olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 1dac8c50271b..c9eca36a4d44 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
 
 KERNEL_TEST(Foo, foo)
 KERNEL_TEST(NoArgs, noargs)
+KERNEL_TEST(Byte, byte)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
 KERNEL_TEST(LocalMemStatic, localmem_static)
diff --git a/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp b/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp
new file mode 100644
index 000000000000..a4b382ff298a
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp
@@ -0,0 +1,130 @@
+//===------- Offload API tests - olGetMemInfo -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+constexpr size_t SIZE = 1024;
+
+struct olGetMemInfoBaseTest : OffloadDeviceTest {
+  void *OffsetPtr() { return &reinterpret_cast<char *>(Ptr)[123]; }
+
+  void *Ptr;
+};
+
+template <ol_alloc_type_t AllocType>
+struct olGetMemInfoTest : olGetMemInfoBaseTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp());
+    ASSERT_SUCCESS(olMemAlloc(Device, AllocType, SIZE, &Ptr));
+  }
+
+  void TearDown() override {
+    ASSERT_SUCCESS(olMemFree(Ptr));
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::TearDown());
+  }
+};
+using olGetMemInfoDeviceTest = olGetMemInfoTest<OL_ALLOC_TYPE_DEVICE>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoDeviceTest);
+using olGetMemInfoManagedTest = olGetMemInfoTest<OL_ALLOC_TYPE_MANAGED>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoManagedTest);
+using olGetMemInfoHostTest = olGetMemInfoTest<OL_ALLOC_TYPE_HOST>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoHostTest);
+
+#define PER_ALLOC_TEST(FUNCTION)                                               \
+  TEST_P(olGetMemInfoDeviceTest, FUNCTION) {                                   \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_DEVICE);                                 \
+  }                                                                            \
+  TEST_P(olGetMemInfoManagedTest, FUNCTION) {                                  \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_MANAGED);                                \
+  }                                                                            \
+  TEST_P(olGetMemInfoHostTest, FUNCTION) {                                     \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_HOST);                                   \
+  }                                                                            \
+  TEST_P(olGetMemInfoDeviceTest, FUNCTION##Offset) {                           \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_DEVICE);                         \
+  }                                                                            \
+  TEST_P(olGetMemInfoManagedTest, FUNCTION##Offset) {                          \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_MANAGED);                        \
+  }                                                                            \
+  TEST_P(olGetMemInfoHostTest, FUNCTION##Offset) {                             \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_HOST);                           \
+  }
+
+void SuccessDevice(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                   ol_alloc_type_t Type) { // Ptr: base or interior pointer
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE,
+                              sizeof(RetrievedDevice), &RetrievedDevice));
+  ASSERT_EQ(RetrievedDevice, Fixture->Device);
+}
+PER_ALLOC_TEST(SuccessDevice);
+
+void SuccessBase(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) { // Ptr: base or interior pointer
+  void *RetrievedBase;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_BASE,
+                              sizeof(RetrievedBase), &RetrievedBase));
+  ASSERT_EQ(RetrievedBase, Fixture->Ptr); // base, even for an interior Ptr
+}
+PER_ALLOC_TEST(SuccessBase);
+
+void SuccessSize(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) { // Ptr: base or interior pointer
+  size_t RetrievedSize;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_SIZE,
+                              sizeof(RetrievedSize), &RetrievedSize));
+  ASSERT_EQ(RetrievedSize, SIZE); // size of the whole allocation
+}
+PER_ALLOC_TEST(SuccessSize);
+
+void SuccessType(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) { // Ptr: base or interior pointer
+  ol_alloc_type_t RetrievedType;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_TYPE,
+                              sizeof(RetrievedType), &RetrievedType));
+  ASSERT_EQ(RetrievedType, Type);
+}
+PER_ALLOC_TEST(SuccessType);
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNotFound) {
+  // Assuming that we aren't unlucky and happen to get 0x1234 as a random
+  // pointer
+  void *RetrievedBase;
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND,
+               olGetMemInfo(reinterpret_cast<void *>(0x1234), OL_MEM_INFO_BASE,
+                            sizeof(RetrievedBase), &RetrievedBase));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNullPtr) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetMemInfo(nullptr, OL_MEM_INFO_DEVICE,
+                            sizeof(RetrievedDevice), &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidSizeZero) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, 0, &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidSizeSmall) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE,
+                            sizeof(RetrievedDevice) - 1, &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNullPointerPropValue) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(
+      OL_ERRC_INVALID_NULL_POINTER,
+      olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, sizeof(RetrievedDevice), nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp b/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp
new file mode 100644
index 000000000000..f1a1e790fb22
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp
@@ -0,0 +1,63 @@
+//===------- Offload API tests - olGetMemInfoSize -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <OffloadAPI.h>
+
+#include "../common/Fixtures.hpp"
+
+struct olGetMemInfoSizeTest : OffloadDeviceTest {
+  void *OffsetPtr() { return &reinterpret_cast<char *>(Ptr)[123]; }
+
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp());
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 0x1024, &Ptr));
+  }
+
+  void TearDown() override {
+    ASSERT_SUCCESS(olMemFree(Ptr));
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::TearDown());
+  }
+
+  void *Ptr;
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoSizeTest);
+
+TEST_P(olGetMemInfoSizeTest, SuccessDevice) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_DEVICE, &Size));
+  ASSERT_EQ(Size, sizeof(ol_device_handle_t));
+}
+
+TEST_P(olGetMemInfoSizeTest, SuccessBase) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_BASE, &Size));
+  ASSERT_EQ(Size, sizeof(void *));
+}
+
+TEST_P(olGetMemInfoSizeTest, SuccessSize) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_SIZE, &Size));
+  ASSERT_EQ(Size, sizeof(size_t));
+}
+
+TEST_P(olGetMemInfoSizeTest, SuccessType) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_TYPE, &Size));
+  ASSERT_EQ(Size, sizeof(ol_alloc_type_t));
+}
+
+TEST_P(olGetMemInfoSizeTest, InvalidSymbolInfoEnumeration) {
+  size_t Size = 0;
+  ASSERT_ERROR(OL_ERRC_INVALID_ENUMERATION,
+               olGetMemInfoSize(Ptr, OL_MEM_INFO_FORCE_UINT32, &Size));
+}
+
+TEST_P(olGetMemInfoSizeTest, InvalidNullPointer) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetMemInfoSize(Ptr, OL_MEM_INFO_DEVICE, nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
index 00e428ec2abc..445262aa0c58 100644
--- a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
@@ -34,6 +34,26 @@ TEST_P(olMemAllocTest, SuccessAllocDevice) {
   olMemFree(Alloc);
 }
 
+TEST_P(olMemAllocTest, SuccessAllocMany) {
+  std::vector<void *> Allocs;
+  Allocs.reserve(1000);
+
+  constexpr ol_alloc_type_t TYPES[3] = {
+      OL_ALLOC_TYPE_DEVICE, OL_ALLOC_TYPE_MANAGED, OL_ALLOC_TYPE_HOST};
+
+  for (size_t I = 1; I < 1000; I++) {
+    void *Alloc = nullptr;
+    ASSERT_SUCCESS(olMemAlloc(Device, TYPES[I % 3], 1024 * I, &Alloc));
+    ASSERT_NE(Alloc, nullptr);
+
+    Allocs.push_back(Alloc);
+  }
+
+  for (auto *A : Allocs) {
+    olMemFree(A);
+  }
+}
+
 TEST_P(olMemAllocTest, InvalidNullDevice) {
   void *Alloc = nullptr;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
new file mode 100644
index 000000000000..a84ed3d78ecc
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -0,0 +1,193 @@
+//===------- Offload API tests - olMemFill --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+struct olMemFillTest : OffloadQueueTest {
+  template <typename PatternTy, PatternTy PatternVal, size_t Size,
+            bool Block = false>
+  void test_body() {
+    ManuallyTriggeredTask Manual;
+
+    // Block/enqueue tests ensure that the test has been enqueued to a queue
+    // (rather than being done synchronously if the queue happens to be empty)
+    if constexpr (Block) {
+      ASSERT_SUCCESS(Manual.enqueue(Queue));
+    }
+
+    void *Alloc;
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+    PatternTy Pattern = PatternVal;
+    ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+    if constexpr (Block) {
+      ASSERT_SUCCESS(Manual.trigger());
+    }
+    ASSERT_SUCCESS(olSyncQueue(Queue)); // a failed sync must fail the test
+
+    size_t N = Size / sizeof(Pattern); // whole patterns only; tail unchecked
+    for (size_t i = 0; i < N; i++) {
+      PatternTy *AllocPtr = reinterpret_cast<PatternTy *>(Alloc);
+      ASSERT_EQ(AllocPtr[i], Pattern);
+    }
+
+    ASSERT_SUCCESS(olMemFree(Alloc));
+  }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
+
+TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); }
+TEST_P(olMemFillTest, Success8NotMultiple4) {
+  test_body<uint8_t, 0x42, 1023>();
+}
+TEST_P(olMemFillTest, Success8Enqueue) {
+  test_body<uint8_t, 0x42, 1024, true>();
+}
+TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) {
+  test_body<uint8_t, 0x42, 1023, true>();
+}
+
+TEST_P(olMemFillTest, Success16) { test_body<uint16_t, 0x4243, 1024>(); }
+TEST_P(olMemFillTest, Success16NotMultiple4) {
+  test_body<uint16_t, 0x4243, 1022>();
+}
+TEST_P(olMemFillTest, Success16Enqueue) {
+  test_body<uint16_t, 0x4243, 1024, true>();
+}
+TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) {
+  test_body<uint16_t, 0x4243, 1022, true>();
+}
+
+TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); }
+TEST_P(olMemFillTest, Success32Enqueue) {
+  test_body<uint32_t, 0xDEADBEEF, 1024, true>();
+}
+
+TEST_P(olMemFillTest, SuccessLarge) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct PatternT {
+    uint64_t A;
+    uint64_t B;
+  } Pattern{UINT64_MAX, UINT64_MAX};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLargeEnqueue) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ManuallyTriggeredTask Manual;
+  ASSERT_SUCCESS(Manual.enqueue(Queue)); // enqueued first, ahead of the fill
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct PatternT {
+    uint64_t A;
+    uint64_t B;
+  } Pattern{UINT64_MAX, UINT64_MAX};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  ASSERT_SUCCESS(Manual.trigger());
+  ASSERT_SUCCESS(olSyncQueue(Queue)); // a failed sync must fail the test
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Alloc));
+}
+
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+  constexpr size_t Size = 17 * 64;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct __attribute__((packed)) PatternT {
+    uint64_t A;
+    uint64_t B;
+    uint8_t C;
+  } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].C, 255);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) {
+  constexpr size_t Size = 17 * 64; // 64 repetitions of the 17-byte pattern
+  void *Alloc;
+  ManuallyTriggeredTask Manual;
+  ASSERT_SUCCESS(Manual.enqueue(Queue)); // enqueued first, ahead of the fill
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct __attribute__((packed)) PatternT {
+    uint64_t A;
+    uint64_t B;
+    uint8_t C;
+  } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  ASSERT_SUCCESS(Manual.trigger());
+  ASSERT_SUCCESS(olSyncQueue(Queue)); // a failed sync must fail the test
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].C, 255);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Alloc));
+}
+
+TEST_P(olMemFillTest, InvalidPatternSize) {
+  constexpr size_t Size = 1025;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint16_t Pattern = 0x4242;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+  olMemFree(Alloc);
+}
diff --git a/offload/unittests/OffloadAPI/program/olIsValidBinary.cpp b/offload/unittests/OffloadAPI/program/olIsValidBinary.cpp
new file mode 100644
index 000000000000..02e805dd1135
--- /dev/null
+++ b/offload/unittests/OffloadAPI/program/olIsValidBinary.cpp
@@ -0,0 +1,49 @@
+//===------- Offload API tests - olIsValidBinary --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olIsValidBinaryTest = OffloadDeviceTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olIsValidBinaryTest);
+
+TEST_P(olIsValidBinaryTest, Success) {
+
+  std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+  ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+  ASSERT_GT(DeviceBin->getBufferSize(), 0lu); // unsigned: GE 0 never fails
+
+  bool IsValid = false;
+  ASSERT_SUCCESS(olIsValidBinary(Device, DeviceBin->getBufferStart(),
+                                 DeviceBin->getBufferSize(), &IsValid));
+  ASSERT_TRUE(IsValid);
+
+  ASSERT_SUCCESS( // same buffer with size 0 must be reported as invalid
+      olIsValidBinary(Device, DeviceBin->getBufferStart(), 0, &IsValid));
+  ASSERT_FALSE(IsValid);
+}
+
+TEST_P(olIsValidBinaryTest, Invalid) {
+
+  std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+  ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+  ASSERT_GT(DeviceBin->getBufferSize(), 0lu); // unsigned: GE 0 never fails
+
+  bool IsValid = false;
+  ASSERT_SUCCESS( // a zero-length view of a real binary must be invalid
+      olIsValidBinary(Device, DeviceBin->getBufferStart(), 0, &IsValid));
+  ASSERT_FALSE(IsValid);
+}
+
+TEST_P(olIsValidBinaryTest, NullPointer) {
+  bool IsValid = false;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olIsValidBinary(Device, nullptr, 42, &IsValid));
+  ASSERT_FALSE(IsValid);
+}
diff --git a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
index 0dc8527df532..aa9e372ede2c 100644
--- a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
+++ b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
@@ -18,6 +18,15 @@ TEST_P(olDestroyQueueTest, Success) {
   Queue = nullptr;
 }
 
+TEST_P(olDestroyQueueTest, SuccessDelayedResolution) {
+  ManuallyTriggeredTask Manual;
+  ASSERT_SUCCESS(Manual.enqueue(Queue));
+  ASSERT_SUCCESS(olDestroyQueue(Queue));
+  Queue = nullptr;
+
+  ASSERT_SUCCESS(Manual.trigger());
+}
+
 TEST_P(olDestroyQueueTest, InvalidNullHandle) {
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olDestroyQueue(nullptr));
 }
diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
new file mode 100644
index 000000000000..aa86750f6adf
--- /dev/null
+++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
@@ -0,0 +1,108 @@
+//===------- Offload API tests - olLaunchHostFunction ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+#include <atomic>
+#include <thread>
+
+struct olLaunchHostFunctionTest : OffloadQueueTest {}; // Queue-based cases.
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionTest);
+
+struct olLaunchHostFunctionKernelTest : OffloadKernelTest {}; // Uses Kernel.
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionKernelTest);
+
+TEST_P(olLaunchHostFunctionTest, Success) { // A no-op callback is accepted.
+  ASSERT_SUCCESS(olLaunchHostFunction(Queue, [](void *) {}, nullptr));
+}
+
+TEST_P(olLaunchHostFunctionTest, SuccessSequence) {
+  // Each host task fills one slot from the two slots before it, so the final
+  // check only holds if the tasks ran in the order they were enqueued.
+  uint32_t Buff[16] = {1, 1};
+
+  for (uint32_t Idx = 2; Idx < 16; ++Idx) {
+    auto Step = [](void *Slot) {
+      auto *Cur = static_cast<uint32_t *>(Slot);
+      *Cur = Cur[-1] + Cur[-2];
+    };
+    ASSERT_SUCCESS(olLaunchHostFunction(Queue, Step, &Buff[Idx]));
+  }
+
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  for (uint32_t Pos = 2; Pos < 16; ++Pos) {
+    ASSERT_EQ(Buff[Pos], Buff[Pos - 1] + Buff[Pos - 2]);
+  }
+}
+
+TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) {
+  // Verify that a host function can block the queue: the host task enqueued
+  // below only returns once Block is cleared, holding back the later kernel.
+  ol_kernel_launch_size_args_t LaunchArgs;
+  LaunchArgs.Dimensions = 1;
+  LaunchArgs.GroupSize = {64, 1, 1};
+  LaunchArgs.NumGroups = {1, 1, 1};
+  LaunchArgs.DynSharedMemory = 0;
+
+  ol_queue_handle_t Queue;
+  ASSERT_SUCCESS(olCreateQueue(Device, &Queue));
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+
+  uint32_t *Data = (uint32_t *)Mem;
+  for (uint32_t i = 0; i < 64; i++)
+    Data[i] = 0;
+
+  // Written here and polled by the host task, so atomic rather than volatile.
+  std::atomic<bool> Block{true};
+  ASSERT_SUCCESS(olLaunchHostFunction(
+      Queue,
+      [](void *Ptr) {
+        auto *Flag = static_cast<std::atomic<bool> *>(Ptr);
+        while (Flag->load())
+          std::this_thread::yield();
+      },
+      &Block));
+
+  struct {
+    void *Mem;
+  } Args{Mem};
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+
+  // Give the kernel time to (wrongly) run ahead; the data must stay zeroed.
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));
+  for (uint32_t i = 0; i < 64; i++) {
+    ASSERT_EQ(Data[i], 0u);
+  }
+
+  // Release the host task; the kernel is expected to store i into Data[i].
+  Block = false;
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  for (uint32_t i = 0; i < 64; i++) {
+    ASSERT_EQ(Data[i], i);
+  }
+
+  ASSERT_SUCCESS(olDestroyQueue(Queue));
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
+
+TEST_P(olLaunchHostFunctionTest, InvalidNullCallback) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, // Expected for a null callback.
+               olLaunchHostFunction(Queue, nullptr, nullptr));
+}
+
+TEST_P(olLaunchHostFunctionTest, InvalidNullQueue) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, // Expected for a null queue.
+               olLaunchHostFunction(nullptr, [](void *) {}, nullptr));
+}
author	Guillaume Chatelet <gchatelet@google.com>	2025-10-14 09:02:30 -0700
committer	Alex Richardson <alexrichardson@google.com>	2025-10-14 09:02:30 -0700
commit	e2d7be24a8dc31bb36380abd088b7eb0da7ef6b4 (patch)
tree	4811d025c12321c442695ad5aa4f511fa2fbd10b /offload
parent	1be5a8430be58baa5754e6f046eeacf7ca2f1a54 (diff)
parent	57726bdca274b152d2f36aaad7c961767bb1f91a (diff)