summaryrefslogtreecommitdiff
path: root/offload
diff options
context:
space:
mode:
Diffstat (limited to 'offload')
-rw-r--r--offload/CMakeLists.txt4
-rw-r--r--offload/DeviceRTL/include/Allocator.h45
-rw-r--r--offload/DeviceRTL/include/Configuration.h68
-rw-r--r--offload/DeviceRTL/include/Debug.h44
-rw-r--r--offload/DeviceRTL/include/DeviceTypes.h166
-rw-r--r--offload/DeviceRTL/include/DeviceUtils.h96
-rw-r--r--offload/DeviceRTL/include/Interface.h366
-rw-r--r--offload/DeviceRTL/include/LibC.h23
-rw-r--r--offload/DeviceRTL/include/Mapping.h108
-rw-r--r--offload/DeviceRTL/include/Profiling.h21
-rw-r--r--offload/DeviceRTL/include/State.h377
-rw-r--r--offload/DeviceRTL/include/Synchronization.h225
-rw-r--r--offload/DeviceRTL/include/Workshare.h26
-rw-r--r--offload/DeviceRTL/include/generated_microtask_cases.gen797
-rw-r--r--offload/DeviceRTL/src/Allocator.cpp77
-rw-r--r--offload/DeviceRTL/src/Configuration.cpp85
-rw-r--r--offload/DeviceRTL/src/Debug.cpp44
-rw-r--r--offload/DeviceRTL/src/DeviceUtils.cpp64
-rw-r--r--offload/DeviceRTL/src/Kernel.cpp162
-rw-r--r--offload/DeviceRTL/src/LibC.cpp48
-rw-r--r--offload/DeviceRTL/src/Mapping.cpp212
-rw-r--r--offload/DeviceRTL/src/Misc.cpp138
-rw-r--r--offload/DeviceRTL/src/Parallelism.cpp311
-rw-r--r--offload/DeviceRTL/src/Profiling.cpp18
-rw-r--r--offload/DeviceRTL/src/Reduction.cpp316
-rw-r--r--offload/DeviceRTL/src/State.cpp482
-rw-r--r--offload/DeviceRTL/src/Stub.cpp1
-rw-r--r--offload/DeviceRTL/src/Synchronization.cpp379
-rw-r--r--offload/DeviceRTL/src/Tasking.cpp103
-rw-r--r--offload/DeviceRTL/src/Workshare.cpp970
-rw-r--r--offload/cmake/caches/AMDGPUBot.cmake5
-rw-r--r--offload/cmake/caches/AMDGPULibcBot.cmake3
-rw-r--r--offload/cmake/caches/Offload.cmake4
-rw-r--r--offload/include/device.h2
-rw-r--r--offload/liboffload/API/Device.td3
-rw-r--r--offload/liboffload/src/OffloadImpl.cpp154
-rw-r--r--offload/libomptarget/device.cpp84
-rw-r--r--offload/libomptarget/omptarget.cpp6
-rw-r--r--offload/libomptarget/private.h9
-rw-r--r--offload/plugins-nextgen/amdgpu/src/rtl.cpp120
-rw-r--r--offload/plugins-nextgen/common/include/PluginInterface.h23
-rw-r--r--offload/plugins-nextgen/common/src/PluginInterface.cpp111
-rw-r--r--offload/plugins-nextgen/cuda/src/rtl.cpp14
-rw-r--r--offload/plugins-nextgen/host/src/rtl.cpp1
-rw-r--r--offload/test/lit.cfg4
-rw-r--r--offload/test/mapping/chained_containing_structs_1.cc58
-rw-r--r--offload/test/mapping/chained_containing_structs_2.cc76
-rw-r--r--offload/test/mapping/chained_containing_structs_3.cc217
-rw-r--r--offload/test/mapping/map_ptr_and_star_global.c2
-rw-r--r--offload/test/mapping/map_ptr_and_star_local.c2
-rw-r--r--offload/test/mapping/map_ptr_and_subscript_global.c2
-rw-r--r--offload/test/mapping/map_ptr_and_subscript_local.c2
-rw-r--r--offload/test/mapping/map_structptr_and_member_global.c2
-rw-r--r--offload/test/mapping/map_structptr_and_member_local.c2
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp85
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp143
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp98
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp158
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp93
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp159
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp100
-rw-r--r--offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp166
-rw-r--r--offload/test/mapping/use_device_addr/target_use_device_addr.c (renamed from offload/test/mapping/target_use_device_addr.c)4
-rw-r--r--offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c (renamed from offload/test/mapping/target_wrong_use_device_addr.c)3
-rw-r--r--offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c (renamed from offload/test/mapping/array_section_use_device_ptr.c)4
-rw-r--r--offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp100
-rw-r--r--offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp125
-rw-r--r--offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp111
-rw-r--r--offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp136
-rw-r--r--offload/test/offloading/fortran/dtype-char-array-map-2.f9025
-rw-r--r--offload/test/offloading/fortran/dtype-char-array-map.f9027
-rw-r--r--offload/test/offloading/mandatory_but_no_devices.c43
-rw-r--r--offload/test/offloading/memory_manager.cpp2
-rw-r--r--offload/test/tools/llvm-omp-device-info.c4
-rw-r--r--offload/test/tools/offload-tblgen/default_returns.td6
-rw-r--r--offload/test/tools/offload-tblgen/entry_points.td3
-rw-r--r--offload/test/tools/offload-tblgen/functions_basic.td3
-rw-r--r--offload/test/tools/offload-tblgen/functions_code_loc.td3
-rw-r--r--offload/test/tools/offload-tblgen/functions_ranged_param.td6
-rw-r--r--offload/test/tools/offload-tblgen/print_enum.td3
-rw-r--r--offload/test/tools/offload-tblgen/print_function.td6
-rw-r--r--offload/test/tools/offload-tblgen/type_tagged_enum.td9
-rw-r--r--offload/tools/deviceinfo/CMakeLists.txt6
-rw-r--r--offload/tools/deviceinfo/llvm-offload-device-info.cpp275
-rw-r--r--offload/unittests/Conformance/README.md83
-rw-r--r--offload/unittests/Conformance/lib/DeviceContext.cpp5
-rw-r--r--offload/unittests/OffloadAPI/common/Environment.cpp7
-rw-r--r--offload/unittests/OffloadAPI/common/Fixtures.hpp40
-rw-r--r--offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp36
-rw-r--r--offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp11
-rw-r--r--offload/unittests/OffloadAPI/memory/olMemFill.cpp117
-rw-r--r--offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp9
92 files changed, 2867 insertions, 6029 deletions
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 38fa77e41bb5..b27738078350 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -4,7 +4,8 @@
cmake_minimum_required(VERSION 3.20.0)
set(LLVM_SUBPROJECT_TITLE "liboffload")
-if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+# Permit redefining OPENMP_STANDALONE_BUILD when doing a runtimes build.
+if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
set(OPENMP_STANDALONE_BUILD TRUE)
project(offload C CXX ASM)
else()
@@ -371,7 +372,6 @@ add_subdirectory(tools/offload-tblgen)
# Build offloading plugins and device RTLs if they are available.
add_subdirectory(plugins-nextgen)
-add_subdirectory(DeviceRTL)
add_subdirectory(tools)
add_subdirectory(docs)
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
deleted file mode 100644
index dc4d029ed75f..000000000000
--- a/offload/DeviceRTL/include/Allocator.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_ALLOCATOR_H
-#define OMPTARGET_ALLOCATOR_H
-
-#include "DeviceTypes.h"
-
-// Forward declaration.
-struct KernelEnvironmentTy;
-
-namespace ompx {
-
-namespace allocator {
-
-static uint64_t constexpr ALIGNMENT = 16;
-
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
-/// Allocate \p Size bytes.
-[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
-alloc(uint64_t Size);
-
-/// Free the allocation pointed to by \p Ptr.
-void free(void *Ptr);
-
-} // namespace allocator
-
-} // namespace ompx
-
-extern "C" {
-void *malloc(size_t Size);
-void free(void *Ptr);
-}
-
-#endif
diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h
deleted file mode 100644
index 95408933dd86..000000000000
--- a/offload/DeviceRTL/include/Configuration.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// API to query the global (constant) device environment.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_CONFIGURATION_H
-#define OMPTARGET_CONFIGURATION_H
-
-#include "Shared/Environment.h"
-
-#include "DeviceTypes.h"
-
-namespace ompx {
-namespace config {
-
-/// Return the number of devices in the system, same number as returned on the
-/// host by omp_get_num_devices.
-uint32_t getNumDevices();
-
-/// Return the device number in the system for omp_get_device_num.
-uint32_t getDeviceNum();
-
-/// Return the user chosen debug level.
-uint32_t getDebugKind();
-
-/// Return if teams oversubscription is assumed
-uint32_t getAssumeTeamsOversubscription();
-
-/// Return if threads oversubscription is assumed
-uint32_t getAssumeThreadsOversubscription();
-
-/// Return the amount of dynamic shared memory that was allocated at launch.
-uint64_t getDynamicMemorySize();
-
-/// Returns the cycles per second of the device's fixed frequency clock.
-uint64_t getClockFrequency();
-
-/// Returns the pointer to the beginning of the indirect call table.
-void *getIndirectCallTablePtr();
-
-/// Returns the size of the indirect call table.
-uint64_t getIndirectCallTableSize();
-
-/// Returns the size of the indirect call table.
-uint64_t getHardwareParallelism();
-
-/// Return if debugging is enabled for the given debug kind.
-bool isDebugMode(DeviceDebugKind Level);
-
-/// Indicates if this kernel may require thread-specific states, or if it was
-/// explicitly disabled by the user.
-bool mayUseThreadStates();
-
-/// Indicates if this kernel may require data environments for nested
-/// parallelism, or if it was explicitly disabled by the user.
-bool mayUseNestedParallelism();
-
-} // namespace config
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h
deleted file mode 100644
index 98d0fa498d95..000000000000
--- a/offload/DeviceRTL/include/Debug.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_DEBUG_H
-#define OMPTARGET_DEVICERTL_DEBUG_H
-
-#include "Configuration.h"
-#include "LibC.h"
-
-/// Assertion
-///
-/// {
-extern "C" {
-void __assert_assume(bool condition);
-void __assert_fail(const char *expr, const char *file, unsigned line,
- const char *function);
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
- unsigned line, const char *function);
-}
-
-#define ASSERT(expr, msg) \
- { \
- if (config::isDebugMode(DeviceDebugKind::Assertion) && !(expr)) \
- __assert_fail_internal(#expr, msg, __FILE__, __LINE__, \
- __PRETTY_FUNCTION__); \
- else \
- __assert_assume(expr); \
- }
-#define UNREACHABLE(msg) \
- printf(msg); \
- __builtin_trap(); \
- __builtin_unreachable();
-
-///}
-
-#endif
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
deleted file mode 100644
index 2e5d92380f04..000000000000
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ /dev/null
@@ -1,166 +0,0 @@
-//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_TYPES_H
-#define OMPTARGET_TYPES_H
-
-#include <gpuintrin.h>
-#include <stddef.h>
-#include <stdint.h>
-
-template <typename T> using Private = __gpu_private T;
-template <typename T> using Constant = __gpu_constant T;
-template <typename T> using Local = __gpu_local T;
-template <typename T> using Global = __gpu_local T;
-
-enum omp_proc_bind_t {
- omp_proc_bind_false = 0,
- omp_proc_bind_true = 1,
- omp_proc_bind_master = 2,
- omp_proc_bind_close = 3,
- omp_proc_bind_spread = 4
-};
-
-enum omp_sched_t {
- omp_sched_static = 1, /* chunkSize >0 */
- omp_sched_dynamic = 2, /* chunkSize >0 */
- omp_sched_guided = 3, /* chunkSize >0 */
- omp_sched_auto = 4, /* no chunkSize */
-};
-
-enum kmp_sched_t {
- kmp_sched_static_chunk = 33,
- kmp_sched_static_nochunk = 34,
- kmp_sched_dynamic = 35,
- kmp_sched_guided = 36,
- kmp_sched_runtime = 37,
- kmp_sched_auto = 38,
-
- kmp_sched_static_balanced_chunk = 45,
-
- kmp_sched_static_ordered = 65,
- kmp_sched_static_nochunk_ordered = 66,
- kmp_sched_dynamic_ordered = 67,
- kmp_sched_guided_ordered = 68,
- kmp_sched_runtime_ordered = 69,
- kmp_sched_auto_ordered = 70,
-
- kmp_sched_distr_static_chunk = 91,
- kmp_sched_distr_static_nochunk = 92,
- kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
-
- kmp_sched_default = kmp_sched_static_nochunk,
- kmp_sched_unordered_first = kmp_sched_static_chunk,
- kmp_sched_unordered_last = kmp_sched_auto,
- kmp_sched_ordered_first = kmp_sched_static_ordered,
- kmp_sched_ordered_last = kmp_sched_auto_ordered,
- kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
- kmp_sched_distribute_last =
- kmp_sched_distr_static_chunk_sched_static_chunkone,
-
- /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
- * Since we need to distinguish the three possible cases (no modifier,
- * monotonic modifier, nonmonotonic modifier), we need separate bits for
- * each modifier. The absence of monotonic does not imply nonmonotonic,
- * especially since 4.5 says that the behaviour of the "no modifier" case
- * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
- *
- * Since we're passing a full 32 bit value, we can use a couple of high
- * bits for these flags; out of paranoia we avoid the sign bit.
- *
- * These modifiers can be or-ed into non-static schedules by the compiler
- * to pass the additional information. They will be stripped early in the
- * processing in __kmp_dispatch_init when setting up schedules, so
- * most of the code won't ever see schedules with these bits set.
- */
- kmp_sched_modifier_monotonic = (1 << 29),
- /**< Set if the monotonic schedule modifier was present */
- kmp_sched_modifier_nonmonotonic = (1 << 30),
-/**< Set if the nonmonotonic schedule modifier was present */
-
-#define SCHEDULE_WITHOUT_MODIFIERS(s) \
- (enum kmp_sched_t)( \
- (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
-#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0)
-#define SCHEDULE_HAS_NONMONOTONIC(s) \
- (((s) & kmp_sched_modifier_nonmonotonic) != 0)
-#define SCHEDULE_HAS_NO_MODIFIERS(s) \
- (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
- 0)
-
-};
-
-struct TaskDescriptorTy;
-using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr);
-struct TaskDescriptorTy {
- void *Payload;
- TaskFnTy TaskFn;
-};
-
-using LaneMaskTy = uint64_t;
-
-namespace lanes {
-enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
-} // namespace lanes
-
-/// The ident structure that describes a source location. The struct is
-/// identical to the one in the kmp.h file. We maintain the same data structure
-/// for compatibility.
-struct IdentTy {
- int32_t reserved_1; /**< might be used in Fortran; see above */
- int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
- identifies this union member */
- int32_t reserved_2; /**< not really used in Fortran any more; see above */
- int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */
- char const *psource; /**< String describing the source location.
- The string is composed of semi-colon separated fields
- which describe the source file, the function and a pair
- of line numbers that delimit the construct. */
-};
-
-using __kmpc_impl_lanemask_t = LaneMaskTy;
-
-using ParallelRegionFnTy = void *;
-
-using CriticalNameTy = int32_t[8];
-
-struct omp_lock_t {
- void *Lock;
-};
-
-using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num);
-using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id,
- int16_t lane_offset, int16_t shortCircuit);
-using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data);
-
-/// Macros for allocating variables in different address spaces.
-///{
-
-// Follows the pattern in interface.h
-typedef enum omp_allocator_handle_t {
- omp_null_allocator = 0,
- omp_default_mem_alloc = 1,
- omp_large_cap_mem_alloc = 2,
- omp_const_mem_alloc = 3,
- omp_high_bw_mem_alloc = 4,
- omp_low_lat_mem_alloc = 5,
- omp_cgroup_mem_alloc = 6,
- omp_pteam_mem_alloc = 7,
- omp_thread_mem_alloc = 8,
- KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
-} omp_allocator_handle_t;
-
-#define __PRAGMA(STR) _Pragma(#STR)
-#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-
-///}
-
-#endif
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
deleted file mode 100644
index b92514ee9838..000000000000
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H
-#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H
-
-#include "DeviceTypes.h"
-#include "Shared/Utils.h"
-
-namespace utils {
-
-template <typename T> struct type_identity {
- using type = T;
-};
-
-template <typename T, T v> struct integral_constant {
- inline static constexpr T value = v;
-};
-
-/// Freestanding SFINAE helpers.
-template <class T> struct remove_cv : type_identity<T> {};
-template <class T> struct remove_cv<const T> : type_identity<T> {};
-template <class T> struct remove_cv<volatile T> : type_identity<T> {};
-template <class T> struct remove_cv<const volatile T> : type_identity<T> {};
-template <class T> using remove_cv_t = typename remove_cv<T>::type;
-
-using true_type = integral_constant<bool, true>;
-using false_type = integral_constant<bool, false>;
-
-template <typename T, typename U> struct is_same : false_type {};
-template <typename T> struct is_same<T, T> : true_type {};
-template <typename T, typename U>
-inline constexpr bool is_same_v = is_same<T, U>::value;
-
-template <typename T> struct is_floating_point {
- inline static constexpr bool value =
- is_same_v<remove_cv_t<T>, float> || is_same_v<remove_cv_t<T>, double>;
-};
-template <typename T>
-inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
-
-template <bool B, typename T = void> struct enable_if;
-template <typename T> struct enable_if<true, T> : type_identity<T> {};
-template <bool B, typename T = void>
-using enable_if_t = typename enable_if<B, T>::type;
-
-template <class T> struct remove_addrspace : type_identity<T> {};
-template <class T, int N>
-struct remove_addrspace<T [[clang::address_space(N)]]> : type_identity<T> {};
-template <class T>
-using remove_addrspace_t = typename remove_addrspace<T>::type;
-
-template <typename To, typename From> inline To bitCast(From V) {
- static_assert(sizeof(To) == sizeof(From), "Bad conversion");
- return __builtin_bit_cast(To, V);
-}
-
-/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
-/// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
-
-int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
-uint64_t pack(uint32_t LowBits, uint32_t HighBits);
-
-/// Unpack \p Val into \p LowBits and \p HighBits.
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
-
-/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
-bool isSharedMemPtr(void *Ptr);
-
-/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
-bool isThreadLocalMemPtr(void *Ptr);
-
-/// A pointer variable that has by design an `undef` value. Use with care.
-[[clang::loader_uninitialized]] static void *const UndefPtr;
-
-#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
-#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
-
-} // namespace utils
-
-#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
deleted file mode 100644
index c4bfaaa2404b..000000000000
--- a/offload/DeviceRTL/include/Interface.h
+++ /dev/null
@@ -1,366 +0,0 @@
-//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_INTERFACE_H
-#define OMPTARGET_DEVICERTL_INTERFACE_H
-
-#include "Shared/Environment.h"
-
-#include "DeviceTypes.h"
-
-/// External API
-///
-///{
-
-extern "C" {
-
-/// ICV: dyn-var, constant 0
-///
-/// setter: ignored.
-/// getter: returns 0.
-///
-///{
-void omp_set_dynamic(int);
-int omp_get_dynamic(void);
-///}
-
-/// ICV: nthreads-var, integer
-///
-/// scope: data environment
-///
-/// setter: ignored.
-/// getter: returns false.
-///
-/// implementation notes:
-///
-///
-///{
-void omp_set_num_threads(int);
-int omp_get_max_threads(void);
-///}
-
-/// ICV: thread-limit-var, computed
-///
-/// getter: returns thread limited defined during launch.
-///
-///{
-int omp_get_thread_limit(void);
-///}
-
-/// ICV: max-active-level-var, constant 1
-///
-/// setter: ignored.
-/// getter: returns 1.
-///
-///{
-void omp_set_max_active_levels(int);
-int omp_get_max_active_levels(void);
-///}
-
-/// ICV: places-partition-var
-///
-///
-///{
-///}
-
-/// ICV: active-level-var, 0 or 1
-///
-/// getter: returns 0 or 1.
-///
-///{
-int omp_get_active_level(void);
-///}
-
-/// ICV: level-var
-///
-/// getter: returns parallel region nesting
-///
-///{
-int omp_get_level(void);
-///}
-
-/// ICV: run-sched-var
-///
-///
-///{
-void omp_set_schedule(omp_sched_t, int);
-void omp_get_schedule(omp_sched_t *, int *);
-///}
-
-/// TODO this is incomplete.
-int omp_get_num_threads(void);
-int omp_get_thread_num(void);
-void omp_set_nested(int);
-
-int omp_get_nested(void);
-
-void omp_set_max_active_levels(int Level);
-
-int omp_get_max_active_levels(void);
-
-omp_proc_bind_t omp_get_proc_bind(void);
-
-int omp_get_num_places(void);
-
-int omp_get_place_num_procs(int place_num);
-
-void omp_get_place_proc_ids(int place_num, int *ids);
-
-int omp_get_place_num(void);
-
-int omp_get_partition_num_places(void);
-
-void omp_get_partition_place_nums(int *place_nums);
-
-int omp_get_cancellation(void);
-
-void omp_set_default_device(int deviceId);
-
-int omp_get_default_device(void);
-
-int omp_get_num_devices(void);
-
-int omp_get_device_num(void);
-
-int omp_get_num_teams(void);
-
-int omp_get_team_num();
-
-int omp_get_initial_device(void);
-
-void *llvm_omp_target_dynamic_shared_alloc();
-
-/// Synchronization
-///
-///{
-void omp_init_lock(omp_lock_t *Lock);
-
-void omp_destroy_lock(omp_lock_t *Lock);
-
-void omp_set_lock(omp_lock_t *Lock);
-
-void omp_unset_lock(omp_lock_t *Lock);
-
-int omp_test_lock(omp_lock_t *Lock);
-///}
-
-/// Tasking
-///
-///{
-int omp_in_final(void);
-
-int omp_get_max_task_priority(void);
-///}
-
-/// Misc
-///
-///{
-double omp_get_wtick(void);
-
-double omp_get_wtime(void);
-///}
-}
-
-extern "C" {
-/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
-/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
-/// called by any thread, allocation happens *per thread*.
-void *__kmpc_alloc_shared(uint64_t Bytes);
-
-/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
-/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the
-/// allocated by __kmpc_alloc_shared by the same thread.
-void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
-
-/// Get a pointer to the memory buffer containing dynamically allocated shared
-/// memory configured at launch.
-void *__kmpc_get_dynamic_shared();
-
-/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
-/// allocation address in \p GlobalArgs.
-///
-/// Called by the main thread prior to a parallel region.
-///
-/// We also remember it in GlobalArgsPtr to ensure the worker threads and
-/// deallocation function know the allocation address too.
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs);
-
-/// Deallocate the memory allocated by __kmpc_begin_sharing_variables.
-///
-/// Called by the main thread after a parallel region.
-void __kmpc_end_sharing_variables();
-
-/// Store the allocation address obtained via __kmpc_begin_sharing_variables in
-/// \p GlobalArgs.
-///
-/// Called by the worker threads in the parallel region (function).
-void __kmpc_get_shared_variables(void ***GlobalArgs);
-
-/// External interface to get the thread ID.
-uint32_t __kmpc_get_hardware_thread_id_in_block();
-
-/// External interface to get the number of threads.
-uint32_t __kmpc_get_hardware_num_threads_in_block();
-
-/// External interface to get the warp size.
-uint32_t __kmpc_get_warp_size();
-
-/// Kernel
-///
-///{
-// Forward declaration
-struct KernelEnvironmentTy;
-
-int8_t __kmpc_is_spmd_exec_mode();
-
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-
-void __kmpc_target_deinit();
-
-///}
-
-/// Reduction
-///
-///{
-void *__kmpc_reduction_get_fixed_buffer();
-
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
- uint64_t reduce_data_size,
- void *reduce_data,
- ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct);
-
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
- IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
- uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
- ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct);
-///}
-
-/// Synchronization
-///
-///{
-void __kmpc_ordered(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
-
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter);
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId);
-
-void __kmpc_flush(IdentTy *Loc);
-
-uint64_t __kmpc_warp_active_thread_mask(void);
-
-void __kmpc_syncwarp(uint64_t Mask);
-
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-///}
-
-/// Parallelism
-///
-///{
-/// TODO
-void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn);
-
-/// TODO
-bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn);
-
-/// TODO
-void __kmpc_kernel_end_parallel();
-
-/// TODO
-void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
-
-/// TODO
-void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams,
- int32_t ThreadLimit);
-
-/// TODO
-uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
-
-///}
-
-/// Tasking
-///
-///{
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
- size_t TaskSizeInclPrivateValues,
- size_t SharedValuesSize,
- TaskFnTy TaskFn);
-
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor);
-
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int32_t,
- void *, int32_t, void *);
-
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor);
-
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor);
-
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
- void *);
-
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId);
-
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId);
-
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int);
-
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId);
-
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int,
- uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
- int32_t, uint64_t, void *);
-///}
-
-/// Misc
-///
-///{
-int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal);
-
-int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
-///}
-
-/// Shuffle
-///
-///{
-int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
-int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
-
-///}
-}
-
-#endif
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
deleted file mode 100644
index 94b5e6519606..000000000000
--- a/offload/DeviceRTL/include/LibC.h
+++ /dev/null
@@ -1,23 +0,0 @@
-//===--------- LibC.h - Simple implementation of libc functions --- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_LIBC_H
-#define OMPTARGET_LIBC_H
-
-#include "DeviceTypes.h"
-
-namespace ompx {
-
-int printf(const char *Format, ...);
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
deleted file mode 100644
index 8ba018b5314a..000000000000
--- a/offload/DeviceRTL/include/Mapping.h
+++ /dev/null
@@ -1,108 +0,0 @@
-//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_MAPPING_H
-#define OMPTARGET_MAPPING_H
-
-#include "DeviceTypes.h"
-
-namespace ompx {
-
-namespace mapping {
-
-enum {
- DIM_X = __GPU_X_DIM,
- DIM_Y = __GPU_Y_DIM,
- DIM_Z = __GPU_Z_DIM,
-};
-
-inline constexpr uint32_t MaxThreadsPerTeam = 1024;
-
-/// Initialize the mapping machinery.
-void init(bool IsSPMD);
-
-/// Return true if the kernel is executed in SPMD mode.
-bool isSPMDMode();
-
-/// Return true if the kernel is executed in generic mode.
-bool isGenericMode();
-
-/// Return true if the executing thread is the main thread in generic mode.
-/// These functions will lookup state and it is required that that is OK for the
-/// thread and location. See also `isInitialThreadInLevel0` for a stateless
-/// alternative for certain situations, e.g. during initialization.
-bool isMainThreadInGenericMode();
-bool isMainThreadInGenericMode(bool IsSPMD);
-
-/// Return true if this thread is the initial thread in parallel level 0.
-///
-/// The thread for which this returns true should be used for single threaded
-/// initialization tasks. We pick a special thread to ensure there are no
-/// races between the initialization and the first read of initialized state.
-bool isInitialThreadInLevel0(bool IsSPMD);
-
-/// Return true if the executing thread has the lowest Id of the active threads
-/// in the warp.
-bool isLeaderInWarp();
-
-/// Return a mask describing all active threads in the warp.
-LaneMaskTy activemask();
-
-/// Return a mask describing all threads with a smaller Id in the warp.
-LaneMaskTy lanemaskLT();
-
-/// Return a mask describing all threads with a larger Id in the warp.
-LaneMaskTy lanemaskGT();
-
-/// Return the thread Id in the warp, in [0, getWarpSize()).
-uint32_t getThreadIdInWarp();
-
-/// Return the warp size, thus number of threads in the warp.
-uint32_t getWarpSize();
-
-/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()]
-uint32_t getWarpIdInBlock();
-
-/// Return the number of warps in the block.
-uint32_t getNumberOfWarpsInBlock();
-
-/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)).
-uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
-
-/// Return the block size, thus number of threads in the block.
-uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
-
-/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)).
-uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
-
-/// Return the number of blocks in the kernel.
-uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
-
-/// Return the kernel size, thus number of threads in the kernel.
-uint32_t getNumberOfThreadsInKernel();
-
-/// Return the maximal number of threads in the block usable for a team (=
-/// parallel region).
-///
-/// Note: The version taking \p IsSPMD mode explicitly can be used during the
-/// initialization of the target region, that is before `mapping::isSPMDMode()`
-/// can be called by any thread other than the main one.
-uint32_t getMaxTeamThreads();
-uint32_t getMaxTeamThreads(bool IsSPMD);
-
-/// Return the number of processing elements on the device.
-uint32_t getNumberOfProcessorElements();
-
-} // namespace mapping
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h
deleted file mode 100644
index d99475225412..000000000000
--- a/offload/DeviceRTL/include/Profiling.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_PROFILING_H
-#define OMPTARGET_DEVICERTL_PROFILING_H
-
-extern "C" {
-void __llvm_profile_register_function(void *Ptr);
-void __llvm_profile_register_names_function(void *Ptr, long int I);
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
-}
-
-#endif
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
deleted file mode 100644
index db396dae6e44..000000000000
--- a/offload/DeviceRTL/include/State.h
+++ /dev/null
@@ -1,377 +0,0 @@
-//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_STATE_H
-#define OMPTARGET_STATE_H
-
-#include "Shared/Environment.h"
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Mapping.h"
-
-// Forward declaration.
-struct KernelEnvironmentTy;
-
-namespace ompx {
-
-namespace memory {
-
-/// Alloca \p Size bytes in shared memory, if possible, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
-void *allocShared(uint64_t Size, const char *Reason);
-
-/// Free \p Ptr, allocated via allocShared, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_free_shared for proper usage.
-void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
-
-/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
-void *allocGlobal(uint64_t Size, const char *Reason);
-
-/// Return a pointer to the dynamic shared memory buffer.
-void *getDynamicBuffer();
-
-/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
-void freeGlobal(void *Ptr, const char *Reason);
-
-} // namespace memory
-
-namespace state {
-
-inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
-
-struct ICVStateTy {
- uint32_t NThreadsVar;
- uint32_t LevelVar;
- uint32_t ActiveLevelVar;
- uint32_t Padding0Val;
- uint32_t MaxActiveLevelsVar;
- uint32_t RunSchedVar;
- uint32_t RunSchedChunkVar;
-
- bool operator==(const ICVStateTy &Other) const;
-
- void assertEqual(const ICVStateTy &Other) const;
-};
-
-struct TeamStateTy {
- void init(bool IsSPMD);
-
- bool operator==(const TeamStateTy &) const;
-
- void assertEqual(TeamStateTy &Other) const;
-
- /// ICVs
- ///
- /// Preallocated storage for ICV values that are used if the threads have not
- /// set a custom default. The latter is supported but unlikely and slow(er).
- ///
- ///{
- ICVStateTy ICVState;
- ///}
-
- uint32_t ParallelTeamSize;
- uint32_t HasThreadState;
- ParallelRegionFnTy ParallelRegionFnVar;
-};
-
-extern Local<TeamStateTy> TeamState;
-
-struct ThreadStateTy {
-
- /// ICVs have preallocated storage in the TeamStateTy which is used if a
- /// thread has not set a custom value. The latter is supported but unlikely.
- /// When it happens we will allocate dynamic memory to hold the values of all
- /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
- /// ICV struct to hold them all. This is slower than alternatives but allows
- /// users to pay only for what they use.
- ///
- state::ICVStateTy ICVState;
-
- ThreadStateTy *PreviousThreadState;
-
- void init() {
- ICVState = TeamState.ICVState;
- PreviousThreadState = nullptr;
- }
-
- void init(ThreadStateTy *PreviousTS) {
- ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
- PreviousThreadState = PreviousTS;
- }
-};
-
-extern Local<ThreadStateTy **> ThreadStates;
-
-/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-
-/// Return the kernel and kernel launch environment associated with the current
-/// kernel. The former is static and contains compile time information that
-/// holds for all instances of the kernel. The latter is dynamic and provides
-/// per-launch information.
-KernelEnvironmentTy &getKernelEnvironment();
-KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();
-
-/// TODO
-enum ValueKind {
- VK_NThreads,
- VK_Level,
- VK_ActiveLevel,
- VK_MaxActiveLevels,
- VK_RunSched,
- // ---
- VK_RunSchedChunk,
- VK_ParallelRegionFn,
- VK_ParallelTeamSize,
- VK_HasThreadState,
-};
-
-/// TODO
-void enterDataEnvironment(IdentTy *Ident);
-
-/// TODO
-void exitDataEnvironment();
-
-/// TODO
-struct DateEnvironmentRAII {
- DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
- ~DateEnvironmentRAII() { exitDataEnvironment(); }
-};
-
-/// TODO
-void resetStateForThread(uint32_t TId);
-
-// FIXME: https://github.com/llvm/llvm-project/issues/123241.
-#define lookupForModify32Impl(Member, Ident, ForceTeamState) \
- { \
- if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() || \
- !TeamState.HasThreadState)) \
- return TeamState.ICVState.Member; \
- uint32_t TId = mapping::getThreadIdInBlock(); \
- if (OMP_UNLIKELY(!ThreadStates[TId])) { \
- ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>( \
- memory::allocGlobal(sizeof(ThreadStateTy), \
- "ICV modification outside data environment")); \
- ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!"); \
- TeamState.HasThreadState = true; \
- ThreadStates[TId]->init(); \
- } \
- return ThreadStates[TId]->ICVState.Member; \
- }
-
-// FIXME: https://github.com/llvm/llvm-project/issues/123241.
-#define lookupImpl(Member, ForceTeamState) \
- { \
- auto TId = mapping::getThreadIdInBlock(); \
- if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() && \
- TeamState.HasThreadState && ThreadStates[TId])) \
- return ThreadStates[TId]->ICVState.Member; \
- return TeamState.ICVState.Member; \
- }
-
-[[gnu::always_inline, gnu::flatten]] inline uint32_t &
-lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
- switch (Kind) {
- case state::VK_NThreads:
- if (IsReadonly)
- lookupImpl(NThreadsVar, ForceTeamState);
- lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState);
- case state::VK_Level:
- if (IsReadonly)
- lookupImpl(LevelVar, ForceTeamState);
- lookupForModify32Impl(LevelVar, Ident, ForceTeamState);
- case state::VK_ActiveLevel:
- if (IsReadonly)
- lookupImpl(ActiveLevelVar, ForceTeamState);
- lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState);
- case state::VK_MaxActiveLevels:
- if (IsReadonly)
- lookupImpl(MaxActiveLevelsVar, ForceTeamState);
- lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState);
- case state::VK_RunSched:
- if (IsReadonly)
- lookupImpl(RunSchedVar, ForceTeamState);
- lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState);
- case state::VK_RunSchedChunk:
- if (IsReadonly)
- lookupImpl(RunSchedChunkVar, ForceTeamState);
- lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState);
- case state::VK_ParallelTeamSize:
- return TeamState.ParallelTeamSize;
- case state::VK_HasThreadState:
- return TeamState.HasThreadState;
- default:
- break;
- }
- __builtin_unreachable();
-}
-
-[[gnu::always_inline, gnu::flatten]] inline void *&
-lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
- switch (Kind) {
- case state::VK_ParallelRegionFn:
- return TeamState.ParallelRegionFnVar;
- default:
- break;
- }
- __builtin_unreachable();
-}
-
-/// A class without actual state used to provide a nice interface to lookup and
-/// update ICV values we can declare in global scope.
-template <typename Ty, ValueKind Kind> struct Value {
- [[gnu::flatten, gnu::always_inline]] operator Ty() {
- return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
- /*ForceTeamState=*/false);
- }
-
- [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
- set(Other, /*IdentTy=*/nullptr);
- return *this;
- }
-
- [[gnu::flatten, gnu::always_inline]] Value &operator++() {
- inc(1, /*IdentTy=*/nullptr);
- return *this;
- }
-
- [[gnu::flatten, gnu::always_inline]] Value &operator--() {
- inc(-1, /*IdentTy=*/nullptr);
- return *this;
- }
-
- [[gnu::flatten, gnu::always_inline]] void
- assert_eq(const Ty &V, IdentTy *Ident = nullptr,
- bool ForceTeamState = false) {
- ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
- }
-
-private:
- [[gnu::flatten, gnu::always_inline]] Ty &
- lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
- Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
- return t;
- }
-
- [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
- return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
- UpdateVal);
- }
-
- [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
- return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
- UpdateVal);
- }
-
- template <typename VTy, typename Ty2> friend struct ValueRAII;
-};
-
-/// A mookup class without actual state used to provide
-/// a nice interface to lookup and update ICV values
-/// we can declare in global scope.
-template <typename Ty, ValueKind Kind> struct PtrValue {
- [[gnu::flatten, gnu::always_inline]] operator Ty() {
- return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
- /*ForceTeamState=*/false);
- }
-
- [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
- set(Other);
- return *this;
- }
-
-private:
- Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
- return lookupPtr(Kind, IsReadonly, ForceTeamState);
- }
-
- Ty &set(Ty UpdateVal) {
- return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
- /*ForceTeamState=*/false) = UpdateVal);
- }
-
- template <typename VTy, typename Ty2> friend struct ValueRAII;
-};
-
-template <typename VTy, typename Ty> struct ValueRAII {
- ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
- bool ForceTeamState = false)
- : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
- : (Ty *)utils::UndefPtr),
- Val(OldValue), Active(Active) {
- if (!Active)
- return;
- ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
- *Ptr = NewValue;
- }
- ~ValueRAII() {
- if (Active)
- *Ptr = Val;
- }
-
-private:
- Ty *Ptr;
- Ty Val;
- bool Active;
-};
-
-/// TODO
-inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;
-
-/// TODO
-inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
- ParallelRegionFn;
-
-void runAndCheckState(void(Func(void)));
-
-void assumeInitialState(bool IsSPMD);
-
-/// Return the value of the ParallelTeamSize ICV.
-int getEffectivePTeamSize();
-
-} // namespace state
-
-namespace icv {
-
-/// TODO
-inline state::Value<uint32_t, state::VK_NThreads> NThreads;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_Level> Level;
-
-/// The `active-level` describes which of the parallel level counted with the
-/// `level-var` is active. There can only be one.
-///
-/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0.
-inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_RunSched> RunSched;
-
-} // namespace icv
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
deleted file mode 100644
index 7e7c8eacb917..000000000000
--- a/offload/DeviceRTL/include/Synchronization.h
+++ /dev/null
@@ -1,225 +0,0 @@
-//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-
-namespace ompx {
-namespace atomic {
-
-enum OrderingTy {
- relaxed = __ATOMIC_RELAXED,
- acquire = __ATOMIC_ACQUIRE,
- release = __ATOMIC_RELEASE,
- acq_rel = __ATOMIC_ACQ_REL,
- seq_cst = __ATOMIC_SEQ_CST,
-};
-
-enum MemScopeTy {
- system = __MEMORY_SCOPE_SYSTEM,
- device = __MEMORY_SCOPE_DEVICE,
- workgroup = __MEMORY_SCOPE_WRKGRP,
- wavefront = __MEMORY_SCOPE_WVFRNT,
- single = __MEMORY_SCOPE_SINGLE,
-};
-
-/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device);
-
-/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
-/// result is stored in \p *Addr;
-/// {
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
- atomic::OrderingTy OrderingFail,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
- OrderingSucc, OrderingFail, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V add(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V load(Ty *Address, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
-#ifdef __NVPTX__
- return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope);
-#else
- return __scoped_atomic_load_n(Address, Ordering, MemScope);
-#endif
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-void store(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- __scoped_atomic_store_n(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V mul(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
- bool Success;
- do {
- TypedCurrentVal = atomic::load(Address, Ordering);
- TypedNewVal = TypedCurrentVal * Val;
- Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
- atomic::relaxed, MemScope);
- } while (!Success);
- return TypedResultVal;
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<!utils::is_floating_point_v<V>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, float>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- if (Val >= 0)
- return utils::bitCast<float>(max(
- (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
- return utils::bitCast<float>(min(
- (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, double>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- if (Val >= 0)
- return utils::bitCast<double>(max(
- (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
- return utils::bitCast<double>(min(
- (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<!utils::is_floating_point_v<V>, V>
-min(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope);
-}
-
-// TODO: Implement this with __atomic_fetch_max and remove the duplication.
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, float>, V>
-min(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- if (Val >= 0)
- return utils::bitCast<float>(min(
- (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
- return utils::bitCast<float>(max(
- (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
-}
-
-// TODO: Implement this with __atomic_fetch_max and remove the duplication.
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-utils::enable_if_t<utils::is_same_v<V, double>, V>
-min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- if (Val >= 0)
- return utils::bitCast<double>(min(
- (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
- return utils::bitCast<double>(max(
- (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope);
-}
-
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope);
-}
-
-static inline uint32_t
-atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
- MemScopeTy MemScope = MemScopeTy::device) {
- uint32_t R;
- __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope);
- return R;
-}
-
-///}
-
-} // namespace atomic
-
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block and perform a fence before and after the
-/// barrier according to \p Ordering. Note that the fence might be part of the
-/// barrier.
-void threads(atomic::OrderingTy Ordering);
-
-/// Synchronizing threads is allowed even if they all hit different instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-
-/// Synchronize all threads in a block, they are reaching the same instruction
-/// (hence all threads in the block are "aligned"). Also perform a fence before
-/// and after the barrier according to \p Ordering. Note that the
-/// fence might be part of the barrier if the target offers this.
-[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void
-threadsAligned(atomic::OrderingTy Ordering);
-
-///}
-
-} // namespace synchronize
-
-namespace fence {
-
-/// Memory fence with \p Ordering semantics for the team.
-void team(atomic::OrderingTy Ordering);
-
-/// Memory fence with \p Ordering semantics for the contention group.
-void kernel(atomic::OrderingTy Ordering);
-
-/// Memory fence with \p Ordering semantics for the system.
-void system(atomic::OrderingTy Ordering);
-
-} // namespace fence
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
deleted file mode 100644
index 554c3271c334..000000000000
--- a/offload/DeviceRTL/include/Workshare.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-------- Workshare.h - OpenMP Workshare interface ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_WORKSHARE_H
-#define OMPTARGET_WORKSHARE_H
-
-namespace ompx {
-
-namespace workshare {
-
-/// Initialize the worksharing machinery.
-void init(bool IsSPMD);
-
-} // namespace workshare
-
-} // namespace ompx
-
-#endif
diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/offload/DeviceRTL/include/generated_microtask_cases.gen
deleted file mode 100644
index a05f6da2f84f..000000000000
--- a/offload/DeviceRTL/include/generated_microtask_cases.gen
+++ /dev/null
@@ -1,797 +0,0 @@
-case 0:
-((void (*)(int32_t *, int32_t *))fn)(&global_tid, &bound_tid);
-break;
-case 1:
-((void (*)(int32_t *, int32_t *, void *))fn)(&global_tid, &bound_tid, args[0]);
-break;
-case 2:
-((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid,
- args[0], args[1]);
-break;
-case 3:
-((void (*)(int32_t *, int32_t *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2]);
-break;
-case 4:
-((void (*)(int32_t *, int32_t *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3]);
-break;
-case 5:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4]);
-break;
-case 6:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5]);
-break;
-case 7:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6]);
-break;
-case 8:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
- args[2], args[3], args[4], args[5], args[6],
- args[7]);
-break;
-case 9:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
- args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8]);
-break;
-case 10:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
- args[1], args[2], args[3],
- args[4], args[5], args[6],
- args[7], args[8], args[9]);
-break;
-case 11:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10]);
-break;
-case 12:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11]);
-break;
-case 13:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12]);
-break;
-case 14:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13]);
-break;
-case 15:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14]);
-break;
-case 16:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
- args[2], args[3], args[4], args[5], args[6],
- args[7], args[8], args[9], args[10], args[11],
- args[12], args[13], args[14], args[15]);
-break;
-case 17:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
- args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16]);
-break;
-case 18:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17]);
-break;
-case 19:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18]);
-break;
-case 20:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19]);
-break;
-case 21:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20]);
-break;
-case 22:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21]);
-break;
-case 23:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22]);
-break;
-case 24:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
- args[2], args[3], args[4], args[5], args[6],
- args[7], args[8], args[9], args[10], args[11],
- args[12], args[13], args[14], args[15], args[16],
- args[17], args[18], args[19], args[20], args[21],
- args[22], args[23]);
-break;
-case 25:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
- args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16],
- args[17], args[18], args[19], args[20],
- args[21], args[22], args[23], args[24]);
-break;
-case 26:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25]);
-break;
-case 27:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26]);
-break;
-case 28:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22], args[23],
- args[24], args[25], args[26], args[27]);
-break;
-case 29:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22], args[23],
- args[24], args[25], args[26], args[27], args[28]);
-break;
-case 30:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29]);
-break;
-case 31:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22], args[23],
- args[24], args[25], args[26], args[27], args[28],
- args[29], args[30]);
-break;
-case 32:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
- args[2], args[3], args[4], args[5], args[6],
- args[7], args[8], args[9], args[10], args[11],
- args[12], args[13], args[14], args[15], args[16],
- args[17], args[18], args[19], args[20], args[21],
- args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30],
- args[31]);
-break;
-case 33:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32]);
-break;
-case 34:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33]);
-break;
-case 35:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34]);
-break;
-case 36:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35]);
-break;
-case 37:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36]);
-break;
-case 38:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37]);
-break;
-case 39:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22], args[23],
- args[24], args[25], args[26], args[27], args[28],
- args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38]);
-break;
-case 40:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
- args[2], args[3], args[4], args[5], args[6],
- args[7], args[8], args[9], args[10], args[11],
- args[12], args[13], args[14], args[15], args[16],
- args[17], args[18], args[19], args[20], args[21],
- args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31],
- args[32], args[33], args[34], args[35], args[36],
- args[37], args[38], args[39]);
-break;
-case 41:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40]);
-break;
-case 42:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41]);
-break;
-case 43:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42]);
-break;
-case 44:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43]);
-break;
-case 45:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44]);
-break;
-case 46:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45]);
-break;
-/// DONE TO HERE
-case 47:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22], args[23],
- args[24], args[25], args[26], args[27], args[28],
- args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38],
- args[39], args[40], args[41], args[42], args[43],
- args[44], args[45], args[46]);
-break;
-case 48:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47]);
-break;
-case 49:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48]);
-break;
-case 50:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49]);
-break;
-case 51:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50]);
-break;
-case 52:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51]);
-break;
-case 53:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52]);
-break;
-case 54:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53]);
-break;
-case 55:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54]);
-break;
-case 56:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
- args[2], args[3], args[4], args[5], args[6],
- args[7], args[8], args[9], args[10], args[11],
- args[12], args[13], args[14], args[15], args[16],
- args[17], args[18], args[19], args[20], args[21],
- args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31],
- args[32], args[33], args[34], args[35], args[36],
- args[37], args[38], args[39], args[40], args[41],
- args[42], args[43], args[44], args[45], args[46],
- args[47], args[48], args[49], args[50], args[51],
- args[52], args[53], args[54], args[55]);
-break;
-case 57:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56]);
-break;
-case 58:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56], args[57]);
-break;
-case 59:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56], args[57], args[58]);
-break;
-case 60:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56], args[57], args[58], args[59]);
-break;
-case 61:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56], args[57], args[58], args[59], args[60]);
-break;
-case 62:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56], args[57], args[58], args[59], args[60], args[61]);
-break;
-case 63:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
- args[3], args[4], args[5], args[6], args[7], args[8],
- args[9], args[10], args[11], args[12], args[13],
- args[14], args[15], args[16], args[17], args[18],
- args[19], args[20], args[21], args[22], args[23],
- args[24], args[25], args[26], args[27], args[28],
- args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38],
- args[39], args[40], args[41], args[42], args[43],
- args[44], args[45], args[46], args[47], args[48],
- args[49], args[50], args[51], args[52], args[53],
- args[54], args[55], args[56], args[57], args[58],
- args[59], args[60], args[61], args[62]);
-break;
-case 64:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *, void *, void *, void *, void *, void *, void *,
- void *, void *))fn)(
- &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
- args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
- args[13], args[14], args[15], args[16], args[17], args[18], args[19],
- args[20], args[21], args[22], args[23], args[24], args[25], args[26],
- args[27], args[28], args[29], args[30], args[31], args[32], args[33],
- args[34], args[35], args[36], args[37], args[38], args[39], args[40],
- args[41], args[42], args[43], args[44], args[45], args[46], args[47],
- args[48], args[49], args[50], args[51], args[52], args[53], args[54],
- args[55], args[56], args[57], args[58], args[59], args[60], args[61],
- args[62], args[63]);
-break;
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
deleted file mode 100644
index aac2a6005158..000000000000
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Mapping.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility(
- "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
- __omp_rtl_device_memory_pool_tracker;
-
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
-struct BumpAllocatorTy final {
-
- void *alloc(uint64_t Size) {
- Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
-
- if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
- atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
- atomic::seq_cst);
- atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
- atomic::seq_cst);
- atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
- atomic::seq_cst);
- atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
- atomic::seq_cst);
- }
-
- uint64_t *Data =
- reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
- uint64_t End =
- reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
- uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
- if (OldData + Size > End)
- __builtin_trap();
-
- return reinterpret_cast<void *>(OldData);
- }
-
- void free(void *) {}
-};
-
-BumpAllocatorTy BumpAllocator;
-
-/// allocator namespace implementation
-///
-///{
-
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
- // TODO: Check KernelEnvironment for an allocator choice as soon as we have
- // more than one.
-}
-
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
-
-///}
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
deleted file mode 100644
index 0c31c66ab2de..000000000000
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the data object of the constant device environment and the
-// query API.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "State.h"
-
-using namespace ompx;
-
-// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
-[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription =
- 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0;
-
-// This variable should be visible to the plugin so we override the default
-// hidden visibility.
-[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility(
- "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
-
-uint32_t config::getAssumeTeamsOversubscription() {
- return __omp_rtl_assume_teams_oversubscription;
-}
-
-uint32_t config::getAssumeThreadsOversubscription() {
- return __omp_rtl_assume_threads_oversubscription;
-}
-
-uint32_t config::getDebugKind() {
- return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind;
-}
-
-uint32_t config::getNumDevices() {
- return __omp_rtl_device_environment.NumDevices;
-}
-
-uint32_t config::getDeviceNum() {
- return __omp_rtl_device_environment.DeviceNum;
-}
-
-uint64_t config::getDynamicMemorySize() {
- return __omp_rtl_device_environment.DynamicMemSize;
-}
-
-uint64_t config::getClockFrequency() {
- return __omp_rtl_device_environment.ClockFrequency;
-}
-
-void *config::getIndirectCallTablePtr() {
- return reinterpret_cast<void *>(
- __omp_rtl_device_environment.IndirectCallTable);
-}
-
-uint64_t config::getHardwareParallelism() {
- return __omp_rtl_device_environment.HardwareParallelism;
-}
-
-uint64_t config::getIndirectCallTableSize() {
- return __omp_rtl_device_environment.IndirectCallTableSize;
-}
-
-bool config::isDebugMode(DeviceDebugKind Kind) {
- return config::getDebugKind() & uint32_t(Kind);
-}
-
-bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
-
-bool config::mayUseNestedParallelism() {
- if (__omp_rtl_assume_no_nested_parallelism)
- return false;
- return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
-}
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
deleted file mode 100644
index 5b5482d766b1..000000000000
--- a/offload/DeviceRTL/src/Debug.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains debug utilities
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Configuration.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-
-using namespace ompx;
-
-extern "C" {
-void __assert_assume(bool condition) { __builtin_assume(condition); }
-
-#ifndef OMPTARGET_HAS_LIBC
-[[gnu::weak]] void __assert_fail(const char *expr, const char *file,
- unsigned line, const char *function) {
- __assert_fail_internal(expr, nullptr, file, line, function);
-}
-#endif
-
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
- unsigned line, const char *function) {
- if (msg) {
- printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
- msg, expr);
- } else {
- printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
- }
- __builtin_trap();
-}
-}
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
deleted file mode 100644
index d6f8c499c890..000000000000
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceUtils.h"
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "gpuintrin.h"
-
-using namespace ompx;
-
-uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
- return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
- static_assert(sizeof(unsigned long) == 8, "");
- LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
- HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
- int32_t Width) {
- return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
-}
-
-int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
- int32_t Width) {
- int32_t Self = mapping::getThreadIdInWarp();
- int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
- return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
-}
-
-int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
- int32_t Width) {
- int32_t Self = mapping::getThreadIdInWarp();
- int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
- return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
-}
-
-uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
- return __gpu_ballot(Mask, Pred);
-}
-
-bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
-
-extern "C" {
-int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
- return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
-}
-
-int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
- return utils::shuffleDown(lanes::All, Val, Delta, Width);
-}
-}
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
deleted file mode 100644
index 467e44a65276..000000000000
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the kernel entry points for the device.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Workshare.h"
-
-using namespace ompx;
-
-// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and
-// must be kept in-sync.
-enum OMPTgtExecModeFlags : unsigned char {
- OMP_TGT_EXEC_MODE_BARE = 0,
- OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
- OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
- OMP_TGT_EXEC_MODE_GENERIC_SPMD =
- OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
-};
-
-static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
- // Order is important here.
- synchronize::init(IsSPMD);
- mapping::init(IsSPMD);
- state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
- allocator::init(IsSPMD, KernelEnvironment);
- workshare::init(IsSPMD);
-}
-
-/// Simple generic state machine for worker threads.
-static void genericStateMachine(IdentTy *Ident) {
- uint32_t TId = mapping::getThreadIdInBlock();
-
- do {
- ParallelRegionFnTy WorkFn = nullptr;
-
- // Wait for the signal that we have a new work function.
- synchronize::threads(atomic::seq_cst);
-
- // Retrieve the work function from the runtime.
- bool IsActive = __kmpc_kernel_parallel(&WorkFn);
-
- // If there is nothing more to do, break out of the state machine by
- // returning to the caller.
- if (!WorkFn)
- return;
-
- if (IsActive) {
- ASSERT(!mapping::isSPMDMode(), nullptr);
- ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
- __kmpc_kernel_end_parallel();
- }
-
- synchronize::threads(atomic::seq_cst);
-
- } while (true);
-}
-
-extern "C" {
-
-/// Initialization
-///
-/// \param Ident Source location identification, can be NULL.
-///
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
- ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
- bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
- bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
- if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
- synchronize::threadsAligned(atomic::relaxed);
- } else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
- // No need to wait since only the main threads will execute user
- // code and workers will run into a barrier right away.
- }
-
- if (IsSPMD) {
- state::assumeInitialState(IsSPMD);
-
- // Synchronize to ensure the assertions above are in an aligned region.
- // The barrier is eliminated later.
- synchronize::threadsAligned(atomic::relaxed);
- return -1;
- }
-
- if (mapping::isInitialThreadInLevel0(IsSPMD))
- return -1;
-
- // Enter the generic state machine if enabled and if this thread can possibly
- // be an active worker thread.
- //
- // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
- // GPU. In those cases, a single thread can apparently satisfy a barrier on
- // behalf of all threads in the same warp. Thus, it would not be safe for
- // other threads in the main thread's warp to reach the first
- // synchronize::threads call in genericStateMachine before the main thread
- // reaches its corresponding synchronize::threads call: that would permit all
- // active worker threads to proceed before the main thread has actually set
- // state::ParallelRegionFn, and then they would immediately quit without
- // doing any work. mapping::getMaxTeamThreads() does not include any of the
- // main thread's warp, so none of its threads can ever be active worker
- // threads.
- if (UseGenericStateMachine &&
- mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
- genericStateMachine(KernelEnvironment.Ident);
-
- return mapping::getThreadIdInBlock();
-}
-
-/// De-Initialization
-///
-/// In non-SPMD, this function releases the workers trapped in a state machine
-/// and also any memory dynamically allocated by the runtime.
-///
-/// \param Ident Source location identification, can be NULL.
-///
-void __kmpc_target_deinit() {
- bool IsSPMD = mapping::isSPMDMode();
- if (IsSPMD)
- return;
-
- if (mapping::isInitialThreadInLevel0(IsSPMD)) {
- // Signal the workers to exit the state machine and exit the kernel.
- state::ParallelRegionFn = nullptr;
- } else if (!state::getKernelEnvironment()
- .Configuration.UseGenericStateMachine) {
- // Retrieve the work function just to ensure we always call
- // __kmpc_kernel_parallel even if a custom state machine is used.
- // TODO: this is not super pretty. The problem is we create the call to
- // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it
- // is not there yet. Thus, we assume we never reach it from
- // __kmpc_target_deinit. That allows us to remove the store in there to
- // ParallelRegionFn, which leads to bad results later on.
- ParallelRegionFnTy WorkFn = nullptr;
- __kmpc_kernel_parallel(&WorkFn);
- ASSERT(WorkFn == nullptr, nullptr);
- }
-}
-
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
-}
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
deleted file mode 100644
index 83f9233d9480..000000000000
--- a/offload/DeviceRTL/src/LibC.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LibC.h"
-
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
-#else
-extern "C" int vprintf(const char *format, __builtin_va_list);
-#endif
-
-extern "C" {
-[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
- auto *L = reinterpret_cast<const unsigned char *>(lhs);
- auto *R = reinterpret_cast<const unsigned char *>(rhs);
-
- for (size_t I = 0; I < count; ++I)
- if (L[I] != R[I])
- return (int)L[I] - (int)R[I];
-
- return 0;
-}
-
-[[gnu::weak]] void memset(void *dst, int C, size_t count) {
- auto *dstc = reinterpret_cast<char *>(dst);
- for (size_t I = 0; I < count; ++I)
- dstc[I] = C;
-}
-
-[[gnu::weak]] int printf(const char *Format, ...) {
- __builtin_va_list vlist;
- __builtin_va_start(vlist, Format);
- return ::vprintf(Format, vlist);
-}
-}
-
-namespace ompx {
-[[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
- __builtin_va_list vlist;
- __builtin_va_start(vlist, Format);
- return ::vprintf(Format, vlist);
-}
-} // namespace ompx
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
deleted file mode 100644
index b145892d1ece..000000000000
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Mapping.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "State.h"
-#include "gpuintrin.h"
-
-using namespace ompx;
-
-// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI
-// is set to 'none'. We only support COV5+ but this can be removed when COV4 is
-// fully deprecated.
-#ifdef __AMDGPU__
-extern const inline uint32_t __oclc_ABI_version = 500;
-[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__;
-#endif
-
-static bool isInLastWarp() {
- uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
- ~(mapping::getWarpSize() - 1);
- return mapping::getThreadIdInBlock() == MainTId;
-}
-
-bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
- if (IsSPMD || icv::Level)
- return false;
-
- // Check if this is the last warp in the block.
- return isInLastWarp();
-}
-
-bool mapping::isMainThreadInGenericMode() {
- return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
-}
-
-bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
- if (IsSPMD)
- return mapping::getThreadIdInBlock() == 0;
- return isInLastWarp();
-}
-
-bool mapping::isLeaderInWarp() {
- __kmpc_impl_lanemask_t Active = mapping::activemask();
- __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
- return utils::popc(Active & LaneMaskLT) == 0;
-}
-
-LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-
-LaneMaskTy mapping::lanemaskLT() {
-#ifdef __NVPTX__
- return __nvvm_read_ptx_sreg_lanemask_lt();
-#else
- uint32_t Lane = mapping::getThreadIdInWarp();
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
- return Mask & Ballot;
-#endif
-}
-
-LaneMaskTy mapping::lanemaskGT() {
-#ifdef __NVPTX__
- return __nvvm_read_ptx_sreg_lanemask_gt();
-#else
- uint32_t Lane = mapping::getThreadIdInWarp();
- if (Lane == (mapping::getWarpSize() - 1))
- return 0;
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
- return Mask & Ballot;
-#endif
-}
-
-uint32_t mapping::getThreadIdInWarp() {
- uint32_t ThreadIdInWarp = __gpu_lane_id();
- ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
- return ThreadIdInWarp;
-}
-
-uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
- uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
- return ThreadIdInBlock;
-}
-
-uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
-
-uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
- uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
- // If we are in SPMD mode, remove one warp.
- return BlockSize - (!IsSPMD * mapping::getWarpSize());
-}
-uint32_t mapping::getMaxTeamThreads() {
- return mapping::getMaxTeamThreads(mapping::isSPMDMode());
-}
-
-uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
- return __gpu_num_threads(Dim);
-}
-
-uint32_t mapping::getNumberOfThreadsInKernel() {
- return mapping::getNumberOfThreadsInBlock(0) *
- mapping::getNumberOfBlocksInKernel(0) *
- mapping::getNumberOfThreadsInBlock(1) *
- mapping::getNumberOfBlocksInKernel(1) *
- mapping::getNumberOfThreadsInBlock(2) *
- mapping::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t mapping::getWarpIdInBlock() {
- uint32_t WarpID =
- mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
- ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
- return WarpID;
-}
-
-uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
- uint32_t BlockId = __gpu_block_id(Dim);
- ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
- return BlockId;
-}
-
-uint32_t mapping::getNumberOfWarpsInBlock() {
- return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
-}
-
-uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
- return __gpu_num_blocks(Dim);
-}
-
-uint32_t mapping::getNumberOfProcessorElements() {
- return static_cast<uint32_t>(config::getHardwareParallelism());
-}
-
-///}
-
-/// Execution mode
-///
-///{
-
-// TODO: This is a workaround for initialization coming from kernels outside of
-// the TU. We will need to solve this more correctly in the future.
-[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
-
-void mapping::init(bool IsSPMD) {
- if (mapping::isInitialThreadInLevel0(IsSPMD))
- IsSPMDMode = IsSPMD;
-}
-
-bool mapping::isSPMDMode() { return IsSPMDMode; }
-
-bool mapping::isGenericMode() { return !isSPMDMode(); }
-///}
-
-extern "C" {
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
- return mapping::getThreadIdInBlock();
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
- return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
- return mapping::getWarpSize();
-}
-}
-
-#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \
- extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
-
-_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
-_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
-_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
-_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
-
-extern "C" {
-uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
- return utils::ballotSync(mask, pred);
-}
-
-int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
- return utils::shuffleDown(mask, var, delta, width);
-}
-
-float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
- int width) {
- return utils::bitCast<float>(
- utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
-}
-
-long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
- return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
-}
-
-double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
- int width) {
- return utils::bitCast<double>(
- utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
-}
-}
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
deleted file mode 100644
index a89f8b2a7453..000000000000
--- a/offload/DeviceRTL/src/Misc.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "DeviceTypes.h"
-#include "Shared/RPCOpcodes.h"
-#include "shared/rpc.h"
-
-#include "Debug.h"
-
-namespace ompx {
-namespace impl {
-
-/// Lookup a device-side function using a host pointer /p HstPtr using the table
-/// provided by the device plugin. The table is an ordered pair of host and
-/// device pointers sorted on the value of the host pointer.
-void *indirectCallLookup(void *HstPtr) {
- if (!HstPtr)
- return nullptr;
-
- struct IndirectCallTable {
- void *HstPtr;
- void *DevPtr;
- };
- IndirectCallTable *Table =
- reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr());
- uint64_t TableSize = config::getIndirectCallTableSize();
-
- // If the table is empty we assume this is device pointer.
- if (!Table || !TableSize)
- return HstPtr;
-
- uint32_t Left = 0;
- uint32_t Right = TableSize;
-
- // If the pointer is definitely not contained in the table we exit early.
- if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr)
- return HstPtr;
-
- while (Left != Right) {
- uint32_t Current = Left + (Right - Left) / 2;
- if (Table[Current].HstPtr == HstPtr)
- return Table[Current].DevPtr;
-
- if (HstPtr < Table[Current].HstPtr)
- Right = Current;
- else
- Left = Current;
- }
-
- // If we searched the whole table and found nothing this is a device pointer.
- return HstPtr;
-}
-
-/// The openmp client instance used to communicate with the server.
-[[gnu::visibility("protected"),
- gnu::weak]] rpc::Client Client asm("__llvm_rpc_client");
-
-} // namespace impl
-} // namespace ompx
-
-/// Interfaces
-///
-///{
-
-extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
-
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
-
-double omp_get_wtick(void) {
- // The number of ticks per second for the AMDGPU clock varies by card and can
- // only be retrieved by querying the driver. We rely on the device environment
- // to inform us what the proper frequency is. NVPTX uses a nanosecond
- // resolution, we could omit the global read but this makes it consistent.
- return 1.0 / ompx::config::getClockFrequency();
-}
-
-double omp_get_wtime(void) {
- return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
-}
-
-void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
- return ompx::impl::indirectCallLookup(HstPtr);
-}
-
-void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
- switch (allocator) {
- case omp_default_mem_alloc:
- case omp_large_cap_mem_alloc:
- case omp_const_mem_alloc:
- case omp_high_bw_mem_alloc:
- case omp_low_lat_mem_alloc:
- return malloc(size);
- default:
- return nullptr;
- }
-}
-
-void omp_free(void *ptr, omp_allocator_handle_t allocator) {
- switch (allocator) {
- case omp_default_mem_alloc:
- case omp_large_cap_mem_alloc:
- case omp_const_mem_alloc:
- case omp_high_bw_mem_alloc:
- case omp_low_lat_mem_alloc:
- free(ptr);
- case omp_null_allocator:
- default:
- return;
- }
-}
-
-unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
- rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>();
- Port.send_n(data, size);
- Port.send([=](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = reinterpret_cast<uintptr_t>(fn);
- });
- unsigned long long Ret;
- Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
- Ret = static_cast<unsigned long long>(Buffer->data[0]);
- });
- Port.close();
- return Ret;
-}
-}
-
-///}
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
deleted file mode 100644
index 08ce616aee1c..000000000000
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Parallel implementation in the GPU. Here is the pattern:
-//
-// while (not finished) {
-//
-// if (master) {
-// sequential code, decide which par loop to do, or if finished
-// __kmpc_kernel_prepare_parallel() // exec by master only
-// }
-// syncthreads // A
-// __kmpc_kernel_parallel() // exec by all
-// if (this thread is included in the parallel) {
-// switch () for all parallel loops
-// __kmpc_kernel_end_parallel() // exec only by threads in parallel
-// }
-//
-//
-// The reason we don't exec end_parallel for the threads not included
-// in the parallel loop is that for each barrier in the parallel
-// region, these non-included threads will cycle through the
-// syncthread A. Thus they must preserve their current threadId that
-// is larger than thread in team.
-//
-// To make a long story short...
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
- uint32_t NThreadsICV =
- NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
- uint32_t NumThreads = mapping::getMaxTeamThreads();
-
- if (NThreadsICV != 0 && NThreadsICV < NumThreads)
- NumThreads = NThreadsICV;
-
- // SPMD mode allows any number of threads, for generic mode we round down to a
- // multiple of WARPSIZE since it is legal to do so in OpenMP.
- if (mapping::isSPMDMode())
- return NumThreads;
-
- if (NumThreads < mapping::getWarpSize())
- NumThreads = 1;
- else
- NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
-
- return NumThreads;
-}
-
-// Invoke an outlined parallel function unwrapping arguments (up to 32).
-[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
- int32_t bound_tid, void *fn,
- void **args, int64_t nargs) {
- switch (nargs) {
-#include "generated_microtask_cases.gen"
- default:
- printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
- __builtin_trap();
- }
-}
-
-} // namespace
-
-extern "C" {
-
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
- int32_t num_threads,
- void *fn, void **args,
- const int64_t nargs) {
- uint32_t TId = mapping::getThreadIdInBlock();
- uint32_t NumThreads = determineNumberOfThreads(num_threads);
- uint32_t PTeamSize =
- NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
- // Avoid the race between the read of the `icv::Level` above and the write
- // below by synchronizing all threads here.
- synchronize::threadsAligned(atomic::seq_cst);
- {
- // Note that the order here is important. `icv::Level` has to be updated
- // last or the other updates will cause a thread specific state to be
- // created.
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
- 1u, TId == 0, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
- /*ForceTeamState=*/true);
-
- // Synchronize all threads after the main thread (TId == 0) set up the
- // team state properly.
- synchronize::threadsAligned(atomic::acq_rel);
-
- state::ParallelTeamSize.assert_eq(PTeamSize, ident,
- /*ForceTeamState=*/true);
- icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
- icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);
-
- // Ensure we synchronize before we run user code to avoid invalidating the
- // assumptions above.
- synchronize::threadsAligned(atomic::relaxed);
-
- if (!PTeamSize || TId < PTeamSize)
- invokeMicrotask(TId, 0, fn, args, nargs);
-
- // Synchronize all threads at the end of a parallel region.
- synchronize::threadsAligned(atomic::seq_cst);
- }
-
- // Synchronize all threads to make sure every thread exits the scope above;
- // otherwise the following assertions and the assumption in
- // __kmpc_target_deinit may not hold.
- synchronize::threadsAligned(atomic::acq_rel);
-
- state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
- icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
- icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);
-
- // Ensure we synchronize to create an aligned region around the assumptions.
- synchronize::threadsAligned(atomic::relaxed);
-
- return;
-}
-
-[[clang::always_inline]] void
-__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
- int32_t num_threads, int proc_bind, void *fn,
- void *wrapper_fn, void **args, int64_t nargs) {
- uint32_t TId = mapping::getThreadIdInBlock();
-
- // Assert the parallelism level is zero if disabled by the user.
- ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
- "nested parallelism while disabled");
-
- // Handle the serialized case first, same for SPMD/non-SPMD:
- // 1) if-clause(0)
- // 2) parallel in task or other thread state inducing construct
- // 3) nested parallel regions
- if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
- (config::mayUseNestedParallelism() && icv::Level))) {
- state::DateEnvironmentRAII DERAII(ident);
- ++icv::Level;
- invokeMicrotask(TId, 0, fn, args, nargs);
- return;
- }
-
- // From this point forward we know that there is no thread state used.
- ASSERT(state::HasThreadState == false, nullptr);
-
- if (mapping::isSPMDMode()) {
- // This was moved to its own routine so it could be called directly
- // in certain situations to avoid resource consumption of unused
- // logic in parallel_51.
- __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
-
- return;
- }
-
- uint32_t NumThreads = determineNumberOfThreads(num_threads);
- uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
- uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-
- // We do *not* create a new data environment because all threads in the team
- // that are active are now running this parallel region. They share the
- // TeamState, which has an increase level-var and potentially active-level
- // set, but they do not have individual ThreadStates yet. If they ever
- // modify the ICVs beyond this point a ThreadStates will be allocated.
-
- bool IsActiveParallelRegion = NumThreads > 1;
- if (!IsActiveParallelRegion) {
- state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
- invokeMicrotask(TId, 0, fn, args, nargs);
- return;
- }
-
- void **GlobalArgs = nullptr;
- if (nargs) {
- __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
- switch (nargs) {
- default:
- for (int I = 0; I < nargs; I++)
- GlobalArgs[I] = args[I];
- break;
- case 16:
- GlobalArgs[15] = args[15];
- [[fallthrough]];
- case 15:
- GlobalArgs[14] = args[14];
- [[fallthrough]];
- case 14:
- GlobalArgs[13] = args[13];
- [[fallthrough]];
- case 13:
- GlobalArgs[12] = args[12];
- [[fallthrough]];
- case 12:
- GlobalArgs[11] = args[11];
- [[fallthrough]];
- case 11:
- GlobalArgs[10] = args[10];
- [[fallthrough]];
- case 10:
- GlobalArgs[9] = args[9];
- [[fallthrough]];
- case 9:
- GlobalArgs[8] = args[8];
- [[fallthrough]];
- case 8:
- GlobalArgs[7] = args[7];
- [[fallthrough]];
- case 7:
- GlobalArgs[6] = args[6];
- [[fallthrough]];
- case 6:
- GlobalArgs[5] = args[5];
- [[fallthrough]];
- case 5:
- GlobalArgs[4] = args[4];
- [[fallthrough]];
- case 4:
- GlobalArgs[3] = args[3];
- [[fallthrough]];
- case 3:
- GlobalArgs[2] = args[2];
- [[fallthrough]];
- case 2:
- GlobalArgs[1] = args[1];
- [[fallthrough]];
- case 1:
- GlobalArgs[0] = args[0];
- [[fallthrough]];
- case 0:
- break;
- }
- }
-
- {
- // Note that the order here is important. `icv::Level` has to be updated
- // last or the other updates will cause a thread specific state to be
- // created.
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
- 1u, true, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
- (void *)nullptr, true, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
- /*ForceTeamState=*/true);
- state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
- /*ForceTeamState=*/true);
-
- // Master signals work to activate workers.
- synchronize::threads(atomic::seq_cst);
- // Master waits for workers to signal.
- synchronize::threads(atomic::seq_cst);
- }
-
- if (nargs)
- __kmpc_end_sharing_variables();
-}
-
-[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
- // Work function and arguments for L1 parallel region.
- *WorkFn = state::ParallelRegionFn;
-
- // If this is the termination signal from the master, quit early.
- if (!*WorkFn)
- return false;
-
- // Set to true for workers participating in the parallel region.
- uint32_t TId = mapping::getThreadIdInBlock();
- bool ThreadIsActive = TId < state::getEffectivePTeamSize();
- return ThreadIsActive;
-}
-
-[[clang::noinline]] void __kmpc_kernel_end_parallel() {
- // In case we have modified an ICV for this thread before a ThreadState was
- // created. We drop it now to not contaminate the next parallel region.
- ASSERT(!mapping::isSPMDMode(), nullptr);
- uint32_t TId = mapping::getThreadIdInBlock();
- state::resetStateForThread(TId);
- ASSERT(!mapping::isSPMDMode(), nullptr);
-}
-
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
-
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
-
-void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
- int32_t thread_limit) {}
-
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
-}
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
deleted file mode 100644
index df141af5ebee..000000000000
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===------- Profiling.cpp ---------------------------------------- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Profiling.h"
-
-extern "C" {
-
-// Provides empty implementations for certain functions in compiler-rt
-// that are emitted by the PGO instrumentation.
-void __llvm_profile_register_function(void *Ptr) {}
-void __llvm_profile_register_names_function(void *Ptr, long int I) {}
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
-}
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
deleted file mode 100644
index fffd0063940c..000000000000
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of reduction with KMPC interface.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-namespace {
-
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
- for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
- shflFct(reduce_data, /*LaneId - not used= */ 0,
- /*Offset = */ mask, /*AlgoVersion=*/0);
- }
-}
-
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
- uint32_t size, uint32_t tid) {
- uint32_t curr_size;
- uint32_t mask;
- curr_size = size;
- mask = curr_size / 2;
- while (mask > 0) {
- shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
- curr_size = (curr_size + 1) / 2;
- mask = curr_size / 2;
- }
-}
-
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
- ShuffleReductFnTy shflFct) {
- uint32_t size, remote_id, physical_lane_id;
- physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
- __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
- __kmpc_impl_lanemask_t Liveness = mapping::activemask();
- uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
- __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
- do {
- Liveness = mapping::activemask();
- remote_id = utils::ffs(Liveness & lanemask_gt);
- size = utils::popc(Liveness);
- logical_lane_id /= 2;
- shflFct(reduce_data, /*LaneId =*/logical_lane_id,
- /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
- } while (logical_lane_id % 2 == 0 && size > 1);
- return (logical_lane_id == 0);
-}
-
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
- ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct) {
- uint32_t BlockThreadId = mapping::getThreadIdInBlock();
- if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
- BlockThreadId = 0;
- uint32_t NumThreads = omp_get_num_threads();
- if (NumThreads == 1)
- return 1;
-
- //
- // This reduce function handles reduction within a team. It handles
- // parallel regions in both L1 and L2 parallelism levels. It also
- // supports Generic, SPMD, and NoOMP modes.
- //
- // 1. Reduce within a warp.
- // 2. Warp master copies value to warp 0 via shared memory.
- // 3. Warp 0 reduces to a single value.
- // 4. The reduced value is available in the thread that returns 1.
- //
-
-#if __has_builtin(__nvvm_reflect)
- if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
- uint32_t WarpsNeeded =
- (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- uint32_t WarpId = mapping::getWarpIdInBlock();
-
- // Volta execution model:
- // For the Generic execution mode a parallel region either has 1 thread and
- // beyond that, always a multiple of 32. For the SPMD execution mode we may
- // have any number of threads.
- if ((NumThreads % mapping::getWarpSize() == 0) ||
- (WarpId < WarpsNeeded - 1))
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
- gpu_irregular_warp_reduce(
- reduce_data, shflFct,
- /*LaneCount=*/NumThreads % mapping::getWarpSize(),
- /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
-
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > mapping::getWarpSize()) {
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
- }
- return BlockThreadId == 0;
- }
-#endif
- __kmpc_impl_lanemask_t Liveness = mapping::activemask();
- if (Liveness == lanes::All) // Full warp
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/utils::popc(Liveness),
- /*LaneId=*/mapping::getThreadIdInBlock() %
- mapping::getWarpSize());
- else { // Dispersed lanes. Only threads in L2
- // parallel region may enter here; return
- // early.
- return gpu_irregular_simd_reduce(reduce_data, shflFct);
- }
-
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > mapping::getWarpSize()) {
- uint32_t WarpsNeeded =
- (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
-
- return BlockThreadId == 0;
- }
-
- // Get the OMP thread Id. This is different from BlockThreadId in the case
- // of an L2 parallel region.
- return BlockThreadId == 0;
-}
-
-uint32_t roundToWarpsize(uint32_t s) {
- if (s < mapping::getWarpSize())
- return 1;
- return (s & ~(unsigned)(mapping::getWarpSize() - 1));
-}
-
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-} // namespace
-
-extern "C" {
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
- uint64_t reduce_data_size,
- void *reduce_data,
- ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct) {
- return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
-}
-
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
- IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
- uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
- ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
- // Terminate all threads in non-SPMD mode except for the master thread.
- uint32_t ThreadId = mapping::getThreadIdInBlock();
- if (mapping::isGenericMode()) {
- if (!mapping::isMainThreadInGenericMode())
- return 0;
- ThreadId = 0;
- }
-
- uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
- uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
- // In non-generic mode all workers participate in the teams reduction.
- // In generic mode only the team master participates in the teams
- // reduction because the workers are waiting for parallel work.
- uint32_t NumThreads = omp_get_num_threads();
- uint32_t TeamId = omp_get_team_num();
- uint32_t NumTeams = omp_get_num_teams();
- [[clang::loader_uninitialized]] static Local<unsigned> Bound;
- [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
- // Block progress for teams greater than the current upper
- // limit. We always only allow a number of teams less or equal
- // to the number of slots in the buffer.
- bool IsMaster = (ThreadId == 0);
- while (IsMaster) {
- Bound = atomic::load(&IterCnt, atomic::acquire);
- if (TeamId < Bound + num_of_records)
- break;
- }
-
- if (IsMaster) {
- int ModBockId = TeamId % num_of_records;
- if (TeamId < num_of_records) {
- lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
- } else
- lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
- // Propagate the memory writes above to the world.
- fence::kernel(atomic::release);
-
- // Increment team counter.
- // This counter is incremented by all teams in the current
- // num_of_records chunk.
- ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
- atomic::MemScopeTy::device);
- }
-
- // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
- // state machine.
- if (mapping::isSPMDMode())
- synchronize::threadsAligned(atomic::acq_rel);
-
- // reduce_data is global or shared so before being reduced within the
- // warp we need to bring it in local memory:
- // local_reduce_data = reduce_data[i]
- //
- // Example for 3 reduction variables a, b, c (of potentially different
- // types):
- //
- // buffer layout (struct of arrays):
- // a, a, ..., a, b, b, ... b, c, c, ... c
- // |__________|
- // num_of_records
- //
- // local_data_reduce layout (struct):
- // a, b, c
- //
- // Each thread will have a local struct containing the values to be
- // reduced:
- // 1. do reduction within each warp.
- // 2. do reduction across warps.
- // 3. write the final result to the main reduction variable
- // by returning 1 in the thread holding the reduction result.
-
- // Check if this is the very last team.
- unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
- if (ChunkTeamCount == NumTeams - Bound - 1) {
- // Ensure we see the global memory writes by other teams
- fence::kernel(atomic::acquire);
-
- //
- // Last team processing.
- //
- if (ThreadId >= NumRecs)
- return 0;
- NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
- if (ThreadId >= NumThreads)
- return 0;
-
- // Load from buffer and reduce.
- glcpyFct(GlobalBuffer, ThreadId, reduce_data);
- for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
- glredFct(GlobalBuffer, i, reduce_data);
-
- // Reduce across warps to the warp master.
- if (NumThreads > 1) {
- gpu_regular_warp_reduce(reduce_data, shflFct);
-
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
- if (ActiveThreads > mapping::getWarpSize()) {
- uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- uint32_t WarpId = ThreadId / mapping::getWarpSize();
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- ThreadId);
- }
- }
-
- if (IsMaster) {
- Cnt = 0;
- IterCnt = 0;
- return 1;
- }
- return 0;
- }
- if (IsMaster && ChunkTeamCount == num_of_records - 1) {
- // Allow SIZE number of teams to proceed writing their
- // intermediate results to the global buffer.
- atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
- }
-
- return 0;
-}
-}
-
-void *__kmpc_reduction_get_fixed_buffer() {
- return state::getKernelLaunchEnvironment().ReductionBuffer;
-}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
deleted file mode 100644
index 475395102f47..000000000000
--- a/offload/DeviceRTL/src/State.cpp
+++ /dev/null
@@ -1,482 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-/// Memory implementation
-///
-///{
-
-/// External symbol to access dynamic shared memory.
-[[gnu::aligned(
- allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
-
-/// The kernel environment passed to the init method by the compiler.
-[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
- KernelEnvironmentPtr;
-
-/// The kernel launch environment passed as argument to the kernel by the
-/// runtime.
-[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
- KernelLaunchEnvironmentPtr;
-
-///}
-
-namespace {
-
-/// Fallback implementations are missing to trigger a link time error.
-/// Implementations for new devices, including the host, should go into a
-/// dedicated begin/end declare variant.
-///
-///{
-extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-
-[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
-
-#else
-
-[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
-
-#endif
-}
-///}
-
-/// A "smart" stack in shared memory.
-///
-/// The stack exposes a malloc/free interface but works like a stack internally.
-/// In fact, it is a separate stack *per warp*. That means, each warp must push
-/// and pop symmetrically or this breaks, badly. The implementation will (aim
-/// to) detect non-lock-step warps and fallback to malloc/free. The same will
-/// happen if a warp runs out of memory. The master warp in generic memory is
-/// special and is given more memory than the rest.
-///
-struct SharedMemorySmartStackTy {
- /// Initialize the stack. Must be called by all threads.
- void init(bool IsSPMD);
-
- /// Allocate \p Bytes on the stack for the encountering thread. Each thread
- /// can call this function.
- void *push(uint64_t Bytes);
-
- /// Deallocate the last allocation made by the encountering thread and pointed
- /// to by \p Ptr from the stack. Each thread can call this function.
- void pop(void *Ptr, uint64_t Bytes);
-
-private:
- /// Compute the size of the storage space reserved for a thread.
- uint32_t computeThreadStorageTotal() {
- uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
- return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock,
- allocator::ALIGNMENT);
- }
-
- /// Return the top address of the warp data stack, that is the first address
- /// this warp will allocate memory at next.
- void *getThreadDataTop(uint32_t TId) {
- return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
- }
-
- /// The actual storage, shared among all warps.
- [[gnu::aligned(
- allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
- [[gnu::aligned(
- allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
-};
-
-static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
- "Shared scratchpad of this size not supported yet.");
-
-/// The allocation of a single shared memory scratchpad.
-[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
- SharedMemorySmartStack;
-
-void SharedMemorySmartStackTy::init(bool IsSPMD) {
- Usage[mapping::getThreadIdInBlock()] = 0;
-}
-
-void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
- // First align the number of requested bytes.
- /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
- /// be passed in as an argument and the stack rewritten to support it.
- uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
-
- uint32_t StorageTotal = computeThreadStorageTotal();
-
- // The main thread in generic mode gets the space of its entire warp as the
- // other threads do not participate in any computation at all.
- if (mapping::isMainThreadInGenericMode())
- StorageTotal *= mapping::getWarpSize();
-
- int TId = mapping::getThreadIdInBlock();
- if (Usage[TId] + AlignedBytes <= StorageTotal) {
- void *Ptr = getThreadDataTop(TId);
- Usage[TId] += AlignedBytes;
- return Ptr;
- }
-
- if (config::isDebugMode(DeviceDebugKind::CommonIssues))
- printf("Shared memory stack full, fallback to dynamic allocation of global "
- "memory will negatively impact performance.\n");
- void *GlobalMemory = memory::allocGlobal(
- AlignedBytes, "Slow path shared memory allocation, insufficient "
- "shared memory stack memory!");
- ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
-
- return GlobalMemory;
-}
-
-void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
- uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
- if (utils::isSharedMemPtr(Ptr)) {
- int TId = mapping::getThreadIdInBlock();
- Usage[TId] -= AlignedBytes;
- return;
- }
- memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
-}
-
-} // namespace
-
-void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
-
-void *memory::allocShared(uint64_t Bytes, const char *Reason) {
- return SharedMemorySmartStack.push(Bytes);
-}
-
-void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
- SharedMemorySmartStack.pop(Ptr, Bytes);
-}
-
-void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
- void *Ptr = malloc(Bytes);
- if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
- printf("nullptr returned by malloc!\n");
- return Ptr;
-}
-
-void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
-
-///}
-
-bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
- return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
- (ActiveLevelVar == Other.ActiveLevelVar) &
- (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
- (RunSchedVar == Other.RunSchedVar) &
- (RunSchedChunkVar == Other.RunSchedChunkVar);
-}
-
-void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
- ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
- ASSERT(LevelVar == Other.LevelVar, nullptr);
- ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
- ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
- ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
- ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
-}
-
-void state::TeamStateTy::init(bool IsSPMD) {
- ICVState.NThreadsVar = 0;
- ICVState.LevelVar = 0;
- ICVState.ActiveLevelVar = 0;
- ICVState.Padding0Val = 0;
- ICVState.MaxActiveLevelsVar = 1;
- ICVState.RunSchedVar = omp_sched_static;
- ICVState.RunSchedChunkVar = 1;
- ParallelTeamSize = 1;
- HasThreadState = false;
- ParallelRegionFnVar = nullptr;
-}
-
-bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
- return (ICVState == Other.ICVState) &
- (HasThreadState == Other.HasThreadState) &
- (ParallelTeamSize == Other.ParallelTeamSize);
-}
-
-void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
- ICVState.assertEqual(Other.ICVState);
- ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
- ASSERT(HasThreadState == Other.HasThreadState, nullptr);
-}
-
-[[clang::loader_uninitialized]] Local<state::TeamStateTy>
- ompx::state::TeamState;
-[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
- ompx::state::ThreadStates;
-
-namespace {
-
-int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
- int OutOfBoundsVal = -1) {
- if (Level == 0)
- return DefaultVal;
- int LevelVar = omp_get_level();
- if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
- return OutOfBoundsVal;
- int ActiveLevel = icv::ActiveLevel;
- if (OMP_UNLIKELY(Level != ActiveLevel))
- return DefaultVal;
- return Val;
-}
-
-} // namespace
-
-void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
- SharedMemorySmartStack.init(IsSPMD);
- if (mapping::isInitialThreadInLevel0(IsSPMD)) {
- TeamState.init(IsSPMD);
- ThreadStates = nullptr;
- KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
- }
-}
-
-KernelEnvironmentTy &state::getKernelEnvironment() {
- return *KernelEnvironmentPtr;
-}
-
-KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
- return *KernelLaunchEnvironmentPtr;
-}
-
-void state::enterDataEnvironment(IdentTy *Ident) {
- ASSERT(config::mayUseThreadStates(),
- "Thread state modified while explicitly disabled!");
- if (!config::mayUseThreadStates())
- return;
-
- unsigned TId = mapping::getThreadIdInBlock();
- ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
- memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
- uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
- if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
- uint32_t Bytes =
- sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
- void *ThreadStatesPtr =
- memory::allocGlobal(Bytes, "Thread state array allocation");
- __builtin_memset(ThreadStatesPtr, 0, Bytes);
- if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
- reinterpret_cast<uintptr_t>(ThreadStatesPtr),
- atomic::seq_cst, atomic::seq_cst))
- memory::freeGlobal(ThreadStatesPtr,
- "Thread state array allocated multiple times");
- ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
- "Expected valid thread states bit!");
- }
- NewThreadState->init(ThreadStates[TId]);
- TeamState.HasThreadState = true;
- ThreadStates[TId] = NewThreadState;
-}
-
-void state::exitDataEnvironment() {
- ASSERT(config::mayUseThreadStates(),
- "Thread state modified while explicitly disabled!");
-
- unsigned TId = mapping::getThreadIdInBlock();
- resetStateForThread(TId);
-}
-
-void state::resetStateForThread(uint32_t TId) {
- if (!config::mayUseThreadStates())
- return;
- if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
- return;
-
- ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
- memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
- ThreadStates[TId] = PreviousThreadState;
-}
-
-void state::runAndCheckState(void(Func(void))) {
- TeamStateTy OldTeamState = TeamState;
- OldTeamState.assertEqual(TeamState);
-
- Func();
-
- OldTeamState.assertEqual(TeamState);
-}
-
-void state::assumeInitialState(bool IsSPMD) {
- TeamStateTy InitialTeamState;
- InitialTeamState.init(IsSPMD);
- InitialTeamState.assertEqual(TeamState);
- ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
-}
-
-int state::getEffectivePTeamSize() {
- int PTeamSize = state::ParallelTeamSize;
- return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
-}
-
-extern "C" {
-void omp_set_dynamic(int V) {}
-
-int omp_get_dynamic(void) { return 0; }
-
-void omp_set_num_threads(int V) { icv::NThreads = V; }
-
-int omp_get_max_threads(void) {
- int NT = icv::NThreads;
- return NT > 0 ? NT : mapping::getMaxTeamThreads();
-}
-
-int omp_get_level(void) {
- int LevelVar = icv::Level;
- ASSERT(LevelVar >= 0, nullptr);
- return LevelVar;
-}
-
-int omp_get_active_level(void) { return !!icv::ActiveLevel; }
-
-int omp_in_parallel(void) { return !!icv::ActiveLevel; }
-
-void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
- *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
- *ChunkSize = state::RunSchedChunk;
-}
-
-void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
- icv::RunSched = (int)ScheduleKind;
- state::RunSchedChunk = ChunkSize;
-}
-
-int omp_get_ancestor_thread_num(int Level) {
- return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
-}
-
-int omp_get_thread_num(void) {
- return omp_get_ancestor_thread_num(omp_get_level());
-}
-
-int omp_get_team_size(int Level) {
- return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
-}
-
-int omp_get_num_threads(void) {
- return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
-}
-
-int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
-
-int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
-
-void omp_set_nested(int) {}
-
-int omp_get_nested(void) { return false; }
-
-void omp_set_max_active_levels(int Levels) {
- icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
-}
-
-int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
-
-omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
-
-int omp_get_num_places(void) { return 0; }
-
-int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
-
-void omp_get_place_proc_ids(int, int *) {
- // TODO
-}
-
-int omp_get_place_num(void) { return 0; }
-
-int omp_get_partition_num_places(void) { return 0; }
-
-void omp_get_partition_place_nums(int *) {
- // TODO
-}
-
-int omp_get_cancellation(void) { return 0; }
-
-void omp_set_default_device(int) {}
-
-int omp_get_default_device(void) { return -1; }
-
-int omp_get_num_devices(void) { return config::getNumDevices(); }
-
-int omp_get_device_num(void) { return config::getDeviceNum(); }
-
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
-
-int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
-
-int omp_get_initial_device(void) { return -1; }
-
-int omp_is_initial_device(void) { return 0; }
-}
-
-extern "C" {
-[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
- return memory::allocShared(Bytes, "Frontend alloc shared");
-}
-
-[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
- memory::freeShared(Ptr, Bytes, "Frontend free shared");
-}
-
-void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
-
-void *llvm_omp_target_dynamic_shared_alloc() {
- return __kmpc_get_dynamic_shared();
-}
-
-void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
-
-/// Allocate storage in shared memory to communicate arguments from the main
-/// thread to the workers in generic mode. If we exceed
-/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
-constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
-
-[[clang::loader_uninitialized]] static Local<void *>
- SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-[[clang::loader_uninitialized]] static Local<void **>
- SharedMemVariableSharingSpacePtr;
-
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
- if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
- SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
- } else {
- SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
- nArgs * sizeof(void *), "new extended args");
- ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
- "Nullptr returned by malloc!");
- }
- *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-
-void __kmpc_end_sharing_variables() {
- if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
- memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
-}
-
-void __kmpc_get_shared_variables(void ***GlobalArgs) {
- *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-}
diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp
deleted file mode 100644
index e833423eb265..000000000000
--- a/offload/DeviceRTL/src/Stub.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// This is an empty file used to create a device fatbinary.
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
deleted file mode 100644
index 2f1ed34a3f6d..000000000000
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Include all synchronization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Synchronization.h"
-
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-
-using namespace ompx;
-
-namespace impl {
-
-/// Atomics
-///
-///{
-///}
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope) {
- // builtin_amdgcn_atomic_inc32 should expand to this switch when
- // passed a runtime value, but does not do so yet. Workaround here.
-
-#define ScopeSwitch(ORDER) \
- switch (MemScope) { \
- case atomic::MemScopeTy::system: \
- return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \
- case atomic::MemScopeTy::device: \
- return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \
- case atomic::MemScopeTy::workgroup: \
- return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \
- case atomic::MemScopeTy::wavefront: \
- return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront"); \
- case atomic::MemScopeTy::single: \
- return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread"); \
- }
-
-#define Case(ORDER) \
- case ORDER: \
- ScopeSwitch(ORDER)
-
- switch (Ordering) {
- default:
- __builtin_unreachable();
- Case(atomic::relaxed);
- Case(atomic::acquire);
- Case(atomic::release);
- Case(atomic::acq_rel);
- Case(atomic::seq_cst);
-#undef Case
-#undef ScopeSwitch
- }
-}
-
-[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
-
-void namedBarrierInit() {
- // Don't have global ctors, and shared memory is not zero init
- atomic::store(&namedBarrierTracker, 0u, atomic::release);
-}
-
-void namedBarrier() {
- uint32_t NumThreads = omp_get_num_threads();
- // assert(NumThreads % 32 == 0);
-
- uint32_t WarpSize = mapping::getWarpSize();
- uint32_t NumWaves = NumThreads / WarpSize;
-
- fence::team(atomic::acquire);
-
- // named barrier implementation for amdgcn.
- // Uses two 16 bit unsigned counters. One for the number of waves to have
- // reached the barrier, and one to count how many times the barrier has been
- // passed. These are packed in a single atomically accessed 32 bit integer.
- // Low bits for the number of waves, assumed zero before this call.
- // High bits to count the number of times the barrier has been passed.
-
- // precondition: NumWaves != 0;
- // invariant: NumWaves * WarpSize == NumThreads;
- // precondition: NumWaves < 0xffffu;
-
- // Increment the low 16 bits once, using the lowest active thread.
- if (mapping::isLeaderInWarp()) {
- uint32_t load = atomic::add(&namedBarrierTracker, 1,
- atomic::relaxed); // commutative
-
- // Record the number of times the barrier has been passed
- uint32_t generation = load & 0xffff0000u;
-
- if ((load & 0x0000ffffu) == (NumWaves - 1)) {
- // Reached NumWaves in low bits so this is the last wave.
- // Set low bits to zero and increment high bits
- load += 0x00010000u; // wrap is safe
- load &= 0xffff0000u; // because bits zeroed second
-
- // Reset the wave counter and release the waiting waves
- atomic::store(&namedBarrierTracker, load, atomic::relaxed);
- } else {
- // more waves still to go, spin until generation counter changes
- do {
- __builtin_amdgcn_s_sleep(0);
- load = atomic::load(&namedBarrierTracker, atomic::relaxed);
- } while ((load & 0xffff0000u) == generation);
- }
- }
- fence::team(atomic::release);
-}
-
-void fenceTeam(atomic::OrderingTy Ordering) {
- return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
-}
-
-void fenceKernel(atomic::OrderingTy Ordering) {
- return __scoped_atomic_thread_fence(Ordering, atomic::device);
-}
-
-void fenceSystem(atomic::OrderingTy Ordering) {
- return __scoped_atomic_thread_fence(Ordering, atomic::system);
-}
-
-void syncWarp(__kmpc_impl_lanemask_t) {
- // This is a no-op on current AMDGPU hardware but it is used by the optimizer
- // to enforce convergent behaviour between control flow graphs.
- __builtin_amdgcn_wave_barrier();
-}
-
-void syncThreads(atomic::OrderingTy Ordering) {
- if (Ordering != atomic::relaxed)
- fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
-
- __builtin_amdgcn_s_barrier();
-
- if (Ordering != atomic::relaxed)
- fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
-}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-
-// TODO: Don't have wavefront lane locks. Possibly can't have them.
-void unsetLock(omp_lock_t *) { __builtin_trap(); }
-int testLock(omp_lock_t *) { __builtin_trap(); }
-void initLock(omp_lock_t *) { __builtin_trap(); }
-void destroyLock(omp_lock_t *) { __builtin_trap(); }
-void setLock(omp_lock_t *) { __builtin_trap(); }
-
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-void unsetCriticalLock(omp_lock_t *Lock) {
- (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
-}
-
-void setCriticalLock(omp_lock_t *Lock) {
- uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
- if (mapping::getThreadIdInWarp() == LowestActiveThread) {
- fenceKernel(atomic::release);
- while (
- !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
- __builtin_amdgcn_s_sleep(32);
- }
- fenceKernel(atomic::acquire);
- }
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope) {
- return __nvvm_atom_inc_gen_ui(Address, Val);
-}
-
-void namedBarrierInit() {}
-
-void namedBarrier() {
- uint32_t NumThreads = omp_get_num_threads();
- ASSERT(NumThreads % 32 == 0, nullptr);
-
- // The named barrier for active parallel threads of a team in an L1 parallel
- // region to synchronize with each other.
- constexpr int BarrierNo = 7;
- __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
-}
-
-void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
-
-void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }
-
-void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
-
-void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
-
-void syncThreads(atomic::OrderingTy Ordering) {
- constexpr int BarrierNo = 8;
- __nvvm_barrier_sync(BarrierNo);
-}
-
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
-
-constexpr uint32_t OMP_SPIN = 1000;
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-// TODO: This seems to hide a bug in the declare variant handling. If it is
-// called before it is defined
-// here the overload won't happen. Investigate lalter!
-void unsetLock(omp_lock_t *Lock) {
- (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
-}
-
-int testLock(omp_lock_t *Lock) {
- return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst);
-}
-
-void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void setLock(omp_lock_t *Lock) {
- // TODO: not sure spinning is a good idea here..
- while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
- atomic::seq_cst) != UNSET) {
- int32_t start = __nvvm_read_ptx_sreg_clock();
- int32_t now;
- for (;;) {
- now = __nvvm_read_ptx_sreg_clock();
- int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
- if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
- break;
- }
- }
- } // wait for 0 to be the read value
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
-
-#endif
-///}
-
-} // namespace impl
-
-void synchronize::init(bool IsSPMD) {
- if (!IsSPMD)
- impl::namedBarrierInit();
-}
-
-void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-
-void synchronize::threads(atomic::OrderingTy Ordering) {
- impl::syncThreads(Ordering);
-}
-
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
- impl::syncThreadsAligned(Ordering);
-}
-
-void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
-
-void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
-
-void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
-
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope) {
- return impl::atomicInc(Addr, V, Ordering, MemScope);
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-extern "C" {
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
-
-int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
- __kmpc_barrier(Loc, TId);
- return 0;
-}
-
-void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
- if (mapping::isSPMDMode())
- return __kmpc_barrier_simple_spmd(Loc, TId);
-
- // Generic parallel regions are run with multiple of the warp size or single
- // threaded, in the latter case we need to stop here.
- if (omp_get_num_threads() == 1)
- return __kmpc_flush(Loc);
-
- impl::namedBarrier();
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
- synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
- int32_t TId) {
- synchronize::threads(atomic::OrderingTy::seq_cst);
-}
-
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
- return omp_get_thread_num() == 0;
-}
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
-
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
- return omp_get_thread_num() == Filter;
-}
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}
-
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
- return __kmpc_master(Loc, TId);
-}
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
- // The barrier is explicitly called.
-}
-
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }
-
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
-
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
-
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
- impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
- impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
-
-void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
-
-void ompx_sync_block(int Ordering) {
- impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
-}
-void ompx_sync_block_acq_rel() {
- impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
-}
-void ompx_sync_block_divergent(int Ordering) {
- impl::syncThreads(atomic::OrderingTy(Ordering));
-}
-} // extern "C"
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
deleted file mode 100644
index d0be0ace50df..000000000000
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Task implementation support.
-//
-// TODO: We should not allocate and execute the task in two steps. A new API is
-// needed for that though.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "State.h"
-
-using namespace ompx;
-
-extern "C" {
-
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
- size_t TaskSizeInclPrivateValues,
- size_t SharedValuesSize,
- TaskFnTy TaskFn) {
- auto TaskSizeInclPrivateValuesPadded =
- utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *));
- auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
- TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
- TaskSizeTotal, "explicit task descriptor");
- TaskDescriptor->Payload =
- utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
- TaskDescriptor->TaskFn = TaskFn;
-
- return TaskDescriptor;
-}
-
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor) {
- return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-}
-
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int32_t,
- void *, int32_t, void *) {
- state::DateEnvironmentRAII DERAII(Loc);
-
- TaskDescriptor->TaskFn(0, TaskDescriptor);
-
- memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
- return 0;
-}
-
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor) {
- state::enterDataEnvironment(Loc);
-}
-
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor) {
- state::exitDataEnvironment();
-
- memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-}
-
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
- void *) {}
-
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
-
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
-
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
-
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
-
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
- TaskDescriptorTy *TaskDescriptor, int,
- uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
- int32_t, uint64_t, void *) {
- // Skip task entirely if empty iteration space.
- if (*LowerBound > *UpperBound)
- return;
-
- // The compiler has already stored lb and ub in the TaskDescriptorTy structure
- // as we are using a single task to execute the entire loop, we can leave
- // the initial task_t untouched
- __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-}
-
-int omp_in_final(void) {
- // treat all tasks as final... Specs may expect runtime to keep
- // track more precisely if a task was actively set by users... This
- // is not explicitly specified; will treat as if runtime can
- // actively decide to put a non-final task into a final one.
- return 1;
-}
-
-int omp_get_max_task_priority(void) { return 0; }
-}
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
deleted file mode 100644
index 59a2cc3f27ac..000000000000
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ /dev/null
@@ -1,970 +0,0 @@
-//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the KMPC interface
-// for the loop construct plus other worksharing constructs that use the same
-// interface as loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Workshare.h"
-#include "Debug.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-
-using namespace ompx;
-
-// TODO:
-struct DynamicScheduleTracker {
- int64_t Chunk;
- int64_t LoopUpperBound;
- int64_t NextLowerBound;
- int64_t Stride;
- kmp_sched_t ScheduleType;
- DynamicScheduleTracker *NextDST;
-};
-
-#define ASSERT0(...)
-
-// used by the library for the interface with the app
-#define DISPATCH_FINISHED 0
-#define DISPATCH_NOTFINISHED 1
-
-// used by dynamic scheduling
-#define FINISHED 0
-#define NOT_FINISHED 1
-#define LAST_CHUNK 2
-
-// TODO: This variable is a hack inherited from the old runtime.
-[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
-
-template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
- ////////////////////////////////////////////////////////////////////////////////
- // Loop with static scheduling with chunk
-
- // Generic implementation of OMP loop scheduling with static policy
- /*! \brief Calculate initial bounds for static loop and stride
- * @param[in] loc location in code of the call (not used here)
- * @param[in] global_tid global thread id
- * @param[in] schetype type of scheduling (see omptarget-nvptx.h)
- * @param[in] plastiter pointer to last iteration
- * @param[in,out] pointer to loop lower bound. it will contain value of
- * lower bound of first chunk
- * @param[in,out] pointer to loop upper bound. It will contain value of
- * upper bound of first chunk
- * @param[in,out] pointer to loop stride. It will contain value of stride
- * between two successive chunks executed by the same thread
- * @param[in] loop increment bump
- * @param[in] chunk size
- */
-
- // helper function for static chunk
- static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
- T entityId, T numberOfEntities) {
- // each thread executes multiple chunks all of the same size, except
- // the last one
- // distance between two successive chunks
- stride = numberOfEntities * chunk;
- lb = lb + entityId * chunk;
- T inputUb = ub;
- ub = lb + chunk - 1; // Clang uses i <= ub
- // Say ub' is the beginning of the last chunk. Then who ever has a
- // lower bound plus a multiple of the increment equal to ub' is
- // the last one.
- T beginingLastChunk = inputUb - (inputUb % chunk);
- last = ((beginingLastChunk - lb) % stride) == 0;
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Loop with static scheduling without chunk
-
- // helper function for static no chunk
- static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
- T entityId, T numberOfEntities) {
- // No chunk size specified. Each thread or warp gets at most one
- // chunk; chunks are all almost of equal size
- T loopSize = ub - lb + 1;
-
- chunk = loopSize / numberOfEntities;
- T leftOver = loopSize - chunk * numberOfEntities;
-
- if (entityId < leftOver) {
- chunk++;
- lb = lb + entityId * chunk;
- } else {
- lb = lb + entityId * chunk + leftOver;
- }
-
- T inputUb = ub;
- ub = lb + chunk - 1; // Clang uses i <= ub
- last = lb <= inputUb && inputUb <= ub;
- stride = loopSize; // make sure we only do 1 chunk per warp
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for Static Init
-
- static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
- T *plower, T *pupper, ST *pstride, ST chunk,
- bool IsSPMDExecutionMode) {
- int32_t gtid = omp_get_thread_num();
- int numberOfActiveOMPThreads = omp_get_num_threads();
-
- // All warps that are in excess of the maximum requested, do
- // not execute the loop
- ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
- "current thread is not needed here; error");
-
- // copy
- int lastiter = 0;
- T lb = *plower;
- T ub = *pupper;
- ST stride = *pstride;
-
- // init
- switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
- case kmp_sched_static_chunk: {
- if (chunk > 0) {
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- [[fallthrough]];
- } // note: if chunk <=0, use nochunk
- case kmp_sched_static_balanced_chunk: {
- if (chunk > 0) {
- // round up to make sure the chunk is enough to cover all iterations
- T tripCount = ub - lb + 1; // +1 because ub is inclusive
- T span = (tripCount + numberOfActiveOMPThreads - 1) /
- numberOfActiveOMPThreads;
- // perform chunk adjustment
- chunk = (span + chunk - 1) & ~(chunk - 1);
-
- ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
- T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- if (ub > oldUb)
- ub = oldUb;
- break;
- }
- [[fallthrough]];
- } // note: if chunk <=0, use nochunk
- case kmp_sched_static_nochunk: {
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- case kmp_sched_distr_static_chunk: {
- if (chunk > 0) {
- ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
- omp_get_num_teams());
- break;
- }
- [[fallthrough]];
- } // note: if chunk <=0, use nochunk
- case kmp_sched_distr_static_nochunk: {
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
- omp_get_num_teams());
- break;
- }
- case kmp_sched_distr_static_chunk_sched_static_chunkone: {
- ForStaticChunk(lastiter, lb, ub, stride, chunk,
- numberOfActiveOMPThreads * omp_get_team_num() + gtid,
- omp_get_num_teams() * numberOfActiveOMPThreads);
- break;
- }
- default: {
- // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- }
- // copy back
- *plastiter = lastiter;
- *plower = lb;
- *pupper = ub;
- *pstride = stride;
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for dispatch Init
-
- static int OrderedSchedule(kmp_sched_t schedule) {
- return schedule >= kmp_sched_ordered_first &&
- schedule <= kmp_sched_ordered_last;
- }
-
- static void dispatch_init(IdentTy *loc, int32_t threadId,
- kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
- DynamicScheduleTracker *DST) {
- int tid = mapping::getThreadIdInBlock();
- T tnum = omp_get_num_threads();
- T tripCount = ub - lb + 1; // +1 because ub is inclusive
- ASSERT0(LT_FUSSY, threadId < tnum,
- "current thread is not needed here; error");
-
- /* Currently just ignore the monotonic and non-monotonic modifiers
- * (the compiler isn't producing them * yet anyway).
- * When it is we'll want to look at them somewhere here and use that
- * information to add to our schedule choice. We shouldn't need to pass
- * them on, they merely affect which schedule we can legally choose for
- * various dynamic cases. (In particular, whether or not a stealing scheme
- * is legal).
- */
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
-
- // Process schedule.
- if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
- if (OrderedSchedule(schedule))
- __kmpc_barrier(loc, threadId);
- schedule = kmp_sched_static_chunk;
- chunk = tripCount; // one thread gets the whole loop
- } else if (schedule == kmp_sched_runtime) {
- // process runtime
- omp_sched_t rtSched;
- int ChunkInt;
- omp_get_schedule(&rtSched, &ChunkInt);
- chunk = ChunkInt;
- switch (rtSched) {
- case omp_sched_static: {
- if (chunk > 0)
- schedule = kmp_sched_static_chunk;
- else
- schedule = kmp_sched_static_nochunk;
- break;
- }
- case omp_sched_auto: {
- schedule = kmp_sched_static_chunk;
- chunk = 1;
- break;
- }
- case omp_sched_dynamic:
- case omp_sched_guided: {
- schedule = kmp_sched_dynamic;
- break;
- }
- }
- } else if (schedule == kmp_sched_auto) {
- schedule = kmp_sched_static_chunk;
- chunk = 1;
- } else {
- // ASSERT(LT_FUSSY,
- // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
- // "unknown schedule %d & chunk %lld\n", (int)schedule,
- // (long long)chunk);
- }
-
- // init schedules
- if (schedule == kmp_sched_static_chunk) {
- ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
- // save sched state
- DST->ScheduleType = schedule;
- // save ub
- DST->LoopUpperBound = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- // save computed params
- DST->Chunk = chunk;
- DST->NextLowerBound = lb;
- DST->Stride = stride;
- } else if (schedule == kmp_sched_static_balanced_chunk) {
- ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
- // save sched state
- DST->ScheduleType = schedule;
- // save ub
- DST->LoopUpperBound = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- // round up to make sure the chunk is enough to cover all iterations
- T span = (tripCount + tnum - 1) / tnum;
- // perform chunk adjustment
- chunk = (span + chunk - 1) & ~(chunk - 1);
-
- T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
- if (ub > oldUb)
- ub = oldUb;
- // save computed params
- DST->Chunk = chunk;
- DST->NextLowerBound = lb;
- DST->Stride = stride;
- } else if (schedule == kmp_sched_static_nochunk) {
- ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
- // save sched state
- DST->ScheduleType = schedule;
- // save ub
- DST->LoopUpperBound = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- // save computed params
- DST->Chunk = chunk;
- DST->NextLowerBound = lb;
- DST->Stride = stride;
- } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- // save data
- DST->ScheduleType = schedule;
- if (chunk < 1)
- chunk = 1;
- DST->Chunk = chunk;
- DST->LoopUpperBound = ub;
- DST->NextLowerBound = lb;
- __kmpc_barrier(loc, threadId);
- if (tid == 0) {
- Cnt = 0;
- fence::team(atomic::seq_cst);
- }
- __kmpc_barrier(loc, threadId);
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for dispatch next
-
- static uint64_t NextIter() {
- __kmpc_impl_lanemask_t active = mapping::activemask();
- uint32_t leader = utils::ffs(active) - 1;
- uint32_t change = utils::popc(active);
- __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
- unsigned int rank = utils::popc(active & lane_mask_lt);
- uint64_t warp_res = 0;
- if (rank == 0) {
- warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
- }
- warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
- return warp_res + rank;
- }
-
- static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
- T loopUpperBound) {
- T N = NextIter();
- lb = loopLowerBound + N * chunkSize;
- ub = lb + chunkSize - 1; // Clang uses i <= ub
-
- // 3 result cases:
- // a. lb and ub < loopUpperBound --> NOT_FINISHED
- // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
- // NOT_FINISHED
- // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
- // a.
- if (lb <= loopUpperBound && ub < loopUpperBound) {
- return NOT_FINISHED;
- }
- // b.
- if (lb <= loopUpperBound) {
- ub = loopUpperBound;
- return LAST_CHUNK;
- }
- // c. if we are here, we are in case 'c'
- lb = loopUpperBound + 2;
- ub = loopUpperBound + 1;
- return FINISHED;
- }
-
- static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
- T *plower, T *pupper, ST *pstride,
- DynamicScheduleTracker *DST) {
- // ID of a thread in its own warp
-
- // automatically selects thread or warp ID based on selected implementation
- ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
- "current thread is not needed here; error");
- // retrieve schedule
- kmp_sched_t schedule = DST->ScheduleType;
-
- // xxx reduce to one
- if (schedule == kmp_sched_static_chunk ||
- schedule == kmp_sched_static_nochunk) {
- T myLb = DST->NextLowerBound;
- T ub = DST->LoopUpperBound;
- // finished?
- if (myLb > ub) {
- return DISPATCH_FINISHED;
- }
- // not finished, save current bounds
- ST chunk = DST->Chunk;
- *plower = myLb;
- T myUb = myLb + chunk - 1; // Clang uses i <= ub
- if (myUb > ub)
- myUb = ub;
- *pupper = myUb;
- *plast = (int32_t)(myUb == ub);
-
- // increment next lower bound by the stride
- ST stride = DST->Stride;
- DST->NextLowerBound = myLb + stride;
- return DISPATCH_NOTFINISHED;
- }
- ASSERT0(LT_FUSSY,
- schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
- "bad sched");
- T myLb, myUb;
- int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
- DST->LoopUpperBound);
-
- if (finished == FINISHED)
- return DISPATCH_FINISHED;
-
- // not finished (either not finished or last chunk)
- *plast = (int32_t)(finished == LAST_CHUNK);
- *plower = myLb;
- *pupper = myUb;
- *pstride = 1;
-
- return DISPATCH_NOTFINISHED;
- }
-
- static void dispatch_fini() {
- // nothing
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // end of template class that encapsulate all the helper functions
- ////////////////////////////////////////////////////////////////////////////////
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (dyn loops)
-////////////////////////////////////////////////////////////////////////////////
-
-// TODO: Expand the dispatch API to take a DST pointer which can then be
-// allocated properly without malloc.
-// For now, each team will contain an LDS pointer (ThreadDST) to a global array
-// of references to the DST structs allocated (in global memory) for each thread
-// in the team. The global memory array is allocated during the init phase if it
-// was not allocated already and will be deallocated when the dispatch phase
-// ends:
-//
-// __kmpc_dispatch_init
-//
-// ** Dispatch loop **
-//
-// __kmpc_dispatch_deinit
-//
-[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
- ThreadDST;
-
-// Create a new DST, link the current one, and define the new as current.
-static DynamicScheduleTracker *pushDST() {
- int32_t ThreadIndex = mapping::getThreadIdInBlock();
- // Each block will allocate an array of pointers to DST structs. The array is
- // equal in length to the number of threads in that block.
- if (!ThreadDST) {
- // Allocate global memory array of pointers to DST structs:
- if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
- ThreadDST = static_cast<DynamicScheduleTracker **>(
- memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
- sizeof(DynamicScheduleTracker *),
- "new ThreadDST array"));
- synchronize::threads(atomic::seq_cst);
-
- // Initialize the array pointers:
- ThreadDST[ThreadIndex] = nullptr;
- }
-
- // Create a DST struct for the current thread:
- DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
- memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
- *NewDST = DynamicScheduleTracker({0});
-
- // Add the new DST struct to the array of DST structs:
- NewDST->NextDST = ThreadDST[ThreadIndex];
- ThreadDST[ThreadIndex] = NewDST;
- return NewDST;
-}
-
-// Return the current DST.
-static DynamicScheduleTracker *peekDST() {
- return ThreadDST[mapping::getThreadIdInBlock()];
-}
-
-// Pop the current DST and restore the last one.
-static void popDST() {
- int32_t ThreadIndex = mapping::getThreadIdInBlock();
- DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
- DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
- memory::freeGlobal(CurrentDST, "remove DST");
- ThreadDST[ThreadIndex] = OldDST;
-
- // Check if we need to deallocate the global array. Ensure all threads
- // in the block have finished deallocating the individual DSTs.
- synchronize::threads(atomic::seq_cst);
- if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
- memory::freeGlobal(ThreadDST, "remove ThreadDST array");
- ThreadDST = nullptr;
- }
- synchronize::threads(atomic::seq_cst);
-}
-
-void workshare::init(bool IsSPMD) {
- if (mapping::isInitialThreadInLevel0(IsSPMD))
- ThreadDST = nullptr;
-}
-
-extern "C" {
-
-// init
-void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
- int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
- uint32_t lb, uint32_t ub, int32_t st,
- int32_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
- int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
- uint64_t lb, uint64_t ub, int64_t st,
- int64_t chunk) {
- DynamicScheduleTracker *DST = pushDST();
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-// next
-int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
- int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
- uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
- int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
- uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
- DynamicScheduleTracker *DST = peekDST();
- return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-// fini
-void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
-}
-
-void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
-}
-
-// deinit
-void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (static loops)
-////////////////////////////////////////////////////////////////////////////////
-
-void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr, int32_t chunk) {
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr, int32_t chunk) {
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr, int64_t chunk) {
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr, int64_t chunk) {
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
-
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
-}
-
-namespace ompx {
-
-/// Helper class to hide the generic loop nest and provide the template argument
-/// throughout.
-template <typename Ty> class StaticLoopChunker {
-
- /// Generic loop nest that handles block and/or thread distribution in the
- /// absence of user specified chunk sizes. This implicitly picks a block chunk
- /// size equal to the number of threads in the block and a thread chunk size
- /// equal to one. In contrast to the chunked version we can get away with a
- /// single loop in this case
- static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumBlocks, Ty BId, Ty NumThreads,
- Ty TId, Ty NumIters,
- uint8_t OneIterationPerThread) {
- Ty KernelIteration = NumBlocks * NumThreads;
-
- // Start index in the normalized space.
- Ty IV = BId * NumThreads + TId;
- ASSERT(IV >= 0, "Bad index");
-
- // Cover the entire iteration space, assumptions in the caller might allow
- // to simplify this loop to a conditional.
- if (IV < NumIters) {
- do {
-
- // Execute the loop body.
- LoopBody(IV, Arg);
-
- // Every thread executed one block and thread chunk now.
- IV += KernelIteration;
-
- if (OneIterationPerThread)
- return;
-
- } while (IV < NumIters);
- }
- }
-
- /// Generic loop nest that handles block and/or thread distribution in the
- /// presence of user specified chunk sizes (for at least one of them).
- static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
- Ty BlockChunk, Ty NumBlocks, Ty BId,
- Ty ThreadChunk, Ty NumThreads, Ty TId,
- Ty NumIters,
- uint8_t OneIterationPerThread) {
- Ty KernelIteration = NumBlocks * BlockChunk;
-
- // Start index in the chunked space.
- Ty IV = BId * BlockChunk + TId;
- ASSERT(IV >= 0, "Bad index");
-
- // Cover the entire iteration space, assumptions in the caller might allow
- // to simplify this loop to a conditional.
- do {
-
- Ty BlockChunkLeft =
- BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
- Ty ThreadChunkLeft =
- ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-
- while (ThreadChunkLeft--) {
-
- // Given the blocking it's hard to keep track of what to execute.
- if (IV >= NumIters)
- return;
-
- // Execute the loop body.
- LoopBody(IV, Arg);
-
- if (OneIterationPerThread)
- return;
-
- ++IV;
- }
-
- IV += KernelIteration;
-
- } while (IV < NumIters);
- }
-
-public:
- /// Worksharing `for`-loop.
- /// \param[in] Loc Description of source location
- /// \param[in] LoopBody Function which corresponds to loop body
- /// \param[in] Arg Pointer to struct which contains loop body args
- /// \param[in] NumIters Number of loop iterations
- /// \param[in] NumThreads Number of GPU threads
- /// \param[in] ThreadChunk Size of thread chunk
- /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
- /// only one loop iteration or one thread chunk. This avoids an outer loop
- /// over all loop iterations/chunks.
- static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty NumThreads, Ty ThreadChunk,
- uint8_t OneIterationPerThread) {
- ASSERT(NumIters >= 0, "Bad iteration count");
- ASSERT(ThreadChunk >= 0, "Bad thread count");
-
- // All threads need to participate but we don't know if we are in a
- // parallel at all or if the user might have used a `num_threads` clause
- // on the parallel and reduced the number compared to the block size.
- // Since nested parallels are possible too we need to get the thread id
- // from the `omp` getter and not the mapping directly.
- Ty TId = omp_get_thread_num();
-
- // There are no blocks involved here.
- Ty BlockChunk = 0;
- Ty NumBlocks = 1;
- Ty BId = 0;
-
- // If the thread chunk is not specified we pick a default now.
- if (ThreadChunk == 0)
- ThreadChunk = 1;
-
- // If we know we have more threads than iterations we can indicate that to
- // avoid an outer loop.
- if (config::getAssumeThreadsOversubscription()) {
- OneIterationPerThread = true;
- }
-
- if (OneIterationPerThread)
- ASSERT(NumThreads >= NumIters, "Broken assumption");
-
- if (ThreadChunk != 1)
- NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
- ThreadChunk, NumThreads, TId, NumIters,
- OneIterationPerThread);
- else
- NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
- NumIters, OneIterationPerThread);
- }
-
- /// Worksharing `distribute`-loop.
- /// \param[in] Loc Description of source location
- /// \param[in] LoopBody Function which corresponds to loop body
- /// \param[in] Arg Pointer to struct which contains loop body args
- /// \param[in] NumIters Number of loop iterations
- /// \param[in] BlockChunk Size of block chunk
- /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
- /// only one loop iteration or one thread chunk. This avoids an outer loop
- /// over all loop iterations/chunks.
- static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
- Ty NumIters, Ty BlockChunk,
- uint8_t OneIterationPerThread) {
- ASSERT(icv::Level == 0, "Bad distribute");
- ASSERT(icv::ActiveLevel == 0, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
- ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-
- ASSERT(NumIters >= 0, "Bad iteration count");
- ASSERT(BlockChunk >= 0, "Bad block count");
-
- // There are no threads involved here.
- Ty ThreadChunk = 0;
- Ty NumThreads = 1;
- Ty TId = 0;
-
- // All teams need to participate.
- Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
- Ty BId = mapping::getBlockIdInKernel();
-
- // If the block chunk is not specified we pick a default now.
- if (BlockChunk == 0)
- BlockChunk = NumThreads;
-
- // If we know we have more blocks than iterations we can indicate that to
- // avoid an outer loop.
- if (config::getAssumeTeamsOversubscription()) {
- OneIterationPerThread = true;
- }
-
- if (OneIterationPerThread)
- ASSERT(NumBlocks >= NumIters, "Broken assumption");
-
- if (BlockChunk != NumThreads)
- NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
- ThreadChunk, NumThreads, TId, NumIters,
- OneIterationPerThread);
- else
- NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
- NumIters, OneIterationPerThread);
-
- ASSERT(icv::Level == 0, "Bad distribute");
- ASSERT(icv::ActiveLevel == 0, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
- ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
- }
-
- /// Worksharing `distribute parallel for`-loop.
- /// \param[in] Loc Description of source location
- /// \param[in] LoopBody Function which corresponds to loop body
- /// \param[in] Arg Pointer to struct which contains loop body args
- /// \param[in] NumIters Number of loop iterations
- /// \param[in] NumThreads Number of GPU threads
- /// \param[in] BlockChunk Size of block chunk
- /// \param[in] ThreadChunk Size of thread chunk
- /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
- /// only one loop iteration or one thread chunk. This avoids an outer loop
- /// over all loop iterations/chunks.
- static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
- void *Arg, Ty NumIters, Ty NumThreads,
- Ty BlockChunk, Ty ThreadChunk,
- uint8_t OneIterationPerThread) {
- ASSERT(icv::Level == 1, "Bad distribute");
- ASSERT(icv::ActiveLevel == 1, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-
- ASSERT(NumIters >= 0, "Bad iteration count");
- ASSERT(BlockChunk >= 0, "Bad block count");
- ASSERT(ThreadChunk >= 0, "Bad thread count");
-
- // All threads need to participate but the user might have used a
- // `num_threads` clause on the parallel and reduced the number compared to
- // the block size.
- Ty TId = mapping::getThreadIdInBlock();
-
- // All teams need to participate.
- Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
- Ty BId = mapping::getBlockIdInKernel();
-
- // If the block chunk is not specified we pick a default now.
- if (BlockChunk == 0)
- BlockChunk = NumThreads;
-
- // If the thread chunk is not specified we pick a default now.
- if (ThreadChunk == 0)
- ThreadChunk = 1;
-
- // If we know we have more threads (across all blocks) than iterations we
- // can indicate that to avoid an outer loop.
- if (config::getAssumeTeamsOversubscription() &
- config::getAssumeThreadsOversubscription()) {
- OneIterationPerThread = true;
- }
-
- if (OneIterationPerThread)
- ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
-
- if (BlockChunk != NumThreads || ThreadChunk != 1)
- NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
- ThreadChunk, NumThreads, TId, NumIters,
- OneIterationPerThread);
- else
- NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
- NumIters, OneIterationPerThread);
-
- ASSERT(icv::Level == 1, "Bad distribute");
- ASSERT(icv::ActiveLevel == 1, "Bad distribute");
- ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
- }
-};
-
-} // namespace ompx
-
-#define OMP_LOOP_ENTRY(BW, TY) \
- [[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_for_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY block_chunk, TY thread_chunk, \
- uint8_t one_iteration_per_thread) { \
- ompx::StaticLoopChunker<TY>::DistributeFor( \
- loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
- one_iteration_per_thread); \
- } \
- [[gnu::flatten, clang::always_inline]] void \
- __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
- void *arg, TY num_iters, TY block_chunk, \
- uint8_t one_iteration_per_thread) { \
- ompx::StaticLoopChunker<TY>::Distribute( \
- loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
- } \
- [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
- IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
- TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \
- ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
- thread_chunk, one_iteration_per_thread); \
- }
-
-extern "C" {
-OMP_LOOP_ENTRY(_4, int32_t)
-OMP_LOOP_ENTRY(_4u, uint32_t)
-OMP_LOOP_ENTRY(_8, int64_t)
-OMP_LOOP_ENTRY(_8u, uint64_t)
-}
diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake
index 0236f5f0b698..5a27a81c736b 100644
--- a/offload/cmake/caches/AMDGPUBot.cmake
+++ b/offload/cmake/caches/AMDGPUBot.cmake
@@ -15,7 +15,10 @@ set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload;flang-rt" CACHE STRING "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "")
-set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "")
+set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 16" CACHE STRING "")
set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
+
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "openmp" CACHE STRING "")
diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake
index a772043c7966..798f080a41ad 100644
--- a/offload/cmake/caches/AMDGPULibcBot.cmake
+++ b/offload/cmake/caches/AMDGPULibcBot.cmake
@@ -17,5 +17,6 @@ set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
-set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;libc;libcxxabi;libcxx" CACHE STRING "")
set(RUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_JOBS 4 CACHE STRING "")
diff --git a/offload/cmake/caches/Offload.cmake b/offload/cmake/caches/Offload.cmake
index 5533a6508f5d..3747a1d3eb29 100644
--- a/offload/cmake/caches/Offload.cmake
+++ b/offload/cmake/caches/Offload.cmake
@@ -5,5 +5,5 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda CACHE STRING "")
set(RUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/NVPTX.cmake" CACHE STRING "")
set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "")
-set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
-set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
diff --git a/offload/include/device.h b/offload/include/device.h
index 1e85bb1876c8..bf93ce0460ae 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -33,7 +33,9 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "GlobalHandler.h"
#include "PluginInterface.h"
+
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
// Forward declarations.
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index d1baa28687fb..5b54c79d83f9 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -28,10 +28,13 @@ def ol_device_info_t : Enum {
TaggedEtor<"TYPE", "ol_device_type_t", "type of the device">,
TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">,
TaggedEtor<"NAME", "char[]", "Device name">,
+ TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">,
TaggedEtor<"VENDOR", "char[]", "Device vendor">,
TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">,
TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">,
TaggedEtor<"MAX_WORK_GROUP_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work group size in each dimension">,
+ TaggedEtor<"MAX_WORK_SIZE", "uint32_t", "Maximum total work items">,
+ TaggedEtor<"MAX_WORK_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work items in each dimension">,
TaggedEtor<"VENDOR_ID", "uint32_t", "A unique vendor device identifier assigned by PCI-SIG">,
TaggedEtor<"NUM_COMPUTE_UNITS", "uint32_t", "The number of parallel compute units available to the device">,
TaggedEtor<"MAX_CLOCK_FREQUENCY", "uint32_t", "The maximum configured clock frequency of this device in MHz">,
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 9d342e06127a..7e8e297831f4 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -47,10 +47,59 @@ struct ol_device_impl_t {
ol_platform_handle_t Platform, InfoTreeNode &&DevInfo)
: DeviceNum(DeviceNum), Device(Device), Platform(Platform),
Info(std::forward<InfoTreeNode>(DevInfo)) {}
+
+ ~ol_device_impl_t() {
+ assert(!OutstandingQueues.size() &&
+ "Device object dropped with outstanding queues");
+ }
+
int DeviceNum;
GenericDeviceTy *Device;
ol_platform_handle_t Platform;
InfoTreeNode Info;
+
+ llvm::SmallVector<__tgt_async_info *> OutstandingQueues;
+ std::mutex OutstandingQueuesMutex;
+
+ /// If any of the device's outstanding queues is now complete, remove it from
+ /// the list and return it.
+ ///
+ /// Queues may be added to the outstanding queue list by olDestroyQueue if
+ /// they are destroyed but not completed.
+ __tgt_async_info *getOutstandingQueue() {
+ // Not locking the `size()` access is fine here - In the worst case we
+ // either miss a queue that exists or loop through an empty array after
+ // taking the lock. Both are sub-optimal but not that bad.
+ if (OutstandingQueues.size()) {
+ std::lock_guard<std::mutex> Lock(OutstandingQueuesMutex);
+
+ // As queues are pulled and popped from this list, longer running queues
+ // naturally bubble to the start of the array. Hence looping backwards.
+ for (auto Q = OutstandingQueues.rbegin(); Q != OutstandingQueues.rend();
+ Q++) {
+ if (!Device->hasPendingWork(*Q)) {
+ auto OutstandingQueue = *Q;
+ *Q = OutstandingQueues.back();
+ OutstandingQueues.pop_back();
+ return OutstandingQueue;
+ }
+ }
+ }
+ return nullptr;
+ }
+
+ /// Complete all pending work for this device and perform any needed cleanup.
+ ///
+ /// After calling this function, no liboffload functions should be called with
+ /// this device handle.
+ llvm::Error destroy() {
+ llvm::Error Result = Plugin::success();
+ for (auto Q : OutstandingQueues)
+ if (auto Err = Device->synchronize(Q, /*Release=*/true))
+ Result = llvm::joinErrors(std::move(Result), std::move(Err));
+ OutstandingQueues.clear();
+ return Result;
+ }
};
struct ol_platform_impl_t {
@@ -58,23 +107,51 @@ struct ol_platform_impl_t {
ol_platform_backend_t BackendType)
: Plugin(std::move(Plugin)), BackendType(BackendType) {}
std::unique_ptr<GenericPluginTy> Plugin;
- std::vector<ol_device_impl_t> Devices;
+ llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
ol_platform_backend_t BackendType;
+
+ /// Complete all pending work for this platform and perform any needed
+ /// cleanup.
+ ///
+ /// After calling this function, no liboffload functions should be called with
+ /// this platform handle.
+ llvm::Error destroy() {
+ llvm::Error Result = Plugin::success();
+ for (auto &D : Devices)
+ if (auto Err = D->destroy())
+ Result = llvm::joinErrors(std::move(Result), std::move(Err));
+
+ if (auto Res = Plugin->deinit())
+ Result = llvm::joinErrors(std::move(Result), std::move(Res));
+
+ return Result;
+ }
};
struct ol_queue_impl_t {
ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device)
- : AsyncInfo(AsyncInfo), Device(Device) {}
+ : AsyncInfo(AsyncInfo), Device(Device), Id(IdCounter++) {}
__tgt_async_info *AsyncInfo;
ol_device_handle_t Device;
+ // A unique identifier for the queue
+ size_t Id;
+ static std::atomic<size_t> IdCounter;
};
+std::atomic<size_t> ol_queue_impl_t::IdCounter(0);
struct ol_event_impl_t {
- ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue)
- : EventInfo(EventInfo), Queue(Queue) {}
+ ol_event_impl_t(void *EventInfo, ol_device_handle_t Device,
+ ol_queue_handle_t Queue)
+ : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) {
+ }
// EventInfo may be null, in which case the event should be considered always
// complete
void *EventInfo;
+ ol_device_handle_t Device;
+ size_t QueueId;
+ // Events may outlive the queue - don't assume this is always valid.
+ // It is provided only to implement OL_EVENT_INFO_QUEUE. Use QueueId to check
+ // for queue equality instead.
ol_queue_handle_t Queue;
};
@@ -131,7 +208,7 @@ struct OffloadContext {
ol_device_handle_t HostDevice() {
// The host platform is always inserted last
- return &Platforms.back().Devices[0];
+ return Platforms.back().Devices[0].get();
}
static OffloadContext &get() {
@@ -190,8 +267,8 @@ Error initPlugins(OffloadContext &Context) {
auto Info = Device->obtainInfoImpl();
if (auto Err = Info.takeError())
return Err;
- Platform.Devices.emplace_back(DevNum, Device, &Platform,
- std::move(*Info));
+ Platform.Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+ DevNum, Device, &Platform, std::move(*Info)));
}
}
}
@@ -199,7 +276,8 @@ Error initPlugins(OffloadContext &Context) {
// Add the special host device
auto &HostPlatform = Context.Platforms.emplace_back(
ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST});
- HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{});
+ HostPlatform.Devices.emplace_back(
+ std::make_unique<ol_device_impl_t>(-1, nullptr, nullptr, InfoTreeNode{}));
Context.HostDevice()->Platform = &HostPlatform;
Context.TracingEnabled = std::getenv("OFFLOAD_TRACE");
@@ -240,7 +318,7 @@ Error olShutDown_impl() {
if (!P.Plugin || !P.Plugin->is_initialized())
continue;
- if (auto Res = P.Plugin->deinit())
+ if (auto Res = P.destroy())
Result = llvm::joinErrors(std::move(Result), std::move(Res));
}
@@ -367,6 +445,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
// Retrieve properties from the plugin interface
switch (PropName) {
case OL_DEVICE_INFO_NAME:
+ case OL_DEVICE_INFO_PRODUCT_NAME:
case OL_DEVICE_INFO_VENDOR:
case OL_DEVICE_INFO_DRIVER_VERSION: {
// String values
@@ -377,6 +456,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
}
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
+ case OL_DEVICE_INFO_MAX_WORK_SIZE:
case OL_DEVICE_INFO_VENDOR_ID:
case OL_DEVICE_INFO_NUM_COMPUTE_UNITS:
case OL_DEVICE_INFO_ADDRESS_BITS:
@@ -393,6 +473,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
return Info.write(static_cast<uint32_t>(Value));
}
+ case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
// {x, y, z} triples
ol_dimensions_t Out{0, 0, 0};
@@ -431,6 +512,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
assert(Device == OffloadContext::get().HostDevice());
InfoWriter Info(PropSize, PropValue, PropSizeRet);
+ constexpr auto uint32_max = std::numeric_limits<uint32_t>::max();
+
switch (PropName) {
case OL_DEVICE_INFO_PLATFORM:
return Info.write<void *>(Device->Platform);
@@ -438,6 +521,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST);
case OL_DEVICE_INFO_NAME:
return Info.writeString("Virtual Host Device");
+ case OL_DEVICE_INFO_PRODUCT_NAME:
+ return Info.writeString("Virtual Host Device");
case OL_DEVICE_INFO_VENDOR:
return Info.writeString("Liboffload");
case OL_DEVICE_INFO_DRIVER_VERSION:
@@ -446,6 +531,11 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
return Info.write<uint32_t>(1);
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION:
return Info.write<ol_dimensions_t>(ol_dimensions_t{1, 1, 1});
+ case OL_DEVICE_INFO_MAX_WORK_SIZE:
+ return Info.write<uint32_t>(uint32_max);
+ case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
+ return Info.write<ol_dimensions_t>(
+ ol_dimensions_t{uint32_max, uint32_max, uint32_max});
case OL_DEVICE_INFO_VENDOR_ID:
return Info.write<uint32_t>(0);
case OL_DEVICE_INFO_NUM_COMPUTE_UNITS:
@@ -505,7 +595,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device,
Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) {
for (auto &Platform : OffloadContext::get().Platforms) {
for (auto &Device : Platform.Devices) {
- if (!Callback(&Device, UserData)) {
+ if (!Callback(Device.get(), UserData)) {
break;
}
}
@@ -566,14 +656,46 @@ Error olMemFree_impl(void *Address) {
Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) {
auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device);
- if (auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo)))
+
+ auto OutstandingQueue = Device->getOutstandingQueue();
+ if (OutstandingQueue) {
+ // The queue is empty, but we still need to sync it to release any temporary
+ // memory allocations or do other cleanup.
+ if (auto Err =
+ Device->Device->synchronize(OutstandingQueue, /*Release=*/false))
+ return Err;
+ CreatedQueue->AsyncInfo = OutstandingQueue;
+ } else if (auto Err =
+ Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) {
return Err;
+ }
*Queue = CreatedQueue.release();
return Error::success();
}
-Error olDestroyQueue_impl(ol_queue_handle_t Queue) { return olDestroy(Queue); }
+Error olDestroyQueue_impl(ol_queue_handle_t Queue) {
+ auto *Device = Queue->Device;
+ // This is safe; as soon as olDestroyQueue is called it is not possible to add
+ // any more work to the queue, so if it's finished now it will remain finished
+ // forever.
+ auto Res = Device->Device->hasPendingWork(Queue->AsyncInfo);
+ if (!Res)
+ return Res.takeError();
+
+ if (!*Res) {
+ // The queue is complete, so sync it and throw it back into the pool.
+ if (auto Err = Device->Device->synchronize(Queue->AsyncInfo,
+ /*Release=*/true))
+ return Err;
+ } else {
+ // The queue still has outstanding work. Store it so we can check it later.
+ std::lock_guard<std::mutex> Lock(Device->OutstandingQueuesMutex);
+ Device->OutstandingQueues.push_back(Queue->AsyncInfo);
+ }
+
+ return olDestroy(Queue);
+}
Error olSyncQueue_impl(ol_queue_handle_t Queue) {
// Host plugin doesn't have a queue set so it's not safe to call synchronize
@@ -601,7 +723,7 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events,
"olWaitEvents asked to wait on a NULL event");
// Do nothing if the event is for this queue or the event is always complete
- if (Event->Queue == Queue || !Event->EventInfo)
+ if (Event->QueueId == Queue->Id || !Event->EventInfo)
continue;
if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo))
@@ -649,7 +771,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
if (!Event->EventInfo)
return Plugin::success();
- if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo))
+ if (auto Res = Event->Device->Device->syncEvent(Event->EventInfo))
return Res;
return Error::success();
@@ -657,7 +779,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
Error olDestroyEvent_impl(ol_event_handle_t Event) {
if (Event->EventInfo)
- if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo))
+ if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo))
return Res;
return olDestroy(Event);
@@ -708,7 +830,7 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
if (auto Err = Pending.takeError())
return Err;
- *EventOut = new ol_event_impl_t(nullptr, Queue);
+ *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue);
if (!*Pending)
// Queue is empty, don't record an event and consider the event always
// complete
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 6585286bf428..71423ae0c94d 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -37,6 +37,8 @@
using namespace llvm::omp::target::ompt;
#endif
+using namespace llvm::omp::target::plugin;
+
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
AsyncInfoTy &AsyncInfo) const {
// First, check if the user disabled atomic map transfer/malloc/dealloc.
@@ -97,7 +99,55 @@ llvm::Error DeviceTy::init() {
return llvm::Error::success();
}
-// Load binary to device.
+// Extract the mapping of host function pointers to device function pointers
+// from the entry table. Functions marked as 'indirect' in OpenMP will have
+// offloading entries generated for them which map the host's function pointer
+// to a global containing the corresponding function pointer on the device.
+static llvm::Expected<std::pair<void *, uint64_t>>
+setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image,
+ __tgt_device_binary Binary) {
+ AsyncInfoTy AsyncInfo(Device);
+ llvm::ArrayRef<llvm::offloading::EntryTy> Entries(Image->EntriesBegin,
+ Image->EntriesEnd);
+ llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
+ for (const auto &Entry : Entries) {
+ if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP ||
+ Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
+ continue;
+
+ assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
+ auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
+
+ void *Ptr;
+ if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr))
+ return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+ "failed to load %s", Entry.SymbolName);
+
+ HstPtr = Entry.Address;
+ if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo))
+ return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+ "failed to load %s", Entry.SymbolName);
+ }
+
+ // If we do not have any indirect globals we exit early.
+ if (IndirectCallTable.empty())
+ return std::pair{nullptr, 0};
+
+ // Sort the array to allow for more efficient lookup of device pointers.
+ llvm::sort(IndirectCallTable,
+ [](const auto &x, const auto &y) { return x.first < y.first; });
+
+ uint64_t TableSize =
+ IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
+ void *DevicePtr = Device.allocData(TableSize, nullptr, TARGET_ALLOC_DEVICE);
+ if (Device.submitData(DevicePtr, IndirectCallTable.data(), TableSize,
+ AsyncInfo))
+ return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+ "failed to copy data");
+ return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
+}
+
+// Load binary to device and perform global initialization if needed.
llvm::Expected<__tgt_device_binary>
DeviceTy::loadBinary(__tgt_device_image *Img) {
__tgt_device_binary Binary;
@@ -105,6 +155,38 @@ DeviceTy::loadBinary(__tgt_device_image *Img) {
if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
"failed to load binary %p", Img);
+
+ // This symbol is optional.
+ void *DeviceEnvironmentPtr;
+ if (RTL->get_global(Binary, sizeof(DeviceEnvironmentTy),
+ "__omp_rtl_device_environment", &DeviceEnvironmentPtr))
+ return Binary;
+
+ // Obtain a table mapping host function pointers to device function pointers.
+ auto CallTablePairOrErr = setupIndirectCallTable(*this, Img, Binary);
+ if (!CallTablePairOrErr)
+ return CallTablePairOrErr.takeError();
+
+ GenericDeviceTy &GenericDevice = RTL->getDevice(RTLDeviceID);
+ DeviceEnvironmentTy DeviceEnvironment;
+ DeviceEnvironment.DeviceDebugKind = GenericDevice.getDebugKind();
+ DeviceEnvironment.NumDevices = RTL->getNumDevices();
+ // TODO: The device ID used here is not the real device ID used by OpenMP.
+ DeviceEnvironment.DeviceNum = RTLDeviceID;
+ DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize();
+ DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
+ DeviceEnvironment.IndirectCallTable =
+ reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
+ DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
+ DeviceEnvironment.HardwareParallelism =
+ GenericDevice.getHardwareParallelism();
+
+ AsyncInfoTy AsyncInfo(*this);
+ if (submitData(DeviceEnvironmentPtr, &DeviceEnvironment,
+ sizeof(DeviceEnvironment), AsyncInfo))
+ return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
+ "failed to copy data");
+
return Binary;
}
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 32e89cc75efc..4c8eba1e7180 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -403,6 +403,12 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
reinterpret_cast<uint64_t>(HstPteeBase);
void *TgtPteeBase = reinterpret_cast<void *>(
reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta);
+ DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
+ ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
+ DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
+ DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
+ "\n",
+ DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
// Add shadow pointer tracking
// TODO: Support shadow-tracking of larger than VoidPtrSize pointers,
diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h
index 0b3d54599048..90e5e1780e66 100644
--- a/offload/libomptarget/private.h
+++ b/offload/libomptarget/private.h
@@ -55,7 +55,14 @@ printKernelArguments(const ident_t *Loc, const int64_t DeviceId,
const char *Type = nullptr;
const char *Implicit =
(ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? "(implicit)" : "";
- if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
+
+ if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH &&
+ ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS)
+ Type = "attach:always";
+ else if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH)
+ Type = "attach";
+ else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO &&
+ ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
Type = "tofrom";
else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO)
Type = "to";
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 7ba55715ff58..c26cfe961aa0 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -924,6 +924,7 @@ private:
void *Dst;
const void *Src;
size_t Size;
+ size_t NumTimes;
};
/// Utility struct holding arguments for freeing buffers to memory managers.
@@ -974,9 +975,14 @@ private:
StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
/// Schedule a host memory copy action on the slot.
- Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+ ///
+ /// NumTimes repeats the copy that many times, sequentially in the dest
+ /// buffer.
+ Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+ size_t NumTimes = 1) {
Callbacks.emplace_back(memcpyAction);
- ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+ ActionArgs.emplace_back().MemcpyArgs =
+ MemcpyArgsTy{Dst, Src, Size, NumTimes};
return Plugin::success();
}
@@ -1216,7 +1222,11 @@ private:
assert(Args->Dst && "Invalid destination buffer");
assert(Args->Src && "Invalid source buffer");
- std::memcpy(Args->Dst, Args->Src, Args->Size);
+ auto BasePtr = Args->Dst;
+ for (size_t I = 0; I < Args->NumTimes; I++) {
+ std::memcpy(BasePtr, Args->Src, Args->Size);
+ BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size;
+ }
return Plugin::success();
}
@@ -1421,7 +1431,8 @@ public:
/// manager once the operation completes.
Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
- AMDGPUMemoryManagerTy &MemoryManager) {
+ AMDGPUMemoryManagerTy &MemoryManager,
+ size_t NumTimes = 1) {
// Retrieve available signals for the operation's outputs.
AMDGPUSignalTy *OutputSignals[2] = {};
if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1443,7 +1454,8 @@ public:
// The std::memcpy is done asynchronously using an async handler. We store
// the function's information in the action but it is not actually a
// post action.
- if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+ if (auto Err =
+ Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
return Err;
// Make changes on this slot visible to the async handler's thread.
@@ -1464,7 +1476,11 @@ public:
std::tie(Curr, InputSignal) = consume(OutputSignal);
} else {
// All preceding operations completed, copy the memory synchronously.
- std::memcpy(Inter, Src, CopySize);
+ auto *InterPtr = Inter;
+ for (size_t I = 0; I < NumTimes; I++) {
+ std::memcpy(InterPtr, Src, CopySize);
+ InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize;
+ }
// Return the second signal because it will not be used.
OutputSignals[1]->decreaseUseCount();
@@ -1481,11 +1497,11 @@ public:
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 1, &InputSignalRaw,
- OutputSignal->get());
+ Agent, CopySize * NumTimes, 1,
+ &InputSignalRaw, OutputSignal->get());
}
return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 0, nullptr,
+ Agent, CopySize * NumTimes, 0, nullptr,
OutputSignal->get());
}
@@ -2611,26 +2627,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
- hsa_status_t Status;
+ // Fast case, where we can use the 4 byte hsa_amd_memory_fill
+ if (Size % 4 == 0 &&
+ (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+ uint32_t Pattern;
+ if (PatternSize == 1) {
+ auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr);
+ Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24;
+ } else if (PatternSize == 2) {
+ auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr);
+ Pattern = *Word | (*Word << 16);
+ } else if (PatternSize == 4) {
+ Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
+ } else {
+ // Shouldn't be here if the pattern size is outside those values
+ llvm_unreachable("Invalid pattern size");
+ }
- // We can use hsa_amd_memory_fill for this size, but it's not async so the
- // queue needs to be synchronized first
- if (PatternSize == 4) {
- if (AsyncInfoWrapper.hasQueue())
- if (auto Err = synchronize(AsyncInfoWrapper))
+ if (hasPendingWorkImpl(AsyncInfoWrapper)) {
+ AMDGPUStreamTy *Stream = nullptr;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
- Status = hsa_amd_memory_fill(TgtPtr,
- *static_cast<const uint32_t *>(PatternPtr),
- Size / PatternSize);
- if (auto Err =
- Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
- return Err;
- } else {
- // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
- // memory and copying to the device in one go.
- return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+ struct MemFillArgsTy {
+ void *Dst;
+ uint32_t Pattern;
+ int64_t Size;
+ };
+ auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+ auto Fill = [](void *Data) {
+ MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data);
+ assert(Args && "Invalid arguments");
+
+ auto Status =
+ hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+ delete Args;
+ auto Err =
+ Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+ if (Err) {
+ FATAL_MESSAGE(1, "error performing async fill: %s",
+ toString(std::move(Err)).data());
+ }
+ };
+
+ // hsa_amd_memory_fill doesn't signal completion using a signal, so use
+ // the existing host callback logic to handle that instead
+ return Stream->pushHostCallback(Fill, Args);
+ } else {
+ // If there is no pending work, do the fill synchronously
+ auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+ return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+ }
}
+
+ // Slow case; allocate an appropriate memory size and enqueue copies
+ void *PinnedPtr = nullptr;
+ AMDGPUMemoryManagerTy &PinnedMemoryManager =
+ HostDevice.getPinnedMemoryManager();
+ if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+ return Err;
+
+ AMDGPUStreamTy *Stream = nullptr;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+ PatternSize, PinnedMemoryManager,
+ Size / PatternSize);
}
/// Initialize the async info for interoperability purposes.
@@ -2744,7 +2807,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
- Info.add("Product Name", TmpChar);
+ Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
@@ -2861,11 +2924,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
- Info.add("Grid Max Size", TmpUInt);
+ Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
if (Status == HSA_STATUS_SUCCESS) {
- auto &MaxDim = *Info.add("Grid Max Size per Dimension");
+ auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{},
+ "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
MaxDim.add("x", GridMaxDim.x);
MaxDim.add("y", GridMaxDim.y);
MaxDim.add("z", GridMaxDim.z);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 75f87cab6049..6ff3ef8cda17 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -417,6 +417,7 @@ struct GenericKernelTy {
case OMP_TGT_EXEC_MODE_SPMD:
case OMP_TGT_EXEC_MODE_GENERIC:
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
return true;
}
return false;
@@ -434,6 +435,8 @@ protected:
return "Generic";
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return "Generic-SPMD";
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+ return "SPMD-No-Loop";
}
llvm_unreachable("Unknown execution mode!");
}
@@ -471,7 +474,8 @@ private:
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
- /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+ /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+ /// or SPMD mode.
bool isGenericSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -486,6 +490,10 @@ private:
bool isBareMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
}
+ bool isNoLoopMode() const {
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+ }
/// The kernel name.
std::string Name;
@@ -831,11 +839,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
- /// Setup the device environment if needed. Notice this setup may not be run
- /// on some plugins. By default, it will be executed, but plugins can change
- /// this behavior by overriding the shouldSetupDeviceEnvironment function.
- Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
/// Setup the global device memory pool, if the plugin requires one.
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
uint64_t PoolSize);
@@ -1035,6 +1038,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
uint32_t getDefaultNumBlocks() const {
return GridValues.GV_Default_Num_Teams;
}
+ uint32_t getDebugKind() const { return OMPX_DebugKind; }
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
@@ -1175,11 +1179,6 @@ private:
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
- /// Indicate whether the device should setup the device environment. Notice
- /// that returning false in this function will change the behavior of the
- /// setupDeviceEnvironment() function.
- virtual bool shouldSetupDeviceEnvironment() const { return true; }
-
/// Indicate whether the device should setup the global device memory pool. If
/// false is return the value on the device will be uninitialized.
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
@@ -1235,7 +1234,7 @@ protected:
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
/// Array of peer access states with the rest of devices. This means that if
- /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
+ /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
/// the device I can access device J's memory directly. However, notice this
/// does not mean that device J can access device I's memory directly.
llvm::SmallVector<PeerAccessState> PeerAccesses;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index d4b5f914c667..36cdd6035e26 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -371,54 +371,6 @@ public:
};
} // namespace llvm::omp::target::plugin
-// Extract the mapping of host function pointers to device function pointers
-// from the entry table. Functions marked as 'indirect' in OpenMP will have
-// offloading entries generated for them which map the host's function pointer
-// to a global containing the corresponding function pointer on the device.
-static Expected<std::pair<void *, uint64_t>>
-setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
- DeviceImageTy &Image) {
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-
- llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
- Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
- llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
- for (const auto &Entry : Entries) {
- if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
- !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
- continue;
-
- assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
- auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
-
- GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
- if (auto Err =
- Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
- return std::move(Err);
-
- HstPtr = Entry.Address;
- if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
- Entry.Size, nullptr))
- return std::move(Err);
- }
-
- // If we do not have any indirect globals we exit early.
- if (IndirectCallTable.empty())
- return std::pair{nullptr, 0};
-
- // Sort the array to allow for more efficient lookup of device pointers.
- llvm::sort(IndirectCallTable,
- [](const auto &x, const auto &y) { return x.first < y.first; });
-
- uint64_t TableSize =
- IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
- void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
- if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
- TableSize, nullptr))
- return std::move(Err);
- return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
-}
-
AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
__tgt_async_info *AsyncInfoPtr)
: Device(Device),
@@ -662,6 +614,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+ // Return the number of teams required to cover the loop iterations.
+ if (isNoLoopMode())
+ return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1;
+
uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
if (LoopTripCount > 0) {
@@ -939,10 +895,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
// Add the image to list.
LoadedImages.push_back(Image);
- // Setup the device environment if needed.
- if (auto Err = setupDeviceEnvironment(Plugin, *Image))
- return std::move(Err);
-
// Setup the global device memory pool if needed.
if (!Plugin.getRecordReplay().isReplaying() &&
shouldSetupDeviceMemoryPool()) {
@@ -978,43 +930,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return Image;
}
-Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
- DeviceImageTy &Image) {
- // There are some plugins that do not need this step.
- if (!shouldSetupDeviceEnvironment())
- return Plugin::success();
-
- // Obtain a table mapping host function pointers to device function pointers.
- auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
- if (!CallTablePairOrErr)
- return CallTablePairOrErr.takeError();
-
- DeviceEnvironmentTy DeviceEnvironment;
- DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
- DeviceEnvironment.NumDevices = Plugin.getNumDevices();
- // TODO: The device ID used here is not the real device ID used by OpenMP.
- DeviceEnvironment.DeviceNum = DeviceId;
- DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
- DeviceEnvironment.ClockFrequency = getClockFrequency();
- DeviceEnvironment.IndirectCallTable =
- reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
- DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
- DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
-
- // Create the metainfo of the device environment global.
- GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
- sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
-
- // Write device environment values to the device.
- GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
- DP("Missing symbol %s, continue execution anyway.\n",
- DevEnvGlobal.getName().data());
- consumeError(std::move(Err));
- }
- return Plugin::success();
-}
-
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
DeviceImageTy &Image,
uint64_t PoolSize) {
@@ -1337,16 +1252,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
bool ReleaseQueue) {
+ if (!AsyncInfo)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "invalid async info queue");
+
SmallVector<void *> AllocsToDelete{};
{
std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
- if (!AsyncInfo || !AsyncInfo->Queue)
- return Plugin::error(ErrorCode::INVALID_ARGUMENT,
- "invalid async info queue");
-
- if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
- return Err;
+ // This can be false when no work has been added to the AsyncInfo, in which
+ // case the device has nothing to synchronize.
+ if (AsyncInfo->Queue)
+ if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
+ return Err;
std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
}
@@ -2252,8 +2170,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
GenericGlobalHandlerTy &GHandler = getGlobalHandler();
if (auto Err =
GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
- REPORT("Failure to look up global address: %s\n",
- toString(std::move(Err)).data());
+ consumeError(std::move(Err));
return OFFLOAD_FAIL;
}
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index bf335ab20f75..af3c74636bff 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1060,8 +1060,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
Info.add("CUDA OpenMP Device Number", DeviceId);
Res = cuDeviceGetName(TmpChar, 1000, Device);
- if (Res == CUDA_SUCCESS)
+ if (Res == CUDA_SUCCESS) {
Info.add("Device Name", TmpChar, "", DeviceInfo::NAME);
+ Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
+ }
Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR);
@@ -1118,7 +1120,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (Res == CUDA_SUCCESS)
MaxBlock.add("z", TmpInt);
- auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
+ // TODO: This assumes CUDA devices have no limit on the number of threads;
+ // verify this.
+ Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "",
+ DeviceInfo::MAX_WORK_SIZE);
+
+ auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "",
+ DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
if (Res == CUDA_SUCCESS)
MaxGrid.add("x", TmpInt);
@@ -1444,7 +1452,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
if (auto Err = Plugin::check(
AttrResult,
- "Error in cuLaunchKernel while setting the memory limits: %s"))
+ "error in cuFuncSetAttribute while setting the memory limits: %s"))
return Err;
MaxDynCGroupMemLimit = MaxDynCGroupMem;
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index f440ebaf17fe..5436cae3b029 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -387,7 +387,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
}
/// This plugin should not setup the device environment or memory pool.
- virtual bool shouldSetupDeviceEnvironment() const override { return false; };
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
/// Getters and setters for stack size and heap size not relevant.
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index f3e8e9a66685..c0290bfdab3f 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -83,6 +83,7 @@ config.test_format = lit.formats.ShTest()
config.test_flags = " -I " + config.test_source_root + \
" -I " + config.omp_header_directory + \
" -L " + config.library_dir + \
+ " -L " + config.llvm_library_intdir + \
" -L " + config.llvm_lib_directory
# compiler specific flags
@@ -165,11 +166,12 @@ else: # Unices
config.test_flags += " -nogpulib"
config.test_flags += " -Wl,-rpath," + config.library_dir
config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
+ config.test_flags += " -Wl,-rpath," + config.llvm_library_intdir
config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
if config.cuda_libdir:
config.test_flags += " -Wl,-rpath," + config.cuda_libdir
if config.libomptarget_current_target.startswith('nvptx'):
- config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir
+ config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir + "/nvptx64-nvidia-cuda"
if config.libomptarget_current_target.endswith('-LTO'):
config.test_flags += " -foffload-lto"
if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env(
diff --git a/offload/test/mapping/chained_containing_structs_1.cc b/offload/test/mapping/chained_containing_structs_1.cc
new file mode 100644
index 000000000000..4dbb17140de1
--- /dev/null
+++ b/offload/test/mapping/chained_containing_structs_1.cc
@@ -0,0 +1,58 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// XFAIL: *
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+
+struct S {
+ int a;
+ int b;
+ int c;
+};
+
+struct T {
+ S *s0;
+ S *s1;
+ S *s2;
+};
+
+int main() {
+ T *v = (T *) malloc (sizeof(T));
+ v->s0 = (S *) malloc (sizeof(S));
+ v->s1 = (S *) malloc (sizeof(S));
+ v->s2 = (S *) malloc (sizeof(S));
+ v->s0->a = 10;
+ v->s0->b = 10;
+ v->s0->c = 10;
+ v->s1->a = 20;
+ v->s1->b = 20;
+ v->s1->c = 20;
+ v->s2->a = 30;
+ v->s2->b = 30;
+ v->s2->c = 30;
+
+#pragma omp target map(to: v[:1]) map(tofrom: v->s1->b, v->s1->c, v->s2->b)
+ {
+ v->s1->b += 3;
+ v->s1->c += 5;
+ v->s2->b += 7;
+ }
+
+ printf ("%d\n", v->s0->a); // CHECK: 10
+ printf ("%d\n", v->s0->b); // CHECK: 10
+ printf ("%d\n", v->s0->c); // CHECK: 10
+ printf ("%d\n", v->s1->a); // CHECK: 20
+ printf ("%d\n", v->s1->b); // CHECK: 23
+ printf ("%d\n", v->s1->c); // CHECK: 25
+ printf ("%d\n", v->s2->a); // CHECK: 30
+ printf ("%d\n", v->s2->b); // CHECK: 37
+ printf ("%d\n", v->s2->c); // CHECK: 30
+
+ free(v->s0);
+ free(v->s1);
+ free(v->s2);
+ free(v);
+
+ return 0;
+}
diff --git a/offload/test/mapping/chained_containing_structs_2.cc b/offload/test/mapping/chained_containing_structs_2.cc
new file mode 100644
index 000000000000..29c4c8b7fedf
--- /dev/null
+++ b/offload/test/mapping/chained_containing_structs_2.cc
@@ -0,0 +1,76 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// XFAIL: *
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+
+struct R {
+ int d;
+ int e;
+ int f;
+};
+
+struct S {
+ R *r0;
+ R *r1;
+ R *r2;
+};
+
+struct T {
+ S *s0;
+ S *s1;
+ S *s2;
+};
+
+int main() {
+ T *v = (T *) malloc (sizeof(T));
+
+ v->s0 = (S *) malloc (sizeof(S));
+ v->s1 = (S *) malloc (sizeof(S));
+ v->s2 = (S *) malloc (sizeof(S));
+
+ v->s0->r0 = (R *) calloc (1, sizeof(R));
+ v->s0->r1 = (R *) calloc (1, sizeof(R));
+ v->s0->r2 = (R *) calloc (1, sizeof(R));
+
+ v->s1->r0 = (R *) calloc (1, sizeof(R));
+ v->s1->r1 = (R *) calloc (1, sizeof(R));
+ v->s1->r2 = (R *) calloc (1, sizeof(R));
+
+ v->s2->r0 = (R *) calloc (1, sizeof(R));
+ v->s2->r1 = (R *) calloc (1, sizeof(R));
+ v->s2->r2 = (R *) calloc (1, sizeof(R));
+
+ #pragma omp target map(to: v->s1, v->s2, *v->s1, v->s1->r1, *v->s2, v->s2->r0) \
+ map(tofrom: v->s1->r1->d, v->s1->r1->e, v->s1->r2->d, v->s1->r2->f, v->s2->r0->e)
+ {
+ v->s1->r1->d += 3;
+ v->s1->r1->e += 5;
+ v->s1->r2->d += 7;
+ v->s1->r2->f += 9;
+ v->s2->r0->e += 11;
+ }
+
+ printf ("%d\n", v->s1->r1->d); // CHECK: 3
+ printf ("%d\n", v->s1->r1->e); // CHECK: 5
+ printf ("%d\n", v->s1->r2->d); // CHECK: 7
+ printf ("%d\n", v->s1->r2->f); // CHECK: 9
+ printf ("%d\n", v->s2->r0->e); // CHECK: 11
+
+ free(v->s0->r0);
+ free(v->s0->r1);
+ free(v->s0->r2);
+ free(v->s1->r0);
+ free(v->s1->r1);
+ free(v->s1->r2);
+ free(v->s2->r0);
+ free(v->s2->r1);
+ free(v->s2->r2);
+ free(v->s0);
+ free(v->s1);
+ free(v->s2);
+ free(v);
+
+ return 0;
+}
diff --git a/offload/test/mapping/chained_containing_structs_3.cc b/offload/test/mapping/chained_containing_structs_3.cc
new file mode 100644
index 000000000000..23555bf69110
--- /dev/null
+++ b/offload/test/mapping/chained_containing_structs_3.cc
@@ -0,0 +1,217 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+#include <cstring>
+
+#include <omp.h>
+
+struct R {
+ int d;
+ int e;
+ int f;
+};
+
+struct S {
+ int a;
+ int b;
+ struct {
+ int c;
+ R r;
+ R *rp;
+ } sub;
+ int g;
+};
+
+struct T {
+ int a;
+ int *ptr;
+ int b;
+};
+
+int main() {
+ R r;
+ R *rp = new R;
+ S s;
+ S *sp = new S;
+ T t;
+ T *tp = new T;
+
+ memset(&r, 0, sizeof(R));
+ memset(rp, 0, sizeof(R));
+ memset(&s, 0, sizeof(S));
+ memset(sp, 0, sizeof(S));
+ memset(&t, 0, sizeof(T));
+ memset(tp, 0, sizeof(T));
+
+ s.sub.rp = new R;
+ sp->sub.rp = new R;
+
+ memset(s.sub.rp, 0, sizeof(R));
+ memset(sp->sub.rp, 0, sizeof(R));
+
+ t.ptr = new int[10];
+ tp->ptr = new int[10];
+
+ memset(t.ptr, 0, sizeof(int)*10);
+ memset(tp->ptr, 0, sizeof(int)*10);
+
+#pragma omp target map(tofrom: r) map(tofrom: r.e)
+{
+ r.d++;
+ r.e += 2;
+ r.f += 3;
+}
+ printf ("%d\n", r.d); // CHECK: 1
+ printf ("%d\n", r.e); // CHECK-NEXT: 2
+ printf ("%d\n", r.f); // CHECK-NEXT: 3
+
+#pragma omp target map(tofrom: rp[:1]) map(tofrom: rp->e)
+{
+ rp->d++;
+ rp->e += 2;
+ rp->f += 3;
+}
+
+ printf ("%d\n", rp->d); // CHECK-NEXT: 1
+ printf ("%d\n", rp->e); // CHECK-NEXT: 2
+ printf ("%d\n", rp->f); // CHECK-NEXT: 3
+
+ int v;
+ int *orig_addr_v = &v;
+ bool separate_memory_space;
+
+#pragma omp target data map(v)
+ {
+ void *mapped_ptr_v =
+ omp_get_mapped_ptr(orig_addr_v, omp_get_default_device());
+ separate_memory_space = mapped_ptr_v != (void*) orig_addr_v;
+ }
+
+ const char *mapping_flavour = separate_memory_space ? "separate" : "unified";
+
+#pragma omp target map(to: s) map(tofrom: s.sub.r.e)
+{
+ s.b++;
+ s.sub.r.d+=2;
+ s.sub.r.e+=3;
+ s.sub.r.f+=4;
+}
+
+ printf ("%d/%s\n", s.b, mapping_flavour);
+ printf ("%d/%s\n", s.sub.r.d, mapping_flavour);
+ printf ("%d/%s\n", s.sub.r.e, mapping_flavour);
+ printf ("%d/%s\n", s.sub.r.f, mapping_flavour);
+
+ // CHECK: {{0/separate|1/unified}}
+ // CHECK-NEXT: {{0/separate|2/unified}}
+ // CHECK-NEXT: 3
+ // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(to: s, s.b) map(to: s.sub.rp[:1]) map(tofrom: s.sub.rp->e)
+{
+ s.b++;
+ s.sub.rp->d+=2;
+ s.sub.rp->e+=3;
+ s.sub.rp->f+=4;
+}
+
+ printf ("%d/%s\n", s.b, mapping_flavour);
+ printf ("%d/%s\n", s.sub.rp->d, mapping_flavour);
+ printf ("%d/%s\n", s.sub.rp->e, mapping_flavour);
+ printf ("%d/%s\n", s.sub.rp->f, mapping_flavour);
+
+ // CHECK-NEXT: {{0/separate|2/unified}}
+ // CHECK-NEXT: {{0/separate|2/unified}}
+ // CHECK-NEXT: 3
+ // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(to: sp[:1]) map(tofrom: sp->sub.r.e)
+{
+ sp->b++;
+ sp->sub.r.d+=2;
+ sp->sub.r.e+=3;
+ sp->sub.r.f+=4;
+}
+
+ printf ("%d/%s\n", sp->b, mapping_flavour);
+ printf ("%d/%s\n", sp->sub.r.d, mapping_flavour);
+ printf ("%d/%s\n", sp->sub.r.e, mapping_flavour);
+ printf ("%d/%s\n", sp->sub.r.f, mapping_flavour);
+
+ // CHECK-NEXT: {{0/separate|1/unified}}
+ // CHECK-NEXT: {{0/separate|2/unified}}
+ // CHECK-NEXT: 3
+ // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(to: sp[:1]) map(to: sp->sub.rp[:1]) map(tofrom: sp->sub.rp->e)
+{
+ sp->b++;
+ sp->sub.rp->d+=2;
+ sp->sub.rp->e+=3;
+ sp->sub.rp->f+=4;
+}
+
+ printf ("%d/%s\n", sp->b, mapping_flavour);
+ printf ("%d/%s\n", sp->sub.rp->d, mapping_flavour);
+ printf ("%d/%s\n", sp->sub.rp->e, mapping_flavour);
+ printf ("%d/%s\n", sp->sub.rp->f, mapping_flavour);
+
+ // CHECK-NEXT: {{0/separate|2/unified}}
+ // CHECK-NEXT: {{0/separate|2/unified}}
+ // CHECK-NEXT: 3
+ // CHECK-NEXT: {{0/separate|4/unified}}
+
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1])
+{
+ t.a++;
+ t.ptr[2]+=2;
+ t.b+=3;
+}
+
+ printf ("%d\n", t.a); // CHECK-NEXT: 1
+ printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 2
+ printf ("%d\n", t.b); // CHECK-NEXT: 3
+
+#pragma omp target map(tofrom: t) map(tofrom: t.a)
+{
+ t.b++;
+}
+
+ printf ("%d\n", t.b); // CHECK-NEXT: 4
+
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a)
+{
+ t.a++;
+ t.ptr[2]+=2;
+ t.b+=3;
+}
+
+ printf ("%d\n", t.a); // CHECK-NEXT: 2
+ printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4
+ printf ("%d\n", t.b); // CHECK-NEXT: 7
+
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a)
+{
+ /* Empty */
+}
+
+ printf ("%d\n", t.a); // CHECK-NEXT: 2
+ printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4
+ printf ("%d\n", t.b); // CHECK-NEXT: 7
+
+ delete s.sub.rp;
+ delete sp->sub.rp;
+
+ delete[] t.ptr;
+ delete[] tp->ptr;
+
+ delete rp;
+ delete sp;
+ delete tp;
+
+ return 0;
+}
diff --git a/offload/test/mapping/map_ptr_and_star_global.c b/offload/test/mapping/map_ptr_and_star_global.c
index c3b0dd2f49e6..869fb8ca9bc2 100644
--- a/offload/test/mapping/map_ptr_and_star_global.c
+++ b/offload/test/mapping/map_ptr_and_star_global.c
@@ -1,5 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: libc
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c
index f0ca84d1cc4d..cc826b3c0290 100644
--- a/offload/test/mapping/map_ptr_and_star_local.c
+++ b/offload/test/mapping/map_ptr_and_star_local.c
@@ -1,5 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: libc
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_ptr_and_subscript_global.c b/offload/test/mapping/map_ptr_and_subscript_global.c
index a3a10b6c9b21..839db068aa90 100644
--- a/offload/test/mapping/map_ptr_and_subscript_global.c
+++ b/offload/test/mapping/map_ptr_and_subscript_global.c
@@ -1,5 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: libc
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_ptr_and_subscript_local.c b/offload/test/mapping/map_ptr_and_subscript_local.c
index bb44999541a7..68ac9dc0917f 100644
--- a/offload/test/mapping/map_ptr_and_subscript_local.c
+++ b/offload/test/mapping/map_ptr_and_subscript_local.c
@@ -1,5 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: libc
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c
index 10e72e070dbc..960eea419964 100644
--- a/offload/test/mapping/map_structptr_and_member_global.c
+++ b/offload/test/mapping/map_structptr_and_member_global.c
@@ -1,5 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: libc
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c
index 9e59551ad3d6..bd759407ef09 100644
--- a/offload/test/mapping/map_structptr_and_member_local.c
+++ b/offload/test/mapping/map_structptr_and_member_local.c
@@ -1,5 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: libc
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp
new file mode 100644
index 000000000000..3b1a8192bf2c
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp
@@ -0,0 +1,85 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+ int *paa[10][10];
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ int *original_ph3 = &ph[3];
+ int **original_paa02 = &paa[0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5])
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device());
+
+ // CHECK-COUNT-4: 1
+ printf("%d\n", mapped_ptr_ph3 != nullptr);
+ printf("%d\n", mapped_ptr_paa02 != nullptr);
+ printf("%d\n", original_ph3 != mapped_ptr_ph3);
+ printf("%d\n", original_paa02 != mapped_ptr_paa02);
+
+// (A) use_device_addr operand within mapped address range.
+// CHECK: A: 1
+#pragma omp target data use_device_addr(ph[3 : 4])
+ printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_addr operand in extended address range, but not
+// mapped address range.
+// CHECK: B: 1
+#pragma omp target data use_device_addr(ph[2])
+ printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) use_device_addr/map: same base-array, different first-location.
+// CHECK: C: 1
+#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1])
+ printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) use_device_addr/map: different base-array/pointers.
+// CHECK: D: 1
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+ printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) use_device_addr operand within mapped range of previous map.
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa[0])
+ printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (F) use_device_addr/map: different operands, same base-array.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2])
+ printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (G) use_device_addr/map: different base-array/pointers.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2])
+ printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5])
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp
new file mode 100644
index 000000000000..b9ebde431e7b
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp
@@ -0,0 +1,143 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+ int *paa[10][10];
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ int *original_ph3 = &ph[3];
+ int **original_paa02 = &paa[0][2];
+
+// (A) No corresponding map, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(ph[3 : 4])
+ {
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+ printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+ }
+
+// (B) use_device_addr/map: different operands, same base-pointer.
+// use_device_addr operand within mapped address range.
+// CHECK: B: 1 1 1
+#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1])
+ {
+ int *mapped_ptr_ph4 =
+ (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device());
+ printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr,
+ mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4);
+ }
+
+// (C) use_device_addr/map: different base-pointers.
+// No corresponding storage, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+ {
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+ printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+ }
+
+// (D) use_device_addr/map: one of two maps with matching base-pointer.
+// use_device_addr operand within mapped address range of second map,
+// lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4])
+ {
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+ printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+ mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3);
+ }
+
+// (E) No corresponding map, lookup should fail
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+ printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr,
+ mapped_ptr_paa02 != original_paa02,
+ &paa[0][2] == (int **)nullptr + 2);
+ }
+
+// (F) use_device_addr/map: different operands, same base-array.
+// use_device_addr within mapped address range. Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+ printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+ mapped_ptr_paa02 != original_paa02,
+ &paa[0][2] == mapped_ptr_paa02);
+ }
+
+// (G) use_device_addr/map: different operands, same base-array.
+// use_device_addr extends beyond existing mapping. Not spec compliant.
+// But the lookup succeeds because we use the base-address for translation.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[0][4]) use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr(
+ original_paa02 + 2, omp_get_default_device());
+ printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr,
+ mapped_ptr_paa04 != original_paa02 + 2,
+ &paa[0][4] == mapped_ptr_paa04);
+ }
+
+ int *original_paa020 = &paa[0][2][0];
+ int **original_paa0 = (int **)&paa[0];
+
+// (H) use_device_addr/map: different base-pointers.
+// No corresponding storage for use_device_addr operand, lookup should fail.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa020 =
+ (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device());
+ int **mapped_ptr_paa0 =
+ (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device());
+ printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr,
+ mapped_ptr_paa0 == nullptr, &paa[0] == nullptr);
+ }
+
+// (I) use_device_addr/map: one map with different, one with same base-ptr.
+// Lookup should succeed.
+// CHECK: I: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2])
+ {
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+ printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+ mapped_ptr_paa02 != original_paa02,
+ &paa[0][2] == mapped_ptr_paa02);
+ }
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp
new file mode 100644
index 000000000000..e9a1124bc461
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp
@@ -0,0 +1,98 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section on a reference.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+ int *(&paa)[10][10] = paa_ptee;
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ int *original_ph3 = &ph[3];
+ int **original_paa02 = &paa[0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5])
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device());
+
+ // CHECK-COUNT-4: 1
+ printf("%d\n", mapped_ptr_ph3 != nullptr);
+ printf("%d\n", mapped_ptr_paa02 != nullptr);
+ printf("%d\n", original_ph3 != mapped_ptr_ph3);
+ printf("%d\n", original_paa02 != mapped_ptr_paa02);
+
+// (A) use_device_addr operand within mapped address range.
+// EXPECTED: A: 1
+// CHECK: A: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[3 : 4])
+ printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_addr operand in extended address range, but not
+// mapped address range.
+// EXPECTED: B: 1
+// CHECK: B: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[2])
+ printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) use_device_addr/map: same base-array, different first-location.
+// EXPECTED: C: 1
+// CHECK: C: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1])
+ printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) use_device_addr/map: different base-array/pointers.
+// EXPECTED: D: 1
+// CHECK: D: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+ printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) use_device_addr operand within mapped range of previous map.
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa[0])
+ printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (F) use_device_addr/map: different operands, same base-array.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2])
+ printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (G) use_device_addr/map: different base-array/pointers.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2])
+ printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5])
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp
new file mode 100644
index 000000000000..0090cdb09536
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp
@@ -0,0 +1,158 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section on a reference.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+ int *(&paa)[10][10] = paa_ptee;
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ int *original_ph3 = &ph[3];
+ int **original_paa02 = &paa[0][2];
+
+// (A) No corresponding map, lookup should fail.
+// EXPECTED: A: 1 1 1
+// CHECK: A: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[3 : 4])
+ {
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+ printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+ }
+
+// (B) use_device_addr/map: different operands, same base-pointer.
+// use_device_addr operand within mapped address range.
+// EXPECTED: B: 1 1 1
+// CHECK: B: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1])
+ {
+ int *mapped_ptr_ph4 =
+ (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device());
+ printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr,
+ mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4);
+ }
+
+// (C) use_device_addr/map: different base-pointers.
+// No corresponding storage, lookup should fail.
+// EXPECTED: C: 1 1 1
+// CHECK: C: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+ {
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+ printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+ }
+
+// (D) use_device_addr/map: one of two maps with matching base-pointer.
+// use_device_addr operand within mapped address range of second map,
+// lookup should succeed.
+// EXPECTED: D: 1 1 1
+// CHECK: D: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4])
+ {
+ int *mapped_ptr_ph3 =
+ (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+ printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+ mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3);
+ }
+
+// (E) No corresponding map, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+ printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr,
+ mapped_ptr_paa02 != original_paa02,
+ &paa[0][2] == (int **)nullptr + 2);
+ }
+
+// (F) use_device_addr/map: different operands, same base-array.
+// use_device_addr within mapped address range. Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+ printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+ mapped_ptr_paa02 != original_paa02,
+ &paa[0][2] == mapped_ptr_paa02);
+ }
+
+// (G) use_device_addr/map: different operands, same base-array.
+// use_device_addr extends beyond existing mapping. Not spec compliant.
+// But the lookup succeeds because we use the base-address for translation.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[0][4]) use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr(
+ original_paa02 + 2, omp_get_default_device());
+ printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr,
+ mapped_ptr_paa04 != original_paa02 + 2,
+ &paa[0][4] == mapped_ptr_paa04);
+ }
+
+ int *original_paa020 = &paa[0][2][0];
+ int **original_paa0 = (int **)&paa[0];
+
+// (H) use_device_addr/map: different base-pointers.
+// No corresponding storage for use_device_addr operand, lookup should fail.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0])
+ {
+ int **mapped_ptr_paa020 =
+ (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device());
+ int **mapped_ptr_paa0 =
+ (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device());
+ printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr,
+ mapped_ptr_paa0 == nullptr, &paa[0] == nullptr);
+ }
+
+// (I) use_device_addr/map: one map with different, one with same base-ptr.
+// Lookup should succeed.
+// CHECK: I: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2])
+ {
+ int **mapped_ptr_paa02 =
+ (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+ printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+ mapped_ptr_paa02 != original_paa02,
+ &paa[0][2] == mapped_ptr_paa02);
+ }
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp
new file mode 100644
index 000000000000..883297f7e90c
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp
@@ -0,0 +1,93 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a variable (not a section).
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+ int *paa[10][10];
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ void *original_addr_g = &g;
+ void *original_addr_h = &h;
+ void *original_addr_ph = &ph;
+ void *original_addr_paa = &paa;
+
+#pragma omp target enter data map(to : g, h, ph, paa)
+ void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device());
+ void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device());
+ void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device());
+ void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device());
+
+ // CHECK-COUNT-8: 1
+ printf("%d\n", mapped_ptr_g != nullptr);
+ printf("%d\n", mapped_ptr_h != nullptr);
+ printf("%d\n", mapped_ptr_ph != nullptr);
+ printf("%d\n", mapped_ptr_paa != nullptr);
+ printf("%d\n", original_addr_g != mapped_ptr_g);
+ printf("%d\n", original_addr_h != mapped_ptr_h);
+ printf("%d\n", original_addr_ph != mapped_ptr_ph);
+ printf("%d\n", original_addr_paa != mapped_ptr_paa);
+
+// (A)
+// CHECK: A: 1
+#pragma omp target data use_device_addr(g)
+ printf("A: %d\n", mapped_ptr_g == &g);
+
+// (B)
+// CHECK: B: 1
+#pragma omp target data use_device_addr(h)
+ printf("B: %d\n", mapped_ptr_h == &h);
+
+// (C)
+// CHECK: C: 1
+#pragma omp target data use_device_addr(ph)
+ printf("C: %d\n", mapped_ptr_ph == &ph);
+
+// (D) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &ph, not &ph[0] or &ph[1].
+// CHECK: D: 1
+#pragma omp target data map(ph[1 : 2]) use_device_addr(ph)
+ printf("D: %d\n", mapped_ptr_ph == &ph);
+
+// (E)
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa)
+ printf("E: %d\n", mapped_ptr_paa == &paa);
+
+// (F) use_device_addr/map with same base-array, paa.
+// Address translation should happen for &paa.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][2]) use_device_addr(paa)
+ printf("F: %d\n", mapped_ptr_paa == &paa);
+
+// (G) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &paa.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+ printf("G: %d\n", mapped_ptr_paa == &paa);
+
+#pragma omp target exit data map(release : g, h, ph, paa)
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp
new file mode 100644
index 000000000000..79c6f69edba8
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp
@@ -0,0 +1,159 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a variable (not a section).
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+ int *paa[10][10];
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ void *original_addr_g = &g;
+ void *original_addr_h = &h;
+ void *original_addr_ph = &ph;
+ void *original_addr_paa = &paa;
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(g)
+ {
+ void *mapped_ptr_g =
+ omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+ printf("A: %d %d %d\n", mapped_ptr_g == nullptr,
+ mapped_ptr_g != original_addr_g, (void *)&g == nullptr);
+ }
+
+// (B) Lookup should succeed.
+// CHECK: B: 1 1 1
+#pragma omp target data map(g) use_device_addr(g)
+ {
+ void *mapped_ptr_g =
+ omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+ printf("B: %d %d %d\n", mapped_ptr_g != nullptr,
+ mapped_ptr_g != original_addr_g, &g == mapped_ptr_g);
+ }
+
+// (C) No corresponding item, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data use_device_addr(h)
+ {
+ void *mapped_ptr_h =
+ omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+ printf("C: %d %d %d\n", mapped_ptr_h == nullptr,
+ mapped_ptr_h != original_addr_h, (void *)&h == nullptr);
+ }
+
+// (D) Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(h) use_device_addr(h)
+ {
+ void *mapped_ptr_h =
+ omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+ printf("D: %d %d %d\n", mapped_ptr_h != nullptr,
+ mapped_ptr_h != original_addr_h, &h == mapped_ptr_h);
+ }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("E: %d %d %d\n", mapped_ptr_ph == nullptr,
+ mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+ }
+
+// (F) Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("F: %d %d %d\n", mapped_ptr_ph != nullptr,
+ mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+ }
+
+// (G) Maps pointee only, but use_device_addr operand is pointer.
+// Lookup should fail.
+// CHECK: G: 1 1 1
+#pragma omp target data map(ph[0 : 1]) use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("G: %d %d %d\n", mapped_ptr_ph == nullptr,
+ mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+ }
+
+// (H) Maps both pointee and pointer. Lookup for pointer should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("H: %d %d %d\n", mapped_ptr_ph != nullptr,
+ mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+ }
+
+// (I) No corresponding item, lookup should fail.
+// CHECK: I: 1 1 1
+#pragma omp target data use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("I: %d %d %d\n", mapped_ptr_paa == nullptr,
+ mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+ }
+
+// (J) Maps only the pointee of paa[0][2], but the use_device_addr operand is
+// the array paa. Lookup should fail.
+// CHECK: J: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("J: %d %d %d\n", mapped_ptr_paa == nullptr,
+ mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+ }
+
+// (K) Lookup should succeed.
+// CHECK: K: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("K: %d %d %d\n", mapped_ptr_paa != nullptr,
+ mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+ }
+
+// (L) Maps both the pointee and the array. Lookup for paa should succeed.
+// CHECK: L: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("L: %d %d %d\n", mapped_ptr_paa != nullptr,
+ mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+ }
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp
new file mode 100644
index 000000000000..f018c65f36ec
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a reference variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+ int *(&paa)[10][10] = paa_ptee;
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ void *original_addr_g = &g;
+ void *original_addr_h = &h;
+ void *original_addr_ph = &ph;
+ void *original_addr_paa = &paa;
+
+#pragma omp target enter data map(to : g, h, ph, paa)
+ void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device());
+ void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device());
+ void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device());
+ void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device());
+
+ // CHECK-COUNT-8: 1
+ printf("%d\n", mapped_ptr_g != nullptr);
+ printf("%d\n", mapped_ptr_h != nullptr);
+ printf("%d\n", mapped_ptr_ph != nullptr);
+ printf("%d\n", mapped_ptr_paa != nullptr);
+ printf("%d\n", original_addr_g != mapped_ptr_g);
+ printf("%d\n", original_addr_h != mapped_ptr_h);
+ printf("%d\n", original_addr_ph != mapped_ptr_ph);
+ printf("%d\n", original_addr_paa != mapped_ptr_paa);
+
+// (A)
+// CHECK: A: 1
+#pragma omp target data use_device_addr(g)
+ printf("A: %d\n", mapped_ptr_g == &g);
+
+// (B)
+// CHECK: B: 1
+#pragma omp target data use_device_addr(h)
+ printf("B: %d\n", mapped_ptr_h == &h);
+
+// (C)
+// CHECK: C: 1
+#pragma omp target data use_device_addr(ph)
+ printf("C: %d\n", mapped_ptr_ph == &ph);
+
+// (D) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &ph, not &ph[0] or &ph[1].
+// CHECK: D: 1
+#pragma omp target data map(ph[1 : 2]) use_device_addr(ph)
+ printf("D: %d\n", mapped_ptr_ph == &ph);
+
+// (E)
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa)
+ printf("E: %d\n", mapped_ptr_paa == &paa);
+
+// (F) use_device_addr/map with same base-array, paa.
+// Address translation should happen for &paa.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][2]) use_device_addr(paa)
+ printf("F: %d\n", mapped_ptr_paa == &paa);
+
+// (G) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &paa.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+ printf("G: %d\n", mapped_ptr_paa == &paa);
+
+#pragma omp target exit data map(release : g, h, ph, paa)
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp
new file mode 100644
index 000000000000..9360db419504
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp
@@ -0,0 +1,166 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a reference variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+ int *(&paa)[10][10] = paa_ptee;
+
+ void f1(int i) {
+ paa[0][2] = &g;
+
+ void *original_addr_g = &g;
+ void *original_addr_h = &h;
+ void *original_addr_ph = &ph;
+ void *original_addr_paa = &paa;
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(g)
+ {
+ void *mapped_ptr_g =
+ omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+ printf("A: %d %d %d\n", mapped_ptr_g == nullptr,
+ mapped_ptr_g != original_addr_g, (void *)&g == nullptr);
+ }
+
+// (B) Lookup should succeed.
+// CHECK: B: 1 1 1
+#pragma omp target data map(g) use_device_addr(g)
+ {
+ void *mapped_ptr_g =
+ omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+ printf("B: %d %d %d\n", mapped_ptr_g != nullptr,
+ mapped_ptr_g != original_addr_g, &g == mapped_ptr_g);
+ }
+
+// (C) No corresponding item, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data use_device_addr(h)
+ {
+ void *mapped_ptr_h =
+ omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+ printf("C: %d %d %d\n", mapped_ptr_h == nullptr,
+ mapped_ptr_h != original_addr_h, (void *)&h == nullptr);
+ }
+
+// (D) Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(h) use_device_addr(h)
+ {
+ void *mapped_ptr_h =
+ omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+ printf("D: %d %d %d\n", mapped_ptr_h != nullptr,
+ mapped_ptr_h != original_addr_h, &h == mapped_ptr_h);
+ }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("E: %d %d %d\n", mapped_ptr_ph == nullptr,
+ mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+ }
+
+// (F) Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("F: %d %d %d\n", mapped_ptr_ph != nullptr,
+ mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+ }
+
+// (G) Maps pointee only, but use_device_addr operand is pointer.
+// Lookup should fail.
+// CHECK: G: 1 1 1
+#pragma omp target data map(ph[0 : 1]) use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("G: %d %d %d\n", mapped_ptr_ph == nullptr,
+ mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+ }
+
+// (H) Maps both pointee and pointer. Lookup for pointer should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph)
+ {
+ void *mapped_ptr_ph =
+ omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+ printf("H: %d %d %d\n", mapped_ptr_ph != nullptr,
+ mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+ }
+
+// (I) No corresponding item, lookup should fail.
+// CHECK: I: 1 1 1
+#pragma omp target data use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("I: %d %d %d\n", mapped_ptr_paa == nullptr,
+ mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+ }
+
+// (J) Maps only the pointee of paa[0][2], but the use_device_addr operand is
+// the array paa. Lookup should fail.
+// CHECK: J: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("J: %d %d %d\n", mapped_ptr_paa == nullptr,
+ mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+ }
+
+// (K) Lookup should succeed.
+// CHECK: K: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("K: %d %d %d\n", mapped_ptr_paa != nullptr,
+ mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+ }
+
+// (L) Maps both the pointee and the array. Lookup for paa should succeed.
+// CHECK: L: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa)
+ {
+ void *mapped_ptr_paa =
+ omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+ printf("L: %d %d %d\n", mapped_ptr_paa != nullptr,
+ mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+ }
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/target_use_device_addr.c b/offload/test/mapping/use_device_addr/target_use_device_addr.c
index 5c2bb8a48f6e..4a9dbe252f76 100644
--- a/offload/test/mapping/target_use_device_addr.c
+++ b/offload/test/mapping/use_device_addr/target_use_device_addr.c
@@ -12,7 +12,9 @@ int main() {
printf("%d, %p\n", xp[1], &xp[1]);
#pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x)
#pragma omp target is_device_ptr(xp)
- { xp[1] = 222; }
+ {
+ xp[1] = 222;
+ }
// CHECK: 222
printf("%d, %p\n", xp[1], &xp[1]);
}
diff --git a/offload/test/mapping/target_wrong_use_device_addr.c b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c
index 7a5babd69253..28ec6857fa1a 100644
--- a/offload/test/mapping/target_wrong_use_device_addr.c
+++ b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c
@@ -14,7 +14,7 @@ int main() {
// CHECK: host addr=0x[[#%x,HOST_ADDR:]]
fprintf(stderr, "host addr=%p\n", x);
-#pragma omp target data map(to : x [0:10])
+#pragma omp target data map(to : x[0 : 10])
{
// CHECK: omptarget device 0 info: variable x does not have a valid device
// counterpart
@@ -27,4 +27,3 @@ int main() {
return 0;
}
-
diff --git a/offload/test/mapping/array_section_use_device_ptr.c b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c
index 86e2875c35c4..4cfcce28c112 100644
--- a/offload/test/mapping/array_section_use_device_ptr.c
+++ b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c
@@ -20,7 +20,9 @@ int main() {
float *A_dev = NULL;
#pragma omp target data use_device_ptr(A)
- { A_dev = A; }
+ {
+ A_dev = A;
+ }
#pragma omp target exit data map(delete : A[FROM : LENGTH])
// CHECK: Success
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp
new file mode 100644
index 000000000000..a7745de53298
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int h[10];
+int *ph = &h[0];
+
+struct S {
+ int (*paa)[10][10] = &aa;
+
+ void f1(int i) {
+ paa--;
+ void *original_ph3 = &ph[3];
+ void *original_paa102 = &paa[1][0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5])
+ void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device());
+
+ // CHECK-COUNT-4: 1
+ printf("%d\n", mapped_ptr_ph3 != nullptr);
+ printf("%d\n", mapped_ptr_paa102 != nullptr);
+ printf("%d\n", original_ph3 != mapped_ptr_ph3);
+ printf("%d\n", original_paa102 != mapped_ptr_paa102);
+
+// (A) Mapped data is within extended address range. Lookup should succeed.
+// CHECK: A: 1
+#pragma omp target data use_device_ptr(ph)
+ printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_ptr/map on pointer, and pointee already exists.
+// Lookup should succeed.
+// CHECK: B: 1
+#pragma omp target data map(ph) use_device_ptr(ph)
+ printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: C: 1
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+ printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: D: 1
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+ printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) Mapped data is within extended address range.
+// Lookup should succeed.
+// CHECK: E: 1
+#pragma omp target data use_device_ptr(paa)
+ printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (F) use_device_ptr/map on pointer, and pointee already exists.
+// &paa[0] should be in extended address-range of the existing paa[1][...]
+// Lookup should succeed.
+// FIXME: However, it currently does not. Might need an RT fix.
+// EXPECTED: F: 1
+// CHECK: F: 0
+#pragma omp target data map(paa) use_device_ptr(paa)
+ printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+ printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+ printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5])
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp
new file mode 100644
index 000000000000..fe3cdb56e4ba
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp
@@ -0,0 +1,125 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int h[10];
+int *ph = &h[0];
+
+struct S {
+ int (*paa)[10][10] = &aa;
+
+ void f1(int i) {
+ paa--;
+ void *original_addr_ph3 = &ph[3];
+ void *original_addr_paa102 = &paa[1][0][2];
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+ }
+
+// (B) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: B: 1 1 1
+#pragma omp target data map(ph) use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+ }
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: C: 1 1 1
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+ }
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+ }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+ mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+ }
+
+// (F) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+ mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+ }
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+ mapped_ptr_paa102 != original_addr_paa102,
+ &paa[1][0][2] == mapped_ptr_paa102);
+ }
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+ mapped_ptr_paa102 != original_addr_paa102,
+ &paa[1][0][2] == mapped_ptr_paa102);
+ }
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp
new file mode 100644
index 000000000000..66e65de4195a
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp
@@ -0,0 +1,111 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a reference variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int (*paa_ptee)[10][10] = &aa;
+
+int h[10];
+int *ph_ptee = &h[0];
+int *&ph = ph_ptee;
+
+struct S {
+ int (*&paa)[10][10] = paa_ptee;
+
+ void f1(int i) {
+ paa--;
+ void *original_ph3 = &ph[3];
+ void *original_paa102 = &paa[1][0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5])
+ void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device());
+
+ // CHECK-COUNT-4: 1
+ printf("%d\n", mapped_ptr_ph3 != nullptr);
+ printf("%d\n", mapped_ptr_paa102 != nullptr);
+ printf("%d\n", original_ph3 != mapped_ptr_ph3);
+ printf("%d\n", original_paa102 != mapped_ptr_paa102);
+
+// (A) Mapped data is within extended address range. Lookup should succeed.
+// EXPECTED: A: 1
+// CHECK: A: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_ptr(ph)
+ printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_ptr/map on pointer, and pointee already exists.
+// Lookup should succeed.
+// EXPECTED: B: 1
+// CHECK: B: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_ptr(ph)
+ printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: C: 1
+// CHECK: C: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+ printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: D: 1
+// CHECK: D: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+ printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) Mapped data is within extended address range.
+// Lookup should succeed.
+// CHECK: E: 1
+#pragma omp target data use_device_ptr(paa)
+ printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (F) use_device_ptr/map on pointer, and pointee already exists.
+// &paa[0] should be in extended address-range of the existing paa[1][...]
+// Lookup should succeed.
+// FIXME: However, it currently does not. Might need an RT fix.
+// EXPECTED: F: 1
+// CHECK: F: 0
+#pragma omp target data map(paa) use_device_ptr(paa)
+ printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+ printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+ printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5])
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp
new file mode 100644
index 000000000000..419ab3eb33d4
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp
@@ -0,0 +1,136 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a reference variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int (*paa_ptee)[10][10] = &aa;
+
+int h[10];
+int *ph_ptee = &h[0];
+int *&ph = ph_ptee;
+
+struct S {
+ int (*&paa)[10][10] = paa_ptee;
+
+ void f1(int i) {
+ paa--;
+ void *original_addr_ph3 = &ph[3];
+ void *original_addr_paa102 = &paa[1][0][2];
+
+// (A) No corresponding item, lookup should fail.
+// EXPECTED: A: 1 1 1
+// CHECK: A: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+ }
+
+// (B) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// EXPECTED: B: 1 1 1
+// CHECK: B: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+ }
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: C: 1 1 1
+// CHECK: C: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+ }
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: D: 1 1 1
+// CHECK: D: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+ {
+ void *mapped_ptr_ph3 =
+ omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+ printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+ mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+ }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+ mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+ }
+
+// (F) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+ mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+ }
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+ mapped_ptr_paa102 != original_addr_paa102,
+ &paa[1][0][2] == mapped_ptr_paa102);
+ }
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+ {
+ void *mapped_ptr_paa102 =
+ omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+ printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+ mapped_ptr_paa102 != original_addr_paa102,
+ &paa[1][0][2] == mapped_ptr_paa102);
+ }
+ }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/offloading/fortran/dtype-char-array-map-2.f90 b/offload/test/offloading/fortran/dtype-char-array-map-2.f90
new file mode 100644
index 000000000000..f17ea9e53853
--- /dev/null
+++ b/offload/test/offloading/fortran/dtype-char-array-map-2.f90
@@ -0,0 +1,25 @@
+! Offloading test that verifies certain type of character string arrays
+! map to and from device without problem.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+ implicit none
+ type char_t
+ CHARACTER(LEN=16), dimension(10,10) :: char_arr
+ end type char_t
+ type(char_t) :: dtype_char
+
+!$omp target enter data map(alloc:dtype_char%char_arr)
+
+!$omp target
+ dtype_char%char_arr(2,2) = 'c'
+!$omp end target
+
+!$omp target update from(dtype_char%char_arr)
+
+
+ print *, dtype_char%char_arr(2,2)
+end program
+
+!CHECK: c
diff --git a/offload/test/offloading/fortran/dtype-char-array-map.f90 b/offload/test/offloading/fortran/dtype-char-array-map.f90
new file mode 100644
index 000000000000..6b72c9e95101
--- /dev/null
+++ b/offload/test/offloading/fortran/dtype-char-array-map.f90
@@ -0,0 +1,27 @@
+! Offloading test that verifies certain type of character string arrays
+! (in this case allocatable) map to and from device without problem.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+ implicit none
+ type char_t
+ CHARACTER(LEN=16), dimension(:,:), allocatable :: char_arr
+ end type char_t
+ type(char_t) :: dtype_char
+
+ allocate(dtype_char%char_arr(10,10))
+
+!$omp target enter data map(alloc:dtype_char%char_arr)
+
+!$omp target
+ dtype_char%char_arr(2,2) = 'c'
+!$omp end target
+
+!$omp target update from(dtype_char%char_arr)
+
+
+ print *, dtype_char%char_arr(2,2)
+end program
+
+!CHECK: c
diff --git a/offload/test/offloading/mandatory_but_no_devices.c b/offload/test/offloading/mandatory_but_no_devices.c
index ecdee72acad0..df8a5f3b9278 100644
--- a/offload/test/offloading/mandatory_but_no_devices.c
+++ b/offload/test/offloading/mandatory_but_no_devices.c
@@ -3,6 +3,47 @@
// device. This behavior is proposed for OpenMP 5.2 in OpenMP spec github
// issue 2669.
+// AMD Tests
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR=target
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target teams'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target data map(X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN: -DDIR='target enter data map(to:X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN: -DDIR='target exit data map(from:X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN: -DDIR='target update to(X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \
+// RUN: -DDIR='target update from(X)'
+// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \
+// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \
+// RUN: %fcheck-amdgcn-amd-amdhsa
+
+// Nvidia Tests
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR=target
// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
// RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
@@ -42,8 +83,6 @@
// RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
// RUN: %fcheck-nvptx64-nvidia-cuda
-// REQUIRES: nvptx64-nvidia-cuda
-
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/offloading/memory_manager.cpp b/offload/test/offloading/memory_manager.cpp
index fba1e4a54012..d6d8697fcdec 100644
--- a/offload/test/offloading/memory_manager.cpp
+++ b/offload/test/offloading/memory_manager.cpp
@@ -1,7 +1,5 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
-// REQUIRES: nvidiagpu
-
#include <omp.h>
#include <cassert>
diff --git a/offload/test/tools/llvm-omp-device-info.c b/offload/test/tools/llvm-omp-device-info.c
index 6f497309df2f..1ce8d4ac07f6 100644
--- a/offload/test/tools/llvm-omp-device-info.c
+++ b/offload/test/tools/llvm-omp-device-info.c
@@ -2,5 +2,5 @@
//
// Just check any device was found and something is printed
//
-// CHECK: Found {{[1-9].*}} devices:
-// CHECK: Device 0:
+// CHECK: Num Devices: {{[1-9].*}}
+// CHECK: [{{[1-9A-Za-z].*}}]
diff --git a/offload/test/tools/offload-tblgen/default_returns.td b/offload/test/tools/offload-tblgen/default_returns.td
index e919492cc5bf..41949db7226a 100644
--- a/offload/test/tools/offload-tblgen/default_returns.td
+++ b/offload/test/tools/offload-tblgen/default_returns.td
@@ -6,13 +6,11 @@
include "APIDefs.td"
-def : Handle {
- let name = "ol_foo_handle_t";
+def ol_foo_handle_t : Handle {
let desc = "Example handle type";
}
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td
index c66d5b488b46..94ea820d453e 100644
--- a/offload/test/tools/offload-tblgen/entry_points.td
+++ b/offload/test/tools/offload-tblgen/entry_points.td
@@ -4,8 +4,7 @@
include "APIDefs.td"
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_basic.td b/offload/test/tools/offload-tblgen/functions_basic.td
index dec93577b57e..2802c78a2947 100644
--- a/offload/test/tools/offload-tblgen/functions_basic.td
+++ b/offload/test/tools/offload-tblgen/functions_basic.td
@@ -6,8 +6,7 @@
include "APIDefs.td"
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_code_loc.td b/offload/test/tools/offload-tblgen/functions_code_loc.td
index aec20129343f..8d7aa00c5f15 100644
--- a/offload/test/tools/offload-tblgen/functions_code_loc.td
+++ b/offload/test/tools/offload-tblgen/functions_code_loc.td
@@ -7,8 +7,7 @@
include "APIDefs.td"
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_ranged_param.td b/offload/test/tools/offload-tblgen/functions_ranged_param.td
index d0996b231973..1ce8b394b157 100644
--- a/offload/test/tools/offload-tblgen/functions_ranged_param.td
+++ b/offload/test/tools/offload-tblgen/functions_ranged_param.td
@@ -8,13 +8,11 @@
include "APIDefs.td"
-def : Handle {
- let name = "some_handle_t";
+def some_handle_t : Handle {
let desc = "An example handle type";
}
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/test/tools/offload-tblgen/print_enum.td b/offload/test/tools/offload-tblgen/print_enum.td
index 97f869689293..c7573a9a415c 100644
--- a/offload/test/tools/offload-tblgen/print_enum.td
+++ b/offload/test/tools/offload-tblgen/print_enum.td
@@ -4,8 +4,7 @@
include "APIDefs.td"
-def : Enum {
- let name = "my_enum_t";
+def my_enum_t : Enum {
let desc = "An example enum";
let etors =[
Etor<"VALUE_ONE", "The first enum value">,
diff --git a/offload/test/tools/offload-tblgen/print_function.td b/offload/test/tools/offload-tblgen/print_function.td
index ce1fe4c52760..74b39f145a40 100644
--- a/offload/test/tools/offload-tblgen/print_function.td
+++ b/offload/test/tools/offload-tblgen/print_function.td
@@ -5,13 +5,11 @@
include "APIDefs.td"
-def : Handle {
- let name = "ol_foo_handle_t";
+def ol_foo_handle_t : Handle {
let desc = "Example handle type";
}
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/test/tools/offload-tblgen/type_tagged_enum.td b/offload/test/tools/offload-tblgen/type_tagged_enum.td
index 95964e32f0c9..b32531aac9c8 100644
--- a/offload/test/tools/offload-tblgen/type_tagged_enum.td
+++ b/offload/test/tools/offload-tblgen/type_tagged_enum.td
@@ -9,13 +9,11 @@
include "APIDefs.td"
-def : Handle {
- let name = "some_handle_t";
+def some_handle_t: Handle {
let desc = "An example handle type";
}
-def : Enum {
- let name = "my_type_tagged_enum_t";
+def my_type_tagged_enum_t : Enum {
let desc = "Example type tagged enum";
let is_typed = 1;
let etors = [
@@ -34,8 +32,7 @@ def : Enum {
// CHECK-API-NEXT: [some_handle_t] Value three.
// CHECK-API-NEXT: MY_TYPE_TAGGED_ENUM_VALUE_THREE = 2,
-def : Function {
- let name = "FunctionA";
+def FunctionA : Function {
let desc = "Function A description";
let details = [ "Function A detailed information" ];
let params = [
diff --git a/offload/tools/deviceinfo/CMakeLists.txt b/offload/tools/deviceinfo/CMakeLists.txt
index 3787c12f940a..cc2d0a6add8b 100644
--- a/offload/tools/deviceinfo/CMakeLists.txt
+++ b/offload/tools/deviceinfo/CMakeLists.txt
@@ -4,10 +4,6 @@ add_openmp_tool(llvm-offload-device-info llvm-offload-device-info.cpp)
llvm_update_compile_flags(llvm-offload-device-info)
-target_include_directories(llvm-offload-device-info PRIVATE
- ${LIBOMPTARGET_INCLUDE_DIR}
-)
target_link_libraries(llvm-offload-device-info PRIVATE
- omp
- omptarget
+ LLVMOffload
)
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 2228fbf3ec17..a2955d49d396 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -1,4 +1,4 @@
-//===- llvm-offload-device-info.cpp - Device info as seen by LLVM/Offload -===//
+//===- llvm-offload-device-info.cpp - Print liboffload properties ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,27 +6,270 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a command line utility that, by using LLVM/Offload, and the device
-// plugins, list devices information as seen by the runtime.
+// This is a command line utility that, by using the new liboffload API, prints
+// all devices and their properties.
//
//===----------------------------------------------------------------------===//
-#include "omptarget.h"
-#include <cstdio>
+#include <OffloadAPI.h>
+#include <iostream>
+#include <vector>
-int main(int argc, char **argv) {
- __tgt_bin_desc EmptyDesc = {0, nullptr, nullptr, nullptr};
- __tgt_register_lib(&EmptyDesc);
- __tgt_init_all_rtls();
+// Evaluates X once and, if the result is truthy (i.e. an error was reported),
+// returns it from the enclosing function.
+// NOTE: this expands to a bare `if` statement, so an `else` written directly
+// after an invocation would bind to the macro's `if`.
+#define OFFLOAD_ERR(X) \
+ if (auto Err = X) { \
+ return Err; \
+ }
+
+// Selects which doWrite formatting is used for a value.
+enum class PrintKind {
+ // Stream the value with its ordinary operator<< (or type specialization).
+ NORMAL,
+ // Format the value as a set of floating point capability flags.
+ FP_FLAGS,
+};
+
+// Generic writer: streams Val to S using operator<<. The specializations
+// below override this for types that need custom formatting.
+template <typename T, PrintKind PK = PrintKind::NORMAL>
+void doWrite(std::ostream &S, T &&Val) {
+ S << Val;
+}
+
+// Writes the symbolic name of a platform backend to S. Any value that has no
+// case below is written as "<< INVALID >>".
+template <>
+void doWrite<ol_platform_backend_t>(std::ostream &S,
+                                    ol_platform_backend_t &&Val) {
+  const char *Label = "<< INVALID >>";
+  switch (Val) {
+  case OL_PLATFORM_BACKEND_UNKNOWN:
+    Label = "UNKNOWN";
+    break;
+  case OL_PLATFORM_BACKEND_CUDA:
+    Label = "CUDA";
+    break;
+  case OL_PLATFORM_BACKEND_AMDGPU:
+    Label = "AMDGPU";
+    break;
+  case OL_PLATFORM_BACKEND_HOST:
+    Label = "HOST";
+    break;
+  default:
+    break;
+  }
+  S << Label;
+}
+// Writes the symbolic name of a device type to S. Any value other than the
+// ones tested below is written as "<< INVALID >>".
+template <>
+void doWrite<ol_device_type_t>(std::ostream &S, ol_device_type_t &&Val) {
+  if (Val == OL_DEVICE_TYPE_GPU) {
+    S << "GPU";
+    return;
+  }
+  if (Val == OL_DEVICE_TYPE_CPU) {
+    S << "CPU";
+    return;
+  }
+  if (Val == OL_DEVICE_TYPE_HOST) {
+    S << "HOST";
+    return;
+  }
+  S << "<< INVALID >>";
+}
+// Writes the three components of Val to S as "{x: X, y: Y, z: Z}".
+template <>
+void doWrite<ol_dimensions_t>(std::ostream &S, ol_dimensions_t &&Val) {
+  S << "{x: " << Val.x;
+  S << ", y: " << Val.y;
+  S << ", z: " << Val.z;
+  S << "}";
+}
+// Writes a floating point capability bitfield to S: first the raw value, then
+// the name of every flag that is set, enclosed in braces, e.g.
+// "<value> { ROUND_TO_NEAREST FMA }".
+template <>
+void doWrite<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+    std::ostream &S, ol_device_fp_capability_flags_t &&Val) {
+  // Emits Label (which carries its own leading space) when Flag is set.
+  auto EmitIfSet = [&S, &Val](auto Flag, const char *Label) {
+    if (Val & Flag) {
+      S << Label;
+    }
+  };
+
+  S << Val << " {";
+
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT,
+            " CORRECTLY_ROUNDED_DIVIDE_SQRT");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST, " ROUND_TO_NEAREST");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO, " ROUND_TO_ZERO");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF, " ROUND_TO_INF");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN, " INF_NAN");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_DENORM, " DENORM");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_FMA, " FMA");
+  EmitIfSet(OL_DEVICE_FP_CAPABILITY_FLAG_SOFT_FLOAT, " SOFT_FLOAT");
+
+  S << " }";
+}
- printf("Found %d devices:\n", omp_get_num_devices());
- for (int Dev = 0; Dev < omp_get_num_devices(); Dev++) {
- printf(" Device %d:\n", Dev);
- if (!__tgt_print_device_info(Dev))
- printf(" print_device_info not implemented\n");
- printf("\n");
+// Queries the platform property Info of Plat and writes it to S as
+// "<Desc>: <value>\n". The "<Desc>: " prefix is written before the query, so
+// it is emitted even when the query fails.
+//
+// When T is a pointer type the property is treated as variable-sized: its size
+// is queried first and the value is fetched into a buffer of exactly that
+// size. Otherwise a single T is fetched by value.
+//
+// Returns OL_SUCCESS, or the first error reported by a liboffload call.
+template <typename T>
+ol_result_t printPlatformValue(std::ostream &S, ol_platform_handle_t Plat,
+                               ol_platform_info_t Info, const char *Desc) {
+  S << Desc << ": ";
+
+  if constexpr (std::is_pointer_v<T>) {
+    std::vector<uint8_t> Val;
+    size_t Size = 0;
+    OFFLOAD_ERR(olGetPlatformInfoSize(Plat, Info, &Size));
+    Val.resize(Size);
+    // Pass the queried size. sizeof(Val) would be the size of the std::vector
+    // object itself, not of the buffer it owns.
+    OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, Size, Val.data()));
+    doWrite(S, reinterpret_cast<T>(Val.data()));
+  } else {
+    T Val;
+    OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, sizeof(Val), &Val));
+    doWrite(S, std::move(Val));
+  }
+  S << "\n";
+  return OL_SUCCESS;
+}
+
+// Queries the device property Info of Dev and writes it to S as
+// "<Desc>: <value>[ <Units>]\n". The "<Desc>: " prefix is written before the
+// query, so it is emitted even when the query fails.
+//
+// When T is a pointer type the property is treated as variable-sized: its size
+// is queried first and the value is fetched into a buffer of exactly that
+// size. Otherwise a single T is fetched by value. PK selects the doWrite
+// formatting used for the value.
+//
+// Returns OL_SUCCESS, or the first error reported by a liboffload call.
+template <typename T, PrintKind PK = PrintKind::NORMAL>
+ol_result_t printDeviceValue(std::ostream &S, ol_device_handle_t Dev,
+                             ol_device_info_t Info, const char *Desc,
+                             const char *Units = nullptr) {
+  S << Desc << ": ";
+
+  if constexpr (std::is_pointer_v<T>) {
+    std::vector<uint8_t> Val;
+    size_t Size = 0;
+    OFFLOAD_ERR(olGetDeviceInfoSize(Dev, Info, &Size));
+    Val.resize(Size);
+    // Pass the queried size. sizeof(Val) would be the size of the std::vector
+    // object itself, not of the buffer it owns.
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, Size, Val.data()));
+    doWrite<T, PK>(S, reinterpret_cast<T>(Val.data()));
+  } else {
+    T Val;
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, sizeof(Val), &Val));
+    doWrite<T, PK>(S, std::move(Val));
+  }
+  if (Units)
+    S << " " << Units;
+  S << "\n";
+  return OL_SUCCESS;
+}
+
+// Writes a report for device D to S: a "[<product name>]" header, followed by
+// the properties of the platform the device belongs to, followed by the
+// device's own properties, one "<description>: <value>" line each.
+//
+// Returns OL_SUCCESS, or the first error reported while querying a property
+// (in which case the report is left incomplete).
+ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
+  ol_platform_handle_t Platform;
+  OFFLOAD_ERR(
+      olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), &Platform));
+
+  std::vector<char> Name;
+  size_t NameSize = 0;
+  OFFLOAD_ERR(olGetDeviceInfoSize(D, OL_DEVICE_INFO_PRODUCT_NAME, &NameSize));
+  Name.resize(NameSize);
+  OFFLOAD_ERR(
+      olGetDeviceInfo(D, OL_DEVICE_INFO_PRODUCT_NAME, NameSize, Name.data()));
+  // An empty vector may return a null data() pointer, which must not be
+  // streamed as a C string.
+  S << "[" << (Name.empty() ? "" : Name.data()) << "]\n";
+
+  // Properties of the owning platform.
+  OFFLOAD_ERR(printPlatformValue<const char *>(
+      S, Platform, OL_PLATFORM_INFO_NAME, "Platform Name"));
+  OFFLOAD_ERR(printPlatformValue<const char *>(
+      S, Platform, OL_PLATFORM_INFO_VENDOR_NAME, "Platform Vendor Name"));
+  OFFLOAD_ERR(printPlatformValue<const char *>(
+      S, Platform, OL_PLATFORM_INFO_VERSION, "Platform Version"));
+  OFFLOAD_ERR(printPlatformValue<ol_platform_backend_t>(
+      S, Platform, OL_PLATFORM_INFO_BACKEND, "Platform Backend"));
+
+  // Properties of the device itself.
+  OFFLOAD_ERR(
+      printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name"));
+  OFFLOAD_ERR(
+      printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type"));
+  OFFLOAD_ERR(printDeviceValue<const char *>(
+      S, D, OL_DEVICE_INFO_DRIVER_VERSION, "Driver Version"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(
+      S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, "Max Work Group Size"));
+  OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>(
+      S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION,
+      "Max Work Group Size Per Dimension"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MAX_WORK_SIZE,
+                                         "Max Work Size"));
+  OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>(
+      S, D, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION,
+      "Max Work Size Per Dimension"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_VENDOR_ID, "Vendor ID"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NUM_COMPUTE_UNITS,
+                                         "Num Compute Units"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(
+      S, D, OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, "Max Clock Frequency", "MHz"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MEMORY_CLOCK_RATE,
+                                         "Memory Clock Rate", "MHz"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_ADDRESS_BITS,
+                                         "Address Bits"));
+  OFFLOAD_ERR(printDeviceValue<uint64_t>(
+      S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
+  OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
+                                         "Global Mem Size", "B"));
+  OFFLOAD_ERR(
+      (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+          S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,
+          "Single Precision Floating Point Capability")));
+  OFFLOAD_ERR(
+      (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+          S, D, OL_DEVICE_INFO_DOUBLE_FP_CONFIG,
+          "Double Precision Floating Point Capability")));
+  OFFLOAD_ERR(
+      (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
+          S, D, OL_DEVICE_INFO_HALF_FP_CONFIG,
+          "Half Precision Floating Point Capability")));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR,
+                                 "Native Vector Width For Char"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT,
+                                 "Native Vector Width For Short"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D,
+                                         OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT,
+                                         "Native Vector Width For Int"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG,
+                                 "Native Vector Width For Long"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT,
+                                 "Native Vector Width For Float"));
+  OFFLOAD_ERR(printDeviceValue<uint32_t>(
+      S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE,
+      "Native Vector Width For Double"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF,
+                                 "Native Vector Width For Half"));
+
+  return OL_SUCCESS;
+}
+
+// Initializes liboffload, writes the library version and the number of
+// devices to S, then writes a report for every device, and finally shuts
+// liboffload down.
+//
+// Returns OL_SUCCESS, or the first error reported by any step.
+// NOTE(review): OFFLOAD_ERR returns immediately, so when a step after olInit()
+// fails, olShutDown() is not called.
+ol_result_t printRoot(std::ostream &S) {
+ OFFLOAD_ERR(olInit());
+ S << "Liboffload Version: " << OL_VERSION_MAJOR << "." << OL_VERSION_MINOR
+ << "." << OL_VERSION_PATCH << "\n";
+
+ // Collect every device handle passed to the callback; UserData carries the
+ // address of Devices.
+ std::vector<ol_device_handle_t> Devices;
+ OFFLOAD_ERR(olIterateDevices(
+ [](ol_device_handle_t Device, void *UserData) {
+ reinterpret_cast<decltype(Devices) *>(UserData)->push_back(Device);
+ return true;
+ },
+ &Devices));
+
+ S << "Num Devices: " << Devices.size() << "\n";
+
+ // One report per device, each preceded by a blank line.
+ for (auto &D : Devices) {
+ S << "\n";
+ OFFLOAD_ERR(printDevice(S, D));
}
- __tgt_unregister_lib(&EmptyDesc);
+ OFFLOAD_ERR(olShutDown());
+ return OL_SUCCESS;
+}
+
+// Entry point: prints the liboffload report to standard output. If any step
+// fails, the error code and details are written to standard error and the
+// exit status is 1; otherwise the exit status is 0.
+int main(int argc, char **argv) {
+ auto Err = printRoot(std::cout);
+
+ if (Err) {
+ std::cerr << "[Liboffload error " << Err->Code << "]: " << Err->Details
+ << "\n";
+ return 1;
+ }
return 0;
}
diff --git a/offload/unittests/Conformance/README.md b/offload/unittests/Conformance/README.md
new file mode 100644
index 000000000000..0202242c99a0
--- /dev/null
+++ b/offload/unittests/Conformance/README.md
@@ -0,0 +1,83 @@
+# GPU Math Conformance Tests
+
+## Overview
+
+This test suite provides a framework to systematically measure the accuracy of math functions on GPUs and verify their conformance with standards like OpenCL.
+
+While the primary focus is validating the implementations in the C standard math library (LLVM-libm), these tests can also be executed against other math library providers, such as CUDA Math and HIP Math, for comparison.
+
+The goals of this project are to empower LLVM-libm contributors with a robust tool for validating their implementations and to build trust with end-users by providing transparent accuracy data.
+
+### Table of Contents
+
+- [Getting Started](#getting-started)
+- [Running the Tests](#running-the-tests)
+- [Adding New Tests](#adding-new-tests)
+
+## Getting Started
+
+This guide covers how to build the necessary dependencies, which include the new Offload API and the C standard library for both host and GPU targets.
+
+### System Requirements
+
+Before you begin, ensure your system meets the following requirements:
+
+- A system with an AMD or NVIDIA GPU.
+- The latest proprietary GPU drivers installed.
+- The corresponding development SDK for your hardware:
+ - **AMD:** [ROCm SDK](https://rocm.docs.amd.com)
+ - **NVIDIA:** [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
+
+### Building the Dependencies
+
+The official documentation for building LLVM-libc for GPUs provides a detailed guide and should be considered the primary reference. Please follow the instructions in the **"Standard runtimes build"** section of that guide:
+
+- [Building the GPU C library (Official Documentation)](https://libc.llvm.org/gpu/building.html)
+
+> [!IMPORTANT]
+> For the conformance tests, the standard `cmake` command from the official documentation must be adapted slightly. You must also add `libc` to the main `-DLLVM_ENABLE_RUNTIMES` list. This is a crucial step because the tests need a host-side build of `libc` to use as the reference oracle for validating GPU results.
+
+## Running the Tests
+
+### Default Test
+
+To build and run the conformance test for a given function (e.g., `logf`) against the default C standard math library `llvm-libm` provider, use the following command. This will execute the test on all available and supported platforms.
+
+```bash
+ninja -C build/runtimes/runtimes-bins offload.conformance.logf
+```
+
+### Testing Other Providers
+
+Once the test binary has been built, you can run it against other math library providers using the `--test-configs` flag.
+
+- **For `cuda-math` on an NVIDIA GPU:**
+
+ ```bash
+ ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=cuda-math:cuda
+ ```
+
+- **For `hip-math` on an AMD GPU:**
+
+ ```bash
+ ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=hip-math:amdgpu
+ ```
+
+You can also run all available configurations for a test with:
+
+```bash
+./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=all
+```
+
+## Adding New Tests
+
+To add a conformance test for a new math function, follow these steps:
+
+1. **Implement the Device Kernels**: Create a kernel wrapper for the new function in each provider's source file. For CUDA Math and HIP Math, you must also add a forward declaration for the vendor function in `/device_code/DeviceAPIs.hpp`.
+
+2. **Implement the Host Test**: Create a new `.cpp` file in `/tests`. This file defines the `FunctionConfig` (function and kernel names, as well as ULP tolerance) and the input generation strategy.
+
+ - Use **exhaustive testing** (`ExhaustiveGenerator`) for functions with small input spaces (e.g., half-precision functions and single-precision univariate functions). This strategy iterates over every representable point in the input space, ensuring complete coverage.
+ - Use **randomized testing** (`RandomGenerator`) for functions with large input spaces (e.g., single-precision bivariate and double-precision functions), where exhaustive testing is computationally infeasible. Although not exhaustive, this strategy is deterministic, using a fixed seed to sample a large, reproducible subset of points from the input space.
+
+3. **Add the Build Target**: Add a new `add_conformance_test(...)` entry to `/tests/CMakeLists.txt` to make the test buildable.
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index a0068c3cb59c..6c3425f1e17c 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -55,13 +55,14 @@ static OffloadInitWrapper Wrapper{};
[[nodiscard]] std::string getDeviceName(ol_device_handle_t DeviceHandle) {
std::size_t PropSize = 0;
- OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_NAME, &PropSize));
+ OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME,
+ &PropSize));
if (PropSize == 0)
return "";
std::string PropValue(PropSize, '\0');
- OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_NAME, PropSize,
+ OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, PropSize,
PropValue.data()));
PropValue.pop_back(); // Remove the null terminator
diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp
index ef092cd4187d..c9da6ef9be7c 100644
--- a/offload/unittests/OffloadAPI/common/Environment.cpp
+++ b/offload/unittests/OffloadAPI/common/Environment.cpp
@@ -41,9 +41,9 @@ raw_ostream &operator<<(raw_ostream &Out,
raw_ostream &operator<<(raw_ostream &Out, const ol_device_handle_t &Device) {
size_t Size;
- olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size);
+ olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size);
std::vector<char> Name(Size);
- olGetDeviceInfo(Device, OL_DEVICE_INFO_NAME, Size, Name.data());
+ olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data());
Out << Name.data();
return Out;
}
@@ -129,6 +129,9 @@ const std::vector<TestEnvironment::Device> &TestEnvironment::getDevices() {
}
}
+ if (Devices.size() == 0)
+ errs() << "Warning: No devices found for OffloadAPI tests.\n";
+
return Devices;
}
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index fe7198a9c283..c5a35faba7a2 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -89,6 +89,40 @@ template <typename Fn> inline void threadify(Fn body) {
}
}
+/// A task that blocks the queue it is enqueued on until it is manually
+/// released.
+///
+/// `enqueue` launches a host function on the queue that parks in `wait`, and
+/// then creates an event on the same queue. `trigger` releases the parked host
+/// function and synchronizes on that event.
+struct ManuallyTriggeredTask {
+  std::mutex M;
+  std::condition_variable CV;
+  /// Set by `trigger`; read and written only while holding `M`.
+  bool Flag = false;
+  /// Event created by `enqueue` after the blocking host function is launched.
+  ol_event_handle_t CompleteEvent = nullptr;
+
+  /// Launches the blocking host function on \p Queue and creates
+  /// `CompleteEvent` on the same queue. Returns the launch error if the launch
+  /// fails, otherwise the result of creating the event.
+  ol_result_t enqueue(ol_queue_handle_t Queue) {
+    if (auto Err = olLaunchHostFunction(
+            Queue,
+            [](void *That) {
+              static_cast<ManuallyTriggeredTask *>(That)->wait();
+            },
+            this))
+      return Err;
+
+    return olCreateEvent(Queue, &CompleteEvent);
+  }
+
+  /// Blocks until `trigger` sets `Flag`, giving up after one second so that a
+  /// missing trigger is reported as a test failure instead of a hang.
+  void wait() {
+    std::unique_lock<std::mutex> lk(M);
+    CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; });
+    EXPECT_TRUE(Flag);
+  }
+
+  /// Releases the waiting host function and then waits for `CompleteEvent`.
+  ol_result_t trigger() {
+    {
+      // Publish the flag under the mutex: `wait` reads it while holding `M`,
+      // so an unlocked write is a data race, and the notification could be
+      // lost between the waiter's predicate check and its going to sleep.
+      std::lock_guard<std::mutex> Lock(M);
+      Flag = true;
+    }
+    CV.notify_one();
+
+    return olSyncEvent(CompleteEvent);
+  }
+};
+
struct OffloadTest : ::testing::Test {
ol_device_handle_t Host = TestEnvironment::getHostDevice();
};
@@ -216,9 +250,13 @@ struct OffloadEventTest : OffloadQueueTest {
ol_event_handle_t Event = nullptr;
};
+// Devices might not be available for offload testing, so allow uninstantiated
+// tests (as the device list will be empty). This means that all tests requiring
+// a device will be silently skipped.
#define OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(FIXTURE) \
INSTANTIATE_TEST_SUITE_P( \
, FIXTURE, ::testing::ValuesIn(TestEnvironment::getDevices()), \
[](const ::testing::TestParamInfo<TestEnvironment::Device> &info) { \
return SanitizeString(info.param.Name); \
- })
+ }); \
+ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FIXTURE)
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 212a5d6ddf22..8cb0b8065c33 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -86,6 +86,29 @@ TEST_P(olGetDeviceInfoTest, HostName) {
ASSERT_EQ(std::strlen(Name.data()), Size - 1);
}
+// The size reported for a device's product name must be non-zero and must
+// cover exactly the returned string plus its null terminator.
+TEST_P(olGetDeviceInfoTest, SuccessProductName) {
+  size_t NameSize = 0;
+  ASSERT_SUCCESS(
+      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &NameSize));
+  ASSERT_GT(NameSize, 0ul);
+
+  std::vector<char> NameBuf(NameSize);
+  ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, NameSize,
+                                 NameBuf.data()));
+  ASSERT_EQ(std::strlen(NameBuf.data()), NameSize - 1);
+}
+
+// Same check as for offload devices, but queried on the host device: the
+// reported size must be non-zero and match the string plus its terminator.
+TEST_P(olGetDeviceInfoTest, HostProductName) {
+  size_t NameSize = 0;
+  ASSERT_SUCCESS(
+      olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &NameSize));
+  ASSERT_GT(NameSize, 0ul);
+
+  std::vector<char> NameBuf(NameSize);
+  ASSERT_SUCCESS(olGetDeviceInfo(Host, OL_DEVICE_INFO_PRODUCT_NAME, NameSize,
+                                 NameBuf.data()));
+  ASSERT_EQ(std::strlen(NameBuf.data()), NameSize - 1);
+}
+
TEST_P(olGetDeviceInfoTest, SuccessVendor) {
size_t Size = 0;
ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size));
@@ -122,6 +145,19 @@ TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) {
ASSERT_GT(Value.z, 0u);
}
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkSize, uint32_t,
+ OL_DEVICE_INFO_MAX_WORK_SIZE, 0);
+
+// Every component of the per-dimension maximum work size must be positive.
+TEST_P(olGetDeviceInfoTest, SuccessMaxWorkSizePerDimension) {
+  ol_dimensions_t Dims{0, 0, 0};
+  ASSERT_SUCCESS(olGetDeviceInfo(
+      Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, sizeof(Dims), &Dims));
+
+  ASSERT_GT(Dims.x, 0u);
+  ASSERT_GT(Dims.y, 0u);
+  ASSERT_GT(Dims.z, 0u);
+}
+
OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(VendorId, uint32_t,
OL_DEVICE_INFO_VENDOR_ID, 0);
OL_DEVICE_INFO_TEST_HOST_SUCCESS(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID);
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index a28089d918e0..c4a3c2d5e3c7 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -31,10 +31,13 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(Type, ol_device_type_t, OL_DEVICE_INFO_TYPE);
OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t,
OL_DEVICE_INFO_PLATFORM);
OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME);
OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR);
OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION);
OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t,
OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkSize, uint32_t,
+ OL_DEVICE_INFO_MAX_WORK_SIZE);
OL_DEVICE_INFO_SIZE_TEST_EQ(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID);
OL_DEVICE_INFO_SIZE_TEST_EQ(NumComputeUnits, uint32_t,
OL_DEVICE_INFO_NUM_COMPUTE_UNITS);
@@ -76,6 +79,14 @@ TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
ASSERT_EQ(Size, sizeof(uint32_t) * 3);
}
+// The per-dimension maximum work size is reported as a single
+// ol_dimensions_t, which is expected to be three uint32_t values.
+TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkSizePerDimension) {
+  size_t ReportedSize = 0;
+  ASSERT_SUCCESS(olGetDeviceInfoSize(
+      Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, &ReportedSize));
+
+  ASSERT_EQ(ReportedSize, sizeof(ol_dimensions_t));
+  ASSERT_EQ(ReportedSize, sizeof(uint32_t) * 3);
+}
+
TEST_P(olGetDeviceInfoSizeTest, InvalidNullHandle) {
size_t Size = 0;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
index 1b0bafa20208..a84ed3d78ecc 100644
--- a/offload/unittests/OffloadAPI/memory/olMemFill.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -10,75 +10,129 @@
#include <OffloadAPI.h>
#include <gtest/gtest.h>
-using olMemFillTest = OffloadQueueTest;
+/// Fixture for the olMemFill tests; provides a templated body shared by the
+/// fixed-width pattern tests.
+struct olMemFillTest : OffloadQueueTest {
+  /// Fills a managed allocation of \p Size bytes with \p PatternVal and checks
+  /// every complete pattern-sized element of it.
+  ///
+  /// \tparam PatternTy  Type of the fill pattern.
+  /// \tparam PatternVal Value of the fill pattern.
+  /// \tparam Size       Size in bytes of both the allocation and the fill.
+  /// \tparam Block      If true, a ManuallyTriggeredTask is enqueued before
+  ///                    the fill and triggered after it.
+  template <typename PatternTy, PatternTy PatternVal, size_t Size,
+            bool Block = false>
+  void test_body() {
+    ManuallyTriggeredTask Manual;
+
+    // Block/enqueue tests ensure that the test has been enqueued to a queue
+    // (rather than being done synchronously if the queue happens to be empty)
+    if constexpr (Block) {
+      ASSERT_SUCCESS(Manual.enqueue(Queue));
+    }
+
+    void *Alloc = nullptr;
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+    PatternTy Pattern = PatternVal;
+    ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+    if constexpr (Block) {
+      ASSERT_SUCCESS(Manual.trigger());
+    }
+    // Do not ignore a failed sync: the checks below would otherwise read
+    // memory the fill may not have written yet.
+    ASSERT_SUCCESS(olSyncQueue(Queue));
+
+    // Only whole elements are compared; a trailing partial element (when Size
+    // is not a multiple of the pattern size) is not inspected.
+    const PatternTy *AllocPtr = static_cast<const PatternTy *>(Alloc);
+    constexpr size_t N = Size / sizeof(PatternTy);
+    for (size_t i = 0; i < N; i++) {
+      ASSERT_EQ(AllocPtr[i], Pattern);
+    }
+
+    ASSERT_SUCCESS(olMemFree(Alloc));
+  }
+};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
-TEST_P(olMemFillTest, Success8) {
- constexpr size_t Size = 1024;
- void *Alloc;
- ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
-
- uint8_t Pattern = 0x42;
- ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
-
- olSyncQueue(Queue);
+TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); }
+TEST_P(olMemFillTest, Success8NotMultiple4) {
+ test_body<uint8_t, 0x42, 1023>();
+}
+TEST_P(olMemFillTest, Success8Enqueue) {
+ test_body<uint8_t, 0x42, 1024, true>();
+}
+TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) {
+ test_body<uint8_t, 0x42, 1023, true>();
+}
- size_t N = Size / sizeof(Pattern);
- for (size_t i = 0; i < N; i++) {
- uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc);
- ASSERT_EQ(AllocPtr[i], Pattern);
- }
+// 16-bit fill: use a uint16_t pattern with two distinct bytes, as the
+// Success16NotMultiple4 tests do, so the 16-bit path is actually exercised.
+TEST_P(olMemFillTest, Success16) { test_body<uint16_t, 0x4243, 1024>(); }
+TEST_P(olMemFillTest, Success16NotMultiple4) {
+ test_body<uint16_t, 0x4243, 1022>();
+}
+// 16-bit fill submitted behind a blocking task; the pattern type must be
+// uint16_t for this to cover the 16-bit path rather than repeat the 8-bit one.
+TEST_P(olMemFillTest, Success16Enqueue) {
+  test_body<uint16_t, 0x4243, 1024, true>();
+}
+TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) {
+ test_body<uint16_t, 0x4243, 1022, true>();
+}
- olMemFree(Alloc);
+TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); }
+TEST_P(olMemFillTest, Success32Enqueue) {
+ test_body<uint32_t, 0xDEADBEEF, 1024, true>();
}
-TEST_P(olMemFillTest, Success16) {
+TEST_P(olMemFillTest, SuccessLarge) {
constexpr size_t Size = 1024;
void *Alloc;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
- uint16_t Pattern = 0x4242;
+ struct PatternT {
+ uint64_t A;
+ uint64_t B;
+ } Pattern{UINT64_MAX, UINT64_MAX};
+
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
olSyncQueue(Queue);
size_t N = Size / sizeof(Pattern);
for (size_t i = 0; i < N; i++) {
- uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc);
- ASSERT_EQ(AllocPtr[i], Pattern);
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
}
olMemFree(Alloc);
}
-TEST_P(olMemFillTest, Success32) {
+TEST_P(olMemFillTest, SuccessLargeEnqueue) {
constexpr size_t Size = 1024;
void *Alloc;
+ ManuallyTriggeredTask Manual;
+ ASSERT_SUCCESS(Manual.enqueue(Queue));
+
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
- uint32_t Pattern = 0xDEADBEEF;
+ struct PatternT {
+ uint64_t A;
+ uint64_t B;
+ } Pattern{UINT64_MAX, UINT64_MAX};
+
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+ // trigger() returns the result of syncing the completion event; check it
+ // instead of dropping it.
+ ASSERT_SUCCESS(Manual.trigger());
olSyncQueue(Queue);
size_t N = Size / sizeof(Pattern);
for (size_t i = 0; i < N; i++) {
- uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc);
- ASSERT_EQ(AllocPtr[i], Pattern);
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
}
olMemFree(Alloc);
}
-TEST_P(olMemFillTest, SuccessLarge) {
- constexpr size_t Size = 1024;
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+ constexpr size_t Size = 17 * 64;
void *Alloc;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
- struct PatternT {
+ struct __attribute__((packed)) PatternT {
uint64_t A;
uint64_t B;
- } Pattern{UINT64_MAX, UINT64_MAX};
+ uint8_t C;
+ } Pattern{UINT64_MAX, UINT64_MAX, 255};
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
@@ -89,14 +143,18 @@ TEST_P(olMemFillTest, SuccessLarge) {
PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].C, 255);
}
olMemFree(Alloc);
}
-TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) {
constexpr size_t Size = 17 * 64;
void *Alloc;
+ ManuallyTriggeredTask Manual;
+ ASSERT_SUCCESS(Manual.enqueue(Queue));
+
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
struct __attribute__((packed)) PatternT {
@@ -107,6 +165,7 @@ TEST_P(olMemFillTest, SuccessLargeByteAligned) {
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+ Manual.trigger();
olSyncQueue(Queue);
size_t N = Size / sizeof(Pattern);
diff --git a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
index 0dc8527df532..aa9e372ede2c 100644
--- a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
+++ b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
@@ -18,6 +18,15 @@ TEST_P(olDestroyQueueTest, Success) {
Queue = nullptr;
}
+// Destroying a queue that still holds a blocked task must succeed; the task
+// is released only after olDestroyQueue has returned.
+TEST_P(olDestroyQueueTest, SuccessDelayedResolution) {
+  ManuallyTriggeredTask PendingTask;
+  ASSERT_SUCCESS(PendingTask.enqueue(Queue));
+
+  ASSERT_SUCCESS(olDestroyQueue(Queue));
+  Queue = nullptr;
+
+  ASSERT_SUCCESS(PendingTask.trigger());
+}
+
TEST_P(olDestroyQueueTest, InvalidNullHandle) {
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olDestroyQueue(nullptr));
}